In [90]:
import requests
import simplejson as json
import re
import pandas as pd
from progress.bar import Bar


# Path of the CSV file listing product URLs; it is handed to get_raw_urls()
# when the scrape runs.
urls_csv = 'product_urls.csv'


In [151]:
def get_raw_urls(urls_csv, limit=20):
    """Read the product URLs stored in the second column of a CSV file.

    The first row that has a second column is treated as the header and
    dropped; at most ``limit`` URLs following it are returned.

    Parameters
    ----------
    urls_csv : str
        Path to a comma-separated file whose second column holds the URL.
    limit : int or None, optional
        Maximum number of URLs to return. Defaults to 20, the batch size
        this function has always used. Pass ``None`` to return every URL.

    Returns
    -------
    list of str
        Second-column values in file order, header excluded.
    """
    array_of_product_information = []
    with open(urls_csv, 'r') as f:
        for line in f:
            fields = line.strip().split(',')
            # Blank or single-column lines carry no URL; skip them rather
            # than failing with IndexError on fields[1].
            if len(fields) < 2:
                continue
            array_of_product_information.append(fields[1])

    # Index 0 is the header row, so the slice always starts at 1.
    stop = None if limit is None else 1 + limit
    return array_of_product_information[1:stop]

def strip_urls_for_prod_id(array_of_product_information):
    """Extract the product id from each product URL.

    The id is the text between the last ``/`` that precedes a literal
    ``.prd`` suffix and that suffix itself.

    Parameters
    ----------
    array_of_product_information : iterable of str
        Product URLs to parse.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(product_ids, bad_urls)``: the extracted ids in input order, and
        the URLs that did not contain a ``/<id>.prd`` segment.
    """
    array_of_product_ids = []
    bad_urls = []
    for url in array_of_product_information:
        # The dot is escaped so only a real ".prd" extension matches; an
        # unescaped "." would accept any character before "prd".
        matches = re.search(r'.*/(.*)\.prd', url)
        if matches is None:
            bad_urls.append(url)
        else:
            array_of_product_ids.append(matches.group(1))

    return array_of_product_ids, bad_urls

def strip_raw_json(r_text):
    """Return ``r_text`` with every newline, carriage return, tab,
    parenthesis and semicolon character removed.

    The characters are dropped wherever they occur in the text, not only
    at its start or end.
    """
    unwanted_characters = '\n\r\t();'
    deletion_table = str.maketrans('', '', unwanted_characters)
    return r_text.translate(deletion_table)

def get_raw_json(product_id, timeout=10):
    """Download the product-info payload for one product and clean it.

    Parameters
    ----------
    product_id : str
        Value sent as the ``sdgProductId`` query parameter.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout a stalled connection would block forever.

    Returns
    -------
    str
        The response body after passing through ``strip_raw_json``.

    Raises
    ------
    requests.exceptions.RequestException
        On connection failure, timeout, or an HTTP error status.
    """
    r = requests.get(
        'http://www.very.co.uk/json/catalog/product/productInfo.jsp',
        # Passing the id via ``params`` lets requests URL-encode it.
        params={'sdgProductId': product_id},
        timeout=timeout,
    )
    # Surface 4xx/5xx responses here instead of handing an error page to
    # the JSON parser downstream.
    r.raise_for_status()
    return strip_raw_json(r.text)

In [152]:
# Column order of the output table / CSV.
PRODUCT_COLUMNS = ['itemNumber', 'identification', 'Cnet Enabled', 'collect plus',
                   'number of images', 'description word count', 'number of options',
                   'now from', 'now to price', 'save from price', 'save to price',
                   'was from', 'was to', 'stars', 'count of reviews']

# Output column -> key inside the product's "price" object.
PRICE_FIELDS = {
    'now from': 'nowFrom',
    'now to price': 'nowToPrice',
    'save from price': 'saveFromPrice',
    'save to price': 'saveToPrice',
    'was from': 'wasFrom',
    'was to': 'wasTo',
}

# Output column -> key inside the product's "reviews" object.
REVIEW_FIELDS = {
    'stars': 'stars',
    'count of reviews': 'total',
}


def _lookup(data, path, default):
    """Follow ``path`` (a sequence of keys/indexes) into ``data``.

    Returns ``default`` when any step is missing or not subscriptable, so a
    partially populated payload yields placeholders instead of an exception.
    """
    node = data
    try:
        for step in path:
            node = node[step]
    except (KeyError, IndexError, TypeError):
        return default
    return node


def _count(data, path):
    """Return ``len()`` of the value found at ``path``, or 0 if absent/unsized."""
    try:
        return len(_lookup(data, path, None))
    except TypeError:
        return 0


def _extract_product_information(json_load):
    """Flatten one parsed product payload into a dict keyed by PRODUCT_COLUMNS.

    Missing text/flag fields become the string ``'null'``; missing counts,
    prices and review figures become ``0``. Each field falls back on its own,
    so one absent key does not blank out its neighbours.
    """
    product = _lookup(json_load, ('productData', 0), {})

    info = {
        'itemNumber': _lookup(product, ('identification', 'itemNumber'), 'null'),
        # Read from the top level of the payload, not from productData.
        'identification': _lookup(json_load, ('identification', 0), 'null'),
        'Cnet Enabled': _lookup(product, ('cnet', 'isCNETProductFeaturesEnabled'), 'null'),
        'collect plus': _lookup(product, ('collectPlus',), 'null'),
        'number of images': _count(product, ('images',)),
        'number of options': _count(product, ('options',)),
    }

    # Count whitespace-separated words of the description text itself.
    description = _lookup(product, ('name', 'longDescription'), '')
    info['description word count'] = (
        len(description.split()) if isinstance(description, str) else 0
    )

    for column, key in PRICE_FIELDS.items():
        info[column] = _lookup(product, ('price', key), 0)
    for column, key in REVIEW_FIELDS.items():
        info[column] = _lookup(product, ('reviews', key), 0)

    return info


urls = get_raw_urls(urls_csv)
product_ids, bad_urls = strip_urls_for_prod_id(urls)

bar = Bar('Processing', max=len(product_ids))

# Collect one record per product and build the DataFrame once at the end;
# growing a DataFrame row by row is quadratic, and DataFrame.append no longer
# exists in pandas 2.x.
product_records = []
for product_id in product_ids:
    print(product_id)
    json_load = json.loads(get_raw_json(product_id))

    product_information = _extract_product_information(json_load)
    print(product_information)
    product_records.append(product_information)

    bar.next()

bar.finish()

product_info_df = pd.DataFrame(product_records, columns=PRODUCT_COLUMNS)
product_info_df.to_csv('output.csv')

1458060944
{'itemNumber': '4P9X7', 'identification': '1458060944', 'Cnet Enabled': True, 'collect plus': 'true', 'number of images': 4, 'description word count': 2, 'number of options': 3, 'now from': '80', 'now to price': '', 'save from price': '', 'save to price': '', 'was from': '', 'was to': '', 'stars': '5.0', 'count of reviews': '2'}
1600151711
{'itemNumber': 'LAPLD', 'identification': '1600151711', 'Cnet Enabled': True, 'collect plus': 'true', 'number of images': 3, 'description word count': 1, 'number of options': 2, 'now from': '35', 'now to price': '', 'save from price': '35', 'save to price': '', 'was from': '70', 'was to': '', 'stars': '3.0', 'count of reviews': '1'}
1600124182
{'itemNumber': 'KQWM7', 'identification': '1600124182', 'Cnet Enabled': True, 'collect plus': 'true', 'number of images': 3, 'description word count': 1, 'number of options': 2, 'now from': '33.50', 'now to price': '', 'save from price': '14.50', 'save to price': '', 'was from': '48', 'was to': '', '

In [153]:
# Show the collected product table as this cell's output.
product_info_df

Unnamed: 0,itemNumber,identification,Cnet Enabled,collect plus,number of images,description word count,number of options,now from,now to price,save from price,save to price,was from,was to,stars,count of reviews
0,4P9X7,1458060944,True,True,4,2,3,80.0,,,,,,5.0,2.0
1,LAPLD,1600151711,True,True,3,1,2,35.0,,35.0,,70.0,,3.0,1.0
2,KQWM7,1600124182,True,True,3,1,2,33.5,,14.5,,48.0,,4.076900005340576,13.0
3,LNUGN,1600184574,True,True,4,1,2,15.0,,13.0,,28.0,,,
4,LFTCD,1600165087,True,True,4,1,2,9.0,,3.0,,12.0,,4.5,2.0
5,LL3AK,1600177043,True,True,3,1,2,80.0,,,,,,5.0,2.0
6,LLLWT,1600177898,True,True,1,1,1,19.99,,,,,,,
7,K9FK7,1600077311,True,False,3,1,1,499.99,,100.0,,599.99,,4.652200222015381,23.0
8,L49UH,1600168837,True,True,4,1,2,49.0,59.0,,,,,4.0,3.0
9,KYRKK,1600141914,True,True,4,1,2,13.0,,5.0,,18.0,,4.5,10.0


Unnamed: 0,itemNumber,identification,Cnet Enabled,collect plus,number of images,description word count,number of options,now from,now to price,save from price,save to price,was from,was to,stars,count of reviews
0,4P9X7,1458060944,True,True,4,2,3,80,,,,,,5.0,2
