In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 13 18:16:03 2015

@author: Himanshu
"""

import json
import urllib
import asyncio
import aiohttp
import pandas as pd
import csv
import time
import requests

from collections import OrderedDict
from urllib.request import urlopen

In [3]:
product_list=['Product ID','Name','Price','Currency','Retailer ID',
                    'Retailer Name','Brand ID','Brand Name','Description',
                    'image_small','image_XLarge','image_Medium','image_Large','image_IPhoneSmall',
                    'image_Best','image_Original','image_IPhone',
                    'Extract Date','Last Modified','Colors','Sizes','Categories']

In [5]:
""" Creating master product list """
def get_brands_from_api():
    url = 'http://api.shopstyle.com/api/v2/brands?pid=uid5281-8673591-38'
    response = requests.get(url)
    results = response.json()
#     brand_response = urlopen(url)
#     brand_results = brand_response.read().decode('utf8')
#     results = json.loads(brand_results)

    data = results['brands']
    brand_dict = OrderedDict()
    for b in data:
        brand_dict[b['id']] = b['name']
    #brand_dict = list(brand_dict.keys())[:2] ### only two brands at once
    return brand_dict

In [6]:
brand_lists = get_brands_from_api()
brand_ids=list(brand_lists.keys())

In [7]:
len(brand_lists)

8493

In [8]:
class async_product_scraper:
    """Download product info for brands on shopstyle api"""
    
    def __init__(self):
        pass
        
    async def _fetch(self, url, params, session):
        """
        Params:
            url: str, request url
            session: aiohttp.ClientSession() object
        Returns:
            response : JSON/Python Dict
        """
        
        async with session.get(url, params=params) as response:
            return await response.json()
        
    async def run(self, brandID):
        BASEURL = 'http://api.shopstyle.com/api/v2/products?pid=uid5281-8673591-38&format=json&'
  
        # offset params to get products with offset 0-5000
        offset_params = [{'fl': 'b{}'.format(brandID), 'offset': r, 'limit': 50} for r in range(0,5000,50)]
        async with aiohttp.ClientSession() as session:
            tasks = [asyncio.ensure_future(self._fetch(BASEURL, offset, session)) 
                     for offset in offset_params]
            return await asyncio.gather(*tasks, return_exceptions=False)

## Multiple brands is slower        
#     async def run(self, brandIDs):
#         BASEURL = 'http://api.shopstyle.com/api/v2/products?pid=uid5281-8673591-38&format=json&'
#         # offset params to get products with offset 0-5000
#         offset_params = [[{'fl': 'b{}'.format(brandID), 'offset': r, 'limit': 50} for r in range(0,5000,50)] for brandID in brandIDs]
#         offset_params = [item for sublist in offset_params for item in sublist]
#         async with aiohttp.ClientSession() as session:
#             tasks = [asyncio.ensure_future(self._fetch(BASEURL, offset, session)) 
#                      for offset in offset_params]
#             return await asyncio.gather(*tasks, return_exceptions=False)

class product_writer:
    
    def __init__(self, response):
        self.response = response
        
    def product_info_df(self):
        DFS = [pd.DataFrame(r['products']) for r in self.response]
        products_df = pd.concat(DFS,ignore_index=True)
#         product_dict = self.response['products']
#         products_df = pd.DataFrame(product_dict)
        
        # retailer id/name
        retailer_df = products_df['retailer'].apply(pd.Series)[['id', 'name']]
        retailer_df.columns = ['retailer_id', 'retailer_name']
        products_df = pd.concat([products_df.drop(['retailer'], axis=1), retailer_df], axis=1)
        
        # sizes
        products_df.loc[:, 'sizes'] = products_df['sizes'].apply(lambda x: [size['name'] for size in x] if x else '')
        
        # brand id/name
        brand_placeholder = {'id': '', 'name': ''} # placeholder for NaNs
        products_df.loc[:, 'brand'] = products_df['brand'].apply(lambda d: d if isinstance(d, dict) else brand_placeholder)
        brand_df = pd.DataFrame(products_df['brand'].apply(pd.Series))
        brand_df.columns = ['brand_id', 'brand_name']
        products_df = pd.concat([products_df.drop(['brand'], axis=1), brand_df], axis=1)
        
#       image urls
        image_df = products_df['image'].apply(lambda x: pd.Series(x['sizes'])).applymap(lambda y: y['url'])
        products_df = pd.concat([products_df.drop(['image'],axis=1), image_df], axis=1)       
        
        # colors
        products_df.loc[:, 'colors'] = products_df['colors'].apply(lambda x: [color['name'] for color in x] if x else '')
        
        # categories
        products_df.loc[: ,'categories'] = products_df['categories'].apply(lambda x: [category['name'] for category in x] if x else '')
        
        return products_df

In [9]:
product_scraper = async_product_scraper()

In [10]:
def downloading(brandID):
        loop = asyncio.get_event_loop()
        future = asyncio.ensure_future(product_scraper.run(brandID))
        loop.run_until_complete(future)
        result = future.result()

        parser = product_writer(result)
        df = parser.product_info_df()
        keep_columns =['id', 'name', 'price', 'currency', 'retailer_id', 'retailer_name',
         'brand_id', 'brand_name', 'description',
         'Small', 'XLarge', 'Medium', 'Large', 'IPhoneSmall', 'Best', 
         'Original', 'IPhone', 'extractDate', 'lastModified', 'colors', 'sizes', 'categories']
        df = df[keep_columns]
        with open('data/products_brand_Apr.csv', 'a') as f:
            writer = csv.writer(f, delimiter=',')
            writer.writerow(product_list)    
            df.to_csv(f,encoding='utf-8',index=False,header=False)
        return df

In [11]:
PERIOD_OF_TIME = 5
for brandID in brand_ids[:10]:
    start = time.time()
    print("Downloading brand " + str(brandID))
    downloading(brandID)
    print('Time Used:', time.time()-start, 'sec')

Downloading brand 3
Time Used: 6.64926815032959 sec
Downloading brand 5
Time Used: 1.039794921875 sec
Downloading brand 6
Time Used: 5.293516159057617 sec
Downloading brand 7
Time Used: 0.5486218929290771 sec
Downloading brand 14
Time Used: 9.613234996795654 sec
Downloading brand 15
Time Used: 4.957540988922119 sec
Downloading brand 16
Time Used: 0.5401949882507324 sec
Downloading brand 17
Time Used: 0.49471092224121094 sec
Downloading brand 18
Time Used: 5.850774049758911 sec
Downloading brand 19
Time Used: 1.5974528789520264 sec


In [15]:
PERIOD_OF_TIME = 5
for brandID in brand_ids[10:100]:
    start = time.time()
    print("Downloading brand " + str(brandID))
    df=downloading(brandID)
    print('Time Used:', time.time()-start, 'sec')

Downloading brand 20
Time Used: 2.694216012954712 sec
Downloading brand 22
Time Used: 0.7197179794311523 sec
Downloading brand 24
Time Used: 0.5947921276092529 sec
Downloading brand 25
Time Used: 1.2949309349060059 sec
Downloading brand 28
Time Used: 1.8220739364624023 sec
Downloading brand 30
Time Used: 3.614392042160034 sec
Downloading brand 31
Time Used: 4.01398491859436 sec
Downloading brand 33
Time Used: 1.6735751628875732 sec
Downloading brand 36
Time Used: 1.1370611190795898 sec
Downloading brand 38
Time Used: 6.737213850021362 sec
Downloading brand 39
Time Used: 0.4324679374694824 sec
Downloading brand 41
Time Used: 1.6333260536193848 sec
Downloading brand 42


ClientConnectorError: [Errno 8] Cannot connect to host api.shopstyle.com:80 ssl:False [nodename nor servname provided, or not known]

In [51]:
PERIOD_OF_TIME = 5
for brandID in brand_ids[22:100]:
    start = time.time()
    print("Downloading brand " + str(brandID))
    df=downloading(brandID)
    print('Time Used:', time.time()-start, 'sec')

Downloading brand 42
Time Used: 0.814511775970459 sec
Downloading brand 44
Time Used: 0.7824978828430176 sec
Downloading brand 46
Time Used: 3.837007999420166 sec
Downloading brand 49
Time Used: 0.7536768913269043 sec
Downloading brand 51
Time Used: 6.168938875198364 sec
Downloading brand 55
Time Used: 0.8314721584320068 sec
Downloading brand 56
Time Used: 0.7674429416656494 sec
Downloading brand 59
Time Used: 0.733961820602417 sec
Downloading brand 60
Time Used: 0.9114410877227783 sec
Downloading brand 61
Time Used: 4.556144952774048 sec
Downloading brand 63
Time Used: 2.289518117904663 sec
Downloading brand 64
Time Used: 0.4735381603240967 sec
Downloading brand 65
Time Used: 6.214260101318359 sec
Downloading brand 66
Time Used: 1.0727519989013672 sec
Downloading brand 69
Time Used: 0.6294698715209961 sec
Downloading brand 72
Time Used: 2.943419933319092 sec
Downloading brand 73
Time Used: 0.7468550205230713 sec
Downloading brand 75
Time Used: 4.640359878540039 sec
Downloading brand 7

ClientResponseError: 0, message='Attempt to decode JSON with unexpected mimetype: '

In [60]:
PERIOD_OF_TIME = 5
for brandID in brand_ids[82:150]:
    start = time.time()
    print("Downloading brand " + str(brandID))
    df=downloading(brandID)
    print('Time Used:', time.time()-start, 'sec')

Downloading brand 156
Time Used: 6.242364883422852 sec
Downloading brand 163
Time Used: 1.266050100326538 sec
Downloading brand 164
Time Used: 0.6422851085662842 sec
Downloading brand 165
Time Used: 1.0086939334869385 sec
Downloading brand 166
Time Used: 0.5838720798492432 sec
Downloading brand 167
Time Used: 2.173578977584839 sec
Downloading brand 169
Time Used: 0.9458708763122559 sec
Downloading brand 170


ClientConnectorError: [Errno 8] Cannot connect to host api.shopstyle.com:80 ssl:False [nodename nor servname provided, or not known]

In [62]:
PERIOD_OF_TIME = 5
for brandID in brand_ids[89:150]:
    start = time.time()
    print("Downloading brand " + str(brandID))
    df=downloading(brandID)
    print('Time Used:', time.time()-start, 'sec')

Downloading brand 170
Time Used: 2.5766029357910156 sec
Downloading brand 171
Time Used: 8.368900060653687 sec
Downloading brand 172
Time Used: 7.898025035858154 sec
Downloading brand 173
Time Used: 7.501443862915039 sec
Downloading brand 174
Time Used: 2.991771936416626 sec
Downloading brand 175
Time Used: 5.748074054718018 sec
Downloading brand 177
Time Used: 1.9318428039550781 sec
Downloading brand 178
Time Used: 0.9513850212097168 sec
Downloading brand 179
Time Used: 8.788113117218018 sec
Downloading brand 180
Time Used: 2.5404059886932373 sec
Downloading brand 181
Time Used: 0.48406219482421875 sec
Downloading brand 183
Time Used: 1.9030399322509766 sec
Downloading brand 184
Time Used: 2.165540933609009 sec
Downloading brand 185
Time Used: 1.3743338584899902 sec
Downloading brand 189
Time Used: 0.8877639770507812 sec
Downloading brand 191
Time Used: 0.5040149688720703 sec
Downloading brand 192
Time Used: 6.474982023239136 sec
Downloading brand 193
Time Used: 1.1371259689331055 sec

ClientConnectorError: [Errno 8] Cannot connect to host api.shopstyle.com:80 ssl:False [nodename nor servname provided, or not known]

In [66]:
PERIOD_OF_TIME = 5
for brandID in brand_ids[123:150]:
    start = time.time()
    print("Downloading brand " + str(brandID))
    df=downloading(brandID)
    print('Time Used:', time.time()-start, 'sec')

Downloading brand 222
Time Used: 4.068339109420776 sec
Downloading brand 223
Time Used: 1.0415749549865723 sec
Downloading brand 224
Time Used: 0.6218488216400146 sec
Downloading brand 225
Time Used: 5.546773910522461 sec
Downloading brand 226
Time Used: 10.218796014785767 sec
Downloading brand 227
Time Used: 5.770864963531494 sec
Downloading brand 230


ClientConnectorError: [Errno 8] Cannot connect to host api.shopstyle.com:80 ssl:False [nodename nor servname provided, or not known]

In [68]:
PERIOD_OF_TIME = 5
for brandID in brand_ids[129:150]:
    start = time.time()
    print("Downloading brand " + str(brandID))
    df=downloading(brandID)
    print('Time Used:', time.time()-start, 'sec')

Downloading brand 230
Time Used: 10.938329935073853 sec
Downloading brand 231


ClientResponseError: 0, message='Attempt to decode JSON with unexpected mimetype: '

In [69]:
PERIOD_OF_TIME = 5
for brandID in brand_ids[130:150]:
    start = time.time()
    print("Downloading brand " + str(brandID))
    df=downloading(brandID)
    print('Time Used:', time.time()-start, 'sec')

Downloading brand 231
Time Used: 6.482553958892822 sec
Downloading brand 235
Time Used: 0.6615869998931885 sec
Downloading brand 236
Time Used: 0.5822710990905762 sec
Downloading brand 237
Time Used: 2.4836478233337402 sec
Downloading brand 239
Time Used: 0.8990440368652344 sec
Downloading brand 242
Time Used: 1.1296370029449463 sec
Downloading brand 244
Time Used: 1.2698051929473877 sec
Downloading brand 245


ClientConnectorError: [Errno 8] Cannot connect to host api.shopstyle.com:80 ssl:False [nodename nor servname provided, or not known]

In [71]:
PERIOD_OF_TIME = 5
for brandID in brand_ids[137:150]:
    start = time.time()
    print("Downloading brand " + str(brandID))
    df=downloading(brandID)
    print('Time Used:', time.time()-start, 'sec')

Downloading brand 245
Time Used: 2.1249899864196777 sec
Downloading brand 246
Time Used: 2.4467570781707764 sec
Downloading brand 250
Time Used: 0.5765950679779053 sec
Downloading brand 251
Time Used: 3.8200600147247314 sec
Downloading brand 252
Time Used: 6.257076978683472 sec
Downloading brand 253
Time Used: 7.110929012298584 sec
Downloading brand 254
Time Used: 1.5426909923553467 sec
Downloading brand 255
Time Used: 0.966418981552124 sec
Downloading brand 257
Time Used: 1.2923610210418701 sec
Downloading brand 261
Time Used: 1.0700080394744873 sec
Downloading brand 262
Time Used: 1.1138818264007568 sec
Downloading brand 263
Time Used: 6.18603515625 sec
Downloading brand 265


ClientConnectorError: [Errno 8] Cannot connect to host api.shopstyle.com:80 ssl:False [nodename nor servname provided, or not known]

In [73]:
PERIOD_OF_TIME = 5
for brandID in brand_ids[149:200]:
    start = time.time()
    print("Downloading brand " + str(brandID))
    df=downloading(brandID)
    print('Time Used:', time.time()-start, 'sec')

Downloading brand 265
Time Used: 3.0920138359069824 sec
Downloading brand 266
Time Used: 2.4937760829925537 sec
Downloading brand 267
Time Used: 7.978736162185669 sec
Downloading brand 268
Time Used: 1.827200174331665 sec
Downloading brand 269
Time Used: 0.7564330101013184 sec
Downloading brand 270
Time Used: 0.6943340301513672 sec
Downloading brand 273
Time Used: 4.6215760707855225 sec
Downloading brand 278
Time Used: 0.4429450035095215 sec
Downloading brand 280
Time Used: 0.5866591930389404 sec
Downloading brand 282
Time Used: 0.4048941135406494 sec
Downloading brand 283
Time Used: 1.4913511276245117 sec
Downloading brand 284
Time Used: 4.9007649421691895 sec
Downloading brand 286
Time Used: 1.7615599632263184 sec
Downloading brand 288
Time Used: 1.060662031173706 sec
Downloading brand 289
Time Used: 1.8428900241851807 sec
Downloading brand 290
Time Used: 2.5054140090942383 sec
Downloading brand 291
Time Used: 3.4425721168518066 sec
Downloading brand 292
Time Used: 10.927501201629639

ClientConnectorError: [Errno 8] Cannot connect to host api.shopstyle.com:80 ssl:False [nodename nor servname provided, or not known]

In [75]:
PERIOD_OF_TIME = 5
for brandID in brand_ids[167:200]:
    start = time.time()
    print("Downloading brand " + str(brandID))
    df=downloading(brandID)
    print('Time Used:', time.time()-start, 'sec')
    time.sleep(5)

Downloading brand 293
Time Used: 1.1523070335388184 sec
Downloading brand 294
Time Used: 0.9059901237487793 sec
Downloading brand 295
Time Used: 2.9952361583709717 sec
Downloading brand 296


ClientResponseError: 0, message='Attempt to decode JSON with unexpected mimetype: '

In [77]:
PERIOD_OF_TIME = 5
for brandID in brand_ids[170:200]:
    start = time.time()
    print("Downloading brand " + str(brandID))
    df=downloading(brandID)
    print('Time Used:', time.time()-start, 'sec')
    time.sleep(5)

Downloading brand 296
Time Used: 4.959430932998657 sec
Downloading brand 298
Time Used: 6.839138984680176 sec
Downloading brand 299
Time Used: 0.5571999549865723 sec
Downloading brand 300
Time Used: 3.689012050628662 sec
Downloading brand 301
Time Used: 1.104248046875 sec
Downloading brand 306
Time Used: 0.6153090000152588 sec
Downloading brand 307
Time Used: 0.5112438201904297 sec
Downloading brand 308
Time Used: 1.020658016204834 sec
Downloading brand 309


ClientResponseError: 0, message='Attempt to decode JSON with unexpected mimetype: '

In [None]:
PERIOD_OF_TIME = 5
for brandID in brand_ids[178:200]:
    start = time.time()
    print("Downloading brand " + str(brandID))
    df=downloading(brandID)
    print('Time Used:', time.time()-start, 'sec')
    time.sleep(5)

In [78]:
brand_ids.index('309')

178

In [64]:
df_products=pd.read_csv("data/products_brand_Apr.csv", header=0)

In [65]:
len(df_products)

124673

In [46]:
df_products.head()

Unnamed: 0,Product ID,Name,Price,Currency,Retailer ID,Retailer Name,Brand ID,Brand Name,Description,image_small,...,image_Large,image_IPhoneSmall,image_Best,image_Original,image_IPhone,Extract Date,Last Modified,Colors,Sizes,Categories
0,685109155,7 For All Mankind Zip Front Mini Skirt,189.0,USD,105,REVOLVE,3,7 For All Mankind,7 For All Mankind Zip Front Mini Skirt in Navy...,https://img.shopstyle-cdn.com/pim/5e/39/5e39ed...,...,https://img.shopstyle-cdn.com/sim/5e/39/5e39ed...,https://img.shopstyle-cdn.com/mim/5e/39/5e39ed...,https://img.shopstyle-cdn.com/pim/5e/39/5e39ed...,https://img.shopstyle-cdn.com/pim/5e/39/5e39ed...,https://img.shopstyle-cdn.com/mim/5e/39/5e39ed...,2017-11-04,2018-02-26,['Navy'],"['27', '29', '30']",['Mini Skirts']
1,642153271,7 For All Mankind(R) Dojo Wide Leg Jeans,159.0,USD,1,Nordstrom,3,7 For All Mankind,Hand-distressed details add a lived-in look to...,https://img.shopstyle-cdn.com/pim/ad/06/ad061f...,...,https://img.shopstyle-cdn.com/sim/ad/06/ad061f...,https://img.shopstyle-cdn.com/mim/ad/06/ad061f...,https://img.shopstyle-cdn.com/pim/ad/06/ad061f...,https://img.shopstyle-cdn.com/pim/ad/06/ad061f...,https://img.shopstyle-cdn.com/mim/ad/06/ad061f...,2017-05-02,2018-04-07,['Blue'],"['24', '25', '23', '26', '27', '28', '29', '31...",['Distressed Denim']
2,706495263,7 For All Mankind(R) Roxanne Faux Suede Stripe...,229.0,USD,1,Nordstrom,3,7 For All Mankind,Pink faux-suede tuxedo stripes highlight the s...,https://img.shopstyle-cdn.com/pim/d5/74/d574cf...,...,https://img.shopstyle-cdn.com/sim/d5/74/d574cf...,https://img.shopstyle-cdn.com/mim/d5/74/d574cf...,https://img.shopstyle-cdn.com/pim/d5/74/d574cf...,https://img.shopstyle-cdn.com/pim/d5/74/d574cf...,https://img.shopstyle-cdn.com/mim/d5/74/d574cf...,2018-01-21,2018-04-08,['Blue'],"['27', '24', '28', '32', '30', '29', '31']","['Skinny Denim', 'Stretch Denim']"
3,536353752,7 For All Mankind b(air) Ankle Knee Hole Skinny.,179.0,USD,105,REVOLVE,3,7 For All Mankind,7 For All Mankind b(air) Ankle Knee Hole Skinn...,https://img.shopstyle-cdn.com/pim/c1/12/c112d3...,...,https://img.shopstyle-cdn.com/sim/c1/12/c112d3...,https://img.shopstyle-cdn.com/mim/c1/12/c112d3...,https://img.shopstyle-cdn.com/pim/c1/12/c112d3...,https://img.shopstyle-cdn.com/pim/c1/12/c112d3...,https://img.shopstyle-cdn.com/mim/c1/12/c112d3...,2016-08-10,2018-03-31,['Black'],"['24', '25', '26', '27', '28', '29', '30']",['Skinny Denim']
4,704320894,7 For All Mankind Roxanne Ankle Jean.,229.0,USD,105,REVOLVE,3,7 For All Mankind,7 For All Mankind Roxanne Ankle Jean. - size 2...,https://img.shopstyle-cdn.com/pim/bc/86/bc86eb...,...,https://img.shopstyle-cdn.com/sim/bc/86/bc86eb...,https://img.shopstyle-cdn.com/mim/bc/86/bc86eb...,https://img.shopstyle-cdn.com/pim/bc/86/bc86eb...,https://img.shopstyle-cdn.com/pim/bc/86/bc86eb...,https://img.shopstyle-cdn.com/mim/bc/86/bc86eb...,2018-01-31,2018-04-06,['Vintage Blue Dunes'],"['26', '27', '28', '30']",['Skinny Denim']


In [None]:
# delete the brands if len(products)<10