# 1.Data Collection

The data collection in this project consists of three steps: 

    1.1 Scrape all product URLs

    1.2 Scrape all products' information

    1.3 Scrape all reviews from each product's page

## 1.3 Scrape product reviews

To scrape the reviews, we use API calls. I tried two different APIs:

    1. RapidAPI: this is a pay-per-request API (which is free for the first 500 requests), it has a built-in proxy to avoid being blocked by the server. However, it only allows scraping the first 100 reviews for each product
    
    2. Bazaarvoice API: this is a free API that allows scraping all available reviews. However, we have to make the function sleep between requests, in addition to rotating IP addresses using a VPN, to avoid being blocked by the site. For this reason, it is slower than RapidAPI. 

In this project, I decided to go with the second API with VPN

In [1]:
import json
import math
import random
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def scrape_reviews(p_id):
    """Download every available review of one product from the Bazaarvoice API.

    The API is queried page by page (100 reviews per request, the page size
    used throughout this notebook) until the number of collected reviews
    reaches the ``TotalResults`` reported by the first response.

    Parameters
    ----------
    p_id : str
        Product id used in the ``ProductId`` filter of the request.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(single_product_df, all_reviews_per_product_df)``: the one-row
        product summary built by ``extract_single_product_info`` from the first
        page, and one row per review built by ``extract_single_review``.
        Both are empty frames when nothing could be fetched; if a request
        fails part-way, whatever was collected so far is returned.
    """
    page_size = 100  # reviews requested per call; also used to derive the page count

    url = 'https://api.bazaarvoice.com/data/reviews.json'
    params = {
        'Filter': f'ProductId:{p_id}',
        'Sort': 'Helpfulness:desc',
        'Limit': page_size,
        'Offset': 0,
        'Include': 'Products,Comments',
        'Stats': 'Reviews',
        # NOTE(review): the passkey is hard-coded in the notebook; consider
        # loading it from configuration instead of committing it.
        'passkey': 'rwbw526r2e7spptqd2qzbkp7',
        'apiversion': 5.4
    }

    page = 0
    total_reviews = np.nan  # unknown until the first page has been parsed
    total_pages = 0
    review_frames = []  # one single-row frame per review
    n_reviews = 0
    single_product_df = pd.DataFrame()

    def collected_reviews():
        # pd.concat raises on an empty list, so fall back to an empty frame.
        if not review_frames:
            return pd.DataFrame()
        return pd.concat(review_frames, ignore_index=True)

    while page <= total_pages:  # loops through all pages
        # Stop *before* sending another request once everything is collected.
        # (Comparing against NaN is False, so the first page is always fetched.)
        if n_reviews >= total_reviews:
            print('Total reviews', str(total_reviews), ' p_id: ', p_id, '')
            break

        try:
            # I use a VPN for this but one could use some rotating proxies
            response = requests.get(url, params=params, timeout=15)
        except requests.exceptions.RequestException:
            # Only network-level failures are handled here, so that
            # KeyboardInterrupt and programming errors still propagate.
            print('Cannot connect!')
            return single_product_df, collected_reviews()

        if response.status_code != 200:
            print('Request failed p_id: ', p_id, '')
            break

        soup_dict = response.json()  # parse results

        if page == 0:
            total_reviews = soup_dict['TotalResults']
            total_pages = math.ceil(total_reviews / page_size)
            single_product_df = extract_single_product_info(p_id, soup_dict)

        # extract reviewer information
        for review_in in range(len(soup_dict['Results'])):
            single_review_df = extract_single_review(p_id, review_in, soup_dict)
            review_frames.append(single_review_df)
            n_reviews += single_review_df.shape[0]

        # pause to prevent being blocked from website
        time.sleep(5)
        page += 1
        params['Offset'] = n_reviews

        # Show number of reviews scraped
        print(f'{p_id}: {n_reviews} reviews')

    return single_product_df, collected_reviews()

In [3]:
def extract_single_product_info(p_id, soup_dict):
    """Summarise the review statistics of one product as a one-row DataFrame.

    Parameters
    ----------
    p_id : str
        Product id that was requested; stored in the ``product_id`` column.
    soup_dict : dict
        Parsed JSON response. The keys read are ``TotalResults`` and
        ``Includes['Products'][<id>]['ReviewStatistics']``.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0). Every predefined column starts as NaN and is
        filled only when the corresponding statistic is present. When
        ``soup_dict['Includes']`` is empty the whole row, including
        ``product_id``, stays NaN. Age-group and skin-type values reported by
        the API are used directly as column names, so values outside the
        predefined list add extra columns.
    """
    single_product_dict = (dict.fromkeys(['product_id', 'Total_reviews', 'RecommendedCount','AverageOverallRating', 
                                          '1star', '2star', '3star', '4star', '5star',
                                          'normal', 'combination', 'dry', 'oily', 
                                          'acneConcern', 'agingConcern', 'blackheadsConcern', 'dullnessConcern', 
                                          'rednessConcern', 'sensitivityConcern','stretchmarksConcern', 'celluliteConcern', 
                                          'darkCirclesConcern', 'sunDamageConcern', 'callusesConcern', 
                                          'cuticlesConcern', 'poresConcern', 'unevenSkinTonesConcern',
                                          'nonStaffReviews', 'staffReviews', 'incentivizedReviews',
                                          '13to17', '18to24', '25to34', '35to44', '45to54', 'over54'], np.nan))

    # API skin-concern value -> output column
    skin_concern_columns = {
        'acne': 'acneConcern',
        'aging': 'agingConcern',
        'dullness': 'dullnessConcern',
        'redness': 'rednessConcern',
        'blackheads': 'blackheadsConcern',
        'sensitivity': 'sensitivityConcern',
        'stretchMarks': 'stretchmarksConcern',
        'cellulite': 'celluliteConcern',
        'darkCircles': 'darkCirclesConcern',
        'sunDamage': 'sunDamageConcern',
        'calluses': 'callusesConcern',
        'cuticles': 'cuticlesConcern',
        'pores': 'poresConcern',
        'unevenSkinTones': 'unevenSkinTonesConcern',
    }
    rating_columns = ['1star', '2star', '3star', '4star', '5star']

    if soup_dict['Includes']:  # if this is not empty
        products = soup_dict['Includes']['Products']

        single_product_dict['product_id'] = p_id

        # Sometimes a family of products with different ids share the same
        # page; if the requested id is not listed, read the statistics of the
        # first product that is.
        stats_id = p_id if p_id in products else list(products)[0]
        stats = products[stats_id]['ReviewStatistics']
        # A product without any context breakdown simply keeps NaN counts.
        context = stats.get('ContextDataDistribution', {})

        def value_counts(name):
            """Return {value: count} for one context dimension ({} if absent)."""
            if name not in context:
                return {}
            return {row['Value']: row['Count'] for row in context[name]['Values']}

        single_product_dict['Total_reviews'] = soup_dict['TotalResults']
        single_product_dict['RecommendedCount'] = stats['RecommendedCount']
        single_product_dict['AverageOverallRating'] = stats['AverageOverallRating']

        # review counts by age groups
        single_product_dict.update(value_counts('age'))

        # review counts by ratings (RatingValue 1..5 -> '1star'..'5star')
        for row in stats.get('RatingDistribution', []):
            single_product_dict[rating_columns[row['RatingValue'] - 1]] = row['Count']

        # review counts by skin type
        single_product_dict.update(value_counts('skinType'))

        # reviews written by staff vs. everybody else
        for value, count in value_counts('StaffContext').items():
            column = 'staffReviews' if value in ('true', 'True') else 'nonStaffReviews'
            single_product_dict[column] = count

        # whether reviewers received the product for free
        for value, count in value_counts('IncentivizedReview').items():
            if value in ('true', 'True'):
                single_product_dict['incentivizedReviews'] = count

        # review counts by skin concerns
        concern_counts = value_counts('skinConcerns')
        for concern, column in skin_concern_columns.items():
            if concern in concern_counts:
                single_product_dict[column] = concern_counts[concern]

    single_product = pd.DataFrame(single_product_dict, index = [0])
    
    return single_product

In [4]:
def extract_single_review(p_id, review_in, soup_dict):
    """Extract the attributes of a single review as a one-row DataFrame.

    Parameters
    ----------
    p_id : str
        Product id the reviews were requested for; stored in column ``p_id``.
    review_in : int
        Position of the review inside ``soup_dict['Results']``.
    soup_dict : dict
        Parsed JSON response containing ``Results`` and
        ``Includes['Products']``.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0). Optional attributes that are missing from the
        review stay NaN. If the review's ``ProductId`` is not one of the
        products listed in ``Includes['Products']``, a message is printed and
        an all-NaN placeholder row is returned, so callers that count rows to
        compute the next request offset stay in step with the API.
    """
    # Built before branching so the mismatch path below still has a row to
    # return (it previously failed with an unbound local variable).
    single_review_dict = (dict.fromkeys(['UserNickname', 'Rating', 'ReviewText', 'userSkinType', 'userEyeColor', 
                                 'isSephoraStaff', 'isVerifiedPurchase','userHairColor', 'userSkinTone', 
                                 'isIncentivizedReview','p_id','author_id', 'TotalPositiveFeedbackCount', 
                                 'TotalNegativeFeedbackCount', 'TotalFeedbackCount', 'userSkinConcern'], np.nan))

    # key inside the review's ContextDataValues -> output column
    context_columns = {
        'skinConcerns': 'userSkinConcern',
        'skinType': 'userSkinType',
        'eyeColor': 'userEyeColor',
        'skinTone': 'userSkinTone',
        'StaffContext': 'isSephoraStaff',            # if True, reviewer is a Sephora staff
        'VerifiedPurchaser': 'isVerifiedPurchase',
        'hairColor': 'userHairColor',
        'IncentivizedReview': 'isIncentivizedReview',  # if true, received the product as a promotion
    }
    # optional top-level review fields copied under the same column name
    optional_fields = ('ReviewText', 'TotalPositiveFeedbackCount',
                       'TotalNegativeFeedbackCount', 'TotalFeedbackCount')

    review = soup_dict['Results'][review_in]

    if review['ProductId'] in soup_dict['Includes']['Products']:
        single_review_dict['UserNickname'] = review['UserNickname']
        single_review_dict['Rating'] = review['Rating']
        single_review_dict['p_id'] = p_id
        single_review_dict['author_id'] = review['AuthorId']

        for field in optional_fields:
            if field in review:
                single_review_dict[field] = review[field]

        context = review['ContextDataValues']
        for context_key, column in context_columns.items():
            if context_key in context:
                single_review_dict[column] = context[context_key]['Value']
    else: 
        print( 'p_id ', p_id, ' doesnt match review product_id')
        
    single_review_df = pd.DataFrame(single_review_dict, index = [0])
    
    return single_review_df

In [5]:
# Load the list of products whose reviews will be scraped.
# NOTE(review): the original comment said "at least 100 reviews" while the file name says
# "less_than_100_reviews" -- confirm which subset this file actually holds.
data = pd.read_csv(
    'Raw_data/Products_to_scrap_reviews_less_than_100_reviews.csv', index_col=0
).reset_index()  # move the CSV's first column back into a regular column; rows get 0..n-1 labels
data.head()

Unnamed: 0,index,brand_name,product_name,price,size,quick_look_description,category,url,sku,quick_sale_description,...,product_id,no_reviews,rating,short_description,long_description,usage,ingredients,highlights,special_category,unique_product_name
0,52,Supergoop!,Mini PLAY Everyday Sunscreen Lotion SPF 30 PA++++,$32.00,5.5 oz/ 162 mL,"A fast-absorbing, non-greasy, water and sweat-...",sunscreen-sun-protection,https://www.sephora.com/product/supergoop-play...,2329175,Shop Supergoop!’s PLAY Everyday Lotion SPF 30 ...,...,P454385,52,4.5,"<b>What it is: </b> A fast-absorbing, non-grea...","<b>What it is: </b> A fast-absorbing, non-grea...",<b>Suggested Usage:</b><br>-Apply generously a...,-Sunflower Extract: Rich in beta carotene and...,"['Clean at Sephora', 'SPF']",[{'seoName': 'clean-skin-care'}],Mini PLAY Everyday Sunscreen Lotion SPF 30 PA+...
1,53,Supergoop!,PLAY Body Sunscreen Mousse SPF 50,$34.00,6.5 oz/ 181 mL,"A lightweight, whipped, antioxidant-rich, reef...",sunscreen-sun-protection,https://www.sephora.com/product/supergoop-play...,2346138,Shop Supergoop!’s PLAY Body Mousse SPF 50 with...,...,P456407,21,4.0476,"<b>What it is: </b> A lightweight, whipped, an...","<b>What it is: </b> A lightweight, whipped, an...",<b>Suggested Usage:</b><br>-Apply generously a...,-Blue Sea Kale: A powerful antioxidant that h...,"['Clean at Sephora', 'SPF']",[{'seoName': 'clean-skin-care'}],PLAY Body Sunscreen Mousse SPF 50 Supergoop!
2,102,Farmacy,Honey Halo Moisturizer Jumbo,$68.00,3.4 oz/ 100 mL,"An intense moisturizer that replenishes dry, d...",moisturizing-cream-oils-mists,https://www.sephora.com/product/farmacy-honey-...,2527638,What it is: An intense moisturizer that reple...,...,P475540,4,4.75,<b>What it is: </b>An intense moisturizer tha...,<b>What it is: </b>An intense moisturizer tha...,<b>Suggested Usage:<br></b>- Use in the mornin...,-Buckwheat Honey with Propolis and Royal Jelly...,"['Good for: Anti-Aging', 'Good for: Dryness', ...",[{'seoName': 'clean-skin-care'}],Honey Halo Moisturizer Jumbo Farmacy
3,103,Fenty Skin,Instant Reset Brightening Overnight Recovery G...,$36.00,1.7 oz/ 50 mL Refill,"A noncomedogenic, benefit-packed, luxe, gel-cr...",moisturizing-cream-oils-mists,https://www.sephora.com/product/fenty-skin-rih...,2418911,"What it is: A noncomedogenic, benefit-packed,...",...,P476496,1,5.0,"<b>What it is: </b>A noncomedogenic, benefit-...","<b>What it is: </b>A noncomedogenic, benefit-...","Suggested Usage:<br>-Use on dry, cleansed skin...",-Kalahari Melon Oil (Wild Watermelon): Rich in...,"['Hydrating', 'Good for: Loss of firmness', 'H...",[{'seoName': 'clean-skin-care'}],Instant Reset Brightening Overnight Recovery G...
4,112,The INKEY List,C-50 Blemish Night Treatment,$12.99,1 oz/ 30 mL,"An overnight treatment, with its powerful comb...",moisturizing-cream-oils-mists,https://www.sephora.com/product/the-inkey-list...,2425080,Shop The INKEY List’s C-50 Blemish Night Treat...,...,P469544,39,4.0769,"<b>What it is: </b>An overnight treatment, wi...","<b>What it is: </b>An overnight treatment, wi...",<b>Suggested Usage:<br></b>-To be used in the ...,-5% STAY&reg;-C 50: Prevents and reduces the l...,"['Good for: Acne/Blemishes', 'Clean at Sephora...",[{'seoName': 'clean-skin-care'}],C-50 Blemish Night Treatment The INKEY List


In [6]:
# Empty accumulators filled by the scraping loop:
#   all_products_df -- product information, one row per scraped product
#   all_reviews_df  -- all reviews for all products
all_products_df, all_reviews_df = pd.DataFrame(), pd.DataFrame()

In [None]:
# Loop over the products and scrape the statistics and reviews of each one.
# The run is split into portions (adjust the start position below) so that an error while
# pulling data from the web does not force a restart from the beginning.
# The accumulators are updated after every product, so results gathered so far survive an
# interrupted run.

for p_id in data['product_id'].iloc[200:]:
    single_product_df, all_reviews_per_product_df = scrape_reviews(p_id)  # scrapes all reviews for a single product
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    all_products_df = pd.concat([all_products_df, single_product_df], ignore_index=True)  # adds product information
    all_reviews_df = pd.concat([all_reviews_df, all_reviews_per_product_df], ignore_index=True)  # adds the product's reviews
    

### 1.3.1 Write product reviews and product information to csv file

In [None]:
# Write the product statistics to a new CSV file (index and header row included).
# Note that products_information_with_review_stats_sephora_1 and 2 have different numbers of columns.
all_products_df.to_csv('Raw_data/products_information_with_review_stats_sephora_All.csv', header=True)
#all_reviews_df.to_csv(path_or_buf= 'Raw_data/product_reviews_400.csv', header = True)

In [None]:
# Append the new rows to the existing CSV files (mode 'a' appends instead of overwriting).
# No header and no index are written, so the columns of these frames must line up with the
# columns already present in the files.
all_products_df.to_csv(path_or_buf='Raw_data/products_information_with_review_stats_sephora.csv',
                       mode='a', header=False, index=False)
all_reviews_df.to_csv('Raw_data/product_reviews.csv', mode='a', header=False, index=False)

In [None]:
# Preview the first rows of the scraped product information.
all_products_df.head()

In [None]:
# Preview the first rows of the scraped reviews.
all_reviews_df.head()

### 1.3.2 Backup code

#### 1.3.2.1 Get reviews using RapidAPI

In [None]:
# Backup approach: get reviews using RapidAPI, looping through all the products.
# RapidAPI is a full-service solution, so no proxy is needed. It is not free, and it can
# only scrape 100 reviews per product.
url = "https://sephora.p.rapidapi.com/reviews/list"

# The headers are the same for every request, so they are built once.
# NOTE(review): the API key is hard-coded in the notebook; consider loading it from
# configuration instead of committing it.
headers = {
    'x-rapidapi-host': "sephora.p.rapidapi.com",
    'x-rapidapi-key': "097df65f24mshcb8c73047ad42eap1a0d4cjsnb4c8855cc82c"
}

# The loop is disabled on purpose (range(0)); use range(data.shape[0]) to run it.
for prod_in in range(0):
    # 'product_id' is the column name used by the rest of this notebook.
    p_id = data['product_id'][prod_in]

    querystring = {"ProductId": p_id, "Limit":"100","Offset":"0"}
    response = requests.request("GET", url, headers=headers, params=querystring)

    # convert text content to dict
    soup_dict = response.json()

    if soup_dict['HasErrors']:
        print ('Request failed p_id: ', p_id, '')
        continue

    single_product_df = extract_single_product_info(p_id, soup_dict)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    all_products_df = pd.concat([all_products_df, single_product_df], ignore_index=True)

    # extract reviewer information
    review_frames = [extract_single_review(p_id, review_in, soup_dict)
                     for review_in in range(len(soup_dict['Results']))]
    if review_frames:
        all_reviews_df = pd.concat([all_reviews_df, *review_frames], ignore_index=True)
        

#### 1.3.2.2 Check a single product

In [None]:
# Debug a single product using the Bazaarvoice API.
# Same procedure as scrape_reviews, written inline and limited to the first page so that
# the intermediate variables (response, soup_dict, ...) can be inspected afterwards.
p_id = data['product_id'][53]
url = 'https://api.bazaarvoice.com/data/reviews.json'
params = {
    'Filter': f'ProductId:{p_id}',
    'Sort': 'Helpfulness:desc',
    'Limit': 100,
    'Offset': 0, 
    'Include': 'Products,Comments',
    'Stats': 'Reviews',
    'passkey': 'rwbw526r2e7spptqd2qzbkp7',
    'apiversion': 5.4
}
page = 0
total_reviews = np.nan  # unknown until the first page has been parsed
total_pages = 0
all_reviews_per_product_df = pd.DataFrame()
single_product_df = pd.DataFrame()
review_frames = []  # one single-row frame per review

while page <= 0: #total_pages:
    try:
        response = requests.get(url, params=params, timeout=15) # I use a VPN for this but one could use some rotating proxies
    except requests.exceptions.RequestException:
        print('Cannot connect!')
        # Without a response there is nothing to inspect; falling through would fail on
        # the status check below because `response` was never assigned.
        break

    # break if we have an error or have all the reviews
    if (response.status_code != 200):
        print('Request failed p_id: ', p_id, '')
        break
    elif (all_reviews_per_product_df.shape[0] >= total_reviews):
        print('Total reviews', str(total_reviews), ' p_id: ', p_id, '')
        break
    else:
        soup_dict = response.json() #parse results

        if page == 0:
            total_reviews = soup_dict['TotalResults']
            total_pages = math.ceil(total_reviews/100)
            single_product_df = extract_single_product_info(p_id, soup_dict)

        # extract reviewer information
        for review_in in range(len(soup_dict['Results'])): 
            review_frames.append(extract_single_review(p_id, review_in, soup_dict))
        # DataFrame.append was removed in pandas 2.0; rebuild the frame with pd.concat.
        if review_frames:
            all_reviews_per_product_df = pd.concat(review_frames, ignore_index=True)

        # pause to prevent being blocked from website
        time.sleep(10)
        page += 1
        params['Offset'] = all_reviews_per_product_df.shape[0]

        # Show number of reviews scraped
        print(f'{p_id}: {all_reviews_per_product_df.shape[0]} reviews')

#### 1.3.2.3 Rotating proxies

In [None]:
# Credit: adapted from a function published by scrapingbee.com.
# Use this to rotate among a list of proxies if not using a VPN.
ip_addresses = [ "mysuperproxy.com:5000", "mysuperproxy.com:5001", "mysuperproxy.com:5100", "mysuperproxy.com:5010", "mysuperproxy.com:5050", "mysuperproxy.com:8080", "mysuperproxy.com:8001", 
"mysuperproxy.com:8000", "mysuperproxy.com:8050" ]


def proxy_request(request_type, url, **kwargs):
    """Send an HTTP request through a randomly chosen proxy from ``ip_addresses``.

    A new proxy is picked and the request is retried every time the request
    fails with a ``requests`` error; there is no retry limit, so the call only
    returns once some proxy has answered.

    Parameters
    ----------
    request_type : str
        HTTP method, e.g. ``"GET"``.
    url : str
        Address to request.
    **kwargs
        Forwarded unchanged to ``requests.request``. ``proxies`` and
        ``timeout`` are set by this function and must not be passed.

    Returns
    -------
    requests.Response
        The response of the first attempt that did not raise.
    """
    while True:
        proxy = random.choice(ip_addresses)
        proxies = {"http": proxy, "https": proxy}
        try:
            response = requests.request(request_type, url, proxies=proxies, timeout=5, **kwargs)
        except requests.exceptions.RequestException:
            # Only request failures trigger a retry, so Ctrl-C can still stop the loop.
            print("Error, looking for another proxy")
            continue
        print(f"Proxy currently being used: {proxies['https']}")
        return response

#### 1.3.2.4 Extract_single_product_info older version

In [None]:
# this function extracts the product information
# older version

def extract_single_product_info_ver1(p_id, soup_dict):
    """Summarise one product's review statistics as a one-row DataFrame (older version).

    Args:
        p_id: Product id that was requested. It may be the id of a single
            product on the page or a family/expand id shared by several.
        soup_dict: Parsed JSON response of the reviews API call. The function
            reads ``TotalResults`` and ``Includes -> Products``.

    Returns:
        A ``pandas.DataFrame`` with exactly one row (index 0). ``product_id``
        always holds the requested ``p_id``; every other column stays NaN
        unless the response provides a value for it. Age-group and skin-type
        values returned by the API are written under their own names, so
        unexpected values add extra columns.

    When ``p_id`` matches neither a product on the page nor a family/expand id,
    a message is printed and the all-NaN row is returned.
    """
    products = soup_dict.get('Includes', {}).get('Products', {})
    # sometimes a family of products with different p_id share the same page
    shared_prod_id = list(products)

    # Family / expand ids advertised by the first product listed on the page.
    fam_prod_id = []
    if shared_prod_id:
        attributes = products[shared_prod_id[0]].get('Attributes', {})
        fam_prod_id = [attributes[name]['Values'][0]['Value']
                       for name in ('BV_FE_FAMILY', 'BV_WB_FAMILY', 'BV_FE_EXPAND', 'BV_WB_EXPAND')
                       if name in attributes]

    single_product_dict = (dict.fromkeys(['product_id', 'Total_reviews', 'RecommendedCount','AverageOverallRating',
                                          '1star', '2star', '3star', '4star', '5star',
                                          'normal', 'combination', 'dry', 'oily',
                                          'acneConcern', 'agingConcern', 'blackheadsConcern', 'dullnessConcern',
                                          'rednessConcern', 'sensitivityConcern','stretchmarksConcern', 'celluliteConcern',
                                          'darkCirclesConcern', 'sunDamageConcern', 'callusesConcern',
                                          'cuticlesConcern', 'poresConcern', 'unevenSkinTonesConcern',
                                          'nonStaffReviews', 'staffReviews', 'incentivizedReviews',
                                          '13to17', '18to24', '25to34', '35to44', '45to54', 'over54'], np.nan))

    # Record the id that was asked for, even if statistics come from a sibling id below.
    single_product_dict['product_id'] = p_id

    if p_id not in shared_prod_id and p_id not in fam_prod_id:
        print( 'p_id ', p_id, ' doesnt match product id')
        return pd.DataFrame(single_product_dict, index = [0])

    # if p_id is the family id, take the first id on the page
    lookup_id = shared_prod_id[0] if p_id in fam_prod_id else p_id

    stats = products[lookup_id]['ReviewStatistics']
    distribution = stats.get('ContextDataDistribution', {})

    single_product_dict['Total_reviews'] = soup_dict['TotalResults']
    single_product_dict['RecommendedCount'] = stats['RecommendedCount']
    single_product_dict['AverageOverallRating'] = stats['AverageOverallRating']

    # review counts by age groups
    single_product_dict.update(_context_counts(distribution, 'age'))

    # review counts by ratings (RatingValue n is stored in the 'nstar' column)
    ratings = ['1star', '2star', '3star', '4star', '5star']
    for row in stats.get('RatingDistribution', []):
        single_product_dict[ratings[row['RatingValue'] - 1]] = row['Count']

    # review counts by skin type
    single_product_dict.update(_context_counts(distribution, 'skinType'))

    # reviews by staff counts: 'true'/'True' is staff, anything else is non-staff
    for value, count in _context_counts(distribution, 'StaffContext').items():
        if value in ('true', 'True'):
            single_product_dict['staffReviews'] = count
        else:
            single_product_dict['nonStaffReviews'] = count

    # whether reviewers received the product for free (only the true bucket is kept)
    for value, count in _context_counts(distribution, 'IncentivizedReview').items():
        if value in ('true', 'True'):
            single_product_dict['incentivizedReviews'] = count

    # review counts by skin concerns
    concern_counts = _context_counts(distribution, 'skinConcerns')
    for concern, column in _SKIN_CONCERN_COLUMNS.items():
        if concern in concern_counts:
            single_product_dict[column] = concern_counts[concern]

    return pd.DataFrame(single_product_dict, index = [0])


# API skin-concern value -> output column name.
_SKIN_CONCERN_COLUMNS = {
    'acne': 'acneConcern',
    'aging': 'agingConcern',
    'dullness': 'dullnessConcern',
    'redness': 'rednessConcern',
    'blackheads': 'blackheadsConcern',
    'sensitivity': 'sensitivityConcern',
    'stretchMarks': 'stretchmarksConcern',
    'cellulite': 'celluliteConcern',
    'darkCircles': 'darkCirclesConcern',
    'sunDamage': 'sunDamageConcern',
    'calluses': 'callusesConcern',
    'cuticles': 'cuticlesConcern',
    'pores': 'poresConcern',
    'unevenSkinTones': 'unevenSkinTonesConcern',
}


def _context_counts(distribution, name):
    """Return {value: count} for one context-distribution entry, or {} when it is absent."""
    return {row['Value']: row['Count']
            for row in distribution.get(name, {}).get('Values', [])}

#### 1.3.2.5 Only scrape product information and skip the reviews

In [30]:
# Request the first page of reviews for every product id in `data` and keep
# only the product-level summary row, skipping the individual reviews.
all_products_df = pd.DataFrame()
url = 'https://api.bazaarvoice.com/data/reviews.json'

# Iterate the column values directly so the loop does not depend on `data`
# having a 0..n-1 index.
for p_id in data['product_id']:
    params = {
        'Filter': f'ProductId:{p_id}',
        'Sort': 'Helpfulness:desc',
        'Limit': 100,
        'Offset': 0,
        'Include': 'Products,Comments',
        'Stats': 'Reviews',
        'passkey': 'rwbw526r2e7spptqd2qzbkp7',
        'apiversion': 5.4
    }

    try:
        response = requests.get(url, params=params, timeout=15) # I use a VPN for this but one could use some rotating proxies
    except requests.exceptions.RequestException:
        # Stop here: continuing would reuse the previous product's response
        # (or hit an undefined `response` on the first product).
        print('Cannot connect!')
        break

    # stop scraping as soon as the server rejects a request
    if response.status_code != 200:
        print('Request failed p_id: ', p_id, '')
        break

    soup_dict = response.json() #parse results
    single_product_df = extract_single_product_info(p_id, soup_dict)
    print('p_id: ', p_id)

    if len(single_product_df['product_id']) >= 1:
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        # Concatenating inside the loop keeps partial results in all_products_df
        # if the cell is interrupted.
        all_products_df = pd.concat([all_products_df, single_product_df], ignore_index=True)
            

p_id:  P454385
p_id:  P456407
p_id:  P475540
p_id:  P476496
p_id:  P469544
p_id:  P469518
p_id:  P469517
p_id:  P445416
p_id:  P475172
p_id:  P472021
p_id:  P469526
p_id:  P454029
p_id:  P473723
p_id:  P474968
p_id:  P477812
p_id:  P448925
p_id:  P442734
p_id:  P472466
p_id:  P475112
p_id:  P468229
p_id:  P444216
p_id:  P468834
p_id:  P422260
p_id:  P470032
p_id:  P474964
p_id:  P476677
p_id:  P442963
p_id:  P456194
p_id:  P476570
p_id:  P460626
p_id:  P458759
p_id:  P440484
p_id:  P468375
p_id:  P474967
p_id:  P474121
p_id:  P472341
p_id:  P474122
p_id:  P428646
p_id:  P376135
p_id:  P422334
p_id:  P474108
p_id:  P473160
p_id:  P415703
p_id:  P423163
p_id:  P404015
p_id:  P435801
p_id:  P430338
p_id:  P474111
p_id:  P392892
p_id:  P410818
p_id:  P449856
p_id:  P448927
p_id:  P429504
p_id:  P403455
p_id:  P449164
p_id:  P422428
p_id:  P409816
p_id:  P397665
p_id:  P448921
p_id:  P422848
p_id:  P448910
p_id:  P405825
p_id:  P380000
p_id:  P448175
p_id:  P469833
p_id:  P387619
p_id:  P42

p_id:  P458900
p_id:  P296413
p_id:  P270607
p_id:  P441865
p_id:  P414661
p_id:  P458961
p_id:  P454085
p_id:  P462379
p_id:  P422070
p_id:  P467756
p_id:  P467613
p_id:  P122661
p_id:  P467112
p_id:  P423123
p_id:  P462344
p_id:  P465368
p_id:  P459143
p_id:  P385674
p_id:  P407444
p_id:  P448554
p_id:  P456569
p_id:  P4032
p_id:  P423690
p_id:  P9939
p_id:  P397310
p_id:  P428422
p_id:  P416826
p_id:  P440489
p_id:  P440491
p_id:  P388200
p_id:  P401570
p_id:  P417609
p_id:  P456147
p_id:  P442546
p_id:  P440496
p_id:  P448546
p_id:  P421953
p_id:  P416144
p_id:  P442545
p_id:  P460854
p_id:  P406544
p_id:  P455364
p_id:  P478513
p_id:  P460723
p_id:  P467647
p_id:  P455221
p_id:  P467648
p_id:  P472064
p_id:  P444718
p_id:  P478514
p_id:  P445445
p_id:  P460702
p_id:  P461668
p_id:  P426836
p_id:  P478030
p_id:  P406924
p_id:  P91627361
p_id:  P58978876
p_id:  P436359
p_id:  P442566
p_id:  P469521
p_id:  P416816
p_id:  P429954
p_id:  P479353
p_id:  P441841
p_id:  P462343
p_id:  P47

p_id:  P422264
p_id:  P443837
p_id:  P423130
p_id:  P474080
p_id:  P461947
p_id:  P12336
p_id:  P397890
p_id:  P471009
p_id:  P436346
p_id:  P397624
p_id:  P472031
p_id:  P382204
p_id:  P428658
p_id:  P432253
p_id:  P397622
p_id:  P430813
p_id:  P473148
p_id:  P472024
p_id:  P472342
p_id:  P379707
p_id:  P471102
p_id:  P433522
p_id:  P440500
p_id:  P458959
p_id:  P375864
p_id:  P421950
p_id:  P436387
p_id:  P458958
p_id:  P442748
p_id:  P421766
p_id:  P217513
p_id:  P410657
p_id:  P174633
p_id:  P407645
p_id:  P470033
p_id:  P467118
p_id:  P433444
p_id:  P470541
p_id:  P469118
p_id:  P392608
p_id:  P397625
p_id:  P433962
p_id:  P415620
p_id:  P456414
p_id:  P443305
p_id:  P420660
p_id:  P411360
p_id:  P399622
p_id:  P426080
p_id:  P416538
p_id:  P405032
p_id:  P442850
p_id:  P173622
p_id:  P417115
p_id:  P456424
p_id:  P381021
p_id:  P415618
p_id:  P378219
p_id:  P440284
p_id:  P440285
p_id:  P448711
p_id:  P442859
p_id:  P446105
p_id:  P442547
p_id:  P417605
p_id:  P448712
p_id:  P432

p_id:  P472311
p_id:  P476501
p_id:  P471033
p_id:  P469844
p_id:  P471042
p_id:  P468161
p_id:  P470255
p_id:  P461137
p_id:  P472454
p_id:  P474854
p_id:  P472319
p_id:  P470007
p_id:  P443352
p_id:  P468233
p_id:  P476860
p_id:  P460019
p_id:  P442842
p_id:  P448194
p_id:  P476515
p_id:  P458965
p_id:  P446638
p_id:  P474956
p_id:  P460512
p_id:  P461555
p_id:  P471549
p_id:  P469541
p_id:  P443349
p_id:  P457275
p_id:  P407387
p_id:  P456216
p_id:  P474376
p_id:  P476538
p_id:  P472452
p_id:  P476568
p_id:  P476447
p_id:  P472456
p_id:  P428644
p_id:  P476553
p_id:  P442002
p_id:  P472167
p_id:  P437988
p_id:  P428643
p_id:  P472180
p_id:  P468149
p_id:  P405826
p_id:  P476554
p_id:  P386763
p_id:  P475558
p_id:  P471547
p_id:  P437509
p_id:  P386765
p_id:  P470533
p_id:  P386759
p_id:  P425868
p_id:  P441867
p_id:  P463371
p_id:  P459131
p_id:  P475557
p_id:  P472022
p_id:  P400211
p_id:  P470126
p_id:  P406081
p_id:  P432272
p_id:  P425603
p_id:  P459133
p_id:  P444967
p_id:  P45