# Example of Multithreaded scraping 

This is an example of setting up a web scrape of multiple product reviews from Amazon. At this point we have already scraped a bunch of products from Amazon, and now we want to scrape the reviews left for each of the products. 

## Scraping Reviews

Because I know from the previous scrape how many reviews each product has, and that each product review page holds 10 reviews, I can work out every review page URL up front and use parallel requests to scrape the reviews concurrently rather than sequentially.

In [1]:
#Import modules for multi threading and scraping
import multiprocessing as mp
import threading
import time
import requests
from lxml import html  
from fake_useragent import UserAgent
ua = UserAgent()

In [2]:
# helper functions for picking out the salient details from a review block


def get_asin(review):
    """Return the last 10 characters of the review-title link's href.

    That trailing slice is what gets stored as the product ASIN. Raises
    ``IndexError`` when the review block has no review-title link.
    """
    title_hrefs = review.xpath(".//a[@data-hook='review-title']/@href")
    first_href = title_hrefs[0]
    return first_href[-10:]

def get_review_id(review):
    """Return the value of the review block's own ``id`` attribute.

    Raises ``IndexError`` when the attribute lookup yields nothing.
    """
    id_values = review.xpath("@id")
    return id_values[0]


def get_stars(review):
    """Return the first character of the review's star-rating text.

    NOTE(review): presumably the leading digit of text such as
    "4.0 out of 5 stars" - confirm against a live page.
    Raises ``IndexError`` when no star-rating text is found.
    """
    rating_texts = review.xpath(".//i[@data-hook='review-star-rating']//text()")
    rating_text = rating_texts[0]
    return rating_text[0]


def get_title(review):
    """Return the first text node found inside the review-title link.

    Raises ``IndexError`` when the review block has no title text.
    """
    title_texts = review.xpath(".//a[@data-hook='review-title']//text()")
    return title_texts[0]


def get_comment(review):
    """Return the first text node of the review body, or a placeholder.

    Returns:
        The first text node under the ``review-body`` span, or the sentinel
        string ``"QQQQQQQQQ"`` when the review has no body text.
    """
    # Run the XPath query once and reuse the result for both the emptiness
    # check and the lookup.
    body_texts = review.xpath(".//span[@data-hook='review-body']//text()")
    if not body_texts:
        return "QQQQQQQQQ"
    return body_texts[0]


def get_author(review):
    """Return the author identifier taken from the review-author link.

    Returns:
        The href of the first ``review-author`` link with its first 26
        characters removed, or ``0`` when there is no such link or the href
        is not longer than 26 characters.
    """
    # NOTE(review): 26 is presumably the length of a fixed profile-URL prefix
    # such as "/gp/profile/amzn1.account." - confirm against a live page.
    prefix_length = 26

    # Run the XPath query once and reuse the result.
    author_hrefs = review.xpath(".//a[@data-hook='review-author']/@href")
    if author_hrefs and len(author_hrefs[0]) > prefix_length:
        return author_hrefs[0][prefix_length:]
    return 0


def get_date(review):
    """Return the review-date text with its first three characters dropped.

    NOTE(review): the dropped characters are presumably a leading "on " -
    confirm against a live page.
    Raises ``IndexError`` when the review block has no date text.
    """
    date_texts = review.xpath(".//span[@data-hook='review-date']//text()")
    raw_date = date_texts[0]
    return raw_date[3:]


def get_verified(review):
    """Return the text of the review's ``avp-badge`` span, or ``0`` if absent.

    Returns:
        The first text node under the ``avp-badge`` span when the badge is
        present, otherwise the integer ``0``.
    """
    # Run the XPath query once and reuse the result.
    badge_texts = review.xpath(".//span[@data-hook='avp-badge']//text()")
    if not badge_texts:
        return 0
    return badge_texts[0]


def get_helpful_count(review):
    """Return how many people found the review helpful, as an integer.

    The count is read from the first word of the ``helpful-vote-statement``
    text. The word ``"One"`` maps to ``1``; numeric words are converted to
    ``int`` (thousands separators are stripped).

    Returns:
        An ``int`` count; ``0`` when the review has no helpful-vote statement
        or the statement is blank. If the first word is neither ``"One"`` nor
        a number, it is returned unchanged as a string.
    """
    helpful_texts = review.xpath(".//span[@data-hook='helpful-vote-statement']//text()")
    if not helpful_texts:
        return 0

    words = helpful_texts[0].split()
    if not words:
        # Whitespace-only statement: nothing to count.
        return 0

    score = words[0]
    if score == "One":
        return 1
    try:
        # Return a real number so the column is not a mix of ints and strings.
        return int(score.replace(",", ""))
    except ValueError:
        # Unknown wording: hand back the raw token rather than guessing.
        return score


def get_image_count(review):
    """Return the number of ``review-image-tile`` images in the review block.

    Returns:
        The count of matching ``img`` elements; ``0`` when there are none.
    """
    # A single query is enough: the length of an empty result is already 0.
    return len(review.xpath(".//img[@data-hook='review-image-tile']"))


def get_author_status(review):
    """Return the text of the badge that follows the author name, or "none".

    The badge is the ``span`` sibling, with the specific badge class list
    below, that follows the ``review-author`` span.

    Returns:
        The first matching badge text, or the string ``"none"`` when the
        author has no such badge.
    """
    xpath_status = ".//span[@data-hook='review-author']/following-sibling::span[@class='a-size-mini a-color-link c7yBadgeAUI c7yTopDownDashedStrike c7y-badge-text a-text-bold']/text()"
    # Run the XPath query once and reuse the result.
    badge_texts = review.xpath(xpath_status)
    if not badge_texts:
        return "none"
    return badge_texts[0]
    
def get_video_block(review):
    """Return 1 if the review block contains a video block, otherwise 0.

    A video block is a ``div`` whose id starts with ``video-block`` at the
    fixed ``div/div/span/div`` path below the review element.
    """
    video_blocks = review.xpath("div/div/span/div[starts-with(@id,'video-block')]")
    return 0 if video_blocks == [] else 1
    

Next I will define the function that goes through each review on a review page and extracts its features into a dictionary.

In [3]:
def get_reviews_2(page, timeout=30):
    """
    Will return a dictionary of all review details from a given Amazon review page

    Args:
        page: URL of the review page to request.
        timeout: seconds to wait for the server before giving up (default 30).

    Returns:
        A dictionary mapping each field name ('asin', 'page', 'stars', ...)
        to a list holding one entry per review found on the page. Every list
        is empty when the request fails or the response status is not 200.

    Raises:
        IndexError: if a review block lacks a field whose helper has no
            fallback value (asin, stars, title, date, id).
    """
    review_dict = {
        'asin': [],
        'page': [],
        'stars': [],
        'author': [],
        'date': [],
        'title': [],
        'comment': [],
        'verified': [],
        'helpful': [],
        'pics': [],
        'video': [],
        'comment_id': [],
        'author_status': [],
    }

    # set up the request - use a fake user agent
    headers = {'User-Agent': ua.safari}
    try:
        # without a timeout a stalled connection would block this worker forever
        r = requests.get(page, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # report and skip the page so one bad request does not abort the whole pool run
        print('request error', exc, page)
        return review_dict

    if r.status_code != 200:
        # an error response is not a review page, so there is nothing to parse
        print('status error', r.status_code, page)
        return review_dict

    # parse the text of the response
    parser = html.fromstring(r.text)

    # get the individual review blocks
    xpath_review = "//div[@data-hook='review']"
    reviews = parser.xpath(xpath_review)

    for review in reviews:
        # add returned values to the list within the dictionary
        review_dict['asin'].append(get_asin(review))
        review_dict['page'].append(page)
        review_dict['stars'].append(get_stars(review))
        review_dict['title'].append(get_title(review))
        review_dict['comment'].append(get_comment(review))
        review_dict['author'].append(get_author(review))
        review_dict['date'].append(get_date(review))
        review_dict['comment_id'].append(get_review_id(review))
        review_dict['verified'].append(get_verified(review))
        review_dict['helpful'].append(get_helpful_count(review))
        review_dict['author_status'].append(get_author_status(review))
        review_dict['pics'].append(get_image_count(review))
        review_dict['video'].append(get_video_block(review))

    return review_dict

To use multithreading, I will loop through my products dataframe and, for each product, generate the appropriate number of review page requests, based on the fact that each review page holds 10 reviews.

In [4]:
from multiprocessing.pool import ThreadPool

In [5]:
def async_get(urls, max_threads=18):
    """
    Scrape every url in parallel with get_reviews_2 and collect the results.

    Args:
        urls: iterable of review page URLs.
        max_threads: maximum number of worker threads in the pool (default 18).

    Returns:
        A one-element list whose single item is the list of dictionaries
        returned by get_reviews_2, in the same order as urls, i.e.
        ``[[dict, dict, ...]]``. The extra level of nesting is kept so
        existing callers see the same shape.

    Raises:
        Any exception raised by get_reviews_2 in a worker thread is re-raised
        here once the pool has been shut down.
    """
    pool = ThreadPool(max_threads)      # create a thread pool with at most max_threads threads
    try:
        # map blocks until every url has been processed and keeps input order
        page_dicts = pool.map(get_reviews_2, urls)
    finally:
        # always release the worker threads, even when a worker raised
        pool.close()
        pool.join()
    return [page_dicts]                 # wrap in a list to keep the original return shape

## Example Usage

Let's get some review data for one Playmobil product.

In [6]:
# example to generate some product review pages

# base address of the product's review pages; the page number is appended to it
url = "https://www.amazon.co.uk/Playmobil-5568-City-Children´s-Playground/product-reviews/B00IF1VVFO/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber="
# one address per review page, for pages 1 to 17 inclusive
urls = [url + str(page_number) for page_number in range(1, 18)]

In [7]:
# Example Run of the above process
# perf_counter is a monotonic, high-resolution clock, so the measured interval
# is not distorted by system clock adjustments during the run
start = time.perf_counter()

playmobil_reviews = async_get(urls)

end = time.perf_counter()
print(end - start)  # elapsed seconds for the whole scrape

1.3902678489685059


In [8]:

# playmobil_reviews ## uncomment this line to view the list of dictionaries returned