In [55]:
import json
import random
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Browser-like request headers (not authentication) sent with every request,
# presumably so Amazon serves the regular HTML pages instead of rejecting the client.
# The User-Agent is assembled with implicit string concatenation so that no source
# indentation leaks into the header value.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/90.0.4430.212 Safari/537.36'
    ),
    'Accept-Language': 'en-US, en;q=0.5',
}

def get_reviews(keyword, n=5, k=7, timeout=10):
    """
    Get product review info for up to n products matching the given keyword on Amazon.

    One HTTP request is made for the search page plus one per sampled product,
    so large values of n take a long time.

    Args:
        keyword (str): product to search for
        n (int): maximum number of products to get reviews from
        k (int): maximum number of reviews to get per product
        timeout (float): seconds to wait for each HTTP response before giving up

    Returns:
        product_dict (dict[str, list[str]]): product names (keys) mapped to lists of
            cleaned, ASCII-only review texts (values). Search results without a link
            and product pages without a 'productTitle' element are skipped.

    Raises:
        requests.RequestException: if a request fails or times out.
    """
    base_url = 'https://www.amazon.com/'

    # class of the product links on the search results page
    product_link_class = 'a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal'

    # USE THIS CLASS FOR REVIEW TITLES
    titles = 'a-size-base a-link-normal review-title a-color-base review-title-content a-text-bold'

    # USE THIS CLASS FOR REVIEW TEXT
    # (doesn't get the entire review if it's too long, would have to visit each review's page making it way too slow)
    texts = 'a-size-base review-text'

    # set this to either `titles` or `texts`
    review_class = texts

    # compile product names and review info here
    product_dict = {}

    # search for product on amazon; `params` lets requests URL-encode the keyword
    search_html = requests.get(
        urljoin(base_url, 's'),
        params={'k': keyword},
        headers=headers,
        timeout=timeout,
    ).text

    # get the first page of products returned by amazon search
    # (explicit parser: avoids bs4's "guessed parser" warning and environment-dependent results)
    products = BeautifulSoup(search_html, 'html.parser').find_all(class_=product_link_class)

    # get random sample of products (if there are enough of them)
    if len(products) >= n:
        products = random.sample(products, k=n)

    for product in products:
        href = product.get('href')
        if not href:
            # matched element carries no link; nothing to visit
            continue

        # urljoin handles both root-relative ('/dp/...') and absolute hrefs
        product_url = urljoin(base_url, href)
        product_html = requests.get(product_url, headers=headers, timeout=timeout).text

        # parse the product page once and reuse it for the title and the reviews
        product_soup = BeautifulSoup(product_html, 'html.parser')

        title_tag = product_soup.find(id='productTitle')
        if title_tag is None:
            # not a regular product page (e.g. a bot-check page); skip instead of crashing
            continue
        product_title = title_tag.text.strip()

        reviews = product_soup.find_all(class_=review_class)

        # get random sample of reviews (if there are enough of them)
        if len(reviews) >= k:
            reviews = random.sample(reviews, k=k)

        review_data = []
        for r in reviews:
            # clean up review text: flatten newlines, drop the 'Read more' link label
            cleaned = r.text.strip().replace('\n', ' ').replace('Read more', '')
            # drop non-ASCII characters
            review_data.append(cleaned.encode("ascii", "ignore").decode())

        product_dict[product_title] = review_data

    return product_dict
        
   
# Scrape reviews for a sample search term, using the default n and k.
x = get_reviews('soccer ball')

In [56]:
# Serialize the mapping returned by get_reviews to a JSON string; as the last
# expression in the cell, the resulting string is shown as the cell output.
json.dumps(x)

'{"GlowCity Glow in The Dark Soccer Ball- Light Up, Indoor or Outdoor Soccer Balls with 2 LED Lights and Pre-Installed Batteries - Gift Ideas for Teen Boys and Girls\\ufeff (Official (Size 5), Red)": ["So cool, wish I had an image to show. Cool gift for late night practice. Glows well. ", "Couldn\'t recommend this more highly son loved this so much ", "The ball arrived on time for a (42nd!) birthday present and we have been really impressed with it so far.  Great for a kick about at dusk or night time and the fact that you can only see the ball makes it a really good fun ( avoid playing near obstacles / cliff edges ideally ).  Make sure you have a pump and needle adapter as it arrives deflated so you\'ll need to pump it up before you can use it.  Also although this looks like a traditional football it\'s basically a single skin rubber ball with printed panels, this is not a criticism, it\'s a thick skin and I don\'t expect it would pop easily but just be prepared that it sounds a bit l