In [110]:
import csv
import re
import time
import urllib
import urllib.request
from urllib.error import HTTPError, URLError
from urllib.parse import quote_plus

from bs4 import BeautifulSoup

In [111]:
def _fetch_soup(opener, url):
    """Open ``url`` with ``opener`` and parse the response with ``html.parser``.

    Returns:
        The parsed BeautifulSoup document, or None when the request raised
        HTTPError/URLError (the error is printed, not re-raised).
    """
    try:
        response = opener.open(url)
        return BeautifulSoup(response, 'html.parser')
    except HTTPError as e:  # must come first: HTTPError subclasses URLError
        print(e)
    except URLError:
        print('The server could not be find.')
    return None


def _append_reviews(url, opener, limit, stars, reviews, titles):
    """Walk a product's review pages, appending parsed reviews to the lists in place.

    Stops when ``limit`` entries are collected, when there is no further
    review page, or as soon as any page fails to download.
    """
    bs = _fetch_soup(opener, url)
    if bs is None:
        return

    # Link at the foot of the product page that leads to the full review list.
    reviews_anchor = bs.find('a', {'data-hook': 'see-all-reviews-link-foot'})
    if reviews_anchor is None or 'href' not in reviews_anchor.attrs:
        print('There is no reviews button')
        return

    bs = _fetch_soup(opener, f"https://www.amazon.com{reviews_anchor.attrs['href']}")
    if bs is None:
        # Do not fall back to parsing the product page as if it were a review page.
        return

    regex = re.compile('^customer_review')

    while len(stars) < limit:

        for tag in bs.find_all('div', {'id': regex}):
            try:  # Reviews not from USA have different tag structure
                # First character of the link title is kept as the rating.
                star = tag.select("div[class='a-row'] > a[class='a-link-normal']")[0].attrs['title'][0]
                review = tag.find('div', {'class': 'a-row a-spacing-small review-data'}).span.span.get_text()
                title = tag.find('a', {'data-hook': 'review-title'}).find('span', class_=None)
            except (AttributeError, IndexError, KeyError, TypeError):
                print('Not from USA')
                continue

            if title is None:  # skip reviews not in English
                continue

            # Commas are stripped because the data is later saved as CSV.
            stars.append(star.replace(',', ''))
            reviews.append(review.replace(',', ''))
            titles.append(title.get_text().replace(',', ''))

        try:  # Go to the next reviews page
            sublink_reviews = list(bs.find('li', {'class': 'a-last'}).children)[0].attrs['href']
        except (AttributeError, IndexError, KeyError):
            print('This is the last reviews page!')
            return

        bs = _fetch_soup(opener, f'https://www.amazon.com{sublink_reviews}')
        if bs is None:
            # A failed download must end the walk; re-parsing the previous
            # page would append duplicates or loop forever.
            return


def loop_reviews(url, opener, limit, stars=None, reviews=None, titles=None):
    """Collect star ratings, review texts and review titles for one product.

    Args:
        url: Product page URL.
        opener: Object with an ``open(url)`` method (e.g. a urllib opener).
        limit: Maximum number of entries in each returned list. The cap counts
            entries already present in the lists passed in.
        stars, reviews, titles: Optional lists to extend. When given they are
            appended to in place; when omitted, fresh lists are used for each
            call (no state is shared between calls).

    Returns:
        Tuple ``(stars, reviews, titles)``, each truncated to ``limit`` items.
        Network failures are printed and end collection early; whatever was
        gathered so far is still returned.
    """
    stars = [] if stars is None else stars
    reviews = [] if reviews is None else reviews
    titles = [] if titles is None else titles

    _append_reviews(url, opener, limit, stars, reviews, titles)

    # Cap on every exit path: the last parsed page may overshoot the limit.
    return stars[:limit], reviews[:limit], titles[:limit]

In [112]:
def _fetch_search_page(opener, url):
    """Open ``url`` and parse it with ``html.parser``; return None if the request fails."""
    try:
        response = opener.open(url)
        return BeautifulSoup(response, 'html.parser')
    except HTTPError as e:  # must come first: HTTPError subclasses URLError
        print(e)
    except URLError:
        print('The server could not be find.')
    return None


def _collect_product_links(bs, links, limit):
    """Append product URLs found on a parsed search page to ``links``, skipping duplicates."""
    for anchor in bs.find_all('a', {'class': 'a-link-normal s-no-outline'}, limit=limit):
        href = anchor.attrs.get('href')
        if href is None:
            continue
        product_url = f"https://www.amazon.com/{href}"
        # Compare the URL string, not the tag: ``links`` holds strings.
        if product_url not in links:
            links.append(product_url)


def search_product(product, opener, limit=50, links=None):
    """Search amazon.com for ``product`` and gather product page URLs.

    Args:
        product: Search phrase; it is URL-encoded before being sent.
        opener: Object with an ``open(url)`` method (e.g. a urllib opener).
        limit: Maximum number of URLs to return.
        links: Optional list of already collected URLs. It is appended to in
            place; when omitted, a fresh list is used for each call.

    Returns:
        A new list with at most ``limit`` URLs. If a page cannot be
        downloaded, the error is printed and the URLs gathered so far are
        returned.
    """
    if links is None:
        links = []

    # quote_plus turns spaces into '+' and escapes characters such as '&'
    # that would otherwise corrupt the query string.
    bs = _fetch_search_page(opener, f'https://www.amazon.com/s?k={quote_plus(product)}')

    while bs is not None:
        _collect_product_links(bs, links, limit)

        if len(links) >= limit:
            break

        next_anchor = bs.find('a', {'class': 's-pagination-next'})
        if next_anchor is None or 'href' not in next_anchor.attrs:
            print("It's the last product page!")
            break

        # Go to next products page; a failed fetch yields None and ends the
        # loop instead of retrying the same URL forever.
        bs = _fetch_search_page(opener, f"https://www.amazon.com/{next_anchor.attrs['href']}")

    return links[:limit]

In [113]:
def scrap(products:list, reviews_count:int=200, limit:int=500):
    """Search for each product and scrape reviews from the resulting pages.

    Args:
        products: Search phrases passed one by one to ``search_product``.
        reviews_count: Maximum number of reviews to collect in total.
        limit: Maximum number of product links to collect in total.

    Returns:
        Tuple ``(stars, reviews, titles)`` of equally indexed lists, each
        holding at most ``reviews_count`` items.
    """
    links = []
    stars = []
    reviews = []
    titles = []

    opener = urllib.request.build_opener()
    # A browser-like User-agent header is sent with every request.
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]

    for product in products:

        if len(links) >= limit:
            print(f'links founded: {links}')
            break

        links = search_product(product, opener, limit=limit, links=links)

    for link in links:
        # Stop as soon as enough reviews are collected; visiting further
        # products would only cost extra requests.
        if len(stars) >= reviews_count:
            break
        stars, reviews, titles = loop_reviews(link, opener=opener, limit=reviews_count, stars=stars, reviews=reviews, titles=titles)

    # Enforce the cap here as well so the result never exceeds reviews_count.
    return stars[:reviews_count], reviews[:reviews_count], titles[:reviews_count]

In [114]:
def save_csv(stars, reviews, titles, file:str='myfile.csv'):
    """Write the scraped columns to a CSV file with a header row.

    Args:
        stars, reviews, titles: Equally indexed sequences; row ``i`` is
            ``(stars[i], reviews[i], titles[i])``. If the lengths differ,
            rows stop at the shortest sequence.
        file: Destination path. An existing file is overwritten.
    """
    # newline='' is what the csv module expects of files it writes to;
    # otherwise Windows gets '\r\r\n' row endings. UTF-8 keeps non-ASCII
    # review text writable regardless of the platform's default encoding.
    with open(file, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.writer(handle)
        writer.writerow(('stars', 'reviews', 'titles'))
        writer.writerows(zip(stars, reviews, titles))

In [115]:
# a = time.time()

# stars, reviews, titles = scrap(['cloths'], reviews_count=1000, limit=5)

# save_csv(stars, reviews, titles)

# b = time.time()

# print(b-a)

# print('just not to print return')