## IMDb User Reviews Scraper
## Hung Pham

As a first step, we import all the necessary libraries:

In [1]:
import pandas as pd #Using panda to create our dataframe
# Import Selenium and its sub libraries
import selenium 
from selenium import webdriver
# Import BS4
import requests #needed to load the page for BS4
from bs4 import BeautifulSoup
import re

Since we chose Chrome as our web browser, we need to download ChromeDriver and tell Selenium where to find it:

In [2]:
# Location of the ChromeDriver executable on disk; this value is handed to
# webdriver.Chrome() every time a browser session is started.
PATH = r"D:\temp\chromedriver.exe"

The `dataset` variable below will hold all of the review information.

In [3]:
# Empty frame that accumulates the scraped reviews: one row per review,
# with the reviewer, the review fields and the movie it belongs to.
dataset = pd.DataFrame(
    columns=[
        'User_name',
        'Review title',
        'Review Rating',
        'Review date',
        'Review_body',
        'Movie_name',
    ]
)


The function below will fetch reviews from a link.

In [4]:
def get_review(url, max_movies=10, max_reviews=20, max_load_more=49):
    """Scrape user reviews for the movies listed on an IMDb search page.

    The listing at *url* is opened with Selenium and its first *max_movies*
    entries are read.  For each entry the user-review page is opened, the
    "load more" button is clicked up to *max_load_more* times, and up to
    *max_reviews* reviews that have every field (title, body, rating, date,
    reviewer) are collected.  The rows are added to the module-level
    ``dataset`` DataFrame together with the movie name.

    Args:
        url: Address of the IMDb listing page to start from.
        max_movies: Number of listing entries to process.
        max_reviews: Number of review containers to read per movie.
        max_load_more: Upper bound on "load more" clicks per review page.

    Returns:
        None.  Results are stored in the global ``dataset``.
    """
    global dataset

    movies = _collect_movies(url, max_movies)

    # Step 2: visit every user-review page and gather its reviews.
    for i, (movie_name, review_url) in enumerate(movies):
        print("collecting data from review page", i)
        review = _collect_reviews(review_url, max_reviews, max_load_more)
        if review.empty:
            continue
        review['Movie_name'] = movie_name
        # DataFrame.append never modified the frame in place (and no longer
        # exists in pandas 2.x), so the result has to be rebound explicitly.
        dataset = pd.concat([dataset, review], ignore_index=True)


def _collect_movies(url, max_movies):
    """Return ``(movie_name, review_url)`` pairs read from the listing at *url*."""
    # Imported locally so the function only relies on the selenium package
    # that the notebook already imports.
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.common.by import By

    found = []
    driver = webdriver.Chrome(PATH)  # start Chrome through the driver at PATH
    try:
        driver.get(url)
        # Element look-ups will now retry for up to one second.
        driver.implicitly_wait(1)

        blocks = driver.find_elements(By.CLASS_NAME, 'lister-item')
        print("looping through", max_movies, "movies in first page")
        for block in blocks[:max_movies]:
            try:
                header = block.find_element(By.CLASS_NAME, 'lister-item-header').text
                order = block.find_element(By.CLASS_NAME, 'lister-item-index').text
                # Strip the leading list position only (not every occurrence
                # of the same text inside the title).
                prefix = order + ' '
                if header.startswith(prefix):
                    header = header[len(prefix):]
                # Drop the trailing year; assumes the header ends with a
                # space plus a 6-character year such as "(2021)".
                name = header[:-7]
                href = block.find_element(By.LINK_TEXT, name).get_attribute('href')
            except WebDriverException:
                # Entry is missing one of the expected elements: skip it.
                continue
            found.append((name, href))
    finally:
        # Always close the browser, even if the page could not be processed.
        driver.quit()
    print("collecting movie title,year,link done")

    print("collecting user review links")
    movies = []
    for name, href in found:
        # Pull the title id (e.g. tt5354160) out of the movie link and build
        # https://www.imdb.com/title/<id>/reviews/ from it.
        match = re.search(r'/title/(tt\d+)', href or '')
        if match is None:
            continue
        movies.append((name, 'https://www.imdb.com/title/' + match.group(1) + '/reviews/'))
    print("user review collection done")
    return movies


def _collect_reviews(review_url, max_reviews, max_load_more):
    """Return a DataFrame of the complete reviews found at *review_url*."""
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.common.by import By

    columns = ['User_name', 'Review title', 'Review Rating', 'Review date', 'Review_body']
    rows = []
    driver = webdriver.Chrome(PATH)
    try:
        driver.get(review_url)
        driver.implicitly_wait(1)

        # Expand the page: keep clicking "load more" until the button can no
        # longer be found or clicked, or the click budget is used up.
        for _ in range(max_load_more):
            try:
                driver.find_element(By.ID, 'load-more-trigger').click()
            except WebDriverException:
                break

        containers = driver.find_elements(By.CLASS_NAME, 'review-container')
        for n, container in enumerate(containers[:max_reviews]):
            print("collecting review ", n)
            try:
                # A review is kept only when every element below is present.
                title = container.find_element(By.CLASS_NAME, 'title').text
                # textContent is used because the visible text may be hidden
                # behind a spoiler toggle.
                body = container.find_element(By.CLASS_NAME, 'content').get_attribute("textContent")
                rating_text = container.find_element(By.CLASS_NAME, 'rating-other-user-rating').text
                date = container.find_element(By.CLASS_NAME, 'review-date').text
                user_name = container.find_element(By.CLASS_NAME, 'display-name-link').text
            except WebDriverException:
                continue

            # Keep everything before the "/" so a "10/10" rating is stored as
            # "10" rather than being cut down to its first character.
            rating = rating_text.split('/')[0].strip()
            if body is None or not rating:
                continue

            rows.append({
                'User_name': user_name,
                'Review title': title,
                'Review Rating': rating,
                'Review date': date,
                'Review_body': body.strip(),
            })
    finally:
        driver.quit()
    return pd.DataFrame(rows, columns=columns)
    

Once scraping has finished, we can save the newly created raw dataframe to a CSV file for future use.

The function call below starts the scraping process. It takes about 15 seconds to scrape the data for each movie. This code is not complete yet; a lot of fixing remains to be done.

In [None]:
# IMDb title search used as the starting point: languages=bn, feature films,
# sorted by year (descending), any user rating from 1.0 to 10.0.
bangla_movies_link = (
    'https://www.imdb.com/search/title/?languages=bn&sort=year,desc&user_rating=1.0,10.0&title_type=feature&count=250&start=0&ref_=adv_nxt'
)
# Kick off the scrape for that listing page.
get_review(url=bangla_movies_link)

Convert the dataframe to a CSV file:

In [None]:
# Write the collected reviews to a CSV file in the current working directory.
dataset.to_csv(path_or_buf='bangla movie user rating dataset.csv')