## Gather Data 

In [142]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

**First, we scrape the top 100 movies of all time from Rotten Tomatoes.**

In [113]:
URL = "https://www.rottentomatoes.com/top/bestofrt/" #URL where the table for the top 100 movies exist
page = requests.get(URL) 
soup = bs(page.content, 'lxml') #Creating the soup
movie_elems = soup.find_all(class_='table')
headers=[]
for header in movie_elems[0].findAll('th'): #Getting the headers of the table
    headers.append(header.text)

In [143]:
table = movie_elems[0].findAll('tr') #Creating this variable to find the row observations for each movie

In [116]:
ranks = []
for rows in table[1:]: #Loop through the 'table' to get the rank
    for rank in rows.find('td', class_ = 'bold'):
        ranks.append(rank)

In [115]:
movies = []
for rows in table[1:]: #Loop through the 'table' to get the movie names
    for movie in rows.find('a'):
        movies.append(movie)

In [117]:
ratings = []
for rows in table[1:]: #Loop through the 'table' to get the critic rating
    for rating in rows.find('span', class_='tMeterScore'):
        ratings.append(rating)

In [118]:
number_reviews = []
for rows in table[1:]: #Loop through the 'table' to get the number of critic reviews
    for review in rows.find('td', class_='right hidden-xs'):
        number_reviews.append(review)

In [120]:
movie_reviews = {'rankings': ranks, 'ratings': ratings, 'movies': movies, 'no_of_reviews': number_reviews}

In [121]:
df_t = pd.DataFrame(movie_reviews)
df_t.columns = headers #Chaning the headers to the one scraped from the website

**Next, we gather the synopsis, critic rating, number of critic reviews, audience rating, and number of audience ratings for each movie in the top 100 list.**

For this step, we use Selenium, a powerful tool for automating a web browser. Combined with Beautiful Soup, it becomes one of the most powerful web scraping tools.

In [111]:
driver = webdriver.Chrome(executable_path='/Users/akshaygupta/Downloads/chromedriver')
driver.get("https://www.rottentomatoes.com/top/bestofrt/")
driver.implicitly_wait(10)
df = pd.DataFrame()
for i in range(1,101):
    driver.find_element_by_xpath('//*[@id="top_movies_main"]/div/table/tbody/tr['+str(i)+']/td[3]/a').click()
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME,'mop-ratings-wrap__percentage')))
    html = driver.page_source
    soup = bs(html, 'lxml')
    section = soup.find_all(class_='mop-ratings-wrap__info')
    review = section[0].find('p').text
    score_section = section[0].find_all('div')
    critic_rat = score_section[0].find_all('span')[1].text
    critic_num = score_section[1].find_all('small')[0].text
    audience_rat = score_section[2].find_all('span')[1].text
    audience_num = score_section[3].find('strong').text
    temp = pd.DataFrame({
        'Synopsis': review.strip(),
        'Critic Rating': critic_rat.strip(),
        'Number of Critic Reviews': critic_num.strip(),
        'Audience Rating': audience_rat.strip(),
        'Number of Audience Reviews': audience_num.strip()[audience_num.find(':')+2:]
    }, index=[i])
    df = pd.concat([df, temp])
    driver.back()
driver.quit()

In [112]:
# Preview the per-movie details collected by the Selenium loop.
df

Unnamed: 0,Synopsis,Critic Rating,Number of Critic Reviews,Audience Rating,Number of Audience Reviews
1,Black Panther elevates superhero cinema to thr...,96%,512,79%,88211
2,"Exciting, entertaining, and emotionally impact...",94%,528,90%,70313
3,"With Jordan Peele's second inventive, ambitiou...",93%,533,59%,13108
4,"Heartwarming, funny, and beautifully animated,...",97%,443,94%,53146
5,Lady Bird delivers fresh insights about the tu...,99%,391,79%,22595
...,...,...,...,...,...
96,"A career highlight for Preston Sturges, The La...",100%,47,87%,7550
97,Drawing on strong performances by Al Pacino an...,98%,83,97%,411450
98,As bruised and cynical as the decade that prod...,99%,75,93%,77907
99,Won't You Be My Neighbor? takes a fittingly pa...,97%,249,94%,4382


In [124]:
df_t.shape[0]

100

In [125]:
df.shape[0]

100

In [127]:
# Display the list-page table built from the scraped rows.
df_t

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Rank               100 non-null    object
 1   RatingTomatometer  100 non-null    object
 2   Title              100 non-null    object
 3   No. of Reviews     100 non-null    object
dtypes: object(4)
memory usage: 3.2+ KB


In [132]:
# Inspect the row labels of df; they were assigned from the loop counter (1..100).
df.index

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
             14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
             27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
             40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
             53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
             66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
             79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
             92,  93,  94,  95,  96,  97,  98,  99, 100],
           dtype='int64')

In [138]:
df.index = df.index.set_names(['Rank'])

In [139]:
# Show df again to confirm the index is now labelled 'Rank'.
df

Unnamed: 0_level_0,Synopsis,Critic Rating,Number of Critic Reviews,Audience Rating,Number of Audience Reviews
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Black Panther elevates superhero cinema to thr...,96%,512,79%,88211
2,"Exciting, entertaining, and emotionally impact...",94%,528,90%,70313
3,"With Jordan Peele's second inventive, ambitiou...",93%,533,59%,13108
4,"Heartwarming, funny, and beautifully animated,...",97%,443,94%,53146
5,Lady Bird delivers fresh insights about the tu...,99%,391,79%,22595
...,...,...,...,...,...
96,"A career highlight for Preston Sturges, The La...",100%,47,87%,7550
97,Drawing on strong performances by Al Pacino an...,98%,83,97%,411450
98,As bruised and cynical as the decade that prod...,99%,75,93%,77907
99,Won't You Be My Neighbor? takes a fittingly pa...,97%,249,94%,4382


In [141]:
df_t.merge(df,'inner', left_on = 'Rank', right_on='Rank')

ValueError: You are trying to merge on object and int64 columns. If you wish to proceed you should use pd.concat