In [1]:
import requests 
import pandas as pd
from bs4 import BeautifulSoup
import re
import time

# Scraping the Rotten Tomatoes ranking data as a table

In [2]:
# Page that the following cells scrape.
url = "https://www.rottentomatoes.com/top/bestofrt/"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors (4xx/5xx) right here instead of
# letting an error page flow into the parsing cells.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [3]:
# pd.read_html returns one DataFrame per <table> it finds on the page;
# index 2 selects the table this notebook works with.
# NOTE(review): the index is positional and will silently pick another table
# if the page layout changes — confirm it still points at the ranking table.
scraped_tables = pd.read_html(url)
# .copy() makes `rotten` an independent frame rather than a slice of the
# scraped table, so the column assignments made in later cells do not
# trigger pandas' SettingWithCopyWarning.
rotten = scraped_tables[2][['Title', 'RatingTomatometer', 'Rank']].copy()
rotten

Unnamed: 0,Title,RatingTomatometer,Rank
0,Black Panther (2018),96%,1.0
1,Avengers: Endgame (2019),94%,2.0
2,Us (2019),93%,3.0
3,Toy Story 4 (2019),97%,4.0
4,The Wizard of Oz (1939),98%,5.0
...,...,...,...
95,Won't You Be My Neighbor? (2018),97%,96.0
96,Chinatown (1974),99%,97.0
97,Lawrence of Arabia (1962),98%,98.0
98,I Am Not Your Negro (2017),99%,99.0


# Setting up title & year into arrays from the data we scraped, using regex

In [4]:
# Split every scraped "Title (YYYY)" string into a bare title and a year string.
title = []
year = []
# Raw string so that \s, \( and \d reach the regex engine untouched; in a plain
# string literal they are invalid escape sequences that Python warns about.
# group(1) is everything before the whitespace that precedes "(",
# group(2) is the run of digits right after "(".
pattern = r"(.*)\s+\((\d*)"
title_year_re = re.compile(pattern)
for raw_title in rotten['Title']:
    match = title_year_re.search(raw_title)
    if match is None:
        # Report the offending value instead of failing with an opaque
        # "'NoneType' object has no attribute 'group'".
        raise ValueError(
            "Could not split title and year from: {!r}".format(raw_title)
        )
    title.append(match.group(1))
    year.append(match.group(2))


In [5]:
# Swap in the cleaned titles, add the year column and give the scraped columns
# friendlier names.
# assign()/rename() build a new frame instead of writing into `rotten` in
# place, which avoids SettingWithCopyWarning when `rotten` is a column slice
# of the scraped table.  Renaming by name (rather than overwriting
# `rotten.columns` positionally) cannot mislabel a column if the order changes.
rotten = (
    rotten
    .assign(Title=title, Year=year)
    .rename(columns={'RatingTomatometer': 'Tomatometer', 'Rank': 'Rotten rank'})
)
rotten

Unnamed: 0,Title,Tomatometer,Rotten rank,Year
0,Black Panther,96%,1.0,2018
1,Avengers: Endgame,94%,2.0,2019
2,Us,93%,3.0,2019
3,Toy Story 4,97%,4.0,2019
4,The Wizard of Oz,98%,5.0,1939
...,...,...,...,...
95,Won't You Be My Neighbor?,97%,96.0,2018
96,Chinatown,99%,97.0,1974
97,Lawrence of Arabia,98%,98.0,1962
98,I Am Not Your Negro,99%,99.0,2017


# Building a list of the Rotten Tomatoes movie-page URLs for later use

In [6]:
# Collect the absolute URL of every movie link found inside the page's
# <table class="table"> element.
url_list = []
rotten_url = "https://www.rottentomatoes.com"
# Raw string; "/" needs no escaping in a Python regex, so the original "\/"
# (an invalid string escape) is dropped.  Matches from the first "/m" to the
# end of the href.
pattern = r"/m.*"
# Keep the parsed page and the table under separate names instead of
# rebinding `soup` to two different kinds of object.
page_soup = BeautifulSoup(response.content, 'lxml')
soup = page_soup.find('table', {'class': 'table'})
if soup is None:
    raise RuntimeError("No <table class='table'> found in the downloaded page")
urls = [a['href'] for a in soup.find_all('a', {'class': 'unstyled articleLink'})]
for movie in urls:
    movie_path = re.search(pattern, movie)
    if movie_path is None:
        # Name the bad link rather than crashing on None.group().
        raise ValueError("Unexpected movie link (no '/m' path): {!r}".format(movie))
    url_list.append(rotten_url + movie_path.group())
url_list

['https://www.rottentomatoes.com/m/black_panther_2018',
 'https://www.rottentomatoes.com/m/avengers_endgame',
 'https://www.rottentomatoes.com/m/us_2019',
 'https://www.rottentomatoes.com/m/toy_story_4',
 'https://www.rottentomatoes.com/m/the_wizard_of_oz_1939',
 'https://www.rottentomatoes.com/m/mission_impossible_fallout',
 'https://www.rottentomatoes.com/m/lady_bird',
 'https://www.rottentomatoes.com/m/citizen_kane',
 'https://www.rottentomatoes.com/m/the_irishman',
 'https://www.rottentomatoes.com/m/blackkklansman',
 'https://www.rottentomatoes.com/m/the_cabinet_of_dr_caligari',
 'https://www.rottentomatoes.com/m/get_out',
 'https://www.rottentomatoes.com/m/1003707-casablanca',
 'https://www.rottentomatoes.com/m/spider_man_into_the_spider_verse',
 'https://www.rottentomatoes.com/m/mad_max_fury_road',
 'https://www.rottentomatoes.com/m/nosferatu',
 'https://www.rottentomatoes.com/m/moonlight_2016',
 'https://www.rottentomatoes.com/m/booksmart',
 'https://www.rottentomatoes.com/m/a_s

# Moving through all URLs in order to scrape Genre and Avg Rating 

In [7]:
avg_rating = []
movie_genre = []
# Raw string so the regex escapes are not treated as (invalid) string escapes.
# The decimal point is now escaped; the original unescaped "." matched any
# character between the digits.
javascript_regex = r'tomatometerAllCritics":{"avgScore":(\d+\.?\d*),'

MAX_RETRIES = 5               # give up after this many failed attempts per URL
RETRY_DELAY_SECONDS = 5       # pause between attempts
REQUEST_TIMEOUT_SECONDS = 30  # never wait indefinitely on a stalled connection


def _get_with_retry(page_url):
    """Fetch ``page_url``, retrying on network errors.

    Only ``requests`` exceptions are retried, so Ctrl-C and programming errors
    propagate immediately (the original bare ``except`` swallowed them and
    retried forever).  After ``MAX_RETRIES`` failed attempts the last
    exception is re-raised.
    """
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            return requests.get(page_url, timeout=REQUEST_TIMEOUT_SECONDS)
        except requests.exceptions.RequestException:
            if attempt == MAX_RETRIES:
                raise
            print("Connection refused by the server..")
            print("Sleeping for {} seconds".format(RETRY_DELAY_SECONDS))
            time.sleep(RETRY_DELAY_SECONDS)
            print("continue...")


for i in url_list:
    response1 = _get_with_retry(i)
    soup1 = BeautifulSoup(response1.content, 'lxml')

    # Text of the second "meta-value" div, commas removed, split on whitespace.
    # NOTE(review): presumably this div is the genre field — verify against
    # the page markup; an IndexError here means the layout changed.
    movie_genre.append(soup1.find_all('div', {'class': 'meta-value'})[1].text.replace(",","").split())

    # The average score is searched for in the first text/javascript <script>;
    # a page without such a tag is treated as "no score" instead of crashing.
    script_tag = soup1.find('script', {'type': 'text/javascript'})
    scripts = script_tag.get_text() if script_tag is not None else ''
    score = re.search(javascript_regex, scripts)
    if score is None:
        print("No score in: " + i)
        # Keep a placeholder so avg_rating stays aligned with url_list.
        avg_rating.append(None)
    else:
        print("Getting score from: " + i)
        avg_rating.append(score.group(1))

Getting score from: https://www.rottentomatoes.com/m/black_panther_2018
Getting score from: https://www.rottentomatoes.com/m/avengers_endgame
Getting score from: https://www.rottentomatoes.com/m/us_2019
Getting score from: https://www.rottentomatoes.com/m/toy_story_4
Getting score from: https://www.rottentomatoes.com/m/the_wizard_of_oz_1939
Getting score from: https://www.rottentomatoes.com/m/mission_impossible_fallout
Getting score from: https://www.rottentomatoes.com/m/lady_bird
Getting score from: https://www.rottentomatoes.com/m/citizen_kane
Getting score from: https://www.rottentomatoes.com/m/the_irishman
Getting score from: https://www.rottentomatoes.com/m/blackkklansman
Getting score from: https://www.rottentomatoes.com/m/the_cabinet_of_dr_caligari
Getting score from: https://www.rottentomatoes.com/m/get_out
Getting score from: https://www.rottentomatoes.com/m/1003707-casablanca
Getting score from: https://www.rottentomatoes.com/m/spider_man_into_the_spider_verse
Getting score f

In [8]:
def _primary_genre(tokens):
    """Pick one genre token from a movie's list of genre tokens.

    Returns the first token, except that a leading 'Classics' is skipped when
    another token follows it.  An empty list yields None and a lone
    ['Classics'] yields 'Classics'; both cases used to raise IndexError.
    """
    if not tokens:
        return None
    if tokens[0] == 'Classics' and len(tokens) > 1:
        return tokens[1]
    return tokens[0]


# One entry per movie, in the same order as movie_genre.
genre = [_primary_genre(g) for g in movie_genre]

In [9]:
# Attach the scraped per-movie values as new columns; each list needs exactly
# one entry per row of `rotten`.
for column_name, column_values in (('Critic Rating', avg_rating), ('Genre', genre)):
    rotten[column_name] = column_values

# Data Cleaning & Conversions

In [10]:
# "96%" -> 96.0 -> 9.6: strip the percent sign, convert to float and divide by
# ten so the value can be subtracted from the critic rating below.
tomatometer_percent = rotten['Tomatometer'].str.replace('%','').astype('float64')
critic_rating = rotten['Critic Rating'].astype('float')

rotten['Tomatometer'] = tomatometer_percent / 10.0
rotten['Critic Rating'] = critic_rating
rotten['Genre'] = rotten['Genre'].str.replace(",","")
# Gap between the rescaled Tomatometer and the average critic rating.
rotten['Difference'] = rotten['Tomatometer'] - rotten['Critic Rating']

In [11]:
# Display the frame as this cell's output.
rotten

Unnamed: 0,Title,Tomatometer,Rotten rank,Year,Critic Rating,Genre,Difference
0,Black Panther,9.6,1.0,2018,8.28,Action,1.32
1,Avengers: Endgame,9.4,2.0,2019,8.22,Action,1.18
2,Us,9.3,3.0,2019,7.95,Horror,1.35
3,Toy Story 4,9.7,4.0,2019,8.36,Animation,1.34
4,The Wizard of Oz,9.8,5.0,1939,9.40,Kids,0.40
...,...,...,...,...,...,...,...
95,Won't You Be My Neighbor?,9.7,96.0,2018,8.72,Documentary,0.98
96,Chinatown,9.9,97.0,1974,9.34,Drama,0.56
97,Lawrence of Arabia,9.8,98.0,1962,9.27,Action,0.53
98,I Am Not Your Negro,9.9,99.0,2017,8.89,Documentary,1.01


In [12]:
# Cast the rank and year columns to 64-bit integers, then print a summary of
# the frame's columns and dtypes.
for int_column in ('Rotten rank', 'Year'):
    rotten[int_column] = rotten[int_column].astype('int64')

rotten.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Title          100 non-null    object 
 1   Tomatometer    100 non-null    float64
 2   Rotten rank    100 non-null    int64  
 3   Year           100 non-null    int64  
 4   Critic Rating  100 non-null    float64
 5   Genre          100 non-null    object 
 6   Difference     100 non-null    float64
dtypes: float64(3), int64(2), object(2)
memory usage: 5.6+ KB


# Saving the resulting DataFrame to a CSV file for later loading

In [13]:
# Write the frame to a CSV file in the current working directory.  to_csv is
# called with its default settings, so the row index is written as the first
# column.
output_path = 'rottenData.csv'
rotten.to_csv(output_path)