### Gathering the data

In [159]:
#Importing the required libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [160]:
# Download the raw HTML of the Rotten Tomatoes "Best of RT" listing.
URL = "https://www.rottentomatoes.com/top/bestofrt/"
# The timeout (seconds) keeps this cell from hanging indefinitely on a stalled connection.
page = requests.get(URL, timeout=30)
# Fail here on a 4xx/5xx response rather than parsing an error page in later cells.
page.raise_for_status()

In [161]:
# Parse the downloaded bytes into a navigable tree using the lxml parser.
soup = BeautifulSoup(page.content, features='lxml')

In [162]:
# Print the indented HTML of the whole page so the tags and class names
# targeted by the later find/find_all calls can be located by eye.
print(soup.prettify())

<!DOCTYPE html>
<html lang="en" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://opengraphprotocol.org/schema/">
 <head prefix="og: http://ogp.me/ns# flixstertomatoes: http://ogp.me/ns/apps/flixstertomatoes#">
  <script src="//cdn.optimizely.com/js/594670329.js">
  </script>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="width=device-width,initial-scale=1" name="viewport"/>
  <meta content="no-referrer" name="referrer"/>
  <meta content="never" name="referrer"/>
  <meta content="VPPXtECgUUeuATBacnqnCm4ydGO99reF-xgNklSbNbc" name="google-site-verification"/>
  <meta content="034F16304017CA7DCF45D43850915323" name="msvalidate.01"/>
  <link href="https://staticv2-4.rottentomatoes.com/static/images/iphone/apple-touch-icon.png" rel="apple-touch-icon"/>
  <link href="https://staticv2-4.rottentomatoes.com/static/images/icons/favicon.ico" rel="shortcut icon" type="image/x-icon"/>
  <link href="https://staticv2-4.rottentomatoes.com/static/sty

In [163]:

# Collect every element whose CSS class is "table"; later cells use the first match.
movie_elems = soup.find_all(attrs={'class': 'table'})

In [164]:
# Column names are the text of the <th> cells in the first matched table.
# find_all replaces the deprecated findAll alias.
headers = [th.text for th in movie_elems[0].find_all('th')]

In [165]:
# Every <tr> of the first matched table. Later cells skip table[0],
# presumably because it is the header row.
# find_all replaces the deprecated findAll alias.
table = movie_elems[0].find_all('tr')

In [166]:
# One title per data row (table[0] is skipped).
# get_text() reads the anchor's text directly. Looping over the tag itself walks
# its child nodes, which yields one value per row only when the anchor has
# exactly one child. It also returns a plain str instead of a NavigableString
# that keeps a reference to the whole parse tree.
# Whitespace is deliberately left untouched here.
movies = [row.find('a').get_text() for row in table[1:]]

In [167]:
# One value per data row, taken from the row's first <td class="bold"> cell
# (table[0] is skipped). This becomes the first column of the DataFrame.
# get_text() returns the cell text as a plain str, rather than relying on the
# cell having exactly one child node when the tag is iterated.
bullets = [row.find('td', class_='bold').get_text() for row in table[1:]]

In [168]:
# One value per data row, taken from the row's first <span class="tMeterScore">
# (table[0] is skipped).
# get_text() returns the span text as a plain str, rather than relying on the
# span having exactly one child node when the tag is iterated.
ratings = [row.find('span', class_='tMeterScore').get_text() for row in table[1:]]

In [169]:
# One value per data row, taken from the row's first <td> whose class attribute
# is exactly "right hidden-xs" (table[0] is skipped).
# get_text() returns the cell text as a plain str, rather than relying on the
# cell having exactly one child node when the tag is iterated.
number_reviews = [
    row.find('td', class_='right hidden-xs').get_text() for row in table[1:]
]

In [170]:
# Bundle the four scraped lists into one mapping. The key order here fixes the
# column order of the DataFrame built from it.
movie_reviews = dict(
    bullets=bullets,
    ratings=ratings,
    movies=movies,
    no_of_reviews=number_reviews,
)

In [171]:
# Build the frame from the scraped lists, then relabel its columns positionally
# with the header names read from the page.
df = pd.DataFrame(movie_reviews).set_axis(headers, axis='columns')

### Assess

In [172]:
# Peek at the first five rows to spot formatting problems in the scraped values.
df.head(n=5)

Unnamed: 0,Rank,RatingTomatometer,Title,No. of Reviews
0,1.0,96%,\n Black Panther (2018),512
1,2.0,94%,\n Avengers: Endgame (2019),528
2,3.0,93%,\n Us (2019),533
3,4.0,97%,\n Toy Story 4 (2019),443
4,5.0,99%,\n Lady Bird (2017),391


In [173]:
# Summarise the columns, their non-null counts and their dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Rank               100 non-null    object
 1   RatingTomatometer  100 non-null    object
 2   Title              100 non-null    object
 3   No. of Reviews     100 non-null    object
dtypes: object(4)
memory usage: 3.2+ KB


In [175]:
# Check the last five rows as well, to confirm the scrape reached the end of the list.
df.tail(n=5)

Unnamed: 0,Rank,RatingTomatometer,Title,No. of Reviews
95,96.0,98%,"\n The Godfather, Part II (1974)",83
96,97.0,99%,\n Chinatown (1974),75
97,98.0,97%,\n Won't You Be My Neighbor? (2018),249
98,99.0,99%,\n Rear Window (1954),74
99,100.0,98%,\n The Babadook (2014),238


- `RatingTomatometer` values contain a percent sign (`%`), so the column is stored as text
- `Rank` can be the index
- `Title` values start with `\n` followed by padding whitespace

### Clean

In [155]:
# Remove the leading newline and the padding whitespace that follows it.
# strip('\n') alone stops at the first space and leaves that padding in place.
df['Title'] = df['Title'].str.strip()

In [156]:
# Guarded so the cell can be re-run: once 'Rank' has become the index it is no
# longer a column, and repeating the steps below would raise a KeyError.
if 'Rank' in df.columns:
    # Drop the dot around the scraped rank (presumably values like "1.") and
    # store integers, so the index orders numerically rather than as text.
    df['Rank'] = pd.to_numeric(df['Rank'].str.strip('.')).astype(int)
    # Rebind instead of using inplace=True so the change is explicit in this cell.
    df = df.set_index('Rank')