**Scraping Rotten Tomatoes and creating a dataframe to analyse the data**

**Import libraries**

In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np 
import pandas as pd 

**Making a request**


In [2]:
url = 'https://www.rottentomatoes.com/top/bestofrt/top_100_action__adventure_movies/'
response=requests.get(url)
response.status_code

200

In [3]:
html = response.content
html[:100]

b'<!DOCTYPE html>\n<html lang="en" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://opengr'

**Choosing a parser and exporting the file**

In [4]:
soup = BeautifulSoup(html,'lxml')

In [5]:
with open('best_action_movies.html','wb') as file:
    file.write(soup.prettify('utf-8'))

**Extracting**

In [6]:
# class ='unstyled articleLink' is the class for all movie titles 
titles = soup.find_all('a', class_='unstyled articleLink')
titles[:10]

[<a class="unstyled articleLink" href="/browse/opening/">Opening This Week</a>,
 <a class="unstyled articleLink" href="/browse/in-theaters/">Top Box Office</a>,
 <a class="unstyled articleLink" href="/browse/upcoming/">Coming Soon to Theaters</a>,
 <a class="unstyled articleLink" href="/browse/box-office/">Weekend Earnings</a>,
 <a class="unstyled articleLink" href="/browse/cf-in-theaters/">Certified Fresh Movies</a>,
 <a class="unstyled articleLink" href="/dvd/"><h2 class="title">On Dvd &amp; Streaming</h2></a>,
 <a class="unstyled articleLink" href="/browse/dvd-streaming-all/?services=vudu">VUDU</a>,
 <a class="unstyled articleLink" href="/browse/dvd-streaming-all/?services=netflix_iw">Netflix Streaming</a>,
 <a class="unstyled articleLink" href="/browse/dvd-streaming-all/?services=itunes">iTunes</a>,
 <a class="unstyled articleLink" href="/browse/dvd-streaming-all/?services=amazon_prime;amazon">Amazon and Amazon Prime</a>]

In [7]:
titles[43].text

'\n            Black Panther (2018)'

In [8]:
titles[43].text.strip().split('(')[0].strip()

'Black Panther'

In [9]:
titles[43].text.strip().split(' ')[-1].strip('()')

'2018'

In [10]:
titles[142].text

'\n            Catch Me If You Can (2002)'

In [11]:
film_title=[]
year=[]

for i in range(43,143):
    film_title.append(titles[i].text.strip().split('(')[0].strip())
    year.append(titles[i].text.strip().split(' ')[-1].strip('()'))

print(film_title[:10]) # list of film titles / names 
print(' ')
print(year[:10]) # year of release

['Black Panther', 'Avengers: Endgame', 'Mission: Impossible - Fallout', 'Mad Max: Fury Road', 'Spider-Man: Into the Spider-Verse', 'Wonder Woman', 'Dunkirk', 'Coco', 'Thor: Ragnarok', 'Seven Samurai']
 
['2018', '2019', '2018', '2015', '2018', '2017', '2017', '2017', '2017', '1956']


In [12]:
len(film_title)

100

In [13]:
len(year)

100

In [14]:
index = list(np.arange(0,100))
print(index)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]


**So I have got 2 columns in my dataset and they are film_title, year and I will use index as the index for the dataset**

In [15]:
reviews = soup.find_all('td', class_='right hidden-xs')
len(reviews)

100

In [16]:
reviews[:10]

[<td class="right hidden-xs">525</td>,
 <td class="right hidden-xs">544</td>,
 <td class="right hidden-xs">435</td>,
 <td class="right hidden-xs">427</td>,
 <td class="right hidden-xs">391</td>,
 <td class="right hidden-xs">468</td>,
 <td class="right hidden-xs">461</td>,
 <td class="right hidden-xs">351</td>,
 <td class="right hidden-xs">435</td>,
 <td class="right hidden-xs">87</td>]

In [17]:
int(reviews[0].text)

525

In [18]:
no_of_reviews = []
for i in range(len(reviews)):
    no_of_reviews.append(int(reviews[i].text))
    
print(no_of_reviews)

[525, 544, 435, 427, 391, 468, 461, 351, 435, 87, 421, 479, 443, 48, 384, 448, 294, 65, 360, 394, 393, 133, 92, 298, 410, 345, 52, 441, 422, 381, 83, 325, 331, 52, 96, 325, 289, 122, 253, 481, 354, 435, 281, 366, 132, 332, 68, 223, 45, 363, 260, 281, 65, 205, 45, 42, 382, 42, 351, 219, 76, 104, 275, 61, 61, 329, 40, 240, 269, 313, 59, 421, 41, 172, 77, 259, 288, 315, 89, 539, 278, 130, 51, 73, 40, 336, 255, 306, 44, 419, 83, 49, 79, 158, 50, 42, 92, 130, 322, 203]


**So I got the 3rd column for the dataset and that is no_of_reviews which is of integer type**

In [19]:
ratings = soup.find_all('span', class_="tMeterScore")
ratings[:10]

[<span class="tMeterScore cfp-item-score">81%</span>,
 <span class="tMeterScore cfp-item-score">82%</span>,
 <span class="tMeterScore cfp-item-score">92%</span>,
 <span class="tMeterScore">100%</span>,
 <span class="tMeterScore">100%</span>,
 <span class="tMeterScore">100%</span>,
 <span class="tMeterScore">100%</span>,
 <span class="tMeterScore">96%</span>,
 <span class="tMeterScore">94%</span>,
 <span class="tMeterScore">93%</span>]

In [20]:
ratings[7].text.strip('%')

'96'

In [21]:
ratings[107].text.strip('%')

'\xa097'

In [22]:
film_rating_perc = []
for i in range(7,107):
    film_rating_perc.append(ratings[i].text.strip('\xa0%'))
    
print(film_rating_perc)

['96', '94', '93', '85', '78', '75', '60', '96', '85', '100', '100', '23', '99', '92', '37', '100', '98', '96', '94', '97', '97', '97', '93', '92', '97', '93', '100', '93', '90', '93', '100', '93', '90', '98', '98', '94', '92', '92', '97', '98', '98', '90', '94', '100', '88', '90', '92', '98', '94', '96', '96', '98', '94', '97', '93', '96', '85', '94', '87', '95', '90', '92', '92', '99', '97', '100', '91', '95', '94', '100', '97', '98', '100', '89', '100', '89', '95', '97', '94', '95', '95', '95', '90', '98', '94', '93', '90', '97', '85', '100', '97', '97', '94', '93', '90', '96', '79', '92', '92', '98']


**So I got the 4th column for the dataset and that is film_rating_perc which is a categorical variable**

In [23]:
titles[43].attrs

{'href': '/m/black_panther_2018', 'class': ['unstyled', 'articleLink']}

In [24]:
base_url = 'https://www.rottentomatoes.com'

In [25]:
relative_url = [titles[i].get('href') for i in range(43,143)]
relative_url

['/m/black_panther_2018',
 '/m/avengers_endgame',
 '/m/mission_impossible_fallout',
 '/m/mad_max_fury_road',
 '/m/spider_man_into_the_spider_verse',
 '/m/wonder_woman_2017',
 '/m/dunkirk_2017',
 '/m/coco_2017',
 '/m/thor_ragnarok_2017',
 '/m/seven_samurai_1956',
 '/m/logan_2017',
 '/m/star_wars_the_last_jedi',
 '/m/star_wars_episode_vii_the_force_awakens',
 '/m/1000355-adventures_of_robin_hood',
 '/m/incredibles_2',
 '/m/spider_man_far_from_home',
 '/m/zootopia',
 '/m/1011615-king_kong',
 '/m/war_for_the_planet_of_the_apes',
 '/m/baby_driver',
 '/m/spider_man_homecoming',
 '/m/1013775-metropolis',
 '/m/jaws',
 '/m/up',
 '/m/shazam',
 '/m/the_dark_knight',
 '/m/treasure_of_the_sierra_madre',
 '/m/blade_runner_2049',
 '/m/captain_america_civil_war',
 '/m/skyfall',
 '/m/french_connection',
 '/m/the_jungle_book_2016',
 '/m/harry_potter_and_the_deathly_hallows_part_2_2011',
 '/m/1000121-39_steps',
 '/m/apocalypse_now',
 '/m/mission_impossible_rogue_nation',
 '/m/the_hurt_locker',
 '/m/lawre

In [26]:
from urllib.parse import urljoin

full_url = [urljoin(base_url,url) for url in relative_url]
full_url

['https://www.rottentomatoes.com/m/black_panther_2018',
 'https://www.rottentomatoes.com/m/avengers_endgame',
 'https://www.rottentomatoes.com/m/mission_impossible_fallout',
 'https://www.rottentomatoes.com/m/mad_max_fury_road',
 'https://www.rottentomatoes.com/m/spider_man_into_the_spider_verse',
 'https://www.rottentomatoes.com/m/wonder_woman_2017',
 'https://www.rottentomatoes.com/m/dunkirk_2017',
 'https://www.rottentomatoes.com/m/coco_2017',
 'https://www.rottentomatoes.com/m/thor_ragnarok_2017',
 'https://www.rottentomatoes.com/m/seven_samurai_1956',
 'https://www.rottentomatoes.com/m/logan_2017',
 'https://www.rottentomatoes.com/m/star_wars_the_last_jedi',
 'https://www.rottentomatoes.com/m/star_wars_episode_vii_the_force_awakens',
 'https://www.rottentomatoes.com/m/1000355-adventures_of_robin_hood',
 'https://www.rottentomatoes.com/m/incredibles_2',
 'https://www.rottentomatoes.com/m/spider_man_far_from_home',
 'https://www.rottentomatoes.com/m/zootopia',
 'https://www.rottento

**Now I want to go to these sites and collect more information about the movies like Genre, Director etc. from multiple pages**

**Let us explore a bit taking an example**

In [27]:
response = requests.get('https://www.rottentomatoes.com/m/the_lion_king')
html = response.content 
soup = BeautifulSoup(html,'lxml')

In [28]:
info = soup.find_all('div' ,class_='meta-value')
info

[<div class="meta-value" data-qa="movie-info-item-value">G
                     </div>,
 <div class="meta-value genre" data-qa="movie-info-item-value">
                         
                         musical, 
                         
                         adventure, 
                         
                         animation, 
                         
                         kids &amp; family
                         
                     </div>,
 <div class="meta-value" data-qa="movie-info-item-value">English
                     </div>,
 <div class="meta-value" data-qa="movie-info-item-value">
 <a data-qa="movie-info-director" href="/celebrity/roger_allers">Roger Allers</a>, 
                         
                             <a data-qa="movie-info-director" href="/celebrity/rob_minkoff">Rob Minkoff</a>
 </div>,
 <div class="meta-value" data-qa="movie-info-item-value">
 <a href="/celebrity/don-hahn">Don Hahn</a>
 </div>,
 <div class="meta-value" data-qa="movie-info-it

In [29]:
len(info)

13

In [30]:
info[1].text.strip(' \n').replace('\n','').replace(' ','') # Genre

'musical,adventure,animation,kids&family'

In [31]:
info[2].text.strip('\n ').replace('\n','').replace(' ','') # Language

'English'

In [32]:
info[3].text.strip('\n ').replace('\n','').replace(' ','') # Directors

'RogerAllers,RobMinkoff'

In [33]:
info[4].text.strip('\n ').replace('\n','').replace(' ','') # Writers

'DonHahn'

In [52]:
genre=[]
language=[]
directors=[]
writers=[]

for url in full_url:
    response = requests.get(url)
    html = response.content
    soup = BeautifulSoup(html,'lxml')
    info = soup.find_all('div' ,class_='meta-value')
    genre.append(info[1].text.strip(' \n').replace('\n','').replace(' ',''))
    language.append(info[2].text.strip('\n ').replace('\n','').replace(' ',''))
    directors.append(info[3].text.strip('\n ').replace('\n','').replace(' ',''))
    writers.append(info[4].text.strip('\n ').replace('\n','').replace(' ',''))

In [53]:
print(genre)
print(' ')
print(language)
print(' ')
print(directors)
print(' ')
print(writers)

['action,adventure,fantasy', 'action,sci-fi,adventure,fantasy', 'action,mystery&thriller,adventure', 'action,adventure', 'animation,fantasy,kids&family,comedy,action,adventure', 'action,adventure,fantasy', 'war,history,drama', 'comedy,adventure,animation,music,kids&family', 'action,comedy,adventure,sci-fi,fantasy', 'Japanese', 'action,adventure,fantasy', 'action,sci-fi,adventure,fantasy', 'action,sci-fi,adventure,fantasy', 'action,adventure', 'action,comedy,adventure,animation,kids&family', 'action,comedy,adventure,fantasy', 'adventure,comedy,animation,kids&family', 'English', 'action,sci-fi,adventure', 'action,mystery&thriller', 'action,comedy,adventure,fantasy', 'German', 'mystery&thriller,adventure,horror', 'adventure,comedy,animation,kids&family', 'action,comedy,adventure,fantasy', 'action,adventure,fantasy', 'English', 'sci-fi,mystery&thriller', 'action,adventure,fantasy', 'action,mystery&thriller,adventure', 'drama,crime,mystery&thriller', 'adventure,action,kids&family,fantasy', 

In [55]:
len(genre),len(language),len(directors),len(writers)

(100, 100, 100, 100)

In [56]:
film_dict = {
    'film_name':film_title,
    'year_of_release':year,
    'film_rating':film_rating_perc,
    'film_reviews':no_of_reviews,
    'film_genre':genre,
    'film_lang':language,
    'film_directors':directors,
    'film_writers':writers
}
films = pd.DataFrame(data=film_dict)
films.head()

Unnamed: 0,film_name,year_of_release,film_rating,film_reviews,film_genre,film_lang,film_directors,film_writers
0,Black Panther,2018,96,525,"action,adventure,fantasy",English,RyanCoogler,KevinFeige
1,Avengers: Endgame,2019,94,544,"action,sci-fi,adventure,fantasy",English,"AnthonyRusso,JoeRusso",KevinFeige
2,Mission: Impossible - Fallout,2018,93,435,"action,mystery&thriller,adventure",English,ChristopherMcQuarrie,"TomCruise,ChristopherMcQuarrie,JakeMyers,J.J.A..."
3,Mad Max: Fury Road,2015,85,427,"action,adventure",English,GeorgeMiller,"DougMitchell,GeorgeMiller,P.J.Voeten"
4,Spider-Man: Into the Spider-Verse,2018,78,391,"animation,fantasy,kids&family,comedy,action,ad...",English,"BobPersichetti,PeterRamsey,RodneyRothman","AviArad,AmyPascal,PhilLord,ChrisMiller,Christi..."


In [57]:
films.to_csv('films.csv', index=False)

**Importing the created dataset**

In [58]:
dataset= pd.read_csv('films.csv')

In [60]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   film_name        100 non-null    object
 1   year_of_release  100 non-null    int64 
 2   film_rating      100 non-null    int64 
 3   film_reviews     100 non-null    int64 
 4   film_genre       100 non-null    object
 5   film_lang        100 non-null    object
 6   film_directors   100 non-null    object
 7   film_writers     100 non-null    object
dtypes: int64(3), object(5)
memory usage: 4.4+ KB


In [61]:
dataset.sample(10)

Unnamed: 0,film_name,year_of_release,film_rating,film_reviews,film_genre,film_lang,film_directors,film_writers
43,Isle of Dogs,2018,100,366,"animation,comedy,adventure",English,WesAnderson,"WesAnderson,ScottRudin,StevenRales,JeremyDawson"
2,Mission: Impossible - Fallout,2018,93,435,"action,mystery&thriller,adventure",English,ChristopherMcQuarrie,"TomCruise,ChristopherMcQuarrie,JakeMyers,J.J.A..."
16,Zootopia,2016,98,294,"adventure,comedy,animation,kids&family",English,"ByronHoward,RichMoore",ClarkSpencer
38,The LEGO Movie,2014,97,253,"adventure,comedy,kids&family,animation",English,"PhilLord,ChrisMiller","DanLin,RoyLee"
71,Guardians of the Galaxy Vol. 2,2017,98,421,"action,comedy,adventure,sci-fi,fantasy",English,JamesGunn,KevinFeige
52,The Terminator,1984,94,65,"action,sci-fi,mystery&thriller",English,JamesCameron,GaleAnneHurd
41,Ant-Man and the Wasp,2018,90,435,"action,comedy,adventure,fantasy",English,PeytonReed,"KevinFeige,StephenBroussard"
94,"Aguirre, the Wrath of God",1972,90,50,English,WernerHerzog,WernerHerzog,WernerHerzog
51,Iron Man,2008,98,281,"sci-fi,action,fantasy,adventure",English,JonFavreau,"AviArad,KevinFeige"
87,Captain America: The Winter Soldier,2014,97,306,"action,adventure,fantasy",English,"AnthonyRusso,JoeRusso",KevinFeige


**Now as we can see that there are some mistakes in columns and we can correct them by visiting the site and changing the values.**