In [55]:
#importing libraries
import requests
import lxml
import lxml_html_clean
from requests_html import HTMLSession
from bs4 import BeautifulSoup
import re
import pandas as pd
import matplotlib as plt
import seaborn as sns

In [56]:
#Webscraping section

In [57]:
#getting url-setup

#This url didn't work because of the 'show more' button making things difficult
#url = "https://movies.disney.com/a-z"
url = "https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films"
headers = {"User-Agent": "For class web scraping assignment, email: cnelson1845@gmail.com"}
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text)
print(r.status_code)

200


In [58]:
#using pandas to get all the tables and then turning the relevant ones into dataframes
tables = pd.read_html(r.text)
#getting the relevant tables of past and current movies (no future)
tables = tables[0:9]
dfs = []
for table in tables:
    dfs.append(table)
print(dfs)

[          Release date                                   Title  \
0    December 21, 1937         Snow White and the Seven Dwarfs   
1     February 7, 1940                               Pinocchio   
2    November 13, 1940                                Fantasia   
3        June 20, 1941                    The Reluctant Dragon   
4     October 23, 1941                                   Dumbo   
5      August 21, 1942                                   Bambi   
6     February 6, 1943                          Saludos Amigos   
7        July 17, 1943               Victory Through Air Power   
8     February 3, 1945                    The Three Caballeros   
9       April 20, 1946                         Make Mine Music   
10   November 12, 1946                       Song of the South   
11  September 27, 1947                      Fun and Fancy Free   
12        May 27, 1948                             Melody Time   
13   November 29, 1948                     So Dear to My Heart   
14     Oc

  tables = pd.read_html(r.text)


In [59]:
#combining the dataframes
movies = pd.concat(dfs, ignore_index=True)
print(movies)

          Release date                            Title  \
0    December 21, 1937  Snow White and the Seven Dwarfs   
1     February 7, 1940                        Pinocchio   
2    November 13, 1940                         Fantasia   
3        June 20, 1941             The Reluctant Dragon   
4     October 23, 1941                            Dumbo   
..                 ...                              ...   
534     April 22, 2025     Sea Lions of the Galapagos ‡   
535       May 23, 2025                    Lilo & Stitch   
536      June 20, 2025                             Elio   
537     August 8, 2025                  Freakier Friday   
538   October 10, 2025                       Tron: Ares   

                                                 Notes  
0    first film to be distributed by RKO Radio Pict...  
1     Inducted into the National Film Registry in 1994  
2    anthology film Inducted into the National Film...  
3          fictionalized tour around the Disney studio  
4     

In [60]:
#removing the irrelevant column of description
movies = movies.drop('Notes', axis=1)
print(movies.head())

        Release date                            Title
0  December 21, 1937  Snow White and the Seven Dwarfs
1   February 7, 1940                        Pinocchio
2  November 13, 1940                         Fantasia
3      June 20, 1941             The Reluctant Dragon
4   October 23, 1941                            Dumbo


In [98]:
#Getting the title and release year from each column and putting it in a dictionary
the_movies = {}
for idx, movie in movies.iterrows():
    #getting title
    title = movie["Title"]
    #getting year
    year = movie["Release date"][-4:]
    #placing into dictionary
    the_movies[title] = year
print(the_movies)

{'Snow White and the Seven Dwarfs': '1937', 'Pinocchio': '1940', 'Fantasia': '1940', 'The Reluctant Dragon': '1941', 'Dumbo': '2019', 'Bambi': '1942', 'Saludos Amigos': '1943', 'Victory Through Air Power': '1943', 'The Three Caballeros': '1945', 'Make Mine Music': '1946', 'Song of the South': '1946', 'Fun and Fancy Free': '1947', 'Melody Time': '1948', 'So Dear to My Heart': '1948', 'The Adventures of Ichabod and Mr. Toad': '1949', 'Cinderella': '2015', 'Treasure Island': '1950', 'Alice in Wonderland': '2010', 'The Story of Robin Hood': '1952', 'Peter Pan': '1953', 'The Sword and the Rose': '1953', 'The Living Desert': '1953', 'Rob Roy: The Highland Rogue': '1954', 'The Vanishing Prairie': '1954', '20,000 Leagues Under the Sea': '1954', 'Davy Crockett: King of the Wild Frontier': '1955', 'Lady and the Tramp': '1955', 'The African Lion': '1955', 'The Littlest Outlaw': '1955', 'The Great Locomotive Chase': '1956', 'Davy Crockett and the River Pirates': '1956', 'Secrets of Life': '1956', 

In [62]:
#getting Rotten Tomatoes urls for all the titles
#Note: Rotten Tomatoes does the title all lowercase with no special characters, and all spaces replaced by underscores
#Ex- Terminator: Dark Fate
#url is- https://www.rottentomatoes.com/m/terminator_dark_fate
#format: https://www.rottentomatoes.com/m/{title}
#Notes: If there are multiple version they either have the year after the movie title:
#Ex: https://www.rottentomatoes.com/m/the_little_mermaid_1989
#or they add a little internal number before the title 
#Ex: https://www.rottentomatoes.com/m/1074108-101_dalmatians
#The last one is worst case scenario and we look for it last, for all of these urls we will need to check if we get a proper request back

urls = []
base = "https://www.rottentomatoes.com/m/"
for title in the_movies:
    #formatting title for rotten tomatoes
    clean = ""
    #all lowercase
    
    #removing special characters

    #replacing spaces with underscores

    #creating full url with it (make sure year is still there)
    full_url = f"{base}{clean}"
print(len(urls))
print(urls)

0
[]


In [63]:
#checking all the urls, if they fail, we add the year
for url in urls:
    #create a request object and check if the status code is 200 or not

    #if not, add the year of the movie
    pass

In [64]:
#checking urls again, and if they fail we mark them, and I manually find the id for the movie


In [65]:
#getting relevant information from each url
title = []
audience_score = []
critic_score = []
revenue = []
genre = []
awards = []
#This is film rating-PG,R,etc
rating = []
#We will get the full date now, but we will likely need to split it into month and year (and season) later
release_date = []
for url in urls:
    #getting title

    #getting audience score

    #getting critic score

    #getting rating

    #getting box-office revenue

    #getting genre

    #getting release date

    #getting awards (may be none)

    pass



In [66]:
#putting the information into a dataframe
df = pd.DataFrame({"Title":title,"Audience-Score":audience_score,"Critic-Score":critic_score,"Revenue":revenue,"Release-Date":release_date,"Film-Rating":rating,"Genre":genre,"Awards":awards})
#printing to check what we will need to clean next
print(df.head())

Empty DataFrame
Columns: [Title, Audience-Score, Critic-Score, Revenue, Release-Date, Film-Rating, Genre, Awards]
Index: []


In [67]:
#cleaning the dataframe


In [68]:
#creating month, year, decade, and season column from "Release-Date" and deleting release date column


In [69]:
#Analysis Section - From here on, rating refers to score unless otherwise specified

In [70]:
#What is the average movie rating each year? (audience and critic)

In [71]:
#Average movie rating each decade (audience and critic)

In [72]:
#Note section:
#What is the trend in ratings? (rising, falling, stagnant, other)

#Is it different between audiences and critics? If so, by what degree? 

#Any other anomalies you notice? Any dips or highs? Why might that be?


In [73]:
#What is the average box office revenue each year? 


In [74]:
#Revenue for each decade


In [75]:
#Revenue for each season? 


In [76]:
#Does revenue correlate with ratings? 


In [77]:
#Does revenue correlate more strongly with audience or critic ratings?


In [78]:
#Note: What is the trend regarding revenue?

#Please note any other anomalies and your explanation (it's okay if you don't know)


In [79]:
#How many total directors have there been? 


In [80]:
#How many different directors have there been each year? Each decade?


In [81]:
#What is the average revenue of movies made by each director?


In [82]:
#What is the average rating of movies made by each director?


In [83]:
#Do directors with more movies have better ratings?


In [84]:
#Do directors movies get better ratings as they make more movies?


In [85]:
#Note: Make any notes about directors and relationships with other variables



In [86]:
#Any interesting relationships between film rating and rating?


In [87]:
#What about film rating and revenue? 


In [88]:
#Note: Note anything interesting regarding film rating



In [89]:
#Any interesting relationships between genre and rating?


In [90]:
#What about genre and revenue?


In [91]:
#What is the count of different film ratings in each genre?


In [92]:
#Note: Add anything interesting regarding genre



In [93]:
#What is the average number of awards a movie gets?


In [94]:
#What is the average number of movie awards for each decade? Does it show a trend?


In [95]:
#What about year? Does it show any trends?


In [96]:
#Note: Add anything interesting regarding awards


In [97]:
#Note: Add any overall thoughts

