In [1]:
#importing libraries
import requests
import lxml
import lxml_html_clean
from requests_html import HTMLSession
from bs4 import BeautifulSoup
from pprint import pprint
import re
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

In [2]:
#Webscraping section

In [3]:
#getting url-setup

#This url didn't work because of the 'show more' button making things difficult
#url = "https://movies.disney.com/a-z"
url = "https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films"
headers = {"User-Agent": "For class web scraping assignment, email: cnelson1845@gmail.com"}
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text)
print(r.status_code)

200


In [4]:
#using pandas to get all the tables and then turning the relevant ones into dataframes
tables = pd.read_html(r.text)
#getting the relevant tables of past and current movies (no future)
tables = tables[0:9]
dfs = []
for table in tables:
    dfs.append(table)
print(dfs)

  tables = pd.read_html(r.text)


[          Release date                                   Title  \
0    December 21, 1937         Snow White and the Seven Dwarfs   
1     February 7, 1940                               Pinocchio   
2    November 13, 1940                                Fantasia   
3        June 20, 1941                    The Reluctant Dragon   
4     October 23, 1941                                   Dumbo   
5      August 21, 1942                                   Bambi   
6     February 6, 1943                          Saludos Amigos   
7        July 17, 1943               Victory Through Air Power   
8     February 3, 1945                    The Three Caballeros   
9       April 20, 1946                         Make Mine Music   
10   November 12, 1946                       Song of the South   
11  September 27, 1947                      Fun and Fancy Free   
12        May 27, 1948                             Melody Time   
13   November 29, 1948                     So Dear to My Heart   
14     Oc

In [5]:
#combining the dataframes
movies = pd.concat(dfs, ignore_index=True)
print(movies.head())

        Release date                            Title  \
0  December 21, 1937  Snow White and the Seven Dwarfs   
1   February 7, 1940                        Pinocchio   
2  November 13, 1940                         Fantasia   
3      June 20, 1941             The Reluctant Dragon   
4   October 23, 1941                            Dumbo   

                                               Notes  
0  first film to be distributed by RKO Radio Pict...  
1   Inducted into the National Film Registry in 1994  
2  anthology film Inducted into the National Film...  
3        fictionalized tour around the Disney studio  
4   Inducted into the National Film Registry in 2017  


In [6]:
#removing the irrelevant column of description
movies = movies.drop('Notes', axis=1)
print(movies.head())

        Release date                            Title
0  December 21, 1937  Snow White and the Seven Dwarfs
1   February 7, 1940                        Pinocchio
2  November 13, 1940                         Fantasia
3      June 20, 1941             The Reluctant Dragon
4   October 23, 1941                            Dumbo


In [7]:
#Getting the title and release year from each column and putting it in a dictionary
the_movies = {}
for idx, movie in movies.iterrows():
    #getting title
    title = movie["Title"]
    #getting year
    year = movie["Release date"][-4:]
    #placing into dictionary
    the_movies[title] = year
pprint(the_movies)

{'101 Dalmatians': '1996',
 "101 Dalmatians II: Patch's London Adventure ‡": '2003',
 '102 Dalmatians': '2000',
 '20,000 Leagues Under the Sea': '1954',
 "A Bug's Life": '1998',
 'A Far Off Place': '1993',
 'A Goofy Movie': '1995',
 "A Kid in King Arthur's Court": '1995',
 'A Tiger Walks': '1964',
 'A Very Merry Pooh Year ‡': '2002',
 'A Wrinkle in Time': '2018',
 'ABCD 2 *': '2015',
 'African Cats': '2011',
 'Air Bud': '1997',
 'Aladdin': '2019',
 'Alexander and the Terrible, Horrible, No Good, Very Bad Day': '2014',
 'Alexander and the Terrible, Horrible, No Good, Very Bad Road Trip ‡': '2025',
 'Alice Through the Looking Glass': '2016',
 'Alice in Wonderland': '2010',
 'Aliens of the Deep': '2005',
 'Almost Angels': '1962',
 "America's Heart and Soul": '2004',
 'Amy': '1981',
 'Angels in the Outfield': '1994',
 'Arjun: The Warrior Prince *': '2012',
 'Around the World in 80 Days': '2004',
 'Artemis Fowl ‡': '2020',
 'Atlantis: The Lost Empire': '2001',
 'Babes in Toyland': '1961',
 

In [8]:
#getting Rotten Tomatoes urls for all the titles
#Note: Rotten Tomatoes does the title all lowercase with no special characters, and all spaces replaced by underscores
#Ex- Terminator: Dark Fate
#url is- https://www.rottentomatoes.com/m/terminator_dark_fate
#format: https://www.rottentomatoes.com/m/{title}
#Notes: If there are multiple version they either have the year after the movie title:
#Ex: https://www.rottentomatoes.com/m/the_little_mermaid_1989
#or they add a little internal number before the title 
#Ex: https://www.rottentomatoes.com/m/1074108-101_dalmatians
#The last one is worst case scenario and we look for it last, for all of these urls we will need to check if we get a proper request back

urls = []
clean_dict = {}
base = "https://www.rottentomatoes.com/m/"
for title in the_movies:
    #formatting title for rotten tomatoes
    clean = title
    if "[N 1]" in clean:
        clean = clean.replace("[N 1]","")
    #all lowercase
    clean = clean.lower()
    #removing special characters
    clean = clean.replace(":","").replace("!","").replace("/","").replace("'","").replace(",","").replace(".","")
    clean = clean.replace(" ‡","").replace(" †","").replace(" §","").replace(" *","").replace("½","_12")
    #replacing some other special characters
    clean = clean.replace("&","and").replace("í","i")
    #replacing spaces with underscores
    clean = clean.replace(" ","_").replace("-","_")
    #adding it to another dictionary to keep track of years
    clean_dict[clean] = the_movies[title]
    #creating full url with it (make sure year is still there)
    full_url = f"{base}{clean}"
    urls.append(full_url)
print(len(urls))
pprint(urls)

525
['https://www.rottentomatoes.com/m/snow_white_and_the_seven_dwarfs',
 'https://www.rottentomatoes.com/m/pinocchio',
 'https://www.rottentomatoes.com/m/fantasia',
 'https://www.rottentomatoes.com/m/the_reluctant_dragon',
 'https://www.rottentomatoes.com/m/dumbo',
 'https://www.rottentomatoes.com/m/bambi',
 'https://www.rottentomatoes.com/m/saludos_amigos',
 'https://www.rottentomatoes.com/m/victory_through_air_power',
 'https://www.rottentomatoes.com/m/the_three_caballeros',
 'https://www.rottentomatoes.com/m/make_mine_music',
 'https://www.rottentomatoes.com/m/song_of_the_south',
 'https://www.rottentomatoes.com/m/fun_and_fancy_free',
 'https://www.rottentomatoes.com/m/melody_time',
 'https://www.rottentomatoes.com/m/so_dear_to_my_heart',
 'https://www.rottentomatoes.com/m/the_adventures_of_ichabod_and_mr_toad',
 'https://www.rottentomatoes.com/m/cinderella',
 'https://www.rottentomatoes.com/m/treasure_island',
 'https://www.rottentomatoes.com/m/alice_in_wonderland',
 'https://www.

In [9]:
#Note: This box takes awhile, I'm talking about 10-15 minutes

#making a dictionary to mark the bad ones
marked = {}
#checking all the urls, if they fail, we add the year
for i, url in enumerate(urls):
    #create a request object and check if the status code is 200 or not
    r = requests.get(url)
    if not r.status_code == 200:
        #trying to fix it by removing 'the'
        current = url.replace("the_","",1)
        s = requests.get(current)
        if s.status_code == 200:
            urls[i] = current
            continue
        #trying to fix by removing a
        current = url.replace("a_","",1)
        s = requests.get(current)
        if s.status_code == 200:
            urls[i] = current
            continue
        #we try fixing it by adding the year
        title = url.rsplit("/",1)[1]
        current = url+f"_{clean_dict[title]}"
        s = requests.get(current)
        if s.status_code == 200:
            urls[i] = current
            continue
        #we try and fix it by changing roman numeral ii to number 2
        current = url.replace("ii","2")
        s = requests.get(current)
        if s.status_code == 200:
            urls[i] = current
            continue
        #adding to marked since it couldn't be easily fixed
        marked[i] = url
#pretty printing it so I can read it
pprint(marked)

{7: 'https://www.rottentomatoes.com/m/victory_through_air_power',
 18: 'https://www.rottentomatoes.com/m/the_story_of_robin_hood',
 24: 'https://www.rottentomatoes.com/m/20000_leagues_under_the_sea',
 36: 'https://www.rottentomatoes.com/m/the_light_in_the_forest',
 37: 'https://www.rottentomatoes.com/m/white_wilderness',
 43: 'https://www.rottentomatoes.com/m/toby_tyler_or_10_weeks_with_a_circus',
 44: 'https://www.rottentomatoes.com/m/kidnapped',
 51: 'https://www.rottentomatoes.com/m/the_absent_minded_professor',
 58: 'https://www.rottentomatoes.com/m/big_red',
 71: 'https://www.rottentomatoes.com/m/the_moon_spinners',
 83: 'https://www.rottentomatoes.com/m/the_gnome_mobile',
 192: 'https://www.rottentomatoes.com/m/angels_in_the_outfield',
 195: 'https://www.rottentomatoes.com/m/rudyard_kiplings_the_jungle_book',
 218: 'https://www.rottentomatoes.com/m/george_of_the_jungle',
 227: 'https://www.rottentomatoes.com/m/serengeti_symphony',
 230: 'https://www.rottentomatoes.com/m/mighty_jo

In [10]:
#manually fixing the bad urls
urls[7] = "https://www.rottentomatoes.com/m/victory-through-air-power"
urls[18] = "https://www.rottentomatoes.com/m/story_of_robin_hood_and_his_merrie_men"
urls[24] = "https://www.rottentomatoes.com/m/1000079-20000_leagues_under_the_sea"
urls[36] = "https://www.rottentomatoes.com/m/1012343-light_in_the_forest"
urls[37] = "https://www.rottentomatoes.com/m/truelife_adventures_white_wilderness"
urls[43] = "https://www.rottentomatoes.com/m/toby_tyler"
urls[44] = "https://www.rottentomatoes.com/m/1011511-kidnapped"
urls[51] = "https://www.rottentomatoes.com/m/absentminded_professor"
urls[58] = "https://www.rottentomatoes.com/m/1002343-big_red"
urls[71] = "https://www.rottentomatoes.com/m/moonspinners"
urls[83] = "https://www.rottentomatoes.com/m/gnomemobile"
urls[192] = "https://www.rottentomatoes.com/m/1055833-angels_in_the_outfield"
urls[195] = "https://www.rottentomatoes.com/m/1057637-jungle_book"
urls[218] = "https://www.rottentomatoes.com/m/1077847-george_of_the_jungle"
urls[227] = "" #There is no entry in rotten tomatoes for Serengeti Symphony, will delete at end
urls[230] = "https://www.rottentomatoes.com/m/1084582-mighty_joe_young"
#urls[262] = "https://www.rottentomatoes.com/m/spirited_away" #different
urls[265] = "https://www.rottentomatoes.com/m/winnie-the-pooh-a-very-merry-pooh-year"
urls[285] = "https://www.rottentomatoes.com/m/winnie_the_pooh_springtime_with_roo"
urls[287] = "https://www.rottentomatoes.com/m/1141548-sacred_planet"
urls[291] = "https://www.rottentomatoes.com/m/the-three-musketeers2004"
#urls[302] = "https://www.rottentomatoes.com/m/howls_moving_castle" #different
urls[305] = "https://www.rottentomatoes.com/m/lilo_and_stitch_2"
urls[309] = "https://www.rottentomatoes.com/m/chronicles_of_narnia_lion_witch_wardrobe"
urls[311] = "https://www.rottentomatoes.com/m/1155467-glory_road"
urls[321] = "https://www.rottentomatoes.com/m/the_nightmare_before_christmas"
urls[322] = "https://www.rottentomatoes.com/m/santa_clause_3"
urls[332] = "https://www.rottentomatoes.com/m/the_gameplan"
urls[342] = "https://www.rottentomatoes.com/m/10008587-beverly_hills_chihuahua"
urls[344] = "https://www.rottentomatoes.com/m/1194524-high_school_musical_3_senior_year"
urls[350] = "https://www.rottentomatoes.com/m/jonas_brothers_3d"
urls[354] = "https://www.rottentomatoes.com/m/trail-of-the-panda"
urls[357] = "https://www.rottentomatoes.com/m/hexe_lilli_der_drache_und_das_magische_buch"
urls[358] = "https://www.rottentomatoes.com/m/10009462-g_force"
#urls[360] = "https://www.rottentomatoes.com/m/walt_and_el_grupo" #different
urls[361] = "" #there is no entry for SpangaS op Survival, thus it will be deleted at the end
urls[364] = "https://www.rottentomatoes.com/m/10008502-christmas_carol"
urls[366] = "https://www.rottentomatoes.com/m/10009596-old_dogs"
urls[368] = "https://www.rottentomatoes.com/m/high_school_musical_el_desafio"
urls[374] = "https://www.rottentomatoes.com/m/gedo-senki-tales-from-earthsea"
urls[382] = "https://www.rottentomatoes.com/m/10011582-tron_legacy"
urls[433] = "https://www.rottentomatoes.com/m/tini_el_gran_cambio_de_violetta"
urls[441] = "https://www.rottentomatoes.com/m/march_of_the_penguins_2_the_next_step"
urls[449] = "https://www.rottentomatoes.com/m/disneynature_expedition_china"
urls[485] = "https://www.rottentomatoes.com/m/the_last_warrior_2017"
urls[487] = "https://www.rottentomatoes.com/m/the_beatles_get_back_the_rooftop_concert"
#Note: The null entries were Dutch in origin

#deleted null entries
urls.pop(361)
urls.pop(227)

''

In [None]:
#Note: This is going to take a long time again, 8-10 minutes

#getting relevant information from each url
title = []
audience_score = []
critic_score = []
revenue = []
genre = []
runtime = []
director = []
year = []
#This is film rating-PG,R,etc
rating = []
#We will get the full date now, but we will likely need to split it into month and year (and season) later
release_date = []
for url in urls:
    r = requests.get(url)
    if r.status_code != 200:
        raise KeyError
    soup = BeautifulSoup(r.text)

    #getting title
    titl = soup.find("rt-text", slot="title",context="heading")
    title.append(titl.text)
    #getting audience score
    audScore = soup.find("rt-text", slot="audienceScore")
    audience_score.append(audScore.text)
    #getting critic score
    criticScore = soup.find("rt-text", slot="criticsScore")
    critic_score.append(criticScore.text)

    #getting film rating, release decade, and runtime
    texts = soup.find_all("rt-text",attrs={"slot": "metadataProp", "context": "label", "size": "0.875"})
    
    #accounting for movies that don't have a rating in the title
    if len(texts)==1:
        rating.append(None)
        #getting release decade
        year.append(texts[0].text)
        #getting runtime
        runtime.append(None)
    elif len(texts)==2:
        if texts[0] == "TV-G": #Handling the weird Mickey Movie that only has rating and runtime, not year
            rating.append(texts[0])
            #getting release decade
            year.append(None)
            #getting runtime
            runtime.append(texts[1].text)
        else:
            rating.append(None)
            #getting release decade
            year.append(texts[0].text)
            #getting runtime
            runtime.append(texts[1].text)
    else: #len(texts)==3
        rating.append(texts[0].text)
        #getting release decade
        year.append(texts[1].text)
        #getting runtime
        runtime.append(texts[2].text)

    #getting genre(s)
    texts = soup.find_all("rt-text",attrs={"slot": "metadataGenre","size": "0.875"})
    gs = []
    for text in texts:
        gs.append(text.text)
    genre.append(gs)

    #getting release date
    texts = soup.find_all("div",class_="category-wrap",attrs={"data-qa": "item"})
    has_date = False
    has_revenue = False
    has_director = False
    for text in texts:
        current = text.find("dt",class_="key").text
        if current.strip() == "Release Date (Theaters)":
            has_date=True
            temp = text.find("dd",attrs={"data-qa":"item-value-group"}) 
            release_date.append(temp.find("rt-text").text)
    #getting box-office revenue (might not exist)
        if current.strip() == "Box Office (Gross USA)":
            has_revenue = True
            temp = text.find("dd",attrs={"data-qa":"item-value-group"}) 
            revenue.append(temp.find("rt-text").text)
    #getting director
        if current.strip() == "Director":
            has_director = True
            temp = text.find("dd",attrs={"data-qa":"item-value-group"}) 
            director.append(temp.find("rt-link").text)
    if not has_date:
        release_date.append(None)
    if not has_revenue:
            revenue.append(None)
    if not has_director:
            director.append(None)


In [27]:
#putting the information into a dataframe
df = pd.DataFrame({"Title":title,"Audience-Score":audience_score,"Critic-Score":critic_score,"Revenue":revenue,"Release-Date":release_date,"Year":year, "Film-Rating":rating,"Director":director,"Genre":genre,"Runtime":runtime})
#printing to check what we will need to clean next
print(df.head())

                             Title Audience-Score Critic-Score Revenue  \
0  Snow White and the Seven Dwarfs            78%          97%    None   
1                        Pinocchio            33%           0%   $3.7M   
2                         Fantasia            83%          95%  $24.8M   
3             The Reluctant Dragon            66%         100%    None   
4                            Dumbo            70%          95%    None   

             Release-Date  Year Film-Rating          Director  \
0                    None  1937           G        David Hand   
1      Dec 25, 2002, Wide  2002           G   Roberto Benigni   
2   Nov 13, 1940, Limited  1940           G       James Algar   
3    Jan 2, 1941, Limited  1941        None  Alfred L. Werker   
4  Oct 23, 1941, Original  1941           G    Ben Sharpsteen   

                                     Genre Runtime  
0      [Kids & Family, Fantasy, Animation]  1h 20m  
1  [Kids & Family, Fantasy, Comedy, Drama]  1h 47m  
2    

In [None]:
#cleaning the dataframe
#changing all None values to np.nan
df = df.applymap(lambda x: np.nan if x is None else x)
df = df.applymap(lambda x: np.nan if x == '' else x) 
#removing % from audience and critic score
df["Audience-Score"] = df["Audience-Score"].replace("%","", regex=True)
df["Critic-Score"] = df["Critic-Score"].replace("%","", regex=True)
df["Audience-Score"] = df["Audience-Score"].astype("Int64")
df['Critic-Score'] = df['Critic-Score'].astype("Int64")
#Turning revenue into a integer and removing words and symbols
df["Revenue"] = df["Revenue"].replace("\$", "", regex=True)

def convert_revenue(x):
    if isinstance(x, str):
        if "M" in x:
            temp = float(x.replace("M", "")) * 1000000
            return temp
        elif "K" in x:
            return float(x.replace("K", "")) * 1000
        else:
            raise ValueError
    else:
        return np.nan

df["Revenue"] = df["Revenue"].apply(convert_revenue)
print(df.head())

                             Title  Audience-Score  Critic-Score     Revenue  \
0  Snow White and the Seven Dwarfs              78            97         NaN   
1                        Pinocchio              33             0   3700000.0   
2                         Fantasia              83            95  24800000.0   
3             The Reluctant Dragon              66           100         NaN   
4                            Dumbo              70            95         NaN   

             Release-Date  Year Film-Rating          Director  \
0                     NaN  1937           G        David Hand   
1      Dec 25, 2002, Wide  2002           G   Roberto Benigni   
2   Nov 13, 1940, Limited  1940           G       James Algar   
3    Jan 2, 1941, Limited  1941         NaN  Alfred L. Werker   
4  Oct 23, 1941, Original  1941           G    Ben Sharpsteen   

                                     Genre Runtime  
0      [Kids & Family, Fantasy, Animation]  1h 20m  
1  [Kids & Family, Fan

  df["Revenue"] = df["Revenue"].replace("\$", "", regex=True)
  df = df.applymap(lambda x: np.nan if x is None else x)
  df = df.applymap(lambda x: np.nan if x == '' else x)
  df = df.applymap(lambda x: np.nan if x == "<NA>" else x)


In [None]:
#cleaning up the Year and ensuring it is an int
df["Year"] = df["Year"].apply(lambda x: "2025" if x == "Now Playing" else x)
df["Year"] = df.apply(lambda row: (re.sub(r"^[A-Za-z]{3}\s","",row.Year)), axis=1)
#Dealing with weird Mickey Movie case:
df.loc[df["Year"] == 'TV-G', "Film-Rating"] = 'TV-G'
df.loc[df["Year"] == 'TV-G', "Year"] = np.nan
#Making rest into ints
df['Year'] = df['Year'].astype("Int64")
#Note: The box office revenue is not adjusted for inflation, this will need to be accounted for
print(df.head())

                             Title  Audience-Score  Critic-Score     Revenue  \
0  Snow White and the Seven Dwarfs              78            97         NaN   
1                        Pinocchio              33             0   3700000.0   
2                         Fantasia              83            95  24800000.0   
3             The Reluctant Dragon              66           100         NaN   
4                            Dumbo              70            95         NaN   

             Release-Date  Year Film-Rating          Director  \
0                     NaN  1937           G        David Hand   
1      Dec 25, 2002, Wide  2002           G   Roberto Benigni   
2   Nov 13, 1940, Limited  1940           G       James Algar   
3    Jan 2, 1941, Limited  1941         NaN  Alfred L. Werker   
4  Oct 23, 1941, Original  1941           G    Ben Sharpsteen   

                                     Genre Runtime  
0      [Kids & Family, Fantasy, Animation]  1h 20m  
1  [Kids & Family, Fan

In [30]:
#changing runtime into only minutes and making it an int
def runtime_to_minutes(x):
    # handle NaN or missing strings
    if not isinstance(x, str) or x.strip() == "":
        return np.nan  
    # Extract hours and minutes 
    hour = re.search(r"(\d+)h", x)
    minute = re.search(r"(\d+)m", x)
    hours = int(hour.group(1)) if hour else 0
    minutes = int(minute.group(1)) if minute else 0
    return hours * 60 + minutes

df["Runtime"] = df["Runtime"].apply(runtime_to_minutes)

#Cleaning release date
#Removing the word from the release date
df["Release-Date"] = df["Release-Date"].apply(lambda x: re.sub(r",\s[A-Za-z]+$", "", x) if isinstance(x, str) else x)
#Changing it into a datetime object
df["Release-Date"] = pd.to_datetime(df["Release-Date"], errors="coerce")
print(df.head())

                             Title  Audience-Score  Critic-Score     Revenue  \
0  Snow White and the Seven Dwarfs              78            97         NaN   
1                        Pinocchio              33             0   3700000.0   
2                         Fantasia              83            95  24800000.0   
3             The Reluctant Dragon              66           100         NaN   
4                            Dumbo              70            95         NaN   

  Release-Date  Year Film-Rating          Director  \
0          NaT  1937           G        David Hand   
1   2002-12-25  2002           G   Roberto Benigni   
2   1940-11-13  1940           G       James Algar   
3   1941-01-02  1941         NaN  Alfred L. Werker   
4   1941-10-23  1941           G    Ben Sharpsteen   

                                     Genre  Runtime  
0      [Kids & Family, Fantasy, Animation]     80.0  
1  [Kids & Family, Fantasy, Comedy, Drama]    107.0  
2      [Kids & Family, Fantasy, 

In [37]:
#confirming everything is clean by getting unique counts
print("Column: Film-Rating")
print(df["Film-Rating"].unique())
print("-" * 40)
print("Column: Revenue")
print(df["Revenue"].unique())
print("-" * 40)
print("Column: Runtime")
print(df["Runtime"].unique())
print("-" * 40)
print("Column: Year")
print(df["Year"].unique())
print("-" * 40)
print("Column: Audience Score")
print(df["Audience-Score"].unique())
print("-" * 40)
print("Column: Critic Score")
print(df["Critic-Score"].unique())
print("-" * 40)

Column: Film-Rating
['G' nan 'PG' 'TV-G' 'TV-PG' 'PG-13' 'R' 'TV-14']
----------------------------------------
Column: Revenue
[      nan 3.700e+06 2.480e+07 3.810e+07 3.350e+07 9.130e+07 6.630e+07
 1.830e+07 1.102e+08 8.500e+06 1.240e+07 3.730e+07 1.720e+07 2.200e+07
 1.370e+07 1.302e+08 7.900e+06 1.810e+07 1.490e+07 7.300e+06 1.382e+05
 2.700e+06 5.870e+07 5.070e+07 3.556e+08 2.740e+07 4.160e+07 1.290e+07
 2.380e+07 1.700e+06 2.060e+07 3.060e+07 4.560e+07 8.700e+06 9.560e+07
 5.020e+07 3.300e+06 1.455e+08 4.320e+07 1.770e+07 1.910e+07 8.200e+06
 3.530e+07 1.416e+08 2.470e+07 1.340e+07 3.180e+04 6.700e+06 2.390e+07
 3.430e+07 3.280e+07 2.890e+07 1.001e+08 2.650e+07 2.300e+07 5.990e+07
 9.910e+07 1.053e+08 2.460e+07 9.300e+07 2.140e+07 4.400e+06 1.206e+08
 1.628e+08 5.060e+07 3.680e+07 1.940e+07 9.740e+07 6.200e+06 2.459e+08
 6.050e+07 6.970e+07 1.158e+08 6.690e+07 8.960e+07 3.670e+07 1.082e+08
 1.730e+07 4.940e+05 8.120e+07 4.840e+07 1.700e+07 1.000e+07 1.920e+07
 1.392e+08 6.730e+07 

In [None]:
#making sure everything is its proper type (str or int or such)
print(df.dtypes)
print("_______________________________")
#Fixing those that are incorrect
df["Title"].astype(str)
df["Film-Rating"].astype(str)
df["Director"].astype(str)
df["Runtime"] = df["Runtime"].round(0).astype("Int64")
df["Revenue"] = df["Revenue"].round(0).astype("Int64")
#checking again
print(df.dtypes)

Title                     object
Audience-Score             Int64
Critic-Score               Int64
Revenue                    Int64
Release-Date      datetime64[ns]
Year                       Int64
Film-Rating               object
Director                  object
Genre                     object
Runtime                    Int64
dtype: object
_______________________________
Title                     object
Audience-Score             Int64
Critic-Score               Int64
Revenue                    Int64
Release-Date      datetime64[ns]
Year                       Int64
Film-Rating               object
Director                  object
Genre                     object
Runtime                    Int64
dtype: object
                             Title  Audience-Score  Critic-Score   Revenue  \
0  Snow White and the Seven Dwarfs              78            97      <NA>   
1                        Pinocchio              33             0   3700000   
2                         Fantasia            

In [None]:
#creating month, year, decade, and season column from "Release-Date" and deleting release date column


In [None]:
#saving the dataframe as a csv


In [None]:
#Analysis Section - From here on, rating refers to score unless otherwise specified

In [None]:
#What is the average movie rating each year? (audience and critic)

In [None]:
#Average movie rating each decade (audience and critic)

In [None]:
#Note section:
#What is the trend in ratings? (rising, falling, stagnant, other)

#Is it different between audiences and critics? If so, by what degree? 

#Any other anomalies you notice? Any dips or highs? Why might that be?


In [None]:
#What is the average box office revenue each year? 


In [None]:
#Revenue for each decade


In [None]:
#Revenue for each season? 


In [None]:
#Does revenue correlate with ratings? 


In [None]:
#Does revenue correlate more strongly with audience or critic ratings?


In [None]:
#Note: What is the trend regarding revenue?

#Please note any other anomalies and your explanation (it's okay if you don't know)


In [None]:
#How many total directors have there been? 


In [None]:
#How many different directors have there been each year? Each decade?


In [None]:
#What is the average revenue of movies made by each director?


In [None]:
#What is the average rating of movies made by each director?


In [None]:
#Do directors with more movies have better ratings?


In [None]:
#Do directors movies get better ratings as they make more movies?


In [None]:
#Note: Make any notes about directors and relationships with other variables



In [None]:
#Any interesting relationships between film rating and rating?


In [None]:
#What about film rating and revenue? 


In [None]:
#Note: Note anything interesting regarding film rating



In [None]:
#Any interesting relationships between genre and rating?


In [None]:
#What about genre and revenue?


In [None]:
#What is the count of different film ratings in each genre?


In [None]:
#Note: Add anything interesting regarding genre



In [None]:
#Note: Add any overall thoughts

