In [349]:
import pandas as pd
import sqlite3

In [350]:
im_conn = sqlite3.connect("../data/im.db")

In [351]:
bom_df = pd.read_csv("../zippedData/bom.movie_gross.csv.gz")
bom_df.head()

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010


In [352]:
movieinfo_df = pd.read_csv("../zippedData/rt.movie_info.tsv.gz", sep='\t', compression='gzip')
movieinfo_df.head()

Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,currency,box_office,runtime,studio
0,1,"This gritty, fast-paced, and innovative police...",R,Action and Adventure|Classics|Drama,William Friedkin,Ernest Tidyman,"Oct 9, 1971","Sep 25, 2001",,,104 minutes,
1,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,"Aug 17, 2012","Jan 1, 2013",$,600000.0,108 minutes,Entertainment One
2,5,Illeana Douglas delivers a superb performance ...,R,Drama|Musical and Performing Arts,Allison Anders,Allison Anders,"Sep 13, 1996","Apr 18, 2000",,,116 minutes,
3,6,Michael Douglas runs afoul of a treacherous su...,R,Drama|Mystery and Suspense,Barry Levinson,Paul Attanasio|Michael Crichton,"Dec 9, 1994","Aug 27, 1997",,,128 minutes,
4,7,,NR,Drama|Romance,Rodney Bennett,Giles Cooper,,,,,200 minutes,


In [353]:
reviews_df = pd.read_csv(
    "../zippedData/rt.reviews.tsv.gz",
    sep='\t',
    compression='gzip',
    encoding='latin1'   # or encoding='ISO-8859-1'
)
reviews_df.head()


Unnamed: 0,id,review,rating,fresh,critic,top_critic,publisher,date
0,3,A distinctly gallows take on contemporary fina...,3/5,fresh,PJ Nabarro,0,Patrick Nabarro,"November 10, 2018"
1,3,It's an allegory in search of a meaning that n...,,rotten,Annalee Newitz,0,io9.com,"May 23, 2018"
2,3,... life lived in a bubble in financial dealin...,,fresh,Sean Axmaker,0,Stream on Demand,"January 4, 2018"
3,3,Continuing along a line introduced in last yea...,,fresh,Daniel Kasman,0,MUBI,"November 16, 2017"
4,3,... a perverse twist on neorealism...,,fresh,,0,Cinema Scope,"October 12, 2017"


In [354]:
tmdb_df = pd.read_csv("../zippedData/tmdb.movies.csv.gz")
tmdb_df.head()

Unnamed: 0.1,Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186


In [355]:
budgets_df = pd.read_csv("../zippedData/tn.movie_budgets.csv.gz")
budgets_df.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


Basic Cleaning

In [356]:
# check shape and missing values for all datasets

datasets = {
    "BOM": bom_df,
    "Movie Info": movieinfo_df,
    "Reviews": reviews_df,
    "TMDB": tmdb_df,
    "Budgets": budgets_df
}

for name, df in datasets.items():
    print(f"\n{name} dataset: {df.shape[0]} rows, {df.shape[1]} columns")
    print("Missing values summary:")
    display(df.isnull().sum().sort_values(ascending=True).head(10))



BOM dataset: 3387 rows, 5 columns
Missing values summary:


title                0
year                 0
studio               5
domestic_gross      28
foreign_gross     1350
dtype: int64


Movie Info dataset: 1560 rows, 12 columns
Missing values summary:


id                 0
rating             3
genre              8
runtime           30
synopsis          62
director         199
theater_date     359
dvd_date         359
writer           449
studio          1066
dtype: int64


Reviews dataset: 54432 rows, 8 columns
Missing values summary:


id                0
fresh             0
top_critic        0
date              0
publisher       309
critic         2722
review         5563
rating        13517
dtype: int64


TMDB dataset: 26517 rows, 10 columns
Missing values summary:


Unnamed: 0           0
genre_ids            0
id                   0
original_language    0
original_title       0
popularity           0
release_date         0
title                0
vote_average         0
vote_count           0
dtype: int64


Budgets dataset: 5782 rows, 6 columns
Missing values summary:


id                   0
release_date         0
movie                0
production_budget    0
domestic_gross       0
worldwide_gross      0
dtype: int64

In [357]:
#drop duplicates

for name, df in datasets.items():
    df.drop_duplicates(inplace=True)


In [358]:
# this helps you identify if numeric columns like budgets or grosses are stored as text

for name, df in datasets.items():
    display(df.dtypes)



title              object
studio             object
domestic_gross    float64
foreign_gross      object
year                int64
dtype: object

id               int64
synopsis        object
rating          object
genre           object
director        object
writer          object
theater_date    object
dvd_date        object
currency        object
box_office      object
runtime         object
studio          object
dtype: object

id             int64
review        object
rating        object
fresh         object
critic        object
top_critic     int64
publisher     object
date          object
dtype: object

Unnamed: 0             int64
genre_ids             object
id                     int64
original_language     object
original_title        object
popularity           float64
release_date          object
title                 object
vote_average         float64
vote_count             int64
dtype: object

id                    int64
release_date         object
movie                object
production_budget    object
domestic_gross       object
worldwide_gross      object
dtype: object

In [359]:
# Remove $ and commas, convert to integer

for col in ["production_budget", "domestic_gross", "worldwide_gross"]:
    budgets_df[col] = (budgets_df[col].replace('[\$,]', '', regex=True).astype(float))

budgets_df[["production_budget", "domestic_gross", "worldwide_gross"]].head()


Unnamed: 0,production_budget,domestic_gross,worldwide_gross
0,425000000.0,760507625.0,2776345000.0
1,410600000.0,241063875.0,1045664000.0
2,350000000.0,42762350.0,149762400.0
3,330600000.0,459005868.0,1403014000.0
4,317000000.0,620181382.0,1316722000.0


In [360]:
# Make column names consistent across datasets; all lowercase, no spaces.

bom_df.columns = bom_df.columns.str.lower().str.replace(" ", "_")

movieinfo_df.columns = movieinfo_df.columns.str.lower().str.replace(" ", "_")

reviews_df.columns = reviews_df.columns.str.lower().str.replace(" ", "_")

tmdb_df.columns = tmdb_df.columns.str.lower().str.replace(" ", "_")

budgets_df.columns = budgets_df.columns.str.lower().str.replace(" ", "_")



Loading Cleaned Data into SQLite

In [361]:
# Create a new database (or connect if exists)
conn = sqlite3.connect("../data/movies_cleaned.db")

# Save each dataframe as a SQL table

bom_df.to_sql("bom_gross", conn, if_exists="replace", index=False)

movieinfo_df.to_sql("rt_movie_info", conn, if_exists="replace", index=False)

reviews_df.to_sql("rt_reviews", conn, if_exists="replace", index=False)

tmdb_df.to_sql("tmdb_movies", conn, if_exists="replace", index=False)

budgets_df.to_sql("movie_budgets", conn, if_exists="replace", index=False)

print("Cleaned datasets loaded into SQLite database successfully!")


Cleaned datasets loaded into SQLite database successfully!


In [362]:
# verification of tables in SQLite

pd.read_sql("SELECT name FROM sqlite_master WHERE type='table';", conn)


Unnamed: 0,name
0,bom_gross
1,rt_movie_info
2,rt_reviews
3,tmdb_movies
4,movie_budgets


In [363]:
pd.read_sql("PRAGMA table_info(rt_movie_info);", conn)


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,id,INTEGER,0,,0
1,1,synopsis,TEXT,0,,0
2,2,rating,TEXT,0,,0
3,3,genre,TEXT,0,,0
4,4,director,TEXT,0,,0
5,5,writer,TEXT,0,,0
6,6,theater_date,TEXT,0,,0
7,7,dvd_date,TEXT,0,,0
8,8,currency,TEXT,0,,0
9,9,box_office,TEXT,0,,0


In [364]:
for table in ["bom_gross", "rt_reviews", "tmdb_movies", "movie_budgets"]:
    print(f"\n{table} columns:")
    display(pd.read_sql(f"PRAGMA table_info({table});", conn))



bom_gross columns:


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,title,TEXT,0,,0
1,1,studio,TEXT,0,,0
2,2,domestic_gross,REAL,0,,0
3,3,foreign_gross,TEXT,0,,0
4,4,year,INTEGER,0,,0



rt_reviews columns:


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,id,INTEGER,0,,0
1,1,review,TEXT,0,,0
2,2,rating,TEXT,0,,0
3,3,fresh,TEXT,0,,0
4,4,critic,TEXT,0,,0
5,5,top_critic,INTEGER,0,,0
6,6,publisher,TEXT,0,,0
7,7,date,TEXT,0,,0



tmdb_movies columns:


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,unnamed:_0,INTEGER,0,,0
1,1,genre_ids,TEXT,0,,0
2,2,id,INTEGER,0,,0
3,3,original_language,TEXT,0,,0
4,4,original_title,TEXT,0,,0
5,5,popularity,REAL,0,,0
6,6,release_date,TEXT,0,,0
7,7,title,TEXT,0,,0
8,8,vote_average,REAL,0,,0
9,9,vote_count,INTEGER,0,,0



movie_budgets columns:


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,id,INTEGER,0,,0
1,1,release_date,TEXT,0,,0
2,2,movie,TEXT,0,,0
3,3,production_budget,REAL,0,,0
4,4,domestic_gross,REAL,0,,0
5,5,worldwide_gross,REAL,0,,0


OBJECTIVE 1: Identify High-Performing Studios & Genres

We start by finding which studios consistently produce the highest grossing movies. 
 
The `bom_gross` and `movie_budgets` tables are used here.

In [365]:
query_studio_performance = """
SELECT
    bg.studio,
    ROUND(AVG(mb.worldwide_gross), 2) AS avg_worldwide_gross,
    COUNT(mb.movie) AS num_movies
FROM movie_budgets mb
JOIN bom_gross bg
    ON mb.movie = bg.title
GROUP BY bg.studio
HAVING num_movies > 3
ORDER BY avg_worldwide_gross DESC
LIMIT 10;
"""

studio_performance_df = pd.read_sql(query_studio_performance, conn)
studio_performance_df


Unnamed: 0,studio,avg_worldwide_gross,num_movies
0,P/DW,507802800.0,10
1,BV,462305800.0,72
2,Fox,243598300.0,110
3,Sony,237862300.0,74
4,Uni.,233583700.0,117
5,WB (NL),230834200.0,37
6,WB,217586400.0,102
7,Par.,195110900.0,74
8,LG/S,123094400.0,31
9,Sum.,119886500.0,12


OBJECTIVE 2: Analyzing ROI (Profitability)

This calculate each movie’s ROI to identify which films and studios achieve the best returns.


In [366]:
query_roi = """
SELECT
    mb.movie,
    bg.studio,
    ROUND((mb.worldwide_gross - mb.production_budget) / mb.production_budget, 2) AS ROI,
    mb.worldwide_gross,
    mb.production_budget
FROM movie_budgets mb
JOIN bom_gross bg
    ON mb.movie = bg.title
WHERE mb.production_budget > 0
ORDER BY ROI DESC
LIMIT 10;
"""

roi_df = pd.read_sql(query_roi, conn)
roi_df


Unnamed: 0,movie,studio,ROI,worldwide_gross,production_budget
0,The Gallows,WB (NL),415.56,41656474.0,100000.0
1,The Devil Inside,Par.,100.76,101759490.0,1000000.0
2,Insidious,FD,65.58,99870886.0,1500000.0
3,Unfriended,Uni.,63.36,64364198.0,1000000.0
4,Paranormal Activity 2,Par.,58.17,177512032.0,3000000.0
5,Split,Uni.,54.79,278964806.0,5000000.0
6,Get Out,Uni.,50.07,255367951.0,5000000.0
7,Chernobyl Diaries,WB,41.41,42411721.0,1000000.0
8,Paranormal Activity 3,Par.,40.41,207039844.0,5000000.0
9,Annabelle,WB (NL),38.52,256862920.0,6500000.0


OBJECTIVE 3: Audience Ratings and Popularity

This shows which genres and types of films receive high audience ratings and votes using TMDb data.


In [367]:
query_ratings = """
SELECT
    tm.genre_ids,
    ROUND(AVG(tm.vote_average), 2) AS avg_rating,
    COUNT(*) AS num_movies
FROM tmdb_movies tm
GROUP BY tm.genre_ids
HAVING num_movies > 5
ORDER BY avg_rating DESC
LIMIT 10;
"""

ratings_df = pd.read_sql(query_ratings, conn)
ratings_df


Unnamed: 0,genre_ids,avg_rating,num_movies
0,"[37, 18, 10402]",10.0,6
1,"[99, 18, 10751]",8.13,7
2,"[99, 36, 10402]",7.89,8
3,"[99, 36, 10770]",7.8,6
4,"[53, 16, 14]",7.8,6
5,"[10402, 99, 36]",7.67,6
6,"[10402, 18]",7.42,27
7,"[14, 16, 10751]",7.41,7
8,"[99, 80]",7.35,12
9,"[10751, 99]",7.32,6


OBJECTIVE 4: Movie Performance Trends Over Time

We’ll explore whether movie performance has improved or declined over time, based on box office and budgets.


In [368]:
query_trends = """
SELECT
    bg.year,
    ROUND(AVG(mb.worldwide_gross), 2) AS avg_gross,
    ROUND(AVG(mb.production_budget), 2) AS avg_budget,
    COUNT(*) AS num_movies
FROM movie_budgets mb
JOIN bom_gross bg
    ON mb.movie = bg.title
GROUP BY bg.year
HAVING num_movies > 5
ORDER BY bg.year ASC;
"""

trends_df = pd.read_sql(query_trends, conn)
trends_df.head()


Unnamed: 0,year,avg_gross,avg_budget,num_movies
0,2010,102778500.0,38876128.53,184
1,2011,119907200.0,43302827.38,168
2,2012,146786900.0,46617118.06,144
3,2013,160772500.0,51617857.14,140
4,2014,161740600.0,45311776.35,128


OBJECTIVE 5: Identify Top Performing Movies

This involves finding the most profitable and highest rated movies across all sources.


In [369]:
query_best_movies = """
SELECT
    mb.movie,
    bg.studio,
    tm.vote_average AS rating,
    ROUND((mb.worldwide_gross - mb.production_budget) / mb.production_budget, 2) AS ROI,
    mb.worldwide_gross
FROM movie_budgets mb
JOIN bom_gross bg
    ON mb.movie = bg.title
JOIN tmdb_movies tm
    ON mb.movie = tm.title
WHERE mb.production_budget > 0
ORDER BY ROI DESC, rating DESC
LIMIT 10;
"""

best_movies_df = pd.read_sql(query_best_movies, conn)
best_movies_df


Unnamed: 0,movie,studio,rating,ROI,worldwide_gross
0,The Gallows,WB (NL),4.8,415.56,41656474.0
1,The Devil Inside,Par.,4.7,100.76,101759490.0
2,Insidious,FD,6.9,65.58,99870886.0
3,Unfriended,Uni.,5.4,63.36,64364198.0
4,Paranormal Activity 2,Par.,5.7,58.17,177512032.0
5,Split,Uni.,7.2,54.79,278964806.0
6,Split,Uni.,5.3,54.79,278964806.0
7,Split,Uni.,5.0,54.79,278964806.0
8,Split,Uni.,4.8,54.79,278964806.0
9,Get Out,Uni.,7.5,50.07,255367951.0


**SQL INSIGHTS**


**Top Studios:** The most successful studios generate high worldwide grosses consistently.  

**ROI Leaders:** Low-budget, high-grossing films show strong profit potential.  

**Ratings:** Some genres (from TMDb `genre_ids`) correlate with higher average audience ratings. 

**Trends:** Movie budgets and grosses have shifted over years, showing changing audience interests.  

**Top Titles:** Combining ROI and ratings highlights films that are both profitable and popular.

## Data Cleaning<hr>
In this section, we do the final cleaning the data from the data sources

#### 1 BOM DF

In [370]:
# Load bom_gross
bbom_df = pd.read_sql('''
 SELECT * FROM bom_gross;
''', conn)

In [371]:
bbom_df.head()

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010


In [372]:
bbom_df.describe()

Unnamed: 0,domestic_gross,year
count,3359.0,3387.0
mean,28745850.0,2013.958075
std,66982500.0,2.478141
min,100.0,2010.0
25%,120000.0,2012.0
50%,1400000.0,2014.0
75%,27900000.0,2016.0
max,936700000.0,2018.0


In [373]:
bbom_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           3387 non-null   object 
 1   studio          3382 non-null   object 
 2   domestic_gross  3359 non-null   float64
 3   foreign_gross   2037 non-null   object 
 4   year            3387 non-null   int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 132.4+ KB


In [374]:
(2037/3387)*100

60.14171833480957

In [375]:
bbom_df["studio"].value_counts()

IFC         166
Uni.        147
WB          140
Fox         136
Magn.       136
           ... 
Blue Fox      1
AM            1
GrtIndia      1
Rog.          1
KC            1
Name: studio, Length: 257, dtype: int64

In [376]:
bbom_df[bom_df["studio"].isna()]

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
210,Outside the Law (Hors-la-loi),,96900.0,3300000.0,2010
555,Fireflies in the Garden,,70600.0,3300000.0,2011
933,Keith Lemon: The Film,,,4000000.0,2012
1862,Plot for Peace,,7100.0,,2014
2825,Secret Superstar,,,122000000.0,2017


In [377]:
bbom_df[bom_df["studio"]=="NotSpecified"]

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year


In [378]:
bbom_df["studio"].fillna("NotSpecified", inplace=True)

In [379]:
bbom_df[bom_df["domestic_gross"].isna() & bom_df["foreign_gross"].isna()]

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year


In [380]:
bbom_df[bom_df["domestic_gross"].isna()]

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
230,It's a Wonderful Afterlife,UTV,,1300000,2010
298,Celine: Through the Eyes of the World,Sony,,119000,2010
302,White Lion,Scre.,,99600,2010
306,Badmaash Company,Yash,,64400,2010
327,Aashayein (Wishes),Relbig.,,3800,2010
537,Force,FoxS,,4800000,2011
713,Empire of Silver,NeoC,,19000,2011
871,Solomon Kane,RTWC,,19600000,2012
928,The Tall Man,Imag.,,5200000,2012
933,Keith Lemon: The Film,NotSpecified,,4000000,2012


In [381]:
bbom_df.dropna(subset=["domestic_gross", "foreign_gross"], inplace=True)

We've decided to drop the columns that have missing revenue numbers since we want accurate numbers and filling in with mean or median may inflate or deflate some films hence giving us wrong insights

In [382]:
bbom_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2009 entries, 0 to 3353
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           2009 non-null   object 
 1   studio          2009 non-null   object 
 2   domestic_gross  2009 non-null   float64
 3   foreign_gross   2009 non-null   object 
 4   year            2009 non-null   int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 94.2+ KB


In [383]:
#Connecting to cleaned movies
cleaned_conn = sqlite3.connect("../cleaned_data/movies.db")

In [384]:
# Saving to cleaned database
bbom_df.to_sql("bom_gross", cleaned_conn, if_exists="replace", index=False)

#### 2. RT Movie Info

In [385]:
# Load RT Movie info
bmovieinfo_df = pd.read_sql('''
 SELECT * FROM rt_movie_info
''', conn)

In [386]:
bmovieinfo_df.head()

Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,currency,box_office,runtime,studio
0,1,"This gritty, fast-paced, and innovative police...",R,Action and Adventure|Classics|Drama,William Friedkin,Ernest Tidyman,"Oct 9, 1971","Sep 25, 2001",,,104 minutes,
1,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,"Aug 17, 2012","Jan 1, 2013",$,600000.0,108 minutes,Entertainment One
2,5,Illeana Douglas delivers a superb performance ...,R,Drama|Musical and Performing Arts,Allison Anders,Allison Anders,"Sep 13, 1996","Apr 18, 2000",,,116 minutes,
3,6,Michael Douglas runs afoul of a treacherous su...,R,Drama|Mystery and Suspense,Barry Levinson,Paul Attanasio|Michael Crichton,"Dec 9, 1994","Aug 27, 1997",,,128 minutes,
4,7,,NR,Drama|Romance,Rodney Bennett,Giles Cooper,,,,,200 minutes,


In [387]:
#Describe
bmovieinfo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1560 entries, 0 to 1559
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            1560 non-null   int64 
 1   synopsis      1498 non-null   object
 2   rating        1557 non-null   object
 3   genre         1552 non-null   object
 4   director      1361 non-null   object
 5   writer        1111 non-null   object
 6   theater_date  1201 non-null   object
 7   dvd_date      1201 non-null   object
 8   currency      340 non-null    object
 9   box_office    340 non-null    object
 10  runtime       1530 non-null   object
 11  studio        494 non-null    object
dtypes: int64(1), object(11)
memory usage: 146.4+ KB


In [388]:
# Checking for all the currencies in the table
bmovieinfo_df["currency"].value_counts()

$    340
Name: currency, dtype: int64

We are dropping currency columns since they are all in dollars hence it is a redudant column

In [389]:
bmovieinfo_df.drop(columns="currency", inplace=True)

In [390]:
bmovieinfo_df

Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,box_office,runtime,studio
0,1,"This gritty, fast-paced, and innovative police...",R,Action and Adventure|Classics|Drama,William Friedkin,Ernest Tidyman,"Oct 9, 1971","Sep 25, 2001",,104 minutes,
1,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,"Aug 17, 2012","Jan 1, 2013",600000,108 minutes,Entertainment One
2,5,Illeana Douglas delivers a superb performance ...,R,Drama|Musical and Performing Arts,Allison Anders,Allison Anders,"Sep 13, 1996","Apr 18, 2000",,116 minutes,
3,6,Michael Douglas runs afoul of a treacherous su...,R,Drama|Mystery and Suspense,Barry Levinson,Paul Attanasio|Michael Crichton,"Dec 9, 1994","Aug 27, 1997",,128 minutes,
4,7,,NR,Drama|Romance,Rodney Bennett,Giles Cooper,,,,200 minutes,
...,...,...,...,...,...,...,...,...,...,...,...
1555,1996,Forget terrorists or hijackers -- there's a ha...,R,Action and Adventure|Horror|Mystery and Suspense,,,"Aug 18, 2006","Jan 2, 2007",33886034,106 minutes,New Line Cinema
1556,1997,The popular Saturday Night Live sketch was exp...,PG,Comedy|Science Fiction and Fantasy,Steve Barron,Terry Turner|Tom Davis|Dan Aykroyd|Bonnie Turner,"Jul 23, 1993","Apr 17, 2001",,88 minutes,Paramount Vantage
1557,1998,"Based on a novel by Richard Powell, when the l...",G,Classics|Comedy|Drama|Musical and Performing Arts,Gordon Douglas,,"Jan 1, 1962","May 11, 2004",,111 minutes,
1558,1999,The Sandlot is a coming-of-age story about a g...,PG,Comedy|Drama|Kids and Family|Sports and Fitness,David Mickey Evans,David Mickey Evans|Robert Gunter,"Apr 1, 1993","Jan 29, 2002",,101 minutes,


In [391]:
bmovieinfo_df[bmovieinfo_df["box_office"].isna() == False]["box_office"]

1          600,000
6       41,032,915
7          224,114
8          134,904
15       1,039,869
           ...    
1541    25,335,935
1542     1,416,189
1545        59,371
1546       794,306
1555    33,886,034
Name: box_office, Length: 340, dtype: object

In [392]:
# Marking missing box_office_missing since it's an important column even though most values are missing
# This makes it easier to filter out filled in values
bmovieinfo_df['box_office_missing'] = bmovieinfo_df['box_office'].isna()

In [393]:
#bmovieinfo_df.drop(columns="box_office_missing", inplace=True)

In [394]:
bmovieinfo_df.head(10)

Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,box_office,runtime,studio,box_office_missing
0,1,"This gritty, fast-paced, and innovative police...",R,Action and Adventure|Classics|Drama,William Friedkin,Ernest Tidyman,"Oct 9, 1971","Sep 25, 2001",,104 minutes,,True
1,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,"Aug 17, 2012","Jan 1, 2013",600000.0,108 minutes,Entertainment One,False
2,5,Illeana Douglas delivers a superb performance ...,R,Drama|Musical and Performing Arts,Allison Anders,Allison Anders,"Sep 13, 1996","Apr 18, 2000",,116 minutes,,True
3,6,Michael Douglas runs afoul of a treacherous su...,R,Drama|Mystery and Suspense,Barry Levinson,Paul Attanasio|Michael Crichton,"Dec 9, 1994","Aug 27, 1997",,128 minutes,,True
4,7,,NR,Drama|Romance,Rodney Bennett,Giles Cooper,,,,200 minutes,,True
5,8,The year is 1942. As the Allies unite overseas...,PG,Drama|Kids and Family,Jay Russell,Gail Gilchriest,"Mar 3, 2000","Jul 11, 2000",,95 minutes,Warner Bros. Pictures,True
6,10,Some cast and crew from NBC's highly acclaimed...,PG-13,Comedy,Jake Kasdan,Mike White,"Jan 11, 2002","Jun 18, 2002",41032915.0,82 minutes,Paramount Pictures,False
7,13,"Stewart Kane, an Irishman living in the Austra...",R,Drama,Ray Lawrence,Raymond Carver|Beatrix Christian,"Apr 27, 2006","Oct 2, 2007",224114.0,123 minutes,Sony Pictures Classics,False
8,14,"""Love Ranch"" is a bittersweet love story that ...",R,Drama,Taylor Hackford,Mark Jacobson,"Jun 30, 2010","Nov 9, 2010",134904.0,117 minutes,,False
9,15,When a diamond expedition in the Congo is lost...,PG-13,Action and Adventure|Mystery and Suspense|Scie...,Frank Marshall,John Patrick Shanley,"Jun 9, 1995","Jul 27, 1999",,108 minutes,,True


In [395]:
# Remove commas
bmovieinfo_df['box_office'] = (
    bmovieinfo_df['box_office']
    .replace('None', pd.NA)        
    .str.replace(',', '', regex=True)  # remove commas
)


In [396]:
# Convert box_office to numeric data type
bmovieinfo_df["box_office"] = pd.to_numeric(bmovieinfo_df["box_office"], errors="coerce")

In [397]:

bmovieinfo_df.head(10)

Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,box_office,runtime,studio,box_office_missing
0,1,"This gritty, fast-paced, and innovative police...",R,Action and Adventure|Classics|Drama,William Friedkin,Ernest Tidyman,"Oct 9, 1971","Sep 25, 2001",,104 minutes,,True
1,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,"Aug 17, 2012","Jan 1, 2013",600000.0,108 minutes,Entertainment One,False
2,5,Illeana Douglas delivers a superb performance ...,R,Drama|Musical and Performing Arts,Allison Anders,Allison Anders,"Sep 13, 1996","Apr 18, 2000",,116 minutes,,True
3,6,Michael Douglas runs afoul of a treacherous su...,R,Drama|Mystery and Suspense,Barry Levinson,Paul Attanasio|Michael Crichton,"Dec 9, 1994","Aug 27, 1997",,128 minutes,,True
4,7,,NR,Drama|Romance,Rodney Bennett,Giles Cooper,,,,200 minutes,,True
5,8,The year is 1942. As the Allies unite overseas...,PG,Drama|Kids and Family,Jay Russell,Gail Gilchriest,"Mar 3, 2000","Jul 11, 2000",,95 minutes,Warner Bros. Pictures,True
6,10,Some cast and crew from NBC's highly acclaimed...,PG-13,Comedy,Jake Kasdan,Mike White,"Jan 11, 2002","Jun 18, 2002",41032915.0,82 minutes,Paramount Pictures,False
7,13,"Stewart Kane, an Irishman living in the Austra...",R,Drama,Ray Lawrence,Raymond Carver|Beatrix Christian,"Apr 27, 2006","Oct 2, 2007",224114.0,123 minutes,Sony Pictures Classics,False
8,14,"""Love Ranch"" is a bittersweet love story that ...",R,Drama,Taylor Hackford,Mark Jacobson,"Jun 30, 2010","Nov 9, 2010",134904.0,117 minutes,,False
9,15,When a diamond expedition in the Congo is lost...,PG-13,Action and Adventure|Mystery and Suspense|Scie...,Frank Marshall,John Patrick Shanley,"Jun 9, 1995","Jul 27, 1999",,108 minutes,,True


In [398]:
bmovieinfo_df.describe()

Unnamed: 0,id,box_office
count,1560.0,340.0
mean,1007.303846,37906010.0
std,579.164527,57491590.0
min,1.0,363.0
25%,504.75,1905152.0
50%,1007.5,14141050.0
75%,1503.25,44825240.0
max,2000.0,368000000.0


In [399]:
rt_info_bo_median = bmovieinfo_df["box_office"].median()
rt_info_bo_median

14141054.5

In [400]:
# Filling missing box office values with the box_office column median
bmovieinfo_df["box_office"].fillna(rt_info_bo_median,inplace=True)

In [401]:
bmovieinfo_df.head(10)

Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,box_office,runtime,studio,box_office_missing
0,1,"This gritty, fast-paced, and innovative police...",R,Action and Adventure|Classics|Drama,William Friedkin,Ernest Tidyman,"Oct 9, 1971","Sep 25, 2001",14141054.5,104 minutes,,True
1,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,"Aug 17, 2012","Jan 1, 2013",600000.0,108 minutes,Entertainment One,False
2,5,Illeana Douglas delivers a superb performance ...,R,Drama|Musical and Performing Arts,Allison Anders,Allison Anders,"Sep 13, 1996","Apr 18, 2000",14141054.5,116 minutes,,True
3,6,Michael Douglas runs afoul of a treacherous su...,R,Drama|Mystery and Suspense,Barry Levinson,Paul Attanasio|Michael Crichton,"Dec 9, 1994","Aug 27, 1997",14141054.5,128 minutes,,True
4,7,,NR,Drama|Romance,Rodney Bennett,Giles Cooper,,,14141054.5,200 minutes,,True
5,8,The year is 1942. As the Allies unite overseas...,PG,Drama|Kids and Family,Jay Russell,Gail Gilchriest,"Mar 3, 2000","Jul 11, 2000",14141054.5,95 minutes,Warner Bros. Pictures,True
6,10,Some cast and crew from NBC's highly acclaimed...,PG-13,Comedy,Jake Kasdan,Mike White,"Jan 11, 2002","Jun 18, 2002",41032915.0,82 minutes,Paramount Pictures,False
7,13,"Stewart Kane, an Irishman living in the Austra...",R,Drama,Ray Lawrence,Raymond Carver|Beatrix Christian,"Apr 27, 2006","Oct 2, 2007",224114.0,123 minutes,Sony Pictures Classics,False
8,14,"""Love Ranch"" is a bittersweet love story that ...",R,Drama,Taylor Hackford,Mark Jacobson,"Jun 30, 2010","Nov 9, 2010",134904.0,117 minutes,,False
9,15,When a diamond expedition in the Congo is lost...,PG-13,Action and Adventure|Mystery and Suspense|Scie...,Frank Marshall,John Patrick Shanley,"Jun 9, 1995","Jul 27, 1999",14141054.5,108 minutes,,True


We have handled the box office column by marking missing values as missing and then filling them with median

In [402]:
bmovieinfo_df[bmovieinfo_df["director"].isna()]

Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,box_office,runtime,studio,box_office_missing
10,17,,,,,,,,14141054.5,,,True
11,18,"In 1979, Bill Viola and Frank Caliguri dreamed...",NR,Documentary,,Robert Zullo,,,14141054.5,,Showtime Documentary Films,True
12,19,While Microsoft may be the biggest software co...,NR,Documentary|Special Interest,,,"Aug 23, 2002","Sep 30, 2003",14141054.5,90 minutes,Seventh Art Releasing,True
16,23,A fictional film set in the alluring world of ...,R,Drama,,,"Dec 20, 2013","Mar 18, 2014",99165609.0,129 minutes,Sony Pictures,False
20,27,,NR,Musical and Performing Arts,,,,,14141054.5,,,True
...,...,...,...,...,...,...,...,...,...,...,...,...
1543,1982,,,,,,,,14141054.5,,,True
1546,1986,Aki Kaurismaki's The Man Without a Past opens ...,PG,Art House and International|Comedy|Drama,,,"Aug 30, 2002","Oct 7, 2003",794306.0,97 minutes,,False
1549,1989,Hungarian Rhapsody (Magyar Rapszodia) is the f...,NR,Art House and International|Drama,,,,,14141054.5,101 minutes,,True
1555,1996,Forget terrorists or hijackers -- there's a ha...,R,Action and Adventure|Horror|Mystery and Suspense,,,"Aug 18, 2006","Jan 2, 2007",33886034.0,106 minutes,New Line Cinema,False


In [403]:

# Saving to cleaned database
bmovieinfo_df.to_sql("rt_movie_info", cleaned_conn, if_exists="replace", index=False)

#### 3. RT Reviews

In [404]:
# Loading RT Reviews
breview_df = pd.read_sql('''
    SELECT * FROM rt_reviews;
''', conn) 

breview_df.head(10)

Unnamed: 0,id,review,rating,fresh,critic,top_critic,publisher,date
0,3,A distinctly gallows take on contemporary fina...,3/5,fresh,PJ Nabarro,0,Patrick Nabarro,"November 10, 2018"
1,3,It's an allegory in search of a meaning that n...,,rotten,Annalee Newitz,0,io9.com,"May 23, 2018"
2,3,... life lived in a bubble in financial dealin...,,fresh,Sean Axmaker,0,Stream on Demand,"January 4, 2018"
3,3,Continuing along a line introduced in last yea...,,fresh,Daniel Kasman,0,MUBI,"November 16, 2017"
4,3,... a perverse twist on neorealism...,,fresh,,0,Cinema Scope,"October 12, 2017"
5,3,... Cronenberg's Cosmopolis expresses somethin...,,fresh,Michelle Orange,0,Capital New York,"September 11, 2017"
6,3,"Quickly grows repetitive and tiresome, meander...",C,rotten,Eric D. Snider,0,EricDSnider.com,"July 17, 2013"
7,3,Cronenberg is not a director to be daunted by ...,2/5,rotten,Matt Kelemen,0,Las Vegas CityLife,"April 21, 2013"
8,3,"Cronenberg's cold, exacting precision and emot...",,fresh,Sean Axmaker,0,Parallax View,"March 24, 2013"
9,3,Over and above its topical urgency or the bit ...,,fresh,Kong Rithdee,0,Bangkok Post,"March 4, 2013"


In [405]:
breview_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54423 entries, 0 to 54422
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          54423 non-null  int64 
 1   review      48867 non-null  object
 2   rating      40907 non-null  object
 3   fresh       54423 non-null  object
 4   critic      51710 non-null  object
 5   top_critic  54423 non-null  int64 
 6   publisher   54114 non-null  object
 7   date        54423 non-null  object
dtypes: int64(2), object(6)
memory usage: 3.3+ MB


I'm dropping rows with missing ratings since they are of no use to as

In [406]:
breview_df.dropna(subset=["rating"], inplace=True)

In [407]:
breview_df

Unnamed: 0,id,review,rating,fresh,critic,top_critic,publisher,date
0,3,A distinctly gallows take on contemporary fina...,3/5,fresh,PJ Nabarro,0,Patrick Nabarro,"November 10, 2018"
6,3,"Quickly grows repetitive and tiresome, meander...",C,rotten,Eric D. Snider,0,EricDSnider.com,"July 17, 2013"
7,3,Cronenberg is not a director to be daunted by ...,2/5,rotten,Matt Kelemen,0,Las Vegas CityLife,"April 21, 2013"
11,3,"While not one of Cronenberg's stronger films, ...",B-,fresh,Emanuel Levy,0,EmanuelLevy.Com,"February 3, 2013"
12,3,Robert Pattinson works mighty hard to make Cos...,2/4,rotten,Christian Toto,0,Big Hollywood,"January 15, 2013"
...,...,...,...,...,...,...,...,...
54415,2000,Dawdles and drags when it should pop; it doesn...,1.5/5,rotten,Manohla Dargis,1,Los Angeles Times,"September 26, 2002"
54419,2000,,1/5,rotten,Michael Szymanski,0,Zap2it.com,"September 21, 2005"
54420,2000,,2/5,rotten,Emanuel Levy,0,EmanuelLevy.Com,"July 17, 2005"
54421,2000,,2.5/5,rotten,Christopher Null,0,Filmcritic.com,"September 7, 2003"


In [408]:
breview_df['rating'].unique()

array(['3/5', 'C', '2/5', 'B-', '2/4', 'B', '3/4', '4/5', '4/4', '6/10',
       '1/4', '8', '2.5/4', '4/10', '2.0/5', '3/10', '7/10', 'A-', '5/5',
       'F', '3.5/4', 'D+', '1.5/4', '3.5/5', '8/10', 'B+', '9/10',
       '2.5/5', '7.5/10', '5.5/10', 'C-', '1.5/5', '1/5', '5/10', 'C+',
       '0/5', '6', '0.5/4', 'D', '3.1/5', '3/6', '4.5/5', '0/4', '2/10',
       'D-', '7', '1/10', '3', 'A+', 'A', '4.0/4', '9.5/10', '2.5',
       '2.1/2', '6.5/10', '3.7/5', '8.4/10', '9', '1', '7.2/10', '2.2/5',
       '0.5/10', '5', '0', '2', '4.5', '7.7', '5.0/5', '8.5/10', '3.0/5',
       '0.5/5', '1.5/10', '3.0/4', '2.3/10', '4.5/10', '4/6', '3.5',
       '8.6/10', '6/8', '2.0/4', '2.7', '4.2/10', '5.8', '4', '7.1/10',
       '5/4', 'N', '3.5/10', '5.8/10', 'R', '4.0/5', '0/10', '5.0/10',
       '5.9/10', '2.4/5', '1.9/5', '4.9', '7.4/10', '1.5', '2.3/4',
       '8.8/10', '4.0/10', '2.2', '3.8/10', '6.8/10', '7.3', '7.0/10',
       '3.2', '4.2', '8.4', '5.5/5', '6.3/10', '7.6/10', '8.1/10',
       

In [409]:
# Saving to cleaned database
breview_df.to_sql("rt_reviews", cleaned_conn, if_exists="replace", index=False)

#### 5. TMDB 

In [410]:
# Loading the tmdb database
btmdb_df = pd.read_sql('''
    SELECT * FROM tmdb_movies;
''',conn)

In [411]:
btmdb_df

Unnamed: 0,unnamed:_0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,4,"[28, 878, 12]",27205,en,Inception,27.920,2010-07-16,Inception,8.3,22186
...,...,...,...,...,...,...,...,...,...,...
26512,26512,"[27, 18]",488143,en,Laboratory Conditions,0.600,2018-10-13,Laboratory Conditions,0.0,1
26513,26513,"[18, 53]",485975,en,_EXHIBIT_84xxx_,0.600,2018-05-01,_EXHIBIT_84xxx_,0.0,1
26514,26514,"[14, 28, 12]",381231,en,The Last One,0.600,2018-10-01,The Last One,0.0,1
26515,26515,"[10751, 12, 28]",366854,en,Trailer Made,0.600,2018-06-22,Trailer Made,0.0,1


In [412]:
#Pandas is showing duplicate index columns so we drop one in the following 2 columns
btmdb_df = btmdb_df.rename(columns={"unnamed:_0": "index"})

In [413]:
btmdb_df = btmdb_df.set_index("index")

In [414]:
btmdb_df

Unnamed: 0_level_0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,"[28, 878, 12]",27205,en,Inception,27.920,2010-07-16,Inception,8.3,22186
...,...,...,...,...,...,...,...,...,...
26512,"[27, 18]",488143,en,Laboratory Conditions,0.600,2018-10-13,Laboratory Conditions,0.0,1
26513,"[18, 53]",485975,en,_EXHIBIT_84xxx_,0.600,2018-05-01,_EXHIBIT_84xxx_,0.0,1
26514,"[14, 28, 12]",381231,en,The Last One,0.600,2018-10-01,The Last One,0.0,1
26515,"[10751, 12, 28]",366854,en,Trailer Made,0.600,2018-06-22,Trailer Made,0.0,1


In [415]:
#looking at tmdb metadata
btmdb_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26517 entries, 0 to 26516
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   genre_ids          26517 non-null  object 
 1   id                 26517 non-null  int64  
 2   original_language  26517 non-null  object 
 3   original_title     26517 non-null  object 
 4   popularity         26517 non-null  float64
 5   release_date       26517 non-null  object 
 6   title              26517 non-null  object 
 7   vote_average       26517 non-null  float64
 8   vote_count         26517 non-null  int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 2.0+ MB


No need for cleaning

In [416]:
btmdb_df.describe()

Unnamed: 0,id,popularity,vote_average,vote_count
count,26517.0,26517.0,26517.0,26517.0
mean,295050.15326,3.130912,5.991281,194.224837
std,153661.615648,4.355229,1.852946,960.961095
min,27.0,0.6,0.0,1.0
25%,157851.0,0.6,5.0,2.0
50%,309581.0,1.374,6.0,5.0
75%,419542.0,3.694,7.0,28.0
max,608444.0,80.773,10.0,22186.0


In [417]:
# Saving to cleaned database
btmdb_df.to_sql("tmdb_movies", cleaned_conn, if_exists="replace", index=False)

#### 5. Budgets

In [418]:
# Loading budget
bbudgets_df = pd.read_sql('''
    SELECT * FROM movie_budgets
''',conn)

In [419]:
bbudgets_df

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,425000000.0,760507625.0,2.776345e+09
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,410600000.0,241063875.0,1.045664e+09
2,3,"Jun 7, 2019",Dark Phoenix,350000000.0,42762350.0,1.497624e+08
3,4,"May 1, 2015",Avengers: Age of Ultron,330600000.0,459005868.0,1.403014e+09
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,317000000.0,620181382.0,1.316722e+09
...,...,...,...,...,...,...
5777,78,"Dec 31, 2018",Red 11,7000.0,0.0,0.000000e+00
5778,79,"Apr 2, 1999",Following,6000.0,48482.0,2.404950e+05
5779,80,"Jul 13, 2005",Return to the Land of Wonders,5000.0,1338.0,1.338000e+03
5780,81,"Sep 29, 2015",A Plague So Pleasant,1400.0,0.0,0.000000e+00


In [420]:
#Checking for descriptive stats
bbudgets_df.describe()

Unnamed: 0,id,production_budget,domestic_gross,worldwide_gross
count,5782.0,5782.0,5782.0,5782.0
mean,50.372363,31587760.0,41873330.0,91487460.0
std,28.821076,41812080.0,68240600.0,174720000.0
min,1.0,1100.0,0.0,0.0
25%,25.0,5000000.0,1429534.0,4125415.0
50%,50.0,17000000.0,17225940.0,27984450.0
75%,75.0,40000000.0,52348660.0,97645840.0
max,100.0,425000000.0,936662200.0,2776345000.0


In [421]:
#Checking for null values
bbudgets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5782 non-null   int64  
 1   release_date       5782 non-null   object 
 2   movie              5782 non-null   object 
 3   production_budget  5782 non-null   float64
 4   domestic_gross     5782 non-null   float64
 5   worldwide_gross    5782 non-null   float64
dtypes: float64(3), int64(1), object(2)
memory usage: 271.2+ KB


No need for cleaning

In [422]:
# Saving to cleaned database
bbudgets_df.to_sql("movie_budgets", cleaned_conn, if_exists="replace", index=False)

In [423]:
#Closing database connections
conn.close()
cleaned_conn.close()
im_conn.close()