In [247]:
# Import the required libraries and dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm




from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.decomposition import PCA
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC 
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import balanced_accuracy_score, accuracy_score, classification_report, mean_squared_error, r2_score


In [248]:
# Read movies_metadata CSV file
tmdb_data = pd.read_csv('Resources/TMDB_all_movies.csv')
tmdb_data.head()


Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,budget,imdb_id,...,genres,production_companies,production_countries,spoken_languages,cast,director,director_of_photography,writers,producers,music_composer
0,2,Ariel,7.1,318.0,Released,1988-10-21,0.0,73.0,0.0,tt0094675,...,"Drama, Comedy, Romance, Crime",Villealfa Filmproductions,Finland,suomi,"Kari Helaseppä, Susanna Haavisto, Hannu Kivisa...",Aki Kaurismäki,Timo Salminen,Aki Kaurismäki,Aki Kaurismäki,
1,3,Shadows in Paradise,7.3,360.0,Released,1986-10-17,0.0,74.0,0.0,tt0092149,...,"Drama, Comedy, Romance",Villealfa Filmproductions,Finland,"suomi, English, svenska","Helmeri Pellonpää, Tanja Talaskivi, Esko Nikka...",Aki Kaurismäki,Timo Salminen,Aki Kaurismäki,Mika Kaurismäki,
2,5,Four Rooms,5.8,2578.0,Released,1995-12-09,4257354.0,98.0,4000000.0,tt0113101,...,Comedy,"Miramax, A Band Apart",United States of America,English,"Kimberly Blair, Sammi Davis, Paul Skemp, Bruce...","Alexandre Rockwell, Quentin Tarantino, Allison...","Andrzej Sekula, Rodrigo García, Guillermo Nava...","Alexandre Rockwell, Quentin Tarantino, Allison...","Alexandre Rockwell, Lawrence Bender, Quentin T...",Combustible Edison
3,6,Judgment Night,6.5,323.0,Released,1993-10-15,12136938.0,109.0,21000000.0,tt0107286,...,"Action, Crime, Thriller","Largo Entertainment, JVC, Universal Pictures",United States of America,English,"Doug Wert, Darin Mangan, Stephen Dorff, Nigel ...",Stephen Hopkins,Peter Levy,"Lewis Colick, Jere Cunningham","Gene Levy, Marilyn Vance, Lloyd Segan",Alan Silvestri
4,8,Life in Loops (A Megacities RMX),7.5,27.0,Released,2006-01-01,0.0,80.0,42000.0,tt0825671,...,Documentary,inLoops,Austria,"English, हिन्दी, 日本語, Pусский, Español",,Timo Novotny,Wolfgang Thaler,"Michael Glawogger, Timo Novotny","Ulrich Gehmacher, Timo Novotny",


## Exploratory Data Analysis

In [249]:
# Check for null values by finding the percentage in each column
tmdb_data.isnull().sum()/len(tmdb_data)*100

id                          0.000000
title                       0.000632
vote_average                0.000105
vote_count                  0.000105
status                      0.000105
release_date                9.725145
revenue                     0.000105
runtime                     0.000105
budget                      0.000105
imdb_id                    40.222934
original_language           0.000105
original_title              0.000737
overview                   16.779777
popularity                  0.000105
tagline                    85.191223
genres                     28.776179
production_companies       53.744426
production_countries       38.760508
spoken_languages           38.069947
cast                       32.795166
director                   17.367549
director_of_photography    76.183916
writers                    51.589754
producers                  68.412184
music_composer             90.188285
dtype: float64

In [250]:
# Drop columns irrelevant to the analysis and can be dropped.
tmdb_data.drop(columns=['imdb_id','overview', 'tagline', 'director_of_photography', 'music_composer'],inplace=True)
tmdb_data.isnull().sum()/len(tmdb_data)*100

id                       0.000000
title                    0.000632
vote_average             0.000105
vote_count               0.000105
status                   0.000105
release_date             9.725145
revenue                  0.000105
runtime                  0.000105
budget                   0.000105
original_language        0.000105
original_title           0.000737
popularity               0.000105
genres                  28.776179
production_companies    53.744426
production_countries    38.760508
spoken_languages        38.069947
cast                    32.795166
director                17.367549
writers                 51.589754
producers               68.412184
dtype: float64

In [251]:
tmdb_data['status'].value_counts()

status
Released           934732
In Production        4591
Planned              4546
Post Production      4444
Rumored               782
Canceled              422
Name: count, dtype: int64

In [252]:
tmdb_data['production_countries'].value_counts()


production_countries
United States of America                137747
Japan                                    36076
United Kingdom                           32964
France                                   31543
Germany                                  30371
                                         ...  
Czechoslovakia, Tunisia                      1
France, Sweden, Tunisia                      1
Ireland, Germany, Austria                    1
Egypt, Lebanon, Syrian Arab Republic         1
Costa Rica, Uruguay                          1
Name: count, Length: 10350, dtype: int64

In [253]:
# Filter df to US production companies and movies that have been released
us_tmdb_df = tmdb_data.loc[(tmdb_data['production_countries'] == 'United States of America') & (tmdb_data['status']=='Released')]
us_tmdb_df.isnull().sum()/len(us_tmdb_df)*100


id                       0.000000
title                    0.001485
vote_average             0.000000
vote_count               0.000000
status                   0.000000
release_date             1.390941
revenue                  0.000000
runtime                  0.000000
budget                   0.000000
original_language        0.000000
original_title           0.002228
popularity               0.000000
genres                   8.073847
production_companies    24.727270
production_countries     0.000000
spoken_languages        21.510207
cast                    15.108758
director                 9.362306
writers                 34.216565
producers               49.239178
dtype: float64

In [254]:
us_tmdb_df.shape

(134657, 20)

In [255]:
us_tmdb_df['vote_average'].describe()

count    134657.000000
mean          3.340277
std           3.162888
min           0.000000
25%           0.000000
50%           4.000000
75%           6.000000
max          10.000000
Name: vote_average, dtype: float64

In [256]:
'''
Question: How do we determine if a movie is a critical success?
Answer: Bin 'vote_average' column in the dataset needs based on the following guidelines:
0-2.5 = panned 
2.51-5 = alright
5.1-7.5 = well liked
7.51-10 = critical success
'''

"\nQuestion: How do we determine if a movie is a critical success?\nAnswer: Bin 'vote_average' column in the dataset needs based on the following guidelines:\n0-2.5 = panned \n2.51-5 = alright\n5.1-7.5 = well liked\n7.51-10 = critical success\n"

In [257]:
# Create bins in which to place values based upon 'vote_average' column
bins = [0, 2.5, 5, 7.5, 10]

critical_success = ['panned', 'alright', 'well liked', 'critical success']


In [258]:
# Slice the data and place it into bins
us_tmdb_df['critical_success'] = pd.cut(us_tmdb_df['vote_average'], bins, labels=critical_success, include_lowest=True)
us_tmdb_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  us_tmdb_df['critical_success'] = pd.cut(us_tmdb_df['vote_average'], bins, labels=critical_success, include_lowest=True)


Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,budget,original_language,...,popularity,genres,production_companies,production_countries,spoken_languages,cast,director,writers,producers,critical_success
2,5,Four Rooms,5.8,2578.0,Released,1995-12-09,4257354.0,98.0,4000000.0,en,...,24.747,Comedy,"Miramax, A Band Apart",United States of America,English,"Kimberly Blair, Sammi Davis, Paul Skemp, Bruce...","Alexandre Rockwell, Quentin Tarantino, Allison...","Alexandre Rockwell, Quentin Tarantino, Allison...","Alexandre Rockwell, Lawrence Bender, Quentin T...",well liked
3,6,Judgment Night,6.5,323.0,Released,1993-10-15,12136938.0,109.0,21000000.0,en,...,27.882,"Action, Crime, Thriller","Largo Entertainment, JVC, Universal Pictures",United States of America,English,"Doug Wert, Darin Mangan, Stephen Dorff, Nigel ...",Stephen Hopkins,"Lewis Colick, Jere Cunningham","Gene Levy, Marilyn Vance, Lloyd Segan",well liked
6,11,Star Wars,8.203,20152.0,Released,1977-05-25,775398007.0,121.0,11000000.0,en,...,250.715,"Adventure, Action, Science Fiction","Lucasfilm Ltd., 20th Century Fox",United States of America,English,"Leslie Schofield, Don Henderson, David Prowse,...",George Lucas,George Lucas,"Rick McCallum, George Lucas, Gary Kurtz",critical success
7,12,Finding Nemo,7.821,18826.0,Released,2003-05-30,940335536.0,100.0,94000000.0,en,...,124.166,"Animation, Family",Pixar,United States of America,English,"Alexander Gould, Susan Blu, Michaela Jill Murp...",Andrew Stanton,"Bob Peterson, Will Csaklos, Andrew Stanton, Ji...","John Lasseter, Graham Walters",critical success
8,13,Forrest Gump,8.475,26790.0,Released,1994-06-23,677387716.0,142.0,55000000.0,en,...,127.156,"Comedy, Drama, Romance","Paramount Pictures, The Steve Tisch Company, W...",United States of America,English,"Angela Lomas, Geoffrey Blake, Zach Hanner, Mic...",Robert Zemeckis,"Winston Groom, Eric Roth","Steve Tisch, Steve Starkey, Wendy Finerman",critical success


In [259]:
us_tmdb_df.shape

(134657, 21)

In [260]:
'''
Question: How do we determine if a movie is a financial success?
Answer: Use ROI (Return on Investment) to determine if a movie is a financial success. 
ROI is calculated as follows: ROI = ((revenue-budget)/budget) * 100? 
Create 'roi' columun based on calculation and bin values based on the following guidelines:
<0% = failure
0% = broke even
0%<>50% = modest returns
50%<>100% = moderate returns
100%<>500% = excellent returns
>500% = extraordinary returns
'''

"\nQuestion: How do we determine if a movie is a financial success?\nAnswer: Use ROI (Return on Investment) to determine if a movie is a financial success. \nROI is calculated as follows: ROI = ((revenue-budget)/budget) * 100? \nCreate 'roi' columun based on calculation and bin values based on the following guidelines:\n<0% = failure\n0% = broke even\n0%<>50% = modest returns\n50%<>100% = moderate returns\n100%<>500% = excellent returns\n>500% = extraordinary returns\n"

In [261]:

# Determine the dimensions of the dataframe when the budget is 0
# us_tmdb_df[us_tmdb_df['budget'] == 0]
us_tmdb_df.loc[us_tmdb_df['budget'] == 0]


Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,budget,original_language,...,popularity,genres,production_companies,production_countries,spoken_languages,cast,director,writers,producers,critical_success
16,21,The Endless Summer,7.2,109.0,Released,1966-06-15,0.0,91.0,0.0,en,...,7.374,Documentary,Bruce Brown Films,United States of America,English,"Chuck Gardner, Wayne Miyata, Robert August, Lo...",Bruce Brown,Bruce Brown,"Robert Bagley, Bruce Brown",well liked
94,132,Gimme Shelter,7.4,159.0,Released,1970-12-13,252570.0,92.0,0.0,en,...,20.296,"Documentary, Music",Maysles Films,United States of America,English,"Jorma Kaukonen, Rock Scully, Marty Balin, Keit...","Albert Maysles, Charlotte Zwerin, David Maysles",,Ronald Schneider,well liked
95,133,Primary,6.5,30.0,Released,1960-11-08,0.0,53.0,0.0,en,...,3.257,Documentary,Drew Associates,United States of America,English,"John F. Kennedy, Robert Drew, Pierre Salinger,...",Robert Drew,Robert Drew,Robert Drew,well liked
97,135,Don't Look Back,7.5,141.0,Released,1967-05-17,0.0,96.0,0.0,en,...,13.256,"Documentary, Music",Leacock-Pennebaker,United States of America,English,"Tito Burns, Albert Grossman, Derroll Adams, Al...",D. A. Pennebaker,D. A. Pennebaker,"John Court, Albert Grossman",well liked
154,195,Trouble in Paradise,7.4,267.0,Released,1932-10-30,0.0,83.0,0.0,en,...,9.003,"Comedy, Crime, Romance",Paramount,United States of America,"Deutsch, English, Español, Italiano, Pусский","George Humbert, Tyler Brooke, Marion Byron, St...",Ernst Lubitsch,"Samson Raphaelson, Aladar Laszlo, Grover Jones...",Ernst Lubitsch,well liked
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
949411,1308786,The Seagull on the Sims 4,0.0,0.0,Released,2020-10-27,0.0,318.0,0.0,en,...,0.000,"Drama, Animation",New York Theatre Workshop,United States of America,,"Jiehae Park, Celine Song, Jim Nicola, C.A. Joh...",Celine Song,"Anton Chekhov, Celine Song",,panned
949428,1308804,Two Cops,0.0,0.0,Released,2019-09-13,0.0,30.0,0.0,en,...,0.000,,,United States of America,English,"Eric Cohen, Steve Cohen",Tova Mozard,,,panned
949462,1308839,Computer love,0.0,0.0,Released,2022-06-02,0.0,0.0,0.0,en,...,0.000,"Comedy, Romance, Science Fiction",,United States of America,English,"Ryan Scott Morris, Anne Stedman, Ali Gallo, Sh...",Joseph Picozzi,Joseph Picozzi,Joseph Picozzi,panned
949463,1308840,Power Slap 8: Da Crazy Hawaiian vs. Van Heerden,0.0,0.0,Released,2024-06-28,0.0,180.0,0.0,en,...,0.000,Action,,United States of America,English,"Dan Hellie, Charly Arnolt, Justin Bernard, Mic...",,,,panned


In [262]:
# Filter through budget columns to select rows with budget not equal to 0
# us_tmdb_df = us_tmdb_df.loc[us_tmdb_df['budget'] != 0].copy()

us_tmdb_df = us_tmdb_df[us_tmdb_df['budget'] != 0].copy()

# Check bugdet column for 0 values
us_tmdb_df['budget'].loc[us_tmdb_df['budget'] == 0]

us_tmdb_df.shape

(15728, 21)

In [263]:
# Calculate ROI and as metric for financial success based on revenue and budget
us_tmdb_df['roi'] = ((us_tmdb_df['revenue'] - us_tmdb_df['budget'])/us_tmdb_df['budget'])*100

us_tmdb_df.head()

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,budget,original_language,...,genres,production_companies,production_countries,spoken_languages,cast,director,writers,producers,critical_success,roi
2,5,Four Rooms,5.8,2578.0,Released,1995-12-09,4257354.0,98.0,4000000.0,en,...,Comedy,"Miramax, A Band Apart",United States of America,English,"Kimberly Blair, Sammi Davis, Paul Skemp, Bruce...","Alexandre Rockwell, Quentin Tarantino, Allison...","Alexandre Rockwell, Quentin Tarantino, Allison...","Alexandre Rockwell, Lawrence Bender, Quentin T...",well liked,6.43385
3,6,Judgment Night,6.5,323.0,Released,1993-10-15,12136938.0,109.0,21000000.0,en,...,"Action, Crime, Thriller","Largo Entertainment, JVC, Universal Pictures",United States of America,English,"Doug Wert, Darin Mangan, Stephen Dorff, Nigel ...",Stephen Hopkins,"Lewis Colick, Jere Cunningham","Gene Levy, Marilyn Vance, Lloyd Segan",well liked,-42.205057
6,11,Star Wars,8.203,20152.0,Released,1977-05-25,775398007.0,121.0,11000000.0,en,...,"Adventure, Action, Science Fiction","Lucasfilm Ltd., 20th Century Fox",United States of America,English,"Leslie Schofield, Don Henderson, David Prowse,...",George Lucas,George Lucas,"Rick McCallum, George Lucas, Gary Kurtz",critical success,6949.072791
7,12,Finding Nemo,7.821,18826.0,Released,2003-05-30,940335536.0,100.0,94000000.0,en,...,"Animation, Family",Pixar,United States of America,English,"Alexander Gould, Susan Blu, Michaela Jill Murp...",Andrew Stanton,"Bob Peterson, Will Csaklos, Andrew Stanton, Ji...","John Lasseter, Graham Walters",critical success,900.356953
8,13,Forrest Gump,8.475,26790.0,Released,1994-06-23,677387716.0,142.0,55000000.0,en,...,"Comedy, Drama, Romance","Paramount Pictures, The Steve Tisch Company, W...",United States of America,English,"Angela Lomas, Geoffrey Blake, Zach Hanner, Mic...",Robert Zemeckis,"Winston Groom, Eric Roth","Steve Tisch, Steve Starkey, Wendy Finerman",critical success,1131.614029


In [264]:
us_tmdb_df['roi'].describe()

count    1.572800e+04
mean     2.092805e+05
std      1.786877e+07
min     -2.000000e+02
25%     -1.000000e+02
50%     -1.000000e+02
75%      0.000000e+00
max      2.000000e+09
Name: roi, dtype: float64

In [265]:
us_tmdb_df.loc[us_tmdb_df['roi'] == 0]

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,budget,original_language,...,genres,production_companies,production_countries,spoken_languages,cast,director,writers,producers,critical_success,roi
3677,6883,Two-Minute Warning,6.100,84.0,Released,1976-11-12,6700000.0,115.0,6700000.0,en,...,"Action, Thriller","Universal Pictures, Filmways Pictures",United States of America,English,"John Stuart West, William Bryant, Andy Sidaris...",Larry Peerce,"George La Fountaine Sr., Edward Hume",Edward S. Feldman,well liked,0.0
4605,9409,Another Stakeout,5.500,209.0,Released,1993-07-22,30000000.0,105.0,30000000.0,en,...,"Comedy, Crime, Action",Touchstone Pictures,United States of America,English,"Cathy Moriarty, Denalda Williams, Jan Speck, R...",John Badham,"Jim Kouf, Lynn Kouf","Cathleen Summers, John Badham, Jim Kouf, Lynn ...",well liked,0.0
12021,19933,The Brave Little Toaster,6.900,415.0,Released,1987-07-09,2300000.0,90.0,2300000.0,en,...,"Animation, Adventure, Family, Fantasy","The Kushner-Locke Company, Hyperion Pictures, ...",United States of America,"Polski, Español, English","Joe Ranft, Jon Lovitz, Jonathan Benair, Wayne ...",Jerry Rees,"Jerry Rees, Joe Ranft, Thomas M. Disch, Brian ...","Willard Carroll, Thomas L. Wilhite, Peter Lock...",well liked,0.0
13651,22887,Loose Change: Final Cut,6.313,40.0,Released,2007-11-11,6000.0,129.0,6000.0,en,...,Documentary,Louder Than Words,United States of America,English,Richard Shelby,Dylan Avery,Dylan Avery,,well liked,0.0
21113,33689,It Takes Two,6.500,858.0,Released,1995-11-17,19474589.0,101.0,19474589.0,en,...,"Comedy, Family, Romance","Dualstar Productions, Orr & Cruickshank, Ryshe...",United States of America,English,"Michèle Lonsdale Smith, Elizabeth Walsh, Kirst...",Andy Tennant,Deborah Dean Davis,"Mel Efros, James Orr, Jim Cruickshank, Keith S...",well liked,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
908665,1262469,Opportunities,10.000,1.0,Released,2023-05-20,444.0,12.0,444.0,en,...,"Adventure, Drama, Thriller, Animation",Pink Moose Productions,United States of America,English,"Kevin Glass, Josh Buraczewski, Ross Moshinsky,...",Matthew Francis Hess,"Wes Peppers, Corey Dowd, Matthew Francis Hess","Josh Triezenberg, Matthew Francis Hess, Laura ...",critical success,0.0
911082,1265226,The Bill Land Dance Off,0.000,0.0,Released,2024-06-01,1.0,10.0,1.0,en,...,Drama,BAEBUH,United States of America,English,"Jate, Tim, Mita, Rizzy, Nit",Nit,Rizzy,Tim,panned,0.0
921973,1277446,The Wake of Flip Fitch,0.000,0.0,Released,2023-12-08,200.0,50.0,200.0,en,...,"Music, Horror",Ogus Entertainment,United States of America,,"Feck Speiderbeck, Flip Fitch, Mateo Molina, DJ...","Feck Speiderbeck, Flip Fitch",,,panned,0.0
940159,1298360,Hit And Run,0.000,0.0,Released,2024-05-31,1.0,1.0,1.0,en,...,"Drama, Action",BlazeProductions,United States of America,English,"Brodie, Wall, Hat, Adam Toland",Adam Toland,Adam Toland,Adam Toland,panned,0.0


In [266]:
# Create bins in which to place values based upon roi column
bins = [-float('inf'), 0, 50, 100, 500, float('inf')]

financial_success = ['failure', 'modest returns', 
                    'moderate returns', 'excellent returns', 'extraordinary returns']


In [267]:
# Slice the data and place it into bins
us_tmdb_df['financial_success'] = pd.cut(us_tmdb_df['roi'], bins, labels=financial_success, include_lowest=True)

# Add the new category 'broke even' to the financial_success column
us_tmdb_df['financial_success'] = us_tmdb_df['financial_success'].cat.add_categories('broke even')

# Handle the special case where roi equals 0
us_tmdb_df.loc[us_tmdb_df['roi'] == 0, 'financial_success'] = 'broke even'


In [268]:
# Check to ensure when roi is 0, financial_success is 'broke even'
us_tmdb_df.loc[us_tmdb_df['roi'] == 0]

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,budget,original_language,...,production_companies,production_countries,spoken_languages,cast,director,writers,producers,critical_success,roi,financial_success
3677,6883,Two-Minute Warning,6.100,84.0,Released,1976-11-12,6700000.0,115.0,6700000.0,en,...,"Universal Pictures, Filmways Pictures",United States of America,English,"John Stuart West, William Bryant, Andy Sidaris...",Larry Peerce,"George La Fountaine Sr., Edward Hume",Edward S. Feldman,well liked,0.0,broke even
4605,9409,Another Stakeout,5.500,209.0,Released,1993-07-22,30000000.0,105.0,30000000.0,en,...,Touchstone Pictures,United States of America,English,"Cathy Moriarty, Denalda Williams, Jan Speck, R...",John Badham,"Jim Kouf, Lynn Kouf","Cathleen Summers, John Badham, Jim Kouf, Lynn ...",well liked,0.0,broke even
12021,19933,The Brave Little Toaster,6.900,415.0,Released,1987-07-09,2300000.0,90.0,2300000.0,en,...,"The Kushner-Locke Company, Hyperion Pictures, ...",United States of America,"Polski, Español, English","Joe Ranft, Jon Lovitz, Jonathan Benair, Wayne ...",Jerry Rees,"Jerry Rees, Joe Ranft, Thomas M. Disch, Brian ...","Willard Carroll, Thomas L. Wilhite, Peter Lock...",well liked,0.0,broke even
13651,22887,Loose Change: Final Cut,6.313,40.0,Released,2007-11-11,6000.0,129.0,6000.0,en,...,Louder Than Words,United States of America,English,Richard Shelby,Dylan Avery,Dylan Avery,,well liked,0.0,broke even
21113,33689,It Takes Two,6.500,858.0,Released,1995-11-17,19474589.0,101.0,19474589.0,en,...,"Dualstar Productions, Orr & Cruickshank, Ryshe...",United States of America,English,"Michèle Lonsdale Smith, Elizabeth Walsh, Kirst...",Andy Tennant,Deborah Dean Davis,"Mel Efros, James Orr, Jim Cruickshank, Keith S...",well liked,0.0,broke even
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
908665,1262469,Opportunities,10.000,1.0,Released,2023-05-20,444.0,12.0,444.0,en,...,Pink Moose Productions,United States of America,English,"Kevin Glass, Josh Buraczewski, Ross Moshinsky,...",Matthew Francis Hess,"Wes Peppers, Corey Dowd, Matthew Francis Hess","Josh Triezenberg, Matthew Francis Hess, Laura ...",critical success,0.0,broke even
911082,1265226,The Bill Land Dance Off,0.000,0.0,Released,2024-06-01,1.0,10.0,1.0,en,...,BAEBUH,United States of America,English,"Jate, Tim, Mita, Rizzy, Nit",Nit,Rizzy,Tim,panned,0.0,broke even
921973,1277446,The Wake of Flip Fitch,0.000,0.0,Released,2023-12-08,200.0,50.0,200.0,en,...,Ogus Entertainment,United States of America,,"Feck Speiderbeck, Flip Fitch, Mateo Molina, DJ...","Feck Speiderbeck, Flip Fitch",,,panned,0.0,broke even
940159,1298360,Hit And Run,0.000,0.0,Released,2024-05-31,1.0,1.0,1.0,en,...,BlazeProductions,United States of America,English,"Brodie, Wall, Hat, Adam Toland",Adam Toland,Adam Toland,Adam Toland,panned,0.0,broke even


In [269]:
# Convert 'release_date' column to datetime
us_tmdb_df['release_date'] = pd.to_datetime(us_tmdb_df['release_date'], format='%Y-%m-%d', errors='coerce')


us_tmdb_df.dtypes

id                               int64
title                           object
vote_average                   float64
vote_count                     float64
status                          object
release_date            datetime64[ns]
revenue                        float64
runtime                        float64
budget                         float64
original_language               object
original_title                  object
popularity                     float64
genres                          object
production_companies            object
production_countries            object
spoken_languages                object
cast                            object
director                        object
writers                         object
producers                       object
critical_success              category
roi                            float64
financial_success             category
dtype: object

In [270]:
us_tmdb_df.shape

(15728, 23)

In [271]:
# Check the release_date column for NaT (Not a Time) values 
us_tmdb_df['release_date'].isna().sum()

# us_tmdb_df[us_tmdb_df['release_date'].isna()]

# us_tmdb_df['release_date'].isna().sum()/len(us_tmdb_df)*100


365

In [272]:
# Drop rows where 'release_date' is NaT
us_tmdb_df.dropna(subset=['release_date'], inplace=True)

In [273]:
# Check the release_date column for NaT (Not a Time) values 
us_tmdb_df['release_date'].isna().sum()
# us_tmdb_df.shape

0

In [274]:
# Extract year, month, and day from 'release_date' column
us_tmdb_df['released_year'] = us_tmdb_df['release_date'].dt.year
us_tmdb_df['released_month'] = us_tmdb_df['release_date'].dt.month
us_tmdb_df['released_day'] = us_tmdb_df['release_date'].dt.day
us_tmdb_df.head()

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,budget,original_language,...,cast,director,writers,producers,critical_success,roi,financial_success,released_year,released_month,released_day
2,5,Four Rooms,5.8,2578.0,Released,1995-12-09,4257354.0,98.0,4000000.0,en,...,"Kimberly Blair, Sammi Davis, Paul Skemp, Bruce...","Alexandre Rockwell, Quentin Tarantino, Allison...","Alexandre Rockwell, Quentin Tarantino, Allison...","Alexandre Rockwell, Lawrence Bender, Quentin T...",well liked,6.43385,modest returns,1995,12,9
3,6,Judgment Night,6.5,323.0,Released,1993-10-15,12136938.0,109.0,21000000.0,en,...,"Doug Wert, Darin Mangan, Stephen Dorff, Nigel ...",Stephen Hopkins,"Lewis Colick, Jere Cunningham","Gene Levy, Marilyn Vance, Lloyd Segan",well liked,-42.205057,failure,1993,10,15
6,11,Star Wars,8.203,20152.0,Released,1977-05-25,775398007.0,121.0,11000000.0,en,...,"Leslie Schofield, Don Henderson, David Prowse,...",George Lucas,George Lucas,"Rick McCallum, George Lucas, Gary Kurtz",critical success,6949.072791,extraordinary returns,1977,5,25
7,12,Finding Nemo,7.821,18826.0,Released,2003-05-30,940335536.0,100.0,94000000.0,en,...,"Alexander Gould, Susan Blu, Michaela Jill Murp...",Andrew Stanton,"Bob Peterson, Will Csaklos, Andrew Stanton, Ji...","John Lasseter, Graham Walters",critical success,900.356953,extraordinary returns,2003,5,30
8,13,Forrest Gump,8.475,26790.0,Released,1994-06-23,677387716.0,142.0,55000000.0,en,...,"Angela Lomas, Geoffrey Blake, Zach Hanner, Mic...",Robert Zemeckis,"Winston Groom, Eric Roth","Steve Tisch, Steve Starkey, Wendy Finerman",critical success,1131.614029,extraordinary returns,1994,6,23


In [275]:
# Convert the released_year, released_month, and released_day columns to integers. 
us_tmdb_df['released_year'] = us_tmdb_df['released_year'].astype(int)
us_tmdb_df['released_month'] = us_tmdb_df['released_month'].astype(int)
us_tmdb_df['released_day'] = us_tmdb_df['released_day'].astype(int)
us_tmdb_df.head()

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,budget,original_language,...,cast,director,writers,producers,critical_success,roi,financial_success,released_year,released_month,released_day
2,5,Four Rooms,5.8,2578.0,Released,1995-12-09,4257354.0,98.0,4000000.0,en,...,"Kimberly Blair, Sammi Davis, Paul Skemp, Bruce...","Alexandre Rockwell, Quentin Tarantino, Allison...","Alexandre Rockwell, Quentin Tarantino, Allison...","Alexandre Rockwell, Lawrence Bender, Quentin T...",well liked,6.43385,modest returns,1995,12,9
3,6,Judgment Night,6.5,323.0,Released,1993-10-15,12136938.0,109.0,21000000.0,en,...,"Doug Wert, Darin Mangan, Stephen Dorff, Nigel ...",Stephen Hopkins,"Lewis Colick, Jere Cunningham","Gene Levy, Marilyn Vance, Lloyd Segan",well liked,-42.205057,failure,1993,10,15
6,11,Star Wars,8.203,20152.0,Released,1977-05-25,775398007.0,121.0,11000000.0,en,...,"Leslie Schofield, Don Henderson, David Prowse,...",George Lucas,George Lucas,"Rick McCallum, George Lucas, Gary Kurtz",critical success,6949.072791,extraordinary returns,1977,5,25
7,12,Finding Nemo,7.821,18826.0,Released,2003-05-30,940335536.0,100.0,94000000.0,en,...,"Alexander Gould, Susan Blu, Michaela Jill Murp...",Andrew Stanton,"Bob Peterson, Will Csaklos, Andrew Stanton, Ji...","John Lasseter, Graham Walters",critical success,900.356953,extraordinary returns,2003,5,30
8,13,Forrest Gump,8.475,26790.0,Released,1994-06-23,677387716.0,142.0,55000000.0,en,...,"Angela Lomas, Geoffrey Blake, Zach Hanner, Mic...",Robert Zemeckis,"Winston Groom, Eric Roth","Steve Tisch, Steve Starkey, Wendy Finerman",critical success,1131.614029,extraordinary returns,1994,6,23


In [276]:
us_tmdb_df['genres'].value_counts()

genres
Drama                               1343
Comedy                               994
Horror                               986
Documentary                          796
Comedy, Drama                        377
                                    ... 
Documentary, Music, TV Movie           1
Thriller, Drama, Crime, TV Movie       1
Action, Drama, Horror, War             1
TV Movie, Comedy, Drama, Romance       1
Horror, Romance, Thriller, Music       1
Name: count, Length: 2208, dtype: int64

In [277]:
# Split the genre strings into lists of individual genres
us_tmdb_df = us_tmdb_df.copy()
us_tmdb_df['genres'] = us_tmdb_df['genres'].str.split(',')

# Seperate rows with a lists of genres into individual rows
us_tmdb_df = us_tmdb_df.explode('genres')

# Strip white spaces from the genre column
us_tmdb_df['genres'] = us_tmdb_df['genres'].str.strip()

# Recount the occurrences of each genre
us_tmdb_df['genres'].value_counts()

genres
Drama              5620
Comedy             4736
Horror             3340
Thriller           2868
Action             2263
Romance            1868
Crime              1549
Science Fiction    1513
Adventure          1314
Documentary        1191
Family              954
Mystery             953
Fantasy             944
Animation           647
Music               607
History             336
War                 297
Western             291
TV Movie            208
Name: count, dtype: int64

In [278]:
# Identify duplicate rows in a DataFrame across all or specific columns.
# us_tmdb_duplicate = us_tmdb_df[us_tmdb_df.duplicated(subset=['title'], keep=False)]
us_tmdb_duplicate = us_tmdb_df[us_tmdb_df.duplicated(subset=['id'], keep=False)]
us_tmdb_duplicate

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,budget,original_language,...,cast,director,writers,producers,critical_success,roi,financial_success,released_year,released_month,released_day
3,6,Judgment Night,6.500,323.0,Released,1993-10-15,12136938.0,109.0,21000000.0,en,...,"Doug Wert, Darin Mangan, Stephen Dorff, Nigel ...",Stephen Hopkins,"Lewis Colick, Jere Cunningham","Gene Levy, Marilyn Vance, Lloyd Segan",well liked,-42.205057,failure,1993,10,15
3,6,Judgment Night,6.500,323.0,Released,1993-10-15,12136938.0,109.0,21000000.0,en,...,"Doug Wert, Darin Mangan, Stephen Dorff, Nigel ...",Stephen Hopkins,"Lewis Colick, Jere Cunningham","Gene Levy, Marilyn Vance, Lloyd Segan",well liked,-42.205057,failure,1993,10,15
3,6,Judgment Night,6.500,323.0,Released,1993-10-15,12136938.0,109.0,21000000.0,en,...,"Doug Wert, Darin Mangan, Stephen Dorff, Nigel ...",Stephen Hopkins,"Lewis Colick, Jere Cunningham","Gene Levy, Marilyn Vance, Lloyd Segan",well liked,-42.205057,failure,1993,10,15
6,11,Star Wars,8.203,20152.0,Released,1977-05-25,775398007.0,121.0,11000000.0,en,...,"Leslie Schofield, Don Henderson, David Prowse,...",George Lucas,George Lucas,"Rick McCallum, George Lucas, Gary Kurtz",critical success,6949.072791,extraordinary returns,1977,5,25
6,11,Star Wars,8.203,20152.0,Released,1977-05-25,775398007.0,121.0,11000000.0,en,...,"Leslie Schofield, Don Henderson, David Prowse,...",George Lucas,George Lucas,"Rick McCallum, George Lucas, Gary Kurtz",critical success,6949.072791,extraordinary returns,1977,5,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
948247,1307509,Doc Ock,0.000,0.0,Released,2024-06-27,0.0,14.0,100.0,en,...,,Peter Dudka,,,panned,-100.000000,failure,2024,6,27
948247,1307509,Doc Ock,0.000,0.0,Released,2024-06-27,0.0,14.0,100.0,en,...,,Peter Dudka,,,panned,-100.000000,failure,2024,6,27
948247,1307509,Doc Ock,0.000,0.0,Released,2024-06-27,0.0,14.0,100.0,en,...,,Peter Dudka,,,panned,-100.000000,failure,2024,6,27
948302,1307570,12 oz. Mouse (DVD MOVIECUT),0.000,0.0,Released,2008-02-29,0.0,208.0,100.0,en,...,"Matt Maiellaro, Kurt Soccolich, Matt Harrigan,...",Matt Maiellaro,,,panned,-100.000000,failure,2008,2,29


In [280]:
# Remove duplicates based on "id" column (the unique identifier for each movie)
us_tmdb_df = us_tmdb_df.drop_duplicates(subset=['id'], keep='first')
us_tmdb_df.shape

(15363, 26)

In [281]:
# Verify whether there are still duplicate rows in a DataFrame based on the "id" column
us_tmdb_duplicate = us_tmdb_df[us_tmdb_df.duplicated(subset=['id'], keep=False)]
us_tmdb_duplicate

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,budget,original_language,...,cast,director,writers,producers,critical_success,roi,financial_success,released_year,released_month,released_day


In [None]:
# # Visualize and plot replationships between vote_average and revenue
# us_tmdb_df.plot.scatter(x='vote_average', y='revenue', title='Vote Average vs Revenue')

# # Visualize and plot replationships between vote_count and revenue
# us_tmdb_df.plot.scatter(x='vote_count', y='revenue', title='Vote Count vs Revenue')

# # Visualize and plot replationships between popularity and revenue
# us_tmdb_df.plot.scatter(x='popularity', y='revenue', title='Popularity vs Revenue')

# # Visualize and plot replationships between runtime and revenue
# us_tmdb_df.plot.scatter(x='runtime', y='revenue', title='Runtime vs Revenue')

# # Visualize and plot replationships between budget and revenue
# us_tmdb_df.plot.scatter(x='budget', y='revenue', title='Budget vs Revenue')

In [None]:
# Determine outliers
# Separate cast and crew into individual columns
# Convert the release_date column to datetime and extract features from the date
# 
# Groupby and aggregate genre & directior for further data analysis
# String Extraction for the cast column

In [None]:
# # Use a function to create a 'profitablity encoded' column to categorize the profitability of the movies
# def encode_profitability(profit):
#     if profit <= 0:
#         return 0
#     else:
#         return 1
    
# us_tmdb_df['profitability_encoded'] = us_tmdb_df['profitability'].apply(encode_profitability)


## Splitting Data

In [None]:
# Creat a list of features to include in the model:
features = ['vote_average','vote_count','budget']

X = us_tmdb_df[features]
X.head()

In [None]:
# Set the target variable y
y = us_tmdb_df['revenue']

In [None]:
# Split the data into training and testing sets again
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Modeling

In [None]:
# Create the model
lr_model = LinearRegression()

# Fit the model to the training data. 
lr_model.fit(X_train, y_train)

In [None]:
# Calculate the mean_squared_error and the r-squared value for the testing data
# Use lr_model to predict the y values
predictions = lr_model.predict(X_test)

# Score the predictions with mse and r2
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2 ): {r2}")

In [None]:
# Provided code to create the adjusted r-squared function
def r2_adj(x, y, model):
    r2 = model.score(x,y)
    n_cols = x.shape[1]
    return 1 - (1 - r2) * (len(y) - 1) / (len(y) - n_cols - 1)

In [None]:
# Calculate the adjusted r-squared value of the model
adj_score = r2_adj(X_test, y_test, lr_model)

print(f"Adjusted R2: {adj_score}")


In [None]:
# Use the statsmodels package to create and fit a linear regression
lr_sm = sm.OLS(y_train, X_train).fit()

# Show the p-values of all columns sorted in ascending order
lr_sm.pvalues.sort_values()

In [None]:
# # Create a function to calculate VIF

# def calc_vif(X):
#     vif = pd.DataFrame()
#     vif["variables"] = X.columns
#     vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
#     return(vif)

# # Calculate vif for the dataframe
# calc_vif(X_train).sort_values(by='VIF',ascending=False)

## Standardization

## Encoding

## Imputation

## Data Engineering

## Feature Selection

## Tuning & Sampling