In [1]:
# Load data and necessary packages
import numpy as np
import pandas as pd
import pandas_profiling
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import sklearn
from sklearn.neural_network import MLPClassifier

# Movies are Like a Box of Chocolates

In this exercise, I want to see what, if any, characteristics make for a successful movie. Of course, the term success can be subjective. For this study, we will arbitrarily consider any movie that makes at least enough money to recoup the cost of making the movie a success.

In [2]:
# import data
df = pd.read_csv('../../../Data/movie_metadata.csv')
df.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [3]:
# shape of df
df.shape

(5043, 28)

In [4]:
# drop null values
df = df.dropna()
df = df.reset_index(drop=True)

In [23]:
# new shape
df.shape

(3756, 27)

In [6]:
# columns
df.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [7]:
# let's add a column for net profit
df['net_profit'] = df['gross'] - df['budget']

In [8]:
df.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,net_profit
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,523505847.0
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,9404152.0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,-44925825.0
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000,198130642.0
4,Color,Andrew Stanton,462.0,132.0,475.0,530.0,Samantha Morton,640.0,73058679.0,Action|Adventure|Sci-Fi,...,English,USA,PG-13,263700000.0,2012.0,632.0,6.6,2.35,24000,-190641321.0


In [9]:
pandas_profiling.ProfileReport(df)



In [10]:
# aggregating net_profits by director
directors = df.groupby(['director_name'], as_index=False)['gross', 'budget', 'net_profit'].sum()

# changing display options to standardize format for ease of reading
pd.set_option('display.float_format', lambda x: '%0.0f' % x)
#print(directors)

In [11]:
# create new dataframe for ease of use when plotting
df_directors = pd.DataFrame(directors)
df_directors.shape

(1659, 4)

In [12]:
# That's a lot of directors. Let's see the top 10.
df_directors.sort_values(by='net_profit', ascending=False).head(10)

Unnamed: 0,director_name,gross,budget,net_profit
1492,Steven Spielberg,4114233101,1627900870,2486332231
518,George Lucas,1741418480,354777000,1386641480
620,James Cameron,1948125910,748500000,1199625910
839,Joss Whedon,1730886628,730000000,1000886628
222,Chris Columbus,1618707624,677000000,941707624
1536,Tim Burton,2071275480,1247000000,824275480
251,Christopher Nolan,1813227576,1005000000,808227576
1229,Peter Jackson,2289968050,1512000000,777968050
790,Jon Favreau,1562381547,793000000,769381547
463,Francis Lawrence,1358501971,603000000,755501971


Well that was to be expected - lots of recognizable names. Now let's see the bottom 10 just to see if there are any recognizable names there.

In [13]:
df_directors.sort_values(by='net_profit', ascending=True).head(10)

Unnamed: 0,director_name,gross,budget,net_profit
817,Joon-ho Bong,6764441,12254700000,-12247935559
207,Chan-wook Park,4095234,4215000000,-4210904766
865,Katsuhiro Ôtomo,849550,3227519898,-3226670348
909,Lajos Koltai,195888,2500000000,-2499804112
573,Hayao Miyazaki,32140315,2477000000,-2444859685
1511,Takao Okawara,10037390,1000000000,-989962610
856,Karan Johar,7294138,712000000,-704705862
197,Carlos Saura,1687311,700000000,-698312689
785,John Woo,493401036,983632000,-490230964
217,Chatrichalerm Yukol,454255,400000000,-399545745


Hmmmm... very suspicious. Chan-wook Park is a very successful Korean director and it doesn't seem possible to me that his budget would be that astronomical. I want to take a closer look at his movies.

In [14]:
cwpark = df[df['director_name'] == 'Chan-wook Park']
cwpark

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,net_profit
2686,Color,Chan-wook Park,469,99,0,520,Alden Ehrenreich,3000,1702277,Drama|Thriller,...,English,UK,R,12000000,2013,1000,7,2,27000,-10297723
3259,Color,Chan-wook Park,202,112,0,38,Yeong-ae Lee,717,211667,Crime|Drama,...,Korean,South Korea,R,4200000000,2005,126,8,2,4000,-4199788333
3390,Color,Chan-wook Park,305,120,0,38,Ji-tae Yu,717,2181290,Drama|Mystery|Thriller,...,Korean,South Korea,R,3000000,2003,78,8,2,43000,-818710


Wut? A korean movie made in 2005 had a budget of $4.2 billion??? If that's true, this in itself would have been news. I can't help but dig deeper.

In [15]:
df.iloc[3259]

color                                                                    Color
director_name                                                   Chan-wook Park
num_critic_for_reviews                                                     202
duration                                                                   112
director_facebook_likes                                                      0
actor_3_facebook_likes                                                      38
actor_2_name                                                      Yeong-ae Lee
actor_1_facebook_likes                                                     717
gross                                                                   211667
genres                                                             Crime|Drama
actor_1_name                                                      Min-sik Choi
movie_title                                                    Lady Vengeance 
num_voted_users                                     

Welp, there's the problem. The budget is in KRW, but gross is in American Dollars. This is a real problem that puts the reliability of this dataset into question. I have two choices: I can convert all budgets to dollars which would give me a more complete picture but at the cost of time and effort, and there is also no guarantee that the budget was consistently listed by local currency. Or, I can drop the budget column altogether which would give save me a bunch of time but end up with a slightly occluded picture. I am going to choose to save time for this practice. Back to square one!

In [64]:
# re-import data
df = pd.read_csv('../../../Data/movie_metadata.csv')
df.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0,855.0,Joel David Moore,1000,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936,8,2.0,33000
1,Color,Gore Verbinski,302.0,169.0,563,1000.0,Orlando Bloom,40000,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000,7,2.0,0
2,Color,Sam Mendes,602.0,148.0,0,161.0,Rory Kinnear,11000,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393,7,2.0,85000
3,Color,Christopher Nolan,813.0,164.0,22000,23000.0,Christian Bale,27000,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000,8,2.0,164000
4,,Doug Walker,,,131,,Rob Walker,131,,Documentary,...,,,,,,,12,7,,0


In [65]:
# drop null values
df = df.dropna()
df = df.reset_index(drop=True)
df.shape

(3756, 28)

In [66]:
# drop budget column as well as any unnecessary columns
df = df.drop(columns=['budget', 'actor_1_name', 'actor_2_name', 'actor_3_name', 'plot_keywords', 'movie_title',
                      'movie_imdb_link', 'country', 'genres', 'language']) #ideally genre would need to be separated and categorized
                                                            
df.shape

(3756, 18)

Let's see if the top ten lists change siginificantly. My guess would be that it doesn't, at least on the top end.

In [28]:
# aggregate sum of gross by director
directors2 = df.groupby(['director_name'], as_index=False)['gross'].sum()
df_directors2 = pd.DataFrame(directors2)
df_directors2.head()

Unnamed: 0,director_name,gross
0,Aaron Schneider,9176553
1,Aaron Seltzer,48546578
2,Abel Ferrara,1227324
3,Adam Goldberg,2580
4,Adam Marcus,15935068


In [29]:
# top grossing directors
df_directors2.sort_values(by='gross', ascending=False).head(10)

Unnamed: 0,director_name,gross
1492,Steven Spielberg,4114233101
1229,Peter Jackson,2289968050
1038,Michael Bay,2231242537
1536,Tim Burton,2071275480
1398,Sam Raimi,2049549198
620,James Cameron,1948125910
251,Christopher Nolan,1813227576
518,George Lucas,1741418480
839,Joss Whedon,1730886628
1350,Robert Zemeckis,1619309108


In [30]:
# bottom grossing directors
df_directors2.sort_values(by='gross', ascending=True).head(10)

Unnamed: 0,director_name,gross
424,Ekachai Uekrongtham,162
473,Frank Whaley,703
1311,Ricki Stern,1111
32,Alex Craig Mann,1332
1190,Paul Bunnell,2436
179,Bruce Dellis,2468
3,Adam Goldberg,2580
182,Bruce McDonald,3478
1657,Álex de la Iglesia,3607
815,Jonathan Newman,4756


In [40]:
df['gross'].describe()

count        3756
mean     52612824
std      70317867
min           162
25%       8270233
50%      30093107
75%      66881941
max     760505847
Name: gross, dtype: float64

In [67]:
# Let's set the bar somewhat high and say that anything over 70,000,000 is a success.
df['success'] = np.where(df['gross'] > 70000000, 1, 0)

# convert 'color' to int
df['color'] = np.where(df['color'] == 'Color', 1, 0)

In [68]:
# set dummies for directors 
director_dum = pd.get_dummies(df['director_name'])
director_dum.shape

(3756, 1659)

In [72]:
# create dummies for content rating
rating = pd.get_dummies(df['content_rating'])
rating.shape

(3756, 12)

In [75]:
# concatenate dummies
X = df.drop(columns=['gross', 'success', 'director_name', 'content_rating'])
X = pd.concat([X, director_dum, rating], axis=1)
X.shape

(3756, 1686)

In [54]:
# target
y = df['success']
y.shape

(3756,)

In [86]:
# build mlp classifier model
import time

start = time.time()

mlp = MLPClassifier(hidden_layer_sizes=(500, 100, ))
mlp.fit(X, y)

elapsed = time.time() - start
print('Runtime: %0.2f' % elapsed)

Runtime: 12.86


In [87]:
mlp.score(X, y)

0.8176251331203408

In [78]:
mlp.predict_proba(X)

array([[0.00000000e+00, 1.00000000e+00],
       [0.00000000e+00, 1.00000000e+00],
       [0.00000000e+00, 1.00000000e+00],
       ...,
       [6.82017245e-03, 9.93179828e-01],
       [0.00000000e+00, 1.00000000e+00],
       [1.00000000e+00, 1.26718776e-61]])

81.76%. Not bad. Let's see how that compares to a random forest model. 

In [89]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Start clock
start_time = time.time()

# Initialize random forest regressor
rfr = ensemble.RandomForestClassifier(n_estimators=1000)

# Fit model to training data
rfr.fit(X_train, y_train)

# Test model with test data
print(rfr.score(X_test, y_test))

# Print runtime
rf_time = time.time() - start_time
print("Runtime: %0.2f" % rf_time)

0.8577127659574468
Runtime: 11.84


Very similar but random forest did give a slightly better figure. This dataset may not be the best in showing the benefits of neural networks since we only had less than 4000 rows. Since neural networks tend to like having a lot of data, it might show a real difference if used on a very large dataset.