In [1]:
import json
import pandas as pd
import numpy as np
import json
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
from joblib import dump, load
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor 
from mlxtend.plotting import plot_decision_regions

In [2]:
def load_tmdb_movies(path):
    df = pd.read_csv(path, parse_dates=['release_date'])
    df['release_date'] = pd.to_datetime(df['release_date']).apply(lambda x: x.date())
    json_columns = ['genres', 'keywords', 'production_countries', 'production_companies', 'spoken_languages']
    for column in json_columns:
        df[column] = df[column].apply(json.loads)
    return df

In [3]:
def load_tmdb_credits(path):
    df = pd.read_csv(path)
    json_columns = ['cast', 'crew']
    for column in json_columns:
        df[column] = df[column].apply(json.loads)
    return df

In [4]:
LOST_COLUMNS = [
    'actor_1_facebook_likes',
    'actor_2_facebook_likes',
    'actor_3_facebook_likes',
    'aspect_ratio',
    'cast_total_facebook_likes',
    'color',
    'content_rating',
    'director_facebook_likes',
    'facenumber_in_poster',
    'movie_facebook_likes',
    'movie_imdb_link',
    'num_critic_for_reviews',
    'num_user_for_reviews'
                ]

In [5]:
TMDB_TO_IMDB_SIMPLE_EQUIVALENCIES = {
    'budget': 'budget',
    'genres': 'genres',
    'revenue': 'gross',
    'title': 'movie_title',
    'runtime': 'duration',
    'original_language': 'language',  # it's possible that spoken_languages would be a better match
    'keywords': 'plot_keywords',
    'vote_count': 'num_voted_users',
                                         }

IMDB_COLUMNS_TO_REMAP = {'imdb_score': 'vote_average'}

In [6]:
def safe_access(container, index_values):
    # return a missing value rather than an error upon indexing/key failure
    result = container
    try:
        for idx in index_values:
            result = result[idx]
        return result
    except IndexError or KeyError:
        return pd.np.nan

In [7]:
def get_director(crew_data):
    directors = [x['name'] for x in crew_data if x['job'] == 'Director']
    return safe_access(directors, [0])

In [8]:
def get_companies(company_data):
    companies = [x['name'] for x in company_data if x['name'] == 'Director']
    return safe_access(directors, [0])

In [9]:
def pipe_flatten_names(keywords):
    return '|'.join([x['name'] for x in keywords])

In [10]:
def convert_to_original_format(movies, credits):
    # Converts TMDb data to make it as compatible as possible with kernels built on the original version of the data.
    tmdb_movies = movies.copy()
    tmdb_movies.rename(columns=TMDB_TO_IMDB_SIMPLE_EQUIVALENCIES, inplace=True)
    tmdb_movies['title_year'] = pd.to_datetime(tmdb_movies['release_date']).apply(lambda x: x.year)
    # I'm assuming that the first production country is equivalent, but have not been able to validate this
    tmdb_movies['country'] = tmdb_movies['production_countries'].apply(lambda x: safe_access(x, [0, 'name']))
    tmdb_movies['company'] = tmdb_movies['production_companies'].apply(lambda x: safe_access(x, [0, 'name']))
    tmdb_movies['language'] = tmdb_movies['spoken_languages'].apply(lambda x: safe_access(x, [0, 'name']))
    # tmdb_movies['production_companies'] = tmdb_movies['production_companies'].apply(lambda x: safe_access(x, [0, 'name']))
    tmdb_movies['director_name'] = credits['crew'].apply(get_director)
    tmdb_movies['actor_1_name'] = credits['cast'].apply(lambda x: safe_access(x, [0, 'name']))
    tmdb_movies['actor_2_name'] = credits['cast'].apply(lambda x: safe_access(x, [1, 'name']))
    tmdb_movies['actor_3_name'] = credits['cast'].apply(lambda x: safe_access(x, [2, 'name']))
    tmdb_movies['genres'] = tmdb_movies['genres'].apply(pipe_flatten_names)
    tmdb_movies['plot_keywords'] = tmdb_movies['plot_keywords'].apply(pipe_flatten_names)
    return tmdb_movies

In [11]:
movies = load_tmdb_movies("Data/Original/tmdb_5000_movies.csv")
credits = load_tmdb_credits("Data/tmdb_5000_credits.csv")
original_format = convert_to_original_format(movies, credits)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 5)
original_format

Unnamed: 0,budget,genres,homepage,id,plot_keywords,language,original_title,overview,popularity,production_companies,production_countries,release_date,gross,duration,spoken_languages,status,tagline,movie_title,vote_average,num_voted_users,title_year,country,company,director_name,actor_1_name,actor_2_name,actor_3_name
0,237000000,Action|Adventure|Fantasy|Science Fiction,http://www.avatarmovie.com/,19995,culture clash|future|space war|space colony|so...,English,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{'name': 'Ingenious Film Partners', 'id': 289...","[{'iso_3166_1': 'US', 'name': 'United States o...",2009-12-10,2787965087,162.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,2009.0,United States of America,Ingenious Film Partners,James Cameron,Sam Worthington,Zoe Saldana,Sigourney Weaver
1,300000000,Adventure|Fantasy|Action,http://disney.go.com/disneypictures/pirates/,285,ocean|drug abuse|exotic island|east india trad...,English,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{'name': 'Walt Disney Pictures', 'id': 2}, {'...","[{'iso_3166_1': 'US', 'name': 'United States o...",2007-05-19,961000000,169.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,2007.0,United States of America,Walt Disney Pictures,Gore Verbinski,Johnny Depp,Orlando Bloom,Keira Knightley
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4801,0,,http://shanghaicalling.com/,126186,,English,Shanghai Calling,When ambitious New York attorney Sam is sent t...,0.857008,[],"[{'iso_3166_1': 'US', 'name': 'United States o...",2012-05-03,0,98.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A New Yorker in Shanghai,Shanghai Calling,5.7,7,2012.0,United States of America,,Daniel Hsia,Daniel Henney,Eliza Coupe,Bill Paxton
4802,0,Documentary,,25975,obsession|camcorder|crush|dream girl,English,My Date with Drew,Ever since the second grade when he first saw ...,1.929883,"[{'name': 'rusty bear entertainment', 'id': 87...","[{'iso_3166_1': 'US', 'name': 'United States o...",2005-08-05,0,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,My Date with Drew,6.3,16,2005.0,United States of America,rusty bear entertainment,Brian Herzlinger,Drew Barrymore,Brian Herzlinger,Corey Feldman


In [12]:
df = original_format.copy()
features_to_remove =['id','original_title','overview', 'movie_title', 'homepage', 'status', 'plot_keywords', 'spoken_languages', 'title_year', 'production_countries', 'production_companies','tagline','popularity','vote_average', 'num_voted_users', 'genres']
df.drop(features_to_remove, axis = 1, inplace= True)
df.columns

Index(['budget', 'language', 'release_date', 'gross', 'duration', 'country',
       'company', 'director_name', 'actor_1_name', 'actor_2_name',
       'actor_3_name'],
      dtype='object')

In [13]:
df.isna().sum()

budget           0
language        86
                ..
actor_2_name    53
actor_3_name    63
Length: 11, dtype: int64

In [14]:
replace_with_zero = ['duration']

missing_val_cols = list(df.columns[df.isna().any()])
for col in missing_val_cols:
  if col not in replace_with_zero:
    miss_percentage = ((df[col].isna().sum())/4803) * 100 
    print(f'{col} : {str(miss_percentage)}')
    df[col].fillna('Missing', inplace=True)
  else:
    miss_percentage = ((df[col].isna().sum())/4803) * 100 
    print(f'{col} : {str(miss_percentage)}')
     

df.isna().any()

language : 1.7905475744326462
release_date : 0.020820320632937747
duration : 0.041640641265875494
country : 3.622735790131168
company : 7.30793254216115
director_name : 0.6246096189881324
actor_1_name : 0.8952737872163231
actor_2_name : 1.1034769935457007
actor_3_name : 1.311680199875078


budget          False
language        False
                ...  
actor_2_name    False
actor_3_name    False
Length: 11, dtype: bool

In [15]:
df['release_date'] = pd.to_datetime(df['release_date'].astype(str),errors='coerce')
df['dayofrelease']=df['release_date'].dt.strftime('%A')
df['monthofrelease']=df['release_date'].dt.strftime('%b')
df.drop(columns=['release_date'], inplace = True)
df.head(3)

Unnamed: 0,budget,language,gross,duration,country,company,director_name,actor_1_name,actor_2_name,actor_3_name,dayofrelease,monthofrelease
0,237000000,English,2787965087,162.0,United States of America,Ingenious Film Partners,James Cameron,Sam Worthington,Zoe Saldana,Sigourney Weaver,Thursday,Dec
1,300000000,English,961000000,169.0,United States of America,Walt Disney Pictures,Gore Verbinski,Johnny Depp,Orlando Bloom,Keira Knightley,Saturday,May
2,245000000,Français,880674609,148.0,United Kingdom,Columbia Pictures,Sam Mendes,Daniel Craig,Christoph Waltz,Léa Seydoux,Monday,Oct


In [16]:
# Parse the stringified features into their corresponding python objects
from ast import literal_eval

# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''


features = ['actor_1_name','actor_2_name','actor_3_name','director_name','country', 'company', 'dayofrelease', 'language']
for feature in features:
    df[feature] = df[feature].apply(clean_data)

df

Unnamed: 0,budget,language,gross,duration,country,company,director_name,actor_1_name,actor_2_name,actor_3_name,dayofrelease,monthofrelease
0,237000000,english,2787965087,162.0,unitedstatesofamerica,ingeniousfilmpartners,jamescameron,samworthington,zoesaldana,sigourneyweaver,thursday,Dec
1,300000000,english,961000000,169.0,unitedstatesofamerica,waltdisneypictures,goreverbinski,johnnydepp,orlandobloom,keiraknightley,saturday,May
...,...,...,...,...,...,...,...,...,...,...,...,...
4801,0,english,0,98.0,unitedstatesofamerica,missing,danielhsia,danielhenney,elizacoupe,billpaxton,thursday,May
4802,0,english,0,90.0,unitedstatesofamerica,rustybearentertainment,brianherzlinger,drewbarrymore,brianherzlinger,coreyfeldman,friday,Aug


In [17]:
data_copy = df.copy()
data_copy['Success'] = np.where(data_copy['gross'] >= data_copy['gross'].mean(), 1,0)
data_copy.pop('gross')
data_copy

Unnamed: 0,budget,language,duration,country,company,director_name,actor_1_name,actor_2_name,actor_3_name,dayofrelease,monthofrelease,Success
0,237000000,english,162.0,unitedstatesofamerica,ingeniousfilmpartners,jamescameron,samworthington,zoesaldana,sigourneyweaver,thursday,Dec,1
1,300000000,english,169.0,unitedstatesofamerica,waltdisneypictures,goreverbinski,johnnydepp,orlandobloom,keiraknightley,saturday,May,1
...,...,...,...,...,...,...,...,...,...,...,...,...
4801,0,english,98.0,unitedstatesofamerica,missing,danielhsia,danielhenney,elizacoupe,billpaxton,thursday,May,0
4802,0,english,90.0,unitedstatesofamerica,rustybearentertainment,brianherzlinger,drewbarrymore,brianherzlinger,coreyfeldman,friday,Aug,0


In [18]:
X = data_copy.copy()
y = X.pop('Success')
X

Unnamed: 0,budget,language,duration,country,company,director_name,actor_1_name,actor_2_name,actor_3_name,dayofrelease,monthofrelease
0,237000000,english,162.0,unitedstatesofamerica,ingeniousfilmpartners,jamescameron,samworthington,zoesaldana,sigourneyweaver,thursday,Dec
1,300000000,english,169.0,unitedstatesofamerica,waltdisneypictures,goreverbinski,johnnydepp,orlandobloom,keiraknightley,saturday,May
...,...,...,...,...,...,...,...,...,...,...,...
4801,0,english,98.0,unitedstatesofamerica,missing,danielhsia,danielhenney,elizacoupe,billpaxton,thursday,May
4802,0,english,90.0,unitedstatesofamerica,rustybearentertainment,brianherzlinger,drewbarrymore,brianherzlinger,coreyfeldman,friday,Aug


In [19]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y,train_size=0.8,test_size=0.2,random_state=0)

In [20]:
categorical_cols = ['language','country','company','director_name','actor_1_name','actor_2_name','actor_3_name','dayofrelease', 'monthofrelease']

# Select numerical columns
numerical_cols = ['budget','duration']

In [21]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps =[
    ('imputer',SimpleImputer(strategy='mean')),
    ('scale',StandardScaler())
    ])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [22]:
# Define model
model = RandomForestClassifier(n_estimators=100, random_state=0)
# Bundle preprocessing and modeling code in a pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),('model', model)])
# Preprocessing of training data, fit model 
clf.fit(X_train, y_train)
# Preprocessing of validation data, get predictions
preds = clf.predict(X_valid)
from sklearn.model_selection import cross_val_score

accuracy = cross_val_score(clf, X, y, scoring='accuracy', cv = 10)
print(accuracy)
#get the mean of each fold 
print("Accuracy of Model with Cross Validation is:",accuracy.mean() * 100)

[0.3991684  0.68191268 0.91891892 0.85416667 0.80208333 0.76458333
 0.74375    0.74583333 0.7375     0.73541667]
Accuracy of Model with Cross Validation is: 73.83333333333334


In [23]:
from sklearn.linear_model import LogisticRegression

# Define model
model = LogisticRegression()
# Bundle preprocessing and modeling code in a pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),('model', model)])
# Preprocessing of training data, fit model 
clf.fit(X_train, y_train)
# Preprocessing of validation data, get predictions
preds = clf.predict(X_valid)
from sklearn.model_selection import cross_val_score

accuracy = cross_val_score(clf, X, y, scoring='accuracy', cv = 10)
print(accuracy)
#get the mean of each fold 
print("Accuracy of Model with Cross Validation is:",accuracy.mean() * 100)

[0.42619543 0.88357588 0.96257796 0.95208333 0.89791667 0.85833333
 0.82083333 0.78333333 0.75208333 0.73541667]
Accuracy of Model with Cross Validation is: 80.72349272349273


In [24]:
from sklearn.neighbors import KNeighborsClassifier

# Define model
model = KNeighborsClassifier()
# Bundle preprocessing and modeling code in a pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),('model', model)])
# Preprocessing of training data, fit model 
clf.fit(X_train, y_train)
# Preprocessing of validation data, get predictions
preds = clf.predict(X_valid)
from sklearn.model_selection import cross_val_score

accuracy = cross_val_score(clf, X, y, scoring='accuracy', cv = 10)
print(accuracy)
#get the mean of each fold 
print("Accuracy of Model with Cross Validation is:",accuracy.mean() * 100)

[0.48232848 0.76507277 0.84199584 0.87083333 0.825      0.83333333
 0.81041667 0.78958333 0.775      0.74375   ]
Accuracy of Model with Cross Validation is: 77.37313756063757


In [28]:
from sklearn.svm import LinearSVC

# Define model
model = LinearSVC()
# Bundle preprocessing and modeling code in a pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),('model', model)])
# Preprocessing of training data, fit model 
clf.fit(X_train, y_train)
# Preprocessing of validation data, get predictions
preds = clf.predict(X_valid)
from sklearn.model_selection import cross_val_score

accuracy = cross_val_score(clf, X, y, scoring='accuracy', cv = 10)
print(accuracy)
#get the mean of each fold 
print("Accuracy of Model with Cross Validation is:",accuracy.mean() * 100)

[0.47401247 0.78794179 0.88981289 0.88958333 0.85833333 0.85
 0.82708333 0.82291667 0.78958333 0.75416667]
Accuracy of Model with Cross Validation is: 79.43433818433817


In [29]:
from sklearn.neural_network import MLPClassifier

#Define the model
model = MLPClassifier()
# Bundle preprocessing and modeling code in a pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),('model', model)])
# Preprocessing of training data, fit model 
clf.fit(X_train, y_train)
# Preprocessing of validation data, get predictions
preds = clf.predict(X_valid)
from sklearn.model_selection import cross_val_score

accuracy = cross_val_score(clf, X, y, scoring='accuracy', cv = 10)
print(accuracy)
#get the mean of each fold 
print("Accuracy of Model with Cross Validation is:",accuracy.mean() * 100)

NameError: name 'random_state' is not defined

In [26]:
# Bundle preprocessing and modeling code in a pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),('model', model)])

In [98]:
# Preprocessing of training data, fit model 
clf.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = clf.predict(X_valid)

In [99]:
from sklearn.metrics import mean_absolute_error
print('MAE:', mean_absolute_error(y_valid, preds))

MAE: 0.1675338189386056


In [100]:
print("model score: %.3f" % clf.score(X_valid, y_valid))

model score: 0.832


In [101]:
from sklearn.model_selection import cross_val_score

accuracy = cross_val_score(clf, X, y, scoring='accuracy', cv = 10)
print(accuracy)
#get the mean of each fold 
print("Accuracy of Model with Cross Validation is:",accuracy.mean() * 100)

[0.42619543 0.88357588 0.96257796 0.95208333 0.89791667 0.85833333
 0.82083333 0.78333333 0.75208333 0.73541667]
Accuracy of Model with Cross Validation is: 80.72349272349273


In [103]:
dump(clf, 'models/final_model.joblib')

['models/final_model.joblib']