In [2]:
import json
import pandas as pd
import numpy as np
import json
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
import tensorflow as tf
from joblib import dump, load
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model,metrics
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

In [3]:
def load_tmdb_movies(path):
    """Read the TMDb movies CSV and decode its date and JSON-encoded columns.

    Parameters
    ----------
    path : str or file-like
        Location of the movies CSV; passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with ``release_date`` converted to calendar-date
        values and each JSON text column parsed with ``json.loads``.
    """
    movies = pd.read_csv(path, parse_dates=['release_date'])

    # Keep only the calendar date of each release timestamp.
    release_timestamps = pd.to_datetime(movies['release_date'])
    movies['release_date'] = release_timestamps.apply(lambda stamp: stamp.date())

    # These columns hold JSON text in the CSV; parse each cell into Python objects.
    for json_column in ('genres', 'keywords', 'production_countries',
                        'production_companies', 'spoken_languages'):
        movies[json_column] = movies[json_column].map(json.loads)
    return movies

In [4]:
def load_tmdb_credits(path):
    """Read the TMDb credits CSV and parse its JSON-encoded columns.

    Parameters
    ----------
    path : str or file-like
        Location of the credits CSV; passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with the ``cast`` and ``crew`` cells parsed from
        JSON text into Python objects.
    """
    credits_frame = pd.read_csv(path)
    for json_column in ('cast', 'crew'):
        credits_frame[json_column] = credits_frame[json_column].map(json.loads)
    return credits_frame

### Columns that existed in the IMDB version of the dataset but are missing from the TMDb version.

In [5]:
# Names of IMDB-era columns kept for reference as a plain list of strings.
LOST_COLUMNS = [
    'actor_1_facebook_likes',
    'actor_2_facebook_likes',
    'actor_3_facebook_likes',
    'aspect_ratio',
    'cast_total_facebook_likes',
    'color',
    'content_rating',
    'director_facebook_likes',
    'facenumber_in_poster',
    'movie_facebook_likes',
    'movie_imdb_link',
    'num_critic_for_reviews',
    'num_user_for_reviews',
]

### Columns in TMDb that had direct equivalents in the IMDB version. 

In [6]:
# Rename map: TMDb column name -> column name used by the IMDB version.
TMDB_TO_IMDB_SIMPLE_EQUIVALENCIES = {
    'budget': 'budget',
    'genres': 'genres',
    'revenue': 'gross',
    'title': 'movie_title',
    'runtime': 'duration',
    # it's possible that spoken_languages would be a better match
    'original_language': 'language',
    'keywords': 'plot_keywords',
    'vote_count': 'num_voted_users',
}

# IMDB column name -> TMDb column name.
# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
IMDB_COLUMNS_TO_REMAP = {
    'imdb_score': 'vote_average',
}

In [7]:
def safe_access(container, index_values):
    """Follow a chain of indexes/keys into ``container``, tolerating missing entries.

    Parameters
    ----------
    container : indexable
        Object to index into (for example a list of dicts).
    index_values : iterable
        Indexes and/or keys applied one after another, outermost first.

    Returns
    -------
    object
        The value reached after applying every index, or ``np.nan`` when any
        step raises ``IndexError`` or ``KeyError``.
    """
    result = container
    try:
        for idx in index_values:
            result = result[idx]
        return result
    # A tuple is required here: `except IndexError or KeyError` evaluates the
    # `or` first and therefore only ever catches IndexError.
    except (IndexError, KeyError):
        # Use numpy directly; the `pd.np` alias is not available in current pandas.
        return np.nan

In [8]:
def get_director(crew_data):
    """Return the name of the first crew entry whose ``job`` is ``'Director'``.

    ``crew_data`` is iterated as a sequence of mappings that carry ``'job'``
    and ``'name'`` keys. When no entry matches, the result is whatever
    ``safe_access`` returns for a failed lookup.
    """
    director_names = []
    for member in crew_data:
        if member['job'] == 'Director':
            director_names.append(member['name'])
    return safe_access(director_names, [0])

In [9]:
def pipe_flatten_names(keywords):
    """Join the ``'name'`` value of every entry in ``keywords`` with ``'|'``.

    An empty sequence yields an empty string.
    """
    names = (entry['name'] for entry in keywords)
    return '|'.join(names)

In [10]:
def convert_to_original_format(movies, credits):
    """Reshape TMDb data to resemble the original IMDB-style column layout.

    Parameters
    ----------
    movies : pandas.DataFrame
        Movies frame; it is not modified, a renamed copy is returned.
    credits : pandas.DataFrame
        Credits frame providing the ``cast`` and ``crew`` columns. Its values
        are assigned onto the movies frame by index.

    Returns
    -------
    pandas.DataFrame
        Copy of ``movies`` with TMDb columns renamed, plus ``title_year``,
        ``country``, ``language``, ``director_name`` and ``actor_{1,2,3}_name``
        columns, and ``genres`` / ``plot_keywords`` flattened to
        pipe-separated strings.
    """
    def name_at(position):
        # Accessor returning the 'name' of the entry at `position`
        # (or safe_access's missing value when it does not exist).
        return lambda entries: safe_access(entries, [position, 'name'])

    tmdb_movies = movies.rename(columns=TMDB_TO_IMDB_SIMPLE_EQUIVALENCIES)

    release_timestamps = pd.to_datetime(tmdb_movies['release_date'])
    tmdb_movies['title_year'] = release_timestamps.apply(lambda stamp: stamp.year)

    # I'm assuming that the first production country is equivalent, but have not been able to validate this
    tmdb_movies['country'] = tmdb_movies['production_countries'].apply(name_at(0))
    # Overwrites the 'language' column produced by the rename with the first spoken language.
    tmdb_movies['language'] = tmdb_movies['spoken_languages'].apply(name_at(0))

    tmdb_movies['director_name'] = credits['crew'].apply(get_director)
    for billing in (1, 2, 3):
        tmdb_movies[f'actor_{billing}_name'] = credits['cast'].apply(name_at(billing - 1))

    for list_column in ('genres', 'plot_keywords'):
        tmdb_movies[list_column] = tmdb_movies[list_column].apply(pipe_flatten_names)
    return tmdb_movies

In [200]:
# Input files.
# NOTE(review): the two CSVs live in different folders ("Data/Original" vs "Data") — confirm both paths.
movies_csv = "Data/Original/tmdb_5000_movies.csv"
credits_csv = "Data/tmdb_5000_credits.csv"

movies = load_tmdb_movies(movies_csv)
credits = load_tmdb_credits(credits_csv)

# Combine both sources into the IMDB-style layout and preview the first row.
original_format = convert_to_original_format(movies, credits)
original_format.head(1)

Unnamed: 0,budget,genres,homepage,id,plot_keywords,language,original_title,overview,popularity,production_companies,...,tagline,movie_title,vote_average,num_voted_users,title_year,country,director_name,actor_1_name,actor_2_name,actor_3_name
0,237000000,Action|Adventure|Fantasy|Science Fiction,http://www.avatarmovie.com/,19995,culture clash|future|space war|space colony|so...,English,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{'name': 'Ingenious Film Partners', 'id': 289...",...,Enter the World of Pandora.,Avatar,7.2,11800,2009.0,United States of America,James Cameron,Sam Worthington,Zoe Saldana,Sigourney Weaver


In [230]:
# Columns excluded from the modelling frame.
features_to_remove = [
    'id', 'original_title', 'overview', 'movie_title', 'homepage',
    'status', 'plot_keywords', 'spoken_languages', 'gross', 'title_year',
    'production_countries', 'production_companies', 'tagline',
    'vote_average', 'num_voted_users',
]

# drop() returns a new frame, so original_format itself is left untouched.
df = original_format.drop(columns=features_to_remove)
df.columns

Index(['budget', 'genres', 'language', 'popularity', 'release_date',
       'duration', 'country', 'director_name', 'actor_1_name', 'actor_2_name',
       'actor_3_name'],
      dtype='object')

In [231]:
# Count the missing (NaN) values in each column of df.
df.isna().sum()

budget             0
genres             0
language          86
popularity         0
release_date       1
duration           2
country          174
director_name     30
actor_1_name      43
actor_2_name      53
actor_3_name      63
dtype: int64

In [232]:
# Columns listed here are imputed with their column mean; every other column
# that has missing values receives the placeholder string 'Missing'.
# (The name is historical — nothing is actually filled with zero.)
replace_with_zero = ['duration','title_year']

# Use the real row count instead of a hard-coded 4803 so the percentages stay
# correct if the input data changes.
n_rows = len(df)

missing_val_cols = list(df.columns[df.isna().any()])
for col in missing_val_cols:
    miss_percentage = (df[col].isna().sum() / n_rows) * 100
    print(f'{col} : {str(miss_percentage)}')

    fill_value = df[col].mean() if col in replace_with_zero else 'Missing'
    # Assign the result back to the frame: calling fillna(inplace=True) on
    # df[col] is chained assignment and is not guaranteed to update df
    # (it warns, and is a no-op under pandas Copy-on-Write).
    df[col] = df[col].fillna(fill_value)

language : 1.7905475744326462
release_date : 0.020820320632937747
duration : 0.041640641265875494
country : 3.622735790131168
director_name : 0.6246096189881324
actor_1_name : 0.8952737872163231
actor_2_name : 1.1034769935457007
actor_3_name : 1.311680199875078


In [233]:
# Show the dtype of each column in df.
df.dtypes

budget             int64
genres            object
language          object
popularity       float64
release_date      object
duration         float64
country           object
director_name     object
actor_1_name      object
actor_2_name      object
actor_3_name      object
dtype: object

In [234]:
# scaler = StandardScaler()

# numerical = df.select_dtypes(include=['int64','float']).copy()
# numerical = df.columns[df.dtypes.apply(lambda c: np.issubdtype(c, np.number))]


# df[numerical] = scaler.fit_transform(df[numerical])
# df


In [235]:
# Parse the release dates; values that cannot be parsed become NaT (errors='coerce').
release_timestamps = pd.to_datetime(df['release_date'].astype(str), errors='coerce')

# Replace the raw date column with the weekday name of the release ('%A').
# df['monthofrelease']=df['release_date'].dt.strftime('%m')
df = (
    df.drop(columns=['release_date'])
      .assign(dayofrelease=release_timestamps.dt.strftime('%A'))
)
df.head(3)

Unnamed: 0,budget,genres,language,popularity,duration,country,director_name,actor_1_name,actor_2_name,actor_3_name,dayofrelease
0,237000000,Action|Adventure|Fantasy|Science Fiction,English,150.437577,162.0,United States of America,James Cameron,Sam Worthington,Zoe Saldana,Sigourney Weaver,Thursday
1,300000000,Adventure|Fantasy|Action,English,139.082615,169.0,United States of America,Gore Verbinski,Johnny Depp,Orlando Bloom,Keira Knightley,Saturday
2,245000000,Action|Adventure|Crime,Français,107.376788,148.0,United Kingdom,Sam Mendes,Daniel Craig,Christoph Waltz,Léa Seydoux,Monday


In [236]:
# One-hot encode the weekday column and swap it for its indicator columns.
nominal = ['dayofrelease']
one_hot = pd.get_dummies(df[nominal])

# drop() returns a new frame, so df keeps its 'dayofrelease' column.
data_copy = df.drop(columns=nominal).join(one_hot)
data_copy

Unnamed: 0,budget,genres,language,popularity,duration,country,director_name,actor_1_name,actor_2_name,actor_3_name,dayofrelease_Friday,dayofrelease_Monday,dayofrelease_Saturday,dayofrelease_Sunday,dayofrelease_Thursday,dayofrelease_Tuesday,dayofrelease_Wednesday
0,237000000,Action|Adventure|Fantasy|Science Fiction,English,150.437577,162.0,United States of America,James Cameron,Sam Worthington,Zoe Saldana,Sigourney Weaver,0,0,0,0,1,0,0
1,300000000,Adventure|Fantasy|Action,English,139.082615,169.0,United States of America,Gore Verbinski,Johnny Depp,Orlando Bloom,Keira Knightley,0,0,1,0,0,0,0
2,245000000,Action|Adventure|Crime,Français,107.376788,148.0,United Kingdom,Sam Mendes,Daniel Craig,Christoph Waltz,Léa Seydoux,0,1,0,0,0,0,0
3,250000000,Action|Crime|Drama|Thriller,English,112.312950,165.0,United States of America,Christopher Nolan,Christian Bale,Michael Caine,Gary Oldman,0,1,0,0,0,0,0
4,260000000,Action|Adventure|Science Fiction,English,43.926995,132.0,United States of America,Andrew Stanton,Taylor Kitsch,Lynn Collins,Samantha Morton,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4798,220000,Action|Crime|Thriller,Español,14.269792,81.0,Mexico,Robert Rodriguez,Carlos Gallardo,Jaime de Hoyos,Peter Marquardt,1,0,0,0,0,0,0
4799,9000,Comedy|Romance,Missing,0.642552,85.0,Missing,Edward Burns,Edward Burns,Kerry Bishé,Marsha Dietlein,0,1,0,0,0,0,0
4800,0,Comedy|Drama|Romance|TV Movie,English,1.444476,120.0,United States of America,Scott Smith,Eric Mabius,Kristin Booth,Crystal Lowe,0,0,0,1,0,0,0
4801,0,,English,0.857008,98.0,United States of America,Daniel Hsia,Daniel Henney,Eliza Coupe,Bill Paxton,0,0,0,0,1,0,0


In [237]:
from sklearn.preprocessing import LabelEncoder

# Text columns to be replaced by integer category codes.
# NOTE(review): the codes come from pandas' category ordering, so the numeric
# order carries no meaning for these columns — confirm this encoding is intended.
ordinal = ['actor_1_name','actor_2_name','actor_3_name', 'director_name', 'country','language', 'genres']

for text_column in ordinal:
    data_copy[text_column] = data_copy[text_column].astype('category').cat.codes
# pd.set_option('display.max_rows', None)

In [238]:
# Preview the first rows of data_copy.
data_copy.head()

Unnamed: 0,budget,genres,language,popularity,duration,country,director_name,actor_1_name,actor_2_name,actor_3_name,dayofrelease_Friday,dayofrelease_Monday,dayofrelease_Saturday,dayofrelease_Sunday,dayofrelease_Thursday,dayofrelease_Tuesday,dayofrelease_Wednesday
0,237000000,64,8,150.437577,162.0,70,885,1760,2715,2725,0,0,0,0,1,0,0
1,300000000,332,8,139.082615,169.0,70,769,991,2017,1674,0,0,1,0,0,0,0
2,245000000,33,11,107.376788,148.0,69,1972,439,452,1909,0,1,0,0,0,0,0
3,250000000,135,8,112.31295,165.0,70,360,361,1824,994,0,1,0,0,0,0,0
4,260000000,76,8,43.926995,132.0,70,93,1923,1639,2615,0,0,0,0,0,0,1


In [239]:
# Move the target to the end: remove 'popularity' from the feature frame and
# re-attach the column from df as the last column of new_data.
data_copy = data_copy.drop(columns=['popularity'])
new_data = data_copy.join(df['popularity'])
new_data

Unnamed: 0,budget,genres,language,duration,country,director_name,actor_1_name,actor_2_name,actor_3_name,dayofrelease_Friday,dayofrelease_Monday,dayofrelease_Saturday,dayofrelease_Sunday,dayofrelease_Thursday,dayofrelease_Tuesday,dayofrelease_Wednesday,popularity
0,237000000,64,8,162.0,70,885,1760,2715,2725,0,0,0,0,1,0,0,150.437577
1,300000000,332,8,169.0,70,769,991,2017,1674,0,0,1,0,0,0,0,139.082615
2,245000000,33,11,148.0,69,1972,439,452,1909,0,1,0,0,0,0,0,107.376788
3,250000000,135,8,165.0,70,360,361,1824,994,0,1,0,0,0,0,0,112.312950
4,260000000,76,8,132.0,70,93,1923,1639,2615,0,0,0,0,0,0,1,43.926995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4798,220000,141,9,81.0,46,1896,291,1064,2363,1,0,0,0,0,0,0,14.269792
4799,9000,527,19,85.0,47,605,553,1471,1987,0,1,0,0,0,0,0,0.642552
4800,0,482,8,120.0,70,1998,603,1511,598,0,0,0,1,0,0,0,1.444476
4801,0,0,8,98.0,70,439,441,743,301,0,0,0,0,1,0,0,0.857008


In [241]:
# Standardize every numeric column of data_copy.
# NOTE(review): new_data was built before this cell and is what the later
# cells slice into X/y, so this scaling does not appear to reach the model
# inputs — confirm whether that is intended.
scaler = StandardScaler()

is_numeric = [np.issubdtype(column_dtype, np.number) for column_dtype in data_copy.dtypes]
numerical = data_copy.columns[is_numeric]

scaled_values = scaler.fit_transform(data_copy[numerical])
data_copy[numerical] = scaled_values
data_copy

Unnamed: 0,budget,genres,language,duration,country,director_name,actor_1_name,actor_2_name,actor_3_name,dayofrelease_Friday,dayofrelease_Monday,dayofrelease_Saturday,dayofrelease_Sunday,dayofrelease_Thursday,dayofrelease_Tuesday,dayofrelease_Wednesday
0,5.107181,-1.712060,-0.273936,2.438596,0.566591,-0.477710,1.171365,1.746301,1.324216,-0.864029,-0.225798,-0.227828,-0.217527,2.059964,-0.280786,-0.445369
1,6.654402,-0.794803,-0.273936,2.748263,0.566591,-0.647522,-0.108215,0.850469,0.138410,-0.864029,-0.225798,4.389285,-0.217527,-0.485445,-0.280786,-0.445369
2,5.303653,-1.818160,0.152828,1.819260,0.517966,1.113538,-1.026717,-1.158094,0.403552,-0.864029,4.428740,-0.227828,-0.217527,-0.485445,-0.280786,-0.445369
3,5.426449,-1.469055,-0.273936,2.571310,0.566591,-1.246253,-1.156506,0.602768,-0.628810,-0.864029,4.428740,-0.227828,-0.217527,-0.485445,-0.280786,-0.445369
4,5.672039,-1.670988,-0.273936,1.111448,0.566591,-1.637111,1.442589,0.365334,1.200107,-0.864029,-0.225798,-0.227828,-0.217527,-0.485445,-0.280786,2.245331
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4798,-0.707916,-1.448520,-0.131681,-1.144703,-0.600426,1.002283,-1.272982,-0.372637,0.915784,1.157369,-0.225798,-0.227828,-0.217527,-0.485445,-0.280786,-0.445369
4799,-0.713098,-0.127396,1.290866,-0.967750,-0.551800,-0.887600,-0.837027,0.149718,0.491557,-0.864029,4.428740,-0.227828,-0.217527,-0.485445,-0.280786,-0.445369
4800,-0.713319,-0.281413,-0.273936,0.580589,0.566591,1.151599,-0.753829,0.201055,-1.075603,-0.864029,-0.225798,-0.227828,4.597134,-0.485445,-0.280786,-0.445369
4801,-0.713319,-1.931106,-0.273936,-0.392652,0.566591,-1.130605,-1.023389,-0.784617,-1.410697,-0.864029,-0.225798,-0.227828,-0.217527,2.059964,-0.280786,-0.445369


In [242]:
# Preview the first rows of data_copy.
data_copy.head()

Unnamed: 0,budget,genres,language,duration,country,director_name,actor_1_name,actor_2_name,actor_3_name,dayofrelease_Friday,dayofrelease_Monday,dayofrelease_Saturday,dayofrelease_Sunday,dayofrelease_Thursday,dayofrelease_Tuesday,dayofrelease_Wednesday
0,5.107181,-1.71206,-0.273936,2.438596,0.566591,-0.47771,1.171365,1.746301,1.324216,-0.864029,-0.225798,-0.227828,-0.217527,2.059964,-0.280786,-0.445369
1,6.654402,-0.794803,-0.273936,2.748263,0.566591,-0.647522,-0.108215,0.850469,0.13841,-0.864029,-0.225798,4.389285,-0.217527,-0.485445,-0.280786,-0.445369
2,5.303653,-1.81816,0.152828,1.81926,0.517966,1.113538,-1.026717,-1.158094,0.403552,-0.864029,4.42874,-0.227828,-0.217527,-0.485445,-0.280786,-0.445369
3,5.426449,-1.469055,-0.273936,2.57131,0.566591,-1.246253,-1.156506,0.602768,-0.62881,-0.864029,4.42874,-0.227828,-0.217527,-0.485445,-0.280786,-0.445369
4,5.672039,-1.670988,-0.273936,1.111448,0.566591,-1.637111,1.442589,0.365334,1.200107,-0.864029,-0.225798,-0.227828,-0.217527,-0.485445,-0.280786,2.245331


In [243]:
# Split new_data into the feature matrix (all columns but the last) and the
# target (last column, kept 2-D by selecting it with a list).
all_features = list(new_data.columns)
first_feature, last_feature, target_name = all_features[0], all_features[-2], all_features[-1]

feature_frame = new_data.loc[:, first_feature:last_feature]
target_frame = new_data.loc[:, [target_name]]

X = feature_frame.values
y = target_frame.values
X

array([[2.37e+08, 6.40e+01, 8.00e+00, ..., 1.00e+00, 0.00e+00, 0.00e+00],
       [3.00e+08, 3.32e+02, 8.00e+00, ..., 0.00e+00, 0.00e+00, 0.00e+00],
       [2.45e+08, 3.30e+01, 1.10e+01, ..., 0.00e+00, 0.00e+00, 0.00e+00],
       ...,
       [0.00e+00, 4.82e+02, 8.00e+00, ..., 0.00e+00, 0.00e+00, 0.00e+00],
       [0.00e+00, 0.00e+00, 8.00e+00, ..., 1.00e+00, 0.00e+00, 0.00e+00],
       [0.00e+00, 6.01e+02, 8.00e+00, ..., 0.00e+00, 0.00e+00, 0.00e+00]])

In [244]:
# Apply a power transform to the features and, separately, to the target.
# NOTE(review): both transformers are fitted on the full data set before the
# train/test split performed in a later cell, so test rows influence the
# fitted transform — consider fitting on the training portion only.
feature_transformer = PowerTransformer()
target_transformer = PowerTransformer()

X = feature_transformer.fit_transform(X)
y = target_transformer.fit_transform(y)
X

array([[ 1.61646776, -1.86015999, -0.22438998, ...,  2.05996354,
        -0.28078634, -0.44536872],
       [ 1.75415631, -0.73768099, -0.22438998, ..., -0.48544549,
        -0.28078634, -0.44536872],
       [ 1.63553265, -2.03111726,  0.59947141, ..., -0.48544549,
        -0.28078634, -0.44536872],
       ...,
       [-1.67433707, -0.20949588, -0.22438998, ..., -0.48544549,
        -0.28078634, -0.44536872],
       [-1.67433707, -2.27150805, -0.22438998, ...,  2.05996354,
        -0.28078634, -0.44536872],
       [-1.67433707,  0.18467373, -0.22438998, ..., -0.48544549,
        -0.28078634, -0.44536872]])

In [245]:
# Fit a PCA that keeps enough components to explain 90% of the variance and
# report how many components that takes. Only the fitted estimator is needed,
# so the projected data is not computed.
pca_with_varia = PCA(0.9).fit(X)
pca_with_varia.n_components_

13

In [246]:
# Project the features onto the first three principal components and place
# the transformed target next to them.
pca = PCA(n_components=3)
pc_s = pca.fit_transform(X)

component_labels = ['PC1','PC2','PC3']
pca_data = pd.DataFrame(pc_s, columns=component_labels)
target_data = pd.DataFrame(y, columns=['popularity'])

final_data = pca_data.join(target_data)
final_data

Unnamed: 0,PC1,PC2,PC3,popularity
0,2.346944,-1.888462,-1.364127,2.400489
1,1.615583,-1.044753,0.387666,2.314766
2,1.864714,-1.438651,0.607964,2.037021
3,2.034855,-2.005410,0.014940,2.084737
4,2.141629,-2.090067,1.654339,1.134385
...,...,...,...,...
4798,-1.484447,0.414909,0.583813,0.127453
4799,0.145647,2.333304,0.759200,-1.649725
4800,-0.000853,1.819871,0.303886,-1.360066
4801,0.700179,0.897910,-2.064304,-1.561498


In [247]:
# Fraction of the total variance explained by each of the three fitted components.
pca.explained_variance_ratio_

array([0.09787042, 0.08780805, 0.07714011])

In [248]:
# Hold out 30% of the rows for testing; random_state=1 makes the split repeatable.
# NOTE(review): this splits X/y directly, not the PCA projections stored in
# final_data — confirm the PCA output is meant to be unused here.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=1)
X_train

array([[ 0.89812837, -2.13903936, -0.22438998, ..., -0.48544549,
        -0.28078634,  2.24533058],
       [ 0.2858123 , -0.54267547,  0.59947141, ..., -0.48544549,
        -0.28078634,  2.24533058],
       [-1.67433707, -0.08494362, -0.22438998, ..., -0.48544549,
        -0.28078634, -0.44536872],
       ...,
       [ 0.83177067, -0.30165856, -0.22438998, ..., -0.48544549,
        -0.28078634, -0.44536872],
       [-1.67433707,  1.24045979,  1.34428832, ..., -0.48544549,
        -0.28078634,  2.24533058],
       [ 1.14248347,  0.90636132,  0.59947141, ..., -0.48544549,
        -0.28078634, -0.44536872]])

In [132]:
# Candidate linear regressors, all with default hyper-parameters.
models = {
    'PassiveAggressiveRegressor':linear_model.PassiveAggressiveRegressor(),
    'Ridge':linear_model.Ridge(),
    'ElasticNet': linear_model.ElasticNet(),
    'LassoCV': linear_model.LassoCV(),
    'BayesianRidge' : linear_model.BayesianRidge(),
    'ARDRegression' : linear_model.ARDRegression()
}

# Fit each model and record its test-set score as a percentage. For these
# regressors .score() is the R^2 coefficient of determination, not accuracy.
# NOTE(review): the output recorded for this cell is a ValueError and its
# execution count is lower than the preceding cells — re-run on a fresh kernel.
result = {}
for model_name, estimator in models.items():
    estimator.fit(X_train, y_train)
    test_score = estimator.score(X_test, y_test)
    result[model_name] = [test_score * 100]
pd.DataFrame(result)

ValueError: could not convert string to float: 'United States of America'

In [133]:
# Serialize the Ridge estimator from `models` to 'Ridge.joblib'.
# NOTE(review): the recorded output of the training cell is a ValueError, so
# confirm the estimator was actually fitted before it was saved.
dump(models['Ridge'], 'Ridge.joblib')

['Ridge.joblib']

In [134]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin