In [1]:
import json
import pandas as pd
import numpy as np
import json
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
from joblib import dump, load
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from mlxtend.plotting import plot_decision_regions

In [2]:
def load_tmdb_movies(path):
    df = pd.read_csv(path, parse_dates=['release_date'])
    df['release_date'] = pd.to_datetime(df['release_date']).apply(lambda x: x.date())
    json_columns = ['genres', 'keywords', 'production_countries', 'production_companies', 'spoken_languages']
    for column in json_columns:
        df[column] = df[column].apply(json.loads)
    return df

In [3]:
def load_tmdb_credits(path):
    df = pd.read_csv(path)
    json_columns = ['cast', 'crew']
    for column in json_columns:
        df[column] = df[column].apply(json.loads)
    return df

In [4]:
LOST_COLUMNS = [
    'actor_1_facebook_likes',
    'actor_2_facebook_likes',
    'actor_3_facebook_likes',
    'aspect_ratio',
    'cast_total_facebook_likes',
    'color',
    'content_rating',
    'director_facebook_likes',
    'facenumber_in_poster',
    'movie_facebook_likes',
    'movie_imdb_link',
    'num_critic_for_reviews',
    'num_user_for_reviews'
                ]

In [5]:
TMDB_TO_IMDB_SIMPLE_EQUIVALENCIES = {
    'budget': 'budget',
    'genres': 'genres',
    'revenue': 'gross',
    'title': 'movie_title',
    'runtime': 'duration',
    'original_language': 'language',  # it's possible that spoken_languages would be a better match
    'keywords': 'plot_keywords',
    'vote_count': 'num_voted_users',
                                         }

IMDB_COLUMNS_TO_REMAP = {'imdb_score': 'vote_average'}

In [6]:
def safe_access(container, index_values):
    # return a missing value rather than an error upon indexing/key failure
    result = container
    try:
        for idx in index_values:
            result = result[idx]
        return result
    except IndexError or KeyError:
        return pd.np.nan

In [7]:
def get_director(crew_data):
    directors = [x['name'] for x in crew_data if x['job'] == 'Director']
    return safe_access(directors, [0])

In [8]:
def pipe_flatten_names(keywords):
    return '|'.join([x['name'] for x in keywords])

In [9]:
def convert_to_original_format(movies, credits):
    # Converts TMDb data to make it as compatible as possible with kernels built on the original version of the data.
    tmdb_movies = movies.copy()
    tmdb_movies.rename(columns=TMDB_TO_IMDB_SIMPLE_EQUIVALENCIES, inplace=True)
    tmdb_movies['title_year'] = pd.to_datetime(tmdb_movies['release_date']).apply(lambda x: x.year)
    # I'm assuming that the first production country is equivalent, but have not been able to validate this
    tmdb_movies['country'] = tmdb_movies['production_countries'].apply(lambda x: safe_access(x, [0, 'name']))
    tmdb_movies['language'] = tmdb_movies['spoken_languages'].apply(lambda x: safe_access(x, [0, 'name']))
    tmdb_movies['director_name'] = credits['crew'].apply(get_director)
    tmdb_movies['actor_1_name'] = credits['cast'].apply(lambda x: safe_access(x, [0, 'name']))
    tmdb_movies['actor_2_name'] = credits['cast'].apply(lambda x: safe_access(x, [1, 'name']))
    tmdb_movies['actor_3_name'] = credits['cast'].apply(lambda x: safe_access(x, [2, 'name']))
    tmdb_movies['genres'] = tmdb_movies['genres'].apply(pipe_flatten_names)
    tmdb_movies['plot_keywords'] = tmdb_movies['plot_keywords'].apply(pipe_flatten_names)
    return tmdb_movies

In [209]:
movies = load_tmdb_movies("Data/Original/tmdb_5000_movies.csv")
credits = load_tmdb_credits("Data/tmdb_5000_credits.csv")
original_format = convert_to_original_format(movies, credits)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 5)
original_format

In [188]:
original_format.genres.nunique()

1175

In [189]:
df = original_format.copy()
features_to_remove =['id','original_title','overview', 'movie_title', 'homepage', 'status' , 'plot_keywords', 'spoken_languages', 'gross', 'title_year', 'production_countries','production_companies','tagline', 'vote_average','num_voted_users', 'genres']
df.drop(features_to_remove, axis = 1, inplace= True)
df.columns

Index(['budget', 'language', 'popularity', 'release_date', 'duration',
       'country', 'director_name', 'actor_1_name', 'actor_2_name',
       'actor_3_name'],
      dtype='object')

In [190]:
replace_with_zero = ['duration','title_year']

missing_val_cols = list(df.columns[df.isna().any()])
for col in missing_val_cols:
  if col not in replace_with_zero:
    miss_percentage = ((df[col].isna().sum())/4803) * 100 
    print(f'{col} : {str(miss_percentage)}')
    df[col].fillna('Missing', inplace=True)
  else:
    miss_percentage = ((df[col].isna().sum())/4803) * 100 
    print(f'{col} : {str(miss_percentage)}')
    df[col].fillna(df[col].mean(), inplace=True)

language : 1.7905475744326462
release_date : 0.020820320632937747
duration : 0.041640641265875494
country : 3.622735790131168
director_name : 0.6246096189881324
actor_1_name : 0.8952737872163231
actor_2_name : 1.1034769935457007
actor_3_name : 1.311680199875078


In [191]:
# scaler = StandardScaler()

# # numerical = df.select_dtypes(include=['int64','float']).copy()
# numerical = df.columns[df.dtypes.apply(lambda c: np.issubdtype(c, np.number))]


# df[numerical] = scaler.fit_transform(df[numerical])
# df.head()

Unnamed: 0,budget,language,popularity,release_date,duration,country,director_name,actor_1_name,actor_2_name,actor_3_name
0,5.107181,English,4.053183,2009-12-10,2.438596,United States of America,James Cameron,Sam Worthington,Zoe Saldana,Sigourney Weaver
1,6.654402,English,3.696258,2007-05-19,2.748263,United States of America,Gore Verbinski,Johnny Depp,Orlando Bloom,Keira Knightley
2,5.303653,Français,2.699638,2015-10-26,1.81926,United Kingdom,Sam Mendes,Daniel Craig,Christoph Waltz,Léa Seydoux
3,5.426449,English,2.854798,2012-07-16,2.57131,United States of America,Christopher Nolan,Christian Bale,Michael Caine,Gary Oldman
4,5.672039,English,0.705198,2012-03-07,1.111448,United States of America,Andrew Stanton,Taylor Kitsch,Lynn Collins,Samantha Morton


In [192]:
df['release_date'] = pd.to_datetime(df['release_date'].astype(str),errors='coerce')
df['dayofrelease']=df['release_date'].dt.strftime('%A')
df.drop(columns=['release_date'], inplace = True)
df.head(3)

Unnamed: 0,budget,language,popularity,duration,country,director_name,actor_1_name,actor_2_name,actor_3_name,dayofrelease,monthofrelease
0,5.107181,English,4.053183,2.438596,United States of America,James Cameron,Sam Worthington,Zoe Saldana,Sigourney Weaver,Thursday,12
1,6.654402,English,3.696258,2.748263,United States of America,Gore Verbinski,Johnny Depp,Orlando Bloom,Keira Knightley,Saturday,5
2,5.303653,Français,2.699638,1.81926,United Kingdom,Sam Mendes,Daniel Craig,Christoph Waltz,Léa Seydoux,Monday,10


In [193]:
data_copy = df.copy()

nominal = ['dayofrelease', 'language']
one_hot = pd.get_dummies(data_copy[nominal])
data_copy.drop(['dayofrelease','language'], axis=1, inplace=True)
data_copy = data_copy.join(one_hot)
data_copy.head()

Unnamed: 0,budget,popularity,duration,country,director_name,actor_1_name,actor_2_name,actor_3_name,monthofrelease,dayofrelease_Friday,dayofrelease_Monday,dayofrelease_Saturday,dayofrelease_Sunday,dayofrelease_Thursday,dayofrelease_Tuesday,dayofrelease_Wednesday,language_,language_??????,language_Afrikaans,language_Bahasa indonesia,language_Bosanski,language_Dansk,language_Deutsch,language_Eesti,language_English,language_Español,language_Esperanto,language_Français,language_Gaeilge,language_Galego,language_Hrvatski,language_Italiano,language_Kiswahili,language_Latin,language_Magyar,language_Missing,language_Nederlands,language_No Language,language_Norsk,language_Polski,language_Português,language_Pусский,language_Română,language_Srpski,language_Tiếng Việt,language_shqip,language_svenska,language_Íslenska,language_Český,language_ελληνικά,language_Український,language_български език,language_עִבְרִית,language_اردو,language_العربية,language_فارسی,language_हिन्दी,language_বাংলা,language_தமிழ்,language_ภาษาไทย,language_广州话 / 廣州話,language_日本語,language_普通话,language_한국어/조선말
0,5.107181,4.053183,2.438596,United States of America,James Cameron,Sam Worthington,Zoe Saldana,Sigourney Weaver,12,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,6.654402,3.696258,2.748263,United States of America,Gore Verbinski,Johnny Depp,Orlando Bloom,Keira Knightley,5,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,5.303653,2.699638,1.81926,United Kingdom,Sam Mendes,Daniel Craig,Christoph Waltz,Léa Seydoux,10,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,5.426449,2.854798,2.57131,United States of America,Christopher Nolan,Christian Bale,Michael Caine,Gary Oldman,7,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5.672039,0.705198,1.111448,United States of America,Andrew Stanton,Taylor Kitsch,Lynn Collins,Samantha Morton,3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [207]:
from sklearn.preprocessing import LabelEncoder

ordinal = ['actor_1_name','actor_2_name','actor_3_name','director_name','country']
for feature in ordinal:
    data_copy[feature] = data_copy[feature].astype('category')
    data_copy[feature] = data_copy[feature].cat.codes
# pd.set_option('display.max_rows', None)

In [208]:
data_copy.drop(columns=['popularity'], inplace= True)
new_data = pd.concat([data_copy, df['popularity']], axis =1)

KeyError: "['popularity'] not found in axis"

In [196]:
knn_data = data_copy.copy()
knn_data.dtypes

budget              float64
duration            float64
                     ...   
language_普通话          uint8
language_한국어/조선말      uint8
Length: 63, dtype: object

In [197]:
knn_data['Success'] = np.where(df['popularity'] >= df['popularity'].mean(), 1,0)
# knn_data.drop(columns =['standardised_popularity'], inplace= True)
knn_data

Unnamed: 0,budget,duration,country,director_name,actor_1_name,actor_2_name,actor_3_name,monthofrelease,dayofrelease_Friday,dayofrelease_Monday,dayofrelease_Saturday,dayofrelease_Sunday,dayofrelease_Thursday,dayofrelease_Tuesday,dayofrelease_Wednesday,language_,language_??????,language_Afrikaans,language_Bahasa indonesia,language_Bosanski,language_Dansk,language_Deutsch,language_Eesti,language_English,language_Español,language_Esperanto,language_Français,language_Gaeilge,language_Galego,language_Hrvatski,language_Italiano,language_Kiswahili,language_Latin,language_Magyar,language_Missing,language_Nederlands,language_No Language,language_Norsk,language_Polski,language_Português,language_Pусский,language_Română,language_Srpski,language_Tiếng Việt,language_shqip,language_svenska,language_Íslenska,language_Český,language_ελληνικά,language_Український,language_български език,language_עִבְרִית,language_اردو,language_العربية,language_فارسی,language_हिन्दी,language_বাংলা,language_தமிழ்,language_ภาษาไทย,language_广州话 / 廣州話,language_日本語,language_普通话,language_한국어/조선말,Success
0,5.107181,2.438596,70,885,1760,2715,2725,12,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,6.654402,2.748263,70,769,991,2017,1674,05,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4801,-0.713319,-0.392652,70,439,441,743,301,05,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4802,-0.713319,-0.746559,70,249,529,294,585,08,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [198]:
knn_X = knn_data.iloc[:,:-1].values
knn_y = knn_data.iloc[:,64].values
knn_X

IndexError: single positional indexer is out-of-bounds

In [199]:
pca = PCA(n_components = 2)
knn_X = pca.fit_transform(knn_X)
knn_X

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [200]:
knnX_train, knnX_test, knny_train, knny_test = train_test_split(knn_X, knn_y, test_size=0.30)

In [201]:
classifier = KNeighborsClassifier(n_neighbors=60)
classifier.fit(knnX_train,knny_train)

KNeighborsClassifier(n_neighbors=60)

In [202]:
y_pred = classifier.predict(knnX_test)
classifier.score(knnX_test, knny_test) * 100

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [203]:
# plt.figure(figsize = (14,6))

# plot_decision_regions(knnX_train, knny_train, clf = classifier, legend = 2)

In [204]:
error = []

for i in range(39,81):
  knn = KNeighborsClassifier(n_neighbors=i)
  knn.fit(knnX_train,knny_train)
  pred_i = knn.predict(knnX_test)
  error.append(np.mean(pred_i != knny_test))

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [205]:
plt.figure(figsize = (12,6))
plt.plot(range(39,81), error, marker='o')
plt.title('Error Rate K Value')
plt.xlabel('K Value')
plt.ylabel('Mean Error')

ValueError: x and y must have same first dimension, but have shapes (42,) and (0,)

In [206]:
in_data = np.array([[ 0.3, 10], [1, 3.3], [2.1, 0.5]])