In [84]:
import pandas as pd 
import numpy as np
import json

In [308]:

def getDatabase():
    credits = pd.read_csv('data/tmdb_5000_credits.csv')
    movies = pd.read_csv('data/tmdb_5000_movies.csv')

    credits.columns = ['id','tittle','cast','crew']
    movies = movies.merge(credits,on='id')
    return movies

def cleanDatabase(movies):
    movies_clean = movies[movies["vote_count"] > 100]
    movies_clean["genres"] = movies_clean.apply(lambda row: getIds(row["genres"]), axis = 1)
    movies_clean["keywords"] = movies_clean.apply(lambda row: getIds(row["keywords"]), axis = 1)
    movies_clean["cast"] = movies_clean.apply(lambda row: getIds(row["cast"]), axis = 1)
    movies_clean["crew"] = movies_clean.apply(lambda row: getIds(row["crew"]), axis = 1)

    useful_headers = ['id',
    'title',
    'original_language',
    'budget',
    'popularity',
    'runtime',
    'release_date',
    'genres',
    'keywords',
    'cast',
    'crew',
    'vote_count',
    'vote_average']
    return movies_clean[useful_headers]


def getIds(listOfDict):
    listOfDict = json.loads(listOfDict)
    output = list()
    for dictionary in listOfDict:
        name = dictionary["name"]
        nameShort = name.lower()
        nameShort = nameShort.replace(" ","")
        if nameShort in output:
            continue
        output.append(nameShort)
    return output




In [309]:
movies = getDatabase()


In [323]:
df = cleanDatabase(movies)

In [311]:
def getAllElementsFrom(columnName):
    output = []
    for listOfElements in df[columnName]:
        for element in listOfElements:
            if element in output:
                continue
            output.append(element)
    return output


genres = getAllElementsFrom("genres")
keys = getAllElementsFrom("keywords")
crews = getAllElementsFrom("crew")
casts = getAllElementsFrom("cast")

df = df[ [ col for col in df.columns if col != 'vote_average' ] + ['vote_average']]

In [312]:
print("Genres: {}".format(len(genres)))
print("keys: {}".format(len(keys)))
print("crews: {}".format(len(crews)))
print("casts: {}".format(len(casts)))

Genres: 19
keys: 8738
crews: 45203
casts: 45072


In [324]:
for genre in genres:
    newColName = "has_{}".format(genre)
    df[newColName] = df.apply(lambda row: 1 if genre in row["genres"] else 0, axis = 1)

for cast in casts:
    newColName = "has_{}".format(cast)
    df[newColName] = df.apply(lambda row: 1 if cast in row["cast"] else 0, axis = 1)

df = df[ [ col for col in df.columns if col != 'vote_average' ] + ['vote_average']]


KeyError: 'casts'

In [319]:
not_usefull_Columns = ['title',
    'release_date',
    'original_language',
    'runtime',
    'genres',
    'keywords',
    'cast',
    'crew']

df_test = df
for columnName in not_usefull_Columns:
    df_test.pop(columnName)

lastColumnIndex = len(list(df_test)) - 1

X = df_test.iloc[:, 1:lastColumnIndex].values
y = df_test.iloc[:, lastColumnIndex ].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)




In [320]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train.reshape(-1,1))
y_test = sc_y.fit_transform(y_test.reshape(-1,1))


In [303]:
# Fitting DecisionTreeRegressor to the dataset

from sklearn.tree import DecisionTreeRegressor
print("Decision Tree Regression")

regressor = DecisionTreeRegressor(random_state = 0)
regressor.fit(X_train, y_train)
getFittingSummary(regressor)


Decision Tree Regression
Mean Squared Error: 2.03 degrees.
R2: 0.099


In [321]:
# Fitting Random Forest Regression to the dataset
from sklearn.ensemble import RandomForestRegressor
print("Random Forest Regression")
regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)
regressor.fit(X_train, y_train)
getFittingSummary(regressor)

Random Forest Regression
Mean Squared Error: 1.53 degrees.
R2: 0.46


In [322]:
# Fitting SVR to the dataset
from sklearn.svm import SVR
print("Support Vector Regression")
regressor = SVR(kernel = 'rbf')
regressor.fit(X_train, y_train)
getFittingSummary(regressor)

Support Vector Regression
Mean Squared Error: 1.5 degrees.
R2: 0.459


In [302]:
def getFittingSummary(regressor):
    predictions = regressor.predict(X_test)
    errors = abs(predictions - y_test)
    errorsSquared = (predictions - y_test) ** 2
    
    print('Mean Squared Error:', round(np.mean(errorsSquared), 2), 'degrees.')

    mape = 100 * (errors / y_test)
    accuracy = 100 - np.mean(mape)

    score = regressor.score(X_test, y_test)
    print('R2:', round(score, 3))


In [307]:
list(df)

['id',
 'title',
 'original_language',
 'budget',
 'popularity',
 'runtime',
 'release_date',
 'genres',
 'keywords',
 'cast',
 'crew',
 'vote_count',
 'has_action',
 'has_adventure',
 'has_fantasy',
 'has_sciencefiction',
 'has_crime',
 'has_drama',
 'has_thriller',
 'has_animation',
 'has_family',
 'has_western',
 'has_comedy',
 'has_romance',
 'has_horror',
 'has_mystery',
 'has_history',
 'has_war',
 'has_music',
 'has_documentary',
 'has_tvmovie',
 'vote_average']

In [None]:
#Ideas
"""
Cast, crew, keywords. Recucir a los mas usados. Algo asi como los q aparezcan en mas de 10 peliculas. Ver cuanto queda. 

Añadir recien ahi la cantidad de columnas razonables a la matriz. Ojala no mas de 1000 por cada uno. 500 actores, idk.
"""

