In [9]:
import pandas as pd 
import numpy as np
import json

First, we will load the dataset using pandas.

In [200]:
def getDatabase():
    """Read the TMDB credits and movies CSV files and join them on ``id``.

    Returns:
        The movies table merged with the (relabelled) credits table.
    """
    credits_table = pd.read_csv('data/tmdb_5000_credits.csv')
    movies_table = pd.read_csv('data/tmdb_5000_movies.csv')

    # Relabel the credits columns positionally so the join key is named 'id'.
    # NOTE(review): 'tittle' (sic) presumably keeps the credits title from
    # clashing with the movies' own 'title' column during the merge -- confirm
    # before correcting the spelling.
    credits_table.columns = ['id','tittle','cast','crew']
    return movies_table.merge(credits_table, on='id')

# Load the merged movies/credits table; the cleaning step below reads `movies`.
movies = getDatabase()


Clean data. Remove some useless columns and get a clean list of genres, crew members, cast and keywords.


In [201]:
def cleanDatabase(movies, min_votes=300):
    """Keep well-voted movies and flatten their JSON list columns.

    Args:
        movies: Merged movies/credits frame. It must contain ``vote_count``,
            the JSON-encoded ``genres``/``keywords``/``cast``/``crew`` columns
            and every column named in ``useful_headers`` below.
        min_votes: Rows are kept only when ``vote_count`` is strictly greater
            than this value. Defaults to 300, the previously hard-coded cutoff.

    Returns:
        A new DataFrame (the input frame is not modified) restricted to the
        useful columns, where the four list columns hold the normalised name
        lists produced by ``getValues``.
    """
    json_columns = ["genres", "keywords", "cast", "crew"]
    useful_headers = ['id',
    'title',
    'original_language',
    'budget',
    'popularity',
    'runtime',
    'release_date',
    'genres',
    'keywords',
    'cast',
    'crew',
    'vote_count',
    'vote_average']

    # Take an explicit copy: assigning columns on a boolean-mask slice is what
    # triggers pandas' SettingWithCopyWarning.
    movies_clean = movies.loc[movies["vote_count"] > min_votes].copy()
    for column in json_columns:
        # Each cell is parsed on its own, so a per-column apply is enough; no
        # need to materialise every row as a Series with axis=1.
        movies_clean[column] = movies_clean[column].apply(getValues)

    # Copy once more so callers can add columns to the result without warnings.
    return movies_clean[useful_headers].copy()


def getValues(listOfDict):
    """Return the distinct, normalised ``name`` values of a JSON-encoded list.

    Args:
        listOfDict: JSON text encoding a list of objects, each with a
            ``"name"`` key.

    Returns:
        The names lower-cased with spaces removed, without repeats, in the
        order they first appear.
    """
    entries = json.loads(listOfDict)
    normalised = (entry["name"].lower().replace(" ","") for entry in entries)
    # dict.fromkeys drops repeats while keeping first-seen order.
    return list(dict.fromkeys(normalised))

# Build the cleaned frame that the following cells read as `df`, then preview it.
df = cleanDatabase(movies)
df.head(3)

Unnamed: 0,id,title,original_language,budget,popularity,runtime,release_date,genres,keywords,cast,crew,vote_count,vote_average
0,19995,Avatar,en,237000000,150.437577,162.0,2009-12-10,"[action, adventure, fantasy, sciencefiction]","[cultureclash, future, spacewar, spacecolony, ...","[samworthington, zoesaldana, sigourneyweaver, ...","[stephene.rivkin, rickcarter, christopherboyes...",11800,7.2
1,285,Pirates of the Caribbean: At World's End,en,300000000,139.082615,169.0,2007-05-19,"[adventure, fantasy, action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[johnnydepp, orlandobloom, keiraknightley, ste...","[dariuszwolski, goreverbinski, jerrybruckheime...",4500,6.9
2,206647,Spectre,en,245000000,107.376788,148.0,2015-10-26,"[action, adventure, crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[danielcraig, christophwaltz, léaseydoux, ralp...","[thomasnewman, sammendes, annapinnock, johnlog...",4466,6.3


Get unique genres, cast, crew and keywords. We have a dictionary with the number of times that a specific genre/cast/etc has been used in a movie. 


In [12]:
def getAllElementsFrom(columnName, dataframe=None):
    """Count how often each element occurs in a list-valued column.

    Args:
        columnName: Name of a column whose cells are iterables of elements.
        dataframe: Table to read the column from. Defaults to the notebook's
            module-level ``df`` so existing one-argument calls keep working.

    Returns:
        A dict mapping each element to its number of occurrences across all
        cells, in first-seen order.
    """
    if dataframe is None:
        # Backward-compatible fallback to the notebook's global frame.
        dataframe = df

    output = dict()
    for listOfElements in dataframe[columnName]:
        for element in listOfElements:
            output[element] = output.get(element, 0) + 1
    return output

# Tally the distinct values of each list-valued column (same order as before:
# genres, keywords, crew, cast).
genres, keys, crews, casts = (
    getAllElementsFrom(column) for column in ("genres", "keywords", "crew", "cast")
)

# Report how many distinct values each column holds.
for template, counts in (
    ("Genres: {}", genres),
    ("keywords: {}", keys),
    ("crews: {}", crews),
    ("casts: {}", casts),
):
    print(template.format(len(counts)))

Genres: 19
keywords: 7350
crews: 38208
casts: 36837


That's a lot of people. I don't want to train a model with that many columns. Let's keep only the values that appear at least a minimum number of times in the dataset.


In [13]:
def getShortList(dictionaryCount, min_apparences=20):
    """Return the keys whose count reaches a minimum number of appearances.

    Args:
        dictionaryCount: Mapping from an element to how many times it appears.
        min_apparences: Smallest count (inclusive) for a key to be kept.
            Defaults to 20, the previously hard-coded threshold.

    Returns:
        A list of the qualifying keys, in the mapping's iteration order.
    """
    return [key for key, count in dictionaryCount.items() if count >= min_apparences]

# Keep only the values that appear often enough in each column.
short_genres, short_keys, short_crews, short_casts = (
    getShortList(counter) for counter in (genres, keys, crews, casts)
)

# Report how many values survive the filter for each column.
for template, shortList in (
    ("Genres: {}", short_genres),
    ("keywords: {}", short_keys),
    ("crews: {}", short_crews),
    ("casts: {}", short_casts),
):
    print(template.format(len(shortList)))



Genres: 17
keywords: 132
crews: 138
casts: 87


That's better. Now, let's add each relevant genre/cast/crew/keyword to the dataset as a binary column.


In [64]:
def addBinaryColumnsToDataframe(listOfKeywords, columnName, dataframe=None):
    """Add one boolean ``has_<keyword>`` column per keyword, in place.

    Args:
        listOfKeywords: Values to turn into indicator columns.
        columnName: Name of the column whose cells are collections of values.
        dataframe: Table to read from and add the new columns to. Defaults to
            the notebook's module-level ``df`` so existing two-argument calls
            keep working.

    Returns:
        None; ``dataframe`` is modified in place.
    """
    if dataframe is None:
        # Backward-compatible fallback to the notebook's global frame.
        dataframe = df

    # Read the source column once; the previous row-wise DataFrame.apply
    # rebuilt every row for every keyword just to look at this one column.
    cellValues = list(dataframe[columnName])
    for keyword in listOfKeywords:
        # NOTE: the new name depends only on the keyword, so a value that is
        # passed for two different source columns ends up in a single
        # has_<value> column and the later call overwrites the earlier one.
        newColName = "has_{}".format(keyword)
        dataframe[newColName] = [keyword in values for values in cellValues]

# Flag every frequent genre/keyword/crew/cast value with its own has_<value> column.
for frequentValues, sourceColumn in (
    (short_genres, "genres"),
    (short_keys, "keywords"),
    (short_crews, "crew"),
    (short_casts, "cast"),
):
    addBinaryColumnsToDataframe(frequentValues, sourceColumn)


Prepare the dataset for modelling. First, let's get rid of some useless columns and select the vote_average column as the target.


In [173]:
import copy  # no longer needed by this cell; kept in case a cell outside this view uses it

# train_test_split was called without ever being imported, which raises a
# NameError on a fresh kernel.
from sklearn.model_selection import train_test_split

target_name = "vote_average"
not_usefull_Columns = [
    'id',
    'release_date',
    'genres',
    'keywords',
    'cast',
    'crew']

# DataFrame.drop returns a new frame, so `df` is left untouched and this cell
# can be re-run safely.
df_test = df.drop(columns=not_usefull_Columns)

# Features are every remaining column except the target.
# NOTE(review): `title` is still part of `data`; as a string column it is
# picked up by the object-dtype selector in the preprocessing cell and one-hot
# encoded as a feature. Consider excluding it from `data` (the bonus cell only
# needs it in `df_test`).
data = df_test.drop(columns=[target_name])
target = df_test[target_name]

# Fixed random_state keeps the split reproducible across runs.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, random_state=42)



Feature scaling with encoders. We have two different preprocessors: one for categorical variables and another for numerical ones. Then, we will combine them into a single preprocessor to use in a pipeline.


In [174]:
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# One selector per dtype family; each returns the matching column names of a frame.
float_columns_selector = selector(dtype_include="float")
int_columns_selector = selector(dtype_include="int")
str_columns_selector = selector(dtype_include="object")
bool_columns_selector = selector(dtype_include="bool")

# Float and integer columns get scaled; string and boolean columns get one-hot encoded.
numerical_columns = float_columns_selector(data) + int_columns_selector(data)
categorical_columns = str_columns_selector(data) + bool_columns_selector(data)

numerical_preprocessor = StandardScaler()
# Categories unseen during fit are ignored at transform time instead of raising.
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ('one-hot-encoder', categorical_preprocessor, categorical_columns),
        ('standard_scaler', numerical_preprocessor, numerical_columns),
    ]
)


In [175]:
def getFittingSummary(regressor, features=None, expected=None):
    """Print the mean squared error and R2 score of a fitted regressor.

    Args:
        regressor: Fitted estimator exposing ``predict(X)`` and ``score(X, y)``.
        features: Rows to evaluate on. Defaults to the notebook's module-level
            ``data_test`` so existing one-argument calls keep working.
        expected: True target values matching ``features``. Defaults to the
            notebook's module-level ``target_test``.

    Returns:
        None; both metrics are printed.
    """
    if features is None:
        features = data_test
    if expected is None:
        expected = target_test

    predictions = np.asarray(regressor.predict(features))
    errorsSquared = (predictions - np.asarray(expected)) ** 2

    # The error is expressed in squared target units; the former 'degrees.'
    # suffix did not describe this quantity and has been removed.
    print('Mean Squared Error:', round(np.mean(errorsSquared), 2))

    score = regressor.score(features, expected)
    print('R2:', round(score, 3))

In [176]:
# Baseline: a single decision tree placed behind the shared preprocessing step.
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor

print("Decision Tree Regression")
regressor = DecisionTreeRegressor(random_state=0)
# fit() returns the fitted pipeline itself, so build and train in one expression.
model = make_pipeline(preprocessor, regressor).fit(data_train, target_train)
getFittingSummary(model)

Decision Tree Regression
Mean Squared Error: 0.57 degrees.
R2: 0.034


In [177]:
# Fit a random forest (100 trees, fixed seed) behind the shared preprocessing
# step and print its test-set summary.
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline

print("Random Forest Regression")
regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)
# The original line ended in a stray ')' and did not parse.
model = make_pipeline(preprocessor, regressor)
model.fit(data_train, target_train)
getFittingSummary(model)

Random Forest Regression
Mean Squared Error: 0.27 degrees.
R2: 0.536


In [178]:
# Support vector regression with an RBF kernel behind the shared preprocessing step.
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR

print("Support Vector Regression")
regressor = SVR(kernel='rbf')
# fit() returns the fitted pipeline itself, so build and train in one expression.
model = make_pipeline(preprocessor, regressor).fit(data_train, target_train)
getFittingSummary(model)

Support Vector Regression
Mean Squared Error: 0.26 degrees.
R2: 0.566


In summary, we have built some regression models to predict the score of a movie given its TMDB information, such as crew members, cast, genres and budget, among others.

The best model of the three tested is the SVR.




Bonus track: Let's see the predictions for a real set of movie ratings on TMDB. Some predictions are slightly inaccurate, but most are quite close to the real rating. We predicted that "Superman Returns" would be a disaster and that "The Avengers" would be a moderate success.

I find these results quite good because movies that I love, like "The Dark Knight Rises", are predicted with less than 3% error on their rating.

In [211]:
# Compare predicted and stored ratings for the first 25 rows of df_test.
# NOTE(review): these rows are taken from the full frame that was later split
# into train and test sets, so they include rows the model may have been
# trained on; this is not a held-out evaluation.
dataToShow = df_test[:25]
predictions = model.predict(dataToShow)
real = list(dataToShow[target_name])
titles = list(dataToShow['title'])

for name, observed, predicted in zip(titles, real, predictions):
    # Absolute error expressed as a percentage of the observed rating.
    error = 100 * abs(observed - predicted) / observed
    print("Prediction {}   Real   {}    Error {}% \t| {}".format(
        round(predicted, 1), observed, round(error), name))


Prediction 7.7   Real   7.2    Error 7% 	| Avatar
Prediction 6.7   Real   6.9    Error 3% 	| Pirates of the Caribbean: At World's End
Prediction 6.4   Real   6.3    Error 2% 	| Spectre
Prediction 7.4   Real   7.6    Error 3% 	| The Dark Knight Rises
Prediction 6.0   Real   6.1    Error 2% 	| John Carter
Prediction 6.0   Real   5.9    Error 2% 	| Spider-Man 3
Prediction 6.4   Real   7.4    Error 14% 	| Tangled
Prediction 6.6   Real   7.3    Error 10% 	| Avengers: Age of Ultron
Prediction 6.9   Real   7.4    Error 7% 	| Harry Potter and the Half-Blood Prince
Prediction 6.8   Real   5.7    Error 20% 	| Batman v Superman: Dawn of Justice
Prediction 5.5   Real   5.4    Error 1% 	| Superman Returns
Prediction 6.1   Real   6.1    Error 0% 	| Quantum of Solace
Prediction 7.1   Real   7.0    Error 1% 	| Pirates of the Caribbean: Dead Man's Chest
Prediction 6.1   Real   5.9    Error 3% 	| The Lone Ranger
Prediction 6.7   Real   6.5    Error 3% 	| Man of Steel
Prediction 6.2   Real   6.3    Error