In [2]:
import pandas as pd 
import numpy as np
import json

In [5]:
# Get data from dataset

def getDatabase():
    credits = pd.read_csv('data/tmdb_5000_credits.csv')
    movies = pd.read_csv('data/tmdb_5000_movies.csv')

    credits.columns = ['id','tittle','cast','crew']
    movies = movies.merge(credits,on='id')
    return movies

movies = getDatabase()
movies[:3]

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,tittle,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."


In [6]:
# Clean data. Remove some useless columns and get a list of genres, crew members, cast and keywords.

def cleanDatabase(movies):
    movies_clean = movies[movies["vote_count"] > 300]
    movies_clean["genres"] = movies_clean.apply(lambda row: getValues(row["genres"]), axis = 1)
    movies_clean["keywords"] = movies_clean.apply(lambda row: getValues(row["keywords"]), axis = 1)
    movies_clean["cast"] = movies_clean.apply(lambda row: getValues(row["cast"]), axis = 1)
    movies_clean["crew"] = movies_clean.apply(lambda row: getValues(row["crew"]), axis = 1)

    useful_headers = ['id',
    'title',
    'original_language',
    'budget',
    'popularity',
    'runtime',
    'release_date',
    'genres',
    'keywords',
    'cast',
    'crew',
    'vote_count',
    'vote_average']
    return movies_clean[useful_headers]


def getValues(listOfDict):
    listOfDict = json.loads(listOfDict)
    output = list()
    for dictionary in listOfDict:
        name = dictionary["name"]
        nameShort = name.lower()
        nameShort = nameShort.replace(" ","")
        if nameShort in output:
            continue
        output.append(nameShort)
    return output

df = cleanDatabase(movies)
df[:3]

Unnamed: 0,id,title,original_language,budget,popularity,runtime,release_date,genres,keywords,cast,crew,vote_count,vote_average
0,19995,Avatar,en,237000000,150.437577,162.0,2009-12-10,"[action, adventure, fantasy, sciencefiction]","[cultureclash, future, spacewar, spacecolony, ...","[samworthington, zoesaldana, sigourneyweaver, ...","[stephene.rivkin, rickcarter, christopherboyes...",11800,7.2
1,285,Pirates of the Caribbean: At World's End,en,300000000,139.082615,169.0,2007-05-19,"[adventure, fantasy, action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[johnnydepp, orlandobloom, keiraknightley, ste...","[dariuszwolski, goreverbinski, jerrybruckheime...",4500,6.9
2,206647,Spectre,en,245000000,107.376788,148.0,2015-10-26,"[action, adventure, crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[danielcraig, christophwaltz, léaseydoux, ralp...","[thomasnewman, sammendes, annapinnock, johnlog...",4466,6.3


In [62]:
# Get unique genres, cast, crew and keywords. We have a dictionary with the number of times that a specific genre/cast/etc has been used in a movie. 

def getAllElementsFrom(columnName):
    output = dict()
    for listOfElements in df[columnName]:
        for element in listOfElements:
            if element in output.keys():
                output[element] += 1
            else:
                output[element] = 1
    return output

genres = getAllElementsFrom("genres")
keys = getAllElementsFrom("keywords")
crews = getAllElementsFrom("crew")
casts = getAllElementsFrom("cast")

print("Genres: {}".format(len(genres)))
print("keywords: {}".format(len(keys)))
print("crews: {}".format(len(crews)))
print("casts: {}".format(len(casts)))

Genres: 19
keys: 7350
crews: 38208
casts: 36837


In [63]:
# That's a lot of people. Let's filter the relevant information with a bare minimun apparences on the dataset. 

def getShortList(dictionaryCount):
    shortList = list()
    min_apparences = 20
    for key, count in dictionaryCount.items():
        if count < min_apparences:
            continue
        shortList.append(key)
    return shortList

short_genres = getShortList(genres)
short_keys = getShortList(keys)
short_crews = getShortList(crews)
short_casts = getShortList(casts)

print("Genres: {}".format(len(short_genres)))
print("keywords: {}".format(len(short_keys)))
print("crews: {}".format(len(short_crews)))
print("casts: {}".format(len(short_casts)))



Genres: 17
keys: 132
crews: 138
casts: 87


In [64]:
# That's better. Now, let's add each genre/cast/crew/keyword to the dataset as a binary column.

def addBinaryColumnsToDataframe(listOfKeywords, columnName):
    for keyword in listOfKeywords:
        newColName = "has_{}".format(keyword)
        df[newColName] = df.apply(lambda row: 1 if keyword in row[columnName] else 0, axis = 1)

addBinaryColumnsToDataframe(short_genres, "genres")
addBinaryColumnsToDataframe(short_keys, "keywords")
addBinaryColumnsToDataframe(short_crews, "crew")
addBinaryColumnsToDataframe(short_casts, "cast")


In [65]:
# Preparing the dataset for being used. First let's get rif off some useless columns and let the vote_average column at the end

df = df[ [ col for col in df.columns if col != 'vote_average' ] + ['vote_average']]
df_test = df

not_usefull_Columns = ['title',
    'release_date',
    'original_language',
    'runtime',
    'genres',
    'keywords',
    'cast',
    'crew']

for columnName in not_usefull_Columns:
    df_test.pop(columnName)

lastColumnIndex = len(list(df_test)) - 1

X = df_test.iloc[:, 1:lastColumnIndex].values
y = df_test.iloc[:, lastColumnIndex ].values

# Split the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)




In [66]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train.reshape(-1,1))
y_test = sc_y.fit_transform(y_test.reshape(-1,1))


In [67]:
# Simple prediction summary
def getFittingSummary(regressor):
    predictions = regressor.predict(X_test)
    errorsSquared = (predictions - y_test) ** 2
    
    print('Mean Squared Error:', round(np.mean(errorsSquared), 2), 'degrees.')

    score = regressor.score(X_test, y_test)
    print('R2:', round(score, 3))

In [69]:
# Fitting DecisionTreeRegressor to the dataset
from sklearn.tree import DecisionTreeRegressor
print("Decision Tree Regression")

regressor = DecisionTreeRegressor(random_state = 0)
regressor.fit(X_train, y_train)
getFittingSummary(regressor)


Decision Tree Regression
Mean Squared Error: 2.05 degrees.
R2: 0.133


In [68]:
# Fitting Random Forest Regression to the dataset
from sklearn.ensemble import RandomForestRegressor
print("Random Forest Regression")
regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)
regressor.fit(X_train, y_train)
getFittingSummary(regressor)

Random Forest Regression
Mean Squared Error: 1.49 degrees.
R2: 0.555


In [70]:
# Fitting SVR to the dataset
from sklearn.svm import SVR
print("Support Vector Regression")
regressor = SVR(kernel = 'rbf')
regressor.fit(X_train, y_train)
getFittingSummary(regressor)

Support Vector Regression
Mean Squared Error: 1.31 degrees.
R2: 0.412


In [7]:
"""
In summary, we have made some regression models to predict the socre of a movie given de IMDB information, such as crew memenber, cast, genres, budget, among others. 

The best model of the three tested is the Random Forest Regression.

"""
print("Thanks for reading")

Thanks for reading
