In [2]:
import pandas as pd
import numpy as np
import json
import warnings
warnings.filterwarnings('ignore')
import base64
import io
from functools import reduce

In [3]:
# Read the raw TMDB movie and credit tables.
movies = pd.read_csv('../dataset/tmdb_5000_movies.csv')
credits = pd.read_csv('../dataset/tmdb_5000_credits.csv')

# Each of these columns holds a JSON array of objects. Keep only every
# object's 'name' and store the result as the string form of a Python list
# (e.g. "['Action', 'Adventure']").
fields = ['genres', 'keywords', 'production_companies', 'production_countries']
for field in fields:
    # One pass per column instead of a per-row `.loc` assignment, which is
    # very slow and writes to the column while it is being iterated.
    movies[field] = movies[field].apply(
        lambda raw: str([entry['name'] for entry in json.loads(raw)])
    )
        
        
# Clean the credits: reduce the JSON cast array to the string form of a list
# of cast member names, in the order they appear in the JSON. A single
# `apply` replaces the per-row `.loc` assignment loop (same result, one pass).
credits['cast'] = credits['cast'].apply(
    lambda raw: str([member['name'] for member in json.loads(raw)])
)

def director(x):
    """Return the name of the first crew member whose job is 'Director'.

    x is a list of crew dicts, each providing 'job' and 'name' keys.
    Returns None when no entry has the 'Director' job.
    """
    for member in x:
        if member['job'] == 'Director':
            return member['name']
    return None


# Parse the JSON crew array, reduce it to the director's name, and expose the
# result as a 'director' column in place of 'crew'. The function is defined
# once at cell level rather than inside a loop over a one-element list.
credits['crew'] = credits['crew'].apply(json.loads).apply(director)
credits = credits.rename(columns={'crew': 'director'})

In [4]:
# Combine the two tables: match movies.id against credits.movie_id.
# The left join keeps every row of `movies`.
df = movies.merge(
    credits,
    how='left',
    left_on='id',
    right_on='movie_id',
)

In [5]:
# Feature engineering: keep only the columns used by the rest of the notebook.
# `.copy()` makes `df` an independent frame, so the column assignments in the
# following cells do not write into a slice of the merged frame (which
# triggers pandas' SettingWithCopyWarning).
features = ['id', 'original_title', 'genres', 'cast', 'vote_average', 'director', 'keywords', 'popularity', 'vote_count']
df = df[features].copy()

In [6]:
# Format genres: turn the stringified list into a real list of genre tokens.
genre_text = df['genres'].str.strip('[]')

# Replacements are applied in this exact order:
#   1. remove spaces and single quotes;
#   2. delete every 'u', shielding 'Adventure' behind the temporary token
#      'Adv' so that it survives unchanged.
# Deleting 'u' also shortens other names (e.g. 'Documentary' becomes
# 'Docmentary'); the model feature lists use that shortened spelling.
for old, new in [(' ', ''), ("'", ''), ('Adventure', 'Adv'), ('u', ''), ('Adv', 'Adventure')]:
    genre_text = genre_text.str.replace(old, new)

df['genres'] = genre_text.str.split(',')

In [7]:
# Create an ordered list of all possible genres so we can use a bit string to
# represent them: a token's position in genreList is its slot in the bit list.
#
# A movie whose genre string was empty yields the token '' after splitting.
# The previous code removed it by dropping the first sorted entry, which
# would silently drop a real genre if no empty token were present; remove
# the empty token by value instead.
genreList = sorted({genre for genres in df['genres'] for genre in genres if genre != ''})

In [8]:
# Create the bitstring of genres
def genre_bitlist(genres, vocabulary=None):
    """Encode one movie's genres as a 0/1 list aligned with a genre vocabulary.

    Parameters
    ----------
    genres : collection of str
        Genre tokens of a single movie.
    vocabulary : sequence of str, optional
        Ordered genre tokens to encode against. Defaults to the notebook-level
        ``genreList``, looked up when the function is called.

    Returns
    -------
    list of int
        One entry per vocabulary token: 1 if the token is in ``genres``,
        otherwise 0.
    """
    if vocabulary is None:
        vocabulary = genreList
    return [1 if genre in genres else 0 for genre in vocabulary]

            
# One bit list per movie, aligned with genreList.
df['genres_bs'] = df['genres'].apply(genre_bitlist)

In [9]:
# Format the cast: turn the stringified list into a real list of name tokens.
cast_text = df['cast'].str.strip('[]')

# Remove spaces and both kinds of quote characters, in this order, so each
# name collapses into a single token (e.g. 'Brad Pitt' -> BradPitt).
for unwanted in (' ', "'", '"'):
    cast_text = cast_text.str.replace(unwanted, '')

df['cast'] = cast_text.str.split(',')

In [10]:
# Create a list of the most frequent actors
import operator

TOP_N_ACTORS = 21          # number of most frequent cast tokens to start from
ACTOR_NOISE = {'', 'r.'}   # tokens removed from the result (see below)

# Count how often each cast token occurs across all movies.
actors = {}
for cast in df['cast']:
    for actor in cast:
        actors[actor] = actors.get(actor, 0) + 1

# Ascending by count; the sort is stable, so ties keep their original order.
sorted_actors = sorted(actors.items(), key=operator.itemgetter(1))

# Take the most frequent tokens from the end of the ascending list. Slicing
# (instead of indexing 21 times) also works when fewer than TOP_N_ACTORS
# distinct tokens exist.
top_actors = [name for name, _ in sorted_actors[-TOP_N_ACTORS:]]

# Sort alphabetically and drop the noise tokens by value:
#   'r.' is the stray token the previous code removed by name;
#   ''   is presumably what the previous code meant to remove when it dropped
#        the first sorted entry (a movie with an empty cast yields ''), but
#        that would drop a real actor whenever '' was not among the top tokens.
actorList = sorted(name for name in top_actors if name not in ACTOR_NOISE)

In [11]:
# Create the bitstring of actors
def actor_bitlist(actors, vocabulary=None):
    """Encode one movie's cast as a 0/1 list aligned with an actor vocabulary.

    Parameters
    ----------
    actors : collection of str
        Cast tokens of a single movie.
    vocabulary : sequence of str, optional
        Ordered actor tokens to encode against. Defaults to the notebook-level
        ``actorList``, looked up when the function is called.

    Returns
    -------
    list of int
        One entry per vocabulary token: 1 if the token is in ``actors``,
        otherwise 0.
    """
    if vocabulary is None:
        vocabulary = actorList
    return [1 if actor in actors else 0 for actor in vocabulary]

# One bit list per movie, aligned with actorList.
df['cast_bs'] = df['cast'].apply(actor_bitlist)

In [12]:
# Now directors
def isnull(s):
    """Return '' for a missing value, otherwise return ``s`` unchanged.

    Both ``None`` and float NaN count as missing. NaN is the marker pandas
    uses for missing data (for example, rows left unmatched by a left merge),
    so it is treated the same way as ``None``.
    """
    # `s != s` is true only for NaN; the isinstance guard keeps the
    # comparison away from non-float values.
    if s is None or (isinstance(s, float) and s != s):
        return ''
    return s

# Replace missing directors with '' so every entry in the column is a string.
df['director'] = df['director'].map(isnull)

In [13]:
# Create a sorted list of the distinct directors.
# '' stands for "no director" (missing values were replaced by '' in the
# previous cell). It is removed by value; the previous code dropped the first
# sorted entry instead, which would discard a real director whenever no ''
# entry was present. A set also avoids the quadratic list-membership scan.
directorList = sorted({name for name in df['director'] if name != ''})

In [14]:
# Create the bitstring of directors
def director_bitlist(directors, known_directors=None):
    """Encode a movie's director(s) as a 0/1 list aligned with a director list.

    Parameters
    ----------
    directors : str or collection of str
        The movie's director name, or a collection of names. A single string
        is treated as exactly one name.
    known_directors : sequence of str, optional
        Ordered director names to encode against. Defaults to the
        notebook-level ``directorList``, looked up when the function is called.

    Returns
    -------
    list of int
        One entry per known director: 1 on an exact name match, otherwise 0.
    """
    if known_directors is None:
        known_directors = directorList
    # With a plain string, `name in directors` would be a substring test and
    # would also flag every known name contained in the movie's director name
    # (e.g. 'John Lee' inside 'John Lee Hancock'). Wrap it so that membership
    # means an exact match.
    if isinstance(directors, str):
        directors = [directors]
    return [1 if name in directors else 0 for name in known_directors]

# One bit list per movie, aligned with directorList.
df['director_bs'] = df['director'].apply(director_bitlist)

In [15]:
# Make each genre a 0/1 column in the dataframe (1 when the movie's genre
# list contains that genre).
from collections import defaultdict  # no longer used in this cell; retained in case another cell relies on it

# One list comprehension per genre replaces the nested `iterrows()` loop,
# which rebuilt a Series for every row once per genre.
for g in genreList:
    df[g] = [1 if g in genres else 0 for genres in df['genres']]

In [16]:
# Make each actor a 0/1 column in the dataframe (1 when the movie's cast
# list contains that actor).
from collections import defaultdict  # no longer used in this cell; retained in case another cell relies on it

# One list comprehension per actor replaces the nested `iterrows()` loop,
# which rebuilt a Series for every row once per actor.
for a in actorList:
    df[a] = [1 if a in cast else 0 for cast in df['cast']]

In [17]:
# Support-vector regression of vote_average, evaluated with 10-fold
# cross-validation around a grid search over C.
#
# NOTE: 'Docmentary' is spelled this way on purpose. The genre formatting
# cell deletes every 'u', so that is the actual column name in `df`.
svm_features = ['popularity',  'vote_count', 'Action', 'Adventure', 'Animation', 'Comedy', 'Docmentary', 'Drama', 'Family', 'Fantasy', 'Foreign', 'History', 'Horror', 'Mystery', 'Romance', 'ScienceFiction', 'Thriller', 'War', 'Western', 'AlecBaldwin', 'BradPitt', 'BruceWillis', 'JohnGoodman', 'JohnnyDepp', 'LiamNeeson', 'MattDamon', 'MorganFreeman', 'NicolasCage', 'OwenWilson', 'RobertDeNiro', 'SamuelL.Jackson', 'SteveBuscemi', 'WillFerrell', 'WillemDafoe']
target = ['vote_average']

from sklearn import svm
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

X = df[svm_features]
# Selecting with a list gives an (n, 1) frame; SVR expects a 1-D target, so
# flatten it instead of relying on scikit-learn's implicit conversion.
Y = df[target].values.ravel()

clf = svm.SVR(kernel='rbf', epsilon=0.1)
parameters = { 'C': np.linspace(1, 100, 5) }
gs_clf = GridSearchCV(clf, parameters, n_jobs=-1)

test_scores = []
train_scores = []

# NOTE(review): with an integer `cv` and no explicit shuffling, every
# repetition appears to evaluate the same splits, so the 10 repetitions may
# all produce the same numbers -- confirm before relying on this outer loop.
for i in range(10):
    # Score with (negated) mean squared error. `return_train_score=True` is
    # required: recent scikit-learn versions omit 'train_score' by default,
    # which would make the lookup below raise KeyError.
    cv_results = cross_validate(
        gs_clf, X, Y, cv=10,
        scoring='neg_mean_squared_error',
        return_train_score=True,
    )

    test_score = cv_results['test_score']
    train_score = cv_results['train_score']

    # Fold scores are negated MSE values; average their magnitudes.
    test_scores.append(np.abs(test_score).mean())
    train_scores.append(np.abs(train_score).mean())

print("Mean test score: ", np.mean(test_scores))
print("Mean train score: ", np.mean(train_scores))

Mean test score:  0.30891974939784117
Mean train score:  0.2207114772111523


In [23]:
# Linear regression of vote_average, evaluated with 10-fold cross-validation.
#
# NOTE: 'Docmentary' is spelled this way on purpose. The genre formatting
# cell deletes every 'u', so that is the actual column name in `df`.
lin_features = ['popularity',  'vote_count', 'Action', 'Adventure', 'Animation', 'Comedy', 'Docmentary', 'Drama', 'Family', 'Fantasy', 'Foreign', 'History', 'Horror', 'Mystery', 'Romance', 'ScienceFiction', 'Thriller', 'War', 'Western', 'AlecBaldwin', 'BradPitt', 'BruceWillis', 'JohnGoodman', 'JohnnyDepp', 'LiamNeeson', 'MattDamon', 'MorganFreeman', 'NicolasCage', 'OwenWilson', 'RobertDeNiro', 'SamuelL.Jackson', 'SteveBuscemi', 'WillFerrell', 'WillemDafoe']
target = ['vote_average']

from sklearn import linear_model
from sklearn.model_selection import cross_validate

X = df[lin_features]
Y = df[target]

lin = linear_model.LinearRegression()

test_scores = []
train_scores = []

# NOTE(review): with an integer `cv` and no explicit shuffling, every
# repetition appears to evaluate the same splits, so the 10 repetitions may
# all produce the same numbers -- confirm before relying on this outer loop.
for i in range(10):
    # Score with (negated) mean squared error. `return_train_score=True` is
    # required: recent scikit-learn versions omit 'train_score' by default,
    # which would make the lookup below raise KeyError.
    cv_results = cross_validate(
        lin, X, Y, cv=10,
        scoring='neg_mean_squared_error',
        return_train_score=True,
    )

    test_score = cv_results['test_score']
    train_score = cv_results['train_score']

    # Fold scores are negated MSE values; average their magnitudes.
    test_scores.append(np.abs(test_score).mean())
    train_scores.append(np.abs(train_score).mean())

# Average over the per-repetition means. The previous version referenced the
# undefined name `absy` and folded the last repetition's fold scores instead
# of the accumulated lists.
print("Mean test score: ", np.mean(test_scores))
print("Mean train score: ", np.mean(train_scores))

Mean test score:  1.2391289483019878
Mean train score:  1.1290526232734734
