In [51]:
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import seaborn as sns
import numpy as np
import json
import warnings
warnings.filterwarnings('ignore')
import base64
import io
# from scipy.misc import imread
import codecs
from IPython.display import HTML

# Read the movies CSV and the credits CSV (paths are relative to the notebook).
movies, credits = (
    pd.read_csv(csv_path)
    for csv_path in ('../dataset/tmdb_5000_movies.csv',
                     '../dataset/tmdb_5000_credits.csv')
)

# Quick look at the first rows; a bare expression is only rendered by the
# notebook when it is the last statement of its cell.
movies.head(3)
#credits.head(3)

# Possible features: keywords, popularity, production_countries,
# revenue, vote_count


# Clean the dataset: each of these columns holds a JSON array of objects. Keep
# only every object's 'name' and store the resulting list as its string repr
# (e.g. "['Action', 'Adventure']"), the format the .str-based clean-up further
# down expects.
# NOTE: not re-runnable on an already converted frame, because json.loads
# cannot parse the converted strings.
fields = ['genres', 'keywords', 'production_companies', 'production_countries']
for field in fields:
    # One vectorised pass per column instead of one .loc write per row.
    movies[field] = movies[field].apply(
        lambda raw: str([item['name'] for item in json.loads(raw)])
    )
        
        
# Clean the credits: reduce the JSON 'cast' array to the string repr of the
# list of cast member names, in the order they appear in the JSON.
# Done as a single apply() instead of one .loc write per row.
credits['cast'] = credits['cast'].apply(
    lambda raw: str([member['name'] for member in json.loads(raw)])
)

def director(x):
    """Return the name of the first crew entry whose job is 'Director'.

    x: the parsed crew list, i.e. a list of dicts that each carry a 'job'
       and a 'name' key.
    Returns None when no entry has the job 'Director'.
    """
    for member in x:
        if member['job'] == 'Director':
            return member['name']
    return None


# Parse the JSON 'crew' column, keep only the director's name, and rename the
# column to say what it now holds. Defined once and applied directly: the
# previous loop over a single-element list re-created the helper on each pass.
credits['crew'] = credits['crew'].apply(json.loads).apply(director)
credits = credits.rename(columns={'crew': 'director'})

# Need to combine two datasets. Can join on movie_id
# Left join: every row of `movies` is kept even if `credits` has no matching movie_id.
df = pd.merge(movies, credits, how='left', left_on='id', right_on='movie_id')

# Feature engineering! select which ones we want here...
features = ['id', 'original_title', 'genres', 'cast', 'vote_average', 'director', 'keywords', 'popularity', 'vote_count']
# .copy() makes df an independent frame. The cells below assign new columns to
# it, which on a plain column selection triggers pandas' SettingWithCopyWarning
# (silenced at the top of this notebook by warnings.filterwarnings('ignore')).
df = df[features].copy()

# Find top genres
# Turn the "['Action', 'Adventure']"-style strings back into lists of names.
genre_text = df['genres'].str.strip('[]')
# The replacements must run in exactly this order: 'Adventure' is shielded as
# 'Adv' while every 'u' is deleted, then restored. Every other genre name loses
# its 'u' characters, which is why nn_features spells 'Docmentary'.
# NOTE(review): the 'u' removal looks unintended, but the column names used
# for the neural net depend on it.
for old, new in ((' ', ''), ("'", ''), ('Adventure', 'Adv'), ('u', ''), ('Adv', 'Adventure')):
    genre_text = genre_text.str.replace(old, new)
df['genres'] = genre_text.str.split(',')

# Create an ordered list of all possible genres so we can use a bit string to represent them
# A movie without genres ends up as [''] after the strip/split above. That
# empty name is excluded by value: dropping "whatever sorts first" would
# silently discard a real genre if no empty entry were present.
genreList = sorted({genre for genres in df['genres'] for genre in genres if genre})

# Create the bitstring of genres
def genre_bitlist(genres):
    """Encode `genres` as a list of 0/1 flags aligned with genreList.

    Position k of the result is 1 when genreList[k] is contained in
    `genres`, and 0 otherwise.
    """
    return [1 if known_genre in genres else 0 for known_genre in genreList]

            
# One 0/1 list per movie, aligned with genreList.
df['genres_bs'] = df['genres'].apply(genre_bitlist)

# Work with cast. Let's get the top 25 actors
n = 25

# Strip the brackets, all spaces and both kinds of quote characters from the
# stringified cast list, then split it into one (space-free) name per actor.
cast_text = df['cast'].str.strip('[]')
for unwanted in (' ', "'", '"'):
    cast_text = cast_text.str.replace(unwanted, '')
df['cast'] = cast_text.str.split(',')

import operator

# Flatten every movie's cast list into one long list of names.
list1 = [name for cast in df['cast'] for name in cast]

# Count the occurrences of each name across all cast lists.
actors = {}
for actor in list1:
    actors[actor] = actors.get(actor, 0) + 1

# Ascending by count, so the most frequent names sit at the end.
sorted_actors = sorted(actors.items(), key=operator.itemgetter(1))

# NOTE(review): the comment above this cell announces "top 25" (n = 25), but
# 21 entries have always been taken here. Kept at 21 so the selected actor
# columns do not change. Slicing also copes with fewer than 21 distinct names.
top_entries = sorted_actors[-21:]

# Remove the junk entries by value rather than by position: 'r.' (a stray
# token the original code already filtered) and '' (produced by movies with an
# empty cast). Dropping "the first element after sorting" would discard a real
# actor whenever '' is not among the selected entries.
actorList = sorted(name for name, _count in top_entries if name not in ('', 'r.'))

# Create the bitstring of actors
def actor_bitlist(actors):
    """Encode a cast as a list of 0/1 flags aligned with actorList.

    Position k of the result is 1 when actorList[k] is contained in
    `actors`, and 0 otherwise.
    """
    flags = []
    for known_actor in actorList:
        flags.append(int(known_actor in actors))
    return flags

# One 0/1 list per movie, aligned with actorList.
df['cast_bs'] = df['cast'].apply(actor_bitlist)

# Now directors
def isnull(s):
    """Return '' for a missing value, otherwise return `s` unchanged.

    Missing means None or a float NaN. NaN is what pandas fills in for rows
    of a left merge that found no match; left in place it would make the
    later sorted() over director names fail on a str/float comparison.
    """
    # `s != s` is true only for NaN.
    if s is None or (isinstance(s, float) and s != s):
        return ''
    return s

# Replace missing directors with '' so the column holds only strings.
df['director'] = df['director'].apply(isnull)

# Distinct director names in alphabetical order. The '' placeholder for
# "no director" is excluded by value; dropping the first sorted element would
# remove a real director whenever every movie has one. Collecting into a set
# also avoids a linear list scan per row.
directorList = sorted({name for name in df['director'] if name})

def director_bitlist(directors):
    """Encode a movie's director(s) as 0/1 flags aligned with directorList.

    directors: a single director name (str), which is what the 'director'
               column holds, or a collection of names.
    Returns a list with one entry per element of directorList: 1 when that
    director is among `directors`, else 0.

    A bare string is wrapped in a list first. `name in some_string` is a
    substring test, so without the wrap 'Lee' would also be flagged for a
    movie directed by 'Ang Lee'.
    """
    if isinstance(directors, str):
        directors = [directors]
    return [1 if director in directors else 0 for director in directorList]

# One 0/1 list per movie, aligned with directorList.
df['director_bs'] = df['director'].apply(director_bitlist)

# Allows us to make each genre a category in the dataframe
from collections import defaultdict
cols = defaultdict(list)
for g in genreList:
    # Scan the 'genres' column directly: 1 when the movie has genre g, else 0.
    # iterrows() would construct a full Series for every row on every pass.
    cols[g].extend(1 if g in genres else 0 for genres in df['genres'])
    df[g] = cols[g]

# Allows us to make each actor a category in the dataframe
from collections import defaultdict
cols = defaultdict(list)
for a in actorList:
    # Scan the 'cast' column directly: 1 when actor a is in the movie's cast,
    # else 0. iterrows() would construct a full Series for every row on every pass.
    cols[a].extend(1 if a in cast else 0 for cast in df['cast'])
    df[a] = cols[a]

In [120]:
# get data and labels for neural net
# The model inputs are two numeric columns plus the 0/1 indicator columns
# created above for genres and actors.
numeric_features = ['popularity', 'vote_count']
# 'Docmentary' is the actual column name: the genre clean-up deletes every 'u'.
genre_features = [
    'Action', 'Adventure', 'Animation', 'Comedy', 'Docmentary', 'Drama',
    'Family', 'Fantasy', 'Foreign', 'History', 'Horror', 'Mystery',
    'Romance', 'ScienceFiction', 'Thriller', 'War', 'Western',
]
actor_features = [
    'AlecBaldwin', 'BradPitt', 'BruceWillis', 'JohnGoodman', 'JohnnyDepp',
    'LiamNeeson', 'MattDamon', 'MorganFreeman', 'NicolasCage', 'OwenWilson',
    'RobertDeNiro', 'SamuelL.Jackson', 'SteveBuscemi', 'WillFerrell',
    'WillemDafoe',
]
nn_features = numeric_features + genre_features + actor_features
# nn_features = ['popularity',  'vote_count']
target = ['vote_average']

X = df[nn_features]
Y = df[target]

# normalize the data
# NOTE(review): both scalers are fitted on every row, i.e. before model.fit
# carves off its validation split.
from sklearn.preprocessing import MinMaxScaler
x_scaler = MinMaxScaler()
x_scaler.fit(X)
X = x_scaler.transform(X)
y_scaler = MinMaxScaler()
y_scaler.fit(Y)
Y = y_scaler.transform(Y)

In [141]:
# neural net setup
from keras.layers import Input, Dense
from keras.models import Model, Sequential

# Size the input layer from the data instead of hard-coding 34, so changing
# nn_features (e.g. to the commented two-column alternative) keeps this cell
# working. The hidden-layer width stays at the original 34 units.
n_inputs = X.shape[1]

model = Sequential()
model.add(Dense(34, input_dim=n_inputs, kernel_initializer='normal', activation='relu'))
model.add(Dense(34, kernel_initializer='normal', activation='relu'))
# Sigmoid output: the target was min-max scaled, so predictions must lie in [0, 1].
model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))

model.compile(loss='mean_squared_error',
              optimizer='adam')

# The last 10% of the rows are held out for validation.
model.fit(X, Y, validation_split=0.1, epochs=100, batch_size=30)


Train on 4322 samples, validate on 481 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100


Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x12efeedd8>

In [142]:
title = "The Shawshank Redemption"
# Literal substring match: regex=False keeps characters such as '(' or '?' in
# a title from being interpreted as a regular expression; na=False treats
# missing titles as "no match".
matches = df[df['original_title'].str.contains(title, regex=False, na=False)]
if matches.empty:
    # Same exception type the bare .iloc[0] raised, but with a usable message.
    raise IndexError('No movie whose original_title contains: ' + title)
# First matching row, reshaped into a one-row DataFrame.
new_movie=matches.iloc[0].to_frame().T
print('Selected Movie: ',new_movie.original_title.values[0])
x_test = new_movie[nn_features]
x_test = x_scaler.transform(x_test) # same scaling as the training inputs
y_pred = model.predict(x_test)
y_pred = y_scaler.inverse_transform(y_pred) # back to the original vote_average scale
y = new_movie['vote_average']
print("Predicted y_pred: ", y_pred[0][0])
print("Actual y: ", y)

Selected Movie:  The Shawshank Redemption
Predicted y_pred:  8.4485
Actual y:  1881    8.5
Name: vote_average, dtype: object


In [143]:
title = "Fun with Dick and Jane"
# Literal substring match: regex=False keeps characters such as '(' or '?' in
# a title from being interpreted as a regular expression; na=False treats
# missing titles as "no match".
matches = df[df['original_title'].str.contains(title, regex=False, na=False)]
if matches.empty:
    # Same exception type the bare .iloc[0] raised, but with a usable message.
    raise IndexError('No movie whose original_title contains: ' + title)
# First matching row, reshaped into a one-row DataFrame.
new_movie=matches.iloc[0].to_frame().T
print('Selected Movie: ',new_movie.original_title.values[0])
x_test = new_movie[nn_features]
x_test = x_scaler.transform(x_test) # normalize test point
y_pred = model.predict(x_test)
y_pred = y_scaler.inverse_transform(y_pred) # inverse_transform y prediction
y = new_movie['vote_average']
print("Predicted y_pred: ", y_pred[0][0])
print("Actual y: ", y)

Selected Movie:  Fun with Dick and Jane
Predicted y_pred:  6.18236
Actual y:  155    5.9
Name: vote_average, dtype: object


In [144]:
title = "The Chronicles of Riddick"
# Literal substring match: regex=False keeps characters such as '(' or '?' in
# a title from being interpreted as a regular expression; na=False treats
# missing titles as "no match".
matches = df[df['original_title'].str.contains(title, regex=False, na=False)]
if matches.empty:
    # Same exception type the bare .iloc[0] raised, but with a usable message.
    raise IndexError('No movie whose original_title contains: ' + title)
# First matching row, reshaped into a one-row DataFrame.
new_movie=matches.iloc[0].to_frame().T
print('Selected Movie: ',new_movie.original_title.values[0])
x_test = new_movie[nn_features]
x_test = x_scaler.transform(x_test) # same scaling as the training inputs
y_pred = model.predict(x_test)
y_pred = y_scaler.inverse_transform(y_pred) # back to the original vote_average scale
y = new_movie['vote_average']
print("Predicted y_pred: ", y_pred[0][0])
print("Actual y: ", y)

Selected Movie:  The Chronicles of Riddick
Predicted y_pred:  6.14669
Actual y:  223    6.3
Name: vote_average, dtype: object
