In [1]:
# Import our regular old heroes 
import numpy as np
import pandas as pd
import scipy as sp # <-- The sister of Numpy, used in our code for numerical efficientcy. 
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.metrics import mean_squared_error


# Entity featurization and similarity computation
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Libraries used during sorting procedures.
import operator # <-- Convienient item retrieval during iteration 
import heapq # <-- Efficient sorting of large lists

# Imported for our sanity
import warnings
warnings.filterwarnings('ignore')

In [2]:
#making sure that we can see all rows and cols
pd.set_option('display.max_columns', None)

pd.set_option('display.max_rows', None)

In [3]:
#imdb
#imdb_data = pd.read_csv("../input/edsa-movie-recommendation-wilderness/imdb_data.csv")

#test
#test = pd.read_csv("../input/edsa-movie-recommendation-wilderness/test.csv")

#movies
movies = pd.read_csv("../input/edsa-movie-recommendation-wilderness/movies.csv")

#train
cols_list = ['userId', 'movieId', 'rating']
train = pd.read_csv("../input/edsa-movie-recommendation-wilderness/train.csv", usecols = cols_list)

In [4]:
#############works, don't touch##################

In [5]:
# merging dataframe

train_df = movies

In [6]:
train_df = train_df[:27000]

In [7]:
train_df.shape

(27000, 3)

In [8]:
# convert data types to strings for string handling
#train_df['genres'] = train_df.genres.astype(str)
#train_df['title_cast'] = train_df.title_cast.astype(str)
#train_df['director'] = train_df.director.astype(str)
#train_df['plot_keywords'] = train_df.plot_keywords.astype(str)

In [9]:
# Every genre is separated by a | 
train_df['genres'] = train_df['genres'].map(lambda x: x.lower().split('|'))

# Every title cast is separated by a | so we simply have to call the split function on | and separate them by ,
#train_df['title_cast'] = train_df['title_cast'].str.split('|')

# And we will do the same thing for the plot keywords
#train_df['plot_keywords'] = train_df['plot_keywords'].str.split('|')

In [10]:
#def string_function(x):
#    """merges name and surname into one name"""
#    if isinstance(x, list):
#        return [str.lower(i.replace(" ", "")) for i in x]
#    else:
#        #Check if director exists. If not, return empty string
#        if isinstance(x, str):
#            return str.lower(x.replace(" ", ""))
#        else:
#            return ''

In [11]:
#features = ['title_cast','director']

#for feature in features:
#    train_df[feature] = train_df[feature].apply(string_function)

In [12]:
train_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[adventure, animation, children, comedy, fantasy]"
1,2,Jumanji (1995),"[adventure, children, fantasy]"
2,3,Grumpier Old Men (1995),"[comedy, romance]"
3,4,Waiting to Exhale (1995),"[comedy, drama, romance]"
4,5,Father of the Bride Part II (1995),[comedy]


In [13]:
cols = ['title','genres']

In [14]:
data_df = train_df[cols]

In [15]:
# generating the cosine similarity matrix
#cosine_ = cosine_similarity(count_matrix, count_matrix)

#import pickle

#model_save_path = "./cosine_sim_.pkl"
#with open(model_save_path,'wb') as file:
#    pickle.dump(cosine_sim,file, protocol = 4)
#pickle.dump(d, open("file", 'w'), protocol=4)

In [16]:
data_df.set_index('title', inplace = True)
data_df.head()

Unnamed: 0_level_0,genres
title,Unnamed: 1_level_1
Toy Story (1995),"[adventure, animation, children, comedy, fantasy]"
Jumanji (1995),"[adventure, children, fantasy]"
Grumpier Old Men (1995),"[comedy, romance]"
Waiting to Exhale (1995),"[comedy, drama, romance]"
Father of the Bride Part II (1995),[comedy]


In [17]:
data_df.head()

Unnamed: 0_level_0,genres
title,Unnamed: 1_level_1
Toy Story (1995),"[adventure, animation, children, comedy, fantasy]"
Jumanji (1995),"[adventure, children, fantasy]"
Grumpier Old Men (1995),"[comedy, romance]"
Waiting to Exhale (1995),"[comedy, drama, romance]"
Father of the Bride Part II (1995),[comedy]


In [18]:
data_df['bag_of_words'] = ''
columns = data_df.columns
for index, row in data_df.iterrows():
    words = ''
    for col in columns:
        if col != 'director':
            words = words + ' '.join(row[col])+ ' '
        else:
            words = words + row[col]+ ' '
    row['bag_of_words'] = words
    
data_df.drop(columns = [col for col in data_df.columns if col!= 'bag_of_words'], inplace = True)

In [19]:
data_df.head()

Unnamed: 0_level_0,bag_of_words
title,Unnamed: 1_level_1
Toy Story (1995),adventure animation children comedy fantasy
Jumanji (1995),adventure children fantasy
Grumpier Old Men (1995),comedy romance
Waiting to Exhale (1995),comedy drama romance
Father of the Bride Part II (1995),comedy


In [20]:
# instantiating and generating the count matrix
count = CountVectorizer()
count_matrix = count.fit_transform(data_df['bag_of_words'])

# creating a Series for the movie titles so they are associated to an ordered numerical
# list I will use later to match the indexes
indices = pd.Series(data_df.index)
indices[:10]

0                      Toy Story (1995)
1                        Jumanji (1995)
2               Grumpier Old Men (1995)
3              Waiting to Exhale (1995)
4    Father of the Bride Part II (1995)
5                           Heat (1995)
6                        Sabrina (1995)
7                   Tom and Huck (1995)
8                   Sudden Death (1995)
9                      GoldenEye (1995)
Name: title, dtype: object

In [21]:
# generating the cosine similarity matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)

import pickle

model_save_path = "./cosine_sim.pkl"
with open(model_save_path,'wb') as file:
    pickle.dump(cosine_sim,file, protocol = 4)
#pickle.dump(d, open("file", 'w'), protocol=4)

In [22]:
def recommendations(title, cosine_sim = cosine_sim):
    
    recommended_movies = []
    
    # gettin the index of the movie that matches the title
    idx = indices[indices == title].index[0]

    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)

    # getting the indexes of the 10 most similar movies
    top_10_indexes = list(score_series.iloc[1:11].index)
    
    # populating the list with the titles of the best 10 matching movies
    for i in top_10_indexes:
        recommended_movies.append(list(data_df.index)[i])
        
    return recommended_movies

In [23]:
recommendations('Hard Target (1993)')

['Hard Target (1993)',
 'Once Upon a Time in Mexico (2003)',
 'Into the Blue (2005)',
 'Death Hunt (1981)',
 'Hobo with a Shotgun (2011)',
 'Live Free or Die Hard (2007)',
 'Hackers (1995)',
 'Floods of Fear (1959)',
 'Welcome to the Punch (2013)',
 'Transporter 3 (2008)']

In [24]:
##################works, don't touch#######################################