In [1]:
#Importing the Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import json

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

In [2]:
# Data source: https://www.kaggle.com/tmdb/tmdb-movie-metadata
# Load the movie metadata CSV into a DataFrame.
# NOTE(review): this is an absolute path on the author's machine -- point it at
# your local copy of tmdb_5000_movies.csv before running.
df = pd.read_csv('/home/alainkuiete/Documents/DATA612/tmdb_5000_movies.csv')


In [3]:
# convert the relevant data for each movie into a single string
# to be ingested by TfidfVectorizer
def genres_and_keywords_to_string(row):
  """Build the text document that represents one movie.

  Args:
    row: mapping-like object (e.g. a DataFrame row) whose 'genres' and
      'keywords' entries each hold a JSON-encoded list of objects with a
      'name' key.

  Returns:
    A single string: the genre names, one space, then the keyword names.
    Whitespace inside each name is removed so that a multi-word name
    becomes one token; names are separated by single spaces.
  """
  def squash_names(raw_json):
    # collapse each entry's name into one whitespace-free token
    tokens = []
    for entry in json.loads(raw_json):
      tokens.append(''.join(entry['name'].split()))
    return ' '.join(tokens)

  genre_text = squash_names(row['genres'])
  keyword_text = squash_names(row['keywords'])
  return genre_text + ' ' + keyword_text


In [4]:
# Build one text document per movie by applying genres_and_keywords_to_string
# to every row (axis=1), and store it in a new 'string' column of df.
df['string'] = df.apply(genres_and_keywords_to_string, axis=1)

In [5]:
# Create the tf-idf vectorizer, limited to max_features=2000 vocabulary terms.
# NOTE: no stop_words argument is passed here, so stop-word removal is NOT
# requested by this call.
tfidf = TfidfVectorizer(max_features=2000)

In [6]:
# Fit the vectorizer on the per-movie genre/keyword strings (df['string'], not
# the plot overviews) and transform them into the tf-idf data matrix X.
X = tfidf.fit_transform(df['string'])


In [7]:
# Sanity check: X has one row per document in df['string'] and one column per
# vocabulary term kept by the vectorizer.
print("X.shape:", X.shape)


X.shape: (4803, 2000)


In [8]:
# Mapping from movie title -> row index in df, used to look a movie up by name.
# The titles form the Series index; recommend() handles the case where a title
# lookup returns several indices (a Series) by keeping the first one.
movie2idx = pd.Series(df.index, index=df['title'])

In [9]:
# create a function that generates recommendations
def recommend(title, k=5):
  """Return the titles of the `k` movies most similar to `title`.

  Similarity is the cosine similarity between the query movie's row of the
  module-level tf-idf matrix `X` and every row of `X`.

  Args:
    title: movie title to look up in the module-level `movie2idx` mapping.
      When several rows share the title, the first one is used.
    k: number of recommendations to return (non-negative, default 5).

  Returns:
    A pandas Series holding the titles (from `df['title']`) of the
    recommended movies, most similar first. The query movie itself is
    never included.

  Raises:
    KeyError: if `title` is not present in `movie2idx`.
  """
  # a title that occurs more than once yields a Series of indices; keep the first
  idx = movie2idx[title]
  if isinstance(idx, pd.Series):
    idx = idx.iloc[0]

  # similarity of this movie against every movie: 1 x N, flattened to 1-D
  scores = cosine_similarity(X[idx], X).flatten()

  # rank every movie from most to least similar
  ranked_idx = (-scores).argsort()

  # Drop the query movie explicitly rather than assuming it is ranked first:
  # with tied scores (an identical duplicate, or an all-zero feature vector
  # whose similarity to everything is 0) it can land at any position.
  ranked_idx = ranked_idx[ranked_idx != idx]

  # return the titles of the top-k remaining movies
  return df['title'].iloc[ranked_idx[:k]]


In [10]:
# Example query: print the titles recommended for 'Scream 3'.
print("\nRecommendations for 'Scream 3':")
print(recommend('Scream 3'))


Recommendations for 'Scream 3':
3902    Friday the 13th Part VI: Jason Lives
4628                          Graduation Day
4053        Friday the 13th: A New Beginning
4048                             The Calling
1084                         The Glimmer Man
Name: title, dtype: object


In [11]:
# Example query: print the titles recommended for 'Mortal Kombat'.
print("\nRecommendations for 'Mortal Kombat':")
print(recommend('Mortal Kombat'))


Recommendations for 'Mortal Kombat':
1611              Mortal Kombat: Annihilation
1670                       DOA: Dead or Alive
3856              In the Name of the King III
1001    Street Fighter: The Legend of Chun-Li
2237                        Alone in the Dark
Name: title, dtype: object


In [12]:
# Example query: print the titles recommended for 'Runaway Bride'.
print("\nRecommendations for 'Runaway Bride':")
print(recommend('Runaway Bride'))



Recommendations for 'Runaway Bride':
4115                    House of D
2325    My Big Fat Greek Wedding 2
4604         It Happened One Night
3313                  An Education
2689            Our Family Wedding
Name: title, dtype: object


##### Reference: from the Udemy course "Recommender Systems and Deep Learning"