In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel

import pickle

%matplotlib inline

In [3]:
df_movies = pd.read_csv('/content/drive/MyDrive/netflix_prize_data/movies_lens/movie.csv')
df_ratings =  pd.read_csv('/content/drive/MyDrive/netflix_prize_data/movies_lens/rating.csv')

df_links =  pd.read_csv('/content/drive/MyDrive/netflix_prize_data/movies_lens/link.csv')

In [4]:
df_tags =  pd.read_csv('/content/drive/MyDrive/netflix_prize_data/movies_lens/tag.csv')

In [5]:
df_genome_scores = pd.read_csv('/content/drive/MyDrive/netflix_prize_data/movies_lens/genome_scores.csv')
df_genome_tags = pd.read_csv('/content/drive/MyDrive/netflix_prize_data/movies_lens/genome_tags.csv')

In [6]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [8]:
df_tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,2009-04-24 18:19:40
1,65,208,dark hero,2013-05-10 01:41:18
2,65,353,dark hero,2013-05-10 01:41:19
3,65,521,noir thriller,2013-05-10 01:39:43
4,65,592,dark hero,2013-05-10 01:41:18


In [9]:
df_links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [10]:
df_genome_scores.head()

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.025
1,1,2,0.025
2,1,3,0.05775
3,1,4,0.09675
4,1,5,0.14675


In [11]:
df_genome_tags.head()

Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s


# Explore Tag Data of Movie 'Matrix'

In [12]:
matrix_movie = df_movies[df_movies['title'].str.contains('Matrix')]

In [13]:
matrix_movie

Unnamed: 0,movieId,title,genres
2486,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller
6260,6365,"Matrix Reloaded, The (2003)",Action|Adventure|Sci-Fi|Thriller|IMAX
6822,6934,"Matrix Revolutions, The (2003)",Action|Adventure|Sci-Fi|Thriller|IMAX


In [14]:
matrix_movie_tags = df_tags[df_tags['movieId'] == 2571]

In [15]:
matrix_movie_tags

Unnamed: 0,userId,movieId,tag,timestamp
385,342,2571,post-apocalyptic,2012-01-25 17:52:01
386,342,2571,sci-fi,2012-01-25 17:51:58
387,342,2571,virtual reality,2012-01-25 17:52:04
1193,505,2571,post-apocalyptic,2011-03-09 07:17:37
1194,505,2571,virtual reality,2011-03-09 07:17:33
...,...,...,...,...
463932,138233,2571,philosophical,2011-11-23 03:53:55
463933,138233,2571,virtual reality,2011-11-23 03:54:22
465064,138414,2571,alternate reality,2013-01-23 21:47:09
465065,138414,2571,cyberpunk,2013-01-23 21:47:06


df_tags is not usable at the moment with simple vectorization. So, we want to use genome scores instead. We only want relevance larger than 0.9

Check the relevance score of tags for movieId 4226 **Movie Title - Memento**

In [16]:
df_movies[df_movies['movieId'] == 4226]['title'].values[0]

'Memento (2000)'

In [17]:
matrix_movie_tags_genome = df_genome_scores[df_genome_scores['movieId'] == 4226]

In [18]:
matrix_movie_tags_genome = matrix_movie_tags_genome[matrix_movie_tags_genome['relevance'] > 0.7]

In [19]:
matrix_movie_tags_genome

Unnamed: 0,movieId,tagId,relevance
4215392,4226,57,0.96925
4215475,4226,140,0.71275
4215478,4226,143,0.71550
4215519,4226,184,0.79850
4215527,4226,192,0.98550
...,...,...,...
4216408,4226,1073,0.72200
4216419,4226,1084,0.77000
4216426,4226,1091,0.81400
4216427,4226,1092,0.72050


In [20]:
matrix_movie_tags_names = matrix_movie_tags_genome.merge(df_genome_tags, how='inner', on='tagId')

In [21]:
unique_matrix_tags = matrix_movie_tags_names['tag'].unique()

In [22]:
unique_matrix_tags

array(['amnesia', 'black and white', 'bleak', 'carrie-anne moss',
       'cerebral', 'clever', 'complex', 'complicated', 'complicated plot',
       'confusing', 'corruption', 'cult film', 'dark', 'dark humor',
       'dramatic', 'enigmatic', 'excellent script', 'existentialism',
       'flashbacks', 'genius', 'good', 'good acting', 'good soundtrack',
       'great acting', 'great ending', 'great movie', 'identity',
       'imagination', 'imdb top 250', 'insanity', 'intellectual',
       'intelligent', 'intense', 'interesting', 'investigation',
       'justice', 'manipulation', 'masterpiece', 'memory', 'memory loss',
       'mentor', 'mindfuck', 'moral ambiguity', 'multiple storylines',
       'murder', 'murder mystery', 'mystery', 'narrated', 'neo-noir',
       'noir thriller', 'non-linear', 'nonlinear', 'obsession',
       'original', 'original plot', 'oscar (best directing)',
       'oscar (best editing)', 'oscar (best supporting actor)',
       'paranoia', 'paranoid', 'philosophical

In [23]:
unique_matrix_tags

array(['amnesia', 'black and white', 'bleak', 'carrie-anne moss',
       'cerebral', 'clever', 'complex', 'complicated', 'complicated plot',
       'confusing', 'corruption', 'cult film', 'dark', 'dark humor',
       'dramatic', 'enigmatic', 'excellent script', 'existentialism',
       'flashbacks', 'genius', 'good', 'good acting', 'good soundtrack',
       'great acting', 'great ending', 'great movie', 'identity',
       'imagination', 'imdb top 250', 'insanity', 'intellectual',
       'intelligent', 'intense', 'interesting', 'investigation',
       'justice', 'manipulation', 'masterpiece', 'memory', 'memory loss',
       'mentor', 'mindfuck', 'moral ambiguity', 'multiple storylines',
       'murder', 'murder mystery', 'mystery', 'narrated', 'neo-noir',
       'noir thriller', 'non-linear', 'nonlinear', 'obsession',
       'original', 'original plot', 'oscar (best directing)',
       'oscar (best editing)', 'oscar (best supporting actor)',
       'paranoia', 'paranoid', 'philosophical

In [24]:
matrix_movie_tags_names.head()

Unnamed: 0,movieId,tagId,relevance,tag
0,4226,57,0.96925,amnesia
1,4226,140,0.71275,black and white
2,4226,143,0.7155,bleak
3,4226,184,0.7985,carrie-anne moss
4,4226,192,0.9855,cerebral


In [25]:
matrix_movie_tags_names.groupby(['movieId'])['tag'].apply('|'.join).reset_index()

Unnamed: 0,movieId,tag
0,4226,amnesia|black and white|bleak|carrie-anne moss...


Successfully preprocessed data for one movie. Now, going to transform the whole.

In [26]:
len(df_genome_scores['movieId'].unique())

10381

# Preprocess tag data

In [27]:
df_genome_scores_with_tagnames = df_genome_scores.merge(df_genome_tags, how='inner', on='tagId')

In [28]:
df_genome_scores_with_tagnames = df_genome_scores_with_tagnames[df_genome_scores_with_tagnames['relevance'] > 0.7]

In [29]:
df_genome_scores_with_tagnames

Unnamed: 0,movieId,tagId,relevance,tag
9,10,1,0.99975,007
1342,1517,1,0.75075,007
1507,1722,1,0.99975,007
2085,2376,1,0.99975,007
2615,2947,1,0.99975,007
...,...,...,...,...
11709439,109720,1128,0.88850,zombies
11709501,111663,1128,0.81825,zombies
11709544,113159,1128,0.82675,zombies
11709637,116668,1128,0.92850,zombies


In [30]:
df_tagnames_compressed = df_genome_scores_with_tagnames.groupby(['movieId'])['tag'].apply('|'.join).reset_index()

In [31]:
df_tagnames_compressed

Unnamed: 0,movieId,tag
0,1,adventure|animated|animation|cartoon|cgi|child...
1,2,adventure|animals|big budget|childhood|childre...
2,3,comedy|good sequel|original|sequel|sequels
3,4,chick flick|girlie movie|romantic|unlikely fri...
4,5,comedy|destiny|family|father daughter relation...
...,...,...
10337,130578,action|assassin|assassination|good action|real...
10338,130840,cinematography|creepy|horror|immortality|love ...
10339,131013,buddy movie|coen bros|comedy|crude humor|foul ...
10340,131168,betrayal|camp|cinematography|criterion|dramati...


Merge with master movie dataframe

In [32]:
df_movies_with_tags = df_movies.merge(df_tagnames_compressed, how='left', on='movieId')

In [33]:
df_movies_with_tags['tag'].fillna('movie', inplace=True)

In [34]:
df_movies_with_tags

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,adventure|animated|animation|cartoon|cgi|child...
1,2,Jumanji (1995),Adventure|Children|Fantasy,adventure|animals|big budget|childhood|childre...
2,3,Grumpier Old Men (1995),Comedy|Romance,comedy|good sequel|original|sequel|sequels
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,chick flick|girlie movie|romantic|unlikely fri...
4,5,Father of the Bride Part II (1995),Comedy,comedy|destiny|family|father daughter relation...
...,...,...,...,...
27273,131254,Kein Bund für's Leben (2007),Comedy,movie
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy,movie
27275,131258,The Pirates (2014),Adventure,movie
27276,131260,Rentun Ruusu (2001),(no genres listed),movie


# TF-IDF

The reason using TF-IDF is to understand how importance of the word in the tags by looking at how many times a word appears in a movie tag while paying attention to the same word appears in another movie tag

In [35]:
vectorizer = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=1)
x = vectorizer.fit_transform(df_movies_with_tags['genres'] + df_movies_with_tags['tag'])

In [36]:
#checkout TF-IDF scores of tags
df = pd.DataFrame(x[0].T.todense(), index=vectorizer.get_feature_names(), columns=["TF-IDF"])
df = df.sort_values('TF-IDF', ascending=False)
df.head(15)

Unnamed: 0,TF-IDF
animation,0.221279
animated,0.21636
pixar,0.191012
animated feature,0.166882
feature,0.166882
good,0.163223
disney,0.154697
technology toys,0.134399
good whimsical,0.134399
kids,0.131522


**Measure of similarity between vectors using Sigmoid Kernel**

In [37]:
tag_genre_model = sigmoid_kernel(x, x)

In [38]:
tag_genre_model

array([[0.76160495, 0.76159605, 0.7615945 , ..., 0.76159416, 0.76159436,
        0.7615943 ],
       [0.76159605, 0.76160495, 0.76159431, ..., 0.76159416, 0.76159432,
        0.76159471],
       [0.7615945 , 0.76159431, 0.76160495, ..., 0.76159416, 0.76159416,
        0.76159416],
       ...,
       [0.76159416, 0.76159416, 0.76159416, ..., 0.76160495, 0.76159416,
        0.76159416],
       [0.76159436, 0.76159432, 0.76159416, ..., 0.76159416, 0.76160495,
        0.76159416],
       [0.7615943 , 0.76159471, 0.76159416, ..., 0.76159416, 0.76159416,
        0.76160495]])

In [39]:
#save similarity model as pickle file
# pickle.dump(tag_genre_model, open( 'genre_similarity_model.pickle', 'wb'))

In [40]:
#create dictionary for movieId and Index and vice versa
movieId_dict = dict(zip(df_movies_with_tags.movieId, df_movies_with_tags.index))
id_movieId_dict = dict(zip(df_movies_with_tags.index, df_movies_with_tags.movieId))

def get_similar_movies(movieId, n):
  '''
  Find similar movies according to movieId and returns similar movieIds and scores

  INPUT :
  movieId - movieId to calculate similarity scores
  n - number of similar movies

  OUTPUT :
  list(movieId, index) - list of movieId and its corresponding scores
  '''
  scores = []
  scores = list(enumerate(tag_genre_model[movieId_dict[movieId]]))
  scores.sort(key=lambda x: x[1], reverse=True)

  return list(map(lambda x: (id_movieId_dict[x[0]], x[1]), scores[1: n + 1]))

In [42]:
#create dictionary for movieId and title
movie_dict = dict(zip(df_movies.movieId, df_movies.title))

def get_movie_titles(movieIds):
  '''
  Get movie titles according to movieIds and return titles

  INPUT :
  movieIds - movieId lists

  OUTPUT :
  movie_titles - list of movie titles according to the movieIds
  '''
  movie_titles = []
  for movieId in movieIds:
    if movieId in movie_dict:
      movie_titles.append(movie_dict[movieId])
  return movie_titles

Find similar movie of movieIds - 5349 **Spider-Man (2002)**

In [43]:
movie_ids, similarity = list(zip(*get_similar_movies(5349, 10)))
get_movie_titles(movie_ids)

['Spider-Man 2 (2004)',
 'X-Men (2000)',
 'X2: X-Men United (2003)',
 'Incredible Hulk, The (2008)',
 'Thor (2011)',
 'X-Men: First Class (2011)',
 'Hulk (2003)',
 'X-Men Origins: Wolverine (2009)',
 'Amazing Spider-Man, The (2012)',
 'Iron Man (2008)']

**Just for a quick search of movies**

In [44]:
df_movies[df_movies['title'].str.contains("Spider")]

Unnamed: 0,movieId,title,genres
2737,2823,"Spiders Part 1: The Golden Lake, The (Die Spin...",Action|Adventure|Drama
3957,4051,Horrors of Spider Island (Ein Toter Hing im Ne...,Horror|Sci-Fi
4144,4238,Along Came a Spider (2001),Action|Crime|Mystery|Thriller
5252,5349,Spider-Man (2002),Action|Adventure|Sci-Fi|Thriller
5259,5356,"Giant Spider Invasion, The (1975)",Horror|Sci-Fi
6098,6197,Spider (2002),Drama|Mystery
6321,6430,Ziggy Stardust and the Spiders from Mars (1973),Documentary|Musical
6676,6786,Kiss of the Spider Woman (1985),Drama
6738,6848,Kingdom of the Spiders (1977),Horror|Sci-Fi
7953,8636,Spider-Man 2 (2004),Action|Adventure|Sci-Fi|IMAX


In [45]:
def get_movies(userid):
    '''
    INPUT:
    userid - (int) a user id
       
    OUTPUT:
    movie_ids - set of movie_ids that the user has already watched
    
    '''
    movie_titles = []

    movie_ids = set(df_ratings[df_ratings['userId'] == userid].sort_values('rating', ascending = False)['movieId'].tolist())

    return movie_ids 

In [46]:
def recommendation(userid, m=10):
    '''
    INPUT:
    userid - (int) a user id
    m - (int) the number of recommendations for user
    
    OUTPUT:
    recommended_movie_titles - list of recommended movie titles
    
    '''

    similar_movies = []

    watched_movie_ids = get_movies(userid)
    
    for movie in watched_movie_ids:
      movies = get_similar_movies(movie, 5)
      similar_movies.extend(movies)
    
    similar_movies = sorted(similar_movies, key = lambda x : x[1], reverse = True)
    similar_movie_ids, similarity_scores = list(zip(*similar_movies))
    similar_movie_ids = [movie for movie in similar_movie_ids if movie not in watched_movie_ids]
    similar_movie_ids = set(similar_movie_ids)

    return get_movie_titles(list(similar_movie_ids)[:m])


**Validate the movies that the user 138446 has already watched**

In [47]:
sample_movies_138446 = get_movies(138446)
get_movie_titles(sample_movies_138446)

['Toy Story (1995)',
 'Dumbo (1941)',
 'Sound of Music, The (1965)',
 '10 Things I Hate About You (1999)',
 'March of the Wooden Soldiers (a.k.a. Babes in Toyland) (1934)',
 'Sense and Sensibility (1995)',
 'Secret Garden, The (1993)',
 'Shop Around the Corner, The (1940)',
 'Persuasion (1995)',
 'Jungle Book, The (1967)',
 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)',
 'Babe (1995)',
 "William Shakespeare's Romeo + Juliet (1996)",
 'Fantastic Mr. Fox (2009)',
 '101 Dalmatians (One Hundred and One Dalmatians) (1961)',
 "Kiki's Delivery Service (Majo no takkyûbin) (1989)",
 'Nanny McPhee (2005)',
 'Toy Story 2 (1999)',
 'Harry Potter and the Deathly Hallows: Part 2 (2011)',
 'Basic Instinct (1992)',
 'Brady Bunch Movie, The (1995)',
 'Captain America: The First Avenger (2011)',
 'Aladdin (1992)',
 'Silence of the Lambs, The (1991)',
 'Beauty and the Beast (1991)',
 'Chronicles of Narnia: The Lion, the Witch and the Wardrobe, The (2005)',
 'Ice Age (2002)',
 'Braveheart (1995)',
 'Others,

**List of recommended movies for user 138446**

In [48]:
recommendation(138446)

['Fighter, The (2010)',
 'Balto (1995)',
 'Flowers in the Attic (1987)',
 'Cutthroat Island (1995)',
 'Day of the Jackal, The (1973)',
 'Money Train (1995)',
 'Copycat (1995)',
 'Pan (1995)',
 'Othello (1995)',
 'Lady and the Tramp (1955)']