Data Preparation stage

In [29]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import os 

# you should change path according to your local computer 
PATH = '/home/doolee13/movieLens/ml-1m'

ratings = pd.read_csv(os.path.join(PATH, 'ratings.dat'), delimiter='::', header=None, 
                      names = ['user_id', 'movie_id', 'rating', 'timestamp'], 
                      usecols= ['user_id', 'movie_id', 'rating'], engine='python')

users = pd.read_csv(os.path.join(PATH, 'users.dat'), delimiter='::', header=None, 
                    names = ['user_id', 'gender', 'zipcode', 'age_desc', 'occ_desc'],
                    usecols=['user_id', 'gender', 'zipcode', 'age_desc', 'occ_desc'], engine='python')

movies = pd.read_csv(os.path.join(PATH, 'movies.dat'), delimiter='::', header=None, 
                     encoding='latin-1', 
                     names = ['movie_id', 'title', 'genres'], 
                     usecols=['movie_id', 'title', 'genres'], engine='python')


Below is a quick description for rating info

In [18]:
ratings['rating'].describe()

count    1.000209e+06
mean     3.581564e+00
std      1.117102e+00
min      1.000000e+00
25%      3.000000e+00
50%      4.000000e+00
75%      4.000000e+00
max      5.000000e+00
Name: rating, dtype: float64

In [25]:
from collections import defaultdict
genres = set()
keyword_dict = defaultdict(int)
for gen in movies['genres'].str.split('|'):
    genres.union(set(gen))
    for g in gen:
        keyword_dict[g] += 1

keyword_list = []
for k, v in keyword_dict.items():
    keyword_list.append([k, v])
keyword_list.sort(key = lambda x: x[1], reverse= True)

# list top five genres in movieLens-1m dataset
print(keyword_list[:5])

[['Drama', 1603], ['Comedy', 1200], ['Action', 503], ['Thriller', 492], ['Romance', 471]]


In [30]:
movies['genres'] = movies['genres'].str.split('|')
print(movies['genres'])

0        [Animation, Children's, Comedy]
1       [Adventure, Children's, Fantasy]
2                      [Comedy, Romance]
3                        [Comedy, Drama]
4                               [Comedy]
                      ...               
3878                            [Comedy]
3879                             [Drama]
3880                             [Drama]
3881                             [Drama]
3882                   [Drama, Thriller]
Name: genres, Length: 3883, dtype: object


In [31]:
movies['genres'] = movies['genres'].fillna("").astype('str')
print(movies['genres'])

0        ['Animation', "Children's", 'Comedy']
1       ['Adventure', "Children's", 'Fantasy']
2                        ['Comedy', 'Romance']
3                          ['Comedy', 'Drama']
4                                   ['Comedy']
                         ...                  
3878                                ['Comedy']
3879                                 ['Drama']
3880                                 ['Drama']
3881                                 ['Drama']
3882                     ['Drama', 'Thriller']
Name: genres, Length: 3883, dtype: object


we use TF-IDF for category to vector mapping and cosine similarity for vector similarity 


for detailed info about TF-IDF and sklearn, https://m.blog.naver.com/myincizor/221644893910

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,2), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['genres'])

In [36]:
# calculate cosine similarity 
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

Now assign 15 most similar movies according to cosine similarity score 

In [46]:
# create (movie_title, index) data for index reference 
titles = movies['title']
inds = pd.Series(data=movies.index, index=movies['title'])

def get_rec(title):
    idx = inds[title]
    scores = list(enumerate(cosine_sim[idx]))
    scores.sort(key = lambda x:x[1], reverse=True)
    scores = scores[1:16] # idx 0 is itself 
    rec_ids = [i[0] for i in scores]
    return titles.iloc[rec_ids]

movies['title'].head(-30)

0                                        Toy Story (1995)
1                                          Jumanji (1995)
2                                 Grumpier Old Men (1995)
3                                Waiting to Exhale (1995)
4                      Father of the Bride Part II (1995)
                              ...                        
3848                      Hellbound: Hellraiser II (1988)
3849                 Hellraiser III: Hell on Earth (1992)
3850    Faraway, So Close (In Weiter Ferne, So Nah!) (...
3851                                   Beach Party (1963)
3852                                  Bikini Beach (1964)
Name: title, Length: 3853, dtype: object

In [48]:
get_rec('Toy Story (1995)')

1050            Aladdin and the King of Thieves (1996)
2072                          American Tail, An (1986)
2073        American Tail: Fievel Goes West, An (1991)
2285                         Rugrats Movie, The (1998)
2286                              Bug's Life, A (1998)
3045                                Toy Story 2 (1999)
3542                             Saludos Amigos (1943)
3682                                Chicken Run (2000)
3685    Adventures of Rocky and Bullwinkle, The (2000)
236                              Goofy Movie, A (1995)
12                                        Balto (1995)
241                            Gumby: The Movie (1995)
310                          Swan Princess, The (1994)
592                                   Pinocchio (1940)
612                             Aristocats, The (1970)
Name: title, dtype: object

In [47]:
get_rec('Hellraiser III: Hell on Earth (1992)')

175                              Lord of Illusions (1995)
186                                  Prophecy, The (1995)
218                                   Castle Freak (1995)
324     Tales From the Crypt Presents: Demon Knight (1...
362                     Wes Craven's New Nightmare (1994)
393                                      Fear, The (1995)
561                                         Cronos (1992)
602                Candyman: Farewell to the Flesh (1995)
765     Spirits of the Dead (Tre Passi nel Delirio) (1...
830                            Eyes Without a Face (1959)
831     Tales from the Crypt Presents: Bordello of Blo...
868                                     Relic, The (1997)
1089        Children of the Corn IV: The Gathering (1996)
1112                                      Fog, The (1980)
1114                                  Howling, The (1980)
Name: title, dtype: object