In [23]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [4]:
# Project root, relative to this notebook; the CSV files are read from <base_path>/data/.
base_path = '..'

In [5]:

# Load the three data files. Each one is tab-separated and latin-1 encoded,
# so both `sep` and `encoding` are passed explicitly to every read.
# Only the listed columns are loaded (the ratings timestamp is left out).

# Ratings
ratings = pd.read_csv(
    base_path + '//data//ratings.csv',
    usecols=['user_id', 'movie_id', 'rating'],
    encoding='latin-1',
    sep='\t',
)

# Users
users = pd.read_csv(
    base_path + '//data//users.csv',
    usecols=['user_id', 'gender', 'zipcode', 'age_desc', 'occ_desc'],
    encoding='latin-1',
    sep='\t',
)

# Movies
movies = pd.read_csv(
    base_path + '//data//movies.csv',
    usecols=['movie_id', 'title', 'genres'],
    encoding='latin-1',
    sep='\t',
)

In [9]:
# Break each '|'-delimited genre string into a list of genre names.
movies['genres'] = movies['genres'].str.split(pat='|')

In [18]:
# Replace missing genres with an empty string, then cast every entry to its
# string form so the column can be fed to a text vectorizer.
movies['genres'] = movies['genres'].fillna(value="").astype('str')

In [19]:
# Build TF-IDF features from the genre text using word unigrams and bigrams.
# min_df=1 keeps every term that appears in at least one document. An integer
# min_df must be >= 1: scikit-learn's parameter validation rejects 0, and 1
# yields the same vocabulary.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['genres'])

In [22]:
# NOTE(review): the slice 1:1 selects zero rows, so this displays an empty
# matrix whose repr only reveals the number of feature columns.
# `tfidf_matrix.shape` was presumably intended — confirm.
tfidf_matrix[1:1]

<0x127 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [26]:
# Pairwise dot products between every pair of movie TF-IDF vectors.
# TfidfVectorizer L2-normalises rows by default, so these dot products are
# cosine similarities (note the 1.0 diagonal in the preview).
cosine_sim = linear_kernel(tfidf_matrix, Y=tfidf_matrix)

# Preview the top-left 5x5 corner of the similarity matrix.
cosine_sim[0:5, 0:5]

array([[1.        , 0.14193614, 0.09010857, 0.1056164 , 0.23523322],
       [0.14193614, 1.        , 0.        , 0.        , 0.        ],
       [0.09010857, 0.        , 1.        , 0.1719888 , 0.38306058],
       [0.1056164 , 0.        , 0.1719888 , 1.        , 0.4489859 ],
       [0.23523322, 0.        , 0.38306058, 0.4489859 , 1.        ]])

In [44]:
# Title column of `movies`, exposed as `titles` for the lookup cells below.
# `title` is kept as an alias so any cell that used the old name still works.
titles = movies['title']
title = titles

# Reverse lookup: movie title -> index label of that movie's row in `movies`.
indices = pd.Series(movies.index, index=movies['title'])

In [53]:
def genre_recommendations(title, top_n=20, sim_matrix=None, title_index=None, all_titles=None):
    """Return the titles of the movies most similar to ``title`` by genre.

    Parameters
    ----------
    title : str
        Exact title of the movie to find recommendations for.
    top_n : int, default 20
        Maximum number of recommendations to return.
    sim_matrix : 2-D array-like, optional
        Pairwise similarity scores, indexed positionally. Defaults to the
        notebook-level ``cosine_sim``.
    title_index : pandas.Series, optional
        Maps a title to the row position of that movie. Defaults to the
        notebook-level ``indices``.
    all_titles : pandas.Series, optional
        Movie titles in the same row order as ``sim_matrix``. Defaults to
        ``movies['title']``.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` titles, most similar first, never including the
        queried movie itself.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``title_index``.
    """
    # Fall back to the notebook-level objects when no explicit data is given.
    if sim_matrix is None:
        sim_matrix = cosine_sim
    if title_index is None:
        title_index = indices
    if all_titles is None:
        all_titles = movies['title']

    if title not in title_index.index:
        raise KeyError(f"Movie title not found: {title!r}")

    idx = title_index[title]
    # A duplicated title makes the lookup return a Series; use the first match.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Rank every movie by its similarity to the queried one (stable sort, so
    # ties keep their original row order).
    ranked = sorted(enumerate(sim_matrix[idx]), key=lambda pair: pair[1], reverse=True)

    # Exclude the queried movie by position rather than assuming it ranks
    # first: movies with identical genres tie at the top score, so a tied
    # movie with a lower row position would otherwise be dropped instead.
    movie_indices = [pos for pos, _ in ranked if pos != idx][:top_n]
    return all_titles.iloc[movie_indices]

In [68]:
# Display up to 20 genre-based recommendations for Toy Story.
genre_recommendations('Toy Story (1995)').head(20)

1050               Aladdin and the King of Thieves (1996)
2072                             American Tail, An (1986)
2073           American Tail: Fievel Goes West, An (1991)
2285                            Rugrats Movie, The (1998)
2286                                 Bug's Life, A (1998)
3045                                   Toy Story 2 (1999)
3542                                Saludos Amigos (1943)
3682                                   Chicken Run (2000)
3685       Adventures of Rocky and Bullwinkle, The (2000)
236                                 Goofy Movie, A (1995)
12                                           Balto (1995)
241                               Gumby: The Movie (1995)
310                             Swan Princess, The (1994)
592                                      Pinocchio (1940)
612                                Aristocats, The (1970)
700                               Oliver & Company (1988)
876     Land Before Time III: The Time of the Great Gi...
1010          

In [71]:
# Inspect the row (and genres) of the first recommended movie.
movies.loc[movies['title'].eq('Aladdin and the King of Thieves (1996)')]

Unnamed: 0,movie_id,title,genres
1050,1064,Aladdin and the King of Thieves (1996),"['Animation', ""Children's"", 'Comedy']"


In [72]:
# Inspect the row (and genres) of the queried movie for comparison.
movies.loc[movies['title'].eq('Toy Story (1995)')]

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),"['Animation', ""Children's"", 'Comedy']"


# Both movies have exactly the same genres, so the genre-based recommender is working as intended.
# Its limitation: it matches on genre alone, so it cannot capture a user's tastes or recommend movies across genres.