In [1]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [2]:
movies_path = os.path.join(os.getcwd(), 'dataset_small', 'movies.csv')
tags_path = os.path.join(os.getcwd(), 'dataset_small', 'tags.csv')

In [3]:
movies_path

'/home/dawid/projects/PiSR_1/dataset_small/movies.csv'

In [4]:
movies = pd.read_csv(movies_path)
tags = pd.read_csv(tags_path)

In [5]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [6]:
movies['genres_space'] = movies.apply(lambda x : x['genres'].replace('|', ' '), axis=1)

In [36]:
movies.head()

Unnamed: 0,movieId,title,genres,genres_space
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy,Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance,Comedy Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy,Comedy


In [37]:
tags_gp = tags.groupby('movieId')

In [38]:
movies_dictionary = dict(zip(movies['movieId'], movies['title']))
title_dictionary = dict(zip(movies['title'], movies['movieId']))

In [39]:
indices = [ index for index, row in tags_gp]
titles = [ movies_dictionary.get(index) for index, row in tags_gp]
merged_tags = [ ' '.join(set(row['tag'])) for index, row in tags_gp]

In [83]:
merged_tags[:5]

['fun pixar',
 'fantasy Robin Williams game magic board game',
 'moldy old',
 'pregnancy remake',
 'remake']

In [41]:
indices[:5]

[1, 2, 3, 5, 7]

In [76]:
titles[:10]

['Toy Story (1995)',
 'Jumanji (1995)',
 'Grumpier Old Men (1995)',
 'Father of the Bride Part II (1995)',
 'Sabrina (1995)',
 'American President, The (1995)',
 'Nixon (1995)',
 'Casino (1995)',
 'Sense and Sensibility (1995)',
 'Get Shorty (1995)']

In [43]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(merged_tags)

In [44]:
tfidf_matrix.shape

(1572, 4328)

In [45]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [74]:
cosine_sim.shape

(1572, 1572)

In [46]:
indices_pd = pd.Series(indices, index=titles)

In [47]:
titles_pd = pd.Series(titles)

In [48]:
indices_pd.head()

Toy Story (1995)                      1
Jumanji (1995)                        2
Grumpier Old Men (1995)               3
Father of the Bride Part II (1995)    5
Sabrina (1995)                        7
dtype: int64

In [106]:
def get_recommendations(title):
    idx = indices_pd[title]
    print(idx)
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles_pd.iloc[movie_indices]

In [107]:
get_recommendations('Sabrina (1995)').head(10)

7


180               Godfather, The (1972)
301                   Goodfellas (1990)
322            Miller's Crossing (1990)
398                Donnie Brasco (1997)
526           Married to the Mob (1988)
528               My Blue Heaven (1990)
306      Godfather: Part II, The (1974)
108                Carlito's Way (1993)
1390           American Gangster (2007)
487     Godfather: Part III, The (1990)
dtype: object