In [1]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [6]:
# Absolute path of the `movies.dat` file inside the `ml-10M100K` folder,
# anchored at the directory the kernel is currently running in.
movies_path = os.path.join(
    os.getcwd(),
    os.path.join('ml-10M100K', 'movies.dat'),
)

In [7]:
# Display the resolved path so the location about to be read can be checked.
movies_path

'/home/dawid/projects/PiSR_1/ml-10M100K/movies.dat'

In [11]:
# The file has no header row and separates fields with the two-character
# token '::', so column names are supplied here. Separators longer than one
# character are only supported by pandas' Python parser; requesting it
# explicitly avoids the ParserWarning emitted when pandas has to fall back
# from the default C engine.
movies = pd.read_csv(
    movies_path,
    delimiter='::',
    engine='python',
    names=['id', 'title', 'genres'],
)

  """Entry point for launching an IPython kernel.


In [12]:
# Preview the first rows to check that id, title and genres were parsed into
# separate columns.
movies.head()

Unnamed: 0,id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [23]:
# (rows, columns) of the frame. The column count depends on execution order:
# the frame is loaded with three columns and gains a fourth once the
# `genres_space` column has been added.
movies.shape

(10681, 4)

In [13]:
# Turn the pipe-separated genre list into a space-separated string so that a
# word-level vectorizer can tokenise it. The vectorised `.str` accessor works
# on the whole column at once instead of calling a Python lambda per row;
# `regex=False` makes '|' a literal character rather than regex alternation.
movies['genres_space'] = movies['genres'].str.replace('|', ' ', regex=False)

In [14]:
# Inspect the first 50 rows to verify that `genres_space` mirrors `genres`
# with the '|' separators replaced by spaces.
movies.head(50)

Unnamed: 0,id,title,genres,genres_space
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy,Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance,Comedy Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy,Comedy
5,6,Heat (1995),Action|Crime|Thriller,Action Crime Thriller
6,7,Sabrina (1995),Comedy|Romance,Comedy Romance
7,8,Tom and Huck (1995),Adventure|Children,Adventure Children
8,9,Sudden Death (1995),Action,Action
9,10,GoldenEye (1995),Action|Adventure|Thriller,Action Adventure Thriller


In [17]:
# Vectorise each movie's genre string into TF-IDF weights over unigrams and
# bigrams. `min_df=1` keeps every term that occurs in at least one movie; an
# integer 0 selects the same vocabulary but is rejected by scikit-learn
# releases that validate integer `min_df` as >= 1.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['genres_space'])

# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# computed by linear_kernel equals the cosine similarity between two movies.
# The result is a dense (n_movies x n_movies) matrix.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [18]:
# Lookup tables used by the recommender: `indices` maps a movie title to its
# row label in `movies`, and `titles` is read positionally to turn row
# numbers back into titles.
indices = pd.Series(data=movies.index, index=movies['title'])
titles = movies['title']

def genre_recommendations(title, top_n=20, sim_matrix=None, title_index=None, title_lookup=None):
    """Return the titles of the movies whose genres are most similar to `title`.

    Parameters
    ----------
    title : str
        Exact movie title to look up, e.g. ``'Toy Story (1995)'``.
    top_n : int, default 20
        Maximum number of recommendations to return.
    sim_matrix : 2-D array-like, optional
        Pairwise similarity scores indexed by row position. Defaults to the
        notebook-level ``cosine_sim``.
    title_index : pandas.Series, optional
        Maps a title to its row position. Defaults to the notebook-level
        ``indices``.
    title_lookup : pandas.Series, optional
        Titles ordered by row position. Defaults to the notebook-level
        ``titles``.

    Returns
    -------
    pandas.Series
        Up to `top_n` titles, most similar first. The queried movie itself is
        never part of the result. Movies with equal scores keep their
        original row order.

    Raises
    ------
    KeyError
        If `title` is not present in the title index.
    ValueError
        If `top_n` is negative.
    """
    if top_n < 0:
        raise ValueError('top_n must be non-negative')

    # Fall back to the notebook-level objects only when no override is given.
    sim_matrix = cosine_sim if sim_matrix is None else sim_matrix
    title_index = indices if title_index is None else title_index
    title_lookup = titles if title_lookup is None else title_lookup

    idx = title_index[title]
    # A title that occurs more than once yields a Series of positions;
    # use the first occurrence so the row lookup below stays one-dimensional.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Drop the queried movie by position instead of assuming it sorts first:
    # movies with identical genres tie at the same score, so the query is not
    # guaranteed to be the top entry after sorting.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(sim_matrix[idx]) if pos != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )
    movie_indices = [pos for pos, _ in ranked[:top_n]]
    return title_lookup.iloc[movie_indices]

In [19]:
# Show the five closest genre matches for Toy Story.
genre_recommendations('Toy Story (1995)').head(5)

2210                                          Antz (1998)
3029                                   Toy Story 2 (1999)
3665       Adventures of Rocky and Bullwinkle, The (2000)
3924                     Emperor's New Groove, The (2000)
8577    Kiki's Delivery Service (Majo no takkyûbin) (1...
Name: title, dtype: object

In [21]:
# Keep only the movies whose genre string is exactly the one Toy Story has.
filtered_movies = movies.loc[
    movies['genres'].eq('Adventure|Animation|Children|Comedy|Fantasy')
]

In [22]:
# List the exact-genre matches to cross-check them against the recommender's
# output for Toy Story.
filtered_movies

Unnamed: 0,id,title,genres,genres_space
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Adventure Animation Children Comedy Fantasy
2210,2294,Antz (1998),Adventure|Animation|Children|Comedy|Fantasy,Adventure Animation Children Comedy Fantasy
3029,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Adventure Animation Children Comedy Fantasy
3665,3754,"Adventures of Rocky and Bullwinkle, The (2000)",Adventure|Animation|Children|Comedy|Fantasy,Adventure Animation Children Comedy Fantasy
3924,4016,"Emperor's New Groove, The (2000)",Adventure|Animation|Children|Comedy|Fantasy,Adventure Animation Children Comedy Fantasy
8577,26662,Kiki's Delivery Service (Majo no takkyûbin) (1...,Adventure|Animation|Children|Comedy|Fantasy,Adventure Animation Children Comedy Fantasy
9500,45074,"Wild, The (2006)",Adventure|Animation|Children|Comedy|Fantasy,Adventure Animation Children Comedy Fantasy
9584,47124,"Ant Bully, The (2006)",Adventure|Animation|Children|Comedy|Fantasy,Adventure Animation Children Comedy Fantasy
9895,53121,Shrek the Third (2007),Adventure|Animation|Children|Comedy|Fantasy,Adventure Animation Children Comedy Fantasy
