In [64]:
import pandas as pd
import numpy as np

In [16]:
movies = pd.read_csv("datasets/movies.csv")
ratings = pd.read_csv("datasets/ratings.csv")
tags = pd.read_csv("datasets/tags.csv")

movies.head(2)
ratings.head(2)
tags.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [21]:
tags.isnull().sum()

userId       0
movieId      0
tag          0
timestamp    0
dtype: int64

In [17]:
merged_df = pd.merge(movies, tags, on='movieId')


movies_with_tags = merged_df.groupby(['movieId', 'title', 'genres'])['tag'].apply(lambda x: ', '.join(x)).reset_index()

movies_with_tags.rename(columns={'tag': 'tags'}, inplace=True)


In [22]:
movies_with_tags['genres'] = movies_with_tags['genres'].str.replace('|', ',', regex=False)  

In [23]:
movies_with_tags['content'] = movies_with_tags['genres'] + ', ' + movies_with_tags['tags']

In [None]:
movies_with_tags

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()

tfidf_matrix = tfidf_vectorizer.fit_transform(movies_with_tags['content'])

In [26]:
from sklearn.metrics.pairwise import linear_kernel
content_similarity = linear_kernel(tfidf_matrix, tfidf_matrix)


In [94]:
def get_content_based_recommendations(movie_title, top_n):
    index = movies_with_tags[movies_with_tags['title'] == movie_title].index[0]
    similarity_scores = content_similarity[index]
    similar_indices = similarity_scores.argsort()[::-1][1:top_n + 1]
    recommendations = movies_with_tags.loc[similar_indices, 'title'].values.tolist()
    return recommendations

In [95]:
get_content_based_recommendations("Toy Story (1995)",4)

["Bug's Life, A (1998)", 'Toy Story 2 (1999)', 'Up (2009)', 'Sintel (2010)']

In [45]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [None]:
movies_with_title = 3

In [55]:
movies_with_ratings = pd.merge(ratings, movies, on='movieId')

In [56]:
movies_pivot= movies_with_ratings.pivot_table(columns='userId',index='title',values='rating') 
movies_pivot

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),,,,,,,,,,,...,,,,,,,,,,4.0
'Hellboy': The Seeds of Creation (2004),,,,,,,,,,,...,,,,,,,,,,
'Round Midnight (1986),,,,,,,,,,,...,,,,,,,,,,
'Salem's Lot (2004),,,,,,,,,,,...,,,,,,,,,,
'Til There Was You (1997),,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ (1999),,,,,,,,,,,...,,,5.0,,,,,4.5,,
xXx (2002),,,,,,,,,1.0,,...,,,,,,,,3.5,,2.0
xXx: State of the Union (2005),,,,,,,,,,,...,,,,,,,,,,1.5
¡Three Amigos! (1986),4.0,,,,,,,,,,...,,,,,,,,,,


In [57]:
movies_pivot.fillna(0,inplace=True)

In [58]:
from scipy.sparse import csr_matrix
movies_sparse = csr_matrix(movies_pivot)

In [59]:
from sklearn.neighbors import NearestNeighbors
model = NearestNeighbors(algorithm='brute')

In [60]:
model.fit(movies_sparse)

In [88]:
def get_collaborative_filtering_recommendations(movie_name,top_k):
    movie_id = np.where(movies_pivot.index == movie_name)[0][0]
    distances, suggestions = model.kneighbors(movies_pivot.iloc[movie_id, :].values.reshape(1, -1), n_neighbors=top_k + 1)    
    recommended_movies = [movies_pivot.index[suggestions[0][i]] for i in range(1, len(suggestions[0]))]
    return recommended_movies


In [90]:
get_collaborative_filtering_recommendations("Toy Story (1995)",5)

['Toy Story 2 (1999)',
 'Mission: Impossible (1996)',
 'Independence Day (a.k.a. ID4) (1996)',
 "Bug's Life, A (1998)",
 'Nutty Professor, The (1996)']

In [96]:
def get_hybrid_recommendations( movie_title, top_n):
    content_based_recommendations = get_content_based_recommendations(movie_title, top_n)
    collaborative_filtering_recommendations = get_collaborative_filtering_recommendations(movie_title, top_n)
    hybrid_recommendations = list(set(content_based_recommendations + collaborative_filtering_recommendations))
    return hybrid_recommendations[:top_n]

In [97]:
get_hybrid_recommendations('Toy Story 2 (1999)',10)

['Aladdin (1992)',
 'Jurassic Park III (2001)',
 'Mulan (1998)',
 'Peter Pan (1953)',
 'Aristocats, The (1970)',
 'Fantasia (1940)',
 'Pinocchio (1940)',
 'Fox and the Hound, The (1981)',
 "Emperor's New Groove, The (2000)",
 'Prince of Egypt, The (1998)']