# Collaborative filtering

In [1]:
import pandas as pd
import numpy as np
from scipy import sparse

In [2]:
# Read the two raw tables from the working directory.
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

# Inner-join on whatever column(s) the two tables share (merge's default),
# then discard the two columns the collaborative-filtering section does not use.
ratings = movies.merge(ratings).drop(columns=['genres', 'timestamp'])
print(ratings.shape)
ratings.head()

(100836, 4)


Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1,4.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),7,4.5
3,1,Toy Story (1995),15,2.5
4,1,Toy Story (1995),17,4.5


In [3]:
# A title must have at least this many ratings to stay in the matrix.
MIN_RATINGS_PER_TITLE = 10

# One row per user, one column per title; NaN marks "user did not rate this title".
user_ratings = ratings.pivot_table(index=['userId'],columns=['title'],values='rating')
print("Before: ",user_ratings.shape)

# Drop sparsely rated titles, then treat every remaining missing rating as 0.
# A single fillna is sufficient: after it no NaN is left in the frame.
user_ratings = user_ratings.dropna(thresh=MIN_RATINGS_PER_TITLE, axis=1).fillna(0)
print("After: ",user_ratings.shape)

Before:  (610, 9719)
After:  (610, 2269)


In [4]:
# Title-by-title Pearson correlation computed over the columns of user_ratings.
corr_matrix = user_ratings.corr(method="pearson")
corr_matrix.iloc[:10]

title,"'burbs, The (1989)",(500) Days of Summer (2009),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),12 Years a Slave (2013),127 Hours (2010),...,Zack and Miri Make a Porno (2008),Zero Dark Thirty (2012),Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The (1989)",1.0,0.063117,-0.023768,0.143482,0.011998,0.087931,0.224052,0.034223,0.009277,0.008331,...,0.017477,0.03247,0.134701,0.153158,0.101301,0.049897,0.003233,0.187953,0.062174,0.353194
(500) Days of Summer (2009),0.063117,1.0,0.142471,0.273989,0.19396,0.148903,0.142141,0.159756,0.135486,0.200135,...,0.374515,0.178655,0.068407,0.414585,0.355723,0.252226,0.216007,0.053614,0.241092,0.125905
10 Cloverfield Lane (2016),-0.023768,0.142471,1.0,-0.005799,0.112396,0.006139,-0.016835,0.031704,-0.024275,0.272943,...,0.242663,0.099059,-0.023477,0.272347,0.241751,0.195054,0.319371,0.177846,0.096638,0.002733
10 Things I Hate About You (1999),0.143482,0.273989,-0.005799,1.0,0.24467,0.223481,0.211473,0.011784,0.091964,0.043383,...,0.243118,0.104858,0.13246,0.091853,0.158637,0.281934,0.050031,0.121029,0.130813,0.110612
"10,000 BC (2008)",0.011998,0.19396,0.112396,0.24467,1.0,0.234459,0.119132,0.059187,-0.025882,0.089328,...,0.260261,0.087592,0.094913,0.184521,0.242299,0.240231,0.094773,0.088045,0.203002,0.083518
101 Dalmatians (1996),0.087931,0.148903,0.006139,0.223481,0.234459,1.0,0.285112,0.119843,0.072399,0.029967,...,0.114968,0.077232,0.096294,0.067134,0.113224,0.184324,0.054024,0.047804,0.156932,0.078734
101 Dalmatians (One Hundred and One Dalmatians) (1961),0.224052,0.142141,-0.016835,0.211473,0.119132,0.285112,1.0,0.134037,0.017264,-0.046277,...,0.120302,0.125816,0.049818,0.08365,0.171654,0.27426,0.077594,0.085606,0.24882,0.171118
12 Angry Men (1957),0.034223,0.159756,0.031704,0.011784,0.059187,0.119843,0.134037,1.0,0.132979,0.058862,...,0.104518,0.028415,0.079905,0.241435,0.144652,0.122107,0.056742,-0.001708,0.074306,0.102744
12 Years a Slave (2013),0.009277,0.135486,-0.024275,0.091964,-0.025882,0.072399,0.017264,0.132979,1.0,0.249931,...,0.024045,0.038127,0.013786,0.190366,0.10415,0.017351,0.063325,0.002528,0.037469,0.004213
127 Hours (2010),0.008331,0.200135,0.272943,0.043383,0.089328,0.029967,-0.046277,0.058862,0.249931,1.0,...,0.223135,0.154299,0.012907,0.364841,0.198926,0.091416,0.225747,0.128638,0.153335,0.002912


In [5]:
def get_similar(movie_name, rating, neutral_rating=2.5, corr=None):
    """Score every title by its correlation with one rated movie.

    The correlation column of ``movie_name`` is scaled by how far ``rating``
    lies from ``neutral_rating``: a rating above the neutral point yields
    positive scores for positively correlated titles, a rating below it flips
    the sign.

    Parameters
    ----------
    movie_name : str
        Column label to look up in the correlation matrix.
    rating : float
        Rating given to ``movie_name``.
    neutral_rating : float, default 2.5
        Rating that counts as "no preference" and therefore contributes a
        weight of zero.
    corr : pandas.DataFrame, optional
        Square title-by-title correlation matrix. When omitted, the
        notebook-level ``corr_matrix`` is used.

    Returns
    -------
    pandas.Series
        Scores indexed by title, sorted from highest to lowest.

    Raises
    ------
    KeyError
        If ``movie_name`` is not a column of the correlation matrix.
    """
    if corr is None:
        # Resolved at call time so the function follows the current global matrix.
        corr = corr_matrix
    if movie_name not in corr.columns:
        raise KeyError(
            f"{movie_name!r} is not a column of the correlation matrix"
        )
    weight = rating - neutral_rating
    return (corr[movie_name] * weight).sort_values(ascending=False)

In [6]:
# (title, rating) pairs describing one hypothetical user's taste.
romantic_lover = [
    ("(500) Days of Summer (2009)", 5),
    ("Alice in Wonderland (2010)", 3),
    ("Aliens (1986)", 1),
    ("2001: A Space Odyssey (1968)", 2),
]

# Build the frame in one go (one row of scores per rated movie) instead of
# concatenating onto an initially empty DataFrame inside a loop.
# reset_index keeps the 0..n-1 row labels the rest of the notebook displays.
similar_movies = pd.DataFrame(
    [get_similar(movie, rating) for movie, rating in romantic_lover]
).reset_index(drop=True)

similar_movies.head(20)

title,(500) Days of Summer (2009),Silver Linings Playbook (2012),Adventureland (2009),Up in the Air (2009),50/50 (2011),"Descendants, The (2011)","Crazy, Stupid, Love. (2011)",About Time (2013),Toy Story 3 (2010),"Secret Life of Walter Mitty, The (2013)",...,"Remains of the Day, The (1993)",Leaving Las Vegas (1995),"Grifters, The (1990)","English Patient, The (1996)",Dances with Wolves (1990),Stargate (1994),"Madness of King George, The (1994)",Disclosure (1994),"Postman, The (Postino, Il) (1994)",Clear and Present Danger (1994)
0,2.5,1.254898,1.157021,1.129961,1.12389,1.082774,1.076731,1.050643,1.050386,1.043048,...,-0.127058,-0.131261,-0.134995,-0.135628,-0.142231,-0.14287,-0.144495,-0.160575,-0.161373,-0.20981
1,0.203998,0.191226,0.130105,0.221842,0.139619,0.058567,0.186331,0.148771,0.147115,0.17924,...,-0.004244,-0.022697,0.000935,-0.028072,0.013454,-0.010708,-0.022814,-0.030627,-0.025479,-0.036196
2,-0.062634,-0.131106,-0.09179,-0.2055,-0.112038,-0.187131,-0.131742,-0.057615,-0.170463,-0.160831,...,-0.066664,-0.093771,-0.308606,-0.328057,-0.170352,-0.32919,-0.110041,0.122334,-0.113838,-0.124716
3,-0.056808,-0.060217,-0.083101,-0.093265,-0.064954,-0.059233,-0.042563,-0.039608,-0.063763,-0.063478,...,-0.022207,-0.035515,-0.144882,-0.141102,-0.047911,-0.078467,-0.020044,0.022402,-0.061293,-0.023133


In [7]:
# Add up the per-seed-movie scores for every title, then show the ten largest totals.
total_scores = similar_movies.sum()
total_scores.sort_values(ascending=False).head(10)

title
(500) Days of Summer (2009)       2.584556
Alice in Wonderland (2010)        1.395229
Silver Linings Playbook (2012)    1.254800
Yes Man (2008)                    1.116264
Adventureland (2009)              1.112235
Marley & Me (2008)                1.108381
About Time (2013)                 1.102192
Crazy, Stupid, Love. (2011)       1.088757
50/50 (2011)                      1.086517
Help, The (2011)                  1.075963
dtype: float64

In [8]:
# (title, rating) pairs describing one hypothetical user's taste.
action_lover = [
    ("Amazing Spider-Man, The (2012)", 5),
    ("Mission: Impossible III (2006)", 4),
    ("Toy Story 3 (2010)", 2),
    ("2 Fast 2 Furious (Fast and the Furious 2, The) (2003)", 4),
]

# Build the frame in one go (one row of scores per rated movie) instead of
# concatenating onto an initially empty DataFrame inside a loop.
# reset_index keeps the 0..n-1 row labels used elsewhere in the notebook.
similar_movies = pd.DataFrame(
    [get_similar(movie, rating) for movie, rating in action_lover]
).reset_index(drop=True)

# Total score per title across all rated movies, best first.
similar_movies.sum().sort_values(ascending=False).head(20)

title
Amazing Spider-Man, The (2012)                           3.233134
Mission: Impossible III (2006)                           2.874798
2 Fast 2 Furious (Fast and the Furious 2, The) (2003)    2.701477
Over the Hedge (2006)                                    2.229721
Crank (2006)                                             2.176259
Mission: Impossible - Ghost Protocol (2011)              2.159666
Hancock (2008)                                           2.156098
The Amazing Spider-Man 2 (2014)                          2.153677
Hellboy (2004)                                           2.137518
Snakes on a Plane (2006)                                 2.137396
dtype: float64

# Content-based filtering

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
# Turn the '|'-delimited genre string of each movie into a space-delimited one.
movies['genres_combined'] = movies['genres'].map(
    lambda genre_str: genre_str.replace('|', ' ')
)

In [12]:
# Tokenise on whitespace only, so each space-separated genre label becomes
# exactly one feature. CountVectorizer's default token_pattern keeps only runs
# of two or more word characters, which would split a hyphenated label
# (e.g. "Sci-Fi", if present in the data) into two features and so count that
# genre twice in the similarity computation.
cv = CountVectorizer(token_pattern=r"\S+")

# Learn the genre vocabulary and build the movie-by-genre count matrix.
count_matrix = cv.fit_transform(movies['genres_combined'])

In [13]:
# Pairwise cosine similarity between every pair of rows of count_matrix;
# entry [i, j] compares row i with row j.
# NOTE(review): the result is square in the number of rows, so memory grows
# quadratically with the number of movies — confirm this fits for larger data.
cosine_sim = cosine_similarity(count_matrix)

In [14]:
def get_recommendations(title, cosine_sim = cosine_sim, top_n = 20):
    """Return the titles most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact value to look up in ``movies['title']``. If it occurs more than
        once, the first occurrence is used.
    cosine_sim : 2-D array-like
        Similarity matrix whose row/column ``i`` corresponds to the ``i``-th
        row of ``movies``. Defaults to the notebook-level matrix as it existed
        when this function was defined.
    top_n : int, default 20
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` titles ordered from most to least similar. The queried
        movie itself is never included.

    Raises
    ------
    IndexError
        If ``title`` does not occur in ``movies['title']``.
    """
    # Work with the row *position*, because cosine_sim and .iloc are positional
    # and the index labels of `movies` need not equal positions.
    positions = np.flatnonzero((movies['title'] == title).to_numpy())
    if positions.size == 0:
        raise IndexError(f"title {title!r} not found in movies['title']")
    idx = int(positions[0])

    scores = np.asarray(cosine_sim[idx], dtype=float).ravel()

    # Stable descending order: tied scores keep ascending row position.
    ranked = np.argsort(-scores, kind='stable')

    # Remove the queried movie explicitly. Dropping "the first result" is wrong
    # whenever another movie ties with it and sits earlier in the table.
    ranked = ranked[ranked != idx][:top_n]

    return movies['title'].iloc[ranked]

In [15]:
# Show the content-based neighbours of a single example movie.
query_title = 'Toy Story (1995)'
print("Top 20 similar movies to 'Toy Story (1995)':")
print(get_recommendations(query_title))

Top 20 similar movies to 'Toy Story (1995)':
1706                                          Antz (1998)
2355                                   Toy Story 2 (1999)
2809       Adventures of Rocky and Bullwinkle, The (2000)
3000                     Emperor's New Groove, The (2000)
3568                                Monsters, Inc. (2001)
6194                                     Wild, The (2006)
6486                               Shrek the Third (2007)
6948                       Tale of Despereaux, The (2008)
7760    Asterix and the Vikings (Astérix et les Viking...
8219                                         Turbo (2013)
8927                             The Good Dinosaur (2015)
9430                                         Moana (2016)
3194                                         Shrek (2001)
5490    Twelve Tasks of Asterix, The (Les douze travau...
5977                                       Valiant (2005)
6260                                Ant Bully, The (2006)
6448           TMNT (Teenag