In [1]:
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances

### Load `movies.csv` and `ratings.csv`

We'll be using the [MovieLens](https://grouplens.org/datasets/movielens/) dataset for building our recommendation engine.

In [2]:
# Read the MovieLens movie table from the local "latest-small" folder
# and preview its first five rows.
movies = pd.read_csv(filepath_or_buffer='ml-latest-small/movies.csv')
movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# Read the MovieLens ratings table, report its (rows, columns) shape,
# then preview the first five rows.
ratings = pd.read_csv(filepath_or_buffer='ml-latest-small/ratings.csv')
print(ratings.shape)
ratings.head(5)

(100004, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


### Drop unnecessary columns

In [4]:
# Remove the columns the recommender does not use.
# A non-mutating drop with errors='ignore' keeps this cell idempotent: running it
# a second time (after the columns are already gone) no longer raises KeyError,
# which the previous in-place drop did.
ratings = ratings.drop(columns='timestamp', errors='ignore')
movies = movies.drop(columns='genres', errors='ignore')

### Merge `movies` and `ratings`

In [6]:
# Attach each rating to its movie row via the shared movieId key
# (inner join, so only movieIds present in both tables survive).
df = movies.merge(ratings, how='inner', on='movieId')

### Create pivot table

In [9]:
# Reshape to a user x title table of ratings. Cells with no rating are NaN;
# if a (userId, title) pair occurs more than once, pandas' default
# aggregation (mean) combines the values.
pivot = df.pivot_table(
    values='rating',
    index='userId',
    columns='title',
)

### Create sparse matrix from pivot table using `sparse.csr_matrix()`

In [11]:
# Build a title x user CSR matrix, treating missing ratings as 0.
#
# The result is stored under the name `sparse` because the following cell reads
# that name. That assignment hides the `scipy.sparse` module imported at the top
# of the notebook, so the module is re-imported here under an alias: without it,
# re-running this cell would call `.csr_matrix` on the matrix itself and fail
# with AttributeError.
from scipy import sparse as sp_sparse

sparse = sp_sparse.csr_matrix(pivot.T.fillna(0))

### Calculate cosine distance (1 − cosine similarity) using `pairwise_distances()`

In [12]:
# Pairwise cosine *distance* between the rows of the title x user matrix:
# 0 means identical rating direction, larger values mean less similar.
distances = pairwise_distances(
    X=sparse,
    metric='cosine',
)

### Create distances DataFrame

In [14]:
# Label both axes of the raw distance array with the movie titles so that a
# movie's neighbours can be looked up by name.
distance = pd.DataFrame(
    data=distances,
    index=pivot.columns,
    columns=pivot.columns,
)

In [15]:
# Show the labelled title-by-title distance table (the rendering is truncated by pandas).
distance

title,"""Great Performances"" Cats (1998)",$9.99 (2008),'Hellboy': The Seeds of Creation (2004),'Neath the Arizona Skies (1934),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),...,Zulu (1964),Zulu (2013),[REC] (2007),eXistenZ (1999),loudQUIETloud: A Film About the Pixies (2006),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931),İtirazım Var (2014)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Great Performances"" Cats (1998)",0.000000,1.000000,1.000000,0.835601,0.979609,1.000000,0.985954,1.000000,1.000000,0.996834,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
$9.99 (2008),1.000000,0.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.920526,1.000000,0.843670,...,1.000000,1.000000,1.000000,1.000000,1.000000,0.986101,1.000000,0.941782,1.000000,1.000000
'Hellboy': The Seeds of Creation (2004),1.000000,1.000000,0.000000,1.000000,1.000000,0.000000,1.000000,0.782643,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
'Neath the Arizona Skies (1934),0.835601,1.000000,1.000000,0.000000,0.875965,1.000000,0.914564,1.000000,1.000000,0.980741,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
'Round Midnight (1986),0.979609,1.000000,1.000000,0.875965,0.000000,1.000000,0.989403,0.856214,1.000000,0.863837,...,1.000000,1.000000,1.000000,0.878433,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
'Salem's Lot (2004),1.000000,1.000000,0.000000,1.000000,1.000000,0.000000,1.000000,0.782643,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
'Til There Was You (1997),0.985954,1.000000,1.000000,0.914564,0.989403,1.000000,0.000000,0.888580,0.704042,0.998355,...,1.000000,1.000000,1.000000,0.842995,1.000000,1.000000,1.000000,0.918380,1.000000,1.000000
"'burbs, The (1989)",1.000000,0.920526,0.782643,1.000000,0.856214,0.782643,0.888580,0.000000,0.790848,0.912091,...,1.000000,1.000000,1.000000,0.915675,1.000000,0.876060,1.000000,0.668337,1.000000,1.000000
'night Mother (1986),1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.704042,0.790848,0.000000,1.000000,...,1.000000,1.000000,1.000000,0.823168,1.000000,1.000000,1.000000,0.785502,1.000000,1.000000
(500) Days of Summer (2009),0.996834,0.843670,1.000000,0.980741,0.863837,1.000000,0.998355,0.912091,1.000000,0.000000,...,1.000000,0.903704,0.872122,0.983483,0.845926,0.855039,0.865185,0.935092,1.000000,0.865185


### Evaluate recommender performance (qualitative spot checks)

In [17]:
# Distance from every title to "Matrix, The (1999)" (one column of the table).
distance.loc[:, 'Matrix, The (1999)']

title
"Great Performances" Cats (1998)                                          1.000000
$9.99 (2008)                                                              0.950621
'Hellboy': The Seeds of Creation (2004)                                   0.941914
'Neath the Arizona Skies (1934)                                           1.000000
'Round Midnight (1986)                                                    0.949568
'Salem's Lot (2004)                                                       0.941914
'Til There Was You (1997)                                                 0.977668
'burbs, The (1989)                                                        0.816670
'night Mother (1986)                                                      0.974848
(500) Days of Summer (2009)                                               0.742143
*batteries not included (1987)                                            0.855315
...And God Spoke (1993)                                                   0.92739

In [21]:
search = 'Matrix'

# Literal substring match (regex=False) so characters such as '(' or '.' in a
# query are not interpreted as a regular expression; na=False skips missing titles.
name_hit = movies['title'].str.contains(search, regex=False, na=False)
# `pivot` and `distance` only have columns for titles that appear in the merged
# ratings table; any other title from `movies` would raise KeyError below.
has_ratings = movies['title'].isin(pivot.columns)

for title in movies.loc[name_hit & has_ratings, 'title'].drop_duplicates():
    print(title, 'Rating:', pivot[title].mean(), 'from', pivot[title].count(), 'reviews')
    print('-------------------------------------------------------------')
    # Remove the movie itself by label instead of skipping position 0 after the
    # sort: another title can tie at distance 0, in which case the movie is not
    # guaranteed to sort first and a genuine neighbour would be discarded.
    nearest = distance[title].drop(title).sort_values().head(10)
    print('Similar movies:', nearest, '\n')

Matrix, The (1999) Rating: 4.183397683397684 from 259 reviews
-------------------------------------------------------------
Similar movies: title
Lord of the Rings: The Fellowship of the Ring, The (2001)    0.308156
Lord of the Rings: The Two Towers, The (2002)                0.325514
Fight Club (1999)                                            0.345182
Back to the Future (1985)                                    0.348224
Lord of the Rings: The Return of the King, The (2003)        0.352642
Star Wars: Episode V - The Empire Strikes Back (1980)        0.353922
Star Wars: Episode IV - A New Hope (1977)                    0.365303
Saving Private Ryan (1998)                                   0.370317
Gladiator (2000)                                             0.378954
Shrek (2001)                                                 0.379845
Name: Matrix, The (1999), dtype: float64 

Matrix Reloaded, The (2003) Rating: 3.268292682926829 from 82 reviews
-----------------------------------------

In [24]:
search = 'Godfather'

# Literal substring match (regex=False) so characters such as '(' or '.' in a
# query are not interpreted as a regular expression; na=False skips missing titles.
name_hit = movies['title'].str.contains(search, regex=False, na=False)
# `pivot` and `distance` only have columns for titles that appear in the merged
# ratings table; any other title from `movies` would raise KeyError below.
has_ratings = movies['title'].isin(pivot.columns)

for title in movies.loc[name_hit & has_ratings, 'title'].drop_duplicates():
    print(title, 'Rating:', pivot[title].mean(), 'from', pivot[title].count(), 'reviews')
    print('-------------------------------------------------------------')
    # Remove the movie itself by label instead of skipping position 0 after the
    # sort: another title can tie at distance 0, in which case the movie is not
    # guaranteed to sort first and a genuine neighbour would be discarded.
    nearest = distance[title].drop(title).sort_values().head(10)
    print('Similar movies:', nearest, '\n')

Godfather, The (1972) Rating: 4.4875 from 200 reviews
-------------------------------------------------------------
Similar movies: title
Godfather: Part II, The (1974)                                                    0.226315
Goodfellas (1990)                                                                 0.379651
One Flew Over the Cuckoo's Nest (1975)                                            0.431756
American Beauty (1999)                                                            0.442003
Star Wars: Episode IV - A New Hope (1977)                                         0.453250
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)    0.461815
Saving Private Ryan (1998)                                                        0.465316
Apocalypse Now (1979)                                                             0.465653
Reservoir Dogs (1992)                                                             0.468287
Usual Suspects, The (1995)                 