In [37]:
from surprise import SVD, SVDpp
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

import matplotlib.pyplot as plt

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

import pandas as pd
import numpy as np

In [2]:
# Load the four CSV tables that sit next to each other in ../lecture-1.
# The tuple order must match the unpacking order on the left-hand side.
links, movies, ratings, tags = map(pd.read_csv, (
    '../lecture-1/links.csv',
    '../lecture-1/movies.csv',
    '../lecture-1/ratings.csv',
    '../lecture-1/tags.csv',
))

In [3]:
# Attach every rating to its movie row (left join on movieId), renumber the
# rows, then drop rows containing NaN -- e.g. movies nobody has rated, whose
# rating columns are empty after the join. Built as one chain so re-running
# the cell always starts from the untouched `movies` / `ratings` frames.
movies_with_ratings = (
    movies
    .join(ratings.set_index('movieId'), on='movieId')
    .reset_index(drop=True)
    .dropna()
)

In [4]:
movies_with_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982700.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847435000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1106636000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1510578000.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1305696000.0


In [5]:
# Three-column ratings table in (user, item, rating) order.
# Items are identified by movie *title*, not movieId.
dataset = movies_with_ratings[['userId', 'title', 'rating']].rename(
    columns={'userId': 'uid', 'title': 'iid', 'rating': 'rating'}
)

In [8]:
# Declare the rating scale (0.5 to 5.0) and wrap the frame as a Surprise Dataset.
# load_from_df reads the columns positionally as (user, item, rating), which is
# the order `dataset` was built in.
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [10]:
# Hold out 15% of the ratings for evaluation; random_state pins the split.
trainset, testset = train_test_split(data, test_size=.15, random_state=42)

In [11]:
%%time
algo = SVD(n_factors=20, n_epochs=20)
algo.fit(trainset)

CPU times: user 3.46 s, sys: 52.1 ms, total: 3.51 s
Wall time: 4.68 s


In [12]:
test_pred = algo.test(testset)

In [13]:
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8699


0.8699088985433424

In [14]:
algo.predict(uid=2.0, iid='Mortal Kombat (1995)').est

2.6246693735894704

In [15]:
# Score every movie the user has not rated yet with the fitted SVD model.
current_user_id = 2.0
user_movies = movies_with_ratings[movies_with_ratings.userId == current_user_id].title.unique()

# Set membership is O(1); `movie in user_movies` on the NumPy array rescans
# the whole array for each of the candidate titles.
seen_titles = set(user_movies)

# `titles` and `scores` stay index-aligned: scores[i] is the estimate for titles[i].
titles = [
    movie
    for movie in movies_with_ratings.title.unique()
    if movie not in seen_titles
]
scores = [algo.predict(uid=current_user_id, iid=movie).est for movie in titles]

In [19]:
sorted(scores)[-10:]

[4.284910236745244,
 4.304434638493604,
 4.308854113339664,
 4.332408968017837,
 4.3446078785760545,
 4.345952085977351,
 4.3606525484085665,
 4.368565231238774,
 4.422920699365787,
 4.442075166770861]

In [20]:
def change_string(s):
    """Turn a pipe-separated genre string into a space-separated one.

    Spaces and hyphens inside a genre name are removed so that each genre
    collapses to a single word (e.g. ``'Sci-Fi|Film-Noir'`` becomes
    ``'SciFi FilmNoir'``); the ``|`` separators then become single spaces.
    """
    # One pass: delete ' ' and '-', map '|' to ' '.
    genre_table = str.maketrans({' ': None, '-': None, '|': ' '})
    return s.translate(genre_table)

In [21]:
# One cleaned genre string per row of `movies`, in row order.
movie_genres = movies.genres.map(change_string).tolist()

In [22]:
movie_genres[0]

'Adventure Animation Children Comedy Fantasy'

In [29]:
# Bag-of-words over the cleaned genre strings: one row per row of `movies`.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

# Re-weight the raw genre counts with TF-IDF.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Index the genre vectors for nearest-neighbour lookup. Rows are positional,
# so the neighbour indices returned later map back to movies via movies.iloc.
neigh = NearestNeighbors(n_neighbors=20, n_jobs=-1, metric='euclidean') 
neigh.fit(X_train_tfidf)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='euclidean',
         metric_params=None, n_jobs=-1, n_neighbors=20, p=2, radius=1.0)

In [30]:
# Ad-hoc query: find the movies whose genre vector is closest to this genre mix.
test = change_string("Adventure|Comedy|Fantasy|Crime")

# Reuse the already-fitted vectoriser and TF-IDF weights (transform, not
# fit_transform) so the query lands in the same feature space as the index.
predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

# res is a (distances, indices) pair, each with one row for the single query.
res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [31]:
res

(array([[0.42079615, 0.53300564, 0.54288608, 0.54288608, 0.54288608,
         0.54288608, 0.54288608, 0.54288608, 0.54288608, 0.54288608,
         0.54288608, 0.54288608, 0.54288608, 0.54288608, 0.54288608,
         0.54288608, 0.54288608, 0.6188388 , 0.62682864, 0.62682864]]),
 array([[6774, 9096, 3576,  863, 2302, 2608, 7865, 3582, 8361, 3302, 5737,
         6723, 5636, 3376, 7496, 5627, 9717, 2206, 6133, 5832]]))

In [32]:
movies.iloc[res[1][0]]

Unnamed: 0,movieId,title,genres
6774,60074,Hancock (2008),Action|Adventure|Comedy|Crime|Fantasy
9096,143559,L.A. Slasher (2015),Comedy|Crime|Fantasy
3576,4899,Black Knight (2001),Adventure|Comedy|Fantasy
863,1136,Monty Python and the Holy Grail (1975),Adventure|Comedy|Fantasy
2302,3052,Dogma (1999),Adventure|Comedy|Fantasy
2608,3489,Hook (1991),Adventure|Comedy|Fantasy
7865,94015,Mirror Mirror (2012),Adventure|Comedy|Fantasy
3582,4911,Jabberwocky (1977),Adventure|Comedy|Fantasy
8361,109042,Knights of Badassdom (2013),Adventure|Comedy|Fantasy
3302,4467,"Adventures of Baron Munchausen, The (1988)",Adventure|Comedy|Fantasy


In [34]:
# Order all ratings chronologically (oldest first) and rebind the name to the
# sorted frame.
movies_with_ratings = movies_with_ratings.sort_values('timestamp')

In [35]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# The original cell indexed the frame with itself, which is not a boolean mask
# and raises, so it broke Restart & Run All.
# NOTE(review): intent guessed -- show the example user's most recent ratings.
movies_with_ratings[movies_with_ratings.userId == 2.0].sort_values('timestamp').tail()

In [36]:
# Lookup table: movie title -> raw pipe-separated genre string.
# Built in one vectorised pass instead of a Python-level iterrows() loop.
# If a title occurs more than once in `movies`, the last row wins, exactly as
# in the loop this replaces.
title_genres = dict(zip(movies.title, movies.genres))




In [48]:
def recommend_for_user(user_id, n_recommendations=10):
    """Print recommendations for ``user_id``, best first.

    Two-stage hybrid:

    1. Candidate generation (content-based): take the movie the user rated
       most recently, vectorise its genres with the fitted ``count_vect`` /
       ``tfidf_transformer`` and fetch its nearest neighbours from ``neigh``.
    2. Ranking (collaborative): score each candidate the user has not rated
       with ``algo.predict`` and print the highest estimates.

    Parameters
    ----------
    user_id
        Value matched against ``movies_with_ratings.userId`` and passed to
        ``algo.predict`` as ``uid``.
    n_recommendations : int, default 10
        Maximum number of lines to print (non-negative). Fewer are printed
        when fewer unseen candidates come back from the neighbour search.

    Returns
    -------
    None
        Results are printed as ``<title> <estimated rating>``.

    Raises
    ------
    ValueError
        If ``user_id`` has no rows in ``movies_with_ratings``, so there is no
        last-rated movie to seed the candidate search.
    """
    user_ratings = movies_with_ratings[movies_with_ratings.userId == user_id]
    if user_ratings.empty:
        raise ValueError('No ratings found for user {!r}'.format(user_id))

    # Sort here rather than relying on another cell having sorted the global
    # frame in place. The stable sort keeps the existing row order for equal
    # timestamps, so an already-sorted frame gives the same result as before.
    user_ratings = user_ratings.sort_values('timestamp', kind='mergesort')
    user_movies = user_ratings.title.unique()
    seen_titles = set(user_movies)  # O(1) membership instead of array scans

    last_user_movie = user_movies[-1]
    genre_text = change_string(title_genres[last_user_movie])

    # transform (not fit_transform): project the query into the fitted space.
    query_tfidf = tfidf_transformer.transform(count_vect.transform([genre_text]))
    neighbor_rows = neigh.kneighbors(query_tfidf, return_distance=False)[0]

    # unique() keeps first-seen order and stops a title that appears on
    # several `movies` rows from being recommended twice.
    movies_to_score = movies.iloc[neighbor_rows].title.unique()

    # titles[i] and scores[i] stay aligned.
    titles = [movie for movie in movies_to_score if movie not in seen_titles]
    scores = [algo.predict(uid=user_id, iid=movie).est for movie in titles]

    # argsort is ascending, so reverse it and keep the leading entries.
    for i in np.argsort(scores)[::-1][:n_recommendations]:
        print(titles[i], scores[i])

In [51]:
movies_with_ratings[movies_with_ratings.userId == 2.0].sort_values('rating')

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
97478,114060,The Drop (2014),Crime|Drama|Thriller,2.0,2.0,1445715000.0
93998,91658,"Girl with the Dragon Tattoo, The (2011)",Drama|Thriller,2.0,2.5,1445715000.0
8652,318,"Shawshank Redemption, The (1994)",Crime|Drama,2.0,3.0,1445715000.0
96746,109487,Interstellar (2014),Sci-Fi|IMAX,2.0,3.0,1445715000.0
91063,77455,Exit Through the Gift Shop (2010),Comedy|Documentary,2.0,3.0,1445715000.0
90135,71535,Zombieland (2009),Action|Comedy|Horror,2.0,3.0,1445715000.0
93833,91529,"Dark Knight Rises, The (2012)",Action|Adventure|Crime|IMAX,2.0,3.5,1445715000.0
95272,99114,Django Unchained (2012),Action|Drama|Western,2.0,3.5,1445715000.0
97675,115713,Ex Machina (2015),Drama|Sci-Fi|Thriller,2.0,3.5,1445715000.0
76960,8798,Collateral (2004),Action|Crime|Drama|Thriller,2.0,3.5,1445715000.0


In [50]:
recommend_for_user(2.0)

Infernal Affairs (Mou gaan dou) (2002) 4.051479818368768
Nightcrawler (2014) 3.916274845293362
Simple Plan, A (1998) 3.848011260370271
Cape Fear (1962) 3.8385701846293063
I, the Jury (1982) 3.710274222918589
Traitor (2008) 3.707983769730518
Badlands (1973) 3.611444494897166
Street Kings (2008) 3.607796440254812
Thief (1981) 3.598805662378916
Undertow (2004) 3.503551600336656


In [38]:
np.argsort([1,9,5,7])

array([0, 2, 3, 1])