In [67]:
from surprise import SVD, SVDpp
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

import matplotlib.pyplot as plt

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

import pandas as pd
import numpy as np

In [68]:
links = pd.read_csv('C:/Users/gordee-ext/OneDrive - Louis Dreyfus Company/Desktop/Netology/ml-latest-small/links.csv')
movies = pd.read_csv('C:/Users/gordee-ext/OneDrive - Louis Dreyfus Company/Desktop/Netology/ml-latest-small/movies.csv')
ratings = pd.read_csv('C:/Users/gordee-ext/OneDrive - Louis Dreyfus Company/Desktop/Netology/ml-latest-small/ratings.csv')
tags = pd.read_csv('C:/Users/gordee-ext/OneDrive - Louis Dreyfus Company/Desktop/Netology/ml-latest-small/tags.csv')

In [84]:
data = pd.merge(ratings, movies, on='movieId')

In [85]:
data.head(2)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [91]:
dataset = pd.DataFrame({
    'uid': data.userId,
    'iid': data.title,
    'rating': data.rating,
})

In [92]:
data.genres

0         Adventure|Animation|Children|Comedy|Fantasy
1         Adventure|Animation|Children|Comedy|Fantasy
2         Adventure|Animation|Children|Comedy|Fantasy
3         Adventure|Animation|Children|Comedy|Fantasy
4         Adventure|Animation|Children|Comedy|Fantasy
5         Adventure|Animation|Children|Comedy|Fantasy
6         Adventure|Animation|Children|Comedy|Fantasy
7         Adventure|Animation|Children|Comedy|Fantasy
8         Adventure|Animation|Children|Comedy|Fantasy
9         Adventure|Animation|Children|Comedy|Fantasy
10        Adventure|Animation|Children|Comedy|Fantasy
11        Adventure|Animation|Children|Comedy|Fantasy
12        Adventure|Animation|Children|Comedy|Fantasy
13        Adventure|Animation|Children|Comedy|Fantasy
14        Adventure|Animation|Children|Comedy|Fantasy
15        Adventure|Animation|Children|Comedy|Fantasy
16        Adventure|Animation|Children|Comedy|Fantasy
17        Adventure|Animation|Children|Comedy|Fantasy
18        Adventure|Animatio

In [93]:
dataset.head()

Unnamed: 0,uid,iid,rating
0,1,Toy Story (1995),4.0
1,5,Toy Story (1995),4.0
2,7,Toy Story (1995),4.5
3,15,Toy Story (1995),2.5
4,17,Toy Story (1995),4.5


In [94]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [95]:
trainset, testset = train_test_split(data, test_size=.2, random_state=42)

In [96]:
algo = SVD(n_factors=20, n_epochs=20)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x209fa5d1780>

In [97]:
test_pred = algo.test(testset)

In [98]:
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8700


0.8700166474995911

In [99]:
algo.predict(uid=1.0, iid='Mortal Kombat (1995)').est

3.4534693505236187

In [100]:
current_user_id = 2.0
user_movies = movies_with_ratings[movies_with_ratings.userId == current_user_id].title.unique()

scores = []
titles = []

for movie in movies_with_ratings.title.unique():
    if movie in user_movies:
        continue
        
    scores.append(algo.predict(uid=current_user_id, iid=movie).est)
    titles.append(movie)

In [101]:
sorted(scores)[-10:]

[4.435856267693788,
 4.437628127128966,
 4.43914390778063,
 4.443031263142362,
 4.443547740844092,
 4.44380591778499,
 4.452870687184537,
 4.458806440079457,
 4.481237880701434,
 4.561817692163414]

In [102]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [103]:
movie_genres = [change_string(g) for g in movies.genres.values]

In [104]:
movie_genres[0]

'Adventure Animation Children Comedy Fantasy'

In [105]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

neigh = NearestNeighbors(n_neighbors=20, n_jobs=-1, metric='euclidean') 
neigh.fit(X_train_tfidf)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='euclidean',
         metric_params=None, n_jobs=-1, n_neighbors=20, p=2, radius=1.0)

In [106]:
test = change_string("Adventure|Comedy|Fantasy|Crime")

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [107]:
res

(array([[0.42079615, 0.53300564, 0.54288608, 0.54288608, 0.54288608,
         0.54288608, 0.54288608, 0.54288608, 0.54288608, 0.54288608,
         0.54288608, 0.54288608, 0.54288608, 0.54288608, 0.54288608,
         0.54288608, 0.54288608, 0.6188388 , 0.62682864, 0.62682864]]),
 array([[6774, 9096, 3576,  863, 2302, 2608, 7865, 3582, 8361, 3302, 5737,
         6723, 5636, 3376, 7496, 5627, 9717, 2206, 6133, 5832]],
       dtype=int64))

In [108]:
movies.iloc[res[1][0]]

Unnamed: 0,movieId,title,genres
6774,60074,Hancock (2008),Action|Adventure|Comedy|Crime|Fantasy
9096,143559,L.A. Slasher (2015),Comedy|Crime|Fantasy
3576,4899,Black Knight (2001),Adventure|Comedy|Fantasy
863,1136,Monty Python and the Holy Grail (1975),Adventure|Comedy|Fantasy
2302,3052,Dogma (1999),Adventure|Comedy|Fantasy
2608,3489,Hook (1991),Adventure|Comedy|Fantasy
7865,94015,Mirror Mirror (2012),Adventure|Comedy|Fantasy
3582,4911,Jabberwocky (1977),Adventure|Comedy|Fantasy
8361,109042,Knights of Badassdom (2013),Adventure|Comedy|Fantasy
3302,4467,"Adventures of Baron Munchausen, The (1988)",Adventure|Comedy|Fantasy


In [112]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [115]:
title_genres = {}

for index, row in tqdm_notebook(movies.iterrows()):
    title_genres[row.title] = row.genres

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [116]:
def recommend_for_user(user_id):
    current_user_id = user_id
    user_movies = movies_with_ratings[movies_with_ratings.userId == current_user_id].title.unique()
    
    last_user_movie = user_movies[-1]
    
    movie_genres = title_genres[last_user_movie]
    
    movie_genres = change_string(movie_genres)

    predict = count_vect.transform([movie_genres])
    X_tfidf2 = tfidf_transformer.transform(predict)

    res = neigh.kneighbors(X_tfidf2, return_distance=True)
    
    movies_to_score = movies.iloc[res[1][0]].title.values

    scores = []
    titles = []

    for movie in movies_to_score:
        if movie in user_movies:
            continue

        scores.append(algo.predict(uid=current_user_id, iid=movie).est)
        titles.append(movie)
        
    
    best_indexes = np.argsort(scores)[-10:]
    for i in reversed(best_indexes):
        print(titles[i], scores[i])

In [117]:
movies_with_ratings[movies_with_ratings.userId == 2.0].sort_values('rating')

Unnamed: 0,movieId,title,genres,userId,rating
97478,114060,The Drop (2014),Crime|Drama|Thriller,2.0,2.0
93998,91658,"Girl with the Dragon Tattoo, The (2011)",Drama|Thriller,2.0,2.5
8652,318,"Shawshank Redemption, The (1994)",Crime|Drama,2.0,3.0
96746,109487,Interstellar (2014),Sci-Fi|IMAX,2.0,3.0
91063,77455,Exit Through the Gift Shop (2010),Comedy|Documentary,2.0,3.0
90135,71535,Zombieland (2009),Action|Comedy|Horror,2.0,3.0
97675,115713,Ex Machina (2015),Drama|Sci-Fi|Thriller,2.0,3.5
76960,8798,Collateral (2004),Action|Crime|Drama|Thriller,2.0,3.5
95272,99114,Django Unchained (2012),Action|Drama|Western,2.0,3.5
93833,91529,"Dark Knight Rises, The (2012)",Action|Adventure|Crime|IMAX,2.0,3.5


In [119]:
recommend_for_user(5.0)

Snatch (2000) 4.173328216213473
In Bruges (2008) 4.093360478507759
Lock, Stock & Two Smoking Barrels (1998) 4.073292247540929
Man Bites Dog (C'est arrivé près de chez vous) (1992) 4.003696679219208
Confessions of a Dangerous Mind (2002) 3.703169630686596
Perfect Crime, The (Crimen Ferpecto) (Ferpect Crime) (2004) 3.6403847227609187
Informant!, The (2009) 3.435650468494251
Life Eternal (2015) 3.4238473988044356
Party Monster (2003) 3.3979201921264717
The Voices (2014) 3.3877121159089207
