# Задание к теме «Гибридные рекомендательные системы»


In [1]:
import numpy as np
import pandas as pd

from surprise import SVD, SVDpp
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

import matplotlib.pyplot as plt

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

In [2]:
movies = pd.read_table('movies.dat', names=['MovieID','Title','Genres'], sep='::')
users = pd.read_table('users.dat', names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], sep='::')
ratings = pd.read_table('ratings.dat', names=['UserID', 'MovieID', 'Rating', 'Timestamp'], sep='::')

  movies = pd.read_table('movies.dat', names=['MovieID','Title','Genres'], sep='::')
  users = pd.read_table('users.dat', names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], sep='::')
  ratings = pd.read_table('ratings.dat', names=['UserID', 'MovieID', 'Rating', 'Timestamp'], sep='::')


In [5]:
movies_ratings = movies.join(ratings.set_index('MovieID'), on='MovieID').reset_index(drop=True)

In [6]:
movies_ratings

Unnamed: 0,MovieID,Title,Genres,UserID,Rating,Timestamp
0,1,Toy Story (1995),Animation|Children's|Comedy,1.0,5.0,9.788243e+08
1,1,Toy Story (1995),Animation|Children's|Comedy,6.0,4.0,9.782370e+08
2,1,Toy Story (1995),Animation|Children's|Comedy,8.0,4.0,9.782335e+08
3,1,Toy Story (1995),Animation|Children's|Comedy,9.0,5.0,9.782260e+08
4,1,Toy Story (1995),Animation|Children's|Comedy,10.0,5.0,9.782265e+08
...,...,...,...,...,...,...
1000381,3952,"Contender, The (2000)",Drama|Thriller,5812.0,4.0,9.920721e+08
1000382,3952,"Contender, The (2000)",Drama|Thriller,5831.0,3.0,9.862231e+08
1000383,3952,"Contender, The (2000)",Drama|Thriller,5837.0,4.0,1.011903e+09
1000384,3952,"Contender, The (2000)",Drama|Thriller,5927.0,1.0,9.798525e+08


In [8]:
movies_ratings.dropna(inplace=True)

In [None]:
# Создаем датасет для анализа с нужными признаками

In [9]:
movies_df = pd.DataFrame({
    'uid': movies_ratings.UserID,
    'iid': movies_ratings.Title,
    'rating': movies_ratings.Rating
})

In [None]:
# Создаем разреженную матрицу для построения РС

In [14]:
reader = Reader(rating_scale = (1.0, 5.0))
data = Dataset.load_from_df(movies_df, reader)

In [16]:
train_data, test_data = train_test_split(data, test_size=.15, random_state=42)

In [46]:
model = SVD(n_factors=20, n_epochs=40, lr_all=0.005, reg_all=0.05)
model.fit(train_data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7ff59b3715e0>

In [47]:
test_pred = model.test(test_data)

In [48]:
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8537


0.8536681223854062

In [76]:
def sorted_movies(dict_):
    sorted_dict = {}
    sorted_keys = sorted(dict_, key=dict_.get, reverse = True) 

    for w in sorted_keys:
        sorted_dict[w] = dict_[w]
    
    return sorted_dict

In [104]:
def user_movies_get(current_uid):
    user_movies = movies_ratings[movies_ratings.UserID == current_uid].Title.unique()
    return user_movies

In [77]:
def user_scores(current_uid, user_movies):
    
    titles_scores = {}

    for movie in movies_ratings.Title.unique():
        if movie in user_movies:
            continue
        else:
            scores.append(model.predict(uid=current_uid, iid=movie).est)
            titles.append(movie)
            
            titles_scores[movie] = model.predict(uid=current_uid, iid=movie).est
            
    return sorted_movies(titles_scores)

In [82]:
def top_10(sorted_dict):
    top_10_dict = {}
    n = 0
    
    for m, r in sorted_dict.items():
        if n < 10:
            top_10_dict[m] = r
            n += 1
        
    return top_10_dict

In [103]:
titles_scores_10 = top_10(user_scores(10.0, user_movies_get(10.0)))
titles_scores_10

{"Ed's Next Move (1996)": 4.934223942787087,
 'Sanjuro (1962)': 4.891790504440555,
 'Firelight (1997)': 4.870249905118774,
 'Circus, The (1928)': 4.8617055491722665,
 'Eighth Day, The (Le Huiti�me jour ) (1996)': 4.8422713215275595,
 'Chushingura (1962)': 4.824489623500907,
 'Cinema Paradiso (1988)': 4.811694804526873,
 'Much Ado About Nothing (1993)': 4.801073438773405,
 'To Live (Huozhe) (1994)': 4.795520615009292,
 'Color Purple, The (1985)': 4.792865508852889}

In [84]:
def change_string(string):
    return ' '.join(string.replace(' ', '').replace('-', '').split('|'))

In [86]:
movie_genres = [change_string(g) for g in movies.Genres.values]

In [89]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

neigh = NearestNeighbors(n_neighbors=15, n_jobs=-1, metric='euclidean') 
neigh.fit(X_train_tfidf)

NearestNeighbors(metric='euclidean', n_jobs=-1, n_neighbors=15)

In [96]:
test = change_string("Adventure|Comedy|Fantasy|Crime")

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [97]:
movies.iloc[res[1][0]].Title.values

array(['Mask, The (1994)', 'Hook (1991)', 'Borrowers, The (1997)',
       'Willy Wonka and the Chocolate Factory (1971)', 'Willow (1988)',
       'Highlander: Endgame (2000)', '7th Voyage of Sinbad, The (1958)',
       'Supergirl (1984)', 'Dragonheart (1996)', 'Ladyhawke (1985)',
       'Legend (1985)', 'Big (1988)', 'Heart and Souls (1993)',
       "Midsummer Night's Dream, A (1999)", 'Drop Dead Fred (1991)'],
      dtype=object)

In [94]:
title_genres = {}

for index, row in tqdm_notebook(movies.iterrows()):
    title_genres[row.Title] = row.Genres

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for index, row in tqdm_notebook(movies.iterrows()):


0it [00:00, ?it/s]

In [113]:
def recommend_for_user(user_id):
    current_uid = user_id
    user_movies = movies_ratings[movies_ratings.UserID == current_uid].Title.unique()
    
    last_user_movie = user_movies[-1]
    
    movie_genres = change_string(title_genres[last_user_movie])
    
    predict = count_vect.transform([movie_genres])
    X_tfidf2 = tfidf_transformer.transform(predict)
    
    res = neigh.kneighbors(X_tfidf2, return_distance=True)
    
    movies_to_score = movies.iloc[res[1][0]].Title.values
    
    return top_10(user_scores(current_uid, movies_to_score))

In [115]:
recommend_for_user(10.0)

{"Ed's Next Move (1996)": 4.934223942787087,
 'Wrong Trousers, The (1993)': 4.911526127604387,
 'Wallace & Gromit: The Best of Aardman Animation (1996)': 4.89620162538551,
 'Sanjuro (1962)': 4.891790504440555,
 'Firelight (1997)': 4.870249905118774,
 'Circus, The (1928)': 4.8617055491722665,
 'Roman Holiday (1953)': 4.847674234239353,
 'Sixth Sense, The (1999)': 4.8438945168265475,
 'Close Shave, A (1995)': 4.843871672857963,
 'Eighth Day, The (Le Huiti�me jour ) (1996)': 4.8422713215275595}