### Задание 
На датасете ml-latest написать свою гибридную рекомендательную систему

В качестве своей рекомендательной системы сделаем следующее:  
Для выбранного пользователя возьмем 5 просмотренных им фильмов с наивысшей оценкой.   
С помощью коллаборативной фильтрации (косинусное расстояние между векторами оценок) отберем топ-30 фильмов-кандидатов, похожих на них.  
Далее отранжируем кандидатов посредством SVD и выдадим топ-10 в качестве рекомендаций.  

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine, euclidean, hamming
from surprise import SVD, SVDpp, Dataset, accuracy, Reader
from surprise.model_selection import train_test_split
from tqdm import tqdm_notebook
from tqdm.auto import tqdm

### Формируем датасет

In [2]:
# Load the movie metadata table and the per-user ratings table.
movies, ratings = map(pd.read_csv, ('data/movies.csv', 'data/ratings.csv'))

In [3]:
# Attach every rating to its movie row. `join` keeps all rows of `movies`, so a
# movie without ratings yields a row of NaNs; `dropna` removes those rows.
# The index is renumbered before the drop, exactly as in the original pipeline.
movies_with_ratings = (
    movies
    .join(ratings.set_index('movieId'), on='movieId')
    .reset_index(drop=True)
    .dropna()
)

In [4]:
# Preview the joined frame; userId / rating / timestamp show up as floats
# because the join introduced NaNs before they were dropped.
movies_with_ratings.head(5)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982700.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847435000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1106636000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1510578000.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1305696000.0


In [5]:
# Three-column (user, item, rating) frame handed to Surprise later on.
# Movie titles (not movieId) are used as the item identifiers.
dataset = (
    movies_with_ratings[['userId', 'title', 'rating']]
    .rename(columns={'userId': 'uid', 'title': 'iid'})
)

### Формируем вектор фильмов для коллаборативной фильтрации

In [6]:
# Number of distinct users that rated at least one movie.
num_users = len(pd.unique(movies_with_ratings['userId']))

In [7]:
# Build one rating vector per movie title: position = user, value = that user's
# rating, 0 where the user did not rate the movie.
#
# Users are mapped to dense positions explicitly. Indexing with `userId - 1`
# only works when ids are exactly 1..N; any gap in the ids would either raise
# IndexError or silently write into another user's slot.
user_position = {
    user: position
    for position, user in enumerate(np.sort(movies_with_ratings.userId.unique()))
}

movie_vector = {}

for movie, group in tqdm(movies_with_ratings.groupby('title')):
    vector = np.zeros(len(user_position))
    # Explicit loop (not fancy-index assignment) so that, if a user rated two
    # entries sharing a title, the later rating wins exactly as before.
    for user, rating in zip(group.userId.values, group.rating.values):
        vector[user_position[user]] = rating
    movie_vector[movie] = vector

HBox(children=(IntProgress(value=0, max=9719), HTML(value='')))




In [8]:
# Report (number of titles, vector length). The length is read from a vector
# that is actually stored: a title picked from `movies` is not guaranteed to be
# a key of `movie_vector` (unrated movies were dropped), which would raise KeyError.
sample_vector = next(iter(movie_vector.values()), ())
print(f'Результирующий вектор размером : [{len(movie_vector)}, {len(sample_vector)}]')

Результирующий вектор размером : [9719, 610]


### Подготавливаем алгоритм для ранжирования

In [9]:
# Observed bounds of the rating scale, used to configure the Surprise reader.
RATING_MIN = dataset['rating'].min()
RATING_MAX = dataset['rating'].max()

In [10]:
# Wrap the ratings frame for Surprise and hold out 20% of the ratings
# (fixed seed) for evaluation.
reader = Reader(rating_scale=(RATING_MIN, RATING_MAX))
data = Dataset.load_from_df(df=dataset, reader=reader)
trainset, testset = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Train the SVD model on the training split and report RMSE on the hold-out split.
# `random_state` fixes the random initialisation so that a fresh
# "Restart & Run All" reproduces the same score and the same recommendations.
algo = SVD(n_factors=100, n_epochs=100, lr_all=0.005, reg_all=0.2, random_state=42)
algo.fit(trainset)
test_pred = algo.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8631


0.8631145258696366

In [12]:
def get_top_user_films(movie_titles, distance_func=cosine, top_n=30):
    """Return the titles closest to any of the given seed movies.

    Every title in the notebook-level ``movie_vector`` that is not itself a
    seed is compared with each seed via ``distance_func``; a candidate is
    scored by its smallest distance to any seed. Each candidate appears at
    most once in the result, and seed titles are never returned.

    Args:
        movie_titles: iterable of seed titles; each must be a key of
            ``movie_vector``.
        distance_func: callable ``(vector_a, vector_b) -> number`` where a
            smaller value means "more similar".
        top_n: maximum number of titles to return.

    Returns:
        List of at most ``top_n`` titles ordered from closest to farthest.

    Raises:
        KeyError: if a seed title is not present in ``movie_vector``.
    """
    assert movie_vector is not None, 'Initialize movie_vector!'

    # De-duplicate the seeds while keeping their order.
    seed_titles = list(dict.fromkeys(movie_titles))
    seeds = set(seed_titles)

    # candidate title -> smallest distance to any seed seen so far
    best_distance = {}
    for movie_title in seed_titles:
        seed_vector = movie_vector[movie_title]
        for candidate, candidate_vector in movie_vector.items():
            # Skip *all* seeds, not just the current one: otherwise one seed is
            # offered as a recommendation for another.
            if candidate in seeds:
                continue
            distance = distance_func(seed_vector, candidate_vector)
            known = best_distance.get(candidate)
            # A NaN stored earlier must not block a real distance found later.
            if known is None or np.isnan(known) or distance < known:
                best_distance[candidate] = distance

    titles = list(best_distance)
    distances = [best_distance[title] for title in titles]
    best_indexes = np.argsort(distances)[:top_n]
    return [titles[i] for i in best_indexes]

In [13]:
# Sanity check: the ten titles closest to "The Matrix" by cosine distance.
get_top_user_films(
    movie_titles=['Matrix, The (1999)'],
    distance_func=cosine,
    top_n=10,
)

['Fight Club (1999)',
 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 'Saving Private Ryan (1998)',
 'Star Wars: Episode IV - A New Hope (1977)',
 'Star Wars: Episode VI - Return of the Jedi (1983)',
 'Lord of the Rings: The Fellowship of the Ring, The (2001)',
 'Sixth Sense, The (1999)',
 'Lord of the Rings: The Return of the King, The (2003)',
 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)',
 'Gladiator (2000)']

In [14]:
def rank_movies(movies_to_score, user_movies, top_n=15, user_id=None):
    """Rank candidate movies for a user by the rating the SVD model predicts.

    Args:
        movies_to_score: iterable of candidate titles.
        user_movies: titles the user has already rated; these are skipped.
        top_n: maximum number of (title, score) pairs to return.
        user_id: user to predict for. When omitted, the notebook-level
            ``current_user_id`` is used, which keeps older calls working.

    Returns:
        List of ``(title, predicted_rating)`` tuples, best first, with at most
        ``top_n`` entries and no repeated titles.
    """
    if user_id is None:
        user_id = current_user_id
    # Guard explicitly: slicing with `[-0:]` would return *every* candidate.
    if top_n <= 0:
        return []

    seen = set(user_movies)
    scores = {}
    for movie in movies_to_score:
        # Skip already-rated titles and candidates that were listed twice.
        if movie in seen or movie in scores:
            continue
        # Rank with the SVD model's estimated rating.
        scores[movie] = algo.predict(uid=user_id, iid=movie).est

    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]

In [15]:
def recommend_for_user(user_id, top_n=10):
    """Print hybrid recommendations for ``user_id``.

    Steps: take the user's five highest-rated movies, collect candidate titles
    similar to them with ``get_top_user_films``, drop titles the user already
    rated, then order the rest by the rating predicted by ``algo`` and print
    the best ``top_n`` as ``(title, score)`` tuples.

    Everything is computed for the ``user_id`` argument; no notebook-level
    "current user" variable is consulted.

    Args:
        user_id: id of the user to recommend for.
        top_n: maximum number of recommendations to print.

    Raises:
        AssertionError: if every candidate title was already rated by the user.
    """
    user_ratings = movies_with_ratings[movies_with_ratings.userId == user_id]

    # The user's 5 favourite movies act as seeds.
    best_user_movies = user_ratings.sort_values(by='rating', ascending=False).head(5).title.tolist()
    print(f'the best movies the user watched : {best_user_movies}')
    print()

    # Candidate titles produced by the collaborative-filtering step.
    user_movies = user_ratings.title.unique()
    watched = set(user_movies)
    movies_to_score = get_top_user_films(best_user_movies)

    assert len(set(movies_to_score) - watched) > 0, 'All good films user has already  watched!'

    # Rank with SVD, scoring explicitly for `user_id`; repeated candidates are
    # scored once so a title cannot be printed twice.
    scores = {}
    for title in movies_to_score:
        if title in watched or title in scores:
            continue
        scores[title] = algo.predict(uid=user_id, iid=title).est
    ranked_movies = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    ranked_movies = ranked_movies[:top_n] if top_n > 0 else []

    print(f'Top {top_n} films recommended to user based on his film history :')
    for i in ranked_movies:
        print(i)

In [16]:
# Pick the user to demo and print their recommendations.
current_user_id = 2
recommend_for_user(user_id=current_user_id)

the best movies the user watched : ['The Jinx: The Life and Deaths of Robert Durst (2015)', 'Mad Max: Fury Road (2015)', 'Wolf of Wall Street, The (2013)', 'Warrior (2011)', 'Step Brothers (2008)']

Top 10 films recommended to user based on his film history :
('Guardians of the Galaxy (2014)', 3.951287587562543)
('The Martian (2015)', 3.9367011462847046)
('Edge of Tomorrow (2014)', 3.9268518535387766)
('Kingsman: The Secret Service (2015)', 3.905499784740675)
('Moonrise Kingdom (2012)', 3.8901269729974928)
('The Revenant (2015)', 3.888402301237388)
('John Wick (2014)', 3.8804034492703527)
('Star Wars: Episode VII - The Force Awakens (2015)', 3.846354087121646)
('Grand Budapest Hotel, The (2014)', 3.8153094506053113)
('Avengers, The (2012)', 3.807066803095476)
