# Гибридные рекомендательные системы
- Датасет ml-latest
- Вспомнить подходы, которые мы разбирали
- Выбрать понравившийся подход к гибридным системам

Построение гибридной рекомендательной системы:

- Холодный старт (до 5 оценок) - рекомендуем наиболее популярные фильмы
- Теплый старт (от 5 до 10 оценок) - рекомендуем фильмы на основе сожержания
- Горячий старт (от 10 до бесконечности оценок) - коллаборативная фильтрация с блендингом (Item, User based)

In [2]:
import pandas as pd
import numpy as np
import xgboost

%matplotlib inline

from surprise import SVD, SVDpp
from surprise import Dataset
from surprise import accuracy
from surprise import Reader

from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV


import matplotlib.pyplot as plt

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_log_error, mean_squared_error
from sklearn.preprocessing import normalize

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [3]:
movies = pd.read_csv('../ml-latest-small 2/movies.csv')
links = pd.read_csv('../ml-latest-small 2/links.csv')
tags = pd.read_csv('../ml-latest-small 2/tags.csv')
ratings = pd.read_csv('../ml-latest-small 2/ratings.csv')

In [4]:
# соединим датасет с рейтингами и названиями фильмов
movies_joined_ratings = ratings.join(movies.set_index('movieId'), on='movieId')

### Рекомендательная система для холодного старта

In [5]:
#рекомендуем фильмы по популярности

In [6]:
movies_means = ratings.groupby('movieId').agg({'userId': np.count_nonzero, 'rating': [np.median, np.var, np.average]})
movies_means.columns=['userid_count', 'movie_rating_median', 'movie_rating_var', 'movie_rating_average']
movies_means=movies_means.fillna(0)

In [7]:
movies_means_normalize = pd.DataFrame(data=normalize(movies_means), columns=movies_means.columns)
movies_means_normalize['movieId']=movies_means.index
movies_means_normalize.head()

Unnamed: 0,userid_count,movie_rating_median,movie_rating_var,movie_rating_average,movieId
0,0.999656,0.018598,0.003241,0.018231,1
1,0.998984,0.031786,0.00706,0.031167,2
2,0.996164,0.057471,0.021315,0.062444,3
3,0.87443,0.374756,0.090715,0.294451,4
4,0.996044,0.060982,0.016728,0.062434,5


In [8]:
# Нормируем количество оценок пользователей и рейтинг фильмов
movies_popularity = movies_means

# Популярность фильма = нормированный райтинг * нормированное количество оценок
movies_popularity['popularity'] = movies_popularity['userid_count'] * movies_popularity['movie_rating_average']
movies_popularity = movies_popularity.merge(movies, on='movieId', how='left', sort=False)[
                                            ['movieId', 'title', 'genres', 'popularity']]

In [9]:
def cold_start(userId):
    movies = movies_popularity.sort_values('popularity', ascending=False)[['movieId', 'title', 'popularity']].head(10)
    return movies

In [10]:
cold_start(100)

Unnamed: 0,movieId,title,popularity
277,318,"Shawshank Redemption, The (1994)",1404.0
314,356,Forrest Gump (1994),1370.0
257,296,Pulp Fiction (1994),1288.5
1938,2571,"Matrix, The (1999)",1165.5
510,593,"Silence of the Lambs, The (1991)",1161.0
224,260,Star Wars: Episode IV - A New Hope (1977),1062.0
97,110,Braveheart (1995),955.5
2224,2959,Fight Club (1999),931.5
461,527,Schindler's List (1993),929.5
418,480,Jurassic Park (1993),892.5


### Рекомендация на основе содержания Content-based

In [11]:
# группируем тэги для фильмов
grouped_tags = tags.groupby('movieId').agg({'tag': [(lambda x: "|".join(x)), np.count_nonzero]})
grouped_tags.columns=['all_tags', 'all_tags_count']

In [12]:
def tf_idf(row, value, dictionary):
    return (1/len(row.split('|')))*dictionary[value] if value in row else 0

In [13]:
movies_with_tags = movies.merge(grouped_tags, on='movieId', how='left', sort=False)\
                                 .merge(movies_means_normalize, on='movieId', how='left', sort=False)
movies_with_tags['all_tags'] = movies_with_tags['all_tags'].fillna('')
movies_with_tags = movies_with_tags.fillna(0)

In [14]:
#формируем список жанров:
genres_list = []
for i in movies.genres.str.split('|'):
    for j in i:
        genres_list.append(j)
        
#словарь жанров:
genres_dict = {i:np.log(len(movies)/genres_list.count(i)) for i in genres_list}

In [15]:
from tqdm import tqdm, tqdm_notebook

In [16]:
#добавим новые фичи в датасет (TF-IDF на жанрах):
for i in tqdm(genres_dict):
    movies_with_tags['tf_idf_'+i] = movies_with_tags.apply(lambda row: tf_idf(row['genres'], i, genres_dict), axis=1)

100%|██████████| 20/20 [00:03<00:00,  6.54it/s]


In [17]:
#формируем список тэгов
tags_list = []
for i in grouped_tags.all_tags.str.split('|'):
    for j in i :
        tags_list.append(j)
        
#словарь тэгов
tags_dict = {i:np.log(len(movies)/tags_list.count(i)) for i in tags_list if tags_list.count(i)>5 and i!=''}

In [18]:
#добавим новые фичи в датасет (TF-IDF на тэгах для пользователя и фильма):
for i in tqdm(tags_dict):
    movies_with_tags['tf_idf_'+i] = movies_with_tags\
                .apply(lambda row: tf_idf(row['all_tags'], i, tags_dict), axis=1)

100%|██████████| 125/125 [00:29<00:00,  3.28it/s]


In [19]:
movies_tf_idf = movies_with_tags.drop(['genres', 'title', 'all_tags'], axis=1)

In [21]:
#по пользователям - cредняя оценка, медианное значение, дисперсия, количество оценок пользователей
users_mean = ratings.groupby('userId').agg({'movieId': np.count_nonzero, 
                                'rating': [np.median, np.var, np.average]})
users_mean.columns=['movieid_count', 'user_rating_median', 'user_rating_var', 'user_rating_average']

In [22]:
users_mean_normalize = pd.DataFrame(data=normalize(users_mean), columns=users_mean.columns)
users_mean_normalize['userId']=users_mean.index
users_mean_normalize.head()

Unnamed: 0,movieid_count,user_rating_median,user_rating_var,user_rating_average,userId
0,0.999587,0.021543,0.002758,0.018813,1
1,0.981496,0.135379,0.021966,0.133628,2
2,0.99179,0.012715,0.111151,0.061946,3
3,0.999661,0.018512,0.007993,0.016455,4
4,0.992294,0.090209,0.022123,0.082008,5


In [23]:
#какие оценки в среднем пользователь ставит жанрам фильмов
user_ratings = ratings.merge(movies, on='movieId', how='left', sort=False)[['userId', 'movieId', 'rating', 'genres']]
for i in tqdm(genres_dict):
    user_ratings['genre_'+i] = user_ratings.apply(lambda row: row['rating'] if i in row['genres'] else None,axis=1)
    
user_ratings = user_ratings.groupby('userId').mean().drop(['movieId', 'rating'], axis=1).fillna(0)

100%|██████████| 20/20 [00:31<00:00,  1.54s/it]


In [24]:
users_values = users_mean_normalize.merge(user_ratings, on='userId', how='left', sort=False)
users_values.head(5)

Unnamed: 0,movieid_count,user_rating_median,user_rating_var,user_rating_average,userId,genre_Adventure,genre_Animation,genre_Children,genre_Comedy,genre_Fantasy,...,genre_Horror,genre_Mystery,genre_Sci-Fi,genre_War,genre_Musical,genre_Documentary,genre_IMAX,genre_Western,genre_Film-Noir,genre_(no genres listed)
0,0.999587,0.021543,0.002758,0.018813,1,4.388235,4.689655,4.547619,4.277108,4.297872,...,3.470588,4.166667,4.225,4.5,4.681818,0.0,0.0,4.285714,5.0,0.0
1,0.981496,0.135379,0.021966,0.133628,2,4.166667,0.0,0.0,4.0,0.0,...,3.0,4.0,3.875,4.5,0.0,4.333333,3.75,3.5,0.0,0.0
2,0.99179,0.012715,0.111151,0.061946,3,2.727273,0.5,0.5,1.0,3.375,...,4.6875,5.0,4.2,0.5,0.5,0.0,0.0,0.0,0.0,0.0
3,0.999661,0.018512,0.007993,0.016455,4,3.655172,4.0,3.8,3.509615,3.684211,...,4.25,3.478261,2.833333,3.571429,4.0,4.0,3.0,3.8,4.0,0.0
4,0.992294,0.090209,0.022123,0.082008,5,3.25,4.333333,4.111111,3.466667,4.142857,...,3.0,4.0,2.5,3.333333,4.4,0.0,3.666667,3.0,0.0,0.0


In [25]:
#датасет для обучения
movies_users = ratings.merge(users_values, on='userId', how='left', sort=False)\
                      .merge(movies_tf_idf, on='movieId', how='left', sort=False)

data = movies_users.drop(['userId', 'movieId','timestamp'], axis=1)

In [26]:
X = data.drop(['rating'], axis=1)
y = data['rating']

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
#линейная регрессия

In [29]:
lr_params={'fit_intercept':[True,False],
           'normalize':[True,False]
            }
grid_lr = GridSearchCV(cv=3, param_grid=lr_params, estimator=LinearRegression(),n_jobs=-1,scoring='neg_mean_squared_error')
grid_lr.fit(X_train,y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'fit_intercept': [True, False], 'normalize': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [30]:
print(grid_lr.best_params_)
print(grid_lr.best_score_)
print(grid_lr.best_estimator_)
print('RMSE = ', np.sqrt(mean_squared_error(y_test, grid_lr.best_estimator_.predict(X_test))))

{'fit_intercept': True, 'normalize': False}
-0.8041858400805773
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)
RMSE =  0.9018093621510253


In [31]:
best_estimator = grid_lr.best_estimator_
best_estimator

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [32]:
predictions = best_estimator.predict(X_test)
mean_squared_error(predictions, y_test)

0.813260125663239

In [33]:
predictions

array([3.34147017, 3.47876166, 2.53070914, ..., 3.89700885, 3.41666966,
       2.75675225])

In [34]:
#рекомендуем фильмы на основе сожержания

In [35]:
def warm_start(userId):
    moviesids = ratings.loc[(ratings.userId==userId), 'movieId'].values
    
    movies_not_wathed = movies_tf_idf.loc[(~movies.movieId.isin(moviesids))]
    movies_not_wathed['key'] = 0
    users = users_values.loc[(users_values.userId==userId)]
    users['key'] = 0
    
    movies_not_wathed = movies_not_wathed.merge(users, on='key', how='left')
    data = movies_not_wathed.drop(['userId', 'movieId', 'key'], axis=1).fillna(0)
    
    result = movies_not_wathed[['movieId']].merge(movies, on='movieId', how='inner')
    result['prediction'] = best_estimator.predict(data)
    
    return result.sort_values('prediction', ascending=False).head(10)

In [36]:
warm_start(201)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


Unnamed: 0,movieId,title,genres,prediction
9569,183301,The Tale of the Bunny Picnic (1986),Children,18.218001
4136,6192,Open Hearts (Elsker dig for evigt) (2002),Romance,18.155099
5235,8911,Raise Your Voice (2004),Romance,18.155099
1075,1475,Kama Sutra: A Tale of Love (1996),Romance,18.107724
1077,1477,Love Jones (1997),Romance,18.107724
825,1137,Hustler White (1996),Romance,18.107724
8888,139747,Before We Go (2014),Romance,18.073727
5491,26958,Emma (1996),Romance,18.073727
1087,1493,Love and Other Catastrophes (1996),Romance,18.073727
1098,1514,Temptress Moon (Feng Yue) (1996),Romance,18.028323


### Коллаборативная фильтрация (SVD на Surprise)

In [37]:
#будем использовать данные по тем пользователям, которые поствили больше 10 оценок
movies_and_ratings = ratings.merge(movies, on='movieId', how='left')
dataset = pd.DataFrame({
    'uid': movies_and_ratings.userId,
    'iid': movies_and_ratings.movieId,
    'rating': movies_and_ratings.rating})

In [38]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [39]:
trainset, testset = train_test_split(data, test_size=.25) #параметры алгоритма

TypeError: Singleton array array(<surprise.dataset.DatasetAutoFolds object at 0x1a192ce668>,
      dtype=object) cannot be considered a valid collection.