### Задание

Использовать dataset MovieLens.
Построить рекомендации (регрессия, предсказываем оценку) на фичах:  
TF-IDF на тегах и жанрах  
Средние оценки (+ median, variance, etc.) пользователя и фильма  
Оценить RMSE на тестовой выборке  
Модель будет строиться для конкретного пользователя

In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# соберем все данные в свои датасеты
links = pd.read_csv('data/links.csv')
movies = pd.read_csv('data/movies.csv')
ratings = pd.read_csv('data/ratings.csv')
tags = pd.read_csv('data/tags.csv')

In [3]:
# соберем общий датасет из полученных данных
df = pd.merge(left=movies, right=ratings[['userId', 'movieId','rating']], how='inner', on='movieId')
df = pd.merge(left=df, right=tags[['movieId', 'tag']], how='inner', on='movieId')

In [4]:
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,pixar
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,pixar
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,fun
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,pixar
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,pixar


In [5]:
# Отберем топ 10 пользователей, поставивших оценки
df[['userId','rating']].groupby('userId').count().sort_values('rating', ascending=False)[:5]

Unnamed: 0_level_0,rating
userId,Unnamed: 1_level_1
474,2455
414,2343
599,2100
68,1791
610,1701


In [6]:
# посмотрим на пользователя, поставившего максимальное число оценок
df[df.userId == 474].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2455 entries, 492 to 212440
Data columns (total 6 columns):
movieId    2455 non-null int64
title      2455 non-null object
genres     2455 non-null object
userId     2455 non-null int64
rating     2455 non-null float64
tag        2455 non-null object
dtypes: float64(1), int64(2), object(3)
memory usage: 134.3+ KB


In [7]:
user_474_index = df[df.userId == 474].index

### Получим значения tfidf для жанров и тегов

In [9]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
movie_genres = [change_string(g) for g in df.genres.values]
tag_strings = [change_string(g) for g in df.tag.values]

In [12]:
genres_vectorizer, tag_vectorizer = TfidfVectorizer(), TfidfVectorizer()
X_movies_tfidf = genres_vectorizer.fit_transform(movie_genres)
X_tag_tfidf = tag_vectorizer.fit_transform(tag_strings)

In [13]:
def get_df_from_csr(csr, prefix):
    cols = [ f'{prefix}_{i}' for i in range(csr.shape[1])] 
    return pd.DataFrame( csr.todense(), columns=cols )

In [14]:
X_tag_tfidf[user_474_index]

<2455x1469 sparse matrix of type '<class 'numpy.float64'>'
	with 2481 stored elements in Compressed Sparse Row format>

In [15]:
X_tag_tfidf[user_474_index]

<2455x1469 sparse matrix of type '<class 'numpy.float64'>'
	with 2481 stored elements in Compressed Sparse Row format>

In [16]:
df.iloc[user_474_index, :].userId.unique()[0] == 474

True

In [17]:
df_474 = pd.concat([
    df[['movieId', 'userId', 'rating']].iloc[user_474_index, :], 
    get_df_from_csr(X_movies_tfidf[user_474_index], prefix='movie_tfidf').set_index(user_474_index), 
    get_df_from_csr(X_tag_tfidf[user_474_index], prefix='tag_tfidf').set_index(user_474_index), 
    ], axis=1)

In [18]:
sum(df_474.isna().sum())

0

In [19]:
df_474.drop_duplicates(subset=['movieId', 'userId', 'rating'], inplace=True)
df_474.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1198 entries, 492 to 212437
Columns: 1492 entries, movieId to tag_tfidf_1468
dtypes: float64(1490), int64(2)
memory usage: 13.6 MB


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor

In [21]:
df_cols = list( set(df_474.columns) - set(['rating']) )
X_train, X_test, y_train, y_test = train_test_split(df_474[df_cols], df_474['rating'])

In [22]:
model = CatBoostRegressor(verbose=False)
model.fit(X_train, y_train);

In [23]:
print(f'train rmse : {np.sqrt(mean_squared_error(model.predict(X_train), y_train))}')
y_pred_first_iter = model.predict(X_test)
print(f'test rmse : {np.sqrt(mean_squared_error(y_pred_first_iter, y_test))}')

train rmse : 0.6588425969491828
test rmse : 0.777249690583498


In [24]:
# проверим, как выглядят выборочные ошибки
for i in zip(y_pred_first_iter[:30], y_test[:30]): 
    print(f'pred : {round(i[0], 3)}, target : {i[1]}, delta : {round(i[1] - i[0], 2)}  ')

pred : 3.646, target : 4.0, delta : 0.35  
pred : 4.123, target : 4.0, delta : -0.12  
pred : 3.793, target : 4.0, delta : 0.21  
pred : 3.882, target : 5.0, delta : 1.12  
pred : 3.778, target : 4.5, delta : 0.72  
pred : 3.78, target : 4.5, delta : 0.72  
pred : 3.656, target : 2.0, delta : -1.66  
pred : 3.158, target : 2.0, delta : -1.16  
pred : 3.606, target : 2.0, delta : -1.61  
pred : 3.794, target : 4.0, delta : 0.21  
pred : 3.162, target : 1.0, delta : -2.16  
pred : 3.873, target : 4.0, delta : 0.13  
pred : 3.674, target : 3.5, delta : -0.17  
pred : 3.899, target : 4.5, delta : 0.6  
pred : 3.326, target : 3.0, delta : -0.33  
pred : 4.291, target : 5.0, delta : 0.71  
pred : 3.871, target : 4.0, delta : 0.13  
pred : 3.521, target : 4.0, delta : 0.48  
pred : 3.778, target : 4.0, delta : 0.22  
pred : 3.911, target : 3.5, delta : -0.41  
pred : 3.425, target : 1.5, delta : -1.93  
pred : 4.32, target : 3.5, delta : -0.82  
pred : 3.818, target : 2.5, delta : -1.32  
pre

### Добавим средние оценки для пользователя и фильма

In [26]:
df_movie_mean_rating = pd.DataFrame(df[['movieId', 'rating']].groupby('movieId').mean())
df_movie_mean_rating.columns = ['movie_mean_rating']

In [27]:
df_user_mean_rating = pd.DataFrame(df[['userId', 'rating']].groupby('userId').mean())
df_user_mean_rating.columns = ['user_mean_rating']

In [28]:
df = pd.merge(left=df, right=df_movie_mean_rating, how='inner', on='movieId')
df = pd.merge(left=df, right=df_user_mean_rating, how='inner', on='userId')
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,tag,movie_mean_rating,user_mean_rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,pixar,3.92093,4.040472
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,pixar,3.92093,4.040472
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,fun,3.92093,4.040472
3,3,Grumpier Old Men (1995),Comedy|Romance,1,4.0,moldy,3.259615,4.040472
4,3,Grumpier Old Men (1995),Comedy|Romance,1,4.0,old,3.259615,4.040472


In [29]:
# соберем финальный датасет, используя полученные с помощью tf-idf оценки 
user_474_index = df[df.userId == 474].index
df_474 = pd.concat([
    df[['movieId', 'userId', 'rating', 'movie_mean_rating', 'user_mean_rating']].iloc[user_474_index, :], 
    get_df_from_csr(X_movies_tfidf[user_474_index], prefix='movie_tfidf').set_index(user_474_index), 
    get_df_from_csr(X_tag_tfidf[user_474_index], prefix='tag_tfidf').set_index(user_474_index), 
    ], axis=1)

In [30]:
df_474.head()

Unnamed: 0,movieId,userId,rating,movie_mean_rating,user_mean_rating,movie_tfidf_0,movie_tfidf_1,movie_tfidf_2,movie_tfidf_3,movie_tfidf_4,...,tag_tfidf_1459,tag_tfidf_1460,tag_tfidf_1461,tag_tfidf_1462,tag_tfidf_1463,tag_tfidf_1464,tag_tfidf_1465,tag_tfidf_1466,tag_tfidf_1467,tag_tfidf_1468
88596,1,474,4.0,3.92093,3.778004,0.0,0.0,0.0,0.0,0.791136,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
88597,1,474,4.0,3.92093,3.778004,0.0,0.0,0.0,0.0,0.791136,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
88598,1,474,4.0,3.92093,3.778004,0.0,0.0,0.0,0.0,0.791136,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
88599,2,474,3.0,3.431818,3.778004,0.0,0.0,0.0,0.0,0.791136,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
88600,2,474,3.0,3.431818,3.778004,0.0,0.0,0.0,0.0,0.791136,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
df_cols = list( set(df_474.columns) - set(['rating']) )
X_train, X_test, y_train, y_test = train_test_split(df_474[df_cols], df_474['rating'])

In [32]:
model = CatBoostRegressor(verbose=False)
model.fit(X_train, y_train);

In [33]:
print(f'train rmse : {np.sqrt(mean_squared_error(model.predict(X_train), y_train))}')
y_pred = model.predict(X_test)
print(f'test rmse : {np.sqrt(mean_squared_error(y_pred, y_test))}')

# проверим, как выглядят выборочные ошибки
for i in zip(y_pred[:30], y_pred_first_iter[:30], y_test[:30]): 
    print(f'pred : {round(i[0], 3)}, old_pred : {round(i[1], 3)} target : {i[2]}, delta : {round(i[2] - i[0], 2)}, old delta : {round(i[2] - i[1], 2)}')

train rmse : 0.48325377502253597
test rmse : 0.5683353347686803
pred : 3.711, old_pred : 3.646 target : 2.5, delta : -1.21, old delta : -1.15
pred : 3.945, old_pred : 4.123 target : 4.5, delta : 0.56, old delta : 0.38
pred : 3.785, old_pred : 3.793 target : 3.5, delta : -0.29, old delta : -0.29
pred : 3.967, old_pred : 3.882 target : 4.0, delta : 0.03, old delta : 0.12
pred : 4.107, old_pred : 3.778 target : 4.0, delta : -0.11, old delta : 0.22
pred : 2.68, old_pred : 3.78 target : 4.0, delta : 1.32, old delta : 0.22
pred : 3.616, old_pred : 3.656 target : 2.0, delta : -1.62, old delta : -1.66
pred : 3.973, old_pred : 3.158 target : 5.0, delta : 1.03, old delta : 1.84
pred : 3.843, old_pred : 3.606 target : 4.5, delta : 0.66, old delta : 0.89
pred : 3.796, old_pred : 3.794 target : 3.0, delta : -0.8, old delta : -0.79
pred : 2.476, old_pred : 3.162 target : 2.0, delta : -0.48, old delta : -1.16
pred : 3.977, old_pred : 3.873 target : 5.0, delta : 1.02, old delta : 1.13
pred : 3.921, ol

C оценками фильмов качество получилось намного лучше