### Задание

Использовать dataset MovieLens.
Построить рекомендации (регрессия, предсказываем оценку) на фичах:  
TF-IDF на тегах и жанрах  
Средние оценки (+ median, variance, etc.) пользователя и фильма  
Оценить RMSE на тестовой выборке  
Для конкретного пользователя строим модель

In [89]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook

import matplotlib.pyplot as plt
%matplotlib inline

In [8]:
# соберем все данные в свои датасеты
links = pd.read_csv('data/links.csv')
movies = pd.read_csv('data/movies.csv')
ratings = pd.read_csv('data/ratings.csv')
tags = pd.read_csv('data/tags.csv')

In [78]:
# соберем общий датасет из полученных данных
df = pd.merge(left=movies, right=ratings[['userId', 'movieId','rating']], how='inner', on='movieId')
df = pd.merge(left=df, right=tags[['movieId', 'tag']], how='inner', on='movieId')

In [127]:
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,pixar
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,pixar
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,fun
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,pixar
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,pixar


In [125]:
# Отберем топ 10 пользователей, поставивших оценки
df[['userId','rating']].groupby('userId').count().sort_values('rating', ascending=False)[:5]

Unnamed: 0_level_0,rating
userId,Unnamed: 1_level_1
474,2455
414,2343
599,2100
68,1791
610,1701


In [128]:
# посмотрим на пользователя, поставившего максимальное число оценок
df[df.userId == 474].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2455 entries, 492 to 212440
Data columns (total 6 columns):
movieId    2455 non-null int64
title      2455 non-null object
genres     2455 non-null object
userId     2455 non-null int64
rating     2455 non-null float64
tag        2455 non-null object
dtypes: float64(1), int64(2), object(3)
memory usage: 134.3+ KB


In [132]:
user_474_index = df[df.userId == 474].index

In [254]:
# получим значения tfidf для жанров и тегов

In [85]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [83]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [106]:
movie_genres = [change_string(g) for g in df.genres.values]
tag_strings = [change_string(g) for g in df.tag.values]

In [110]:
genres_vectorizer, tag_vectorizer = TfidfVectorizer(), TfidfVectorizer()
X_movies_tfidf = genres_vectorizer.fit_transform(movie_genres)
X_tag_tfidf = tag_vectorizer.fit_transform(tag_strings)

In [199]:
def get_df_from_csr(csr, prefix):
    cols = [ f'{prefix}_{i}' for i in range(csr.shape[1])] 
    return pd.DataFrame( csr.todense(), columns=cols )

In [134]:
X_tag_tfidf[user_474_index]

<2455x1469 sparse matrix of type '<class 'numpy.float64'>'
	with 2481 stored elements in Compressed Sparse Row format>

In [156]:
X_tag_tfidf[user_474_index]

<2455x1469 sparse matrix of type '<class 'numpy.float64'>'
	with 2481 stored elements in Compressed Sparse Row format>

In [142]:
df.iloc[user_474_index, :].userId.unique()[0] == 474

True

In [225]:
df_474 = pd.concat([
    df[['movieId', 'userId', 'rating']].iloc[user_474_index, :], 
    get_df_from_csr(X_movies_tfidf[user_474_index], prefix='movie_tfidf').set_index(user_474_index), 
    get_df_from_csr(X_tag_tfidf[user_474_index], prefix='tag_tfidf').set_index(user_474_index), 
    ], axis=1)

In [226]:
sum(df_474.isna().sum())

0

In [227]:
df_474.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2455 entries, 492 to 212440
Columns: 1492 entries, movieId to tag_tfidf_1468
dtypes: float64(1490), int64(2)
memory usage: 28.0 MB


In [239]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

In [228]:
df_474.drop_duplicates(subset=['movieId', 'userId', 'rating'], inplace=True)

In [229]:
df_474.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1198 entries, 492 to 212437
Columns: 1492 entries, movieId to tag_tfidf_1468
dtypes: float64(1490), int64(2)
memory usage: 13.6 MB


In [237]:
df_cols = list( set(df_474.columns) - set(['rating']) )
X_train, X_test, y_train, y_test = train_test_split(df_474[df_cols], df_474['rating'])

In [243]:
model = RandomForestRegressor(n_estimators=3000, n_jobs=-1, verbose=True)
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=3000, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [248]:
r2_score(model.predict(X_train), y_train)

0.7633111895699443

In [250]:
y_pred = model.predict(X_test)

#pr2_score(, y_test)

In [252]:
np.sqrt(mean_squared_error(y_pred, y_test))

0.8315654378384225

In [253]:
for i in zip(y_pred[:30], y_test[:30]):
    print(i)

(2.922, 3.0)
(3.965333333333333, 4.0)
(3.971, 3.5)
(3.895166666666667, 3.5)
(3.9863333333333335, 4.5)
(3.7743333333333333, 5.0)
(4.099833333333334, 4.5)
(3.7436666666666665, 3.5)
(3.6935, 3.5)
(3.9778333333333333, 4.0)
(4.183833333333333, 3.0)
(3.8921666666666668, 4.0)
(3.8428333333333335, 3.5)
(3.8745, 2.5)
(3.9031666666666665, 4.0)
(3.8423333333333334, 3.5)
(3.7548333333333335, 4.0)
(3.3028333333333335, 4.0)
(3.3406666666666665, 3.5)
(3.797833333333333, 4.0)
(3.5726666666666667, 4.0)
(3.5773333333333333, 4.0)
(3.4418333333333333, 4.0)
(3.5116666666666667, 3.0)
(3.8185, 4.5)
(3.743, 4.0)
(4.125666666666667, 4.0)
(3.9103333333333334, 5.0)
(3.6111666666666666, 4.5)
(3.7163333333333335, 4.0)
