# Домашнее задание по теме «Рекомендации на основе содержания»


    1 Использовать dataset MovieLens
    2 Построить рекомендации (регрессия, предсказываем оценку) на фичах:

    TF-IDF на тегах и жанрах
    Средние оценки (+ median, variance, etc.) пользователя и фильма

    3 Оценить RMSE на тестовой выборке

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

from tqdm import tqdm_notebook

import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

In [2]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [3]:
# ratings

In [4]:
# tags

In [5]:
# movies

In [6]:
movies_with_tags = movies.join(tags.set_index('movieId'), on='movieId')

In [7]:
# movies_with_tags

In [8]:
# ratings

In [9]:
data = movies_with_tags.merge(ratings, on=['movieId', 'userId'])

In [10]:
# data

In [11]:
data.drop(['timestamp_x', 'timestamp_y'], axis=1, inplace=True)

In [12]:
data

Unnamed: 0,movieId,title,genres,userId,tag,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,3.5
3,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,4.0
4,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,4.0
...,...,...,...,...,...,...
3471,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62.0,star wars,4.0
3472,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184.0,anime,3.5
3473,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184.0,comedy,3.5
3474,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184.0,gintama,3.5


In [13]:
# найдем средние оценки по фильмам

In [14]:
m_film_rating = data.groupby('movieId').agg(['mean']).drop(['userId'], axis=1)

In [15]:
m_film_rating.reset_index(inplace=True)

In [16]:
# m_film_rating

In [17]:
m_film_rating.columns = ['movieId', 'movie_mean']

In [18]:
m_film_rating.head()

Unnamed: 0,movieId,movie_mean
0,1,3.833333
1,2,3.75
2,3,2.5
3,5,1.5
4,7,3.0


In [19]:
#найдем средние оценки, которые ставит пользователь

In [20]:
m_user_rating = data.groupby('userId').agg(['mean']).drop(['movieId'], axis=1)

In [21]:
m_user_rating.reset_index(inplace=True)

In [22]:
m_user_rating.columns = ['userId', 'user_mean']

In [23]:
m_user_rating.head()

Unnamed: 0,userId,user_mean
0,2.0,5.0
1,7.0,1.0
2,18.0,4.125
3,21.0,0.5
4,49.0,4.5


In [24]:
# data

In [25]:
data = data.merge(m_film_rating, on='movieId').merge(m_user_rating, on='userId')

In [26]:
data

Unnamed: 0,movieId,title,genres,userId,tag,rating,movie_mean,user_mean
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,4.0,3.833333,3.777778
1,552,"Three Musketeers, The (1993)",Action|Adventure|Comedy|Romance,336.0,knights,3.0,3.000000,3.777778
2,1246,Dead Poets Society (1989),Drama,336.0,highschool,4.5,4.500000,3.777778
3,32587,Sin City (2005),Action|Crime|Film-Noir|Mystery|Thriller,336.0,cult,4.0,4.000000,3.777778
4,33660,Cinderella Man (2005),Drama|Romance,336.0,boksdrama,4.5,4.250000,3.777778
...,...,...,...,...,...,...,...,...
3471,152711,Who Killed Chea Vichea? (2010),Documentary,462.0,Cambodia,5.0,5.000000,5.000000
3472,152711,Who Killed Chea Vichea? (2010),Documentary,462.0,crime,5.0,5.000000,5.000000
3473,152711,Who Killed Chea Vichea? (2010),Documentary,462.0,human rights,5.0,5.000000,5.000000
3474,152711,Who Killed Chea Vichea? (2010),Documentary,462.0,murder,5.0,5.000000,5.000000


TF-IDF на жанрах

In [27]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [28]:
movie_genres = [change_string(g) for g in data.genres.values]

In [29]:
# movie_genres[1]

In [30]:
# len(movie_genres)

In [31]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

In [32]:
# X_train_counts.shape

In [33]:
# X_train_counts.todense()[1]

In [34]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [35]:
# X_train_tfidf.todense()[1]

In [36]:
# pd.DataFrame(X_train_tfidf.todense())

In [37]:
# Поменяем названия столбцов

In [38]:
# функция для переименования столбцов
def ren_col(columns, prefix):
    names = []
    for p in list(columns):
        names.append(prefix+'_{}'.format(p))
#         print(p)
    return names

In [39]:
prefix='genre'
columns= pd.DataFrame(X_train_tfidf.todense()).columns
g_columns = ren_col(pd.DataFrame(X_train_tfidf.todense()).columns, prefix)

In [40]:
tf_g = pd.DataFrame(X_train_tfidf.todense())
tf_g.columns = g_columns
tf_g.head()

Unnamed: 0,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17,genre_18,genre_19
0,0.0,0.379419,0.504921,0.536492,0.292207,0.0,0.0,0.0,0.477376,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.484561,0.543606,0.0,0.0,0.418655,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.542605,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.331342,0.0,0.0,0.0,0.0,0.332872,0.0,0.0,0.0,0.716513,0.0,0.0,0.0,0.425705,0.0,0.0,0.0,0.291191,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.873123,0.0,0.0,0.0,0.0


TF-IDF на тэгах

In [41]:
# movie_tags = [change_string(t) for t in data.tag.values]

In [42]:
# len(movie_tags)

In [43]:
# movie_tags

In [44]:
# count_vect_t = CountVectorizer()
# X_train_counts_t = count_vect_t.fit_transform(movie_tags)

In [45]:
# X_train_counts_t.shape

In [46]:
# X_train_counts_t.todense()

In [47]:
# tfidf_transformer_t = TfidfTransformer()
# X_train_tfidf_t = tfidf_transformer_t.fit_transform(X_train_counts_t)

In [48]:
# pd.DataFrame(X_train_tfidf_t.todense())

In [49]:
# переименуем столбцы

In [50]:
# prefix='tag'
# columns= pd.DataFrame(X_train_tfidf_t.todense()).columns
# t_columns = ren_col(pd.DataFrame(X_train_tfidf_t.todense()), prefix)

In [51]:
# tf_t = pd.DataFrame(X_train_tfidf_t.todense())
# tf_t.columns = t_columns
# tf_t

Объединим в одну таблицу

In [52]:
data = data.join(tf_g)

In [53]:
data

Unnamed: 0,movieId,title,genres,userId,tag,rating,movie_mean,user_mean,genre_0,genre_1,...,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17,genre_18,genre_19
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,4.0,3.833333,3.777778,0.000000,0.379419,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0
1,552,"Three Musketeers, The (1993)",Action|Adventure|Comedy|Romance,336.0,knights,3.0,3.000000,3.777778,0.484561,0.543606,...,0.0,0.0,0.0,0.000000,0.0,0.542605,0.0,0.000000,0.0,0.0
2,1246,Dead Poets Society (1989),Drama,336.0,highschool,4.5,4.500000,3.777778,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0
3,32587,Sin City (2005),Action|Crime|Film-Noir|Mystery|Thriller,336.0,cult,4.0,4.000000,3.777778,0.331342,0.000000,...,0.0,0.0,0.0,0.425705,0.0,0.000000,0.0,0.291191,0.0,0.0
4,33660,Cinderella Man (2005),Drama|Romance,336.0,boksdrama,4.5,4.250000,3.777778,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.873123,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3471,152711,Who Killed Chea Vichea? (2010),Documentary,462.0,Cambodia,5.0,5.000000,5.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0
3472,152711,Who Killed Chea Vichea? (2010),Documentary,462.0,crime,5.0,5.000000,5.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0
3473,152711,Who Killed Chea Vichea? (2010),Documentary,462.0,human rights,5.0,5.000000,5.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0
3474,152711,Who Killed Chea Vichea? (2010),Documentary,462.0,murder,5.0,5.000000,5.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0


Найдем пользователя, который поставил наибольшее количество оценок

In [54]:
data.groupby('userId').count()['rating'].sort_values(ascending=False).head()

userId
474.0    1414
567.0     432
62.0      370
599.0     323
477.0     267
Name: rating, dtype: int64

Отфильтруем наши данные для пользователя 474

In [55]:
usr_data = data[data['userId'] == 474]

In [56]:
usr_data

Unnamed: 0,movieId,title,genres,userId,tag,rating,movie_mean,user_mean,genre_0,genre_1,...,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17,genre_18,genre_19
9,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,4.0,3.833333,3.701909,0.000000,0.379419,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
10,2,Jumanji (1995),Adventure|Children|Fantasy,474.0,game,3.0,3.750000,3.701909,0.000000,0.467149,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
11,5,Father of the Bride Part II (1995),Comedy,474.0,pregnancy,1.5,1.500000,3.701909,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
12,5,Father of the Bride Part II (1995),Comedy,474.0,remake,1.5,1.500000,3.701909,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
13,7,Sabrina (1995),Comedy|Romance,474.0,remake,3.0,3.000000,3.701909,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.791730,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1418,40819,Walk the Line (2005),Drama|Musical|Romance,474.0,Johnny Cash,4.0,4.000000,3.701909,0.000000,0.000000,...,0.0,0.0,0.809012,0.0,0.0,0.513215,0.0,0.000000,0.0,0.0
1419,41566,"Chronicles of Narnia: The Lion, the Witch and ...",Adventure|Children|Fantasy,474.0,C.S. Lewis,4.5,4.500000,3.701909,0.000000,0.467149,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
1420,41997,Munich (2005),Action|Crime|Drama|Thriller,474.0,In Netflix queue,4.0,4.000000,3.701909,0.561435,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.493402,0.0,0.0
1421,42002,"Producers, The (2005)",Comedy|Musical,474.0,In Netflix queue,4.0,4.000000,3.701909,0.000000,0.000000,...,0.0,0.0,0.898182,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0


Выделим входные и выходные данные, разделим на тестовую и обучающую части

In [57]:
y = usr_data['rating']
y

9       4.0
10      3.0
11      1.5
12      1.5
13      3.0
       ... 
1418    4.0
1419    4.5
1420    4.0
1421    4.0
1422    3.5
Name: rating, Length: 1414, dtype: float64

In [58]:
X = usr_data.iloc[:, 8:28]
X

Unnamed: 0,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17,genre_18,genre_19
9,0.000000,0.379419,0.504921,0.536492,0.292207,0.000000,0.0,0.000000,0.477376,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
10,0.000000,0.467149,0.000000,0.660541,0.000000,0.000000,0.0,0.000000,0.587756,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
11,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
12,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
13,0.000000,0.000000,0.000000,0.000000,0.610872,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.791730,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1418,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.286548,0.000000,0.0,0.0,0.0,0.809012,0.0,0.0,0.513215,0.0,0.000000,0.0,0.0
1419,0.000000,0.467149,0.000000,0.660541,0.000000,0.000000,0.0,0.000000,0.587756,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
1420,0.561435,0.000000,0.000000,0.000000,0.000000,0.564029,0.0,0.351021,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.493402,0.0,0.0
1421,0.000000,0.000000,0.000000,0.000000,0.439624,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.898182,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0


In [59]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

Обучим модель, проверим rmse

In [61]:
model = LinearRegression().fit(X_train, y_train)

In [62]:
y_pred_test = model.predict(X_test)
y_pred_train = model.predict(X_train)

In [63]:
mse(y_test, y_pred_test, squared=False)

0.7575146639968191

In [64]:
# y_pred_test

In [65]:
# usr_data['rating'].unique()

Отфильтруем данные, для фильмов, которые не смотрел пользователь 474

In [66]:
not_usr_data = data[data['userId'] != 474]

In [67]:
not_usr_data

Unnamed: 0,movieId,title,genres,userId,tag,rating,movie_mean,user_mean,genre_0,genre_1,...,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17,genre_18,genre_19
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,4.0,3.833333,3.777778,0.000000,0.379419,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0
1,552,"Three Musketeers, The (1993)",Action|Adventure|Comedy|Romance,336.0,knights,3.0,3.000000,3.777778,0.484561,0.543606,...,0.0,0.0,0.0,0.000000,0.0,0.542605,0.0,0.000000,0.0,0.0
2,1246,Dead Poets Society (1989),Drama,336.0,highschool,4.5,4.500000,3.777778,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0
3,32587,Sin City (2005),Action|Crime|Film-Noir|Mystery|Thriller,336.0,cult,4.0,4.000000,3.777778,0.331342,0.000000,...,0.0,0.0,0.0,0.425705,0.0,0.000000,0.0,0.291191,0.0,0.0
4,33660,Cinderella Man (2005),Drama|Romance,336.0,boksdrama,4.5,4.250000,3.777778,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.873123,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3471,152711,Who Killed Chea Vichea? (2010),Documentary,462.0,Cambodia,5.0,5.000000,5.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0
3472,152711,Who Killed Chea Vichea? (2010),Documentary,462.0,crime,5.0,5.000000,5.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0
3473,152711,Who Killed Chea Vichea? (2010),Documentary,462.0,human rights,5.0,5.000000,5.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0
3474,152711,Who Killed Chea Vichea? (2010),Documentary,462.0,murder,5.0,5.000000,5.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0


предскажем оценки

In [68]:
X_n = not_usr_data.iloc[:, 8:28]

In [69]:
# X_n

In [70]:
y_pred = model.predict(X_n)

In [71]:
y_pred

array([3.52085221, 3.78717125, 3.84570941, ..., 3.85821477, 3.85821477,
       3.85821477])

In [72]:
# y_pred_train

In [73]:
# pd.DataFrame(y_pred).max()

In [74]:
pd.DataFrame(y_pred).columns=['y_pred']

In [75]:
res = pd.DataFrame(y_pred)

In [80]:
res.columns = ['pred_rating']

In [85]:
# res

In [86]:
not_usr_data.join(res)[['movieId', 'title', 'pred_rating']].sort_values(by='pred_rating', ascending=False).head(10)

Unnamed: 0,movieId,title,pred_rating
1820,164179,Arrival (2016),4.805032
1819,161634,Don't Breathe (2016),4.805032
1818,156605,Paterson,4.805032
1783,122916,Thor: Ragnarok (2017),4.494199
1775,122912,Avengers: Infinity War - Part I (2018),4.494199
1784,122918,Guardians of the Galaxy 2 (2017),4.494199
1786,127298,A Pigeon Sat on a Branch Reflecting on Existen...,4.494199
1774,122912,Avengers: Infinity War - Part I (2018),4.494199
1785,122922,Doctor Strange (2016),4.494199
1776,122912,Avengers: Infinity War - Part I (2018),4.494199
