# Домашнее задание по теме «Рекомендации на основе содержания»


1. Использовать dataset MovieLens
2. Построить рекомендации (регрессия, предсказываем оценку) на фичах:
    * TF-IDF на тегах и жанрах
    * Средние оценки (+ median, variance, etc.) пользователя и фильма


3. Оценить RMSE на тестовой выборке



In [139]:
# Импорт библиотек

import pandas as pd
import numpy as np

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

%matplotlib inline

In [46]:
# загружаем данные
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [47]:
# Функция замены вертикальной черты в жанрах на пробел

def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [48]:
# Убираем вертикальные черты в жанрах
movies.genres = movies.genres.apply(change_string)

In [49]:
# Обогащаем данные тэгами
movies_with_tags = movies.join(tags.set_index('movieId'), on='movieId')
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,1040.0,animated,1514920000.0
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,1040.0,buddy movie,1514920000.0
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,1040.0,Cartoon,1514920000.0
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,1040.0,cgi,1514920000.0
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,1040.0,comedy,1514920000.0


In [68]:
# Считаем средние значения оценок по фильмам
mean_user = ratings.groupby('userId').rating.median()
mean_movie = ratings.groupby('movieId').rating.median()

In [75]:
mean_user = pd.DataFrame(mean_user)
mean_movie = pd.DataFrame(mean_movie)

In [80]:
movies_with_tags.tag.unique()
movies_with_tags.dropna(inplace=True)

In [81]:
# Объединяем тэги для одного и того же фильма

def change_string(s):
    return str(s).replace(' ', '').replace('-', '').lower()

tag_strings = []
movies = []

for movie, group in tqdm_notebook(movies_with_tags.groupby('title')):
    tag_strings.append(' '.join([change_string(s) for s in group.tag.values]))
    movies.append(movie)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for movie, group in tqdm_notebook(movies_with_tags.groupby('title')):


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=45935.0), HTML(value='')))




In [116]:
# Собираем исходные данные
data = movies_with_tags[['movieId', 'title', 'genres']]
data.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop_duplicates(inplace=True)


In [117]:
tags_df = pd.DataFrame({'title': movies, 'tags': tag_strings})
tags_df.head()

Unnamed: 0,title,tags
0,"""Great Performances"" Cats (1998)",basedonnovelorbook basedonplayormusical broadw...
1,#1 Cheerleader Camp (2010),sport cheerleading nudity(fullfrontal) camp ch...
2,#Captured (2017),computerscreen foundfootage kidnapping religio...
3,#Horror (2015),unbelievabledialogue
4,#chicagoGirl: The Social Network Takes on a Di...,camera cameraphone dissident documentary espio...


In [163]:
data_with_tags = data.join(tags_df.set_index('title'), on='title')
data_with_tags.head()

Unnamed: 0,movieId,title,genres,tags
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,animated buddymovie cartoon cgi comedy compute...
1,2,Jumanji (1995),Adventure Children Fantasy,fantasy adaptedfrom:book animals badcgi basedo...
2,3,Grumpier Old Men (1995),Comedy Romance,moldy old annmargaret burgessmeredith darylhan...
3,4,Waiting to Exhale (1995),Comedy Drama Romance,characters girlmovie characters chickflick bas...
4,5,Father of the Bride Part II (1995),Comedy,stevemartin stevemartin pregnancy remake aging...


In [161]:
#  Добавляем среднюю оценку фильма

full_data = data_with_tags.merge(mean_movie, how='left', on='movieId')

In [162]:
full_data.head()

Unnamed: 0,movieId,title,genres,tags,rating
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,animated buddymovie cartoon cgi comedy compute...,4.0
1,2,Jumanji (1995),Adventure Children Fantasy,fantasy adaptedfrom:book animals badcgi basedo...,3.0
2,3,Grumpier Old Men (1995),Comedy Romance,moldy old annmargaret burgessmeredith darylhan...,3.0
3,4,Waiting to Exhale (1995),Comedy Drama Romance,characters girlmovie characters chickflick bas...,3.0
4,5,Father of the Bride Part II (1995),Comedy,stevemartin stevemartin pregnancy remake aging...,3.0


In [121]:
# Объединяем жанр и тэги в единое описание для векторизации
full_data["descr"] = full_data["genres"].astype(str) + " " + full_data["tags"].astype(str)

In [124]:
full_data.genres = full_data.descr.apply(lambda x: x.lower())

In [152]:
full_data.dropna(inplace=True)

In [153]:
# Векторизуем полученное описание - его будем использовать для предсказания оценки фильма

tfidf = TfidfVectorizer()
movies_tfidf = tfidf.fit_transform(full_data.descr)


In [154]:
# Дробим на тестовые выборки и выборки для обучения
X_train, X_test, y_train, y_test = train_test_split(movies_tfidf, full_data.rating, test_size=0.20, random_state=42)


In [155]:
# Используем модель KNN
model = KNeighborsRegressor(n_neighbors=10, metric='euclidean')


In [156]:
# Обучаем
model.fit(X_train, y_train)

KNeighborsRegressor(metric='euclidean', n_neighbors=10)

In [158]:
# Получаем предсказание
y_pred = model.predict(X_test)

In [159]:
# Замеряем качество
mean_squared_error(y_test, y_pred)

0.49681096648713347

### Выводы:
Качество получилось низкое. Видимо такой метод получения векторов неоптимальный.