# Домашнее задание по теме «Рекомендации на основе содержания»


In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

from tqdm import tqdm_notebook

import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

In [3]:
movies = pd.read_csv('movies.csv')
links = pd.read_csv('links.csv')
tags = pd.read_csv('tags.csv')
ratings = pd.read_csv('ratings.csv')

In [4]:
# Подсчет среднего рейтинга фильма

mean_movie_ratings = ratings.groupby('movieId').mean().drop(['userId', 'timestamp'], 1)

In [5]:
mean_movie_ratings = mean_movie_ratings.reset_index()

In [6]:
# Объединяем датафрейм с средним рейтингом с датафреймом с фильмами

movies_with_mean_ratings = movies.merge(mean_movie_ratings, how='inner', on = 'movieId')

In [7]:
movies_with_mean_ratings

Unnamed: 0,movieId,title,genres,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.920930
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.431818
2,3,Grumpier Old Men (1995),Comedy|Romance,3.259615
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.357143
4,5,Father of the Bride Part II (1995),Comedy,3.071429
...,...,...,...,...
9719,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,4.000000
9720,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,3.500000
9721,193585,Flint (2017),Drama,3.500000
9722,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,3.500000


In [8]:
# Объединяем датафрейм с фильмами и их рейтингами с датайфреймом с тэгами

movies_with_tags = movies_with_mean_ratings.merge(tags, on='movieId', how='outer')

In [9]:
movies_with_tags.dropna(subset=['rating'], inplace=True)

In [10]:
movies_with_tags

Unnamed: 0,movieId,title,genres,rating,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.920930,336.0,pixar,1.139046e+09
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.920930,474.0,pixar,1.137207e+09
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.920930,567.0,fun,1.525286e+09
3,2,Jumanji (1995),Adventure|Children|Fantasy,3.431818,62.0,fantasy,1.528844e+09
4,2,Jumanji (1995),Adventure|Children|Fantasy,3.431818,62.0,magic board game,1.528844e+09
...,...,...,...,...,...,...,...
11827,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,4.000000,,,
11828,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,3.500000,,,
11829,193585,Flint (2017),Drama,3.500000,,,
11830,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,3.500000,,,


In [11]:
movies_with_tags.title.unique().shape

(9719,)

In [12]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [13]:
movie_genres = [change_string(g) for g in movies_with_tags.genres.values]

In [14]:
movies_with_tags['genres'] = movie_genres

In [15]:
movies_with_tags['tag'].fillna(value='No tag', inplace=True)

Создадим датафрейм для последующего анализа

In [16]:
tag_strings = []
movies = []
movies_id = []
genres = []
ratings = []

for movie, group in tqdm_notebook(movies_with_tags.groupby('title')):
    tag_strings.append(' '.join([str(s).replace(' ', '').replace('-', '') for s in group.tag.values]))
    movies.append(movie)
    movies_id.append([s for s in group.movieId.values][0])
    genres.append([s for s in group.genres.values][0])
    ratings.append(round([s for s in group.rating.values][0], 2))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for movie, group in tqdm_notebook(movies_with_tags.groupby('title')):


  0%|          | 0/9719 [00:00<?, ?it/s]

In [17]:
movies_for_rs = pd.DataFrame(
    {
        'movies_id': movies_id,
        'movies': movies,
        'genres': genres,
        'tags': tag_strings,
        'ratings': ratings,
        }
)

# Обучение модели

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
bow = TfidfVectorizer()

genre_bow = bow.fit_transform(movies_for_rs['genres'])
tags_bow = bow.fit_transform(movies_for_rs['tags'])

print(genre_bow.shape, tags_bow.shape)

(9719, 20) (9719, 1470)


In [20]:
from scipy.sparse import hstack
X = hstack([genre_bow, tags_bow])

print(type(X))
print(X.shape)
print(X.count_nonzero())

<class 'scipy.sparse.coo.coo_matrix'>
(9719, 1490)
33778


In [21]:
# Линейная регрессия с параметрами на полученной матрице
y = movies_for_rs['ratings']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

LinearRegression()

In [22]:
y_pred = lin_reg.predict(X_test)

In [23]:
# Вычислим RMSE
from sklearn.metrics import mean_squared_error
print('MSE:', mean_squared_error(y_test, y_pred))


MSE: 0.6561366042388582


Результат не очень хороший