# Домашнее задание по теме "Рекомендации на основе содержания"

## Задание

1. Использовать датасет [MovieLens](https://grouplens.org/datasets/movielens/latest/)
2. Построить рекомендации (регрессия, предсказываем оценку) на фичах:
   - TF-IDF на тегах и жанрах;
   - средние оценки (+ median, variance и т.д.) пользователя фильма.
4. Оценить RMSE на тестовой выборке.

## Решение

In [65]:
import pandas as pd
import numpy as np

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

### Загрузка данных

In [2]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
movies.shape

(9742, 3)

- Всего фильмов: 9742

In [5]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [6]:
tags.shape

(3683, 4)

In [7]:
tags.tag.unique().shape

(1589,)

In [8]:
tags.movieId.unique().shape

(1572,)

In [9]:
tags.userId.unique().shape

(58,)

- 58 пользователей проставили теги 1572 фильмам.
- Всего было проставлено тегов: 3683.
- Из них уникальных тегов: 1589.

In [10]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [11]:
ratings.shape

(100836, 4)

In [12]:
ratings.movieId.unique().shape

(9724,)

In [13]:
ratings.userId.unique().shape

(610,)

- 610 пользователей поставили оценку фильму.
- Получили оценку всего 9724 фильма.

### Задаем неодбходимые функции

In [14]:
def change_string(s: str)-> str:
    """Удаляет из строки пробелы и дефисы"""
    return str(s).replace(' ', '').replace('-', '')

### Работаем с товаром (фильмы):

#### TF-IDF-преобразование тегов

In [15]:
# Добваляем к тегам названия фильмов
movies_with_tags = movies.merge(tags, on='movieId')[['movieId', 'tag']]
movies_with_tags.head()

Unnamed: 0,movieId,tag
0,1,pixar
1,1,pixar
2,1,fun
3,2,fantasy
4,2,magic board game


In [16]:
# Гриппируем по фильмам:
title_groups = movies_with_tags.groupby('movieId')
title_groups.head(10)

Unnamed: 0,movieId,tag
0,1,pixar
1,1,pixar
2,1,fun
3,2,fantasy
4,2,magic board game
...,...,...
3678,187595,star wars
3679,193565,anime
3680,193565,comedy
3681,193565,gintama


In [17]:
# Создаем два списка: тегов и названий фильмов
# Одновременно удаляем из строк пробелы и дефисы

tags_lst = []
movieId_lst = []

for id, group in title_groups:
    tags_lst.append(' '.join([change_string(s) for s in group.tag.values]))
    movieId_lst.append(id)

In [18]:
tags_lst[:5]

['pixar pixar fun',
 'fantasy magicboardgame RobinWilliams game',
 'moldy old',
 'pregnancy remake',
 'remake']

In [19]:
movieId_lst[:5]

[1, 2, 3, 5, 7]

In [20]:
# На основе списка tags_lst создаем спарс-матрицу:
tfidf_tag = TfidfVectorizer()
X_train_tfidf_tag = tfidf_tag.fit_transform(tags_lst)
X_train_tfidf_tag

<1572x1472 sparse matrix of type '<class 'numpy.float64'>'
	with 3598 stored elements in Compressed Sparse Row format>

In [21]:
# Преобразование спар-матирцы в датафрейм:
tfidf_tag_df = pd.DataFrame(X_train_tfidf_tag.toarray(), columns=tfidf_tag.get_feature_names_out())

In [22]:
# Добавление в датафрейм столбца 'movieId'
tfidf_tag_df['movieId'] = movieId_lst
tfidf_tag_df.head()

Unnamed: 0,06oscarnominatedbestmovieanimation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001like,2danimation,...,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel,movieId
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7


#### TF-IDF-преобразование жанров

In [23]:
# Создаем список жанров каждого из фильмов:
# Одновременно удаляем прбелы и дефисы в строках
genres_lst = []
movieId_lst = []

for index, row in movies.iterrows():
    movieId_lst.append(row['movieId'])
    genres_lst.append(row['genres'])

In [24]:
genres_lst[:5]

['Adventure|Animation|Children|Comedy|Fantasy',
 'Adventure|Children|Fantasy',
 'Comedy|Romance',
 'Comedy|Drama|Romance',
 'Comedy']

In [25]:
movieId_lst[:5]

[1, 2, 3, 4, 5]

In [26]:
# На основе списка tags_lst создаем спарс-матрицу:
tfidf_genres = TfidfVectorizer()
X_train_tfidf_genres = tfidf_genres.fit_transform(genres_lst)
X_train_tfidf_genres

<9742x24 sparse matrix of type '<class 'numpy.float64'>'
	with 23219 stored elements in Compressed Sparse Row format>

In [27]:
# Преобразование спар-матирцы в датафрейм:
tfidf_genres_df = pd.DataFrame(X_train_tfidf_genres.toarray(), columns=tfidf_genres.get_feature_names_out())

In [28]:
# Добавление в датафрейм столбца 'movieId'
tfidf_genres_df['movieId'] = movieId_lst
tfidf_genres_df.head()

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,fi,...,musical,mystery,no,noir,romance,sci,thriller,war,western,movieId
0,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,0.593662,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
2,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0,3
3,0.0,0.0,0.0,0.0,0.505015,0.0,0.0,0.466405,0.0,0.0,...,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0,4
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5


In [29]:
tfidf_genres_df.shape

(9742, 25)

In [30]:
tfidf_tag_df.shape

(1572, 1473)

#### Объединим полученные датафреймы

In [31]:
# Объединение датафреймов
tfidf_df = tfidf_genres_df.merge(tfidf_tag_df, on='movieId')
tfidf_df.head()

Unnamed: 0,action_x,adventure_x,animation_x,children_x,comedy_x,crime_x,documentary_x,drama_x,fantasy_x,fi,...,worldwari,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
0,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,0.593662,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
# Переместим столбец 'movieId' на первое место:
temp = tfidf_df['movieId']
tfidf_df = tfidf_df.drop('movieId', axis=1)
tfidf_df.insert(0, 'movieId', temp)
tfidf_df.head()

Unnamed: 0,movieId,action_x,adventure_x,animation_x,children_x,comedy_x,crime_x,documentary_x,drama_x,fantasy_x,...,worldwari,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
0,1,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,0.593662,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
tfidf_df.shape

(1572, 1497)

#### Масшабирование метрики среднее значение рейтинга фильма

In [34]:
# соединим таблицу с рейтингами и названиями фильмов
joined_ratings = ratings.join(movies.set_index('movieId'), on='movieId')
joined_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [35]:
# Cреднее арифметическое значение рейтинга для каждого фильма:
mean_ratings = joined_ratings.groupby('movieId')['rating'].mean()
mean_ratings.head()

movieId
1    3.920930
2    3.431818
3    3.259615
4    2.357143
5    3.071429
Name: rating, dtype: float64

In [36]:
# Количество рейтингов, проставленных каждому фильму:
num_ratings  = joined_ratings['movieId'].value_counts()
num_ratings.head()

movieId
356     329
318     317
296     307
593     279
2571    278
Name: count, dtype: int64

In [37]:
# Простые статистики по количеству оценок
min_num_ratings = np.min(num_ratings)
max_num_ratings = np.max(num_ratings)
mean_num_ratings = np.mean(num_ratings)
median_num_ratings = np.median(num_ratings)

In [38]:
norm_coef = max_num_ratings - min_num_ratings
norm_coef

328

In [39]:
movieId_norm_mark = []

# посчитаем нашу метрику для каждого фильма из датасета
for f in num_ratings.index:
    movieId_norm_mark.append(
        (f, mean_ratings[f] * (num_ratings[f] - min_num_ratings) / norm_coef)
    )
    
movieId_norm_mark = pd.DataFrame(movieId_norm_mark, columns=['movieId', 'norm_mark'])

movieId_norm_mark.head()

Unnamed: 0,movieId,norm_mark
0,356,4.164134
1,318,4.266985
2,296,3.915558
3,593,3.526947
4,2571,3.540572


In [40]:
movieId_norm_mark.shape

(9724, 2)

In [41]:
# Объединим датасеты
joined_tfidf_norm_mark = tfidf_df.merge(movieId_norm_mark, on='movieId')
joined_tfidf_norm_mark.shape

(1554, 1498)

In [42]:
joined_tfidf_norm_mark.head()

Unnamed: 0,movieId,action_x,adventure_x,animation_x,children_x,comedy_x,crime_x,documentary_x,drama_x,fantasy_x,...,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel,norm_mark
0,1,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.558168
1,2,0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,0.593662,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.140452
2,3,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.50683
3,5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.449477
4,7,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.514679


### Работаем с пользователями

In [43]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


Рассчитаем сколько фильмов просмотрел каждый из пользователей, а также среднее значение рейтинга для каждого пользователя:

In [44]:
user_activities = ratings[['userId', 'movieId', 'rating']].groupby(['userId']).agg({'movieId': ['count'], 'rating': ['mean']})

In [45]:
user_activities.head()

Unnamed: 0_level_0,movieId,rating
Unnamed: 0_level_1,count,mean
userId,Unnamed: 1_level_2,Unnamed: 2_level_2
1,232,4.366379
2,29,3.948276
3,39,2.435897
4,216,3.555556
5,44,3.636364


Добавим эти значения в датафрейм ratings:

In [46]:
ratings_count = ratings[['userId', 'movieId', 'rating']].merge(user_activities.movieId, on='userId')
ratings_count

Unnamed: 0,userId,movieId,rating,count
0,1,1,4.0,232
1,1,3,4.0,232
2,1,6,4.0,232
3,1,47,5.0,232
4,1,50,5.0,232
...,...,...,...,...
100831,610,166534,4.0,1302
100832,610,168248,5.0,1302
100833,610,168250,5.0,1302
100834,610,168252,5.0,1302


In [47]:
ratings_count_mean = ratings_count.merge(user_activities.rating, on='userId')
ratings_count_mean

Unnamed: 0,userId,movieId,rating,count,mean
0,1,1,4.0,232,4.366379
1,1,3,4.0,232,4.366379
2,1,6,4.0,232,4.366379
3,1,47,5.0,232,4.366379
4,1,50,5.0,232,4.366379
...,...,...,...,...,...
100831,610,166534,4.0,1302,3.688556
100832,610,168248,5.0,1302,3.688556
100833,610,168250,5.0,1302,3.688556
100834,610,168252,5.0,1302,3.688556


Соберем итоговый датафрейм:

In [48]:
data = ratings_count_mean.merge(joined_tfidf_norm_mark, on='movieId')

In [49]:
data

Unnamed: 0,userId,movieId,rating,count,mean,action_x,adventure_x,animation_x,children_x,comedy_x,...,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel,norm_mark
0,1,1,4.0,232,4.366379,0.0,0.416846,0.516225,0.504845,0.267586,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.558168
1,5,1,4.0,44,3.636364,0.0,0.416846,0.516225,0.504845,0.267586,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.558168
2,7,1,4.5,152,3.230263,0.0,0.416846,0.516225,0.504845,0.267586,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.558168
3,15,1,2.5,135,3.448148,0.0,0.416846,0.516225,0.504845,0.267586,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.558168
4,17,1,4.5,105,4.209524,0.0,0.416846,0.516225,0.504845,0.267586,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.558168
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48282,567,176419,3.0,385,2.245455,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009909
48283,599,176419,3.5,2478,2.642050,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009909
48284,594,7023,4.5,232,3.924569,0.0,0.000000,0.000000,0.000000,0.505015,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
48285,606,6107,4.0,1115,3.657399,0.0,0.000000,0.000000,0.000000,0.000000,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000


### Регрессионный анализ

In [50]:
# Целевая переменная
Y = data.loc[:, ['rating']]
Y.shape

(48287, 1)

In [51]:
X = data.drop(columns=['userId', 'movieId', 'rating'], axis=1)
X.shape

(48287, 1499)

In [52]:
# Разбиваем выборку на обучающую и тестовую:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=55)

#### Обучение модели

In [53]:
#lr_model = LinearRegression()

In [54]:
#lr_model.fit(X_train, Y_train)

In [67]:
lr_model = make_pipeline(
    StandardScaler(),
    LinearRegression()
)

In [68]:
lr_model.fit(X_train, Y_train)

Оценка метрик RMSE и R2

In [69]:
train_pred = lr_model.predict(X_train)
train_mse = mean_squared_error(Y_train, train_pred)
train_r2 = r2_score(Y_train, train_pred)

In [70]:
test_pred = lr_model.predict(X_test)
test_mse = mean_squared_error(Y_test, test_pred)
test_r2 = r2_score(Y_test, test_pred)

In [71]:
score_df = pd.DataFrame({'Train_mse':[train_mse], 'Test_mse':[test_mse], 'Train_r2':[train_r2], 'Test_r2':[test_r2]})
score_df

Unnamed: 0,Train_mse,Test_mse,Train_r2,Test_r2
0,0.684874,1.538849e+22,0.272939,-1.648041e+22


Затруднясь интерпретировать результаты...

Уж очень большие различия получились в зачениях метрик между обучающей и тестовой выборками.