# Домашнее задание по теме "Рекомендации на основе содержания"

## Задание

1. Использовать датасет [MovieLens](https://grouplens.org/datasets/movielens/latest/)
2. Построить рекомендации (регрессия, предсказываем оценку) на фичах:
   - TF-IDF на тегах и жанрах;
   - средние оценки (+ median, variance и т.д.) пользователя фильма.
4. Оценить RMSE на тестовой выборке.

## Решение

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer

### Загрузка данных

In [2]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
movies.shape

(9742, 3)

- Всего фильмов: 9742

In [5]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [6]:
tags.shape

(3683, 4)

In [7]:
tags.tag.unique().shape

(1589,)

In [8]:
tags.movieId.unique().shape

(1572,)

In [9]:
tags.userId.unique().shape

(58,)

- 58 пользователей проставили теги 1572 фильмам.
- Всего было проставлено тегов: 3683.
- Из них уникальных тегов: 1589.

In [10]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [11]:
ratings.shape

(100836, 4)

In [12]:
ratings.movieId.unique().shape

(9724,)

In [13]:
ratings.userId.unique().shape

(610,)

- 610 пользователей поставили оценку фильму.
- Получили оценку всего 9724 фильма.

### Задаем неодбходимые функции

In [34]:
def change_string(s: str)-> str:
    """Удаляет из строки пробелы и дефисы"""
    return str(s).replace(' ', '').replace('-', '')

### Работаем с товаром (фильмы):

#### TF-IDF-преобразование тегов

In [14]:
# Добваляем к тегам названия фильмов
movies_with_tags = movies.merge(tags, on='movieId')[['movieId', 'tag']]
movies_with_tags.head()

Unnamed: 0,movieId,tag
0,1,pixar
1,1,pixar
2,1,fun
3,2,fantasy
4,2,magic board game


In [27]:
# Гриппируем по фильмам:
title_groups = movies_with_tags.groupby('movieId')
title_groups.head(10)

Unnamed: 0,movieId,tag
0,1,pixar
1,1,pixar
2,1,fun
3,2,fantasy
4,2,magic board game
...,...,...
3678,187595,star wars
3679,193565,anime
3680,193565,comedy
3681,193565,gintama


In [23]:
# Создаем два списка: тегов и названий фильмов
# Одновременно удаляем из строк пробелы и дефисы

tags_lst = []
movieId_lst = []

for id, group in title_groups:
    tags_lst.append(' '.join([change_string(s) for s in group.tag.values]))
    movieId_lst.append(id)

In [24]:
tags_lst[:5]

['pixar pixar fun',
 'fantasy magicboardgame RobinWilliams game',
 'moldy old',
 'pregnancy remake',
 'remake']

In [25]:
movieId_lst[:5]

[1, 2, 3, 5, 7]

In [26]:
# На основе списка tags_lst создаем спарс-матрицу:
tfidf_tag = TfidfVectorizer()
X_train_tfidf_tag = tfidf_tag.fit_transform(tags_lst)
X_train_tfidf_tag

<1572x1472 sparse matrix of type '<class 'numpy.float64'>'
	with 3598 stored elements in Compressed Sparse Row format>

In [32]:
# Преобразование спар-матирцы в датафрейм:
tfidf_tag_df = pd.DataFrame(X_train_tfidf_tag.toarray(), columns=tfidf_tag.get_feature_names_out())

In [33]:
# Добавление в датафрейм столбца 'movieId'
tfidf_tag_df['movieId'] = movieId_lst
tfidf_tag_df.head()

Unnamed: 0,06oscarnominatedbestmovieanimation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001like,2danimation,...,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel,movieId
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7


#### TF-IDF-преобразование жанров

In [87]:
# Создаем список жанров каждого из фильмов:
# Одновременно удаляем прбелы и дефисы в строках
genres_lst = []
movieId_lst = []

for index, row in movies.iterrows():
    movieId_lst.append(row['movieId'])
    genres_lst.append(row['genres'])

In [88]:
genres_lst[:5]

['Adventure|Animation|Children|Comedy|Fantasy',
 'Adventure|Children|Fantasy',
 'Comedy|Romance',
 'Comedy|Drama|Romance',
 'Comedy']

In [89]:
movieId_lst[:5]

[1, 2, 3, 4, 5]

In [90]:
# На основе списка tags_lst создаем спарс-матрицу:
tfidf_genres = TfidfVectorizer()
X_train_tfidf_genres = tfidf_genres.fit_transform(genres_lst)
X_train_tfidf_genres

<9742x24 sparse matrix of type '<class 'numpy.float64'>'
	with 23219 stored elements in Compressed Sparse Row format>

In [91]:
# Преобразование спар-матирцы в датафрейм:
tfidf_genres_df = pd.DataFrame(X_train_tfidf_genres.toarray(), columns=tfidf_genres.get_feature_names_out())

In [93]:
# Добавление в датафрейм столбца 'movieId'
tfidf_genres_df['movieId'] = movieId_lst
tfidf_genres_df.head()

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,fi,...,musical,mystery,no,noir,romance,sci,thriller,war,western,movieId
0,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,0.593662,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
2,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0,3
3,0.0,0.0,0.0,0.0,0.505015,0.0,0.0,0.466405,0.0,0.0,...,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0,4
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5


In [94]:
tfidf_genres_df.shape

(9742, 25)

In [96]:
tfidf_tag_df.shape

(1572, 1473)

#### Объединим полученные датафреймы

In [102]:
# Объединение датафреймов
tfidf_df = tfidf_genres_df.merge(tfidf_tag_df, on='movieId')
tfidf_df.head()

Unnamed: 0,action_x,adventure_x,animation_x,children_x,comedy_x,crime_x,documentary_x,drama_x,fantasy_x,fi,...,worldwari,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
0,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,0.593662,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [105]:
# Переместим столбец 'movieId' на первое место:
temp = tfidf_df['movieId']
tfidf_df = tfidf_df.drop('movieId', axis=1)
tfidf_df.insert(0, 'movieId', temp)
tfidf_df.head()

Unnamed: 0,movieId,action_x,adventure_x,animation_x,children_x,comedy_x,crime_x,documentary_x,drama_x,fantasy_x,...,worldwari,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
0,1,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,0.593662,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Посчитаем, какое количество раз проставлялись каждому фильму теги

In [109]:
tags.groupby('movieId')['tag'].count()

movieId
1         3
2         4
3         2
5         2
7         1
         ..
183611    3
184471    3
187593    3
187595    2
193565    4
Name: tag, Length: 1572, dtype: int64

Масшабирование метрики среднее значение рейтинга фильма

In [111]:
# соединим таблицу с рейтингами и названиями фильмов
joined_ratings = ratings.join(movies.set_index('movieId'), on='movieId')
joined_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [114]:
# Cреднее арифметическое значение рейтинга для каждого фильма:
mean_ratings = joined_ratings.groupby('movieId')['rating'].mean()
mean_ratings.head()

movieId
1    3.920930
2    3.431818
3    3.259615
4    2.357143
5    3.071429
Name: rating, dtype: float64

In [116]:
# Количество рейтингов, проставленных каждому фильму:
num_ratings  = joined_ratings['movieId'].value_counts()
num_ratings.head()

movieId
356     329
318     317
296     307
593     279
2571    278
Name: count, dtype: int64

In [118]:
# Простые статистики по количеству оценок
min_num_ratings = np.min(num_ratings)
max_num_ratings = np.max(num_ratings)
mean_num_ratings = np.mean(num_ratings)
median_num_ratings = np.median(num_ratings)

In [119]:
norm_coef = max_num_ratings - min_num_ratings
norm_coef

328

In [135]:
movieId_norm_mark = []

# посчитаем нашу метрику для каждого фильма из датасета
for f in num_ratings.index:
    movieId_norm_mark.append(
        (f, mean_ratings[f] * (num_ratings[f] - min_num_ratings) / norm_coef)
    )
    
movieId_norm_mark = pd.DataFrame(movieId_norm_mark, columns=['movieId', 'norm_mark'])

movieId_norm_mark.head()

Unnamed: 0,movieId,norm_mark
0,356,4.164134
1,318,4.266985
2,296,3.915558
3,593,3.526947
4,2571,3.540572
