# Домашнее задание по теме «Рекомендации на основе содержания»

## Задание

Преподаватель: Наталья Баданина, Юлия Пономарева, Егор Шишковец

1. Использовать датасет MovieLens.
2. Построить рекомендации (регрессия, предсказываем оценку) на фичах:
- TF-IDF на тегах и жанрах;
-  средние оценки (+ median, variance и т. д.) пользователя и фильма.
3. Оценить RMSE на тестовой выборке.

In [15]:
import pandas as pd
import numpy as np
from collections import Counter
from datetime import datetime
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline



In [16]:
# Основной датасет
links = pd.read_csv('ml-latest/links.csv')
movies = pd.read_csv('ml-latest/movies.csv')
ratings = pd.read_csv('ml-latest/ratings.csv')
tags = pd.read_csv('ml-latest/tags.csv')
genome_scores = pd.read_csv('ml-latest/genome-scores.csv')
genome_tags = pd.read_csv('ml-latest/genome-tags.csv')

# Маленький датасет
small_links = pd.read_csv('ml-latest-small/links.csv')
small_movies = pd.read_csv('ml-latest-small/movies.csv')
small_ratings = pd.read_csv('ml-latest-small/ratings.csv')
small_tags = pd.read_csv('ml-latest-small/tags.csv')



### Преобразуем таблицу ratings чтобы найти средние оценки для каждого фильма, а также медиану, дисперсию и стандартное отклонение

In [74]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503
3,1,260,4.5,1225735204
4,1,356,5.0,1225735119


In [169]:
# Создаем пустой DataFrame
movie_average_ratings = pd.DataFrame(columns=['movieid', 'movie_average_rating', 'movie_median_rating', 'movie_rating_variance', 'movie_rating_std'])

for movieid in tqdm(ratings['movieId'].unique(), desc="Processing movies"):
    
    # Сделаем выборку по конкретному movieId
    movie_ratings = ratings[ratings['movieId'] == movieid]['rating']


    if len(movie_ratings) > 1:
        movie_average_rating = movie_ratings.mean()
        movie_median_rating = movie_ratings.median()
        movie_rating_variance = movie_ratings.var()
        movie_rating_std = movie_ratings.std()
    
    # Так как есть movieId где оценка была поставлена только 1 раз, то нужно прописать условие, 
    # что в таком случае variance и STD будут равны 0, иначе получим Nan   
    elif len(movie_ratings) == 1:
        movie_average_rating = movie_ratings.iloc[0]
        movie_median_rating = movie_ratings.iloc[0]
        movie_rating_variance = 0
        movie_rating_std = 0

    # Добавляем результаты вычислений в пустой датафрейм
    row_to_add = pd.DataFrame({'movieId': [movieid],
                               'movie_average_rating': [movie_average_rating],
                               'movie_median_rating': [movie_median_rating],
                               'movie_rating_variance': [movie_rating_variance],
                               'movie_rating_std': [movie_rating_std]})
    movie_average_ratings = pd.concat([movie_average_ratings, row_to_add], ignore_index=True)
 


Processing movies:   0%|          | 0/83239 [00:00<?, ?it/s]

In [170]:
movie_average_ratings.head(10)

Unnamed: 0,movieid,movie_average_rating,movie_median_rating,movie_rating_variance,movie_rating_std
0,1,3.893508,4.0,0.863236,0.929105
1,110,3.996166,4.0,0.942449,0.970798
2,158,2.888675,3.0,1.102736,1.050112
3,260,4.0924,4.0,1.020214,1.010057
4,356,4.068189,4.0,0.875435,0.935647
5,381,3.402407,3.0,0.950982,0.975183
6,596,3.442332,3.5,1.068525,1.033695
7,1036,3.942671,4.0,0.757773,0.870501
8,1049,3.40423,3.5,0.938032,0.968521
9,1066,3.961243,4.0,0.868208,0.931777


In [171]:
len(movie_average_ratings)

83239

In [172]:
# Сохраним получившийся датафрейм отдельно чтобы не делать повторно расчеты

movie_average_ratings.to_csv('My DataFrames/movie_average_ratings.csv', index=False)


In [265]:
# Для загрузки датафрейма

movie_average_ratings = pd.read_csv('My DataFrames/movie_average_ratings.csv')


In [29]:
# For small

# Создаем пустой DataFrame
small_movie_average_ratings = pd.DataFrame(columns=['movieId', 'movie_average_rating', 'movie_median_rating', 'movie_rating_variance', 'movie_rating_std'])

for movieid in tqdm(small_ratings['movieId'].unique(), desc="Processing movies"):
    
    # Сделаем выборку по конкретному movieId
    movie_ratings = small_ratings[small_ratings['movieId'] == movieid]['rating']


    if len(movie_ratings) > 1:
        movie_average_rating = movie_ratings.mean()
        movie_median_rating = movie_ratings.median()
        movie_rating_variance = movie_ratings.var()
        movie_rating_std = movie_ratings.std()
    
    # Так как есть movieId где оценка была поставлена только 1 раз, то нужно прописать условие, 
    # что в таком случае variance и STD будут равны 0, иначе получим Nan   
    elif len(movie_ratings) == 1:
        movie_average_rating = movie_ratings.iloc[0]
        movie_median_rating = movie_ratings.iloc[0]
        movie_rating_variance = 0
        movie_rating_std = 0

    # Добавляем результаты вычислений в пустой датафрейм
    row_to_add = pd.DataFrame({'movieId': [movieid],
                               'movie_average_rating': [movie_average_rating],
                               'movie_median_rating': [movie_median_rating],
                               'movie_rating_variance': [movie_rating_variance],
                               'movie_rating_std': [movie_rating_std]})
    small_movie_average_ratings = pd.concat([small_movie_average_ratings, row_to_add], ignore_index=True)
 
  
    

Processing movies:   0%|          | 0/9724 [00:00<?, ?it/s]

In [27]:
small_movie_average_ratings.head(10)

Unnamed: 0,movieid,movie_average_rating,movie_median_rating,movie_rating_variance,movie_rating_std
0,1,3.92093,4.0,0.69699,0.834859
1,3,3.259615,3.0,1.112651,1.054823
2,6,3.946078,4.0,0.667856,0.817224
3,47,3.975369,4.0,0.850875,0.922429
4,50,4.237745,4.5,0.641475,0.800921
5,70,3.509091,4.0,1.032323,1.016033
6,101,3.782609,4.0,1.086957,1.042572
7,110,4.031646,4.0,0.936494,0.967726
8,151,3.545455,3.75,0.904863,0.951243
9,157,2.863636,3.0,1.204545,1.097518


### Преобразуем данные о жанрах в строки

In [258]:
movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [19]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [260]:
movies['movie_genres'] = [change_string(g) for g in movies.genres.values]

movies = movies.drop(columns=['genres'])

In [261]:
movies.head()

Unnamed: 0,movieId,title,movie_genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy


In [20]:
# For small dataset

small_movies['movie_genres'] = [change_string(g) for g in small_movies.genres.values]

small_movies = small_movies.drop(columns=['genres'])

In [21]:
small_movies.head()

Unnamed: 0,movieId,title,movie_genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy


### Скомбинируем тэги в строки

In [166]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,10,260,good vs evil,1430666558
1,10,260,Harrison Ford,1430666505
2,10,260,sci-fi,1430666538
3,14,1221,Al Pacino,1311600756
4,14,1221,mafia,1311600746


In [192]:
# Переведем все тэги в нижний регистр и уберем дубли для каждого фильма

tags['tag'] = tags['tag'].str.lower()
unique_movie_tags_df = tags.drop_duplicates(subset=['movieId', 'tag'])


In [272]:
def change_string(s):
    return str(s).replace(' ', '').replace('-', '').replace(':', '').replace('.', '')

# Создаем пустой DataFrame
movie_tags_combined = pd.DataFrame(columns=['movieId', 'movie_tags'])

for movie, group in tqdm(unique_movie_tags_df.groupby('movieId')):
    movie_tags = (' '.join([change_string(s) for s in group.tag.values]))
    movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)




  0%|          | 0/53452 [00:00<?, ?it/s]

  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'mo

  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'mo

  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'mo

  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'mo

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags

  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'mo

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags

  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'mo

  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'mo

  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'mo

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags

  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'mo

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags

  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'mo

  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'mo

  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  movie_tags_combined = movie_tags_combined.append([{'mo

In [273]:
# Сохраним получившийся датафрейм отдельно чтобы не делать повторно преобразования

movie_tags_combined.to_csv('My DataFrames/movie_tags_combined.csv', index=False)


In [275]:
# Для загрузки датафрейма
movie_tags_combined = pd.read_csv('My DataFrames/movie_tags_combined.csv')


In [274]:
movie_tags_combined.head(10)

Unnamed: 0,movieId,movie_tags
0,1,animation friendship toys disney pixar cgi cla...
1,2,animals basedonabook fantasy magicboardgame mo...
2,3,sequel moldy old oldage oldmen wedding oldpeop...
3,4,characters chickflick girlmovie revenge clv sl...
4,5,family pregnancy wedding 4thwall aging baby da...
5,6,alpacino complexcharacters crime philosophy re...
6,7,basedonaplay harrisonford paris romance siblin...
7,8,adaptedfrombook authormarktwain prospectprefer...
8,9,jeanclaudevandamme can'tremember clv 1990s act...
9,10,itsanokmovieifyoulikejamesbound 007 jamesbond ...


In [24]:
# Для маленького датасета

# Переведем все тэги в нижний регистр и уберем дубли для каждого фильма

small_tags['tag'] = small_tags['tag'].str.lower()
small_unique_movie_tags_df = small_tags.drop_duplicates(subset=['movieId', 'tag'])


def change_string(s):
    return str(s).replace(' ', '').replace('-', '').replace(':', '').replace('.', '')

# Создаем пустой DataFrame
small_movie_tags_combined = pd.DataFrame(columns=['movieId', 'movie_tags'])

for movie, group in tqdm(small_unique_movie_tags_df.groupby('movieId')):
    movie_tags = (' '.join([change_string(s) for s in group.tag.values]))
    small_movie_tags_combined = small_movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)





  0%|          | 0/1572 [00:00<?, ?it/s]

  small_movie_tags_combined = small_movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  small_movie_tags_combined = small_movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  small_movie_tags_combined = small_movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  small_movie_tags_combined = small_movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  small_movie_tags_combined = small_movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  small_movie_tags_combined = small_movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  small_movie_tags_combined = small_movie_tags_combined.append([{'movieId': movie, 'movie_tags': movie_tags}], ignore_index=True)
  small_movie_tags_combined = small_movie_tags_combined.append([{'movieId': movie, 'movie_

In [26]:
small_movie_tags_combined.head()

Unnamed: 0,movieId,movie_tags
0,1,pixar fun
1,2,fantasy magicboardgame robinwilliams game
2,3,moldy old
3,5,pregnancy remake
4,7,remake


### Объединим преобразованные датафреймы со средними оценками, тэгами и жанрами

In [279]:
combined_movie_df = pd.merge(movies, movie_tags_combined, on='movieId')
combined_movie_df = pd.merge(combined_movie_df, movie_average_ratings, on='movieId')



In [280]:
combined_movie_df.head()

Unnamed: 0,movieId,title,movie_genres,movie_tags,movie_average_rating,movie_median_rating,movie_rating_variance,movie_rating_std
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,animation friendship toys disney pixar cgi cla...,3.893508,4.0,0.863236,0.929105
1,2,Jumanji (1995),Adventure Children Fantasy,animals basedonabook fantasy magicboardgame mo...,3.278179,3.0,0.919146,0.958721
2,3,Grumpier Old Men (1995),Comedy Romance,sequel moldy old oldage oldmen wedding oldpeop...,3.171271,3.0,1.032378,1.01606
3,4,Waiting to Exhale (1995),Comedy Drama Romance,characters chickflick girlmovie revenge clv sl...,2.868395,3.0,1.238456,1.112859
4,5,Father of the Bride Part II (1995),Comedy,family pregnancy wedding 4thwall aging baby da...,3.076957,3.0,0.999141,0.99957


In [281]:
# Сохраним получившийся датафрейм отдельно чтобы не делать повторно преобразования
combined_movie_df.to_csv('My DataFrames/combined_movie_df.csv', index=False)



In [8]:
# Для загрузки датафрейма
combined_movie_df = pd.read_csv('My DataFrames/combined_movie_df.csv')


In [30]:
small_combined_movie_df = pd.merge(small_movies, small_movie_tags_combined, on='movieId')
small_combined_movie_df = pd.merge(small_combined_movie_df, small_movie_average_ratings, on='movieId')



In [35]:
# Сохраним получившийся датафрейм отдельно чтобы не делать повторно преобразования
small_combined_movie_df.to_csv('My DataFrames/small_combined_movie_df.csv', index=False)


In [None]:
# Для загрузки датафрейма
small_combined_movie_df = pd.read_csv('My DataFrames/small_combined_movie_df.csv')


### Выберем пользователя, для которого будем тренировать модель рекомендаций

In [73]:
#Получим список пользователей, которые поставили больше всего оценок и выберем одного для построения рекомендации

user_ratings = ratings['userId'].value_counts()
user_ratings.head(20)

189614    33332
48766      9554
207216     9178
175998     9016
76618      8919
230765     7719
184775     7535
236260     7488
233891     7372
214831     7266
221500     6380
267315     6199
193414     6074
113052     5976
100696     5806
256904     5805
211359     5784
177589     5693
134353     5653
73700      5649
Name: userId, dtype: int64

In [77]:
# Возьмем пользователя 100696
user_100696_ratings = ratings[ratings['userId']==100696][['movieId','rating']]

user_100696_ratings.head()

Unnamed: 0,movieId,rating
10228627,1,3.0
10228628,2,4.0
10228629,3,3.0
10228630,6,3.5
10228631,7,3.5


In [78]:
# Добавим данные об оценках пользователя в скомбинированный датасет с нашими вычислениями, тэгами и жанрами

user_100696_small_combined_movie_df = pd.merge(small_combined_movie_df, user_100696_ratings, on='movieId')

user_100696_small_combined_movie_df.head()

Unnamed: 0,movieId,title,movie_genres,movie_tags,movie_average_rating,movie_median_rating,movie_rating_variance,movie_rating_std,rating
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar fun,3.92093,4.0,0.69699,0.834859,3.0
1,2,Jumanji (1995),Adventure Children Fantasy,fantasy magicboardgame robinwilliams game,3.431818,3.5,0.777419,0.881713,4.0
2,3,Grumpier Old Men (1995),Comedy Romance,moldy old,3.259615,3.0,1.112651,1.054823,3.0
3,7,Sabrina (1995),Comedy Romance,remake,3.185185,3.0,0.955625,0.977561,3.5
4,11,"American President, The (1995)",Comedy Drama Romance,politics president,3.671429,4.0,0.810766,0.900425,4.5


In [80]:
# Сохраним получившийся датафрейм отдельно чтобы не делать повторно преобразования
user_100696_small_combined_movie_df.to_csv('My DataFrames/user_100696_small_combined_movie_df.csv', index=False)

# Для загрузки датафрейма
#user_100696_small_combined_movie_df = pd.read_csv('My DataFrames/user_100696_small_combined_movie_df.csv')



### Разделение данных на тренировочные и тестовые выборки

In [81]:
from sklearn.model_selection import train_test_split

x = user_100696_small_combined_movie_df.drop(columns=['movieId', 'title', 'rating'])  # Признаки для обучения модели

y = user_100696_small_combined_movie_df['rating']  # Целевая переменная - рейтинг фильма, который поставил пользователь

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


In [86]:
x_train.shape

(561, 6)

### Преобразование фичей в векторы


In [103]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

# Фичи, которые оставим без преобразований
x_numeric_features = ['movie_average_rating', 'movie_median_rating', 'movie_rating_variance', 'movie_rating_std']

# Применим TF-IDF к 'movie_tags'
tfidf_tags = TfidfVectorizer()
x_train_tags = tfidf_tags.fit_transform(x_train['movie_tags'])

# Применим TF-IDF к 'movie_genres'
tfidf_genres = TfidfVectorizer()
x_train_genres = tfidf_genres.fit_transform(x_train['movie_genres'])

# Объеденим получившиеся векторы и нетронутые фичи
x_train_combined = hstack([x_train_tags, x_train_genres, x_train[x_numeric_features]]).tocsr()



In [104]:
# Видим что количество признаков увеличилось с 6 до 866

x_train_combined.shape

(561, 866)

In [105]:
# Сделаем аналогичное преобразование для x_test

x_test_tags = tfidf_tags.transform(x_test['movie_tags'])
x_test_genres = tfidf_genres.transform(x_test['movie_genres'])

x_test_combined = hstack([x_test_tags, x_test_genres, x_test[x_numeric_features]]).tocsr()



In [106]:
x_test_combined.shape

(141, 866)

### Построим модель линейной регресси и оценим по RMSE

In [107]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np


lr = LinearRegression()

lr.fit(x_train_combined, y_train)

y_pred = lr.predict(x_test_combined)



Root Mean Squared Error (RMSE): 1.6602038449895862


In [108]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("Получили RMSE равное:", rmse)

Получили RMSE равное: 1.6602038449895862
