
    Использовать датасет MovieLens.
    Построить рекомендации (регрессия, предсказываем оценку) на фичах:

    TF-IDF на тегах и жанрах;
    средние оценки (+ median, variance и т. д.) пользователя и фильма.

    Оценить RMSE на тестовой выборке.



In [None]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.decomposition import PCA

In [None]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [None]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


## Посчитаем метрики по юзерам и фильмам

In [None]:
user_metrix = ratings.groupby('userId')['rating'].agg(['mean', 'median', 'var', ('mode', lambda x: pd.Series.mode(x)[0])]).reset_index()
movie_metrix = ratings.groupby('movieId')['rating'].agg(['mean', 'median', 'var', ('mode', lambda x: pd.Series.mode(x)[0])]).reset_index()

## Обогатим данные тегами для фильмов

In [None]:
mr = movies.merge(tags[['userId', 'movieId', 'tag']], how='inner', left_on='movieId', right_on='movieId')
mr

Unnamed: 0,movieId,title,genres,userId,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,pixar
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,fun
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,fantasy
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,magic board game
...,...,...,...,...,...
3678,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62,star wars
3679,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,anime
3680,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,comedy
3681,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,gintama


## Приведем все к нижнему регистру и уберем лишние символы

In [None]:
def genres(genres):
  return ' '.join(genres.replace(' ', '').replace('-', '').lower().split('|'))

In [None]:
mr['genres'] = mr['genres'].apply(genres)

In [None]:
mr['tag'] = mr['tag'].str.lower()
mr

Unnamed: 0,movieId,title,genres,userId,tag
0,1,Toy Story (1995),adventure animation children comedy fantasy,336,pixar
1,1,Toy Story (1995),adventure animation children comedy fantasy,474,pixar
2,1,Toy Story (1995),adventure animation children comedy fantasy,567,fun
3,2,Jumanji (1995),adventure children fantasy,62,fantasy
4,2,Jumanji (1995),adventure children fantasy,62,magic board game
...,...,...,...,...,...
3678,187595,Solo: A Star Wars Story (2018),action adventure children scifi,62,star wars
3679,193565,Gintama: The Movie (2010),action animation comedy scifi,184,anime
3680,193565,Gintama: The Movie (2010),action animation comedy scifi,184,comedy
3681,193565,Gintama: The Movie (2010),action animation comedy scifi,184,gintama


## Обогатим данные и удалим посутые стриочки для лучшего предсказания

In [None]:
mru = mr.merge(user_metrix, how='left', left_on='userId', right_on='userId')
mrum = mru.merge(movie_metrix, how='left', left_on='movieId', right_on='movieId', suffixes=('user', 'movie'))
mrum['description'] = mrum['genres'] + ' ' + mrum['tag']
mrum = mrum.merge(ratings[['movieId', 'rating']], how='left', left_on='movieId', right_on='movieId')
mrum.drop(columns=['genres', 'tag', 'movieId', 'title', 'userId'], inplace=True)
mrum.dropna(inplace=True)

In [None]:
mrum.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 232993 entries, 0 to 233229
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   meanuser     232993 non-null  float64
 1   medianuser   232993 non-null  float64
 2   varuser      232993 non-null  float64
 3   modeuser     232993 non-null  float64
 4   meanmovie    232993 non-null  float64
 5   medianmovie  232993 non-null  float64
 6   varmovie     232993 non-null  float64
 7   modemovie    232993 non-null  float64
 8   description  232993 non-null  object 
 9   rating       232993 non-null  float64
dtypes: float64(9), object(1)
memory usage: 19.6+ MB


In [None]:
mrum.head()

Unnamed: 0,meanuser,medianuser,varuser,modeuser,meanmovie,medianmovie,varmovie,modemovie,description,rating
0,4.321429,4.5,0.422078,5.0,3.92093,4.0,0.69699,4.0,adventure animation children comedy fantasy pixar,4.0
1,4.321429,4.5,0.422078,5.0,3.92093,4.0,0.69699,4.0,adventure animation children comedy fantasy pixar,4.0
2,4.321429,4.5,0.422078,5.0,3.92093,4.0,0.69699,4.0,adventure animation children comedy fantasy pixar,4.5
3,4.321429,4.5,0.422078,5.0,3.92093,4.0,0.69699,4.0,adventure animation children comedy fantasy pixar,2.5
4,4.321429,4.5,0.422078,5.0,3.92093,4.0,0.69699,4.0,adventure animation children comedy fantasy pixar,4.5


## TFiDF

In [None]:
def tfidf_vectors(df):
  tfidf = TfidfVectorizer()
  tfidf_matrix = tfidf.fit_transform(df.description)
  names = tfidf.get_feature_names_out()
  tfidf_matrix = pd.DataFrame(tfidf_matrix.toarray(), columns=names)
  return tfidf_matrix

In [None]:
df = pd.concat([tfidf_vectors(mrum), mrum], axis=1)
df

Unnamed: 0,06,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001,250,...,meanuser,medianuser,varuser,modeuser,meanmovie,medianmovie,varmovie,modemovie,description,rating
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.321429,4.5,0.422078,5.0,3.92093,4.0,0.69699,4.0,adventure animation children comedy fantasy pixar,4.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.321429,4.5,0.422078,5.0,3.92093,4.0,0.69699,4.0,adventure animation children comedy fantasy pixar,4.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.321429,4.5,0.422078,5.0,3.92093,4.0,0.69699,4.0,adventure animation children comedy fantasy pixar,4.5
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.321429,4.5,0.422078,5.0,3.92093,4.0,0.69699,4.0,adventure animation children comedy fantasy pixar,2.5
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.321429,4.5,0.422078,5.0,3.92093,4.0,0.69699,4.0,adventure animation children comedy fantasy pixar,4.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233225,,,,,,,,,,,...,4.081967,4.0,0.509701,4.0,3.90000,4.0,0.55000,4.0,action adventure children scifi star wars,4.0
233226,,,,,,,,,,,...,4.081967,4.0,0.509701,4.0,3.90000,4.0,0.55000,4.0,action adventure children scifi star wars,4.0
233227,,,,,,,,,,,...,4.081967,4.0,0.509701,4.0,3.90000,4.0,0.55000,4.0,action adventure children scifi star wars,3.5
233228,,,,,,,,,,,...,4.081967,4.0,0.509701,4.0,3.90000,4.0,0.55000,4.0,action adventure children scifi star wars,3.0


In [None]:
df.dropna(inplace=True)

In [None]:
df.drop(columns=['description'], inplace=True)

In [None]:
df.head()

Unnamed: 0,06,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001,250,...,zooey,meanuser,medianuser,varuser,modeuser,meanmovie,medianmovie,varmovie,modemovie,rating
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.321429,4.5,0.422078,5.0,3.92093,4.0,0.69699,4.0,4.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.321429,4.5,0.422078,5.0,3.92093,4.0,0.69699,4.0,4.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.321429,4.5,0.422078,5.0,3.92093,4.0,0.69699,4.0,4.5
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.321429,4.5,0.422078,5.0,3.92093,4.0,0.69699,4.0,2.5
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.321429,4.5,0.422078,5.0,3.92093,4.0,0.69699,4.0,4.5


## Построение модели

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,:-1], df['rating'], test_size=42)

In [None]:
KNN = KNeighborsRegressor(n_neighbors=2, metric='euclidean')

In [None]:
# pca = PCA()
# pipe = Pipeline(steps=[("pca", pca),
#                         ("KNN", KNN)])
# n_neighbors = [2, 3, 5, 10, 15, 20, 30, 50, 100]
# algorithm = ["auto",  "ball_tree", "kd_tree", "brute"]
# metrics = ['cosine', 'euclidean', 'haversine', 'l1', 'l2', 'manhattan']
# n_components = list(range(1, df.shape[1]+1, 1))

# parameters = dict(pca__n_components = n_components,
#                   KNN__n_neighbors = n_neighbors,
#                   KNN__algorithm = algorithm,
#                   KNN__metric= metrics)

In [None]:
# clf = GridSearchCV(pipe, parameters)
# clf.fit(X_train, y_train)

In [None]:
KNN.fit(X_train, y_train)

KNeighborsRegressor(metric='euclidean', n_neighbors=2)

In [None]:
mean_squared_error(y_test, KNN.predict(X_test))

0.9136904761904762

## GridSearchCV вылетает по памяти ...