# Задание 
1. Использовать dataset [MovieLens](https://grouplens.org/datasets/movielens/latest/)
2. Построить рекомендации (регрессия, предсказываем оценку) на фичах:
TF-IDF на тегах и жанрах
3. Средние оценки (+ median, variance, etc.) пользователя и фильма
4. Оценить RMSE на тестовой выборке

In [1]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

!pip install pymorphy2
from pymorphy2 import MorphAnalyzer

%matplotlib inline

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 2.6 MB/s 
[?25hCollecting pymorphy2-dicts-ru<3.0,>=2.4
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[K     |████████████████████████████████| 8.2 MB 8.0 MB/s 
[?25hCollecting dawg-python>=0.7.1
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Installing collected packages: pymorphy2-dicts-ru, dawg-python, pymorphy2
Successfully installed dawg-python-0.7.2 pymorphy2-0.9.1 pymorphy2-dicts-ru-2.4.417127.4579844


In [2]:
# links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [6]:
def change_string(s):
    return ' '.join(s.lower().replace(' ', '').replace('-', '').split('|'))

In [7]:
# Нормализуем жанры
movies['genres_formatted'] = movies.genres.apply(change_string)

In [8]:
movies=movies.drop(['genres'], axis=1)

In [9]:
# слепим теги каждого юзера,  если они относятся к одному фильму
tags['tag_new'] = tags[['userId','movieId','tag']].groupby(['userId','movieId'])['tag'].transform(lambda x: ' '.join(x))
tags=tags[['userId','movieId','tag_new']].drop_duplicates()

In [10]:
movies.head()

Unnamed: 0,movieId,title,genres_formatted
0,1,Toy Story (1995),adventure animation children comedy fantasy
1,2,Jumanji (1995),adventure children fantasy
2,3,Grumpier Old Men (1995),comedy romance
3,4,Waiting to Exhale (1995),comedy drama romance
4,5,Father of the Bride Part II (1995),comedy


In [11]:
import re

In [12]:
m = MorphAnalyzer()
def lemmatize(text):
    try:
        return " ".join([m.parse(w)[0].normal_form for w in text.split()])  
    except:
        return " "

In [13]:
tags['tag_normalized'] = tags['tag_new'].apply(lemmatize)

In [14]:
tags = tags.drop(['tag_new'], axis=1)

In [15]:
tags.head()

Unnamed: 0,userId,movieId,tag_normalized
0,2,60756,funny highly quotable will ferrell
3,2,89774,boxing story mma tom hardy
6,2,106782,drugs leonardo dicaprio martin scorsese
9,7,48516,way too long
10,18,431,al pacino gangster mafia


## *TF-IDF*

### *RandomForest*

In [16]:
# Разделим данные
X = ratings[['userId','movieId','rating']]

In [17]:
movies.head(2)

Unnamed: 0,movieId,title,genres_formatted
0,1,Toy Story (1995),adventure animation children comedy fantasy
1,2,Jumanji (1995),adventure children fantasy


In [18]:
X = pd.merge(X, movies[['movieId','genres_formatted']], on='movieId', how='left')

In [19]:
X.head()

Unnamed: 0,userId,movieId,rating,genres_formatted
0,1,1,4.0,adventure animation children comedy fantasy
1,1,3,4.0,comedy romance
2,1,6,4.0,action crime thriller
3,1,47,5.0,mystery thriller
4,1,50,5.0,crime mystery thriller


In [20]:
X.isnull().any()

userId              False
movieId             False
rating              False
genres_formatted    False
dtype: bool

In [21]:
y = X['rating']

In [22]:
X = X.drop(['userId'], axis = 1)
X = X.drop(['movieId'], axis = 1)
X = X.drop(['rating'], axis = 1)

In [23]:
X

Unnamed: 0,genres_formatted
0,adventure animation children comedy fantasy
1,comedy romance
2,action crime thriller
3,mystery thriller
4,crime mystery thriller
...,...
100831,drama horror thriller
100832,action crime thriller
100833,horror
100834,action scifi


In [24]:
vectors = CountVectorizer()
tf_idf = TfidfTransformer()

In [25]:
vec = vectors.fit_transform(X['genres_formatted'])
print(len(vectors.get_feature_names()))

20




In [26]:
vectors.get_feature_names()



['action',
 'adventure',
 'animation',
 'children',
 'comedy',
 'crime',
 'documentary',
 'drama',
 'fantasy',
 'filmnoir',
 'horror',
 'imax',
 'musical',
 'mystery',
 'nogenreslisted',
 'romance',
 'scifi',
 'thriller',
 'war',
 'western']

In [27]:
tf_idf = tf_idf.fit_transform(vec)

In [28]:
tf_idf

<100836x20 sparse matrix of type '<class 'numpy.float64'>'
	with 274480 stored elements in Compressed Sparse Row format>

In [29]:
X_train, X_test, y_train, y_test = train_test_split(tf_idf, y, test_size=0.3,random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((70585, 20), (30251, 20), (70585,), (30251,))

In [30]:
model = RandomForestRegressor(n_estimators=100, criterion='mse')
model.fit(X_train, y_train)



RandomForestRegressor(criterion='mse')

In [31]:
predictions = model.predict(X_test)
print('root_mean_squared_error = ', np.sqrt(mean_squared_error(y_test, predictions)))

root_mean_squared_error =  1.0055953697567397


In [32]:
print(model.feature_importances_)

[0.10010943 0.04960377 0.02417122 0.07735918 0.12264129 0.07840065
 0.00584422 0.17712514 0.04619235 0.00724288 0.06013573 0.01404834
 0.01267686 0.03724979 0.00053968 0.03571701 0.04414669 0.05353484
 0.03930554 0.01395538]


### *Metrics*

In [33]:
from scipy import stats

In [34]:
rating_agg = ratings.groupby(['movieId'])[['rating']].count()
rating_agg.colums = ['Count']
rating_agg['Mean'] = ratings.groupby(['movieId'])[['rating']].mean()
rating_agg['Median'] = ratings.groupby(['movieId'])[['rating']].median()
rating_agg['Var'] = ratings.groupby(['movieId'])[['rating']].var()
rating_agg['Mode'] = ratings.groupby(['movieId']).rating.agg(lambda x: stats.mode(x)[0])

  


In [35]:
rating_agg.reset_index().head()

Unnamed: 0,movieId,rating,Mean,Median,Var,Mode
0,1,215,3.92093,4.0,0.69699,4.0
1,2,110,3.431818,3.5,0.777419,4.0
2,3,52,3.259615,3.0,1.112651,3.0
3,4,7,2.357143,3.0,0.72619,3.0
4,5,49,3.071429,3.0,0.822917,3.0


In [36]:
ratings[['userId','movieId','rating']].shape

(100836, 3)