In [51]:
import pandas as pd
import numpy as np
from collections import Counter
from datetime import datetime
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
%matplotlib inline

In [2]:
# Switch the kernel's working directory to the dataset folder so the relative
# CSV paths in the next cell resolve.
# NOTE(review): the path is relative, so re-running this cell in the same kernel
# looks for a nested `ml-latest-small/ml-latest-small` directory and fails.
%cd ml-latest-small

d:\Machine_Learning\RS_item_to_item\ml-latest-small


In [3]:
# Load the four `ml-latest-small` CSV tables from the current working directory.
links, movies, ratings, tags = map(
    pd.read_csv,
    ('links.csv', 'movies.csv', 'ratings.csv', 'tags.csv'),
)

In [4]:
# Preview the raw tag assignments (columns: userId, movieId, tag, timestamp).
tags

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


In [5]:
# Preview the movie catalogue (columns: movieId, title, pipe-separated genres).
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [16]:
# Peek at the first rating rows (columns: userId, movieId, rating, timestamp).
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
def _normalize_tag(tag):
    """Collapse a free-text tag into a single token by removing spaces and hyphens."""
    return tag.replace(' ', '').replace('-', '')


# Build one "document" of tag tokens per tagged movie: [movieId, 'tok1 tok2 ...'].
# A single groupby pass replaces re-filtering the whole `tags` frame once per movie.
# sort=False keeps movies in order of first appearance, and .unique() keeps tags in
# order of first appearance, so the rows match what the per-movie filter produced.
# .dropna() skips missing tags, which have no .replace() method.
new_tags = [
    [movie_id, ' '.join(_normalize_tag(tag) for tag in movie_tags.dropna().unique())]
    for movie_id, movie_tags in tags.groupby('movieId', sort=False)['tag']
]

In [7]:
# Turn the [movieId, joined-tags] rows into a two-column frame, one row per tagged movie.
# NOTE: this rebinds `new_tags` from a list of rows to a DataFrame.
new_tags = pd.DataFrame(
    new_tags,
    columns=['movieId', 'tag'],
)

In [8]:
def _genres_to_document(raw_genres):
    """Convert a pipe-separated genre string into space-separated tokens.

    Spaces and hyphens inside a genre are removed first, so that e.g.
    'Sci-Fi|Film-Noir' becomes 'SciFi FilmNoir'.
    """
    squashed = raw_genres.replace(' ', '').replace('-', '')
    return ' '.join(squashed.split('|'))


# One whitespace-separated "document" of genre tokens per movie, ready for TF-IDF.
genres = movies['genres'].map(_genres_to_document)

In [416]:
# Inspect the movieId column; it is used as the index of the genre TF-IDF frame.
movies['movieId']

0            1
1            2
2            3
3            4
4            5
         ...  
9737    193581
9738    193583
9739    193585
9740    193587
9741    193609
Name: movieId, Length: 9742, dtype: int64

In [9]:
# Fit a TF-IDF vectorizer on the per-movie genre documents; each genre token
# becomes one feature column.
# NOTE: despite its name, X_train_tfidf covers every row of `movies`, not a train split.
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(genres)

In [10]:
# Densify the genre TF-IDF matrix into a frame indexed by movieId,
# with one column per genre token learned by the vectorizer.
df = pd.DataFrame(
    X_train_tfidf.toarray(),
    index=movies['movieId'],
    columns=tfidf.get_feature_names_out(),
)

In [11]:
# Inner-join every rating with the genre TF-IDF features of the rated movie.
# `df` carries movieId as its index name, which `on='movieId'` resolves.
ratings_with_genres = pd.merge(ratings, df, on='movieId')

In [12]:
# Show the ratings joined with the genre TF-IDF columns.
ratings_with_genres

Unnamed: 0,userId,movieId,rating,timestamp,action,adventure,animation,children,comedy,crime,...,horror,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western
0,1,1,4.0,964982703,0.000000,0.416846,0.516225,0.504845,0.267586,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1,5,1,4.0,847434962,0.000000,0.416846,0.516225,0.504845,0.267586,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2,7,1,4.5,1106635946,0.000000,0.416846,0.516225,0.504845,0.267586,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
3,15,1,2.5,1510577970,0.000000,0.416846,0.516225,0.504845,0.267586,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
4,17,1,4.5,1305696483,0.000000,0.416846,0.516225,0.504845,0.267586,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,610,160341,2.5,1479545749,0.711811,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.702371,0.0,0.0
100832,610,160527,4.5,1479544998,0.598085,0.000000,0.000000,0.000000,0.000000,0.692392,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
100833,610,160836,3.0,1493844794,0.641630,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.633120,0.0,0.0
100834,610,163937,3.5,1493848789,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.780971,0.0,0.0,0.0,0.0,0.0,0.0,0.624567,0.0,0.0


In [13]:
# Fit a second TF-IDF vectorizer, this time on the per-movie tag documents.
# NOTE: this rebinds `tfidf`; the genre vectorizer fitted earlier is no longer
# reachable under that name after this cell runs.
tfidf = TfidfVectorizer()
x_tags_train = tfidf.fit_transform(new_tags['tag'].values)

In [14]:
# Densify the tag TF-IDF matrix into a frame indexed by movieId,
# with one column per tag token learned by the vectorizer.
tags_ = pd.DataFrame(
    x_tags_train.toarray(),
    index=new_tags['movieId'],
    columns=tfidf.get_feature_names_out(),
)

In [21]:
# Left-join the tag TF-IDF features onto the ratings+genres frame; movies that
# have no tags keep their rows and get NaN in every tag column.
# NOTE(review): a tag token that equals an existing column name (e.g. a genre
# token) would receive pandas' default `_x` / `_y` suffixes — TODO confirm.
rating_tags_genres = pd.merge(
    ratings_with_genres,
    tags_,
    how='left',
    on='movieId',
)

In [17]:
# Drop the rating timestamp. Re-assigning with errors='ignore' keeps the cell
# idempotent: the in-place drop raised KeyError when the cell was run a second
# time, because the column was already gone.
rating_tags_genres = rating_tags_genres.drop(columns='timestamp', errors='ignore')

In [18]:
# Mean rating given by each user and mean rating received by each movie.
# Selecting the `rating` column before aggregating avoids averaging the other
# columns only to discard them.
# NOTE(review): both means are computed over ALL ratings, before the later
# train/test split, so held-out targets leak into these features — consider
# computing them on the training rows only.
user_rate = ratings.groupby('userId')['rating'].mean()
film_rate = ratings.groupby('movieId')['rating'].mean()

In [475]:
# Display the per-user and per-movie mean ratings together (as a tuple).
user_rate, film_rate

(userId
 1      4.366379
 2      3.948276
 3      2.435897
 4      3.555556
 5      3.636364
          ...   
 606    3.657399
 607    3.786096
 608    3.134176
 609    3.270270
 610    3.688556
 Name: rating, Length: 610, dtype: float64,
 movieId
 1         3.920930
 2         3.431818
 3         3.259615
 4         2.357143
 5         3.071429
             ...   
 193581    4.000000
 193583    3.500000
 193585    3.500000
 193587    3.500000
 193609    4.000000
 Name: rating, Length: 9724, dtype: float64)

In [33]:
# Attach the per-user and per-movie mean ratings to every rating row.
# All three series are named `rating`, so pandas' default suffixes apply:
#   rating_x -> the actual rating (later used as the target)
#   rating_y -> the user's mean rating
#   rating   -> the movie's mean rating
# (assumes no tag/genre column is itself called `rating` — TODO confirm)
completeDF = (
    rating_tags_genres
    .merge(user_rate, how='left', on='userId')
    .merge(film_rate, how='left', on='movieId')
)

In [36]:
# Summarise the shape, dtypes and memory footprint of the merged feature frame.
completeDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Columns: 1498 entries, userId to rating
dtypes: float64(1495), int64(3)
memory usage: 1.1 GB


In [35]:
# Replace NaNs with 0 in place. NaNs come from the left merges; e.g. rated movies
# that are absent from the tag matrix have NaN in every tag column.
completeDF.fillna(0, inplace=True)

In [39]:
# Remove every column whose mean is exactly 0. For the non-negative TF-IDF
# columns this means the column is zero in all rows and carries no signal.
# The means are computed in one vectorized pass and all such columns are dropped
# at once, instead of rebuilding the whole frame once per dropped column.
column_means = completeDF.mean()
all_zero_columns = column_means.index[column_means == 0]
completeDF = completeDF.drop(columns=all_zero_columns)

In [42]:
# Discard fully identical rows, keeping the first occurrence; the index is not reset.
completeDF = completeDF.drop_duplicates()

In [44]:
# Target: the actual rating (`rating_x`). Features: everything except identifiers,
# the target itself and the raw timestamp.
# errors='ignore' is needed because `timestamp` has normally already been dropped
# by an earlier cell; without it a fresh top-to-bottom run raises KeyError here.
# A missing `rating_x` still fails loudly on the target lookup below.
y = completeDF['rating_x']
x = completeDF.drop(
    columns=['userId', 'movieId', 'rating_x', 'timestamp'],
    errors='ignore',
)

In [45]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
# NOTE(review): the per-user / per-movie mean-rating features were computed from all
# ratings before this split, so the held-out targets contributed to the features.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [56]:
# Gradient-boosted trees optimising absolute error (MAE).
# random_state pins the feature permutation used at each split, so repeated runs
# give the same model; it uses the same seed as the train/test split.
model = GradientBoostingRegressor(
    loss='absolute_error',
    learning_rate=0.1,  # same value as the former `10e-2`, spelled unambiguously
    min_samples_leaf=10,
    random_state=42,
)

In [57]:
# Train the gradient-boosting model on the training split.
model.fit(x_train, y_train)

In [58]:
# Predict ratings for the held-out rows.
y_pred = model.predict(x_test)

In [59]:
# Root-mean-squared error on the held-out split.
# The `squared=False` argument was deprecated in scikit-learn 1.4 and removed in 1.6;
# taking the square root of the MSE works on every version.
np.sqrt(mean_squared_error(y_test, y_pred))

0.8126223399957475

In [60]:
# Coefficient of determination (R^2) of the model on the held-out split.
model.score(x_test, y_test)

0.3848595923169975

#### Качество модели получилось довольно низким, о чём говорит и величина ошибки. Скорее всего, это связано с огромным количеством признаков: возможно, стоит применить методы понижения размерности или просто удалить самые «неважные» признаки.