In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,make_scorer


from tqdm.notebook import tqdm
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from scipy.sparse import csr_matrix

from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

from sklearn.model_selection import GridSearchCV




import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
movies = pd.read_csv('../Data/movies.csv')
tags = pd.read_csv('../Data/tags.csv')
ratings = pd.read_csv('../Data/ratings.csv')
# делаем выборку для обучения и теста
all_data = ratings[['userId','movieId','rating']]


In [3]:
print(movies.head(5))
print(tags.head(5))
print(ratings.head(5))

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   userId  movieId              tag   timestamp
0       2    60756            funny  1445714994
1       2    60756  Highly quotable  1445714996
2       2    60756     will ferrell  1445714992
3       2    89774     Boxing story  1445715207
4       2    89774              MMA  1445715200
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2  

In [4]:
movies['year'] = movies.title.str.extract("\((\d{4})\)", expand=True)
movies.year = pd.to_datetime(movies.year, format='%Y')
movies.year = movies.year.dt.year # As there are some NaN years, resulting type will be float (decimals)
movies.year.fillna(0, inplace=True)

In [5]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))
movie_genres = [change_string(g) for g in movies.genres.values]

In [6]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# Упорядоченный список названий жанров
list_col = sorted(count_vect.vocabulary_, key=count_vect.vocabulary_.get)


In [7]:
# средний рейтинг для фильмов 
movies_s1 = pd.concat([movies[['movieId','year']], 
                     pd.DataFrame(X_train_tfidf.toarray(),columns=list_col)], 
                     axis=1).set_index('movieId')

movies_s1 = movies_s1.join(ratings[['movieId', 'rating']].groupby('movieId').mean())
movies_s1.rename(columns={"rating": "mean_rating_movie"},inplace=True)

# средний рейтинг для жанров
for col in list_col:
    movies_s1['mean_rate_'+col]=0
    rt = movies_s1[movies_s1[col] != 0]['mean_rating_movie'].mean()
    movies_s1.loc[movies_s1[movies_s1[col] != 0][['mean_rate_'+col]].index,'mean_rate_'+col]=rt

users_s1 = ratings[['userId', 'rating']].groupby('userId').mean()
users_s1.rename(columns={"rating": "mean_rating_user"},inplace=True)    


In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(all_data[['userId','movieId']], 
                                                    all_data[['rating']], test_size=0.2)
print('all_data={}; X_train={}, X_test={}, Y_train={}, Y_test={}'.format(all_data.shape, X_train.shape, X_test.shape, Y_train.shape, Y_test.shape))


all_data=(100836, 3); X_train=(80668, 2), X_test=(20168, 2), Y_train=(80668, 1), Y_test=(20168, 1)


In [9]:
#print(X_train) 
print(Y_train)

       rating
93441     2.0
18607     4.0
67444     3.0
21988     4.0
63056     1.0
...       ...
67505     5.0
59113     5.0
84122     1.0
78088     4.0
43085     2.5

[80668 rows x 1 columns]


In [10]:
XX_train = X_train.join(users_s1,on='userId').join(movies_s1,on='movieId')
XX_train.drop(['userId', 'movieId'], axis=1, inplace=True)
print(XX_train.shape)

XX_test = X_test.join(users_s1,on='userId').join(movies_s1,on='movieId')
XX_test.drop(['userId', 'movieId'], axis=1, inplace=True)
print(XX_test.shape)


(80668, 43)
(20168, 43)


## 1) KNeighborsRegressor

In [11]:
knn_param = {
            'n_neighbors': [1,2,3,5,7,10,15],
            'p':[1,2]
}
scorer1 = make_scorer(mean_squared_error)

In [12]:
knn = KNeighborsRegressor(n_jobs=3)
grid_knn = GridSearchCV(knn, knn_param, scoring=scorer1, cv=10)
grid_knn.fit(XX_train, Y_train)

GridSearchCV(cv=10, estimator=KNeighborsRegressor(n_jobs=3),
             param_grid={'n_neighbors': [1, 2, 3, 5, 7, 10, 15], 'p': [1, 2]},
             scoring=make_scorer(mean_squared_error))

In [13]:
print(grid_knn.best_params_ )
print(grid_knn.best_score_ )
print(grid_knn.best_estimator_ )

{'n_neighbors': 1, 'p': 1}
1.3833212237727235
KNeighborsRegressor(n_jobs=3, n_neighbors=1, p=1)


In [14]:
gs1 = KNeighborsRegressor(n_jobs=3, n_neighbors=1, p=1).fit( XX_train, Y_train )
Y_pred1 = gs1.predict( XX_test )
score_1 = mean_squared_error(Y_test,Y_pred1)
print(score_1)

1.394957358191194


### 2) SVR

In [37]:
gs2 = SVR(C=0.1, epsilon=0.2).fit(XX_train.values , Y_train.values.ravel())
Y_pred2 = gs2.predict( XX_test.values )
score_2 = mean_squared_error(Y_test,Y_pred2)
print(score_2)

1.1332310949938513


## 3) DecisionTreeRegressor

In [38]:
dtr_param = {
            'max_depth': [1,2,5,10],
            'min_samples_leaf':[1,2,3]
}
scorer2 = make_scorer(mean_squared_error)

In [39]:
e_dtr = DecisionTreeRegressor()
grid_dtr = GridSearchCV(e_dtr, dtr_param, scoring=scorer2, cv=10)
grid_dtr.fit(XX_train.values , Y_train.values.ravel())

GridSearchCV(cv=10, estimator=DecisionTreeRegressor(),
             param_grid={'max_depth': [1, 2, 5, 10],
                         'min_samples_leaf': [1, 2, 3]},
             scoring=make_scorer(mean_squared_error))

In [40]:
print(grid_dtr.best_params_ )
print(grid_dtr.best_score_ )
print(grid_dtr.best_estimator_ )

{'max_depth': 1, 'min_samples_leaf': 1}
0.8872226901732516
DecisionTreeRegressor(max_depth=1)


In [43]:
gs3 = DecisionTreeRegressor(max_depth=1).fit( XX_train, Y_train )
Y_pred3 = gs3.predict( XX_test )
score_3 = mean_squared_error(Y_test,Y_pred3)
print(score_3)

0.8930967331153137
