In [None]:
# 1. Использовать dataset MovieLens
# 2. Построить рекомендации (регрессия, предсказываем оценку) на фичах:
#    -  TF-IDF на тегах и жанрах
#    -  Средние оценки (+ median, variance, etc.) пользователя и фильма
# 3. Оценить RMSE на тестовой выборке

In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

In [2]:
links = pd.read_csv('ml-latest-small/links.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')
movies = pd.read_csv('ml-latest-small/movies.csv')
tags = pd.read_csv('ml-latest-small/tags.csv')

In [3]:
m_w_t = movies.join(ratings.set_index('movieId'), on='movieId')
m_w_t.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982700.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847435000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1106636000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1510578000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1305696000.0


In [4]:
m_w_t_T = m_w_t.groupby(['movieId','title','genres'])[['rating']].mean().reset_index().join(tags.set_index('movieId'), on='movieId')
m_w_t_T_r = m_w_t_T[['movieId', 'title', 'genres', 'rating','tag']]

In [5]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [6]:
m_w_t_T_r.genres = m_w_t_T_r.genres.apply(change_string)
m_w_t_T_r.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


(11853, 5)

In [7]:
m_w_t_T_r_1 = m_w_t_T_r[~m_w_t_T_r.rating.isnull()]

In [14]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(m_w_t_T_r_1.genres)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [20]:
X = X_train_tfidf
y = m_w_t_T_r_1.rating

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [22]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

In [17]:
regr = linear_model.LinearRegression()

In [23]:
regr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [24]:
y_pred = regr.predict(X_test)
y_pred

array([3.18242012, 3.52816799, 3.79772404, ..., 3.63576244, 3.26233885,
       3.79772404])

In [25]:
from sklearn.metrics import mean_squared_error

mean_squared_error(y_test, y_pred)

0.6281641206847447

In [26]:
m_w_t_T_r_1.columns

Index(['movieId', 'title', 'genres', 'rating', 'tag'], dtype='object')

In [28]:
m_w_t_T_r_1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11832 entries, 0 to 9741
Data columns (total 5 columns):
movieId    11832 non-null int64
title      11832 non-null object
genres     11832 non-null object
rating     11832 non-null float64
tag        3662 non-null object
dtypes: float64(1), int64(1), object(3)
memory usage: 554.6+ KB


In [42]:
based_on_tags = m_w_t_T_r_1[~m_w_t_T_r_1.tag.isnull()]

In [43]:
based_on_tags.shape

(3662, 5)

In [44]:
count_vect_1 = CountVectorizer()
X_train_counts = count_vect_1.fit_transform(based_on_tags.tag)

tfidf_transformer_1 = TfidfTransformer()
X_train_tfidf_1 = tfidf_transformer_1.fit_transform(X_train_counts)

In [45]:
count_vect_2 = CountVectorizer()
X_train_counts = count_vect_2.fit_transform(based_on_tags.genres)

tfidf_transformer_2 = TfidfTransformer()
X_train_tfidf_2 = tfidf_transformer_2.fit_transform(X_train_counts)

In [49]:
from scipy.sparse import hstack
XXX = hstack((X_train_tfidf_1, X_train_tfidf_2))

In [50]:
yyy = based_on_tags.rating

In [51]:
regr_1 = linear_model.LinearRegression()

In [52]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    XXX, yyy, test_size=0.2, random_state=42)

In [55]:
regr_1 = linear_model.LinearRegression()
regr_1.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [57]:
y_pred_1 = regr_1.predict(X_test)

In [58]:
mean_squared_error(y_test, y_pred_1)

0.220788958662116