Что делать?

1.Датасет ml-latest

2.Вспомнить подходы, которые мы разбирали

3.Выбрать понравившийся подход к гибридным системам

4.Написать свою

In [644]:
from surprise import SVD, SVDpp, KNNWithMeans, KNNBasic, BaselineOnly
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise.model_selection import KFold

import matplotlib.pyplot as plt

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

import pandas as pd
import numpy as np

In [645]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse

In [646]:
links = pd.read_csv('ml-latest-small/links.csv')
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')
tags = pd.read_csv('ml-latest-small/tags.csv')

In [647]:
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [648]:
movies_with_ratings

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,9.649827e+08
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,8.474350e+08
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1.106636e+09
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1.510578e+09
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1.305696e+09
...,...,...,...,...,...,...
100849,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184.0,4.0,1.537109e+09
100850,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184.0,3.5,1.537110e+09
100851,193585,Flint (2017),Drama,184.0,3.5,1.537110e+09
100852,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184.0,3.5,1.537110e+09


Сделаем предсказание оценок на основе библиотеки surprise с использованием kFold

In [649]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

In [650]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [651]:
kf = KFold(n_splits=5)
algo = BaselineOnly({'method': 'als', 'n_epochs': 15, 'reg_u': 12, 'reg_i': 5})

In [652]:
for trainset, testset in kf.split(data):
     # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

Estimating biases using als...
RMSE: 0.8585
Estimating biases using als...
RMSE: 0.8637
Estimating biases using als...
RMSE: 0.8716
Estimating biases using als...
RMSE: 0.8680
Estimating biases using als...
RMSE: 0.8731


In [653]:
# algo.predict(uid=474.0, iid='Dead Poets Society (1989)').est

In [613]:
# Предскажем оценки на фильмы, которые не смотрел пользователь

In [654]:
current_user_id = 15.0
not_user_movies = movies_with_ratings[movies_with_ratings.userId != current_user_id].title.unique()

scores = []
titles = []

for movie in movies_with_ratings.title.unique():
    if movie in not_user_movies:
#         continue
        
        scores.append(algo.predict(uid=current_user_id, iid=movie).est)
        titles.append(movie)

In [655]:
rec_1 = pd.DataFrame(list(zip(titles, scores)))

In [656]:
rec_1.columns=['title', 'score']

In [657]:
rec_1.sort_values('score',ascending=False).head(10)

Unnamed: 0,title,score
277,"Shawshank Redemption, The (1994)",4.154845
602,Dr. Strangelove or: How I Learned to Stop Worr...,4.070464
905,Lawrence of Arabia (1962),4.054862
686,Rear Window (1954),4.044348
659,"Godfather, The (1972)",4.036394
9595,"Three Billboards Outside Ebbing, Missouri (2017)",4.031597
900,Brazil (1985),4.019
2224,Fight Club (1999),4.015676
46,"Usual Suspects, The (1995)",4.013903
1493,Seven Samurai (Shichinin no samurai) (1954),4.011414


In [658]:
# for t in rec_1[rec_1.score > 3.5].title:
#     print(t)

Обучим модель линейной регрессии на TF-IDF на жанрах

In [659]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [660]:
movie_genres = [change_string(g) for g in movies_with_ratings.genres.values]

In [661]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [662]:
# pd.DataFrame(X_train_tfidf.todense())

In [663]:
newdata = movies_with_ratings.join(pd.DataFrame(X_train_tfidf.todense()))
newdata.dropna(inplace=True)

In [664]:
# newdata

In [665]:
# movies_with_ratings

In [666]:
user_movies_data = newdata[newdata.userId == current_user_id]
another_data = newdata[newdata.userId != current_user_id]

In [667]:
# user_movies_data

In [668]:
# another_data.title.nunique()

In [669]:
y = user_movies_data['rating']
X = user_movies_data.iloc[:, 6:]

In [670]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [671]:
model = LinearRegression(normalize=True).fit(X_train, y_train)

In [672]:
y_pred_test = model.predict(X_test)
y_pred_train = model.predict(X_train)

In [673]:
mse(y_test, y_pred_test, squared=False)

1.030837989180127

Предскажем моделью линейной регрессии оценки фильмам, которые были рекомендованы предыдущей моделью с результатом более 4 (с целью увеличить быстродействие за счет уменьшения объема данных на входе модели)

In [674]:
not_user_movies = another_data.title.unique()

In [675]:
scores_1 = []
titles_1 = []
for tit in rec_1[rec_1.score > 4].title:
#     print(title)
    if tit in not_user_movies:
#         continue
        scores_1.append(float(model.predict(newdata[newdata.title == tit].iloc[0:1,6:])))
        titles_1.append(tit)

In [676]:
# float(model.predict(another_data[another_data.title == 'Iron Soldier (2010)'].iloc[0:1,6:]))

In [677]:
# scores_1

In [678]:
# sorted(scores_1)[-10:]

In [679]:
rec_2 = pd.DataFrame(list(zip(titles_1, scores_1)))

In [680]:
rec_2.columns=['title', 'score']

In [684]:
rec_2.sort_values('score', ascending=False).head(10)

Unnamed: 0,title,score
7,Lawrence of Arabia (1962),4.96021
5,Casablanca (1942),4.458826
2,Dr. Strangelove or: How I Learned to Stop Worr...,4.169801
1,"Shawshank Redemption, The (1994)",3.962177
3,"Godfather, The (1972)",3.962177
8,Goodfellas (1990),3.962177
9,"Godfather: Part II, The (1974)",3.962177
12,"Departed, The (2006)",3.90902
0,"Usual Suspects, The (1995)",3.505328
14,"Three Billboards Outside Ebbing, Missouri (2017)",3.367358


In [682]:
# rec_1.sort_values('score',ascending=False).head(10)

In [690]:
# movies_with_ratings[movies_with_ratings.userId == 15].sort_values('rating', ascending=False).head(20)