In [25]:
import numpy as np
import pandas as pd

Будем рекомендовать фильмы.

In [2]:
movies = pd.read_csv('movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
#Информация по оценкам: userid-movied-рейтинг-дата

ratings = pd.read_csv('ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


Коллаборативная фильтрация user-based

In [5]:
#разделим выборку на train-test

ratings = ratings.sort_values("timestamp")

train = ratings.iloc[:-20000].copy()
test = ratings.iloc[-20000:].copy()

In [18]:
#ищем коррелирующих между собой пользователей
pivot = train.pivot_table(index = "movieId",
                          columns = "userId",
                          values = "rating")
corrs = pivot.corr()

corrs.head()

userId,1,2,3,4,5,6,7,8,9,10,...,600,601,602,603,604,605,606,607,608,609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,,0.079819,0.207983,0.268749,-0.291636,-0.118773,0.469668,0.918559,-0.037987,...,0.253649,,-1.597727e-16,-0.061503,-0.407556,-0.164871,0.066378,0.174557,0.26807,-0.175412
2,,1.0,,,,,-0.991241,,,0.037796,...,0.188982,,,-1.0,,,0.583333,,-0.125,
3,0.079819,,1.0,,,,,,,,...,0.104257,,,0.4332,,,-0.791334,-0.333333,-0.395092,
4,0.207983,,,1.0,-0.336525,0.148498,0.542861,0.117851,,0.485794,...,0.128722,-0.333333,0.3966413,0.09009,-0.080296,0.400124,0.144603,0.116518,-0.170501,-0.27735
5,0.268749,,,-0.336525,1.0,0.043166,0.158114,0.028347,,-0.777714,...,0.17882,,0.1533034,0.234743,0.067791,-0.364156,0.244321,0.23108,-0.020546,0.384111


In [19]:
# преобразуем по красивее матрицу корреляций
corrs = (
    corrs
    .stack()
    .rename_axis(['userId1', 'userId2'])
    .reset_index()
)

corrs.columns = ['userId1', 'userId2', 'corr']

In [20]:
corrs = corrs[corrs["corr"] >= 0]

In [21]:
corrs

Unnamed: 0,userId1,userId2,corr
0,1,1,1.000000
1,1,3,0.079819
2,1,4,0.207983
3,1,5,0.268749
6,1,8,0.469668
...,...,...,...
192255,609,604,0.641624
192257,609,606,0.533002
192258,609,607,0.190117
192259,609,608,0.488929


In [22]:
### Для каждого юзера из теста
### Найдем всех "соседей"
### Которые смотрели те же фильмы,
### Что и юзер на тесте


import math

preds = []

for user in test['userId'].unique():

    ### Если юзера не было в трейне,
    ### То прогноз в выбранной парадигме дать не сможем

    if user in train['userId'].unique():
        part = test[test['userId']==user]

        ### Выделим соседей данного юзера

        neighbours = corrs[corrs['userId1']==user]
        neighbours_users = neighbours['userId2'].unique()

        ### Если соседей нет, то и предсказывать нечего
        ### Разве что среднее выбранного юзера по фильмам
        ### Но это сильно тупо

        if neighbours_users.shape[0]==0:
            continue

        ### Выделим фильмы, для которых нужно дать прогноз

        movies_ = part['movieId'].unique()

        ### Выделим часть данных с трейна про соседей

        train_part = train[train['userId'].isin(neighbours_users)]

        ### Посчитаем средние оценки соседей

        neighbours_means = train_part.groupby('userId')['rating'].mean()

        ### Присоединим эту информацию и посчитаем
        ### Остальные компоненты формулы для рассчета предсказания
        ### Относительно соседей и фильмов,
        ### Для которых возможно сделать прогноз

        train_part = train_part[train_part['movieId'].isin(movies_)]
        train_part = pd.merge(train_part,
                              neighbours[['userId2', 'corr']],
                              right_on='userId2',
                              left_on='userId',
                              how='left')

        train_part['neighbour_mean'] = train_part['userId2'].map(neighbours_means)
        train_part['diff'] = train_part['rating'] - train_part['neighbour_mean']
        train_part['diff_dot_corr'] = train_part['diff'] * train_part['corr']

        ### Посчитаем среднее по юзеру

        user_mean = train[train['userId']==user]['rating'].mean()

        ### Применим формулу для предсказания

        upper_part = train_part.groupby('movieId')['diff_dot_corr'].sum()
        lower_part = train_part.groupby('movieId')['corr'].sum()

        predictions = upper_part / lower_part + user_mean
        predictions = predictions.reset_index()
        predictions.columns = ['movieId', 'prediction']
        predictions['userId'] = user

        preds.append(predictions)

preds = pd.concat(preds)

preds = pd.merge(
                    preds,
                    test[['userId', 'movieId', 'rating']],
                    on=['userId', 'movieId'],
                    how='left'
)

In [23]:
print(f"""Смогли дать предсказания только для {preds.shape[0]}
          пар айтем-юзер из {test.shape[0]} тестовых""")

preds

Смогли дать предсказания только для 1125 
          пар айтем-юзер из 20000 тестовых


Unnamed: 0,movieId,prediction,userId,rating
0,160,3.254725,495,3.0
1,1059,4.379525,495,5.0
2,1172,4.714483,495,5.0
3,1405,3.663485,495,0.5
4,1438,3.517763,495,3.5
...,...,...,...,...
1120,122886,4.360065,68,4.5
1121,122904,4.098191,68,4.0
1122,134130,3.431193,68,5.0
1123,139385,4.208013,68,3.0


In [26]:
### Посчитаем DSG@2 хотя бы для этих пар!

users_dsgs = []

for user in preds['userId'].unique():
    part = preds[preds['userId']==user]
    part = part.sort_values('prediction', ascending=False)
    part = part.reset_index()
    user_dsg2 = (np.log2(part.index+1) * part.rating)[:2].sum()

    users_dsgs.append(user_dsg2)

print(f"Среднее DSG@2 по пользователям из теста: {np.mean(users_dsgs)}")

Среднее DSG@2 по пользователям из теста: 3.9444444444444446


Контентный подход

In [27]:
df = pd.merge(
    ratings,
    movies,
    on='movieId',
    how='left'
)

df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,429,595,5.0,828124615,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX
1,429,588,5.0,828124615,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
2,429,590,5.0,828124615,Dances with Wolves (1990),Adventure|Drama|Western
3,429,592,5.0,828124615,Batman (1989),Action|Crime|Thriller
4,429,432,3.0,828124615,City Slickers II: The Legend of Curly's Gold (...,Adventure|Comedy|Western


In [28]:
### Выделим год из названия фильмов

import re

def find_num(st):

    nums_list = re.findall(r'\d+', st)

    if len(nums_list) > 0:
        return nums_list[-1]
    else:
        return '0'

def filter_missing_data(num):
    if num > 1900:
        return num
    else:
        return 2000

df['movieYear'] = df['title'].apply(lambda x: filter_missing_data(int(find_num(x))))

df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,movieYear
0,429,595,5.0,828124615,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,1991
1,429,588,5.0,828124615,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,1992
2,429,590,5.0,828124615,Dances with Wolves (1990),Adventure|Drama|Western,1990
3,429,592,5.0,828124615,Batman (1989),Action|Crime|Thriller,1989
4,429,432,3.0,828124615,City Slickers II: The Legend of Curly's Gold (...,Adventure|Comedy|Western,1994


In [29]:
### Сделаем One-Hot-Encoding жанров!

all_genres = ['Adventure', 'Comedy', 'Action', 'Mystery', 'Crime', 'Thriller',
              'Drama', 'Animation', 'Children', 'Horror', 'Documentary',
              'Sci-Fi', 'Fantasy', 'Film-Noir', 'Western', 'Musical', 'Romance',
              '(no genres listed)', 'War']

for genre in all_genres:
    df[genre] = (
        df['genres']
        .str
        .contains(genre)
        .apply(int)
    )

df = df.drop('genres', axis=1)

df.head()

  .contains(genre)


Unnamed: 0,userId,movieId,rating,timestamp,title,movieYear,Adventure,Comedy,Action,Mystery,...,Horror,Documentary,Sci-Fi,Fantasy,Film-Noir,Western,Musical,Romance,(no genres listed),War
0,429,595,5.0,828124615,Beauty and the Beast (1991),1991,0,0,0,0,...,0,0,0,1,0,0,1,1,0,0
1,429,588,5.0,828124615,Aladdin (1992),1992,1,1,0,0,...,0,0,0,0,0,0,1,0,0,0
2,429,590,5.0,828124615,Dances with Wolves (1990),1990,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,429,592,5.0,828124615,Batman (1989),1989,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,429,432,3.0,828124615,City Slickers II: The Legend of Curly's Gold (...,1994,1,1,0,0,...,0,0,0,0,0,1,0,0,0,0


In [30]:
### Разделим на трейн-тест

train_new = df.iloc[:-20000].copy()
test_new = df.iloc[-20000:].copy()

In [31]:
### Пока что имеем в качестве описания контента
### Только какую-то информацию о фильмах

train_new

Unnamed: 0,userId,movieId,rating,timestamp,title,movieYear,Adventure,Comedy,Action,Mystery,...,Horror,Documentary,Sci-Fi,Fantasy,Film-Noir,Western,Musical,Romance,(no genres listed),War
0,429,595,5.0,828124615,Beauty and the Beast (1991),1991,0,0,0,0,...,0,0,0,1,0,0,1,1,0,0
1,429,588,5.0,828124615,Aladdin (1992),1992,1,1,0,0,...,0,0,0,0,0,0,1,0,0,0
2,429,590,5.0,828124615,Dances with Wolves (1990),1990,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,429,592,5.0,828124615,Batman (1989),1989,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,429,432,3.0,828124615,City Slickers II: The Legend of Curly's Gold (...,1994,1,1,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80831,495,6157,2.0,1458636454,Daredevil (2003),2003,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
80832,495,3825,2.0,1458636458,Coyote Ugly (2000),2000,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
80833,495,3157,0.5,1458636467,Stuart Little (1999),1999,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
80834,495,6383,3.0,1458636473,"2 Fast 2 Furious (Fast and the Furious 2, The)...",2003,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
### Ну хоть что-то о юзерах хочется добавить!
### Например, сколько фильмов они посмотрели
### За тренировочный период
### И какие средние оценки получились (с шумом!)

user_count_views = train_new.groupby('userId').size()
user_means = train_new.groupby('userId')['rating'].mean()

train_new['userViews'] = train_new['userId'].map(user_count_views)

noise = np.random.normal(0, 0.1, [train_new.shape[0],])
train_new['userMeans'] = train_new['userId'].map(user_means) + noise

In [33]:
### Посчитаем среднее кол-во просмотров всех юзеров
### И среднюю оценку по средним оценкам всех юзеров

overall_views_mean = int(user_count_views.mean())
overall_meanrating_mean = int(user_means.mean())

test_new['userViews'] = (
    test_new['userId']
    .map(user_count_views)
    .fillna(overall_views_mean)
)


test_new['userMeans'] = (
    test_new['userId']
    .map(user_means)
    .fillna(overall_meanrating_mean)
)

train_new = train_new.drop(['userId', 'movieId',
                            'timestamp', 'title'], axis=1)

test_new = test_new.drop(['userId', 'movieId',
                          'timestamp', 'title'], axis=1)

In [34]:
X_train = train_new.drop('rating', axis=1)
X_test = test_new.drop('rating', axis=1)

y_train = train_new['rating']
y_test = test_new['rating']

In [35]:
from catboost import CatBoostRegressor, Pool

catboost = CatBoostRegressor()


catboost.fit(X_train,
             y_train,
             cat_features=['movieYear'],
             )

Learning rate set to 0.081956
0:	learn: 1.0203313	total: 241ms	remaining: 4m
1:	learn: 1.0089765	total: 311ms	remaining: 2m 35s
2:	learn: 0.9992060	total: 383ms	remaining: 2m 7s
3:	learn: 0.9908238	total: 453ms	remaining: 1m 52s
4:	learn: 0.9835470	total: 526ms	remaining: 1m 44s
5:	learn: 0.9764728	total: 604ms	remaining: 1m 40s
6:	learn: 0.9703220	total: 671ms	remaining: 1m 35s
7:	learn: 0.9651083	total: 742ms	remaining: 1m 32s
8:	learn: 0.9607223	total: 811ms	remaining: 1m 29s
9:	learn: 0.9568556	total: 874ms	remaining: 1m 26s
10:	learn: 0.9535627	total: 938ms	remaining: 1m 24s
11:	learn: 0.9507639	total: 1.01s	remaining: 1m 23s
12:	learn: 0.9485012	total: 1.08s	remaining: 1m 22s
13:	learn: 0.9459928	total: 1.15s	remaining: 1m 20s
14:	learn: 0.9437644	total: 1.22s	remaining: 1m 20s
15:	learn: 0.9416616	total: 1.29s	remaining: 1m 19s
16:	learn: 0.9397877	total: 1.35s	remaining: 1m 18s
17:	learn: 0.9384499	total: 1.42s	remaining: 1m 17s
18:	learn: 0.9370073	total: 1.49s	remaining: 1m 1

<catboost.core.CatBoostRegressor at 0x1c8d5700e50>

In [36]:
test_new = df.iloc[-20000:].copy()

X_test['pred'] = catboost.predict(X_test)
X_test['target'] = y_test
X_test['userId'] = test_new['userId']
X_test['movieId'] = test_new['movieId']

In [37]:
users_dsgs = []

for user in X_test['userId'].unique():
    part = X_test[X_test['userId']==user]
    part = part.sort_values('pred')
    part = part.reset_index()
    user_dsg2 = (np.log2(part.index+1) * part.target)[:2].sum()

    users_dsgs.append(user_dsg2)

print(f"Среднее DSG@2 по пользователям из теста: {np.mean(users_dsgs)}")

Среднее DSG@2 по пользователям из теста: 3.4741379310344827


In [38]:
new_preds = pd.merge(preds,
                     X_test[['userId', 'movieId', 'pred']],
                     on=['userId', 'movieId'],
                     how='left')

users_dsgs = []

for user in new_preds['userId'].unique():
    part = new_preds[new_preds['userId']==user]
    part = part.sort_values('pred')
    part = part.reset_index()
    user_dsg2 = (np.log2(part.index+1) * part.rating)[:2].sum()

    users_dsgs.append(user_dsg2)

print(f"Среднее DSG@2 по пользователям из теста, которые были в трейне: {np.mean(users_dsgs)}")

Среднее DSG@2 по пользователям из теста, которые были в трейне: 3.2777777777777777
