In [1]:
import os
import torch
import pickle
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch import nn
from collections import Counter
from torch.nn.utils.rnn import pack_padded_sequence
import numpy as np
import ml_metrics as metrics

In [2]:
!ls

logs
recsys.ipynb
recsys_rnn.ipynb
test.pkl
train.pkl
validate.pkl


## Реализация session-based подхода

По мотивам **Improved Recurrent Neural Networks for Session-based
Recommendations** (https://dl.acm.org/doi/pdf/10.1145/2988450.2988452)

Сначала составим словарь фильмов, на которые обратили внимание хотя бы 100 раз. Это намного уменьшит размер модели и, возможно, позволит избежать переобучения. Кроме того, фильмы эти скорее всего не очень крутые, раз у них так мало зрителей. 

In [3]:
def create_movies_dict(src, min_bound):
    with open(src, 'rb') as file:
        users, movies = pickle.load(file)
    counter = Counter()
    for movies_list in movies:
        for movie_id in movies_list:
            counter[movie_id] += 1 
            
    movies_indexes = {'initial': 0}
    for movie_id in counter:
        if counter[movie_id] >= min_bound:
            movies_indexes[movie_id] = len(movies_indexes)
    return movies_indexes

movies_indexes = create_movies_dict('train.pkl', 100)

In [4]:
def load_train_dataset(src, movies_indexes, limit=100):
    with open(src, 'rb') as file:
        users, user_movies = pickle.load(file)
    n_users = len(user_movies)
    np_movies = np.zeros((n_users, limit), dtype=np.int)
    np_next = np.zeros((n_users, limit), dtype=np.int)
    np_lens = np.zeros(n_users, dtype=np.int)
    np_users = np.zeros(n_users, dtype=np.int)
    
    
    pointer = 0
    for user_id, movies in zip(users, user_movies):
        result = np.array([0] + [movies_indexes[movie_id] for movie_id in movies if movie_id in movies_indexes])
        if len(result) <= 1:
            continue
        cur_len = min(len(result) - 1, limit)
        np_movies[pointer][0:cur_len] = result[-cur_len - 1: -1]
        np_next[pointer][0:cur_len] = result[-cur_len:]
        np_lens[pointer] = cur_len
        np_users[pointer] = user_id
        pointer += 1
    return TensorDataset(
        torch.LongTensor(np_users[:pointer]), 
        torch.LongTensor(np_movies[:pointer]), 
        torch.LongTensor(np_next[:pointer]),
        torch.LongTensor(np_lens[:pointer]))

In [14]:
class EvaluationSettings:
    def __init__(self, dataset, marked, targets):
        self.loader = DataLoader(dataset, batch_size=200, shuffle=False)
        self.marked = marked
        self.targets = targets
        
    def evaluate(self, model, N):
        expectations, recommendations  = self.get_recommendations(model, N)
        return metrics.mapk(expectations, recommendations, k=N)
    
    def get_recommendations(self, model, N):
        recommendations = []
        expectations = []
        for sample in self.loader:
            batch_users, batch_movies, batch_lens = sample
            predictions = model.predict_next(batch_movies.to('cuda'), batch_lens.to('cuda'))
            predictions = predictions.cpu().detach().numpy()
            batch_users = batch_users.cpu().detach().numpy()
            for user_id, prediction in zip(batch_users, predictions):
                if user_id not in self.marked:
                    continue
                # Удаляем фильмы, которые были отмечены пользователем в обучающей выборке
                prediction[self.marked[user_id]] = -1000000000   
                recommendations.append(np.argsort(prediction)[-N:])
                expectations.append(self.targets[user_id])
        return expectations, recommendations 



def create_evaluation_settings(train_src, test_src, movies_indexes, limit=100):
    with open(train_src, 'rb') as file:
        train_users, train_movies = pickle.load(file)
    with open(test_src, 'rb') as file:
        test_users, test_movies = pickle.load(file)
        
    n_users = len(test_users)
    np_movies = np.zeros((n_users, limit), dtype=np.int)
    np_lens = np.zeros(n_users, dtype=np.int)
    np_users = np.zeros(n_users, dtype=np.int)
    marked = {}
    targets = {}
    
    train_pointer = 0
    pointer = 0
    for user_id, movies in zip(test_users, test_movies):
        while train_users[train_pointer] != user_id:
            train_pointer += 1
            
        result = np.array([0] + [movies_indexes[movie_id] for movie_id in train_movies[train_pointer] 
                                                                    if movie_id in movies_indexes])
        cur_len = min(len(result), limit)
        np_movies[pointer][0:cur_len] = result[-cur_len:]
        np_lens[pointer] = cur_len
        np_users[pointer] = user_id
        marked[user_id] = result
        targets[user_id] = set([movies_indexes.get(movie_id, -1 - movie_id) for movie_id in movies])
        pointer += 1
    dataset = TensorDataset(
        torch.LongTensor(np_users), 
        torch.LongTensor(np_movies),
        torch.LongTensor(np_lens))
    return EvaluationSettings(dataset, marked, targets)

In [6]:
train_dataset = load_train_dataset('train.pkl', movies_indexes, limit=150) # ~90% users has less than 150 labels in train set
train_loader = DataLoader(train_dataset, batch_size=200, shuffle=True)

In [7]:
validate_evaluation = create_evaluation_settings('train.pkl', 'validate.pkl', movies_indexes, limit=150)

In [8]:
class SessionModel(nn.Module):
    def __init__(self, num_movies, movie_embedding_dim, hidden_dim):
        super(SessionModel, self).__init__()
        self.movie_embeddings = nn.Embedding(num_movies, movie_embedding_dim)
        # Если верить статье, то GRU справляется с задачей рекоммендации лучше, чем LSTM
        # Также утверждается, что дополнительные слои GRU практически не влияют на качество работы
        self.rnn = nn.GRU(movie_embedding_dim, hidden_dim)
        self.next_film_predictor = nn.Linear(hidden_dim, num_movies)

    def forward(self, movies, lens):
        vectors = self.movie_embeddings(movies)
        packed = pack_padded_sequence(vectors, lens, batch_first=True, enforce_sorted=False)
        packed_outputs, _ = self.rnn(packed)
        return torch.nn.utils.rnn.PackedSequence(
            self.next_film_predictor(packed_outputs.data), 
            packed_outputs.batch_sizes, 
            packed_outputs.sorted_indices, 
            packed_outputs.unsorted_indices)
    
    def predict_next(self, movies, lens):
        vectors = self.movie_embeddings(movies)
        packed = pack_padded_sequence(vectors, lens, batch_first=True, enforce_sorted=False)
        _, outputs = self.rnn(packed)
        return self.next_film_predictor(outputs).view(len(movies), -1)

In [9]:
model = SessionModel(len(movies_indexes), 128, 128)
model.to('cuda')
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), weight_decay=0.0001)

In [10]:
for k_epoch in range(1000):
    print("Start epoch", k_epoch)
    model.train()
    total_loss = 0
    for sample in train_loader:
        optimizer.zero_grad()
        _, batch_movies, batch_targets, batch_lens = sample
        packed_pred = model(batch_movies.to('cuda'), batch_lens.to('cuda'))
        packed_targets = pack_padded_sequence(batch_targets.to('cuda'), batch_lens.to('cuda'), 
                                              batch_first=True, enforce_sorted=False)
        
        assert torch.all(packed_pred.batch_sizes == packed_targets.batch_sizes).item()
        assert torch.all(packed_pred.unsorted_indices == packed_targets.unsorted_indices).item()
        assert torch.all(packed_pred.sorted_indices == packed_targets.sorted_indices).item()
            
        loss_value = loss(packed_pred.data, packed_targets.data) 
        loss_value.backward()
        optimizer.step()
        
        total_loss += loss_value.item()
        
    model.eval()
    with torch.no_grad():
        torch.save(model.state_dict(), os.path.join("logs", "epoch_" + str(k_epoch) + ".tmp"))
        print("Average loss:", total_loss / len(train_dataset))
        print("MAP@20:", validate_evaluation.evaluate(model, 20))
        print()

Start epoch 0
Average loss: 0.03550281690410015
MAP@20: 0.03909415762871383

Start epoch 1
Average loss: 0.03238271530431121
MAP@20: 0.04560776497896835

Start epoch 2
Average loss: 0.03144529788450399
MAP@20: 0.04958662431351026

Start epoch 3
Average loss: 0.030913430963537124
MAP@20: 0.052219571959024325

Start epoch 4
Average loss: 0.030627378106659976
MAP@20: 0.05351813211290098

Start epoch 5
Average loss: 0.030461004843230145
MAP@20: 0.054065751241001936

Start epoch 6
Average loss: 0.030356289452404588
MAP@20: 0.055035850639178485

Start epoch 7
Average loss: 0.030280317441114892
MAP@20: 0.055206847746913194

Start epoch 8
Average loss: 0.03022721478518663
MAP@20: 0.05530359687697137

Start epoch 9
Average loss: 0.030179978035994526
MAP@20: 0.05524165071744877

Start epoch 10
Average loss: 0.030147957523121112
MAP@20: 0.055521993304684776

Start epoch 11
Average loss: 0.030113605730087978
MAP@20: 0.05597659326784098

Start epoch 12
Average loss: 0.03008808921268539
MAP@20: 0.05

KeyboardInterrupt: 

## Оценим качество на отложенной выборке

In [22]:
import pandas as pd

In [38]:
model.load_state_dict(torch.load(os.path.join("logs", "epoch_24.tmp")))

<All keys matched successfully>

In [39]:
test_evaluation = create_evaluation_settings('train.pkl', 'test.pkl', movies_indexes, limit=150)

In [40]:
expectations, recommendations = test_evaluation.get_recommendations(model, 20)

In [41]:
def precision_at_k(expectations, recommendations, k):
    total = 0
    for recommendation, expectation in zip(recommendations, expectations):
        total += len([x for x in recommendation[:k].tolist() if x in expectation]) / k
    return total / len(recommendations)

In [42]:
pd.DataFrame({'metric': ['MAP@20', 'P@1', 'P@5', 'P@10', 'P@20'],
             'gru': [
                 metrics.mapk(expectations, recommendations, 20),
                 precision_at_k(expectations, recommendations, 1),
                 precision_at_k(expectations, recommendations, 5),
                 precision_at_k(expectations, recommendations, 10),
                 precision_at_k(expectations, recommendations, 20),]},
             columns=['metric', 'gru'])

Unnamed: 0,metric,gru
0,MAP@20,0.02989
1,P@1,0.037141
2,P@5,0.038634
3,P@10,0.040309
4,P@20,0.044984


Интересно, что так сильно отличается MAP на валидационной выборке и тестовой. Можно было бы списать на переобучение на гиперпараметрах, но валидационная выборка использовалась только для ранней остановки обучения. Видимо так сильно влияет локальность: при разбиении сессий валидационная выборка шла сразу после обучающей. 