In [1]:
from pathlib import Path
import json
import csv
from collections import namedtuple, Counter, defaultdict
from itertools import chain

import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, IterableDataset, DataLoader
from sklearn.model_selection import train_test_split

## Загрузка данных

In [2]:
data_root = Path('ThirtyMusic/')
entities_dir = data_root / 'entities'
relations_dir = data_root / 'relations'

In [3]:
tracks_path = entities_dir / 'tracks.idomaar'

with tracks_path.open() as tracks_file:
    tracks_reader = csv.reader(tracks_file, delimiter='\t')
    track_authors = {
        int(track_id): json.loads(linked_entities)['artists'][0]['id']
        for _, track_id, _, _, linked_entities in tracks_reader
    }

В статье, описывающей датасет указано, что прослушивания, продолжительность которых не достигает половины трека или 4 минут по правилам last.fm не должны попадать в историю воспроизведения. Будем рассматривать короткие воспроизведения как ошибки скробблинга или аномальные файлы. Прослушивания короче 30 секунд будем отбрасывать. При других методах сбора данных короткие воспроизведения можно было бы рассматривать как отрицательный сигнал, считая, что они происходят, когда пользователь проматывает трек.

In [4]:
ArtistsSession = namedtuple('ArtistsSesson', 'id user timestamp artists')

MIN_PLAYTIME = 30
sessions_path = relations_dir / 'sessions.idomaar'


raw_artist_sessions = []
artist_sessions = []
with sessions_path.open() as sessions_file:
    for session_line in sessions_file:
        _, session_id, timestamp, _, linked_entities = session_line.split()
        linked_entities_dict = json.loads(linked_entities)
        
        listened_track_ids = []
        all_listened_artist_ids = []
        listened_artist_ids = []
        prev_track_author = None
        user_id = linked_entities_dict['subjects'][0]['id']
        for track in linked_entities_dict['objects']:
            if track['playtime'] < MIN_PLAYTIME:
                continue
            track_id = track['id']
            
            track_author_id = track_authors[track_id]
            all_listened_artist_ids.append(track_author_id)
            
            if prev_track_author != track_author_id:
                listened_artist_ids.append(track_author_id)
            prev_track_author = track_author_id
        
        raw_artist_sessions.append(ArtistsSession(int(session_id), user_id, int(timestamp), all_listened_artist_ids))
        artist_sessions.append(ArtistsSession(int(session_id), user_id, int(timestamp), listened_artist_ids))

In [5]:
artists_path = entities_dir / 'persons.idomaar'

with artists_path.open() as artists_file:
    artists_reader = csv.reader(artists_file, delimiter='\t')
    artist_names = {
        int(artist_id): json.loads(artist_properties)['name']
        for _, artist_id, _, artist_properties, _ in artists_reader
    }
artist_ids = {artist_name: artist_id for artist_id, artist_name in artist_names.items()}

Среди исполнителей много дупликатов из-за непрвильно прописанных тегов. Не будем рассматривать исполнителей с менее чем 100 прослушиваниями.

In [30]:
MIN_ARTIST_PLAYS = 100

artist_play_counter = Counter(artist for session in artist_sessions for artist in session.artists)
popular_artists = {
    artist_id 
    for artist_id, n_listens in artist_play_counter.most_common() 
    if n_listens >= MIN_ARTIST_PLAYS
}
artists_number = len(popular_artists)
artists_list = list(popular_artists)
artists_indexes = {artist_id: i for i, artist_id in enumerate(artists_list)}

In [7]:
nontrivial_sessions = []
for session in artist_sessions:
    artists = [artist for artist in session.artists if artist in popular_artists]
    if len(artists) > 1:
        nontrivial_sessions.append(ArtistsSession(session.id, session.user, session.timestamp, artists))

## Протокол оценки

Разобъём на обучающую, валидационную и тестовую выборки по времени, чтобы исключить смешение тренировочных данных с оценочными.

In [8]:
session_timestamps = np.array([session.timestamp for session in artist_sessions])
train_val_threshold = np.percentile(session_timestamps, 60)
val_test_threshold = np.percentile(session_timestamps, 80)

def split_tvt(sessions):
    train_sessions = []
    val_sessions = []
    test_sessions = []
    for session in nontrivial_sessions:
        if session.timestamp < train_val_threshold:
            train_sessions.append(session)
        elif session.timestamp < val_test_threshold:
            val_sessions.append(session)
        else:
            test_sessions.append(session)
    return train_sessions, val_sessions, test_sessions

nt_train, nt_val, nt_test = split_tvt(nontrivial_sessions)
ra_train, ra_val, ra_test = split_tvt(raw_artist_sessions)

Рассмотрим только пользователей, попавших во все три части, чтобы избежать проблемы холодного старта.

In [9]:
train_users = {session.user for session in nt_train}
val_users = {session.user for session in nt_val}
test_users = {session.user for session in nt_test}
common_users = train_users & val_users & test_users
common_users_list = list(common_users)
users_indexes = {user_id: i for i, user_id in enumerate(common_users_list)}
users_number = len(common_users)

Релевантных исполнителей будем искать по скалярному произведению векторов. 
Это не расстояние, но оно лучше соответствуем использованным моделям.

In [10]:
class SimilarArtistsFinder:
    def __init__(self, embeddings, artists_list, artists_indexes):
        self.vectors = embeddings
        self.artists_list = artists_list
        self.artists_indexes = artists_indexes
        
    def get_closest_by_id(self, artist_id, k):
        artist_vector = self.vectors[self.artists_indexes[artist_id]]
        closest_ids = self.get_closest_by_vector(artist_vector, k + 1)
        closest_ids = [i for i in closest_ids if i != artist_id][:k]
        return closest_ids
    
    def get_closest_by_vector(self, artist_vector, k):
        with torch.no_grad():
            artist_vector = artist_vector.to(device)
            dists = self.vectors @ artist_vector
            _, closest_artist_indexes = torch.topk(dists, k)
        closest_artist_ids = [self.artists_list[int(index)] for index in closest_artist_indexes]
        return closest_artist_ids

Для рекомендаций будем использовать эмбеддинги пользователей и выбирать похожих на них исполнителей. 
Другой возможный вариант - пройти по прослушанным исполнителям и взять их соседей, но такой подход вычислительно сложнее.

In [12]:
def recommend(users_embeddings, neighbours_finder, artists_list, users_list):
    return {
        users_list[user]: neighbours_finder.get_closest_by_vector(user_embedding, 20)
        for user, user_embedding in enumerate(users_embeddings)
    }

In [14]:
def get_users_play_counters(sessions):
    played_by_users = defaultdict(Counter)
    for session in sessions:
        played_by_users[session.user].update(session.artists)
    return played_by_users

users_play_counters_val = get_users_play_counters(nt_val)
users_play_counters_test = get_users_play_counters(nt_test)

In [15]:
def precision_k(recommendations, played, k):
    users_prs = []
    for user, user_recs in recommendations.items():
        user_played = played[user]
        correct = 0
        for rec in user_recs[:k]:
            if rec in user_played:
                correct += 1
        users_prs.append(correct / k)
    return np.mean(users_prs)


def mean_average_precision(recommendations, played, k):
    users_aps = []
    for user, user_recs in recommendations.items():
        user_played = played[user]
        correct = 0
        precs = []
        for i, rec in enumerate(user_recs[:k], 1):
            if rec in user_played:
                correct += 1
            precs.append(correct / i)
        users_aps.append(precs)
    return np.mean(users_aps)

В качестве показателя релевантности для ndcg возьмём долю прослушиваний данного исполнителя. 
Количество воспроизведений используется вместо доли для упрощения вычислений, поскольку при нормализации 
сумма по исполнителям сокращается.

In [166]:
DISCOUNTS = 1 / np.log(np.arange(2, 102))


def ndcg(recommendations, play_counters, k):
    users_ndcgs = []
    for user, user_recs in recommendations.items():
        user_counter = play_counters[user]
        if user_counter:
            gains = np.array([user_counter[rec_art] for rec_art in user_recs[:k]])
            tgt_gains = np.array([count for _, count in user_counter.most_common()[:k]])
            c_k = min(k, len(user_counter))
            users_ndcgs.append(np.sum(gains[:c_k] * DISCOUNTS[:c_k]) / np.sum(tgt_gains[:c_k] * DISCOUNTS[:c_k]))
    return np.mean(users_ndcgs)

In [17]:
SAMPLE_ARTISTS = 228054, 287560, 4807, 315200, 310487

def show_neighbours(embeddings):
    neighbours_finder = SimilarArtistsFinder(next(embeddings.parameters()), artists_list, artists_indexes)
    for artist_id in SAMPLE_ARTISTS:
        print(artist_names[artist_id])
        for i in neighbours_finder.get_closest_by_id(artist_id, 20):
            print(artist_names[i])
        print()

## Модели

In [54]:
def train(model, train_data, val_data, n_epochs, lr, device='cpu', verbose=True):
    model.to(device)
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    
    best_val_loss = float('inf')
    best_state = None
    for epoch in range(n_epochs):
        train_losses = []
        for batch in train_data:
            opt.zero_grad()
            loss = model.calc_loss(batch, device)
            loss.backward()
            opt.step()
            train_losses.append(loss.detach().cpu().numpy())

        with torch.no_grad():
            val_losses = [model.calc_loss(batch, device).cpu().numpy() for batch in val_data]

        train_loss, val_loss = np.mean(train_losses), np.mean(val_losses)
        if verbose:
            print(f'Epoch {epoch + 1} train_loss: {train_loss:.3f} val_loss: {val_loss:.3f}')
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_state = model.state_dict()
    model.load_state_dict(best_state)

In [19]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

### Модель на основе SkipGram

In [20]:
class ContextDataset(IterableDataset):
    def __init__(self, sessions, max_context_dist):
        self.sessions_artists = [
            [artists_indexes[artist_id] for artist_id in session.artists] 
            for session in sessions
        ]
        self.max_dist = max_context_dist
        
    def __iter__(self):
        for session_artists in self.sessions_artists:
            for target_index, target in enumerate(session_artists):
                for neighbour in session_artists[max(0, target_index - self.max_dist):target_index + self.max_dist + 1]:
                    if neighbour != target:
                        yield neighbour, target

In [89]:
BATCH_SIZE_SGL = 2 ** 19
CONTEXT_DIST = 4

sgl_train_dl = DataLoader(ContextDataset(nt_train, CONTEXT_DIST), batch_size=BATCH_SIZE_SGL)
sgl_val_dl = DataLoader(ContextDataset(nt_val, CONTEXT_DIST), batch_size=BATCH_SIZE_SGL)
sgl_test_dl = DataLoader(ContextDataset(nt_test, CONTEXT_DIST), batch_size=BATCH_SIZE_SGL)

In [101]:
TARGET_SAMPLE_OPTIONS_NUMBER = 100000000
artist_sample_probs = np.array([artist_play_counter[i] for i in artists_list]) ** .75
artist_sample_probs /= artist_sample_probs.sum()
artist_samples = torch.tensor(list(chain.from_iterable(
    [i] * int(TARGET_SAMPLE_OPTIONS_NUMBER * p) for i, p in enumerate(artist_sample_probs)
)))
SAMPLE_OPTIONS_NUMBER = len(artist_samples)

Модель, использующая архитектуру skip-gram для обучения представлений слов.
Сессии прослушивания рассматриваются как тексты, а исполнители - как слова.

In [102]:
class SkipGramLikeModel(nn.Module):
    def __init__(self, vocab_size, dimensionality):
        super().__init__()
        self.vocab_size = vocab_size
        self.embeddings = nn.Embedding(vocab_size, dimensionality)
        self.context_embeddings = nn.Embedding(vocab_size, dimensionality)
        
    def forward(self, x):
        return self.embeddings(x)
    
    def sample_negatives(self, sample_size):
        return artist_samples[torch.randint(0, SAMPLE_OPTIONS_NUMBER, (sample_size,), dtype=torch.long)]
    
    def calc_loss(self, batch, device):
        batch_samples, batch_target = batch
        batch_samples, batch_target = batch_samples.to(device), batch_target.to(device)
        
        out = self.context_embeddings(batch_samples)
        target = self.embeddings(batch_target)
        neg_samples = self.context_embeddings(self.sample_negatives(len(batch_target)).to(device))
                
        out_log_sigm = F.logsigmoid(-torch.sum(target * out, dim=1))
        neg_log_sigm = F.logsigmoid(torch.sum(target * neg_samples, dim=1))
        return neg_log_sigm.mean() + out_log_sigm.mean()

In [103]:
sgl_model = SkipGramLikeModel(artists_number, 50)

In [104]:
train(sgl_model, sgl_train_dl, sgl_val_dl, 5, .5, device)

Epoch 1 train_loss: -8224.831 val_loss: -36698.660
Epoch 2 train_loss: -152277.766 val_loss: -323900.188
Epoch 3 train_loss: -627397.375 val_loss: -986848.438
Epoch 4 train_loss: -1491180.500 val_loss: -2034366.250
Epoch 5 train_loss: -2735778.750 val_loss: -3449782.250


Для получения вектора пользователя векторы прослушанных исполнителей усредняются с весами, пропорциональными колиеству прослушиваний. 
Рассматривалось также скользящее среднее, но такое усреднение оказалось слишком сложным вычислительно.

In [108]:
users_art_counters_train = defaultdict(Counter)
for session in ra_train:
    if session.user in common_users:
        users_art_counters_train[session.user].update(session.artists)

user_art_shares_train = {}
for user, user_counter in users_art_counters_train.items():
    c_sum = sum(user_counter.values())
    listened_artists, artist_shares = [], []
    for art, c in user_counter.most_common():
        listened_artists.append(artists_indexes[art])
        artist_shares.append(c / c_sum)
    user_art_shares_train[user] = torch.LongTensor(listened_artists), torch.tensor(artist_shares)

def make_users_embedding(artists_embedding, users_indexes, artists_indexes):
    with torch.no_grad():
        users_embedding = torch.zeros((users_number, artists_embedding.shape[1]), dtype=torch.float32)
        for user_index, user_id in enumerate(common_users_list):
            arts, shares = user_art_shares_train[user_id]
            users_embedding[user_index] = torch.sum(artists_embedding[arts] * shares.reshape(-1, 1), dim=0)
    return users_embedding

### Модель на основе факторизации матриц

In [26]:
class MatrixFactorDataset(Dataset):
    def __init__(self, artists_sessions, user_indexes, artist_indexes):
        user_artist_counters = Counter(
            (session.user, artist_id)
            for session in artists_sessions 
            for artist_id in session.artists
        )
        self.data = [
            (user_indexes[user_id], artist_indexes[artist_id], c) 
            for (user_id, artist_id), c in user_artist_counters.items()
            if user_id in user_indexes and artist_id in artist_indexes
        ]
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index]

In [27]:
BATCH_SIZE_MF = 2 ** 18

mf_train_dl = DataLoader(
    MatrixFactorDataset(ra_train, users_indexes, artists_indexes), batch_size=BATCH_SIZE_MF, shuffle=True)
mf_val_dl = DataLoader(
    MatrixFactorDataset(ra_val, users_indexes, artists_indexes), batch_size=BATCH_SIZE_MF)
mf_test_dl = DataLoader(
    MatrixFactorDataset(ra_test, users_indexes, artists_indexes), batch_size=BATCH_SIZE_MF)

Эта модель раскладывает матрицу прослушиваний, в ячейке которой стоит едниница, если соответствующий пользователь слушал треки соответствующего исполнителя и ноль в противном случае, на матрицы из векторных представлений пользователей и исполнителей. Вес ячейки в функции потерь увеличивается с ростом числа прослушиваний. Оценки также могли бы влиять на функцию ошибки, но в имеющихся данных они не имеют временных меток, что мешает правильно разделить их на тренировочные и валидационные.

In [34]:
class MatrixFactorizationModel(nn.Module):
    def __init__(self, num_artists, num_users, dimensionality, reg_param, alpha, beta):
        super().__init__()
        self.user_embeddings = nn.Embedding(num_users, dimensionality)
        self.artist_embeddings = nn.Embedding(num_artists, dimensionality)
        self.num_artists = num_artists
        self.num_users = num_users
        self.reg_param = reg_param
        self.alpha = alpha
        self.beta = beta
        
    def forward(self, users, artists):
        return torch.sum(self.user_embeddings(users) * self.artist_embeddings(artists), dim=1)

    def calc_loss(self, batch, device):
        users, artists, counts = batch
        users, artists, counts = users.to(device), artists.to(device), counts.to(device)
        weights = 1 + self.alpha * torch.log((1 + counts * self.beta).float())
        neg_users = torch.randint_like(users, self.num_users)
        neg_artists = torch.randint_like(artists, self.num_artists)
        return torch.mean(self.forward(neg_users, neg_artists) ** 2) + \
                    torch.mean((1 - self.forward(users, artists)) ** 2 * weights) + \
                    self.reg_param * (torch.norm(self.user_embeddings(users), dim=1).mean() + 
                                      torch.norm(self.artist_embeddings(artists), dim=1).mean() +
                                      torch.norm(self.user_embeddings(neg_users), dim=1).mean() + 
                                      torch.norm(self.artist_embeddings(neg_artists), dim=1).mean())

In [43]:
mf_model = MatrixFactorizationModel(artists_number, users_number, 20, .1, 1, 1)

In [44]:
train(mf_model, mf_train_dl, mf_val_dl, 25, 0.05, device)

Epoch 1 train_loss: 30.165 val_loss: 10.060
Epoch 2 train_loss: 6.015 val_loss: 4.277
Epoch 3 train_loss: 3.383 val_loss: 3.038
Epoch 4 train_loss: 2.347 val_loss: 2.050
Epoch 5 train_loss: 1.452 val_loss: 1.419
Epoch 6 train_loss: 1.099 val_loss: 1.190
Epoch 7 train_loss: 0.970 val_loss: 1.085
Epoch 8 train_loss: 0.902 val_loss: 1.018
Epoch 9 train_loss: 0.856 val_loss: 0.972
Epoch 10 train_loss: 0.821 val_loss: 0.937
Epoch 11 train_loss: 0.794 val_loss: 0.907
Epoch 12 train_loss: 0.771 val_loss: 0.883
Epoch 13 train_loss: 0.751 val_loss: 0.862
Epoch 14 train_loss: 0.733 val_loss: 0.844
Epoch 15 train_loss: 0.717 val_loss: 0.829
Epoch 16 train_loss: 0.704 val_loss: 0.816
Epoch 17 train_loss: 0.692 val_loss: 0.804
Epoch 18 train_loss: 0.683 val_loss: 0.795
Epoch 19 train_loss: 0.675 val_loss: 0.787
Epoch 20 train_loss: 0.668 val_loss: 0.781
Epoch 21 train_loss: 0.662 val_loss: 0.776
Epoch 22 train_loss: 0.657 val_loss: 0.771
Epoch 23 train_loss: 0.653 val_loss: 0.767
Epoch 24 train_los

0.7609735

## Оптимизация гиперпараметров

In [106]:
from hyperopt import hp, fmin, tpe

In [111]:
def sgl_model_obj(config):
    train_data = DataLoader(ContextDataset(nt_train, config['context']), batch_size=BATCH_SIZE_SGL)
    val_data = DataLoader(ContextDataset(nt_val, config['context']), batch_size=BATCH_SIZE_SGL)
    
    model = SkipGramLikeModel(ARTISTS_NUMBER, config['dim'])
    train(model, train_data, val_data, 5, 0.5, device, False)
    
    artists_embeddings = next(iter(model.embeddings.parameters()))
    users_embeddings = make_users_embedding(artists_embeddings.cpu(), users_indexes, artists_indexes)
    artists_embeddings, users_embeddings = artists_embeddings.to(device), users_embeddings.to(device)
    
    neighbour_finder = SimilarArtistsFinder(artists_embeddings, artists_list, artists_indexes)
    recs = recommend(users_embeddings, neighbour_finder, artists_list, common_users_list)
    return -mean_average_precision(recs, users_play_counters_val, 20)


sgl_dim_options = [20, 50, 100]
sgl_context_options = [2, 4]
sgl_space = {
    'dim': hp.choice('dim', sgl_dim_options), 
    'context': hp.choice('context', sgl_context_options) 
}

best_sgl_params = fmin(sgl_model_obj, sgl_space, algo=tpe.suggest, max_evals=5)

sgl_dim = sgl_dim_options[best_sgl_params['dim']]
sgl_context = sgl_context_options[best_sgl_params['context']]
print(sgl_dim, sgl_context)

100%|███████████████████████████████████████████████| 5/5 [33:43<00:00, 404.67s/trial, best loss: -0.10982363734250662]
50 4


In [137]:
def mf_obj(config):
    model = MatrixFactorizationModel(artists_number, users_number, 
                                     config['dim'], config['reg'], config['alpha'], config['beta'])
    train(model, mf_train_dl, mf_val_dl, 20, 0.1, device, False)
    
    artist_embeddings = next(iter(model.artist_embeddings.parameters())).detach()
    user_embeddings = next(iter(model.user_embeddings.parameters())).detach()
    
    neighbour_finder = SimilarArtistsFinder(artist_embeddings, artists_list, artists_indexes)
    recs = recommend(user_embeddings, neighbour_finder, artists_list, common_users_list)
    return -mean_average_precision(recs, users_play_counters_val, 20)


mf_dim_options = [10, 20, 50]
mf_reg_options = [.01, .1, 1.]
mf_alpha_options = [1., 5., 20.]
mf_beta_options = [1, 5, 20]
mf_space = {
    'dim': hp.choice('dim', mf_dim_options),
    'reg': hp.choice('reg', mf_reg_options),
    'alpha': hp.choice('alpha', mf_alpha_options),
    'beta': hp.choice('beta', mf_beta_options)
}

best_mf_params = fmin(mf_obj, mf_space, algo=tpe.suggest, max_evals=40)
mf_dim = mf_dim_options[best_mf_params['dim']]
mf_reg = mf_reg_options[best_mf_params['reg']]
mf_alpha = mf_alpha_options[best_mf_params['alpha']]
mf_beta = mf_beta_options[best_mf_params['beta']]
print(mf_dim, mf_reg, mf_alpha, mf_beta)

100%|███████████████████████████████████████████| 40/40 [1:12:41<00:00, 109.03s/trial, best loss: -0.12017143732796823]
20 0.1 1.0 5


## Оценка

In [113]:
sgl_opt_model = SkipGramLikeModel(ARTISTS_NUMBER, sgl_dim)
sgl_train_dl = DataLoader(ContextDataset(nt_train, sgl_context), batch_size=BATCH_SIZE_SGL)
sgl_val_dl = DataLoader(ContextDataset(nt_val, sgl_context), batch_size=BATCH_SIZE_SGL)
train(sgl_opt_model, sgl_train_dl, sgl_val_dl, 5, 0.5, device)

Epoch 1 train_loss: -8635.944 val_loss: -40078.363
Epoch 2 train_loss: -174113.859 val_loss: -372984.750
Epoch 3 train_loss: -706580.938 val_loss: -1100334.750
Epoch 4 train_loss: -1630167.500 val_loss: -2205818.250
Epoch 5 train_loss: -2925555.000 val_loss: -3667587.250


In [139]:
mf_opt_model = MatrixFactorizationModel(artists_number, users_number, mf_dim, mf_reg, mf_alpha, mf_beta)
train(mf_opt_model, mf_train_dl, mf_val_dl, 20, 0.1, device)

Epoch 1 train_loss: 28.171 val_loss: 6.163
Epoch 2 train_loss: 4.323 val_loss: 3.127
Epoch 3 train_loss: 1.755 val_loss: 1.403
Epoch 4 train_loss: 1.048 val_loss: 1.157
Epoch 5 train_loss: 0.913 val_loss: 1.066
Epoch 6 train_loss: 0.861 val_loss: 1.017
Epoch 7 train_loss: 0.826 val_loss: 0.980
Epoch 8 train_loss: 0.799 val_loss: 0.951
Epoch 9 train_loss: 0.777 val_loss: 0.930
Epoch 10 train_loss: 0.758 val_loss: 0.913
Epoch 11 train_loss: 0.744 val_loss: 0.901
Epoch 12 train_loss: 0.733 val_loss: 0.891
Epoch 13 train_loss: 0.725 val_loss: 0.884
Epoch 14 train_loss: 0.718 val_loss: 0.878
Epoch 15 train_loss: 0.714 val_loss: 0.874
Epoch 16 train_loss: 0.711 val_loss: 0.871
Epoch 17 train_loss: 0.709 val_loss: 0.870
Epoch 18 train_loss: 0.707 val_loss: 0.867
Epoch 19 train_loss: 0.706 val_loss: 0.867
Epoch 20 train_loss: 0.705 val_loss: 0.866


In [140]:
sgl_artists_embeddings = next(iter(sgl_opt_model.embeddings.parameters()))
sgl_users_embeddings = make_users_embedding(sgl_artists_embeddings.cpu(), users_indexes, artists_indexes)
sgl_artists_embeddings, sgl_users_embeddings = sgl_artists_embeddings.to(device), sgl_users_embeddings.to(device)
sgl_neighbour_finder = SimilarArtistsFinder(sgl_artists_embeddings, artists_list, artists_indexes)
sgl_recs = recommend(sgl_users_embeddings, sgl_neighbour_finder, artists_list, common_users_list)

mf_artist_embeddings = next(iter(mf_opt_model.artist_embeddings.parameters()))
mf_user_embeddings = next(iter(mf_opt_model.user_embeddings.parameters()))
mf_neighbour_finder = SimilarArtistsFinder(mf_artist_embeddings, artists_list, artists_indexes)
mf_recs = recommend(mf_user_embeddings, mf_neighbour_finder, artists_list, common_users_list)

In [141]:
sgl_pr1 = precision_k(sgl_recs, users_play_counters_test, 1)
sgl_pr10 = precision_k(sgl_recs, users_play_counters_test, 10)
sgl_map = mean_average_precision(sgl_recs, users_play_counters_test, 20)
sgl_ndcg = ndcg(sgl_recs, users_play_counters_test, 20)

mf_pr1 = precision_k(mf_recs, users_play_counters_test, 1)
mf_pr10 = precision_k(mf_recs, users_play_counters_test, 10)
mf_map = mean_average_precision(mf_recs, users_play_counters_test, 20)
mf_ndcg = ndcg(mf_recs, users_play_counters_test, 20)

print(f'SkipGram metrics: pr1 {sgl_pr1:.3f} pr10 {sgl_pr10:.3f} map {sgl_map:.3f} ndcg {sgl_ndcg:.3f}')
print(f'MatrixFactorization metrics: pr1 {mf_pr1:.3f} pr10 {mf_pr10:.3f} map {mf_map:.3f} ndcg {mf_ndcg:.3f}')

SkipGram metrics: pr1 0.078 pr10 0.074 map 0.074 ndcg 0.041
MatrixFactorization metrics: pr1 0.089 pr10 0.099 map 0.098 ndcg 0.065


In [132]:
show_neighbours(sgl_opt_model.embeddings)

Metallica+&+Korn
Guns+Nroses
Cure,+The
Johnny+Cash+&+Waylon+Jennings
RED+HOT+CHILI+PAPPERS
The+Beatles,+Brian+Matthew
Simon
Foo+Fighter
Nirvana
Bruce+Springsteen++&+The+E+Street+Band
Nine+Inch+Nails
OASIS+-+T.R.W.M.
P.J.+Harvey
The+Pet+Shop+Boys
Santana+feat.+Dave+Matthews+&+Carter+Beauford
Tool
Black+Sabbath
Jamiroquai
Sonic+Youth+&+Cypress+Hill
Neil+Young+&+Graham+Nash+with+The+Stray+Gators
Lenny+Krawitz

Rammstein
Cure,+The
Johnny+Cash+&+Waylon+Jennings
Simon
The+Beatles,+Brian+Matthew
RED+HOT+CHILI+PAPPERS
Foo+Fighter
Guns+Nroses
Nine+Inch+Nails
Nirvana
P.J.+Harvey
The+Pet+Shop+Boys
OASIS+-+T.R.W.M.
Bruce+Springsteen++&+The+E+Street+Band
Jamiroquai
Sonic+Youth+&+Cypress+Hill
Santana+feat.+Dave+Matthews+&+Carter+Beauford
Tool
Lenny+Krawitz
Alanis+Morissette
Neil+Young+&+Graham+Nash+with+The+Stray+Gators

50+Cent+ft+Lloyd+Banks+&+Tony+Yayo
Ellie+Goulding
Justin+Timberlake
Sia+Furler
Lana+Del+Rey+(www.hitov.ru)
John+Legend+&+Teyana+Taylor
Calvin+Harris+Ft.+John+Newman
Ed+Sheeran+&+Gar

In [142]:
show_neighbours(mf_opt_model.artist_embeddings)

Metallica+&+Korn
%23
RED+HOT+CHILI+PAPPERS
Nirvana
Pearl+Jam+&+Zeke
Pink+Floud
Muse
Daft+Punk+&+M83+VS+Big+Black+Delta
Led+Zeppelin
The+Beatles,+Brian+Matthew
Foo+Fighter
Queen+&+Wyclef+Jean+(featuring+Pras+Michael+&+Free)
Alice+In+Chains+with+Pearl+Jam
System+of+a+Down
Arctic+Monkeys
Queens+Of+The+Stone+Age+&+Bea
U2+&+Sin%C3%A9ad+O%E2%80%99Connor
Nine+Inch+Nails
Radiohead+&+Sigur+R%C3%B3s
Linkin+Park+&+Jay+Gordon+of+Orgy
Placebo+feat.+Alison+Mosshart

Rammstein
%23
System+of+a+Down
Muse
Marilyn+Manson+&+Rasputina
RED+HOT+CHILI+PAPPERS
Linkin+Park+&+Jay+Gordon+of+Orgy
Nine+Inch+Nails
Nirvana
Depeche+Mode
Daft+Punk+&+M83+VS+Big+Black+Delta
Placebo+feat.+Alison+Mosshart
Queen+&+Wyclef+Jean+(featuring+Pras+Michael+&+Free)
Pink+Floud
Metallica+&+Korn
Pearl+Jam+&+Zeke
The+Prodigy
Foo+Fighter
Alice+In+Chains+with+Pearl+Jam
U2+&+Sin%C3%A9ad+O%E2%80%99Connor
Led+Zeppelin

50+Cent+ft+Lloyd+Banks+&+Tony+Yayo
%23
Eminem+Featuring+Dido
Daft+Punk+&+M83+VS+Big+Black+Delta
Pharell+wiliams
Lana+Del+Re

### Статистическая значимость

In [188]:
import random
from scipy.stats import mannwhitneyu

In [183]:
N_SUBSAMPLES = 5
SUBSAMPLE_SHARE = .5

Разбиение на тренировочную и валидационную выборки по времени не позволяет использовать обычную k-fold кросс-валидацию, поэтому будем на каждом шаге брать случайные подмножества тренировочной и валидационной выборки.

In [199]:
sgl_pr1s, sgl_pr10s, sgl_maps, sgl_ndcgs = [], [], [], []

for _ in range(N_SUBSAMPLES):
    tr_sub = random.sample(nt_train + nt_val, int(SUBSAMPLE_SHARE * (len(nt_train) + len(nt_val))))
    te_sub = random.sample(nt_test, int(SUBSAMPLE_SHARE * len(nt_test)))
    train_data = DataLoader(ContextDataset(tr_sub, sgl_context), batch_size=BATCH_SIZE_SGL)
    val_data = DataLoader(ContextDataset(te_sub, sgl_context), batch_size=BATCH_SIZE_SGL)
    
    model = SkipGramLikeModel(ARTISTS_NUMBER, sgl_dim)
    train(model, train_data, val_data, int(5 / SUBSAMPLE_SHARE), 0.5, device, False)
    
    artists_embeddings = next(iter(model.embeddings.parameters()))
    users_embeddings = make_users_embedding(artists_embeddings.cpu(), users_indexes, artists_indexes)
    artists_embeddings, users_embeddings = artists_embeddings.to(device), users_embeddings.to(device)
    
    neighbour_finder = SimilarArtistsFinder(artists_embeddings, artists_list, artists_indexes)
    recs = recommend(users_embeddings, neighbour_finder, artists_list, common_users_list)
    te_ua_counters = get_users_play_counters(te_sub)
    
    c_pr1 = precision_k(recs, te_ua_counters, 1)
    c_pr10 = precision_k(recs, te_ua_counters, 10)
    c_map = mean_average_precision(recs, te_ua_counters, 20)
    c_ndcg = ndcg(recs, te_ua_counters, 20)
    print(c_pr1, c_pr10, c_map, c_ndcg)
    sgl_pr1s.append(c_pr1)
    sgl_pr10s.append(c_pr10)
    sgl_maps.append(c_map)
    sgl_ndcgs.append(c_ndcg)

0.07111244467781862 0.04591553521866327 0.048022814098212185 0.036295484825726324
0.0672867751856575 0.052464181231715555 0.053813431628133616 0.04059481935565944
0.055809766709174105 0.052989273122796485 0.052892195053751284 0.039524833620521775
0.06068561998349711 0.04223989198109669 0.043478994660156034 0.033479416459550194
0.05130897907133748 0.03732653214312505 0.03711909884758768 0.0289581776970256


In [187]:
mf_pr1s, mf_pr10s, mf_maps, mf_ndcgs = [], [], [], []

for _ in range(N_SUBSAMPLES):
    tr_sub = random.sample(ra_train + ra_val, int(SUBSAMPLE_SHARE * (len(ra_train) + len(ra_val))))
    te_sub = random.sample(ra_test, int(SUBSAMPLE_SHARE * len(ra_test)))
    train_data = DataLoader(MatrixFactorDataset(tr_sub, users_indexes, artists_indexes), batch_size=BATCH_SIZE_MF, 
                            shuffle=True)
    val_data = DataLoader(MatrixFactorDataset(te_sub, users_indexes, artists_indexes), batch_size=BATCH_SIZE_MF)
    
    model = MatrixFactorizationModel(artists_number, users_number, mf_dim, mf_reg, mf_alpha, mf_beta)
    train(model, train_data, val_data, int(20 / SUBSAMPLE_SHARE), 0.1, device)
    
    artist_embeddings = next(iter(model.artist_embeddings.parameters())).detach()
    user_embeddings = next(iter(model.user_embeddings.parameters())).detach()
    
    neighbour_finder = SimilarArtistsFinder(artist_embeddings, artists_list, artists_indexes)
    recs = recommend(user_embeddings, neighbour_finder, artists_list, common_users_list)
    te_ua_counters = get_users_play_counters(te_sub)
    
    c_pr1 = precision_k(recs, te_ua_counters, 1)
    c_pr10 = precision_k(recs, te_ua_counters, 10)
    c_map = mean_average_precision(recs, te_ua_counters, 20)
    c_ndcg = ndcg(recs, te_ua_counters, 20)
    print(c_pr1, c_pr10, c_map, c_ndcg)
    mf_pr1s.append(c_pr1)
    mf_pr10s.append(c_pr10)
    mf_maps.append(c_map)
    mf_ndcgs.append(c_ndcg)

Epoch 1 train_loss: 31.889 val_loss: 7.403
Epoch 2 train_loss: 4.944 val_loss: 3.798
Epoch 3 train_loss: 2.453 val_loss: 1.775
Epoch 4 train_loss: 1.235 val_loss: 1.241
Epoch 5 train_loss: 0.989 val_loss: 1.100
Epoch 6 train_loss: 0.906 val_loss: 1.026
Epoch 7 train_loss: 0.863 val_loss: 0.985
Epoch 8 train_loss: 0.832 val_loss: 0.950
Epoch 9 train_loss: 0.806 val_loss: 0.924
Epoch 10 train_loss: 0.784 val_loss: 0.903
Epoch 11 train_loss: 0.765 val_loss: 0.886
Epoch 12 train_loss: 0.751 val_loss: 0.871
Epoch 13 train_loss: 0.739 val_loss: 0.863
Epoch 14 train_loss: 0.730 val_loss: 0.853
Epoch 15 train_loss: 0.723 val_loss: 0.846
Epoch 16 train_loss: 0.717 val_loss: 0.841
Epoch 17 train_loss: 0.712 val_loss: 0.839
Epoch 18 train_loss: 0.709 val_loss: 0.835
Epoch 19 train_loss: 0.706 val_loss: 0.832
Epoch 20 train_loss: 0.704 val_loss: 0.830
Epoch 21 train_loss: 0.702 val_loss: 0.829
Epoch 22 train_loss: 0.701 val_loss: 0.828
Epoch 23 train_loss: 0.700 val_loss: 0.826
Epoch 24 train_loss

Epoch 26 train_loss: 0.697 val_loss: 0.823
Epoch 27 train_loss: 0.697 val_loss: 0.822
Epoch 28 train_loss: 0.697 val_loss: 0.823
Epoch 29 train_loss: 0.697 val_loss: 0.822
Epoch 30 train_loss: 0.697 val_loss: 0.823
Epoch 31 train_loss: 0.697 val_loss: 0.822
Epoch 32 train_loss: 0.697 val_loss: 0.823
Epoch 33 train_loss: 0.696 val_loss: 0.823
Epoch 34 train_loss: 0.697 val_loss: 0.823
Epoch 35 train_loss: 0.697 val_loss: 0.823
Epoch 36 train_loss: 0.697 val_loss: 0.824
Epoch 37 train_loss: 0.698 val_loss: 0.821
Epoch 38 train_loss: 0.698 val_loss: 0.825
Epoch 39 train_loss: 0.699 val_loss: 0.824
Epoch 40 train_loss: 0.699 val_loss: 0.824
0.06173580376565899 0.06331107943890181 0.06268200158655272 0.05412528194693056


In [201]:
_, p_value_pr1 = mannwhitneyu(mf_pr1s, sgl_pr1s)
_, p_value_pr10 = mannwhitneyu(mf_pr10s, sgl_pr10s)
_, p_value_map = mannwhitneyu(mf_maps, sgl_maps)
_, p_value_ndcg = mannwhitneyu(mf_ndcgs, sgl_ndcgs)

print(p_value_pr1, p_value_pr10, p_value_map, p_value_ndcg)

0.5 0.006092890177672406 0.006092890177672406 0.006092890177672406


p-value для precision@10, mean average precision и ndcg ниже общепринятого порога в 0.05, что позволяет считать различия в этих метриках значимыми. Метрики отличаются от полученных на полных выборках, что говорит о том, что уменьшение размера данных влияет на качество рекомендаций.