In [174]:
import datetime
from collections import Counter, defaultdict

import pandas as pd
import numpy as np
import scipy.sparse as sp
import torch
from torch import nn

In [47]:
header = ['user_name', 'time', 'artist_id', 'artist_name', 'track_id', 'track_name']
listens = pd.read_csv('lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv', 
                      delimiter='\t', error_bad_lines=False, names=header)

In [48]:
def to_unix_time(dt):
    return datetime.datetime.strptime(dt, '%Y-%m-%dT%H:%M:%SZ').timestamp()

listens['ts'] = listens['time'].apply(to_unix_time)

ts_threshold = sorted(history['ts'])[int(len(history) * .8)]

train_listens = listens[listens['ts'] < ts_threshold]
val_listens = listens[listens['ts'] >= ts_threshold]

In [89]:
train_users = set(train_listens['user_name'])
val_users = set(val_listens['user_name'])
common_users = list(train_users & val_users)
user_ids = {u: i for i, u in enumerate(common_users)}

val_artists = {v for v, c in Counter(val_listens['artist_id']).most_common() if c >= 250}
train_artists = {v for v, c in Counter(train_listens['artist_id']).most_common() if c >= 250}
common_artists = list(val_artists & train_artists - {np.NaN})
artist_ids = {a: i for i, a in enumerate(common_artists)}

In [235]:
art_name_id = {art_name: aid for aid, art_name in art_id_name.items()}

In [98]:
def filter_listens(listens):
    listens = listens[listens['user_name'].apply(lambda u: u in user_ids) & 
                      listens['artist_id'].apply(lambda a: a in artist_ids)]
    listens = listens.assign(uid=listens['user_name'].apply(user_ids.get))
    listens = listens.assign(aid=listens['artist_id'].apply(artist_ids.get))
    return listens

train_listens = filter_listens(train_listens)
val_listens = filter_listens(val_listens)

In [2]:
ratings = pd.read_csv('ml-1m/ratings.dat', delimiter='::', header=None, 
        names=['user_id', 'movie_id', 'rating', 'timestamp'], 
        usecols=['user_id', 'movie_id', 'rating'], engine='python')

In [3]:
movie_info = pd.read_csv('ml-1m/movies.dat', delimiter='::', header=None, 
        names=['movie_id', 'name', 'category'], engine='python')

In [34]:
implicit_mask = ratings['rating'] > 3
implicit_users = ratings[implicit_mask]['user_id']
implicit_movies = ratings[implicit_mask]['movie_id']
user_item = sp.coo_matrix((np.ones(implicit_mask.sum()), (implicit_users, implicit_movies)))
N_USERS = implicit_users.max() + 1
N_ITEMS = implicit_movies.max() + 1

In [110]:
def make_user_item_matrix(listens):
    user_item = sp.coo_matrix((len(common_users), len(common_artists)))
    for u, a in listens[['uid', 'aid']].iterrows():
        user_item[u, a] = 1
    return user_item

# make_user_item_matrix(train_listens)
N_USERS = len(common_users)
N_ITEMS = len(common_artists)
train_user_item = sp.coo_matrix((np.ones(len(train_listens)), (train_listens['uid'], train_listens['aid']))).tocsr().tocoo()
val_user_item = sp.coo_matrix((np.ones(len(val_listens)), (val_listens['uid'], val_listens['aid']))).tocsr().tocoo()

In [159]:
art_id_name = {
    artist_ids[aid]: art_name 
    for _, aid, art_name in listens[['artist_id', 'artist_name']].itertuples() 
    if aid in artist_ids
}

artists = pd.DataFrame({'name': art_id_name.values()}, index=art_id_name.keys())

In [162]:
# def get_movie_names(movies):
#     sim_df = pd.DataFrame({'m_id': movies})
#     return sim_df.join(movie_info.set_index('movie_id'), on='m_id')[['m_id', 'name']]

def get_artist_names(art_ids):
    sim_df = pd.DataFrame({'aid': art_ids})
    return sim_df.join(artists, on='aid')['name']


def get_similars(item_id, model):
    similars = model.similar_items(item_id)
    sim_ids = [m_id for m_id, _ in similars]
    return get_artist_names(sim_ids)
#     return get_movie_names(sim_ids)


def get_recommendations(user_id, model):
    recs = model.recommend(user_id)
    rec_ids = [m_id for m_id, _ in recs]
    return get_artist_names(rec_ids)
#     return get_movie_names(rec_ids)

In [177]:
val_user_artists = defaultdict(set)
for _, uid, aid in val_listens[['uid', 'aid']].itertuples():
    val_user_artists[uid].add(aid)

In [205]:
def prec(model, k):
    rels = []
    for u, listened in val_user_artists.items():
        recs = model.recommend(u, n_recs=k)
        rels.append(sum(r in listened for r, _ in recs))
    return np.mean(rels) / k


gain_discounts = 1 / np.log2(np.arange(2, 52))


def ndcg(model, k):
    normalizer = 1 / gain_discounts[:k].sum()
    ndcgs = []
    for u, listened in val_user_artists.items():
        recs = model.recommend(u, n_recs=k)
        gains = np.array([r in listened for r, _ in recs])
        dcg = np.sum(gains * gain_discounts[:k])
        ndcg = dcg * normalizer
        ndcgs.append(ndcg)
    return np.mean(ndcgs)

In [228]:
def view_hist(user_id):
    return train_listens[train_listens['uid'] == user_id]

In [115]:
class NegativeSampler:
    def __init__(self, interactions, n_items, pop_dist=False):
        self.positives = sp.csr_matrix(interactions)
        self.n_items = n_items
        self.items = interactions.col if pop_dist else np.unique(interactions.col)
        
    def get_positive_mask(self, samples, users):
        return np.array(self.positives[users, samples], np.bool).ravel()
        
    def sample(self, users):
        samples = np.random.choice(self.items, users.shape)
        positive_mask = self.get_positive_mask(samples, users)
        while np.any(positive_mask):
            samples[positive_mask] = np.random.choice(self.items, positive_mask.sum())
            positive_mask = self.get_positive_mask(samples, users)
        return samples

## WARP matrix factorization model

In [134]:
def scalar_prods(vecs1, vecs2):
    return np.sum(vecs1 * vecs2, axis=1).flatten()


class MatrixFactorizationBase:
    def __init__(self, dim, reg_param, n_users, n_items):
        self.dim = dim
        self.n_users = n_users
        self.n_items = n_items
        init_std = 1 / dim ** .5
        self.users_embeddings = np.random.normal(0, init_std, (n_users, dim))
        self.items_embeddings = np.random.normal(0, init_std, (n_items, dim))
        self.users_biases = np.random.uniform(0, .5, n_users)
        self.items_biases = np.random.uniform(0, .5, n_items)
        self.reg_param = reg_param
    
    def fit(self, interactions, n_epochs, lr):
        pass
    
    def similarities(self, users_ids, items_ids):
        return self.users_biases[users_ids] + self.items_biases[items_ids] + \
                scalar_prods(self.users_embeddings[users_ids], self.items_embeddings[items_ids])
    
    def recommend(self, user_id, n_recs = 20):
        similarities = self.items_embeddings @ self.users_embeddings[user_id]
        closest_item_ids = similarities.argsort()[::-1][:n_recs]
        return list(zip(closest_item_ids, similarities[closest_item_ids]))
    
    def similar_items(self, item_id, n_items = 20):
        similarities = self.items_embeddings @ self.items_embeddings[item_id]
        items_by_similariry = similarities.argsort()[::-1]
        items_by_similariry = items_by_similariry[items_by_similariry != item_id]
        most_similar_items = items_by_similariry[:n_items]
        return list(zip(most_similar_items, similarities[most_similar_items]))

In [135]:
WARP_BATCH_SIZE = 4
WARP_MAX_SAMPLE_TRIALS = 100
WARP_MARGIN = 1


def project_vectors(vectors, indexes, max_norm):
    vector_norms = np.linalg.norm(vectors[indexes], axis=1)
    vectors[indexes] *= np.maximum(max_norm / vector_norms, 1).reshape((-1, 1))


class WARPMF(MatrixFactorizationBase):
    def __init__(self, dim, reg_param, n_users, n_items):
        super().__init__(dim, reg_param, n_users, n_items)
        self.items_biases.fill(0.)
        
    def fit(self, interactions, n_epochs, lr):
        users = interactions.row
        positives = interactions.col
        neg_sampler = NegativeSampler(interactions, self.n_items)
            
        for epoch in range(1, n_epochs + 1):
            loss = 0.
            indexes = np.arange(interactions.nnz)
            for batch_start in range(0, interactions.nnz, WARP_BATCH_SIZE):
                batch_indexes = indexes[batch_start:batch_start + WARP_BATCH_SIZE]
                batch_users = users[batch_indexes]
                batch_positives = positives[batch_indexes]
                positives_similarities = self.similarities(batch_users, batch_positives)
                
                batch_negatives = neg_sampler.sample(batch_users)
                negatives_similarities = self.similarities(batch_users, batch_negatives)
                good_mask = positives_similarities - negatives_similarities > WARP_MARGIN
                sampling_counters = np.ones(len(batch_users))
                for _ in range(WARP_MAX_SAMPLE_TRIALS):
                    n_good = good_mask.sum()
                    if n_good == 0:
                        break
                    batch_negatives[good_mask] = neg_sampler.sample(batch_users[good_mask])
                    sampling_counters[good_mask] += 1
                    negatives_similarities[good_mask] = self.similarities(
                        batch_users[good_mask], batch_negatives[good_mask])
                    good_mask = positives_similarities - negatives_similarities > WARP_MARGIN
                to_opt_mask = ~good_mask
                n_to_opt = to_opt_mask.sum()
                
                batch_users = batch_users[to_opt_mask]
                batch_positives = batch_positives[to_opt_mask]
                batch_negatives = batch_negatives[to_opt_mask]
                positives_similarities = positives_similarities[to_opt_mask]
                negatives_similarities = negatives_similarities[to_opt_mask]
                samples_weights = np.log((WARP_MAX_SAMPLE_TRIALS - 1) / sampling_counters[to_opt_mask])
                
                
                loss += np.sum((WARP_MARGIN + negatives_similarities - positives_similarities) * samples_weights)
                positive_biases_grads = -samples_weights
                negative_biases_grads = samples_weights
                samples_weights = np.expand_dims(samples_weights, 1)
                user_grads = samples_weights * \
                        (self.items_embeddings[batch_negatives] - self.items_embeddings[batch_positives])
                positive_grads = samples_weights * (-self.users_embeddings[batch_users])
                negative_grads = samples_weights * self.users_embeddings[batch_users]
                
                np.add.at(self.users_embeddings, batch_users, -lr * user_grads)
                np.add.at(self.items_embeddings, batch_positives, -lr * positive_grads)
                np.add.at(self.items_embeddings, batch_negatives, -lr * negative_grads)
                project_vectors(self.users_embeddings, batch_users, self.reg_param)
                project_vectors(self.items_embeddings, batch_positives, self.reg_param)
                project_vectors(self.items_embeddings, batch_negatives, self.reg_param)
                np.add.at(self.items_biases, batch_positives, -lr * positive_biases_grads)
                np.add.at(self.items_biases, batch_negatives, -lr * negative_biases_grads)
            print(f'Epoch {epoch} loss {loss:.3f}')

In [136]:
warp_model = WARPMF(64, 4, N_USERS, N_ITEMS)
warp_model.fit(train_user_item, 5 , .01)

Epoch 1 loss 1376052.344
Epoch 2 loss 941775.715
Epoch 3 loss 840094.209
Epoch 4 loss 756990.410
Epoch 5 loss 711431.673


In [240]:
get_similars(art_name_id['Metallica'], warp_model)

0                        Korn
1            System Of A Down
2               Dream Theater
3           Children Of Bodom
4                Serj Tankian
5           Killswitch Engage
6                Machine Head
7                   Scorpions
8             Cradle Of Filth
9                 Lamb Of God
10                  Sepultura
11    Bullet For My Valentine
12                  Motörhead
13           Led Zeppelin Jam
14            Velvet Revolver
15                      Ac/Dc
16                    Manowar
17                  In Flames
18                 Darkthrone
19               White Zombie
Name: name, dtype: object

In [245]:
prec(warp_model, 1), prec(warp_model, 10), ndcg(warp_model, 20)

(0.508641975308642, 0.4276543209876543, 0.41783873860532816)

## Neural matrix factorization Model

In [132]:
class NMFModule(nn.Module):
    def __init__(self, n_users, n_items, dim):
        super().__init__()
        self.users_emb = nn.Embedding(n_users, dim)
        self.items_emb = nn.Embedding(n_items, dim)
        self.mlp = nn.Sequential(
            nn.Linear(2 * dim, 2 * dim),
            nn.PReLU(),
            nn.Linear(2 * dim, dim),
            nn.PReLU()
        )
        self.final_clf = nn.Sequential(
            nn.Linear(2 * dim, 1),
            nn.Sigmoid()
        )
        
    def forward(self, users, items):
        users_emb = self.users_emb(users)
        items_emb = self.items_emb(items)
        ewp = users_emb * items_emb
        mlp_out = self.mlp(torch.cat((users_emb, items_emb), dim=1))
        return self.final_clf(torch.cat((ewp, mlp_out), dim=1))

    
class NMFDataLoader:
    def __init__(self, users, items, neg_sampler, batch_size):
        super().__init__()
        self.users = users
        self.items = items
        self.neg_sampler = neg_sampler
        self.batch_size = batch_size
    
    def __len__(self):
        return (len(self.users) + self.batch_size - 1) // self.batch_size
    
    def __getitem__(self, index):
        batch_start = index * self.batch_size
        batch_end = batch_start + self.batch_size
        users = torch.tensor(self.users[batch_start:batch_end], dtype=torch.long)
        pos_items = self.items[batch_start:batch_end]
        neg_items = self.neg_sampler.sample(self.users[batch_start:batch_end])
        pos_items, neg_items = torch.tensor(pos_items, dtype=torch.long), torch.tensor(neg_items, dtype=torch.long)
        return users, pos_items, neg_items
    
    def __iter__(self):
        for i in range(len(self)):
            yield self[i]

            
def train_nmf(model, data, n_epochs, opt, lr_scheduler):
    loss_function = nn.BCELoss()
    model = model.cuda()
    for epoch in range(n_epochs):
        losses = []
        for users, pos_items, neg_items in data:
            users, pos_items, neg_items = users.cuda(), pos_items.cuda(), neg_items.cuda()
            opt.zero_grad()
            pos_pred = model(users, pos_items)
            neg_pred = model(users, neg_items)
            loss = loss_function(torch.cat((pos_pred, neg_pred)), 
                                 torch.cat((torch.ones((len(pos_items), 1)), torch.zeros((len(neg_items), 1)))).cuda())
            loss.backward()
            opt.step()
            losses.append(loss.detach())
        lr_scheduler.step()
        print(f'Epoch {epoch} loss: {torch.tensor(losses).mean():.6f}')

In [181]:
class NMFModel:
    def __init__(self, n_users, n_items, dim):
        self.nmf_module = NMFModule(n_users, n_items, dim)
        self.n_items = n_items
    
    def fit(self, interactions):
        neg_sampler = NegativeSampler(interactions, self.n_items, True)
        users = interactions.row
        items = interactions.col
        nmf_dl = NMFDataLoader(users, items, neg_sampler, batch_size=16384)
        opt = torch.optim.Adam(self.nmf_module.parameters(), lr=.02)
        sched = torch.optim.lr_scheduler.StepLR(opt, 40, .5)
        train_nmf(self.nmf_module, nmf_dl, 400, opt, sched)
        
    def recommend(self, user_id, n_recs = 20):
        user_tensor = torch.tensor([user_id] * self.n_items, dtype=torch.long)
        items = torch.arange(self.n_items)
        with torch.no_grad():
            similarities = self.nmf_module(user_tensor, items).numpy().ravel()
        closest_item_ids = similarities.argsort()[::-1][:n_recs]
        return list(zip(closest_item_ids, similarities[closest_item_ids]))
    
    def similar_items(self, item_id, n_items = 20):
        items_emb = next(self.nmf_module.items_emb.parameters()).detach().numpy()
        similarities = items_emb @ items_emb[item_id]
        items_by_similariry = similarities.argsort()[::-1]
        items_by_similariry = items_by_similariry[items_by_similariry != item_id]
        most_similar_items = items_by_similariry[:n_items]
        return list(zip(most_similar_items, similarities[most_similar_items]))

In [133]:
nmf_model = NMFModel(N_USERS, N_ITEMS, 64)
nmf_model.fit(train_user_item)

Epoch 0 loss: 0.698665
Epoch 1 loss: 0.690904
Epoch 2 loss: 0.685517
Epoch 3 loss: 0.680690
Epoch 4 loss: 0.674413
Epoch 5 loss: 0.666886
Epoch 6 loss: 0.655812
Epoch 7 loss: 0.648533
Epoch 8 loss: 0.646601
Epoch 9 loss: 0.630834
Epoch 10 loss: 0.615481
Epoch 11 loss: 0.604956
Epoch 12 loss: 0.589689
Epoch 13 loss: 0.581610
Epoch 14 loss: 0.585183
Epoch 15 loss: 0.555074
Epoch 16 loss: 0.528112
Epoch 17 loss: 0.507328
Epoch 18 loss: 0.491671
Epoch 19 loss: 0.476524
Epoch 20 loss: 0.470018
Epoch 21 loss: 0.463922
Epoch 22 loss: 0.449010
Epoch 23 loss: 0.433243
Epoch 24 loss: 0.420732
Epoch 25 loss: 0.414199
Epoch 26 loss: 0.407613
Epoch 27 loss: 0.403591
Epoch 28 loss: 0.391552
Epoch 29 loss: 0.379749
Epoch 30 loss: 0.373634
Epoch 31 loss: 0.365026
Epoch 32 loss: 0.359863
Epoch 33 loss: 0.354314
Epoch 34 loss: 0.348967
Epoch 35 loss: 0.347670
Epoch 36 loss: 0.342854
Epoch 37 loss: 0.342788
Epoch 38 loss: 0.339098
Epoch 39 loss: 0.333753
Epoch 40 loss: 0.330156
Epoch 41 loss: 0.320767
Ep

Epoch 333 loss: 0.173692
Epoch 334 loss: 0.173472
Epoch 335 loss: 0.174124
Epoch 336 loss: 0.174830
Epoch 337 loss: 0.173707
Epoch 338 loss: 0.174065
Epoch 339 loss: 0.174648
Epoch 340 loss: 0.174250
Epoch 341 loss: 0.173460
Epoch 342 loss: 0.174414
Epoch 343 loss: 0.174735
Epoch 344 loss: 0.173986
Epoch 345 loss: 0.172458
Epoch 346 loss: 0.174082
Epoch 347 loss: 0.174709
Epoch 348 loss: 0.174869
Epoch 349 loss: 0.173723
Epoch 350 loss: 0.175141
Epoch 351 loss: 0.172991
Epoch 352 loss: 0.174601
Epoch 353 loss: 0.172670
Epoch 354 loss: 0.173427
Epoch 355 loss: 0.173214
Epoch 356 loss: 0.173855
Epoch 357 loss: 0.172931
Epoch 358 loss: 0.173892
Epoch 359 loss: 0.175029
Epoch 360 loss: 0.173799
Epoch 361 loss: 0.172820
Epoch 362 loss: 0.173256
Epoch 363 loss: 0.174198
Epoch 364 loss: 0.173173
Epoch 365 loss: 0.172378
Epoch 366 loss: 0.172665
Epoch 367 loss: 0.175850
Epoch 368 loss: 0.174227
Epoch 369 loss: 0.172470
Epoch 370 loss: 0.172727
Epoch 371 loss: 0.173732
Epoch 372 loss: 0.173535


In [171]:
_ = nmf_model.nmf_module.cpu()

In [241]:
# get_recommendations(1, nmf_model)
get_similars(art_name_id['Eminem'], nmf_model)

0                  Dr. Dre
1      The Black Eyed Peas
2                      Dmx
3                     2Pac
4             Busta Rhymes
5                Too $Hort
6                    N.W.A
7            Mary J. Blige
8     Bone Thugs-N-Harmony
9               Snoop Dogg
10                     Nas
11                   Jay-Z
12                 Rihanna
13        Jedi Mind Tricks
14      Christina Aguilera
15                 Garbage
16          Britney Spears
17                 50 Cent
18                Ice Cube
19             The Prodigy
Name: name, dtype: object

In [244]:
prec(nmf_model, 1), prec(nmf_model, 10), ndcg(nmf_model, 20)

(0.4641975308641975, 0.4296296296296297, 0.4222763012970428)

## Attention model