In [1]:
import torch
from torch import nn
import torch.utils.data as Data
import pandas as pd
import numpy as np
import random
import collections
import math
import sys
import time

In [2]:
class item2VecDataset(torch.utils.data.Dataset):
    def __init__(self, centers, contexts, negatives):
        assert len(centers) == len(contexts) == len(negatives)
        super(item2VecDataset, self).__init__()
        self.centers = centers
        self.contexts = contexts
        self.negatives = negatives

    def __getitem__(self, index):
        return (self.centers[index], self.contexts[index], self.negatives[index])

    def __len__(self):
        return len(self.centers)

In [3]:
class BinaryCrossEntropyLoss(nn.Module):
    def __init__(self):
        super(BinaryCrossEntropyLoss, self).__init__()

    def forward(self, inputs, targets, mask = None):
        inputs, targets, mask = inputs.float(), targets.float(), mask.float()
        res = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction = 'none', weight = mask)
        return res.mean(dim = 1)

In [4]:
class CBOW(nn.Module):
    def __init__(self):
        super(CBOW, self).__init__()
        self.embedding1 = nn.Embedding(num_embeddings = size, embedding_dim = embed_size)
        self.embedding2 = nn.Embedding(num_embeddings = size, embedding_dim = embed_size)

    def forward(self, context, center_negative):
        v = self.embedding1(context)
        v = v.mean(dim = 0)
        u = self.embedding2(center_negative)
        pred = torch.bmm(v, u.permute(0, 2, 1))
        return pred

In [5]:
def skip_gram(center, contexts_negatives, embed_v, embed_u):
    v = embed_v(center)
    u = embed_u(contexts_negatives)
    pred = torch.bmm(v, u.permute(0, 2, 1))
    return pred

In [6]:
def train_item2Vec(net, lr, num_epochs, loss, data_iter):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('train on', device)
    net = net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr = lr)
    for epoch in range(num_epochs):
        start, l_sum, n = time.time(), 0.0, 0
        for batch in data_iter:
            center, context_negative, mask, label = [d.to(device) for d in batch]
#            print(center.shape, context_negative.shape)
            pred = skip_gram(center, context_negative, net[0], net[1])
#            print(pred.shape, mask.shape, label.shape)
            l = (loss(pred.view(label.shape), label, mask.view(label.shape)) * mask.shape[1] / mask.float().sum(dim = 1)).mean()
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            l_sum += l.cpu().item()
            n += 1

        print('epoch %d, loss %f, time %.2fs' % (epoch + 1, l_sum / n, time.time() - start))

In [7]:
def get_corpus(path):
    ratings = pd.read_csv(path)
    all_rating = pd.read_csv("~/Data/clean_rating4.csv").drop(["id"], axis = 1)
#     print(ratings[ratings.anime_id == 31687])
    positive_rating = ratings
    all_anime_list = all_rating['anime_id'].tolist()
    anime_list = ratings["anime"].tolist()
    counter = collections.Counter(anime_list)

    idx_to_anime = list(set(all_anime_list))
    anime_to_idx = {anime_id: idx for idx, anime_id in enumerate(idx_to_anime)}
    
    gp = positive_rating.groupby("user")
    corpus = [list(map(lambda x: anime_to_idx[x], gp.get_group(user_id)['anime'].tolist())) for user_id, _ in gp]
#     print(len(sum(corpus, [])))
    corpus = [[aid for aid in st if subsampling(aid, counter, len(anime_list), idx_to_anime)] for st in corpus]
#     print(len(sum(corpus, [])))
    return corpus, anime_to_idx, idx_to_anime, counter

In [8]:
def subsampling(aid, counter, size, idx_to_anime):
    return random.uniform(0, 1) < 1 - math.sqrt(1e-4 * size / counter[idx_to_anime[aid]])

In [9]:
def get_centers_and_contexts(corpus):
    centers, contexts = [], []
    for animes in corpus:
        if len(animes) < 2:
            continue
        centers += animes
        for idx, _ in enumerate(animes):
            contexts.append(animes[:idx] + animes[idx + 1:])

    return centers, contexts

In [10]:
def negative_sampling(contexts, weights, K):
    negatives, neg_candidates = [], []
    all_animes = list(range(len(weights)))
    for idx, context in enumerate(contexts):
        if not idx % 10000: print(idx)
        negs, i = [], 0
        neg_candidates = random.choices(all_animes, weights, k = int(1e5))
        while len(negs) < K:
            if neg_candidates[i] in context:
                i += 1
                continue
            else:
                negs.append(neg_candidates[i])

        negatives.append(negs)

    return negatives

In [11]:
def select_batch(data):
    max_len = max([len(context) + len(negative) for _, context, negative in data])
    context_negatives, masks, labels, centers = [], [], [], []
    for center, context, negative in data:
        cur_len = len(context) + len(negative)
        context_negatives.append(context + negative + [0] * (max_len - cur_len))
        centers.append(center)
        masks.append([1] * cur_len + [0] * (max_len - cur_len))
        labels.append([1] * len(context) + [0] * (max_len - len(context)))

    return (torch.tensor(centers).view(-1, 1), torch.tensor(context_negatives), torch.tensor(masks), torch.tensor(labels))

In [12]:
if __name__ == "__main__":

    anime = pd.read_csv('~/Data/anime.csv')
    corpus, anime_to_idx, idx_to_anime, counter = get_corpus('~/Thesis/Data/train.csv')
#     print()
    print(sum([len(st) for st in corpus]), len(corpus), len(idx_to_anime))

755217 4701 9775


In [None]:
    centers, contexts = get_centers_and_contexts(corpus)
    weights = [counter[aid] ** 0.75 for aid in idx_to_anime]
    print(len(centers), len(contexts))
    
    negatives = negative_sampling(contexts, weights, 5)
    print(len(centers), len(contexts), len(negatives))
    
    with open('negative.txt', 'w') as f:
        f.write(str(negatives))
        f.close()
    
    with open('centers.txt', 'w') as f:
        f.write(str(centers))
        f.close()
    
    with open('contexts.txt', 'w') as f:
        f.write(str(contexts))
        f.close()

755217 755217
0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000


In [23]:
        
    batch_size = 512
    num_workers = 0 if sys.platform.startswith('win32') else 4

    dataset = item2VecDataset(centers, contexts, negatives)
    data_iter = Data.DataLoader(dataset, batch_size, shuffle = True, collate_fn = select_batch, num_workers = num_workers)

    embed_size = 100
#    for batch in data_iter:
#        for name, data in zip(['centers', 'context_negatives', 'masks', 'labels'], batch):
#            print(name, data.shape)
#        break
    net = nn.Sequential(
        nn.Embedding(num_embeddings = len(idx_to_anime), embedding_dim = embed_size),
        nn.Embedding(num_embeddings = len(idx_to_anime), embedding_dim = embed_size) 
    )
    train_item2Vec(net, 0.1, 30, BinaryCrossEntropyLoss(), data_iter)

    torch.save(net.state_dict(), 'skip_gram_complete.pt')
    

755333 755333
0


KeyboardInterrupt: 

In [7]:
embed_size = 100
net = nn.Sequential(
        nn.Embedding(num_embeddings = len(idx_to_anime), embedding_dim = embed_size),
        nn.Embedding(num_embeddings = len(idx_to_anime), embedding_dim = embed_size) 
    )
net.load_state_dict(torch.load("skip_gram_complete.pt"))

<All keys matched successfully>

In [17]:
# W = net[0].weight.data
# x = W[10]
# print(anime[anime['anime_id'] == 5114])
# cos = torch.matmul(W, x) / (torch.sum(W * W, dim = 1) * torch.sum(x * x) + 1e-9).sqrt()
# _, topk = torch.topk(cos, k = 6)
# topk = topk.cpu().numpy()
# for i in topk[1:]:
#     print('cosine sim = %.3f' % (cos[i]))
#     print(anime[anime['anime_id'] == idx_to_anime[i]])
    
final = []
test_df = pd.read_csv("~/Thesis/Data/test.csv")
users = list(set(test_df.user))
for k in range(10):
    precision = []
    for user in users:
        for i in range(10, -1, -1):
            animes = test_df[(test_df.user == user) & (test_df.rating == i)].anime.tolist()
            if animes:
                anime = animes[0]
                break
                
        
        x = net[0].weight.data[anime_to_idx[anime]]
        cos = torch.matmul(W, x) / (torch.sum(W * W, dim = 1) * torch.sum(x * x) + 1e-9).sqrt()
        target_anime = test_df[(test_df.user == user) & (test_df.rating > k)].anime.tolist()
        temp = []
        
        for ta in target_anime:
            temp.append((ta, cos[anime_to_idx[ta]]))
            
        idx = [st[0] for st in sorted(temp, key = lambda x: x[1], reverse = True)[1:6]]    
        both = set(idx) & set(target_anime)
#         print(len(idx), len(both))
        precision.append(len(both) / len(idx))
        
    final.append(np.mean(precision))
    print(final[-1])

KeyError: 31687