In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from torch.utils.data import DataLoader
import random
from Transformer import BERTModel
import torch.nn as nn
import torch.optim as optim

In [2]:
data_csv_path = "data/ml-20m/ratings.csv"
movies_path = "data/ml-20m/movies.csv"

In [3]:
data = pd.read_csv(data_csv_path)

In [4]:
data.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580
5,1,112,3.5,1094785740
6,1,151,4.0,1094785734
7,1,223,4.0,1112485573
8,1,253,4.0,1112484940
9,1,260,4.0,1112484826


In [5]:
data.sort_values(by="timestamp",inplace=True)

In [6]:
# build map and inversed map from movieId to tokenId
movies = sorted(data["movieId"].unique().tolist())

In [7]:
# 0 : PAD
# 1 : MASK
movie_to_id = {k:i+2 for i,k in enumerate(movies)}
id_to_movie = {movie_to_id[k]:k for k in movie_to_id}

In [8]:
group_by_data = data.groupby(by='userId').agg(list)["movieId"]

In [9]:
groups_data = group_by_data.to_list()

In [10]:
train_set,test_set,validate_set = random_split(groups_data,[0.8,0.1,0.1])

In [11]:
class BERTDataset(Dataset):
    
    def __init__(self, data, mapping, padding_id, mask_id, max_len = 128, train = False  ):
        """ Dataset class object for ml-20m dataset

        Args:
            data (list): data
            mapping (dict): the dictionary that map moive id to token id
            padding_id(int): the token id of [PAD]
            mask_id (int): the token id of [MASK]
            max_len (int, optional): the maximum length of a sequence. Defaults to 128.
            train (bool, optional): if this dataset is a training set. Defaults to False.
        """
        self.data = data
        self.mapping = mapping
        self.padding_id = padding_id
        self.masked_id = mask_id
        self.max_len = max_len
        self.train = train
        self.num_items = len(mapping)
    
    def __getitem__(self, index):
        seq = self.data[index]
        # depricate parts over max_len
        if len(seq) > self.max_len:
            seq = seq[:self.max_len]
        # tokenize the sequence
        seq = [self.mapping[x] for x in seq]
        mask = [0 for _ in range(len(seq))]
        # if it is training set, mask it
        if self.train:
            seq, mask = self.random_mask(seq)
        # padding 
        padding_len = self.max_len - len(seq)
        seq = seq + [self.padding_id] * padding_len
        mask = mask + [self.padding_id] * padding_len
        return seq, mask

    def __len__(self):
        return len(self.data)

    def random_mask(self, sequence):
        """randomly mask sequence use following strategy:
           85% chance not to mask
           15% chance to mask
           when masking, 80% chance to use [MASK] to replace the token,
           10% chance to replace it with an random token and 10% chance to make no change

        Args:
            sequence(iteratble) sequence to be masked

        Return:
            sequence(list) sequence after masking
            mask(list) mask matrix
        """
        tokens = []
        mask = []
        for s in sequence:
            prob = random.random()
            # not mask
            if prob < 0.85:
                tokens.append(s)
                mask.append(0)
            # mask
            else:
                prob = random.random()
                if prob < 0.8:
                    tokens.append(self.masked_id)
                elif prob < 0.9:
                    tokens.append(random.randint(2,self.num_items+1))
                else:
                    tokens.append(s)
                mask.append(s) 
        return tokens, mask

In [12]:
my_dataset = BERTDataset(train_set, movie_to_id, 0, 1, 128, True)

In [13]:
bert = BERTModel(128,2,8,len(movies),128,0.1)

In [14]:
train_dataloader = DataLoader(my_dataset, batch_size=128, shuffle=True)
it = iter(train_dataloader)

In [15]:
#it = iter(train_dataloader)
loss = nn.NLLLoss(ignore_index=0)
optimizer = optim.Adam(bert.parameters(), lr=1e-5)

for idx, (train_tokens, train_mask) in enumerate(train_dataloader):
    #train_tokens, train_mask = next(it)

    # convert first dimension to tensor
    train_tokens = torch.stack(train_tokens)
    train_mask = torch.stack(train_mask)

    train_h = bert(train_tokens)

    #print(train_h.shape)

    train_h = train_h.view(-1, train_h.size(-1))
    train_mask = train_mask.view(-1)

    #train_h = torch.transpose(train_h, 1, 2)

    #print(train_h.shape)
    #print(train_mask.shape)

    optimizer.zero_grad()
    l = loss(train_h, train_mask)
    l.backward()
    optimizer.step()

    print(l.item(), idx)
    #mask = train_mask.ne(0)
    #mask_ = mask.unsqueeze(-1).expand(train_h.size())
    #print(train_h.shape, mask_.shape)
    #train_hat = torch.masked_select(train_h, mask)
    #lable_hat = torch.masked_select(train_mask, mask)

-3.531120455591008e-05 0
-3.478731377981603e-05 1
-3.5394972655922174e-05 2
-3.0581530154449865e-05 3
-4.330832962295972e-05 4
-3.3473013900220394e-05 5
-3.363618816365488e-05 6
-3.689961522468366e-05 7
-3.199779166607186e-05 8
-5.280142431729473e-05 9
-2.9211691071395762e-05 10
-3.148517134832218e-05 11
-2.9987015295773745e-05 12
-2.8250466129975393e-05 13
-2.9060769520583563e-05 14
-3.391346035641618e-05 15
-3.661254595499486e-05 16
-3.092967745033093e-05 17
-3.424743772484362e-05 18
-2.9884949981351383e-05 19
-3.0193885322660208e-05 20
-3.625311001087539e-05 21
-3.2586285669822246e-05 22
-3.3450083719799295e-05 23
-3.9313392335316166e-05 24
-3.7429501389851794e-05 25
-3.740273677976802e-05 26
-4.02764453610871e-05 27
-3.528354864101857e-05 28
-3.3066084142774343e-05 29
-2.918748759839218e-05 30
-3.4059899917338043e-05 31
-3.409197597648017e-05 32
-3.97050789615605e-05 33
-3.00255687761819e-05 34
-3.918162838090211e-05 35
-3.5846645914716646e-05 36
-5.164441608940251e-05 37
-3.176994