In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from torch.utils.data import DataLoader
import random
from Transformer import BERTModel
import torch.nn as nn
import torch.optim as optim
from Transformer import Trainer

In [2]:
# Locations of the ml-20m CSV files, relative to the notebook's working directory.
movies_path = "../bert4rec/data/ml-20m/movies.csv"
data_csv_path = "../bert4rec/data/ml-20m/ratings.csv"

In [3]:
# Load the ratings table; later cells read its userId, movieId and timestamp columns.
data = pd.read_csv(filepath_or_buffer=data_csv_path)

In [4]:
# data.head(10)

In [5]:
# Order all ratings chronologically (in place) so that the per-user lists
# built further down come out in interaction order.
data.sort_values("timestamp", inplace=True)

In [6]:
# Collect every distinct movieId in ascending order; this list is the basis
# for the movieId <-> token-id mappings.
movies = data["movieId"].unique().tolist()
movies.sort()

In [7]:
# Token ids 0 and 1 are reserved:
#   0 -> [PAD]
#   1 -> [MASK]
# Real movies therefore start at token id 2.
movie_to_id = {movie: token for token, movie in enumerate(movies, start=2)}
# Reverse lookup: token id -> original movieId.
id_to_movie = {token: movie for movie, token in movie_to_id.items()}

In [8]:
# One list of movieIds per user, in the row order of `data`.
# Select the column *before* aggregating: the previous form built Python lists
# for every column of the frame and then threw all but movieId away.
group_by_data = data.groupby(by='userId')["movieId"].agg(list)

In [9]:
# Drop the userId index: a plain Python list with one movieId sequence per user.
groups_data = group_by_data.tolist()

In [10]:
# 80 / 10 / 10 split over users (each element of groups_data is one user's sequence).
# A seeded generator keeps the split identical across kernel restarts, so the
# held-out users never drift into the training set between runs.
split_generator = torch.Generator().manual_seed(42)
train_set, test_set, validate_set = random_split(
    groups_data,
    [0.8, 0.1, 0.1],
    generator=split_generator,
)

In [11]:
class BERTDataset(Dataset):
    """Map-style dataset turning raw movie-id sequences into fixed-length token tensors.

    Each item is a pair ``(tokens, labels)`` of ``LongTensor``s of length
    ``max_len`` (assuming ``max_len`` is non-negative):

    * ``tokens`` - the tokenized (and, in training mode, randomly masked)
      sequence, right-padded with ``padding_id``.
    * ``labels`` - the original token at every position selected for masking
      and ``IGNORE_LABEL`` (0) everywhere else, including padded positions.

    NOTE(review): with ``train=False`` no position is masked, so ``labels`` is
    all zeros - confirm the consumer of evaluation batches expects that.
    """

    # Label value meaning "no prediction target at this position".
    IGNORE_LABEL = 0

    def __init__(self, data, mapping, padding_id, mask_id, max_len = 128, train = False  ):
        """Dataset class object for the ml-20m dataset.

        Args:
            data (sequence): indexable collection of movie-id sequences
            mapping (dict): dictionary that maps movie id to token id
            padding_id (int): the token id of [PAD]
            mask_id (int): the token id of [MASK]
            max_len (int, optional): the maximum length of a sequence. Defaults to 128.
            train (bool, optional): whether this dataset is a training set
                (enables random masking). Defaults to False.
        """
        self.data = data
        self.mapping = mapping
        self.padding_id = padding_id
        self.masked_id = mask_id
        self.max_len = max_len
        self.train = train
        self.num_items = len(mapping)

    def __getitem__(self, index):
        """Return the ``(tokens, labels)`` tensor pair for the sequence at ``index``.

        Raises:
            KeyError: if the sequence contains an id missing from ``mapping``.
        """
        # Discard everything past max_len (the first max_len entries are kept).
        items = self.data[index][:self.max_len]
        # Tokenize the sequence.
        tokens = [self.mapping[item] for item in items]
        if self.train:
            # Training mode: apply random masking and collect targets.
            tokens, labels = self.random_mask(tokens)
        else:
            labels = [self.IGNORE_LABEL] * len(tokens)
        # Right-pad both vectors up to max_len.
        padding_len = self.max_len - len(tokens)
        tokens = tokens + [self.padding_id] * padding_len
        # Labels are padded with the "no target" value, NOT with padding_id:
        # a non-zero padding_id would otherwise show up as a target on padding.
        labels = labels + [self.IGNORE_LABEL] * padding_len
        return torch.LongTensor(tokens), torch.LongTensor(labels)

    def __len__(self):
        """Number of sequences in the dataset."""
        return len(self.data)

    def random_mask(self, sequence):
        """Randomly mask a token sequence with the following strategy:

           85% chance to leave a token untouched (label 0),
           15% chance to select it as a prediction target.
           For a selected token: 80% chance to replace it with [MASK],
           10% chance to replace it with a random item token and
           10% chance to keep it unchanged. Its label is the original token.

        The random replacement is drawn from ``2 .. num_items + 1`` inclusive,
        which assumes ``mapping`` assigns exactly those contiguous token ids.

        Args:
            sequence (iterable): token ids to be masked

        Return:
            tokens (list): sequence after masking
            mask (list): per-position labels (original token or 0)
        """
        tokens = []
        mask = []
        for token in sequence:
            # First draw: is this position selected at all?
            if random.random() < 0.85:
                tokens.append(token)
                mask.append(self.IGNORE_LABEL)
                continue
            # Second draw: how is the selected position corrupted?
            choice = random.random()
            if choice < 0.8:
                tokens.append(self.masked_id)
            elif choice < 0.9:
                # randint is inclusive on both ends.
                tokens.append(random.randint(2, self.num_items + 1))
            else:
                tokens.append(token)
            mask.append(token)
        return tokens, mask

In [12]:
# --- Device ------------------------------------------------------------------
# Keep 'mps' as the first choice, but fall back instead of failing on machines
# where it is not available.
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# --- Hyper-parameters --------------------------------------------------------
lr = 1e-3
epochs = 100
decay_steps=25      # StepLR: multiply the learning rate by `gamma` every `decay_steps` scheduler steps
gamma = 0.01
weight_decay = 0.01

# --- Model / optimisation ----------------------------------------------------
# NOTE(review): token ids produced by movie_to_id run from 2 to len(movies) + 1
# (0 = PAD, 1 = MASK); confirm BERTModel reserves room for those two extra ids
# when given len(movies) as its item count.
model = BERTModel(128,2,4,len(movies),256,0.1)
optimizer = optim.Adam(model.parameters(),lr = lr, weight_decay= weight_decay)
# Label 0 means "no prediction target here", so it is excluded from the loss.
loss_fn = nn.CrossEntropyLoss(ignore_index=0)
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=decay_steps, gamma=gamma)

# --- Data --------------------------------------------------------------------
train_dataset = BERTDataset(train_set, movie_to_id, 0, 1, 128, True)
# Validation must use the held-out users; it was previously built from
# train_set, which made the validation numbers a measurement on training data.
val_dataset = BERTDataset(validate_set, movie_to_id, 0, 1, 128, False)
train_loader = DataLoader(train_dataset,batch_size=64,shuffle=True, drop_last=True)
# No shuffling for evaluation: the order is irrelevant and a fixed order keeps runs comparable.
val_loader = DataLoader(val_dataset,batch_size=64,shuffle=False, drop_last=True)

In [13]:
# Wire model, loaders, optimisation objects and the checkpoint directory into the Trainer.
bert_trainer = Trainer(
    model,
    train_loader,
    val_loader,
    './checkpoint/',
    device,
    optimizer,
    loss_fn,
    lr_scheduler,
    epochs,
)

In [14]:
# Invoke Trainer.train(); presumably runs the full training loop for `epochs`
# epochs and writes checkpoints - confirm against Transformer.Trainer.
bert_trainer.train()

  0%|          | 0/1731 [00:00<?, ?it/s]