In [50]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
import random

In [2]:
# Locations of the ml-20m input files, relative to the notebook's working directory.
data_csv_path, movies_path = (
    "data/ml-20m/ratings.csv",
    "data/ml-20m/movies.csv",
)

In [3]:
# Load the ratings CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer=data_csv_path)

In [4]:
# Preview the first ten rows to check the columns that were loaded.
data.head(n=10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580
5,1,112,3.5,1094785740
6,1,151,4.0,1094785734
7,1,223,4.0,1112485573
8,1,253,4.0,1112484940
9,1,260,4.0,1112484826


In [5]:
# Order the ratings chronologically.
# Rebinding `data` instead of using inplace=True avoids mutating a frame that an
# earlier cell already displayed, and the stable sort keeps ratings that share a
# timestamp in the order they had before sorting.
data = data.sort_values(by="timestamp", kind="mergesort")

In [6]:
# Collect every distinct movieId in ascending order; this list is the basis for
# the movieId <-> token id maps built in the next cell.
moives = data["movieId"].unique().tolist()
moives.sort()

In [8]:
# Token ids 0 and 1 are reserved (0 : PAD, 1 : MASK), so movie tokens start at 2.
moive_to_id = dict(zip(moives, range(2, len(moives) + 2)))
# Inverse map: token id -> original movieId.
id_to_moive = {token_id: movie_id for movie_id, token_id in moive_to_id.items()}

In [37]:
# One list of movieIds per user, indexed by userId.
# Selecting the column before aggregating avoids building per-user lists for the
# other columns only to throw them away.
# Row order inside each group follows the current row order of `data`.
group_by_data = data.groupby(by="userId")["movieId"].agg(list)

In [42]:
# Drop the userId index and keep a plain Python list of per-user movieId lists.
groups_data = list(group_by_data)

In [46]:
# Split users 80% / 10% / 10%. A seeded generator makes the split reproducible,
# so the same users land in each subset on every run.
# NOTE: the unpacking order is train, test, validate.
train_set, test_set, validate_set = random_split(
    groups_data,
    [0.8, 0.1, 0.1],
    generator=torch.Generator().manual_seed(42),
)

In [53]:
class BERTDataset(Dataset):
    """Map-style dataset that turns movie-id sequences into fixed-length token sequences.

    Each item is a pair ``(seq, mask)`` of plain Python lists, both of length
    ``max_len``: ``seq`` holds token ids (right-padded with ``padding_id``) and
    ``mask`` holds 1 at positions selected by the masking procedure, 0 elsewhere.
    """

    def __init__(self, data, mapping, padding_id, mask_id, max_len=128, train=False):
        """ Dataset class object for ml-20m dataset

        Args:
            data (sequence): indexable collection of movie id sequences
            mapping (dict): the dictionary that maps movie id to token id
            padding_id (int): the token id of [PAD]
            mask_id (int): the token id of [MASK]
            max_len (int, optional): the maximum length of a sequence. Defaults to 128.
            train (bool, optional): if this dataset is a training set. Defaults to False.
        """
        self.data = data
        self.mapping = mapping
        self.max_len = max_len
        self.train = train
        self.masked_id = mask_id
        self.padding_id = padding_id
        self.num_items = len(mapping)
        # Real item token ids; random replacements are drawn from these so that a
        # replacement can never fall outside the vocabulary.
        self._item_ids = list(mapping.values())

    def __getitem__(self, index):
        """Return the ``(seq, mask)`` pair for the sequence at ``index``.

        Raises:
            KeyError: if the sequence contains a movie id missing from ``mapping``.
        """
        # Keep only the first max_len entries; anything after that is dropped.
        # NOTE(review): if the sequences are in chronological order this keeps the
        # oldest interactions - confirm that is intended.
        movie_ids = self.data[index][:self.max_len]
        # Tokenize the sequence.
        seq = [self.mapping[movie_id] for movie_id in movie_ids]
        # Masking is applied to the training set only.
        if self.train:
            seq, mask = self.random_mask(seq)
        else:
            mask = [0] * len(seq)
        # Right-pad both lists up to max_len.
        padding_len = self.max_len - len(seq)
        seq = seq + [self.padding_id] * padding_len
        mask = mask + [0] * padding_len
        return seq, mask

    def __len__(self):
        """Return the number of sequences in the dataset."""
        return len(self.data)

    def random_mask(self, sequence):
        """Randomly mask a token sequence using the following strategy:
           85% chance to leave a position untouched (mask value 0)
           15% chance to select it (mask value 1)
           for a selected position, 80% chance to replace the token with [MASK],
           10% chance to replace it with a random item token and
           10% chance to keep the token unchanged

        NOTE(review): the original tokens at selected positions are not returned,
        so the caller needs another source for the prediction targets - confirm
        against the training loop.

        Args:
            sequence (iterable): token ids to be masked

        Return:
            tokens (list): token ids after masking
            mask (list): 1 where the position was selected, 0 otherwise
        """
        tokens = []
        mask = []
        for token in sequence:
            # Not selected: keep the token as it is.
            if random.random() < 0.85:
                tokens.append(token)
                mask.append(0)
                continue
            # Selected: decide how the position is corrupted.
            prob = random.random()
            if prob < 0.8:
                tokens.append(self.masked_id)
            elif prob < 0.9:
                # Draw from the actual item token ids. The previous
                # randint(2, num_items + 2) was inclusive at both ends and could
                # produce an id one past the largest valid token.
                tokens.append(random.choice(self._item_ids))
            else:
                tokens.append(token)
            mask.append(1)
        return tokens, mask

In [54]:
# Training dataset: PAD is token 0, MASK is token 1, sequences are cut/padded to 128.
my_dataset = BERTDataset(
    data=train_set,
    mapping=moive_to_id,
    padding_id=0,
    mask_id=1,
    max_len=128,
    train=True,
)

tensor([0.6094, 0.5008, 0.5124])