In [12]:
import cornac
import numpy as np
import pandas as pd
import math, sys
from dset import RecsysData, SequenceDataset
from negative_sampler import NegativeSampler
import random
from model import BERTModel
import pytorch_lightning as pl
from torch.utils.data import DataLoader

In [13]:
# Load the MovieLens feedback records in 'UIRT' format; the cells below read the
# four fields of each record as user id, item id, rating and timestamp.
# NOTE(review): the variable name suggests the 100K variant, which would rely on
# the loader's default — confirm against the cornac documentation.
ml_100k = cornac.datasets.movielens.load_feedback(fmt='UIRT')

In [14]:

# Stack the feedback records into one 2-D array so each field can be sliced by column.
np_data = np.array(ml_100k)

# Preview every column: the two id columns as stored, the rating cast to float
# and the timestamp cast to int.
for col, cast in enumerate((None, None, float, int)):
    column = np_data[:, col]
    print(column if cast is None else column.astype(cast))

['196' '186' '22' ... '276' '13' '12']
['242' '302' '377' ... '1090' '225' '203']
[3. 3. 1. ... 1. 2. 3.]
[881250949 891717742 878887116 ... 874795795 882399156 879959583]


In [15]:


# Column name -> cast applied to the matching np_data column (None keeps the
# values as stored). Insertion order matches the column order of np_data.
column_casts = {
    'user_id': None,
    'item_id': None,
    'rating': float,
    'timestamp': int,
}

# Assemble the interaction table, one named column per field of np_data.
pd_data = pd.DataFrame({
    name: np_data[:, idx] if cast is None else np_data[:, idx].astype(cast)
    for idx, (name, cast) in enumerate(column_casts.items())
})

In [16]:
# Sanity-check the assembled table: row count, per-column dtypes and null counts.
pd_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   user_id    100000 non-null  object 
 1   item_id    100000 non-null  object 
 2   rating     100000 non-null  float64
 3   timestamp  100000 non-null  int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 3.1+ MB


In [17]:
# Inspect what kind of object the category codes of `user_id` come back as.
type(pd_data['user_id'].astype('category').cat.codes)

pandas.core.series.Series

In [18]:
# Report sys.getsizeof for the three representations of the same data, in the
# order DataFrame, NumPy array, original list.
# Caveat: for the plain list, sys.getsizeof counts only the list object itself,
# not the records it refers to, so the last number is not directly comparable.
for container in (pd_data, np_data, ml_100k):
    print(sys.getsizeof(container))

13579317
51200128
800984


In [19]:


# Build the project's RecsysData wrapper from the interaction table; the later
# cells read its train/val/test sequences and user/item counts.
myData = RecsysData(pd_data)
# Display the item count as the cell output.
myData.num_items

Splitting


100%|██████████| 943/943 [00:00<00:00, 4549.06it/s]


1682

In [20]:
# ---------------------------------------------------------------------------
# Shared settings for the datasets, the negative sampler and the model.
# ---------------------------------------------------------------------------
max_len = 100   # sequence length passed to both datasets and the model
SEED = 1234     # single seed reused by every seeded component below

# Token id handed to both datasets as the mask token.
# NOTE(review): `num_items + 1` presumably assumes item ids occupy
# 1..num_items with 0 reserved for padding — confirm in dset.py / model.py.
mask_token = myData.num_items + 1

# Seed Python, NumPy and torch global RNGs so that model initialisation and the
# shuffling done later by the training DataLoader are repeatable on a
# fresh-kernel run. Previously only the masking RNG and the negative sampler
# were seeded.
pl.seed_everything(SEED)

# Training dataset: sequences from the training split, with a dedicated RNG
# and a 0.15 mask probability.
trainset = SequenceDataset(
    mask_token=mask_token,
    u2seq=myData.train_seqs,
    max_len=max_len,
    mask_prob=0.15,
    num_items=myData.num_items,
    rng=random.Random(SEED),
)

# Negative samples (99 per request, as configured here) that the evaluation
# dataset receives. The sampler is given all three splits and a folder under
# ./Data/.
# NOTE(review): the "test_" names are kept for compatibility, but these
# negatives are consumed by the *validation* dataset below — confirm intended.
test_negative_sampler = NegativeSampler(
    train=myData.train_seqs,
    val=myData.val_seqs,
    test=myData.test_seqs,
    user_count=myData.num_users,
    item_count=myData.num_items,
    sample_size=99,
    seed=SEED,
    save_folder="./Data/",
)
test_negative_samples = test_negative_sampler.get_negative_samples()

# Validation dataset: training sequences as context, validation items as the
# answers, evaluated against the sampled negatives.
valset = SequenceDataset(
    mask_token=mask_token,
    eval=True,
    u2seq=myData.train_seqs,
    u2answer=myData.val_seqs,
    max_len=max_len,
    negative_samples=test_negative_samples,
)

# BERT-style sequence model sized for this item catalogue.
mymodel = BERTModel(
    hidden_size=256,
    num_items=myData.num_items,     # number of items
    n_layers=2,
    dropout=0,
    heads=8,
    max_len=max_len,
)

Negatives samples exist. Loading.


In [21]:
# Settings common to both loaders: same batch size, pinned host memory.
loader_kwargs = dict(batch_size=128, pin_memory=True)

# Training batches are reshuffled every epoch; validation order stays fixed.
train_loader = DataLoader(trainset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(valset, shuffle=False, **loader_kwargs)

In [22]:

# Train on a single GPU for 10 epochs, using at most 100 training batches per
# epoch. `accelerator="gpu", devices=1` is the supported spelling of the
# deprecated `gpus=1` flag, which was removed in Lightning 2.0.
trainer = pl.Trainer(
    limit_train_batches=100,
    max_epochs=10,
    accelerator="gpu",
    devices=1,
)
# Fit the model, running validation on val_loader alongside training.
trainer.fit(mymodel, train_loader, val_loader)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name | Type   | Params
--------------------------------
0 | bert | BERT   | 2.0 M 
1 | out  | Linear | 432 K 
--------------------------------
2.5 M     Trainable params
0         Non-trainable params
2.5 M     Total params
9.875     Total estimated model params size (MB)


Epoch 3:  25%|██▌       | 4/16 [00:00<00:00, 16.24it/s, loss=6.78, v_num=4] 