In [1]:
import cornac
import numpy as np
import pandas as pd
import math, sys
from dset import RecsysData, SequenceDataset
from negative_sampler import NegativeSampler
import random
from model import BERTModel
import pytorch_lightning as pl
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the MovieLens feedback records through cornac. The 'UIRT' format yields
# four fields per record, which the following cells read as user id, item id,
# rating and timestamp.
ml_100k = cornac.datasets.movielens.load_feedback(fmt='UIRT')

In [3]:
# Stack the feedback records into a 2-D array so each field can be sliced as a
# column. The id columns print as quoted strings, and the rating/timestamp
# columns are cast back to numbers before being shown.
np_data = np.array(ml_100k)

# One print call, one column per line: user ids, item ids, ratings, timestamps.
print(
    np_data[:, 0],
    np_data[:, 1],
    np_data[:, 2].astype(float),
    np_data[:, 3].astype(int),
    sep="\n",
)

['196' '186' '22' ... '276' '13' '12']
['242' '302' '377' ... '1090' '225' '203']
[3. 3. 1. ... 1. 2. 3.]
[881250949 891717742 878887116 ... 874795795 882399156 879959583]


In [4]:
# Assemble the interaction table column by column. Ids are kept exactly as
# sliced from the array, while rating and timestamp are converted to numeric
# dtypes.
pd_data = pd.DataFrame(
    dict(
        user_id=np_data[:, 0],
        item_id=np_data[:, 1],
        rating=np_data[:, 2].astype(float),
        timestamp=np_data[:, 3].astype(int),
    )
)

In [5]:
# Summarise column dtypes and non-null counts to check that the numeric casts
# applied when building the frame took effect.
pd_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   user_id    100000 non-null  object 
 1   item_id    100000 non-null  object 
 2   rating     100000 non-null  float64
 3   timestamp  100000 non-null  int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 3.1+ MB


In [7]:
# Wrap the interaction frame in the project's RecsysData container. Running it
# prints "Splitting" and a per-user progress bar (see the cell output), so the
# train/val/test sequences used below are presumably produced here -- confirm
# in dset.py.
myData = RecsysData(pd_data)
# Display the item count; it is reused later for the mask token id and the
# model's `num_items`.
myData.num_items

Splitting


100%|██████████| 943/943 [00:00<00:00, 4666.04it/s]


1682

In [None]:
# Settings shared by the datasets, the negative sampler and the model.
max_len = 100
seed = 1234
# The mask token id sits one past `num_items`.
mask_token = myData.num_items + 1

# Training dataset over the train sequences, with masking controlled by
# `mask_prob` and a dedicated, seeded RNG.
trainset = SequenceDataset(
    u2seq=myData.train_seqs,
    num_items=myData.num_items,
    max_len=max_len,
    mask_prob=0.15,
    mask_token=mask_token,
    rng=random.Random(seed),
)

# Negative samples used at evaluation time. `sample_size=99` is presumably the
# number of negatives drawn per user -- confirm in negative_sampler.py.
test_negative_sampler = NegativeSampler(
    train=myData.train_seqs,
    val=myData.val_seqs,
    test=myData.test_seqs,
    user_count=myData.num_users,
    item_count=myData.num_items,
    sample_size=99,
    seed=seed,
    save_folder="./Data/",
)
test_negative_samples = test_negative_sampler.get_negative_samples()

# Validation dataset: the train sequences are the input history, the val
# sequences are the answers, and the sampled negatives are passed alongside.
valset = SequenceDataset(
    u2seq=myData.train_seqs,
    u2answer=myData.val_seqs,
    negative_samples=test_negative_samples,
    max_len=max_len,
    mask_token=mask_token,
    eval=True,
)

# Model configuration.
mymodel = BERTModel(
    num_items=myData.num_items,  # number of items
    max_len=max_len,
    hidden_size=256,
    n_layers=2,
    heads=8,
    dropout=0,
)

In [None]:
# Options common to both loaders; only the dataset and the shuffling differ.
loader_options = dict(batch_size=128, pin_memory=True)

# Training batches are reshuffled; validation batches keep the dataset order.
train_loader = DataLoader(trainset, shuffle=True, **loader_options)
val_loader = DataLoader(valset, shuffle=False, **loader_options)

In [None]:

# Train on a single GPU. `accelerator="gpu", devices=1` replaces the `gpus=1`
# argument, which was deprecated in PyTorch Lightning 1.7 and removed in 2.0
# (where passing it raises a TypeError).
# Each epoch is capped at 100 training batches and training stops after
# 10 epochs.
trainer = pl.Trainer(
    limit_train_batches=100,
    max_epochs=10,
    accelerator="gpu",
    devices=1,
)
trainer.fit(mymodel, train_loader, val_loader)