In [1]:
# 序列推荐模型
# GRU4Rec、Caser（基于CNN）、DIN、DIEN
# 基于transformer: SASRec、BSTransformer

In [1]:
import copy
# 加载数据集2成序列数据集，评分[0,1,2]为负反馈，评分[3,4,5]为正反馈，只保留正样本，构造简单序列推荐数据集
# 数据集：ml-100k

import os, random
import numpy as np
import pandas as pd
random.seed(100)

# 加载数据: >=3分为正，用户评分次数不低于50，只保留最后50个，拆分为40: 5 + 15负例 (随机采样): 5 + 15负例 (随机采样)
ratings = np.array([[int(x) for x in line.strip().split('\t')[:4]] for line in open('./data/ml-100k/ua.base','r').read().strip().split('\n')], dtype=np.int32)
ratings_pd = pd.DataFrame({feature_name: list(feature_data) for feature_name, feature_data in zip(['user_id','item_id','rating','timestamp'], ratings.T)})
pos_ratings_pd = ratings_pd[ratings_pd['rating']>2.9][['user_id','item_id','timestamp']].dropna().sort_values('timestamp') # 已经排序了
pos_ratings_pd = pos_ratings_pd.groupby('user_id').filter(lambda x: x['user_id'].count()>=50)
userid2id = {user_id: i for i, user_id in enumerate(sorted(list(set(pos_ratings_pd['user_id'].tolist()))))}
itemid2id = {item_id: i for i, item_id in enumerate(sorted(list(set(pos_ratings_pd['item_id'].tolist()))))}
print(len(userid2id), len(itemid2id))
del ratings, ratings_pd

# new id
user_train_validate_test = {}
for user,item,t in pos_ratings_pd.values:
    u, i = userid2id[user], itemid2id[item]
    if u not in user_train_validate_test:
        user_train_validate_test[u] = [i]
    else:
        user_train_validate_test[u].append(i)
    user_train_validate_test[u] = user_train_validate_test[u][-50:]
train_seq_len = 40
pos_num = 5
neg_sample_num = 15
def sample(low, high, notinset, num):
    nums = set([])
    n = num
    while n>0:
        id = random.randint(low, high)
        if id not in notinset and id not in nums:
            nums.add(id)
            n -= 1
    return list(nums)
data = np.zeros((len(user_train_validate_test), 81), dtype=np.int32)
i = 0
for user, train_validate_test in user_train_validate_test.items():
    train, validate, test = train_validate_test[:train_seq_len], train_validate_test[-pos_num*2:-pos_num], train_validate_test[-pos_num:]
    data[i, 0] = user
    data[i,1:train_seq_len+1] = np.array(train)
    samples = sample(0, len(itemid2id)-1, set(train_validate_test), neg_sample_num * 2)
    data[i,1+train_seq_len : 1+train_seq_len+pos_num+neg_sample_num] = np.array(validate + samples[:neg_sample_num])
    data[i,1+train_seq_len+pos_num+neg_sample_num : ] = np.array(test + samples[neg_sample_num:])
    i += 1
del user_train_validate_test
print(data.shape)
print(data[:2,:])

# 继续加载info特征信息，内容特征
occupation_dict = {'administrator':0, 'artist':1, 'doctor':2, 'educator':3, 'engineer':4, 'entertainment':5, 'executive':6, 'healthcare':7, 'homemaker':8, 'lawyer':9, 'librarian':10, 'marketing':11, 'none':12, 'other':13, 'programmer':14, 'retired':15, 'salesman':16, 'scientist':17, 'student':18, 'technician':19, 'writer':20}
gender_dict={'M':1,'F':0}
user_info = {}
for line in open('./data/ml-100k/u.user','r', encoding='utf-8').read().strip().split('\n'):
    phs = line.strip().split('|')
    if int(phs[0]) not in userid2id:
        continue
    uid = userid2id[int(phs[0])]
    user_info[uid] = [gender_dict[phs[2]], occupation_dict[phs[3]]] # int(phs[1]) 为了方便，不要连续型特征
user_num_features = 2
item_info = {}
for line in open('./data/ml-100k/u.item','r', encoding='ISO-8859-1').read().strip().split('\n'):
    phs = line.strip().split('|')
    if int(phs[0]) not in itemid2id:
        continue
    iid = itemid2id[int(phs[0])]
    item_info[iid] = phs[5:]
item_num_features = 19
num_users = len(user_info)
num_items = len(item_info)
num_features = 21



446 1548
(446, 81)
[[ 397  303  260  306  312  744  257  285  338  270  682  862  328 1543
   344  881  326  298  867  265  673 1491  301  337  353  261  300 1260
  1238  302  325  334  331  351  347 1090  683  901  897  272  345   49
   585  309  521  126  386 1282 1038  539  288  417  418  931 1444  804
   164  933  941 1326  686 1001  316  324 1128  900 1091 1349  710  716
   470 1499  225   98 1508  357  365  757  887  248  633]
 [ 344  773  365  397  445   48  374   62  431  981  231  780  384  109
    39  775 1021  929 1022  393   93  399   89  386  569  715   66 1059
   748  414  459  418  139  832  495  831 1413  413  398  396   77  784
   717  786   50  142  768  259 1159  394   11  917  793 1306 1307  810
   302  944 1328  564   53  293  141  212  778  645  825  839  328  332
  1234 1363  342 1247  485  107 1134 1137  369 1277  254]]


In [17]:
# GRU4Rec: 只用行为特征
# user_embedding = GRU(item_embedding_seq)
# y = user_embedding * item_embedding.T
# 数据集：ml-100k


import torch
from torch import nn
from torch.nn import Module, CrossEntropyLoss, Sequential, Linear, Sigmoid
from torch.utils.data import Dataset, DataLoader, TensorDataset 
from sklearn.model_selection import train_test_split
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
device = torch.device("cuda:0" if torch.cuda.is_available() else ('mps:0' if torch.backends.mps.is_available() else "cpu"))
batch_size = 100
num_epochs = 10
dim=100

train_loader = DataLoader(dataset=TensorDataset(torch.from_numpy(data[:,1: 1+ train_seq_len]).long(), torch.from_numpy(data[:,1+ train_seq_len:-(pos_num+neg_sample_num)]).long()), batch_size=batch_size, shuffle=True, pin_memory=True)
test_loader = DataLoader(dataset=TensorDataset(torch.from_numpy(data[:,1: 1+ train_seq_len + pos_num]).long(), torch.from_numpy(data[:,-(pos_num+neg_sample_num) : ]).long()), batch_size=batch_size, shuffle=False, pin_memory=True)

class GRU4Rec(nn.Module):
    def __init__(self, num_items, embedding_dim, gru_num_layers=1):
        super(GRU4Rec, self).__init__()
        self.num_items = num_items
        self.embedding_dim, self.gru_num_layers = embedding_dim, gru_num_layers
        self.item_embeddings = nn.Embedding(num_items, self.embedding_dim, padding_idx=-1)
        torch.nn.init.kaiming_normal_(self.item_embeddings.weight.data)
        self.gru = nn.GRU(input_size=self.embedding_dim, hidden_size=self.embedding_dim, num_layers=self.gru_num_layers, batch_first=True)
    # [batch, seq_len], [batch, label_len]
    def forward(self, item_seqs: torch.Tensor, test: torch.Tensor):
        batch_len = item_seqs.shape[0]
        # [batch, seq_len, dim]
        item_seqs_embeddings = self.item_embeddings(item_seqs)
        # [batch, label_len, dim]
        test_embeddings = self.item_embeddings(test)
        # gru输出最后的隐层输出当为user embedding
        _, user_emb = self.gru(item_seqs_embeddings)
        # [batch, dim * gru_num_layers]
        user_emb = user_emb.reshape((batch_len, self.gru_num_layers * self.embedding_dim))
        # predict
        scores = torch.sigmoid(torch.bmm(test_embeddings.repeat([1,1,self.gru_num_layers]), user_emb.unsqueeze(-1)).squeeze())
        return scores
model = GRU4Rec(num_items = len(itemid2id), embedding_dim = dim).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0003)
criterion = CrossEntropyLoss(reduction='sum').to(device)
label = torch.FloatTensor([1 for i in range(pos_num)] + [0 for i in range(neg_sample_num)]).to(device)

def DCG(batch_labels):
    dcgsum = np.zeros((batch_labels.shape[0]))
    for i in range(batch_labels.shape[-1]):
        dcg = (2 ** batch_labels[:,i] - 1) / np.math.log(i + 2, 2)
        dcgsum += dcg
    return dcgsum
def NDCG(output, labels):
    # ideal_dcg
    ideal_dcg = DCG(labels)
    # this
    dcg = DCG((np.argsort( - output, axis=-1)<pos_num).astype(np.float32))
    return np.sum(dcg/ideal_dcg)

for epoch in range(num_epochs):
    # train:
    epoch_train_losses = []
    model.train()
    for i, inputs in enumerate(train_loader):
        optimizer.zero_grad()
        item_seqs = inputs[0].to(device)
        test = inputs[1].to(device)
        output = model(item_seqs, test)
        labels = label.unsqueeze(0).repeat([item_seqs.shape[0],1])
        loss = criterion(output, labels)
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=1, norm_type=2)
        optimizer.step()
        epoch_train_losses.append([item_seqs.shape[0], loss.item(), NDCG(output.detach().numpy(), labels.detach().numpy())])
    # validate:
    model.eval()
    epoch_test_losses = []
    for i, inputs in enumerate(test_loader):
        item_seqs = inputs[0].to(device)
        test = inputs[1].to(device)
        output = model(item_seqs, test)
        labels = label.unsqueeze(0).repeat([item_seqs.shape[0],1])
        loss = criterion(output, labels)
        epoch_test_losses.append([item_seqs.shape[0], loss.item(), NDCG(output.detach().numpy(), labels.detach().numpy())])
    train_loss = sum([x[1] for x in epoch_train_losses])/sum([x[0] * (pos_num + neg_sample_num) for x in epoch_train_losses])
    test_loss  = sum([x[1] for x in epoch_test_losses])/sum([x[0] * (pos_num + neg_sample_num) for x in epoch_test_losses])
    train_ndcg = sum([x[2] for x in epoch_train_losses])/sum([x[0] for x in epoch_train_losses])
    test_ndcg  = sum([x[2] for x in epoch_test_losses])/sum([x[0] for x in epoch_test_losses])
    # print
    print('['+datetime.now().strftime("%Y-%m-%d %H:%M:%S")+']', 'epoch=[{}/{}], train_ce_loss: {:.4f}, train_ndcg: {:.4f}, validate_ce_loss: {:.4f}, validate_ndcg: {:.4f}'.format(epoch+1, num_epochs,  train_loss, train_ndcg, test_loss, test_ndcg))


[2023-09-01 16:22:16] epoch=[1/10], train_ce_loss: 0.7405, train_ndcg: 0.6901, validate_ce_loss: 0.7556, validate_ndcg: 0.6248
[2023-09-01 16:22:20] epoch=[2/10], train_ce_loss: 0.6774, train_ndcg: 0.8633, validate_ce_loss: 0.7632, validate_ndcg: 0.6208
[2023-09-01 16:22:23] epoch=[3/10], train_ce_loss: 0.6616, train_ndcg: 0.8945, validate_ce_loss: 0.7653, validate_ndcg: 0.6203
[2023-09-01 16:22:27] epoch=[4/10], train_ce_loss: 0.6592, train_ndcg: 0.9063, validate_ce_loss: 0.7659, validate_ndcg: 0.6252
[2023-09-01 16:22:30] epoch=[5/10], train_ce_loss: 0.6581, train_ndcg: 0.9107, validate_ce_loss: 0.7662, validate_ndcg: 0.6275
[2023-09-01 16:22:34] epoch=[6/10], train_ce_loss: 0.6568, train_ndcg: 0.9145, validate_ce_loss: 0.7664, validate_ndcg: 0.6273
[2023-09-01 16:22:38] epoch=[7/10], train_ce_loss: 0.6549, train_ndcg: 0.9156, validate_ce_loss: 0.7665, validate_ndcg: 0.6279
[2023-09-01 16:22:41] epoch=[8/10], train_ce_loss: 0.6524, train_ndcg: 0.9182, validate_ce_loss: 0.7673, valida

In [2]:
#  Caser： 
#  y = dnn（用户行为序列的水平卷积 + 垂直卷积)
# Convolutional Sequence Embedding Recommendation Model (Caser)
# 数据集：ml-100k

import torch
from torch import nn
from torch.nn import Module, CrossEntropyLoss, Sequential, Linear, Sigmoid
from torch.utils.data import Dataset, DataLoader, TensorDataset 
from sklearn.model_selection import train_test_split
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
device = torch.device("cuda:0" if torch.cuda.is_available() else ('mps:0' if torch.backends.mps.is_available() else "cpu"))
batch_size = 100
num_epochs = 10
dim=100


train_loader = DataLoader(dataset=TensorDataset(torch.from_numpy(data[:,: 1+ train_seq_len]).long(), torch.from_numpy(data[:,1+ train_seq_len:-(pos_num+neg_sample_num)]).long()), batch_size=batch_size, shuffle=True, pin_memory=True)
test_loader = DataLoader(dataset=TensorDataset(torch.from_numpy(data[:,: 1+ train_seq_len]).long(), torch.from_numpy(data[:,-(pos_num+neg_sample_num) : ]).long()), batch_size=batch_size, shuffle=False, pin_memory=True) # 这里图方便

class Caser(nn.Module):
    def __init__(self, num_users, num_items, seq_len, v_out_chinnel, h_out_chinnel, embedding_dim, mpl_conv_layers=[16], final_mlp_layers=[32]):
        super(Caser, self).__init__()
        self.num_users, self.num_items = num_users, num_items
        self.seq_len, self.v_out_chinnel, self.h_out_chinnel = seq_len, v_out_chinnel, h_out_chinnel
        self.embedding_dim, self.mpl_conv_layers = embedding_dim, mpl_conv_layers
        self.user_embeddings = nn.Embedding(num_users, self.embedding_dim, padding_idx=-1)
        torch.nn.init.kaiming_normal_(self.user_embeddings.weight.data)
        self.item_embeddings = nn.Embedding(num_items, self.embedding_dim, padding_idx=-1)
        torch.nn.init.kaiming_normal_(self.item_embeddings.weight.data)
        # 卷积部分
        # L * d
        # vertical conv layer：垂直卷积
        self.conv_v = nn.Conv2d(1, v_out_chinnel, (seq_len,1)) # v_kernel (L*1)
        self.conv_v_out_dim = v_out_chinnel * embedding_dim
        # horizontal conv layer：水平卷积 (h变化的, d)
        self.conv_h = nn.ModuleList([nn.Conv2d(1, h_out_chinnel, (i+1, embedding_dim)) for i in range(seq_len)]) # 还需要max pool在embedding_dim
        self.conv_h_out_dim = h_out_chinnel * seq_len
        # mlp部分
        self.mpl_conv = nn.Sequential(nn.Linear(self.conv_v_out_dim + self.conv_h_out_dim, mpl_conv_layers[0]), nn.ReLU())
        if len(mpl_conv_layers)>1:
            for i, layer_dim in enumerate(mpl_conv_layers[1:]):
                self.attention_net.append(nn.Linear(mpl_conv_layers[i], layer_dim))
                self.attention_net.append(nn.ReLU())
        # final mlp
        self.final_mlp = nn.Sequential(nn.Linear(mpl_conv_layers[-1] + embedding_dim, final_mlp_layers[0]), nn.ReLU())
        if len(final_mlp_layers)>1:
            for i, layer_dim in enumerate(final_mlp_layers[1:]):
                self.attention_net.append(nn.Linear(final_mlp_layers[i], layer_dim))
                self.attention_net.append(nn.ReLU())
        self.final_mlp.append(nn.Linear(final_mlp_layers[-1], embedding_dim))
    def forward(self, users_item_seqs: torch.Tensor, test: torch.Tensor):
        batch_len = users_item_seqs.shape[0]
        user_emb = self.user_embeddings(users_item_seqs[:,0])
        # [batch, seq_len, dim]
        item_seqs_embeddings = self.item_embeddings(users_item_seqs[:,1:])
        # [batch, label_len, dim]
        test_embeddings = self.item_embeddings(test)
        
        # Convolutional Layers
        out, out_h, out_v = None, None, None
        # vertical conv layer
        out_v = self.conv_v(item_seqs_embeddings.unsqueeze(1)).squeeze()
        out_v = out_v.reshape((batch_len, -1))
        # horizontal conv layer
        out_hs = []
        for conv in self.conv_h:
            conv_out = conv(item_seqs_embeddings.unsqueeze(1)).squeeze(-1)
            pool_out = torch.max_pool1d(conv_out, conv_out.size(2)).squeeze(-1)
            out_hs.append(pool_out)
        out_h = torch.cat(out_hs, dim=1)
        out = torch.cat([out_v, out_h], dim=1)
        # fully-connected layer
        z = self.mpl_conv(out)
        x = torch.cat([z, user_emb], 1) #z is combined by seq item embs and user emb
        t = self.final_mlp(x)
        # final
        seq_len = test_embeddings.shape[1]
        y = torch.sigmoid(torch.sum(t.unsqueeze(1).repeat((1,seq_len,1)) * test_embeddings, dim=-1)).squeeze()
        return y
model = Caser(num_users=num_users, num_items=num_items, seq_len=train_seq_len, v_out_chinnel=4, h_out_chinnel=16, embedding_dim=dim, mpl_conv_layers=[16], final_mlp_layers=[32]).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0003)
criterion = CrossEntropyLoss(reduction='sum').to(device)
label = torch.FloatTensor([1 for i in range(pos_num)] + [0 for i in range(neg_sample_num)]).to(device)

def DCG(batch_labels):
    dcgsum = np.zeros((batch_labels.shape[0]))
    for i in range(batch_labels.shape[-1]):
        dcg = (2 ** batch_labels[:,i] - 1) / np.math.log(i + 2, 2)
        dcgsum += dcg
    return dcgsum
def NDCG(output, labels):
    # ideal_dcg
    ideal_dcg = DCG(labels)
    # this
    dcg = DCG((np.argsort( - output, axis=-1)<pos_num).astype(np.float32))
    return np.sum(dcg/ideal_dcg)

for epoch in range(num_epochs):
    # train:
    epoch_train_losses = []
    model.train()
    for i, inputs in enumerate(train_loader):
        optimizer.zero_grad()
        item_seqs = inputs[0].to(device)
        test = inputs[1].to(device)
        output = model(item_seqs, test)
        labels = label.unsqueeze(0).repeat([item_seqs.shape[0],1])
        loss = criterion(output, labels)
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=1, norm_type=2)
        optimizer.step()
        epoch_train_losses.append([item_seqs.shape[0], loss.item(), NDCG(output.detach().numpy(), labels.detach().numpy())])
    # validate:
    model.eval()
    epoch_test_losses = []
    for i, inputs in enumerate(test_loader):
        item_seqs = inputs[0].to(device)
        test = inputs[1].to(device)
        output = model(item_seqs, test)
        labels = label.unsqueeze(0).repeat([item_seqs.shape[0],1])
        loss = criterion(output, labels)
        epoch_test_losses.append([item_seqs.shape[0], loss.item(), NDCG(output.detach().numpy(), labels.detach().numpy())])
    train_loss = sum([x[1] for x in epoch_train_losses])/sum([x[0] * (pos_num + neg_sample_num) for x in epoch_train_losses])
    test_loss  = sum([x[1] for x in epoch_test_losses])/sum([x[0] * (pos_num + neg_sample_num) for x in epoch_test_losses])
    train_ndcg = sum([x[2] for x in epoch_train_losses])/sum([x[0] for x in epoch_train_losses])
    test_ndcg  = sum([x[2] for x in epoch_test_losses])/sum([x[0] for x in epoch_test_losses])
    # print
    print('['+datetime.now().strftime("%Y-%m-%d %H:%M:%S")+']', 'epoch=[{}/{}], train_ce_loss: {:.4f}, train_ndcg: {:.4f}, validate_ce_loss: {:.4f}, validate_ndcg: {:.4f}'.format(epoch+1, num_epochs,  train_loss, train_ndcg, test_loss, test_ndcg))

[2023-09-04 11:08:01] epoch=[1/10], train_ce_loss: 0.7375, train_ndcg: 0.6850, validate_ce_loss: 0.7630, validate_ndcg: 0.6297
[2023-09-04 11:08:02] epoch=[2/10], train_ce_loss: 0.6722, train_ndcg: 0.8702, validate_ce_loss: 0.7703, validate_ndcg: 0.6816
[2023-09-04 11:08:04] epoch=[3/10], train_ce_loss: 0.6641, train_ndcg: 0.8949, validate_ce_loss: 0.7714, validate_ndcg: 0.6793
[2023-09-04 11:08:07] epoch=[4/10], train_ce_loss: 0.6638, train_ndcg: 0.8901, validate_ce_loss: 0.7710, validate_ndcg: 0.6796
[2023-09-04 11:08:09] epoch=[5/10], train_ce_loss: 0.6635, train_ndcg: 0.8845, validate_ce_loss: 0.7705, validate_ndcg: 0.6776
[2023-09-04 11:08:12] epoch=[6/10], train_ce_loss: 0.6636, train_ndcg: 0.8813, validate_ce_loss: 0.7709, validate_ndcg: 0.6782
[2023-09-04 11:08:14] epoch=[7/10], train_ce_loss: 0.6636, train_ndcg: 0.8810, validate_ce_loss: 0.7698, validate_ndcg: 0.6809
[2023-09-04 11:08:16] epoch=[8/10], train_ce_loss: 0.6636, train_ndcg: 0.8809, validate_ce_loss: 0.7700, valida

In [10]:
# DIN： Deep Interest Network
# 阿里妈妈：使用内容特征和基于内容特征之上的行为序列。
# y = dnn(user内容特征+行为item序列特征（attention sum）+candidate item内容特征)
# 我这里实现没有用物品id的行为嵌入，所以效果不一定好。
# 数据集：ml-100k，这里没有考虑连续型特征，故特征总数为2+19

import torch
from torch import nn
from torch.nn import Module, CrossEntropyLoss, Sequential, Linear, Sigmoid
from torch.utils.data import Dataset, DataLoader, TensorDataset 
from sklearn.model_selection import train_test_split
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
device = torch.device("cuda:0" if torch.cuda.is_available() else ('mps:0' if torch.backends.mps.is_available() else "cpu"))
batch_size = 100
num_epochs = 10
dim=50


user_feature_vals = {}
for i in range(user_num_features):
    user_feature_vals[i] = sorted(list(set([val[i] for val in user_info.values()])))
    for user, info in user_info.items():
        user_info[user][i] = user_feature_vals[i].index(info[i])
item_feature_vals = {}
for i in range(item_num_features):
    item_feature_vals[i] = sorted(list(set([val[i] for val in item_info.values()])))
    for item, info in item_info.items():
        item_info[item][i] = item_feature_vals[i].index(info[i])

user_profile_data = np.array([user_info[u] for u in data[:,0]]) # [data_len, ufeature]
item_seq_profile_data = np.array([[item_info[item] for item in item_seq] for item_seq in data[:,1:]]) # [data_len, seq_len, ufeature]

train_loader = DataLoader(dataset=TensorDataset(torch.from_numpy(user_profile_data).long(), 
                                                torch.from_numpy(item_seq_profile_data[:,:train_seq_len,:]).long(),
                                                torch.from_numpy(item_seq_profile_data[:,train_seq_len:(train_seq_len + pos_num + neg_sample_num),:]).long()
                                                ), batch_size=batch_size, shuffle=True, pin_memory=True)
test_loader = DataLoader(dataset=TensorDataset(torch.from_numpy(user_profile_data).long(), 
                                                torch.from_numpy(item_seq_profile_data[:,:train_seq_len + pos_num,:]).long(),
                                                torch.from_numpy(item_seq_profile_data[:,-(pos_num + neg_sample_num):,:]).long()
                                               ), batch_size=batch_size, shuffle=False, pin_memory=True)


class DIN(nn.Module):
    def __init__(self, user_profile_feature: [tuple], item_profile_feature: [tuple], profile_feature_embedding_dim: int, 
                 dnn_layer_dims: list[int], attention_layer_dims: list[int]):
        super(DIN, self).__init__()
        # 内容特征
        self.user_profile_feature, self.item_profile_feature, self.profile_feature_embedding_dim = user_profile_feature, item_profile_feature, profile_feature_embedding_dim
        self.user_profile_embed = nn.ModuleDict({'user_embed_' + str(i): nn.Embedding(num_embeddings=valcount, embedding_dim=profile_feature_embedding_dim) for i, valcount in user_profile_feature})
        self.item_profile_embed = nn.ModuleDict({'item_embed_' + str(i): nn.Embedding(num_embeddings=valcount, embedding_dim=profile_feature_embedding_dim) for i, valcount in item_profile_feature})
        self.user_profile_all_embed_dim = profile_feature_embedding_dim * len(user_profile_feature)
        self.item_profile_all_embed_dim = profile_feature_embedding_dim * len(item_profile_feature)
        # 注意力 attention net：基于行为特征
        self.dnn_layer_dims, self.attention_layer_dims = dnn_layer_dims, attention_layer_dims
        self.attention_input_dim = len(user_profile_feature) * profile_feature_embedding_dim + len(item_profile_feature) * profile_feature_embedding_dim + len(user_profile_feature) * len(item_profile_feature)
        self.attention_net = nn.Sequential(nn.Linear(self.attention_input_dim, attention_layer_dims[0]))
        if len(attention_layer_dims)>1:
            for i, layer_dim in enumerate(attention_layer_dims[1:]):
                self.attention_net.append(nn.Linear(attention_layer_dims[i], layer_dim))
                self.attention_net.append(nn.ReLU())
        self.attention_net.append(nn.Linear(attention_layer_dims[-1], 1))
        self.attention_net.append(nn.Softmax(dim=-2))
        # final dnn
        self.all_embedding_dim = len(self.item_profile_feature) * self.profile_feature_embedding_dim * 2
        self.final_dnn_network = nn.Sequential(nn.Linear(self.all_embedding_dim, dnn_layer_dims[0]), nn.ReLU())
        if len(dnn_layer_dims) > 1:
            for i, layer_dim in enumerate(dnn_layer_dims[1:]):
                self.final_dnn_network.append(nn.Linear(dnn_layer_dims[i], layer_dim))
                self.final_dnn_network.append(nn.ReLU())
        self.final_dnn_network.append(nn.Linear(dnn_layer_dims[-1], 1))
        self.final_dnn_network.append(nn.Sigmoid())
    # torch.Tensor([batch, feature]),   torch.Tensor([batch, seq_len, feature]),   torch.Tensor([batch, seq_len, feature])
    def forward(self, user_profiles, item_history_list_profile, item_future_list_profile):
        batch_len = user_profiles.shape[0]
        # user profile: [batch, feature * embed_dim]
        user_profile_embeddings = torch.cat([self.user_profile_embed['user_embed_' + str(i)](user_profiles[:,i].long()) for i in range(user_profiles.shape[-1])], axis=-1)
        user_profile_embeddings = user_profile_embeddings.reshape((batch_len, len(self.user_profile_feature), self.profile_feature_embedding_dim)) # [batch, feature, embed_dim]
        # item_history_list_profile: torch.Tensor([batch, seq_len, feature * embed_dim])
        seq_len = item_history_list_profile.shape[1]
        item_history_list_profile_embeddings = torch.cat([self.item_profile_embed['item_embed_' + str(i)](item_history_list_profile[:,:,i].long()) for i in range(item_history_list_profile.shape[-1])], axis=-1)
        item_history_list_profile_embeddings = item_history_list_profile_embeddings.reshape((batch_len, seq_len, len(self.item_profile_feature), self.profile_feature_embedding_dim)) # [batch, seq_len, feature, embed_dim]
        # # attention
        a = user_profile_embeddings.unsqueeze(1).repeat((1,seq_len,1,1)).reshape((batch_len * seq_len, len(self.user_profile_feature), self.profile_feature_embedding_dim))
        b = item_history_list_profile_embeddings.reshape((batch_len * seq_len, len(self.item_profile_feature), self.profile_feature_embedding_dim))
        ab = torch.bmm(a, b.permute(0,2,1)).reshape((batch_len, seq_len, len(self.user_profile_feature) * len(self.item_profile_feature)))
        a_ = a.reshape((batch_len, seq_len, len(self.user_profile_feature) * self.profile_feature_embedding_dim))
        b_ = b.reshape((batch_len, seq_len, len(self.item_profile_feature) * self.profile_feature_embedding_dim))
        # print(a_.shape, b_.shape, ab.shape) # torch.Size([100, 40, 100]) torch.Size([100, 40, 950]) torch.Size([100, 40, 38])
        in_attention = torch.cat([a_, b_, ab], dim=-1) # [batch, seq_len, feature, 3 * embed_dim]
        # [batch, seq_len, 1] * [batch, seq_len, feature * embed_dim]
        out_attention = torch.sum(self.attention_net(in_attention) * item_history_list_profile_embeddings.reshape((batch_len, seq_len, -1)), dim=1) # [batch, feature * embed_dim]

        # # 以上处理user profile和行为历史，下面进行与candidate组合预测， item_future_list 和 item_future_list_profile
        seq_len_ = item_future_list_profile.shape[1]
        item_future_list_profile_embeddings = torch.cat([self.item_profile_embed['item_embed_' + str(i)](item_future_list_profile[:,:,i].long()) for i in range(item_future_list_profile.shape[-1])], axis=-1)
        # print(item_future_list_profile_embeddings.shape, batch_len, seq_len_, len(self.item_profile_feature) * self.profile_feature_embedding_dim) # torch.Size([100, 20, 950]) 100 20 950
        item_future_list_profile_embeddings = item_future_list_profile_embeddings.reshape((batch_len, seq_len_, len(self.item_profile_feature) * self.profile_feature_embedding_dim)) # [batch, seq_len, feature * embed_dim]
        
        x = torch.cat([out_attention.unsqueeze(1).repeat((1,seq_len_,1)), item_future_list_profile_embeddings], dim=-1)
        output = self.final_dnn_network(x).squeeze()
        return output
model = DIN(user_profile_feature = [(i,len(list_)) for i, list_ in user_feature_vals.items()], item_profile_feature= [(i,len(list_)) for i, list_ in item_feature_vals.items()], 
            profile_feature_embedding_dim = dim, dnn_layer_dims = [16], attention_layer_dims=[16]).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0003)
criterion = CrossEntropyLoss(reduction='sum').to(device)
label = torch.FloatTensor([1 for i in range(pos_num)] + [0 for i in range(neg_sample_num)]).to(device)

def DCG(batch_labels):
    dcgsum = np.zeros((batch_labels.shape[0]))
    for i in range(batch_labels.shape[-1]):
        dcg = (2 ** batch_labels[:,i] - 1) / np.math.log(i + 2, 2)
        dcgsum += dcg
    return dcgsum
def NDCG(output, labels):
    # ideal_dcg
    ideal_dcg = DCG(labels)
    # this
    dcg = DCG((np.argsort( - output, axis=-1)<pos_num).astype(np.float32))
    return np.sum(dcg/ideal_dcg)

for epoch in range(num_epochs):
    # train:
    epoch_train_losses = []
    model.train()
    for i, inputs in enumerate(train_loader):
        optimizer.zero_grad()
        user_profiles, item_history_list_profile, item_future_list_profile = inputs
        batch_len = user_profiles.shape[0]
        # print(item_history_list_profile.shape, item_future_list_profile.shape)
        user_profiles = user_profiles.to(device)
        item_history_list_profile = item_history_list_profile.to(device)
        item_future_list_profile = item_future_list_profile.to(device)
        output = model(user_profiles, item_history_list_profile, item_future_list_profile)
        labels = label.unsqueeze(0).repeat([batch_len,1])
        loss = criterion(output, labels)
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=1, norm_type=2)
        optimizer.step()
        epoch_train_losses.append([batch_len, loss.item(), NDCG(output.cpu().detach().numpy(), labels.cpu().detach().numpy())])
    # validate:
    model.eval()
    epoch_test_losses = []
    for i, inputs in enumerate(test_loader):
        user_profiles, item_history_list_profile, item_future_list_profile = inputs
        batch_len = user_profiles.shape[0]
        user_profiles = user_profiles.to(device)
        item_history_list_profile = item_history_list_profile.to(device)
        item_future_list_profile = item_future_list_profile.to(device)
        output = model(user_profiles, item_history_list_profile, item_future_list_profile)
        labels = label.unsqueeze(0).repeat([batch_len,1])
        loss = criterion(output, labels)
        epoch_test_losses.append([batch_len, loss.item(), NDCG(output.cpu().detach().numpy(), labels.cpu().detach().numpy())])
    train_loss = sum([x[1] for x in epoch_train_losses])/sum([x[0] * (pos_num + neg_sample_num) for x in epoch_train_losses])
    test_loss  = sum([x[1] for x in epoch_test_losses])/sum([x[0] * (pos_num + neg_sample_num) for x in epoch_test_losses])
    train_ndcg = sum([x[2] for x in epoch_train_losses])/sum([x[0] for x in epoch_train_losses])
    test_ndcg  = sum([x[2] for x in epoch_test_losses])/sum([x[0] for x in epoch_test_losses])
    # print
    print('['+datetime.now().strftime("%Y-%m-%d %H:%M:%S")+']', 'epoch=[{}/{}], train_ce_loss: {:.4f}, train_ndcg: {:.4f}, validate_ce_loss: {:.4f}, validate_ndcg: {:.4f}'.format(epoch+1, num_epochs,  train_loss, train_ndcg, test_loss, test_ndcg))
     


[2023-09-03 00:17:34] epoch=[1/10], train_ce_loss: 0.7470, train_ndcg: 0.6494, validate_ce_loss: 0.7428, validate_ndcg: 0.6644
[2023-09-03 00:17:35] epoch=[2/10], train_ce_loss: 0.7412, train_ndcg: 0.6671, validate_ce_loss: 0.7403, validate_ndcg: 0.6676
[2023-09-03 00:17:37] epoch=[3/10], train_ce_loss: 0.7392, train_ndcg: 0.6733, validate_ce_loss: 0.7438, validate_ndcg: 0.6666
[2023-09-03 00:17:38] epoch=[4/10], train_ce_loss: 0.7381, train_ndcg: 0.6805, validate_ce_loss: 0.7396, validate_ndcg: 0.6584
[2023-09-03 00:17:39] epoch=[5/10], train_ce_loss: 0.7376, train_ndcg: 0.6855, validate_ce_loss: 0.7445, validate_ndcg: 0.6578
[2023-09-03 00:17:41] epoch=[6/10], train_ce_loss: 0.7383, train_ndcg: 0.6833, validate_ce_loss: 0.7428, validate_ndcg: 0.6577
[2023-09-03 00:17:42] epoch=[7/10], train_ce_loss: 0.7385, train_ndcg: 0.6853, validate_ce_loss: 0.7452, validate_ndcg: 0.6579
[2023-09-03 00:17:43] epoch=[8/10], train_ce_loss: 0.7397, train_ndcg: 0.6860, validate_ce_loss: 0.7408, valida

In [2]:
# DIEN
# Deep Interest Evolution Network for Click-Through Rate Prediction, 2018
# y = dnn(采用AUGRU来建模行为序列，融合内容特征)
# 简单实现，适应这个数据集
# 数据集：ml-100k

import torch
from torch import nn
from torch.nn import Module, CrossEntropyLoss, Sequential, Linear, Sigmoid
from torch.utils.data import Dataset, DataLoader, TensorDataset 
from sklearn.model_selection import train_test_split
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
device = torch.device("cuda:0" if torch.cuda.is_available() else ('mps:0' if torch.backends.mps.is_available() else "cpu"))
batch_size = 100
num_epochs = 10
dim=50

user_feature_vals = {}
for i in range(user_num_features):
    user_feature_vals[i] = sorted(list(set([val[i] for val in user_info.values()])))
    for user, info in user_info.items():
        user_info[user][i] = user_feature_vals[i].index(info[i])
item_feature_vals = {}
for i in range(item_num_features):
    item_feature_vals[i] = sorted(list(set([val[i] for val in item_info.values()])))
    for item, info in item_info.items():
        item_info[item][i] = item_feature_vals[i].index(info[i])

user_profile_data = np.array([user_info[u] for u in data[:,0]]) # [data_len, ufeature]
item_seq_profile_data = np.array([[item_info[item] for item in item_seq] for item_seq in data[:,1:]]) # [data_len, seq_len, ufeature]

train_loader = DataLoader(dataset=TensorDataset(torch.from_numpy(user_profile_data).long(), 
                                                torch.from_numpy(item_seq_profile_data[:,:train_seq_len,:]).long(),
                                                torch.from_numpy(item_seq_profile_data[:,train_seq_len:(train_seq_len + pos_num + neg_sample_num),:]).long()
                                                ), batch_size=batch_size, shuffle=True, pin_memory=True)
test_loader = DataLoader(dataset=TensorDataset(torch.from_numpy(user_profile_data).long(), 
                                                torch.from_numpy(item_seq_profile_data[:,:train_seq_len + pos_num,:]).long(),
                                                torch.from_numpy(item_seq_profile_data[:,-(pos_num + neg_sample_num):,:]).long()
                                               ), batch_size=batch_size, shuffle=False, pin_memory=True)

class AUGRUCell(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(AUGRUCell, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.linear_ih = nn.Linear(input_dim, 3 * hidden_dim).to(device)
        self.linear_hh = nn.Linear(hidden_dim, 3 * hidden_dim).to(device)
    # [batch, test_len, n_feature * dim], [batch, test_len, dim], [batch, test_len, dim]
    def forward(self, inputs, hx, att_score):
        gi = self.linear_ih(inputs)
        gh = self.linear_hh(hx)
        i_r, i_z, i_n = gi[:,:,:self.hidden_dim], gi[:,:,self.hidden_dim:-self.hidden_dim], gi[:,:,-self.hidden_dim:]
        h_r, h_z, h_n = gh[:,:,:self.hidden_dim], gi[:,:,self.hidden_dim:-self.hidden_dim], gi[:,:,-self.hidden_dim:]
        reset_gate = torch.sigmoid(i_r + h_r)
        update_gate = torch.sigmoid(i_z + h_z)
        new_state = torch.tanh(i_n + reset_gate * h_n)
        update_gate = att_score * update_gate
        hy = (1. - update_gate) * hx + update_gate * new_state
        return hy
    def parameters(self, recurse: bool = True):
        return [para for para in self.linear_hh.parameters()] + [para for para in self.linear_ih.parameters()]
class AUGRU(nn.Module):
    # n_feature * dim, 
    def __init__(self, input_dim, hidden_dim):
        super(AUGRU, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.attention_net = nn.Sequential(nn.Linear(input_dim*2, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, hidden_dim), nn.Sigmoid())
        self.agrucell = AUGRUCell(input_dim, hidden_dim).to(device)
    # , [batch, seq_len, n_feature * dim], [batch, test_len, n_feature * dim]
    # torch.Size([100, 40, 950]) torch.Size([100, 20, 950])
    def forward(self, history, target):
        batch_len, seq_len = history.shape[0], history.shape[1]
        test_len = target.shape[1]
        # # [batch, seq_len, test_len, n_feature * dim]  
        history_ = history.unsqueeze(-2).repeat((1,1,test_len,1))
        target_ = target.unsqueeze(1).repeat((1,seq_len,1,1))
        # # [batch, seq_len, test_len, dim]
        # torch.Size([100, 40, 20, 950]) torch.Size([100, 40, 20, 950]) 50
        attention = self.attention_net(torch.cat([history_, target_], dim=-1))
        h = torch.zeros((batch_len, test_len, self.hidden_dim)).to(device) # h0
        for i in range(seq_len):
            attention_ = attention[:,i,:,:] # [batch, test_len, 1]
            # history_[:,i,:,:]: [batch, test_len, n_feature * dim]
            # h [batch, test_len, dim]
            h = self.agrucell(history_[:,i,:,:], h, attention_)
        # [batch_len, test_len, hidden_dim]
        return h
    def parameters(self, recurse: bool = True):
        return [para for para in self.attention_net.parameters()] + [para for para in self.agrucell.parameters()]
class DIEN(nn.Module):
    def __init__(self, user_profile_feature: [tuple], item_profile_feature: [tuple], profile_feature_embedding_dim: int, hidden_dim: int,
                 dnn_layer_dims: list[int]):
        super(DIEN, self).__init__()
        # 内容特征
        self.user_profile_feature, self.item_profile_feature, self.profile_feature_embedding_dim = user_profile_feature, item_profile_feature, profile_feature_embedding_dim
        self.user_profile_embed = nn.ModuleDict({'user_embed_' + str(i): nn.Embedding(num_embeddings=valcount, embedding_dim=profile_feature_embedding_dim) for i, valcount in user_profile_feature})
        self.item_profile_embed = nn.ModuleDict({'item_embed_' + str(i): nn.Embedding(num_embeddings=valcount, embedding_dim=profile_feature_embedding_dim) for i, valcount in item_profile_feature})
        self.user_profile_all_embed_dim = profile_feature_embedding_dim * len(user_profile_feature)
        self.item_profile_all_embed_dim = profile_feature_embedding_dim * len(item_profile_feature)
        self.dnn_layer_dims, self.hidden_dim = dnn_layer_dims, hidden_dim
        # augru
        self.augru = AUGRU(profile_feature_embedding_dim * len(item_profile_feature), hidden_dim)
        # final dnn
        self.all_embedding_dim = hidden_dim + profile_feature_embedding_dim * len(self.item_profile_feature) + profile_feature_embedding_dim * len(self.user_profile_feature)
        self.final_dnn_network = nn.Sequential(nn.Linear(self.all_embedding_dim, dnn_layer_dims[0]), nn.ReLU())
        if len(dnn_layer_dims) > 1:
            for i, layer_dim in enumerate(dnn_layer_dims[1:]):
                self.final_dnn_network.append(nn.Linear(dnn_layer_dims[i], layer_dim))
                self.final_dnn_network.append(nn.ReLU())
        self.final_dnn_network.append(nn.Linear(dnn_layer_dims[-1], 1))
        self.final_dnn_network.append(nn.Sigmoid())
    # torch.Tensor([batch, feature]),   torch.Tensor([batch, seq_len, feature]),   torch.Tensor([batch, seq_len, feature])
    def forward(self, user_profiles, item_history_list_profile, item_future_list_profile):
        batch_len = user_profiles.shape[0]
        # user profile: [batch, feature * embed_dim]
        user_profile_embeddings = torch.cat([self.user_profile_embed['user_embed_' + str(i)](user_profiles[:,i].long()) for i in range(user_profiles.shape[-1])], axis=-1)
        user_profile_embeddings = user_profile_embeddings.reshape((batch_len, len(self.user_profile_feature) * self.profile_feature_embedding_dim)) # [batch, feature, embed_dim]
        # item_history_list_profile: torch.Tensor([batch, seq_len, feature * embed_dim])
        seq_len = item_history_list_profile.shape[1]
        item_history_list_profile_embeddings = torch.cat([self.item_profile_embed['item_embed_' + str(i)](item_history_list_profile[:,:,i].long()) for i in range(item_history_list_profile.shape[-1])], axis=-1)
        item_history_list_profile_embeddings = item_history_list_profile_embeddings.reshape((batch_len, seq_len, len(self.item_profile_feature) * self.profile_feature_embedding_dim)) # [batch, seq_len, feature, embed_dim]
        # 以上处理user profile和行为历史，下面进行与candidate组合预测， item_future_list 和 item_future_list_profile
        seq_len_ = item_future_list_profile.shape[1]
        item_future_list_profile_embeddings = torch.cat([self.item_profile_embed['item_embed_' + str(i)](item_future_list_profile[:,:,i].long()) for i in range(item_future_list_profile.shape[-1])], axis=-1)
        item_future_list_profile_embeddings = item_future_list_profile_embeddings.reshape((batch_len, seq_len_, len(self.item_profile_feature) * self.profile_feature_embedding_dim)) # [batch, seq_len, feature * embed_dim]
        # [batch_len, test_len, hidden_dim]
        h = self.augru(item_history_list_profile_embeddings, item_future_list_profile_embeddings)
        x = torch.cat([h, user_profile_embeddings.unsqueeze(1).repeat((1,seq_len_,1)), item_future_list_profile_embeddings], dim=-1)
        output = self.final_dnn_network(x).squeeze() # [batch, seq_len, 1]
        return output
    def parameters(self, recurse: bool = True):
        return [para for para in self.user_profile_embed.parameters()] + [para for para in self.item_profile_embed.parameters()] + [para for para in self.augru.parameters()] + [para for para in self.final_dnn_network.parameters()]
model = DIEN(user_profile_feature = [(i,len(list_)) for i, list_ in user_feature_vals.items()], item_profile_feature= [(i,len(list_)) for i, list_ in item_feature_vals.items()], hidden_dim=dim,
            profile_feature_embedding_dim = dim, dnn_layer_dims = [16]).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0003)
criterion = CrossEntropyLoss(reduction='sum').to(device)
label = torch.FloatTensor([1 for i in range(pos_num)] + [0 for i in range(neg_sample_num)]).to(device)

def DCG(batch_labels):
    dcgsum = np.zeros((batch_labels.shape[0]))
    for i in range(batch_labels.shape[-1]):
        dcg = (2 ** batch_labels[:,i] - 1) / np.math.log(i + 2, 2)
        dcgsum += dcg
    return dcgsum
def NDCG(output, labels):
    # ideal_dcg
    ideal_dcg = DCG(labels)
    # this
    dcg = DCG((np.argsort( - output, axis=-1)<pos_num).astype(np.float32))
    return np.sum(dcg/ideal_dcg)

for epoch in range(num_epochs):
    # train:
    epoch_train_losses = []
    model.train()
    for i, inputs in enumerate(train_loader):
        optimizer.zero_grad()
        user_profiles, item_history_list_profile, item_future_list_profile = inputs
        batch_len = user_profiles.shape[0]
        # print(item_history_list_profile.shape, item_future_list_profile.shape)
        user_profiles = user_profiles.to(device)
        item_history_list_profile = item_history_list_profile.to(device)
        item_future_list_profile = item_future_list_profile.to(device)
        output = model(user_profiles, item_history_list_profile, item_future_list_profile)
        labels = label.unsqueeze(0).repeat([batch_len,1])
        loss = criterion(output, labels)
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=1, norm_type=2)
        optimizer.step()
        epoch_train_losses.append([batch_len, loss.item(), NDCG(output.cpu().detach().numpy(), labels.cpu().detach().numpy())])
    # validate:
    model.eval()
    epoch_test_losses = []
    for i, inputs in enumerate(test_loader):
        user_profiles, item_history_list_profile, item_future_list_profile = inputs
        batch_len = user_profiles.shape[0]
        user_profiles = user_profiles.to(device)
        item_history_list_profile = item_history_list_profile.to(device)
        item_future_list_profile = item_future_list_profile.to(device)
        output = model(user_profiles, item_history_list_profile, item_future_list_profile)
        labels = label.unsqueeze(0).repeat([batch_len,1])
        loss = criterion(output, labels)
        epoch_test_losses.append([batch_len, loss.item(), NDCG(output.cpu().detach().numpy(), labels.cpu().detach().numpy())])
    train_loss = sum([x[1] for x in epoch_train_losses])/sum([x[0] * (pos_num + neg_sample_num) for x in epoch_train_losses])
    test_loss  = sum([x[1] for x in epoch_test_losses])/sum([x[0] * (pos_num + neg_sample_num) for x in epoch_test_losses])
    train_ndcg = sum([x[2] for x in epoch_train_losses])/sum([x[0] for x in epoch_train_losses])
    test_ndcg  = sum([x[2] for x in epoch_test_losses])/sum([x[0] for x in epoch_test_losses])
    # print
    print('['+datetime.now().strftime("%Y-%m-%d %H:%M:%S")+']', 'epoch=[{}/{}], train_ce_loss: {:.4f}, train_ndcg: {:.4f}, validate_ce_loss: {:.4f}, validate_ndcg: {:.4f}'.format(epoch+1, num_epochs,  train_loss, train_ndcg, test_loss, test_ndcg))


[2023-09-03 11:26:10] epoch=[1/10], train_ce_loss: 0.7461, train_ndcg: 0.6509, validate_ce_loss: 0.7435, validate_ndcg: 0.6702
[2023-09-03 11:26:22] epoch=[2/10], train_ce_loss: 0.7398, train_ndcg: 0.6730, validate_ce_loss: 0.7416, validate_ndcg: 0.6630
[2023-09-03 11:26:34] epoch=[3/10], train_ce_loss: 0.7386, train_ndcg: 0.6827, validate_ce_loss: 0.7431, validate_ndcg: 0.6596
[2023-09-03 11:26:47] epoch=[4/10], train_ce_loss: 0.7376, train_ndcg: 0.6872, validate_ce_loss: 0.7421, validate_ndcg: 0.6648
[2023-09-03 11:26:59] epoch=[5/10], train_ce_loss: 0.7366, train_ndcg: 0.6832, validate_ce_loss: 0.7396, validate_ndcg: 0.6614
[2023-09-03 11:27:11] epoch=[6/10], train_ce_loss: 0.7362, train_ndcg: 0.6826, validate_ce_loss: 0.7400, validate_ndcg: 0.6607
[2023-09-03 11:27:23] epoch=[7/10], train_ce_loss: 0.7358, train_ndcg: 0.6808, validate_ce_loss: 0.7422, validate_ndcg: 0.6628
[2023-09-03 11:27:35] epoch=[8/10], train_ce_loss: 0.7353, train_ndcg: 0.6830, validate_ce_loss: 0.7397, valida

In [5]:
# SASRec：（效果很好，基于transformer，但模型本身创新意义不大）
# Self-attentive Sequential Recommendation
# 自回归transformer模型，casual attention，基本架构照抄transformer，next-step预测形式，拿最后的token输出采用mf计算最终得分。
# 只用了行为item embedding，没有用内容特征
# 数据集：ml-100k

import math, copy
import torch
from torch import nn
from torch.nn import Module, CrossEntropyLoss, Sequential, Linear, Sigmoid
from torch.utils.data import Dataset, DataLoader, TensorDataset 
from sklearn.model_selection import train_test_split
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
device = torch.device("cuda:0" if torch.cuda.is_available() else ('mps:0' if torch.backends.mps.is_available() else "cpu"))
batch_size = 100
num_epochs = 10
dim=100
num_attention_heads = 4 # dim % 4
num_hidden_layers = 1

train_loader = DataLoader(dataset=TensorDataset(torch.from_numpy(data[:,1: 1+ train_seq_len] + 1).long(), torch.from_numpy(data[:,1+ train_seq_len:-(pos_num+neg_sample_num)]+1).long()), batch_size=batch_size, shuffle=True, pin_memory=True)
test_loader = DataLoader(dataset=TensorDataset(torch.from_numpy(data[:,1: 1+ train_seq_len] + 1).long(), torch.from_numpy(data[:,-(pos_num+neg_sample_num) : ] + 1).long()), batch_size=batch_size, shuffle=False, pin_memory=True) # 这里图方便
num_items += 1 # 因为data中id为0表示mask，所以这里做了简便处理

# 层内归一化的层（输入的特征维度归一化）
class LayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-12):
        super(LayerNorm, self).__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.bias = nn.Parameter(torch.zeros(hidden_size))
        self.variance_epsilon = eps
    def forward(self, x):
        u = x.mean(-1, keepdim=True)
        s = (x - u).pow(2).mean(-1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.variance_epsilon)
        return self.weight * x + self.bias
    def parameters(self, recurse: bool = True):
        return [self.weight, self.bias]
# 自注意力层
class SelfAttention(nn.Module):
    def __init__(self, hidden_size, num_attention_heads, attention_probs_dropout_prob, hidden_dropout_prob):
        super(SelfAttention, self).__init__()
        assert hidden_size % num_attention_heads == 0
        self.num_attention_heads = num_attention_heads
        self.attention_head_size = int(hidden_size / num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        # w_q, w_k, w_v
        self.query = nn.Linear(hidden_size, self.all_head_size)
        self.key = nn.Linear(hidden_size, self.all_head_size)
        self.value = nn.Linear(hidden_size, self.all_head_size)
        self.attn_dropout = nn.Dropout(attention_probs_dropout_prob)
        # 做完self-attention 做一个前馈全连接 LayerNorm 输出
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.LayerNorm = LayerNorm(hidden_size, eps=1e-12)
        self.out_dropout = nn.Dropout(hidden_dropout_prob)
    def _transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)
    def forward(self, input_tensor, attention_mask):
        # q, k, v
        mixed_query_layer = self.query(input_tensor)
        mixed_key_layer = self.key(input_tensor)
        mixed_value_layer = self.value(input_tensor)
        query_layer = self._transpose_for_scores(mixed_query_layer)
        key_layer = self._transpose_for_scores(mixed_key_layer)
        value_layer = self._transpose_for_scores(mixed_value_layer)
        # attention
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        attention_scores = attention_scores + attention_mask
        attention_probs = nn.Softmax(dim=-1)(attention_scores)
        attention_probs = self.attn_dropout(attention_probs)
        # attention * v
        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)
        # fnn + norm
        hidden_states = self.out_dropout(self.dense(context_layer))
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states
    def parameters(self, recurse: bool = True):
        return [para for para in self.query.parameters()] + [para for para in self.key.parameters()] + [para for para in self.value.parameters()] + [para for para in self.dense.parameters()] + self.LayerNorm.parameters()
class PointWiseFeedForward(nn.Module):
    def __init__(self, hidden_size, hidden_dropout_prob):
        super(PointWiseFeedForward, self).__init__()
        self.conv1d_1 = nn.Conv1d(hidden_size, hidden_size, kernel_size=(1,))
        self.activation = nn.ReLU()
        self.conv1d_2 = nn.Conv1d(hidden_size, hidden_size, kernel_size=(1,))
        self.LayerNorm = LayerNorm(hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(hidden_dropout_prob)
    def forward(self, input_tensor):
        hidden_states = self.conv1d_1(input_tensor.transpose(1, 2))
        hidden_states = hidden_states.transpose(1, 2)
        hidden_states = self.activation(hidden_states)
        hidden_states = self.conv1d_2(hidden_states.transpose(1, 2))
        hidden_states = hidden_states.transpose(1, 2)
        hidden_states = self.activation(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states
    def parameters(self, recurse: bool = True):
        return [para for para in self.conv1d_1.parameters()] + [para for para in self.conv1d_2.parameters()] + self.LayerNorm.parameters()
class Layer(nn.Module):
    def __init__(self, hidden_size, num_attention_heads, hidden_dropout_prob, attention_probs_dropout_prob):
        super(Layer, self).__init__()
        self.attention = SelfAttention(hidden_size, num_attention_heads, attention_probs_dropout_prob, hidden_dropout_prob)
        self.intermediate = PointWiseFeedForward(hidden_size, hidden_dropout_prob)
    def forward(self, hidden_states, attention_mask):
        attention_output = self.attention(hidden_states, attention_mask)
        intermediate_output = self.intermediate(attention_output)
        return intermediate_output
    def parameters(self, recurse: bool = True):
        return [para for para in self.attention.parameters()] + [para for para in self.intermediate.parameters()]
class SASEncoder(nn.Module):
    def __init__(self, hidden_size, num_attention_heads, hidden_dropout_prob, attention_probs_dropout_prob, num_hidden_layers=1):
        super(SASEncoder, self).__init__()
        layer = Layer(hidden_size, num_attention_heads, hidden_dropout_prob, attention_probs_dropout_prob)
        self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(num_hidden_layers)])
    def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
        all_encoder_layers = []
        for layer_module in self.layer:
            hidden_states = layer_module(hidden_states, attention_mask)
            if output_all_encoded_layers:
                all_encoder_layers.append(hidden_states)
        if not output_all_encoded_layers:
            return hidden_states
        return all_encoder_layers
    def parameters(self, recurse: bool = True):
        paras = []
        for layer in self.layer:
            for para in layer.parameters():
                paras.append(para)
        return paras
class SASRec(nn.Module):
    def __init__(self, num_items, embedding_dim, seq_len, num_attention_heads, num_hidden_layers=1, hidden_dropout_prob=0., attention_probs_dropout_prob=0.):
        super(SASRec, self).__init__()
        self.num_items, self.embedding_dim, self.seq_len = num_items, embedding_dim, seq_len
        self.item_embeddings = nn.Embedding(num_items, embedding_dim, padding_idx=0)
        self.position_embeddings = nn.Embedding(seq_len, embedding_dim)
        self.item_encoder = SASEncoder(embedding_dim, num_attention_heads, hidden_dropout_prob, attention_probs_dropout_prob, num_hidden_layers)
        self.LayerNorm = LayerNorm(embedding_dim, eps=1e-12)
        self.dropout = nn.Dropout(hidden_dropout_prob)
    def _add_position_embedding(self, sequence: torch.Tensor) -> torch.Tensor:
        seq_length = sequence.size(1)
        position_ids = torch.arange(seq_length, dtype=torch.long, device=sequence.device)
        position_ids = position_ids.unsqueeze(0).expand_as(sequence)
        item_embeddings = self.item_embeddings(sequence)
        position_embeddings = self.position_embeddings(position_ids)
        sequence_emb = item_embeddings + position_embeddings
        sequence_emb = self.LayerNorm(sequence_emb)
        sequence_emb = self.dropout(sequence_emb)
        return sequence_emb
    def _get_embedding_and_mask(self, input_ids):
        sequence_emb = self._add_position_embedding(input_ids)
        attention_mask = (input_ids > 0).long()
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)  # torch.int64
        max_len = attention_mask.size(-1)
        attn_shape = (1, max_len, max_len)
        subsequent_mask = torch.triu(torch.ones(attn_shape), diagonal=1)  # torch.uint8
        subsequent_mask = (subsequent_mask == 0).unsqueeze(1)
        subsequent_mask = subsequent_mask.long()
        subsequent_mask = subsequent_mask.to(device)
        extended_attention_mask = extended_attention_mask * subsequent_mask
        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
        return sequence_emb, extended_attention_mask
    def get_seq_out(self, input_ids):
        sequence_emb, extended_attention_mask = self._get_embedding_and_mask(input_ids)
        sequence_output = self.item_encoder(sequence_emb, extended_attention_mask, output_all_encoded_layers=False)
        return sequence_output
    def forward(self, input_ids, test):
        batch_len, seq_len = test.shape[0], test.shape[1]
        sequence_output = self.get_seq_out(input_ids)
        test_embeddings = self.item_embeddings(test)
        # mf: 原论文中采用mf，这里拿最后一个token位置的向量
        # y = torch.cosine_similarity(sequence_output[:,-1,:].unsqueeze(1).repeat((1,seq_len,1)), test_embeddings, dim=-1).squeeze()
        y = torch.sigmoid(torch.sum(sequence_output[:,-1,:].unsqueeze(1) * test_embeddings, dim=-1)).squeeze()
        return y
model = SASRec(num_items=num_items, embedding_dim=dim, seq_len=train_seq_len, num_attention_heads=num_attention_heads, num_hidden_layers=num_hidden_layers, hidden_dropout_prob=0., attention_probs_dropout_prob=0.).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.05, weight_decay=0.002)
criterion = CrossEntropyLoss(reduction='sum').to(device)
label = torch.FloatTensor([1 for i in range(pos_num)] + [0 for i in range(neg_sample_num)]).to(device)

def DCG(batch_labels):
    dcgsum = np.zeros((batch_labels.shape[0]))
    for i in range(batch_labels.shape[-1]):
        dcg = (2 ** batch_labels[:,i] - 1) / np.math.log(i + 2, 2)
        dcgsum += dcg
    return dcgsum
def NDCG(output, labels):
    # ideal_dcg
    ideal_dcg = DCG(labels)
    # this
    dcg = DCG((np.argsort( - output, axis=-1)<pos_num).astype(np.float32))
    return np.sum(dcg/ideal_dcg)

for epoch in range(num_epochs):
    # train:
    epoch_train_losses = []
    model.train()
    for i, inputs in enumerate(train_loader):
        optimizer.zero_grad()
        item_seqs = inputs[0].to(device)
        test = inputs[1].to(device)
        output = model(item_seqs, test)
        labels = label.unsqueeze(0).repeat([item_seqs.shape[0],1])
        loss = criterion(output, labels)
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=1, norm_type=2)
        optimizer.step()
        epoch_train_losses.append([item_seqs.shape[0], loss.item(), NDCG(output.detach().numpy(), labels.detach().numpy())])
    # validate:
    model.eval()
    epoch_test_losses = []
    for i, inputs in enumerate(test_loader):
        item_seqs = inputs[0].to(device)
        test = inputs[1].to(device)
        output = model(item_seqs, test)
        labels = label.unsqueeze(0).repeat([item_seqs.shape[0],1])
        loss = criterion(output, labels)
        epoch_test_losses.append([item_seqs.shape[0], loss.item(), NDCG(output.detach().numpy(), labels.detach().numpy())])
    train_loss = sum([x[1] for x in epoch_train_losses])/sum([x[0] * (pos_num + neg_sample_num) for x in epoch_train_losses])
    test_loss  = sum([x[1] for x in epoch_test_losses])/sum([x[0] * (pos_num + neg_sample_num) for x in epoch_test_losses])
    train_ndcg = sum([x[2] for x in epoch_train_losses])/sum([x[0] for x in epoch_train_losses])
    test_ndcg  = sum([x[2] for x in epoch_test_losses])/sum([x[0] for x in epoch_test_losses])
    # print
    print('['+datetime.now().strftime("%Y-%m-%d %H:%M:%S")+']', 'epoch=[{}/{}], train_ce_loss: {:.4f}, train_ndcg: {:.4f}, validate_ce_loss: {:.4f}, validate_ndcg: {:.4f}'.format(epoch+1, num_epochs,  train_loss, train_ndcg, test_loss, test_ndcg))
    

[2023-09-04 15:20:24] epoch=[1/10], train_ce_loss: 0.7626, train_ndcg: 0.6294, validate_ce_loss: 0.7687, validate_ndcg: 0.6553
[2023-09-04 15:20:31] epoch=[2/10], train_ce_loss: 0.7204, train_ndcg: 0.7175, validate_ce_loss: 0.7703, validate_ndcg: 0.6699
[2023-09-04 15:20:35] epoch=[3/10], train_ce_loss: 0.7016, train_ndcg: 0.7747, validate_ce_loss: 0.7699, validate_ndcg: 0.6796
[2023-09-04 15:20:38] epoch=[4/10], train_ce_loss: 0.6929, train_ndcg: 0.8143, validate_ce_loss: 0.7731, validate_ndcg: 0.6920
[2023-09-04 15:20:42] epoch=[5/10], train_ce_loss: 0.6858, train_ndcg: 0.8337, validate_ce_loss: 0.7740, validate_ndcg: 0.6799
[2023-09-04 15:20:45] epoch=[6/10], train_ce_loss: 0.6799, train_ndcg: 0.8392, validate_ce_loss: 0.7731, validate_ndcg: 0.6790
[2023-09-04 15:20:48] epoch=[7/10], train_ce_loss: 0.6759, train_ndcg: 0.8419, validate_ce_loss: 0.7711, validate_ndcg: 0.6859
[2023-09-04 15:20:51] epoch=[8/10], train_ce_loss: 0.6733, train_ndcg: 0.8422, validate_ce_loss: 0.7700, valida

In [2]:
# BST：基于transformer建模行为序列（模型本身也没什么创新）
# Behavior Sequence Transformer for E-commerce Recommendation in Alibaba
# 用户内容特征+上下文特征（这里数据集没有）+对行为序列（item embedding + pos embedding）进行建模

import torch
from torch import nn
from torch.nn import Module, CrossEntropyLoss, Sequential, Linear, Sigmoid
from torch.utils.data import Dataset, DataLoader, TensorDataset 
from sklearn.model_selection import train_test_split
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
device = torch.device("cuda:0" if torch.cuda.is_available() else ('mps:0' if torch.backends.mps.is_available() else "cpu"))
batch_size = 100
num_epochs = 10
dim=100

user_feature_vals = {}
for i in range(user_num_features):
    user_feature_vals[i] = sorted(list(set([val[i] for val in user_info.values()])))
    for user, info in user_info.items():
        user_info[user][i] = user_feature_vals[i].index(info[i])

user_profile_data = np.array([user_info[u] for u in data[:,0]]) # [data_len, ufeature]
train_loader = DataLoader(dataset=TensorDataset(torch.from_numpy(user_profile_data).long(), 
                                                torch.from_numpy(data[:,1: 1+ train_seq_len] + 1).long(),
                                                torch.from_numpy(data[:,1+ train_seq_len:-(pos_num+neg_sample_num)]+1).long()
                                                ), batch_size=batch_size, shuffle=True, pin_memory=True) # mask = 0
test_loader = DataLoader(dataset=TensorDataset(torch.from_numpy(user_profile_data).long(), 
                                                torch.from_numpy(data[:,1: 1+ train_seq_len] + 1).long(),
                                                torch.from_numpy(data[:,-(pos_num+neg_sample_num):] + 1).long()
                                               ), batch_size=batch_size, shuffle=False, pin_memory=True) # mask = 0
num_items += 1

class FFN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(FFN, self).__init__()
        self.linear_1 = nn.Linear(input_size, hidden_size)
        self.linear_2 = nn.Linear(hidden_size, input_size)
        self.relu = nn.ReLU()
    def forward(self, x):
        output = self.linear_2(self.relu(self.linear_1(x)))
        return output
    def parameters(self, recurse: bool = True):
        return [para for para in self.linear_1.parameters()] + [para for para in self.linear_2.parameters()] + [para for para in self.relu.parameters()]
class MultiHeadAttention(nn.Module):
    def __init__(self, att_dim, n_heads):
        super(MultiHeadAttention, self).__init__()
        self.n_heads = n_heads
        assert att_dim % self.n_heads == 0
        self.att_size = int(att_dim / n_heads)
        # Query, Key, Value
        self._query = nn.Linear(att_dim, att_dim, bias=False)
        self._key = nn.Linear(att_dim, att_dim, bias=False)
        self._value = nn.Linear(att_dim, att_dim, bias=False)
        # Attention Block
        self.dense = nn.Linear(att_dim, att_dim, bias=False)
        self.activation = nn.Softmax(dim=-1)
        self.dropout = nn.Dropout(0.1)
    def forward(self, q, k, v, mask=None):
        scale_factor = torch.sqrt(torch.FloatTensor([self.n_heads])).item()
        batch_size = q.size(0)
        # To Multiple Attention Heads
        _query = self._query(q).view(batch_size, -1, self.n_heads, self.att_size).transpose(1, 2)
        _key = self._key(k).view(batch_size, -1, self.n_heads, self.att_size).transpose(1, 2)
        _value = self._value(v).view(batch_size, -1, self.n_heads, self.att_size).transpose(1, 2)
        # Scaled dot-product Attention score
        score = torch.matmul(_query, _key.transpose(-2, -1)) / scale_factor
        # Mask applied.
        if mask is not None:
            mask = mask.unsqueeze(1)
            score = score.masked_fill(mask == 0, -1e9)
        # Softmax on Score
        score = self.activation(score)
        z = torch.matmul(self.dropout(score), _value)
        # To fully-connected layer
        z = z.transpose(1, 2).reshape(batch_size, -1, self.att_size * self.n_heads)
        return self.dense(z)
class EncoderLayer(nn.Module):
    def __init__(self, input_size, hidden_size, n_heads):
        super(EncoderLayer, self).__init__()
        self.mh_attention = MultiHeadAttention(input_size, n_heads)
        self.lnorm_1 = nn.LayerNorm(input_size)
        self.ff = FFN(input_size, hidden_size)
        self.lnorm_2 = nn.LayerNorm(input_size)
        self.dropout = nn.Dropout(0.1)
    def forward(self, x, mask=None):
        attention_out = self.mh_attention(x, x, x, mask)
        attention_out = self.lnorm_1(self.dropout(attention_out) + x)
        ff_attention = self.ff(attention_out)
        return self.lnorm_2(self.dropout(ff_attention) + attention_out)
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers, n_heads):
        super(Encoder, self).__init__()
        self.stack = nn.ModuleList()
        for layer in range(n_layers):
            self.stack.append(EncoderLayer(input_size, hidden_size, n_heads))
        self.dropout = nn.Dropout(0.1)
    def forward(self, x, mask=None):
        for cell in self.stack:
            x = cell(self.dropout(x), mask)
        return x
class BSTransformer(nn.Module):
    def __init__(self, max_seq_len: int, num_encoder_layer: int, num_heads: int, user_profile_feature: [tuple], profile_feature_embedding_dim: int, hidden_dim: int, dnn_layer_dims: list[int]):
        super(BSTransformer, self).__init__()
        self.max_seq_len = max_seq_len
        # 内容特征
        self.user_profile_feature, self.profile_feature_embedding_dim = user_profile_feature, profile_feature_embedding_dim
        self.user_profile_embed = nn.ModuleDict({'user_embed_' + str(i): nn.Embedding(num_embeddings=valcount, embedding_dim=profile_feature_embedding_dim) for i, valcount in user_profile_feature})
        self.item_embeddings = nn.Embedding(num_items, hidden_dim, padding_idx=-1)
        self.pos_embedding = self.pos_embedding_sinusoidal(max_seq_len, hidden_dim)
        self.user_profile_all_embed_dim = profile_feature_embedding_dim * len(user_profile_feature)
        self.dnn_layer_dims, self.hidden_dim = dnn_layer_dims, hidden_dim
        self.encoder = Encoder(hidden_dim, hidden_dim, num_encoder_layer, num_heads)
        # final dnn
        self.all_embedding_dim = len(user_profile_feature) * profile_feature_embedding_dim + hidden_dim * 2
        self.final_dnn_network = nn.Sequential(nn.Linear(self.all_embedding_dim, dnn_layer_dims[0]), nn.ReLU())
        if len(dnn_layer_dims) > 1:
            for i, layer_dim in enumerate(dnn_layer_dims[1:]):
                self.final_dnn_network.append(nn.Linear(dnn_layer_dims[i], layer_dim))
                self.final_dnn_network.append(nn.ReLU())
        self.final_dnn_network.append(nn.Linear(dnn_layer_dims[-1], 1))
        self.final_dnn_network.append(nn.Sigmoid())
    def forward(self, user_profiles, item_history_list, item_future_list):
        batch_len = user_profiles.shape[0]
        # user profile: [batch, feature * embed_dim]
        user_profile_embeddings = torch.cat([self.user_profile_embed['user_embed_' + str(i)](user_profiles[:,i].long()) for i in range(user_profiles.shape[-1])], axis=-1)
        user_profile_embeddings = user_profile_embeddings.reshape((batch_len, len(self.user_profile_feature) * self.profile_feature_embedding_dim)) # [batch, feature, embed_dim]
        # item_history_list_profile: torch.Tensor([batch, seq_len, feature * embed_dim])
        seq_len = item_history_list.shape[1]
        enc_mask = self.get_mask(item_history_list)
        item_embed = self.item_embeddings(item_history_list.long())
        bst_encoding = torch.mean(self.encoder(item_embed + self.pos_embedding, mask=enc_mask), dim=1)
        # 
        seq_len_ = item_future_list.shape[1]
        bst_encoding = bst_encoding.unsqueeze(1).repeat((1,seq_len_,1))
        user_profile_embeddings = user_profile_embeddings.unsqueeze(1).repeat((1,seq_len_,1))
        test_item_embed = self.item_embeddings(item_future_list.long())
        output = self.final_dnn_network(torch.cat([bst_encoding, user_profile_embeddings, test_item_embed], dim=-1)).squeeze()
        return output
    def get_mask(self, x):
        seq_len = x.size(1)
        mask = (x != 0).unsqueeze(1).byte()
        triu = (np.triu(np.ones([1, seq_len, seq_len]), k=1) == 0).astype('uint8')
        return mask * triu
    @staticmethod
    def pos_embedding_sinusoidal(max_seq_len, embedding_dim):
        half_dim = embedding_dim // 2
        emb = torch.log(torch.tensor(10000)) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb)
        emb = torch.arange(max_seq_len, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0)
        emb = torch.stack((torch.sin(emb), torch.cos(emb)), dim=0).view(
            max_seq_len, -1).t().contiguous().view(max_seq_len, -1)
        if embedding_dim % 2 == 1:
            emb = torch.cat([emb, torch.zeros(max_seq_len, 1)], dim=1)
        return emb.to(device)
model = BSTransformer(max_seq_len=train_seq_len, num_encoder_layer = 1, num_heads = 4, user_profile_feature = [(i,len(list_)) for i, list_ in user_feature_vals.items()], profile_feature_embedding_dim = dim, hidden_dim = dim, dnn_layer_dims = [16]).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0003)
criterion = CrossEntropyLoss(reduction='sum').to(device)
label = torch.FloatTensor([1 for i in range(pos_num)] + [0 for i in range(neg_sample_num)]).to(device)

def DCG(batch_labels):
    dcgsum = np.zeros((batch_labels.shape[0]))
    for i in range(batch_labels.shape[-1]):
        dcg = (2 ** batch_labels[:,i] - 1) / np.math.log(i + 2, 2)
        dcgsum += dcg
    return dcgsum
def NDCG(output, labels):
    # ideal_dcg
    ideal_dcg = DCG(labels)
    # this
    dcg = DCG((np.argsort( - output, axis=-1)<pos_num).astype(np.float32))
    return np.sum(dcg/ideal_dcg)

for epoch in range(num_epochs):
    # train:
    epoch_train_losses = []
    model.train()
    for i, inputs in enumerate(train_loader):
        optimizer.zero_grad()
        user_profiles, item_history_list, item_future_list = inputs
        batch_len = user_profiles.shape[0]
        user_profiles = user_profiles.to(device)
        item_history_list = item_history_list.to(device)
        item_future_list = item_future_list.to(device)
        output = model(user_profiles, item_history_list, item_future_list)
        labels = label.unsqueeze(0).repeat([batch_len,1])
        loss = criterion(output, labels)
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=1, norm_type=2)
        optimizer.step()
        epoch_train_losses.append([batch_len, loss.item(), NDCG(output.cpu().detach().numpy(), labels.cpu().detach().numpy())])
    # validate:
    model.eval()
    epoch_test_losses = []
    for i, inputs in enumerate(test_loader):
        user_profiles, item_history_list, item_future_list = inputs
        batch_len = user_profiles.shape[0]
        user_profiles = user_profiles.to(device)
        item_history_list = item_history_list.to(device)
        item_future_list = item_future_list.to(device)
        output = model(user_profiles, item_history_list, item_future_list)
        labels = label.unsqueeze(0).repeat([batch_len,1])
        loss = criterion(output, labels)
        epoch_test_losses.append([batch_len, loss.item(), NDCG(output.cpu().detach().numpy(), labels.cpu().detach().numpy())])
    train_loss = sum([x[1] for x in epoch_train_losses])/sum([x[0] * (pos_num + neg_sample_num) for x in epoch_train_losses])
    test_loss  = sum([x[1] for x in epoch_test_losses])/sum([x[0] * (pos_num + neg_sample_num) for x in epoch_test_losses])
    train_ndcg = sum([x[2] for x in epoch_train_losses])/sum([x[0] for x in epoch_train_losses])
    test_ndcg  = sum([x[2] for x in epoch_test_losses])/sum([x[0] for x in epoch_test_losses])
    # print
    print('['+datetime.now().strftime("%Y-%m-%d %H:%M:%S")+']', 'epoch=[{}/{}], train_ce_loss: {:.4f}, train_ndcg: {:.4f}, validate_ce_loss: {:.4f}, validate_ndcg: {:.4f}'.format(epoch+1, num_epochs,  train_loss, train_ndcg, test_loss, test_ndcg))


[2023-09-04 16:53:24] epoch=[1/10], train_ce_loss: 0.7475, train_ndcg: 0.6297, validate_ce_loss: 0.7493, validate_ndcg: 0.6096
[2023-09-04 16:53:28] epoch=[2/10], train_ce_loss: 0.7351, train_ndcg: 0.7317, validate_ce_loss: 0.7513, validate_ndcg: 0.6235
[2023-09-04 16:53:32] epoch=[3/10], train_ce_loss: 0.7139, train_ndcg: 0.7865, validate_ce_loss: 0.7568, validate_ndcg: 0.6260
[2023-09-04 16:53:36] epoch=[4/10], train_ce_loss: 0.6921, train_ndcg: 0.8263, validate_ce_loss: 0.7620, validate_ndcg: 0.6285
[2023-09-04 16:53:40] epoch=[5/10], train_ce_loss: 0.6768, train_ndcg: 0.8543, validate_ce_loss: 0.7638, validate_ndcg: 0.6294
[2023-09-04 16:53:44] epoch=[6/10], train_ce_loss: 0.6679, train_ndcg: 0.8702, validate_ce_loss: 0.7658, validate_ndcg: 0.6283
[2023-09-04 16:53:49] epoch=[7/10], train_ce_loss: 0.6621, train_ndcg: 0.8772, validate_ce_loss: 0.7679, validate_ndcg: 0.6306
[2023-09-04 16:53:56] epoch=[8/10], train_ce_loss: 0.6582, train_ndcg: 0.8818, validate_ce_loss: 0.7689, valida