In [None]:
# 基于深度学习的推荐模型
# DSSM、DeepFM、Deep&Wide、DCN（deepcross）
# 、AFM

In [1]:
# 加载数据集2
# 数据集：ml-100k

import os
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# 加载数据
ratings = np.array([[int(x) for x in line.strip().split('\t')[:3]] for line in open('./data/ml-100k/ua.base','r').read().strip().split('\n')], dtype=np.int32)
ratings[:,-1] = (ratings[:,-1] - 0)/(max(ratings[:,-1]) - 0)
occupation_dict = {'administrator':0, 'artist':1, 'doctor':2, 'educator':3, 'engineer':4, 'entertainment':5, 'executive':6, 'healthcare':7, 'homemaker':8, 'lawyer':9, 'librarian':10, 'marketing':11, 'none':12, 'other':13, 'programmer':14, 'retired':15, 'salesman':16, 'scientist':17, 'student':18, 'technician':19, 'writer':20}
gender_dict={'M':1,'F':0}
user_info = {}
for line in open('./data/ml-100k/u.user','r', encoding='utf-8').read().strip().split('\n'):
    phs = line.strip().split('|')
    user_info[int(phs[0])] = [int(phs[1]), gender_dict[phs[2]], occupation_dict[phs[3]]]
item_info = {}
for line in open('./data/ml-100k/u.item','r', encoding='ISO-8859-1').read().strip().split('\n'):
    phs = line.strip().split('|')
    item_info[int(phs[0])] = phs[5:]
# print(len(user_info[list(user_info.keys())[0]]), len(item_info[list(item_info.keys())[0]]))
# print(data.shape)
num_users = len(user_info)
num_items = len(item_info)
num_features = 22

In [3]:
# DSSM: Deep Structured Semantic Model  
# y = cosine(mlp_u(user_feature), mlp_i(item_feature))
# Learning deep structured semantic models for web search using clickthrough data
# 2013年微软
# 数据集：ml-100k

import torch
from torch.nn import Module, MSELoss, Sequential, Linear, Sigmoid
from torch.utils.data import Dataset, DataLoader, TensorDataset 
from sklearn.model_selection import train_test_split
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
device = torch.device("cuda:0" if torch.cuda.is_available() else ('mps:0' if torch.backends.mps.is_available() else "cpu"))
batch_size = 100
num_epochs = 10
dim=100

data = np.array([user_info[u] + item_info[i] + [r] for u, i, r in ratings], dtype=np.float32)
data[:,:-1] = MinMaxScaler().fit_transform(data[:,:-1])
X_train, X_test, y_train, y_test = train_test_split(data[:,:-1], data[:,-1], test_size=0.4, random_state=0)
train_loader = DataLoader(dataset=TensorDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).float()), batch_size=batch_size, shuffle=True, pin_memory=True)
test_loader = DataLoader(dataset=TensorDataset(torch.from_numpy(X_test).float(), torch.from_numpy(y_test).float()), batch_size=batch_size, shuffle=False, pin_memory=True)

class DSSM(Module):
    def __init__(self, user_layer_dims, item_layer_dims):
        super(DSSM, self).__init__()
        self.user_layer_dims = user_layer_dims
        self.item_layer_dims = item_layer_dims
        # user dnn
        self.user_dnn = Sequential()
        for i, layer_dim in enumerate(user_layer_dims[1:]):
            self.user_dnn.append(Linear(user_layer_dims[i], layer_dim))
            self.user_dnn.append(Sigmoid())
        # item dnn
        self.item_dnn = Sequential()
        for i, layer_dim in enumerate(item_layer_dims[1:]):
            self.item_dnn.append(Linear(item_layer_dims[i], layer_dim))
            self.item_dnn.append(Sigmoid())
    def forward(self, user_features, item_features):
        return torch.cosine_similarity(self.user_dnn(user_features), self.item_dnn(item_features), dim=-1)
model = DSSM(user_layer_dims=[3, 8], item_layer_dims=[19, 8]).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = MSELoss(reduction='sum').to(device)

for epoch in range(num_epochs):
    # train:
    epoch_train_losses = []
    model.train()
    for i, inputs in enumerate(train_loader):
        optimizer.zero_grad()
        input = inputs[0].to(device)
        label = inputs[1].to(device)
        output = model(input[:,:3], input[:,3:])
        loss = criterion(output, label)
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=1, norm_type=2)
        optimizer.step()
        epoch_train_losses.append([input.shape[0], loss.item()])
    # validate:
    model.eval()
    epoch_test_losses = []
    for i, inputs in enumerate(test_loader):
        input = inputs[0].to(device)
        label = inputs[1].to(device)
        output = model(input[:,:3], input[:,3:])
        loss = criterion(output, label)
        epoch_test_losses.append([input.shape[0], loss.item()])
    train_loss = sum([x[1] for x in epoch_train_losses])/sum([x[0] for x in epoch_train_losses])
    test_loss  = sum([x[1] for x in epoch_test_losses])/sum([x[0] for x in epoch_test_losses])
    # print
    print('['+datetime.now().strftime("%Y-%m-%d %H:%M:%S")+']', 'epoch=[{}/{}], train_mse_loss: {:.4f}, validate_mse_loss: {:.4f}'.format(epoch+1, num_epochs,  train_loss, test_loss))


[2023-08-31 22:06:45] epoch=[1/10], train_mse_loss: 0.3916, validate_mse_loss: 0.2222
[2023-08-31 22:06:49] epoch=[2/10], train_mse_loss: 0.1933, validate_mse_loss: 0.1806
[2023-08-31 22:06:54] epoch=[3/10], train_mse_loss: 0.1747, validate_mse_loss: 0.1736
[2023-08-31 22:06:59] epoch=[4/10], train_mse_loss: 0.1707, validate_mse_loss: 0.1713
[2023-08-31 22:07:03] epoch=[5/10], train_mse_loss: 0.1690, validate_mse_loss: 0.1700
[2023-08-31 22:07:08] epoch=[6/10], train_mse_loss: 0.1678, validate_mse_loss: 0.1688
[2023-08-31 22:07:13] epoch=[7/10], train_mse_loss: 0.1667, validate_mse_loss: 0.1678
[2023-08-31 22:07:18] epoch=[8/10], train_mse_loss: 0.1659, validate_mse_loss: 0.1670
[2023-08-31 22:07:23] epoch=[9/10], train_mse_loss: 0.1652, validate_mse_loss: 0.1664
[2023-08-31 22:07:27] epoch=[10/10], train_mse_loss: 0.1646, validate_mse_loss: 0.1659


In [2]:
# Wide & Deep: y = simoid(dnn(x), x)
# 2016年youtube Wide & deep learning for recommender systems[C]//Proceedings of the 1st Workshop on Deep Learning for Recommender Systems
# 数据集：ml-100k


import os
import numpy as np
import torch
import torch.nn as nn
from torch.nn import Module, Parameter, MSELoss
from torch.utils.data import Dataset, DataLoader, TensorDataset 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
from torch.nn import Module, Sequential, ReLU, Dropout, Sigmoid

# category: [1,2] + [all]
# number: [0] + []
number_feature_data = MinMaxScaler().fit_transform(np.array([[user_info[u][0]]  for u, i, r in ratings], dtype=np.float32))
category_feature_data = np.array([user_info[u][1:] + item_info[i] for u, i, r in ratings], dtype=np.int32)
data = np.concatenate([number_feature_data, category_feature_data, ratings[:,-1:]], axis=-1)
num_number_features = number_feature_data.shape[-1]
num_category_features = category_feature_data.shape[-1]
num_features = data.shape[-1] - 1
category_feature_vals = {}
for i in range(num_number_features, num_features):
    category_feature_vals[i] = sorted(list(set(list(data[:, i]))))
    for rid in range(data.shape[0]):
        data[rid, i] = category_feature_vals[i].index(data[rid, i])
# print(len(user_info[list(user_info.keys())[0]]), len(item_info[list(item_info.keys())[0]]))
# print(data.shape)

num_users = len(user_info)
num_items = len(item_info)
device = torch.device("cuda:0" if torch.cuda.is_available() else ('mps:0' if torch.backends.mps.is_available() else "cpu"))
batch_size = 100
num_epochs = 10
embedding_dim = 8
X_train, X_test, y_train, y_test = train_test_split(data[:,:-1], data[:,-1], test_size=0.4, random_state=0)
train_loader = DataLoader(dataset=TensorDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).float()), batch_size=batch_size, shuffle=True, pin_memory=True)
test_loader = DataLoader(dataset=TensorDataset(torch.from_numpy(X_test).float(), torch.from_numpy(y_test).float()), batch_size=batch_size, shuffle=False, pin_memory=True)

class WideDeep(nn.Module):
    def __init__(self, dense_feature_cols, sparse_feature_col_dims, dnn_layer_dims, dnn_dropout=0.):
        super(WideDeep, self).__init__()
        self.dense_feature_cols, self.sparse_feature_col_dims = dense_feature_cols, sparse_feature_col_dims
        # sparse feature embedding dict
        self.embed_layers = nn.ModuleDict({'embed_' + str(i): nn.Embedding(num_embeddings=valcount, embedding_dim=dim) for i, valcount, dim in sparse_feature_col_dims})
        # dnn part
        dnn_layer_dims.insert(0, len(dense_feature_cols) + sum([x[-1] for x in sparse_feature_col_dims]))
        self.dnn_network = Sequential()
        for i, layer_dim in enumerate(dnn_layer_dims[1:]):
            self.dnn_network.append(nn.Linear(dnn_layer_dims[i], layer_dim))
            self.dnn_network.append(ReLU())
        self.dnn_network.append(Dropout(dnn_dropout))
        # linear + final layer
        self.final_nn = Sequential(nn.Linear(dnn_layer_dims[-1] + dnn_layer_dims[0], 1), Sigmoid())
    def forward(self, x):
        dense_input = x[:, :len(self.dense_feature_cols)]
        sparse_embeds = torch.cat([self.embed_layers['embed_' + str(i)](x[:, i].long()) for i in range(len(self.dense_feature_cols), x.shape[1])], axis=-1)
        x = torch.cat([sparse_embeds, dense_input], axis=-1)
        # dnn part
        deep_out = self.dnn_network(x)
        #  Concatenate and out
        outputs = self.final_nn(torch.cat([x, deep_out], axis=-1))
        return outputs.squeeze()
model = WideDeep(dense_feature_cols=[i for i in range(num_number_features)], sparse_feature_col_dims=[(i,len(category_feature_vals[i]), embedding_dim) for i in range(num_number_features, num_features)], dnn_layer_dims=[128, 32], dnn_dropout=0.).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = MSELoss(reduction='sum').to(device)

for epoch in range(num_epochs):
    # train:
    epoch_train_losses = []
    model.train()
    for i, inputs in enumerate(train_loader):
        optimizer.zero_grad()
        input = inputs[0].to(device)
        label = inputs[1].to(device)
        output = model(input)
        loss = criterion(output, label)
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=1, norm_type=2)
        optimizer.step()
        epoch_train_losses.append([input.shape[0], loss.item()])
    # validate:
    model.eval()
    epoch_test_losses = []
    for i, inputs in enumerate(test_loader):
        input = inputs[0].to(device)
        label = inputs[1].to(device)
        output = model(input)
        loss = criterion(output, label)
        epoch_test_losses.append([input.shape[0], loss.item()])
    train_loss = sum([x[1] for x in epoch_train_losses])/sum([x[0] for x in epoch_train_losses])
    test_loss  = sum([x[1] for x in epoch_test_losses])/sum([x[0] for x in epoch_test_losses])
    # print
    print('['+datetime.now().strftime("%Y-%m-%d %H:%M:%S")+']', 'epoch=[{}/{}], train_mse_loss: {:.4f}, validate_mse_loss: {:.4f}'.format(epoch+1, num_epochs,  train_loss, test_loss))


[2023-09-01 10:09:08] epoch=[1/10], train_mse_loss: 0.1634, validate_mse_loss: 0.1635
[2023-09-01 10:09:15] epoch=[2/10], train_mse_loss: 0.1621, validate_mse_loss: 0.1628
[2023-09-01 10:09:23] epoch=[3/10], train_mse_loss: 0.1614, validate_mse_loss: 0.1620
[2023-09-01 10:09:31] epoch=[4/10], train_mse_loss: 0.1607, validate_mse_loss: 0.1618
[2023-09-01 10:09:39] epoch=[5/10], train_mse_loss: 0.1602, validate_mse_loss: 0.1620
[2023-09-01 10:09:46] epoch=[6/10], train_mse_loss: 0.1596, validate_mse_loss: 0.1602
[2023-09-01 10:09:54] epoch=[7/10], train_mse_loss: 0.1592, validate_mse_loss: 0.1606
[2023-09-01 10:10:01] epoch=[8/10], train_mse_loss: 0.1590, validate_mse_loss: 0.1597
[2023-09-01 10:10:10] epoch=[9/10], train_mse_loss: 0.1584, validate_mse_loss: 0.1595
[2023-09-01 10:10:18] epoch=[10/10], train_mse_loss: 0.1582, validate_mse_loss: 0.1617


In [18]:
# DeepCross： DCN
# 和Wide&Deep相比，去掉Wide部分，DNN部分换成残差网络，模型结构简单
# x = f_i f_j * w1 + w0 + x 残差网络迭代多次
# y = sigmoid(linear(x))
# [ACM kdd 2016] Deep Crossing: Web-Scale Modeling without Manually Crafted Combinatorial Features
# 数据集：ml-100k

import os
import numpy as np
import torch
import torch.nn as nn
from torch.nn import Module, Parameter, MSELoss
from torch.utils.data import Dataset, DataLoader, TensorDataset 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
from torch.nn import Module, Sequential, ReLU, Dropout, Sigmoid

# category: [1,2] + [all]
# number: [0] + []
number_feature_data = MinMaxScaler().fit_transform(np.array([[user_info[u][0]]  for u, i, r in ratings], dtype=np.float32))
category_feature_data = np.array([user_info[u][1:] + item_info[i] for u, i, r in ratings], dtype=np.int32)
data = np.concatenate([number_feature_data, category_feature_data, ratings[:,-1:]], axis=-1)
num_number_features = number_feature_data.shape[-1]
num_category_features = category_feature_data.shape[-1]
num_features = data.shape[-1] - 1
category_feature_vals = {}
for i in range(num_number_features, num_features):
    category_feature_vals[i] = sorted(list(set(list(data[:, i]))))
    for rid in range(data.shape[0]):
        data[rid, i] = category_feature_vals[i].index(data[rid, i])
# print(len(user_info[list(user_info.keys())[0]]), len(item_info[list(item_info.keys())[0]]))
# print(data.shape)

num_users = len(user_info)
num_items = len(item_info)
device = torch.device("cuda:0" if torch.cuda.is_available() else ('mps:0' if torch.backends.mps.is_available() else "cpu"))
batch_size = 100
num_epochs = 10
embedding_dim = 8
X_train, X_test, y_train, y_test = train_test_split(data[:,:-1], data[:,-1], test_size=0.4, random_state=0)
train_loader = DataLoader(dataset=TensorDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).float()), batch_size=batch_size, shuffle=True, pin_memory=True)
test_loader = DataLoader(dataset=TensorDataset(torch.from_numpy(X_test).float(), torch.from_numpy(y_test).float()), batch_size=batch_size, shuffle=False, pin_memory=True)

class CrossNetwork(nn.Module):
    def __init__(self, layer_num, input_dim):
        super(CrossNetwork, self).__init__()
        self.layer_num = layer_num
        self.cross_weights = nn.ParameterList([nn.Parameter(torch.rand(input_dim, 1)) for i in range(self.layer_num)])
        self.cross_bias = nn.ParameterList([nn.Parameter(torch.rand(input_dim, 1)) for i in range(self.layer_num)])
    def forward(self, x):
        x_ = torch.unsqueeze(x.clone(), dim=2)
        for i in range(self.layer_num):
            x_ = torch.matmul(torch.bmm(x_, x_.permute((0, 2, 1))), self.cross_weights[i]) + self.cross_bias[i] + x_
        return torch.squeeze(torch.sigmoid(x_))

class DeepCross(nn.Module):
    def __init__(self, dense_feature_cols, sparse_feature_col_dims, dnn_layer_dims, cross_layer_num, dnn_dropout=0.):
        super(DeepCross, self).__init__()
        self.dense_feature_cols, self.sparse_feature_col_dims = dense_feature_cols, sparse_feature_col_dims
        # sparse feature embedding dict
        self.embed_layers = nn.ModuleDict({'embed_' + str(i): nn.Embedding(num_embeddings=valcount, embedding_dim=dim) for i, valcount, dim in sparse_feature_col_dims})
        # Dnn part: sparse features
        dnn_layer_dims.insert(0, len(dense_feature_cols) + sum([x[-1] for x in sparse_feature_col_dims]))
        self.dnn_network = Sequential()
        for i, layer_dim in enumerate(dnn_layer_dims[1:]):
            self.dnn_network.append(nn.Linear(dnn_layer_dims[i], layer_dim))
            self.dnn_network.append(ReLU())
        self.dnn_network.append(Dropout(dnn_dropout))
        # cross part
        self.cross_network = CrossNetwork(cross_layer_num, dnn_layer_dims[0])
        # final layer
        self.final_nn = Sequential(nn.Linear(dnn_layer_dims[-1] + dnn_layer_dims[0], 1), Sigmoid())
    # [batch, [dense, sparse]]
    def forward(self, x):
        dense_input = x[:, :len(self.dense_feature_cols)]
        sparse_embeds = torch.cat([self.embed_layers['embed_' + str(i)](x[:, i].long()) for i in range(len(self.dense_feature_cols), x.shape[1])], axis=-1)
        x = torch.cat([sparse_embeds, dense_input], axis=-1)
        # cross Network
        cross_out = self.cross_network(x)
        # Deep Network
        deep_out = self.dnn_network(x)
        #  Concatenate and out
        outputs = self.final_nn(torch.cat([cross_out, deep_out], axis=-1))
        return outputs.squeeze()
    def parameters(self, recurse: bool = True):
        return [para for para in self.embed_layers.parameters()] + [para for para in self.dnn_network.parameters()] \
             + [para for para in self.cross_network.parameters()] + [para for para in self.final_nn.parameters()]
model = DeepCross(dense_feature_cols=[i for i in range(num_number_features)], sparse_feature_col_dims=[(i,len(category_feature_vals[i]), embedding_dim) for i in range(num_number_features, num_features)], dnn_layer_dims=[128, 32], cross_layer_num=2, dnn_dropout=0.).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = MSELoss(reduction='sum').to(device)

for epoch in range(num_epochs):
    # train:
    epoch_train_losses = []
    model.train()
    for i, inputs in enumerate(train_loader):
        optimizer.zero_grad()
        input = inputs[0].to(device)
        label = inputs[1].to(device)
        output = model(input)
        loss = criterion(output, label)
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=1, norm_type=2)
        optimizer.step()
        epoch_train_losses.append([input.shape[0], loss.item()])
    # validate:
    model.eval()
    epoch_test_losses = []
    for i, inputs in enumerate(test_loader):
        input = inputs[0].to(device)
        label = inputs[1].to(device)
        output = model(input)
        loss = criterion(output, label)
        epoch_test_losses.append([input.shape[0], loss.item()])
    train_loss = sum([x[1] for x in epoch_train_losses])/sum([x[0] for x in epoch_train_losses])
    test_loss  = sum([x[1] for x in epoch_test_losses])/sum([x[0] for x in epoch_test_losses])
    # print
    print('['+datetime.now().strftime("%Y-%m-%d %H:%M:%S")+']', 'epoch=[{}/{}], train_mse_loss: {:.4f}, validate_mse_loss: {:.4f}'.format(epoch+1, num_epochs,  train_loss, test_loss))

[2023-09-01 11:05:57] epoch=[1/10], train_mse_loss: 0.1643, validate_mse_loss: 0.1634
[2023-09-01 11:07:04] epoch=[2/10], train_mse_loss: 0.1623, validate_mse_loss: 0.1628
[2023-09-01 11:08:10] epoch=[3/10], train_mse_loss: 0.1620, validate_mse_loss: 0.1627
[2023-09-01 11:09:14] epoch=[4/10], train_mse_loss: 0.1614, validate_mse_loss: 0.1618
[2023-09-01 11:10:19] epoch=[5/10], train_mse_loss: 0.1607, validate_mse_loss: 0.1614
[2023-09-01 11:11:24] epoch=[6/10], train_mse_loss: 0.1604, validate_mse_loss: 0.1624
[2023-09-01 11:12:29] epoch=[7/10], train_mse_loss: 0.1598, validate_mse_loss: 0.1605
[2023-09-01 11:13:35] epoch=[8/10], train_mse_loss: 0.1593, validate_mse_loss: 0.1602
[2023-09-01 11:14:40] epoch=[9/10], train_mse_loss: 0.1592, validate_mse_loss: 0.1602
[2023-09-01 11:15:46] epoch=[10/10], train_mse_loss: 0.1590, validate_mse_loss: 0.1603


In [19]:
# DeepFM: y = sigmoid(dnn(x) + fm(x_discrete))，使用FM替换了Wide & Deep模型中wide部分的LR
# Deepfm: a factorization-machine based neural network for ctr prediction
# FM + DNN

import os
import numpy as np
import torch
import torch.nn as nn
from torch.nn import Module, Parameter, MSELoss
from torch.utils.data import Dataset, DataLoader, TensorDataset 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
from torch.nn import Module, Sequential, ReLU, Dropout, Sigmoid

# category: [1,2] + [all]
# number: [0] + []
number_feature_data = MinMaxScaler().fit_transform(np.array([[user_info[u][0]]  for u, i, r in ratings], dtype=np.float32))
category_feature_data = np.array([user_info[u][1:] + item_info[i] for u, i, r in ratings], dtype=np.int32)
data = np.concatenate([number_feature_data, category_feature_data, ratings[:,-1:]], axis=-1)
num_number_features = number_feature_data.shape[-1]
num_category_features = category_feature_data.shape[-1]
num_features = data.shape[-1] - 1
category_feature_vals = {}
for i in range(num_number_features, num_features):
    category_feature_vals[i] = sorted(list(set(list(data[:, i]))))
    for rid in range(data.shape[0]):
        data[rid, i] = category_feature_vals[i].index(data[rid, i])
# print(len(user_info[list(user_info.keys())[0]]), len(item_info[list(item_info.keys())[0]]))
# print(data.shape)

num_users = len(user_info)
num_items = len(item_info)
device = torch.device("cuda:0" if torch.cuda.is_available() else ('mps:0' if torch.backends.mps.is_available() else "cpu"))
batch_size = 100
num_epochs = 10
embedding_dim = 8 # sparse feature embedding dim
X_train, X_test, y_train, y_test = train_test_split(data[:,:-1], data[:,-1], test_size=0.4, random_state=0)
train_loader = DataLoader(dataset=TensorDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).float()), batch_size=batch_size, shuffle=True, pin_memory=True)
test_loader = DataLoader(dataset=TensorDataset(torch.from_numpy(X_test).float(), torch.from_numpy(y_test).float()), batch_size=batch_size, shuffle=False, pin_memory=True)

# vector-wise cross FM仅对离散特征进行交叉, 输出没有sigmoid，留到最后做
class FactorizationMachine(Module):
    def __init__(self, num_features, dim):
        super(FactorizationMachine, self).__init__()
        self.num_features = num_features
        self.dim = dim
        self.w0 = Parameter(torch.randn((1,1)), requires_grad=True)
        self.w = Parameter(torch.randn((num_features, 1)), requires_grad=True)
        self.bw = Parameter(torch.randn((num_features, dim)), requires_grad=True)
    def forward(self, X: torch.Tensor):
        X_ = X.reshape([-1, self.num_features, self.dim])
        # 二阶
        X_ = torch.bmm(X_, X_.permute([0,2,1]))
        tmp = torch.matmul(self.bw, self.bw.T)
        tmp[np.tril_indices(self.num_features)] = 0
        X_ = X_ * tmp.unsqueeze(0)
        two = torch.sum(torch.sum(X_, dim=-1, keepdim=False), dim=-1, keepdim=True)
        # 一阶
        one = torch.matmul(X, self.w.repeat([self.dim,1]))
        return (self.w0 + one + two).squeeze()

class DeepFM(Module):
    def __init__(self, dense_feature_cols, sparse_feature_cols, sparse_feature_embedding_dim, dnn_layer_dims, dnn_dropout=0.):
        super(DeepFM, self).__init__()
        self.dense_feature_cols, self.sparse_feature_cols, self.sparse_feature_embedding_dim = dense_feature_cols, sparse_feature_cols, sparse_feature_embedding_dim
        # fm
        self.discrete_feature_FM = FactorizationMachine(len(sparse_feature_cols), sparse_feature_embedding_dim)
        # sparse feature embedding dict
        self.embed_layers = nn.ModuleDict({'embed_' + str(i): nn.Embedding(num_embeddings=valcount, embedding_dim=sparse_feature_embedding_dim) for i, valcount in sparse_feature_cols})
        # dnn part
        dnn_layer_dims.insert(0, len(dense_feature_cols) + sparse_feature_embedding_dim * len(sparse_feature_cols))
        self.dnn_network = Sequential()
        for i, layer_dim in enumerate(dnn_layer_dims[1:]):
            self.dnn_network.append(nn.Linear(dnn_layer_dims[i], layer_dim))
            self.dnn_network.append(ReLU())
        self.dnn_network.append(Dropout(dnn_dropout))
        self.dnn_network.append(nn.Linear(dnn_layer_dims[-1], 1))
    def forward(self, x):
        dense_input = x[:, :len(self.dense_feature_cols)]
        sparse_embeds = torch.cat([self.embed_layers['embed_' + str(i)](x[:, i].long()) for i in range(len(self.dense_feature_cols), x.shape[1])], axis=1)
        x = torch.cat([sparse_embeds, dense_input], axis=-1)
        # FM仅对离散特征进行交叉
        fm_out = self.discrete_feature_FM(sparse_embeds)
        # dnn part
        deep_out = self.dnn_network(x).squeeze()
        y = torch.sigmoid(fm_out + deep_out).squeeze() # 最后sigmoid
        return y
    def parameters(self, recurse: bool = True):
        return [para for para in self.discrete_feature_FM.parameters()] + [para for para in self.embed_layers.parameters()] \
             + [para for para in self.dnn_network.parameters()]
    
model = DeepFM(dense_feature_cols=[i for i in range(num_number_features)], sparse_feature_cols=[(i,len(category_feature_vals[i])) for i in range(num_number_features, num_features)], sparse_feature_embedding_dim =embedding_dim, dnn_layer_dims=[128, 32], dnn_dropout=0.).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = MSELoss(reduction='sum').to(device)

for epoch in range(num_epochs):
    # train:
    epoch_train_losses = []
    model.train()
    for i, inputs in enumerate(train_loader):
        optimizer.zero_grad()
        input = inputs[0].to(device)
        label = inputs[1].to(device)
        output = model(input)
        loss = criterion(output, label)
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=1, norm_type=2)
        optimizer.step()
        epoch_train_losses.append([input.shape[0], loss.item()])
    # validate:
    model.eval()
    epoch_test_losses = []
    for i, inputs in enumerate(test_loader):
        input = inputs[0].to(device)
        label = inputs[1].to(device)
        output = model(input)
        loss = criterion(output, label)
        epoch_test_losses.append([input.shape[0], loss.item()])
    train_loss = sum([x[1] for x in epoch_train_losses])/sum([x[0] for x in epoch_train_losses])
    test_loss  = sum([x[1] for x in epoch_test_losses])/sum([x[0] for x in epoch_test_losses])
    # print
    print('['+datetime.now().strftime("%Y-%m-%d %H:%M:%S")+']', 'epoch=[{}/{}], train_mse_loss: {:.4f}, validate_mse_loss: {:.4f}'.format(epoch+1, num_epochs,  train_loss, test_loss))

[2023-09-01 11:16:10] epoch=[1/10], train_mse_loss: 0.2199, validate_mse_loss: 0.2115
[2023-09-01 11:16:20] epoch=[2/10], train_mse_loss: 0.2096, validate_mse_loss: 0.2113
[2023-09-01 11:16:30] epoch=[3/10], train_mse_loss: 0.2096, validate_mse_loss: 0.2113
[2023-09-01 11:16:42] epoch=[4/10], train_mse_loss: 0.2096, validate_mse_loss: 0.2113
[2023-09-01 11:16:54] epoch=[5/10], train_mse_loss: 0.2096, validate_mse_loss: 0.2113
[2023-09-01 11:17:07] epoch=[6/10], train_mse_loss: 0.2096, validate_mse_loss: 0.2113
[2023-09-01 11:17:19] epoch=[7/10], train_mse_loss: 0.2096, validate_mse_loss: 0.2113
[2023-09-01 11:17:29] epoch=[8/10], train_mse_loss: 0.2096, validate_mse_loss: 0.2113
[2023-09-01 11:17:39] epoch=[9/10], train_mse_loss: 0.2096, validate_mse_loss: 0.2113
[2023-09-01 11:17:49] epoch=[10/10], train_mse_loss: 0.2096, validate_mse_loss: 0.2113


In [None]:
# AFM : Attentive Factorization Machine
# y = w0 + w1 * f + a * (w2, w2) * (f, f)

import torch
from torch import nn

class AFM(nn.Module):
    def __init__(self, dense_feature_cols, sparse_feature_cols, sparse_feature_embedding_dim, attention_layer_dim):
        super(AFM, self).__init__()
        self.dense_feature_cols, self.sparse_feature_cols, self.sparse_feature_embedding_dim = dense_feature_cols, sparse_feature_cols, sparse_feature_embedding_dim
        self.attention_layer_dim = attention_layer_dim
        self.attention_layer = nn.Sequential(nn.Linear(sparse_feature_embedding_dim, attention_layer_dim), nn.ReLU(), nn.Linear(attention_layer_dim, 1), nn.Softmax(dim=-1))
        # sparse feature embedding dict
        self.embed_layers = nn.ModuleDict({'embed_' + str(i): nn.Embedding(num_embeddings=valcount, embedding_dim=sparse_feature_embedding_dim) for i, valcount in sparse_feature_cols})
        # final layer
        self.final_layer = nn.Sequential(Linear(len(sparse_feature_cols) * sparse_feature_embedding_dim + len(dense_feature_cols), 1), nn.Sigmoid())
    def forward(self, x):
        dense_input = x[:, :len(self.dense_feature_cols)]
        sparse_embeds = torch.cat([self.embed_layers['embed_' + str(i)](x[:, i].long()) for i in range(len(self.dense_feature_cols), x.shape[1])], axis=1)
        x = torch.cat([sparse_embeds, dense_input], axis=-1)
        # FM仅对离散特征进行交叉
        fm_out = self.discrete_feature_FM(sparse_embeds)
        # dnn part
        deep_out = self.dnn_network(x).squeeze()
        y = torch.sigmoid(fm_out + deep_out).squeeze() # 最后sigmoid
        return y
    def parameters(self, recurse: bool = True):
        return [para for para in self.discrete_feature_FM.parameters()] + [para for para in self.embed_layers.parameters()] \
             + [para for para in self.dnn_network.parameters()]
