In [1]:
#导入相应的库
import torch
import torch.nn as nn
import torch.utils.data as data
import torch.nn.functional as F
import numpy as np
import pandas as pd
import random
import os
import time
from operator import itemgetter
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [2]:
# 固定随机数种子，保证模型的可复现性
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
set_seed(666)

In [3]:
epochs = 10
batch_size = 1024
device = torch.device("cpu")

In [4]:
def get_data(feature, target, batch_size, shuffle=True):
    class MyDataset(data.Dataset):
        def __init__(self, feature, target):
            self.feature = feature
            self.target = target

        def __getitem__(self, index):
            m, n = self.feature[index], self.target[index]
            return m, n

        def __len__(self):
            return len(self.feature)

    bdata = data.DataLoader(MyDataset(feature, target), batch_size = batch_size, shuffle = shuffle)
    return bdata

In [5]:
def load_dataset(path):
    df = pd.read_csv(path, usecols=[0, 1, 11, 3, 4], header = 0, names = ['id', 'click', 'C1', 'banner', 'device_id'])
    
    # 处理
    df["item_id"] = df["id"]
    df_user = pd.get_dummies(df, columns=['C1', 'banner'])
    df_user1 = df_user.drop(columns=['item_id'])
    user_feature = df_user1.columns.values.tolist()
    cols = user_feature
    cols.remove('device_id')
    cols.remove('id')
    cols.remove('click')
    
    # get field
    field_index = {} 
    feature2field = {}
    other_idxs = []
    
    for idx, col in enumerate(cols):
        infos = col.split('_')
        if len(infos) == 2:
            field = infos[0]
            field_index[field] = field_index.get(field, len(field_index))
            feature2field[idx] = field_index[field]
        if len(infos) == 1:
            other_idxs.append(idx)
            
    for idx in other_idxs:
        feature2field[idx] = len(field_index)
        
    # merge得到相应的数据
    df_data = pd.read_csv(path, usecols =[0, 1 , 11],header = 0,names = ['id', 'click', 'device_id'])
    df_data['id'] = df['id']
    train, test = train_test_split(df_data, test_size = 0.1, random_state = 666)
    
    label_l = ['id', 'click', 'device_id']
    df_train = train.merge(df_user1, on = label_l, how = 'left')
    df_test= test.merge(df_user1, on = label_l, how = 'left')
    
    # 转换label
    dic = {}
    label_set = sorted(set(df_train['click']) | set(df_test['click']))
    
    for label in label_set:
        dic[label] = dic.get(label, len(dic))
 
    df_train['click'] = df_train["click"].apply(lambda x: 1 if int(x) == 1 else 0)
    df_test['click'] = df_test["click"].apply(lambda x: 1 if int(x) == 1 else 0)
 
    # 转换格式
    train_labels = np.array(df_train['click'].astype(np.int32))
    test_labels = np.array(df_test['click'].astype(np.int32))
    
    return df_data, df_test, df_train[cols].values, train_labels, df_test[cols].values, test_labels, feature2field

In [6]:
class FFM_layer(nn.Module):
    def __init__(self, field_dic, fea_num, reg_l1=0.01, reg_l2=0.01, class_num=1, latent_factor_dim=10):
        super(FFM_layer, self).__init__()
        self.reg_l1 = reg_l1
        self.reg_l2 = reg_l2
        self.fea_num = fea_num
        self.field_dic = field_dic
        self.linear = nn.Linear(fea_num, class_num)   
        self.v = nn.Parameter(torch.randn(fea_num, len(field_dic), latent_factor_dim, class_num)) # 主要是看明白这个v的构造，多个field的维度
 
    def forward(self, x):
        linear_part = self.linear(x)
 
        p = 0.0
        for i in range(0, self.fea_num):
            for j in range(i + 1, self.fea_num):
                v_ifj = self.v[i, self.field_dic[j], :, :]
                v_jfi = self.v[j, self.field_dic[i], :, :]
 
                xij = torch.unsqueeze(x[:, i] * x[:, j], dim = 1)
                v_ijji = torch.unsqueeze(torch.sum(v_ifj * v_jfi, dim = 0), dim = 0)  
                p += torch.mm(xij, v_ijji)
 
        output = linear_part + p
    
        output = torch.log_softmax(output, dim = 1)
        
        return output

In [7]:
def train_model(model, device, train, optimizer, epoch):
    model.train()
    
    for idx, (x, y) in enumerate(train):
        x = x.to(device, dtype = torch.float32)
        y = y.to(device).long()
        optimizer.zero_grad()
        output = model(x)
        loss = F.nll_loss(output, y)
 
        loss1 = 0
        for param in model.parameters():
            loss1 += model.reg_l1 * torch.sum(torch.abs(param))
            loss1 += model.reg_l2 * torch.sum(torch.pow(param, 2))
        loss += loss1
 
        loss.backward()
        optimizer.step()

In [8]:
def test_model(model, device, test):
    loss = 0
    corr = 0
    
    model.eval()
    with torch.no_grad():
        for x, y in test:
            x = x.to(device, dtype = torch.float32)
            y = y.to(device).long()
            output = model(x)
            
            loss += F.nll_loss(output, y, reduction = 'sum').item()
 
            loss1 = 0
            for param in model.parameters():
                loss1 += model.reg_l1 * torch.sum(torch.abs(param))
                loss1 += model.reg_l2 * torch.sum(torch.pow(param, 2))
            loss += loss1
 
            pred = output.max(1, keepdim = True)[1]
            corr += pred.eq(y.view_as(pred)).sum().item()
    
    loss /= len(test.dataset)
    print("loss:" + str(loss.item()) + " acc:" + str(100. * corr / len(test.dataset)))

In [9]:
def recommend(model, user_id, device, df_test, data):
    dic = {}
    l = []
    labels = []
    predicts = []
    
    model.eval()
    with torch.no_grad():
        for x, y in data:
            x = x.to(device, dtype = torch.float32)
            y = y.to(device).long()
            pre = model(x).max(1, keepdim = True)[0]
            
            for i in range(len(pre)):
                l.append([list(df_test['device_id'])[i], list(df_test['id'])[i], pre[i], y[i]])   
            
            for user, item, pre, label in l:
                if int(label) == 1: #如果点击过就跳过 
                    continue
                dic.setdefault(user, {})
                pre = float(pre)
                dic[user_id].setdefault(item, pre)

    l = list(sorted(dic[user_id].items(), key = itemgetter(1), reverse = True)[:5])
    
    return [x[0] for x in l]   

In [10]:
path = './data/avazu/ctr_data.csv'
df_data, test, x_train, y_train, x_test, y_test, feature2field = load_dataset(path)

x_train = preprocessing.scale(x_train, with_mean=True, with_std=True)
x_test = preprocessing.scale(x_test, with_mean=True, with_std=True)

l1 = [x for x in y_train]
l2 = [x for x in y_test]
l = set(l1 + l2)
class_num = len(l)

In [11]:
# FFM模型
model = FFM_layer(field_dic = feature2field, fea_num = x_train.shape[1], reg_l1 = 0.01, reg_l2 = 0.01, class_num = class_num, latent_factor_dim = 10).to(device)
 
# 定义损失函数还有优化器
optm = torch.optim.Adam(model.parameters())

train_data = get_data(x_train, y_train, batch_size, shuffle = True)
test_data = get_data(x_test, y_test, batch_size, shuffle = False)

for epoch in range(1, epochs + 1):
    train_model(model, device, train_data, optm, epoch)
    test_model(model, device, test_data)

loss:3.6003482341766357 acc:15.5
loss:2.9328343868255615 acc:15.5
loss:2.270759344100952 acc:69.3
loss:1.6151325702667236 acc:69.3
loss:1.0864793062210083 acc:72.5
loss:0.80668044090271 acc:72.5
loss:0.6866648197174072 acc:73.7
loss:0.6335725784301758 acc:84.9
loss:0.5984730124473572 acc:84.9
loss:0.5693916082382202 acc:84.9


In [12]:
l = recommend(model, "a99f214a", device, test, test_data)
print(l)

[10977840500307600000, 11371461600980200000, 10641663383298100000, 11010988566923400000, 11203583363195000000]
