In [1]:
#导入相应的库
import torch
import torch.nn as nn
import torch.utils.data as data
import torch.nn.functional as F
import numpy as np
import pandas as pd
import random
import os
import time
from operator import itemgetter
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [2]:
# 固定随机数种子，保证模型的可复现性
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
set_seed(666)

In [3]:
epochs = 10
batch_size = 1024
device = torch.device("cpu")

In [4]:
def get_data(xx, yy, batch_size, shuffle):
    class newDataset(data.Dataset):
        def __init__(self, xx, yy):
            self.xx = xx
            self.yy = yy

        def __getitem__(self, index):
            m, n = self.xx[index], self.yy[index]
            return m, n

        def __len__(self):
            return len(self.xx)

    bdata = data.DataLoader(newDataset(xx, yy), batch_size = batch_size, shuffle = shuffle)
    
    return bdata

In [5]:
def load_dataset(path):
    df = pd.read_csv(path, usecols = [0, 1, 11, 3, 4], header = 0, names = ['id', 'click', 'C1', 'banner', 'device_id'])
    # 处理
    df["user_id"] = None
    df["item_id"] = df["id"]
    l = list(set(df['device_id']))
    
    for i in l:
        for j in range(0, len(df['device_id'])):
            if df['device_id'][j] == i:
                df['user_id'][j] = l.index(i) + 1
                
    df_one_hot = pd.get_dummies(df, columns = ['C1', 'banner']) # one_hot
    
    df_use = df_one_hot.drop(columns = ['device_id','item_id'])
    feature = df_use.columns.values.tolist()
    new_feature = feature
    drop_feature = ["click", "id"]
    for i in drop_feature:
        new_feature.remove(i)
    
    # merge得到相应的数据
    label = ['id', 'click', 'device_id']
    df_data = pd.read_csv(path, usecols = [0, 1, 11], header = 0, names = label)
    df_data['id'] = df['id']
    df_data['user_id'] = None
    df_data['user_id'] = df['user_id']
    train, test = train_test_split(df_data, test_size = 0.1) # 划分训练集和测试集
    
    label_l = ['id', 'click', 'user_id']
    df_train = train.merge(df_use, on = label_l, how = 'left')
    df_test = test.merge(df_use, on = label_l, how = 'left')
    
    # 转换label
    dic = {}
    label_set = sorted(set(df_train['click']) | set(df_test['click']))
    
    for label in label_set:
        dic[label] = dic.get(label, len(dic))
        
    df_train['click'] = df_train["click"].apply(lambda x: 1 if int(x) == 1 else 0)
    df_test['click'] = df_test["click"].apply(lambda x: 1 if int(x) == 1 else 0)

    # 转化格式
    train_labels = np.array(df_train['click'].astype(np.int32))
    test_labels = np.array(df_test['click'].astype(np.int32))
    
    return df_test, df_train[new_feature].values, train_labels, df_test[new_feature].values, test_labels

In [6]:
class FM_layer(nn.Module):
    def __init__(self, reg_l1 = 0.01, reg_l2 = 0.01, class_num = 1, feature_num = 10, latent_factor_dim = 5):
        super().__init__()
        self.k = latent_factor_dim
        self.class_num = class_num
        self.fea_num = feature_num
        self.reg_l1 = reg_l1
        self.reg_l2 = reg_l2
        self.linear = nn.Linear(self.fea_num, class_num)
        self.v = nn.Parameter(torch.randn(self.fea_num, self.k, class_num))
        
    def forward(self, xx):
        lp = self.linear(xx)

        p1 = torch.matmul(self.v.permute(2, 1, 0), xx.T).permute(2, 1, 0)
        p1 = torch.pow(p1, 2)
        p1 = 0.5 * torch.sum(p1, dim=1)
        p1 = torch.squeeze(p1, dim=1)

        sq1 = torch.pow(xx, 2)
        sq2 = torch.pow(self.v, 2)
        
        p2 = torch.matmul(sq2.permute(2, 1, 0), sq1.T).permute(2, 1, 0)
        p2 = torch.sum(p2, dim = 1) * -0.5
        p2 = torch.squeeze(p2, dim = 1)

        output = lp + p1 + p2

        output = F.log_softmax(output, dim = 1)
        
        return output

In [7]:
def train_model(model, device, train, optimizer, epoch):
    model.train()
    
    for idx, (x, y) in enumerate(train):
        x = x.to(device, dtype = torch.float32)
        y = y.to(device).long()
        optimizer.zero_grad()
        output = model(x)
        
        if model.class_num == 2:
            loss = F.cross_entropy(output, y)
        else:
            loss = F.nll_loss(output, y)

        loss1 = 0
        for param in model.parameters():
            loss1 += model.reg_l1 * torch.sum(torch.abs(param))
            loss1 += model.reg_l2 * torch.sum(torch.pow(param, 2))
        loss += loss1

        loss.backward()
        optimizer.step()

In [8]:
def test_model(model, device, test):
    loss = 0
    corr = 0
    
    model.eval()
    with torch.no_grad():
        for x, y in test:
            x = x.to(device, dtype = torch.float32)
            y = y.to(device).long()
            output = model(x)

            if model.class_num == 2:
                loss += F.cross_entropy(output, y)
            else:
                loss += F.nll_loss(output, y, reduction='sum').item()

            loss1 = 0
            for param in model.parameters():
                loss1 += model.reg_l1 * torch.sum(torch.abs(param))
                loss1 += model.reg_l2 * torch.sum(torch.pow(param, 2))
            loss += loss1
            
            pred = output.max(1, keepdim = True)[1]
            corr += pred.eq(y.view_as(pred)).sum().item()
            
    loss /= len(test.dataset)
    print("loss:" + str(loss.item()) + " acc:" + str( corr / len(test.dataset)))

In [9]:
def recommend(model, use_id, device, df_test, data):
    dic = {}
    l = []
    
    model.eval()
    with torch.no_grad():
        for x, y in data:
            x = x.to(device, dtype = torch.float32)
            y = y.to(device).long()
            pre = model(x).max(1, keepdim = True)[0]
            
            for i in range(0, len(pre)):
                l.append([list(df_test['device_id'])[i], list(df_test['id'])[i], pre[i], y[i]])
                
            for user, item, pre, label in l:
                if int(label) == 1: #如果点击过就跳过 
                    continue
                dic.setdefault(user, {})
                pre = float(pre)
                dic[user].setdefault(item, pre)
                
    l = list(sorted(dic[use_id].items(), key = itemgetter(1), reverse = True)[:5])
    
    return [x[0] for x in l]   

In [10]:
# 导入数据
path = './data/avazu/ctr_data.csv'
df_test, x_train, y_train, x_test, y_test = load_dataset(path)

x_train = preprocessing.scale(x_train, with_mean = True, with_std = True)
x_test = preprocessing.scale(x_test, with_mean = True, with_std = True)

l1 = [x for x in y_train]
l2 = [x for x in y_test]
l = set(l1 + l2)
class_num = len(l)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [11]:
# FM模型
model = FM_layer(class_num = class_num, feature_num = x_train.shape[1], latent_factor_dim = 40).to(device)

# 定义损失函数还有优化器
optm = torch.optim.Adam(model.parameters())
train_data = get_data(x_train, y_train, batch_size, shuffle = True)
test_data = get_data(x_test, y_test, batch_size, shuffle = False)

for epoch in range(0, epochs):
    train_model(model, device, train_data, optm, epoch + 1)
    test_model(model, device, test_data)

loss:0.022308453917503357 acc:0.808
loss:0.021703263744711876 acc:0.808
loss:0.021083194762468338 acc:0.808
loss:0.02048397809267044 acc:0.806
loss:0.01989045739173889 acc:0.806
loss:0.01929497718811035 acc:0.805
loss:0.018714744597673416 acc:0.805
loss:0.01812630519270897 acc:0.805
loss:0.01756555587053299 acc:0.806
loss:0.017086299136281013 acc:0.807


In [12]:
l = recommend(model, "a99f214a", device, df_test, test_data)
print(l)

[10888676745574100000, 11083904333730300000, 11277485828613800000, 10064037090970500000, 1102755678399120000]
