In [3]:
import numpy as np
import pandas as pd
import random

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import os

In [7]:
#全局参数，随机种子，图像尺寸
seed = 114514
np.random.seed(seed)
random.seed(seed)
BATCH_SIZE = 512

hidden_dim = 16
epochs = 20
device = torch.device('cuda:1') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda:1


In [9]:
df = pd.read_csv('datasets/datafountain/train_dataset.csv')
print('共{}个用户，{}本图书，{}条记录'.format(max(df['user_id'])+1, max(df['item_id'])+1, len(df)))

共53424个用户，10000本图书，5869631条记录


In [10]:
df.head()

Unnamed: 0,user_id,item_id
0,0,257
1,0,267
2,0,5555
3,0,3637
4,0,1795


In [11]:
import tqdm
class Goodbooks(Dataset):
    def __init__(self, df, mode='training', negs = 99):
        super().__init__()
        self.df = df
        self.mode = mode
        self.book_nums = max(df['item_id'])+1
        self.user_nums = max(df['user_id'])+1
        self._init_dataset()
    
    def _init_dataset(self):
        self.Xs = []
        self.user_book_map = {}
        for i in range(self.user_nums):
            self.user_book_map[i] = []
        for index, row in self.df.iterrows():
            user_id, book_id = row
            self.user_book_map[user_id].append(book_id)
            
        if self.mode == 'training':
            for user, items in tqdm.tqdm(self.user_book_map.items()):
                for item in items[:-1]:
                    self.Xs.append((user, item, 1))
                    for _ in range(3):
                        while True:
                            neg_sample = random.randint(0, self.book_nums-1)
                            if neg_sample not in self.user_book_map[user]:
                                self.Xs.append((user, neg_sample, 0))
                                break
        elif self.mode == 'validation':
            for user, items in tqdm.tqdm(self.user_book_map.items()):
                if len(items) == 0:
                    continue
                self.Xs.append((user, items[-1]))

    def __getitem__(self, index):
        if self.mode == 'training':
            user_id, book_id, label = self.Xs[index]
            return user_id, book_id, label
        elif self.mode == 'validation':
            user_id, book_id = self.Xs[index]
            negs = list(random.sample(
                list(set(range(self.book_nums)) - set(self.user_book_map[user_id])),
                k=99
            ))
            return user_id, book_id, torch.LongTensor(negs)
    def __len__(self):
        return len(self.Xs)    

In [12]:
  #建立训练和验证dataloader
traindataset = Goodbooks(df, 'training')
validdataset = Goodbooks(df, 'validation')

trainloader = DataLoader(traindataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=False, num_workers=0)
validloader = DataLoader(validdataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=False, num_workers=0)   

100%|██████████| 53424/53424 [01:29<00:00, 599.76it/s]
100%|██████████| 53424/53424 [00:00<00:00, 436993.06it/s]


In [13]:
# 构建模型
class NCFModel(torch.nn.Module):
    def __init__(self, hidden_dim, user_num, item_num, mlp_layer_num=6, weight_decay = 1e-5, dropout=0.5):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.user_num = user_num
        self.item_num = item_num
        self.mlp_layer_num = mlp_layer_num
        self.weight_decay = weight_decay
        self.dropout=dropout
        self.mlp_user_embedding = torch.nn.Embedding(user_num, hidden_dim * (2 ** (self.mlp_layer_num - 1)))
        self.mlp_item_embedding = torch.nn.Embedding(item_num, hidden_dim * (2 ** (self.mlp_layer_num - 1)))

        self.gmf_user_embedding = torch.nn.Embedding(user_num, hidden_dim)
        self.gmf_item_embedding = torch.nn.Embedding(item_num, hidden_dim)

        mlp_Layers = []
        input_size = int(hidden_dim*(2 ** (self.mlp_layer_num)))
        for i in range(self.mlp_layer_num):
            mlp_Layers.append(torch.nn.Linear(int(input_size), int(input_size / 2)))
            mlp_Layers.append(torch.nn.Dropout(self.dropout))
            mlp_Layers.append(torch.nn.ReLU())
            input_size /= 2
        self.mlp_layers = torch.nn.Sequential(*mlp_Layers)
        self.output_layer = torch.nn.Linear(2*self.hidden_dim, 1)

    def forward(self, user, item):
        user_gmf_embedding = self.gmf_user_embedding(user)
        item_gmf_embedding = self.gmf_item_embedding(item)

        user_mlp_embedding = self.mlp_user_embedding(user)
        item_mlp_embedding = self.mlp_item_embedding(item)

        gmf_output = user_gmf_embedding * item_gmf_embedding

        mlp_input = torch.cat([user_mlp_embedding, item_mlp_embedding], dim=-1)
        mlp_output = self.mlp_layers(mlp_input)

        output = torch.sigmoid(self.output_layer(torch.cat([gmf_output, mlp_output], dim=-1))).squeeze(-1)
        # return -r_pos_neg + reg
        return output

    def predict(self, user, item):
        self.eval()
        with torch.no_grad():
            user_gmf_embedding = self.gmf_user_embedding(user)
            item_gmf_embedding = self.gmf_item_embedding(item)

            user_mlp_embedding = self.mlp_user_embedding(user)
            item_mlp_embedding = self.mlp_item_embedding(item)

            gmf_output = user_gmf_embedding.unsqueeze(1) * item_gmf_embedding

            user_mlp_embedding = user_mlp_embedding.unsqueeze(1).expand(-1, item_mlp_embedding.shape[1], -1)
            mlp_input = torch.cat([user_mlp_embedding, item_mlp_embedding], dim=-1)
            mlp_output = self.mlp_layers(mlp_input)

        output = torch.sigmoid(self.output_layer(torch.cat([gmf_output, mlp_output], dim=-1))).squeeze(-1)
        return output

In [10]:
model = NCFModel(hidden_dim, traindataset.user_nums, traindataset.book_nums).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
crit = torch.nn.BCELoss()
best_hits = 0
loss_for_plot = []
hits_for_plot = []
epochs = 10

for epoch in range(epochs):

    losses = []
    for index, data in enumerate(trainloader):
        user, item, label = data
        user, item, label = user.to(device), item.to(device), label.to(device).float()
        y_ = model(user, item).squeeze()

        loss = crit(y_, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        losses.append(loss.detach().cpu().item()) 
    hits = []
    for index, data in enumerate(validloader):
        user, pos, neg = data
        pos = pos.unsqueeze(1)
        all_data = torch.cat([pos, neg], dim=-1)
        output = model.predict(user.to(device), all_data.to(device)).detach().cpu()
        
        for batch in output:
            if 0 not in (-batch).argsort()[:10]:
                hits.append(0)
            else:
                hits.append(1)
    print('Epoch {} finished, average loss {}, hits@20 {}'.format(epoch, sum(losses)/len(losses), sum(hits)/len(hits)))
    loss_for_plot.append(sum(losses)/len(losses))
    hits_for_plot.append(sum(hits)/len(hits))
    epoch_hits = sum(hits)/len(hits)
    if epoch_hits > best_hits:
        best_hits = sum(hits)/len(hits)
        torch.save(model.state_dict(), 'best_model.h5')

Epoch 0 finished, average loss 0.447723874439892, hits@20 0.380989817310572
Epoch 1 finished, average loss 0.36973981534616207, hits@20 0.5105757711889788
Epoch 2 finished, average loss 0.3041636566867606, hits@20 0.6072177298592393
Epoch 3 finished, average loss 0.26241439088267393, hits@20 0.6535826594788859
Epoch 4 finished, average loss 0.2348265324319175, hits@20 0.6775606469002695
Epoch 5 finished, average loss 0.2135938015464746, hits@20 0.690214135968853
Epoch 6 finished, average loss 0.19502752743543347, hits@20 0.6956611260856543
Epoch 7 finished, average loss 0.17828809743563473, hits@20 0.7058812518718179
Epoch 8 finished, average loss 0.16236734930130745, hits@20 0.7053197064989518
Epoch 9 finished, average loss 0.14719087681191986, hits@20 0.7059561245882
Epoch 10 finished, average loss 0.13345298735821673, hits@20 0.7026055705300989
Epoch 11 finished, average loss 0.11955288015133833, hits@20 0.6955675351901767
Epoch 12 finished, average loss 0.10693430094118886, hits@20

KeyboardInterrupt: 

In [14]:
# 模型保存
# torch.save(model.state_dict(), 'model.
model = NCFModel(hidden_dim, traindataset.user_nums, traindataset.book_nums).to(device)
model.load_state_dict(torch.load('best_model.h5'))
model

NCFModel(
  (mlp_user_embedding): Embedding(53424, 512)
  (mlp_item_embedding): Embedding(10000, 512)
  (gmf_user_embedding): Embedding(53424, 16)
  (gmf_item_embedding): Embedding(10000, 16)
  (mlp_layers): Sequential(
    (0): Linear(in_features=1024, out_features=512, bias=True)
    (1): Dropout(p=0.5, inplace=False)
    (2): ReLU()
    (3): Linear(in_features=512, out_features=256, bias=True)
    (4): Dropout(p=0.5, inplace=False)
    (5): ReLU()
    (6): Linear(in_features=256, out_features=128, bias=True)
    (7): Dropout(p=0.5, inplace=False)
    (8): ReLU()
    (9): Linear(in_features=128, out_features=64, bias=True)
    (10): Dropout(p=0.5, inplace=False)
    (11): ReLU()
    (12): Linear(in_features=64, out_features=32, bias=True)
    (13): Dropout(p=0.5, inplace=False)
    (14): ReLU()
    (15): Linear(in_features=32, out_features=16, bias=True)
    (16): Dropout(p=0.5, inplace=False)
    (17): ReLU()
  )
  (output_layer): Linear(in_features=32, out_features=1, bias=True)
)

In [15]:

import matplotlib.pyplot as plt

x = list(range(1, len(hits_for_plot) + 1))
plt.subplot(1, 2, 1)
plt.xlabel('epochs')
plt.ylabel('loss')
plt.plot(x, loss_for_plot, 'r')

plt.subplot(1, 2, 2)
plt.xlabel('epochs')
plt.ylabel('acc')
plt.plot(x, hits_for_plot, 'r')

plt.show()



NameError: name 'hits_for_plot' is not defined

In [16]:
def chunks(l, n):
    for i in range(0, len(l), n):
        yield l[i: i+n]

In [17]:
import heapq

df = pd.read_csv('datasets/datafountain/test_dataset.csv')
user_for_test = df['user_id'].tolist()

predict_item_id = []

f = open('submission.csv', 'w', encoding='utf-8')
f.write('user_id,item_id\n')
for user in tqdm.tqdm(user_for_test):
    #将用户已经交互过的物品排除
    user_visited_items = traindataset.user_book_map[user]
    items_for_predict = list(set(range(traindataset.book_nums)) - set(user_visited_items))
    
    results = []
    user = torch.Tensor([user]).unsqueeze(0).long().to(device)

    for batch in chunks(items_for_predict,102400):
        
        batch = torch.Tensor(batch).unsqueeze(0).long().to(device)
        _,batch_len = batch.shape
        user_predicts_tensor = user.expand(1,batch_len)

        result = model(user_predicts_tensor, batch).view(-1).detach().cpu()
        results.append(result)
    results = torch.cat(results, dim=-1)
    predict_item_id = (-results).argsort()[:10]
    list(map(lambda x: f.write('{},{}\n'.format(user.cpu().item(), x)), predict_item_id))

f.flush()
f.close()
    


100%|██████████| 53424/53424 [04:39<00:00, 191.18it/s]
