In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2
# Show the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 

数据与预处理参考 [yooChooseRec.ipynb](./yooChooseRec.ipynb)

这里要解决的问题是：已知用户在某个 session 中会发生购买行为，预测该用户具体会购买哪些商品。

In [2]:
import os, gc
import torch
import pickle
from pathlib import Path
from torch import nn, optim
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from torch_geometric.nn import global_mean_pool, global_max_pool
from torch_geometric.data import Data, InMemoryDataset, DataLoader
from torch_geometric.nn import GraphConv, TopKPooling, GatedGraphConv, SAGEConv, SGConv
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
# Seed both RNGs the notebook relies on. NumPy alone is not enough: parameter
# initialisation and dropout draw from PyTorch's generator.
# NOTE(review): dataset.shuffle() presumably uses torch's RNG as well - confirm
# against the installed torch_geometric version.
np.random.seed(123)
torch.manual_seed(123)

In [3]:
# Directory holding the preprocessed YooChoose CSVs; the pickle cache is written here too.
root = Path('../data/yoochoose-data/')

## 数据准备

In [4]:
# Load the click and purchase tables from the reduced ("100m") export.
# For the full dataset, swap in 'clicks_pro.csv' and 'buys_pro.csv'.
clicks, buys = (
    pd.read_csv(root / csv_name, encoding='utf-8', low_memory=False)
    for csv_name in ('clicks_pro_100m.csv', 'buys_pro_100m.csv')
)

In [5]:
# Number of distinct values per column in each table.
clicks.nunique()
buys.nunique()

session_id    1000000
timestamp     5552990
item_id         37196
category          263
label               2
dtype: int64

session_id     84817
timestamp     216577
item_id        12373
price            511
quantity          22
dtype: int64

In [6]:
# Preview the first rows of both tables.
clicks.head()
buys.head()

Unnamed: 0,session_id,timestamp,item_id,category,label
0,9,2014-04-06T11:26:24.127Z,6926,0,False
1,9,2014-04-06T11:28:54.654Z,6926,0,False
2,9,2014-04-06T11:29:13.479Z,6926,0,False
3,14,2014-04-01T10:09:01.362Z,7116,0,False
4,14,2014-04-01T10:11:14.773Z,8100,0,False


Unnamed: 0,session_id,timestamp,item_id,price,quantity
0,140806,2014-04-07T09:22:28.132Z,14011,523,1
1,140806,2014-04-07T09:22:28.176Z,8219,1046,1
2,140806,2014-04-07T09:22:28.219Z,8001,837,1
3,140806,2014-04-07T09:22:28.268Z,25054,1151,1
4,140806,2014-04-07T09:22:28.280Z,7211,1046,1


- 使用最大值加 1 而不是 nunique 作为 Embedding 表的大小：id 会直接用作索引，一旦编码不连续，nunique 可能小于最大 id，从而导致索引越界。

In [7]:
# Largest id + 1 for items and for categories.
clicks.item_id.max() + 1
clicks.category.max() + 1

37196

263

- 购买物品字典：session_id → 该 session 中购买的 item_id 列表

In [8]:
# Map each session_id to the list of item_ids bought in that session.
buy_item_dict = buys.groupby('session_id')['item_id'].apply(list).to_dict()

In [9]:
# Persist the session -> purchased-items map so the dataset's process() can reload it.
with (root / 'buy_item_dict.pkl').open('wb') as sink:
    pickle.dump(buy_item_dict, sink)

## 构造数据集

In [10]:
class YooChooseDatasetNode(InMemoryDataset):
    """In-memory PyG dataset with one click-sequence graph per session.

    For every session in ``clicks_pro_100m.csv``:

    * nodes are the distinct items clicked in the session, numbered by a
      per-session ``LabelEncoder``;
    * ``x`` has shape ``(num_nodes, 1, 2)`` and holds ``[item_id, category]``;
    * ``edge_index`` links each click to the next click of the session;
    * ``y`` is a float vector with 1 for nodes whose item was purchased in the
      session (looked up in ``buy_item_dict.pkl``) and 0 otherwise.

    Both input files are read from the dataset's own ``root`` directory.
    """

    def __init__(self, root, transform=None, pre_transform=None):
        super().__init__(root, transform, pre_transform)
        # NOTE(review): recent PyTorch releases may require weights_only=False
        # to unpickle the collated Data object - confirm for the target version.
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        # Nothing is downloaded; the CSV/pickle inputs are read directly in process().
        return []

    @property
    def processed_file_names(self):
        return ['buy_binary100m.pt']
        # return ['buy_binary.pt']    # for whole dataset

    def download(self):
        pass

    def process(self):
        """Build one ``Data`` graph per session and save the collated result."""
        # Use the dataset's own root rather than a notebook-level global so the
        # class keeps working when instantiated with a different directory.
        data_dir = Path(self.root)

        # clicks = pd.read_csv(data_dir / 'clicks_pro.csv', encoding='utf-8', low_memory=False)  # full dataset
        clicks = pd.read_csv(data_dir / 'clicks_pro_100m.csv', encoding='utf-8', low_memory=False)    # subset
        # The pickle is produced by this notebook; do not point this at untrusted files.
        with open(data_dir / 'buy_item_dict.pkl', 'rb') as f:
            buy_item_dict = pickle.load(f)

        num_sessions = clicks.session_id.nunique()
        data_list = []

        for session_id, group in tqdm(clicks.groupby('session_id'), total=num_sessions):
            # Re-encode item ids to 0..n-1 within the session; the code is the node index.
            item_lb = LabelEncoder()
            sess_item_id = item_lb.fit_transform(group.item_id)

            # Exactly one feature row per node. De-duplicating on the node id
            # (not on the (item_id, category) pair) keeps x aligned with
            # edge_index and y even if an item shows up with several categories.
            nodes = (group.assign(sess_item_id=sess_item_id)
                          .drop_duplicates(subset='sess_item_id')
                          .sort_values('sess_item_id'))
            x = torch.tensor(nodes[['item_id', 'category']].values, dtype=torch.long).unsqueeze(1)

            # Consecutive clicks form directed edges (shape (2, 0) for a single click).
            edge_index = torch.tensor(np.stack([sess_item_id[:-1], sess_item_id[1:]]),
                                      dtype=torch.long)

            # Binary node labels: 1 for every clicked item that was also bought.
            label = np.zeros(len(nodes), dtype=np.float32)
            bought = np.asarray(buy_item_dict.get(session_id, []))
            # A purchased item that was never clicked has no node to label, so it
            # is skipped instead of raising inside LabelEncoder.transform.
            bought = bought[np.isin(bought, item_lb.classes_)]
            # classes_ is sorted and unique, so searchsorted yields the node index.
            label[np.searchsorted(item_lb.classes_, bought)] = 1
            y = torch.from_numpy(label)

            data_list.append(Data(x=x, edge_index=edge_index, y=y))

        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

In [11]:
# Shuffle once, then cut into 800k train / 100k validation / remainder test.
dataset = YooChooseDatasetNode(root).shuffle()
len_train = 800000
train_dataset = dataset[:len_train]
val_dataset = dataset[len_train:900000]
test_dataset = dataset[900000:]

In [12]:
# One mini-batch loader per split, all with the same batch size.
batch_size = 1024
train_loader, val_loader, test_loader = (
    DataLoader(split, batch_size=batch_size)
    for split in (train_dataset, val_dataset, test_dataset)
)

In [13]:
# Embedding vocabulary sizes: largest id + 1 for items and for categories.
num_items, num_categories = (clicks[column].max() + 1 for column in ('item_id', 'category'))
num_items, num_categories

(37196, 263)

In [14]:
# Drop the names that the later cells no longer reference, then run a garbage-collection pass.
del clicks, buys, dataset, train_dataset, val_dataset, test_dataset
gc.collect()

7

In [15]:
class NetNode(nn.Module):
    """Session-graph network that scores every clicked item for purchase.

    Item and category ids are embedded and concatenated, then passed through
    three GraphConv + TopKPooling stages. After each stage a max/mean readout
    is taken per graph; the three readouts are summed into one session vector.
    The session vector is projected to ``emb_sz`` and dotted with the item
    embedding of each node of that session; a sigmoid turns the result into a
    per-node probability.

    Args:
        num_items: size of the item embedding table (largest item id + 1).
        num_categories: size of the category embedding table.
        emb_sz: embedding width for both tables.
        p: dropout probability applied to the projected session vector.
    """

    def __init__(self, num_items, num_categories, emb_sz=128, p=0.5):
        super().__init__()

        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=emb_sz)
        self.category_embedding = nn.Embedding(num_embeddings=num_categories, embedding_dim=emb_sz)

        self.conv1 = GraphConv(emb_sz * 2, 128)
        self.pool1 = TopKPooling(128, ratio=0.9)

        self.conv2 = GraphConv(128, 128)
        self.pool2 = TopKPooling(128, ratio=0.9)

        self.conv3 = GraphConv(128, 128)
        self.pool3 = TopKPooling(128, ratio=0.9)

        # Readouts are [max || mean] of 128-wide node states, hence 256 inputs.
        self.fc1 = nn.Sequential(nn.Linear(256, 256), nn.ReLU())
        # Project to emb_sz (not a fixed 128) so the session vector can always
        # be dotted with the item embeddings, whatever emb_sz is chosen.
        self.fc2 = nn.Linear(256, emb_sz)

        self.drop = p
        self.ac = nn.ReLU()

    def forward(self, data):
        """Return one purchase probability per node of the batch, in node order."""
        x, edge_index, batch = data.x, data.edge_index, data.batch
        # Graph assignment of the *original* nodes; `batch` shrinks with pooling.
        node_batch = data.batch

        # x is indexed as (num_nodes, 1, 2): last axis = [item_id, category].
        item_id, category = x[:, :, 0], x[:, :, 1]

        emb_item = self.item_embedding(item_id).squeeze(1)
        emb_category = self.category_embedding(category).squeeze(1)

        x = torch.cat([emb_item, emb_category], dim=1)

        session = None
        for conv, pool in ((self.conv1, self.pool1),
                           (self.conv2, self.pool2),
                           (self.conv3, self.pool3)):
            x = F.relu(conv(x, edge_index))
            x, edge_index, _, batch, *_ = pool(x, edge_index, None, batch)
            readout = torch.cat([global_max_pool(x, batch), global_mean_pool(x, batch)], dim=1)
            session = readout if session is None else session + readout

        session = self.fc1(session)
        session = self.fc2(session)
        session = F.dropout(session, p=self.drop, training=self.training)
        session = self.ac(session)

        # Dot every node's item embedding with the vector of its own session.
        # A single gather + row-wise dot replaces the per-graph Python loop that
        # re-scanned the whole batch vector with a boolean mask for each graph.
        scores = (emb_item * session[node_batch]).sum(dim=1)
        return torch.sigmoid(scores)

- 训练与评估

In [16]:
def train():
    """Run one optimisation pass over ``train_loader`` and return the epoch loss.

    Uses the notebook-level ``model``, ``optimizer``, ``criterion``, ``device``
    and ``len_train``. Each batch loss is weighted by the number of graphs in
    the batch and the total is divided by ``len_train``.
    """
    model.train()

    running_loss = 0
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()

        scores = model(batch)
        targets = batch.y.to(device)

        batch_loss = criterion(scores, targets)
        batch_loss.backward()
        running_loss += batch.num_graphs * batch_loss.item()
        optimizer.step()

    return running_loss / len_train

In [17]:
@torch.no_grad()
def evaluate(loader):
    """Return the ROC AUC of the notebook-level ``model`` over all nodes in ``loader``."""
    model.eval()

    scores, targets = [], []
    for batch in loader:
        batch = batch.to(device)
        scores.append(model(batch).detach().cpu().numpy())
        targets.append(batch.y.detach().cpu().numpy())

    # Flatten the per-batch vectors into one score/label vector each.
    return roc_auc_score(np.hstack(targets), np.hstack(scores))

In [18]:
# The device string must not contain whitespace: 'cuda:0', not 'cuda: 0'.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Size the embedding tables from the values computed from the data instead of
# hard-coded numbers; int() turns the NumPy scalars into plain Python ints.
model = NetNode(int(num_items), int(num_categories)).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
# The model already applies a sigmoid, so plain BCE (not BCE-with-logits) is used.
criterion = nn.BCELoss()

In [19]:
# Train for 10 epochs; after each one report the loss and the ROC AUC on every split.
# (The *_acc names hold AUC values, as the printed labels state.)
for epoch in range(10):
    loss = train()
    train_acc, val_acc, test_acc = map(evaluate, (train_loader, val_loader, test_loader))
    print('Epoch: {:03d}, Loss: {:.5f}, Train Auc: {:.5f}, Val Auc: {:.5f}, Test Auc: {:.5f}'.format(
        epoch, loss, train_acc, val_acc, test_acc))

Epoch: 000, Loss: 0.36615, Train Auc: 0.66282, Val Auc: 0.65518, Test Auc: 0.65176
Epoch: 001, Loss: 0.25901, Train Auc: 0.72887, Val Auc: 0.70953, Test Auc: 0.70834
Epoch: 002, Loss: 0.23043, Train Auc: 0.76399, Val Auc: 0.73564, Test Auc: 0.73676
Epoch: 003, Loss: 0.21250, Train Auc: 0.79811, Val Auc: 0.75792, Test Auc: 0.75902
Epoch: 004, Loss: 0.19966, Train Auc: 0.81499, Val Auc: 0.76711, Test Auc: 0.76695
Epoch: 005, Loss: 0.19048, Train Auc: 0.83226, Val Auc: 0.77382, Test Auc: 0.77470
Epoch: 006, Loss: 0.18290, Train Auc: 0.84464, Val Auc: 0.77752, Test Auc: 0.77727
Epoch: 007, Loss: 0.17586, Train Auc: 0.85812, Val Auc: 0.77747, Test Auc: 0.77786
Epoch: 008, Loss: 0.17064, Train Auc: 0.86737, Val Auc: 0.77547, Test Auc: 0.77624
Epoch: 009, Loss: 0.16548, Train Auc: 0.87729, Val Auc: 0.77351, Test Auc: 0.77247
