In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2
# Multi-output mode: display the value of every top-level expression in a cell,
# not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 

# GCN 推荐系统
1. [AutuanLiu/PyTorch-Geometric-YooChoose: This is a tutorial for PyTorch Geometric on the YooChoose dataset](https://github.com/AutuanLiu/PyTorch-Geometric-YooChoose)
2. [Hands on Graph Neural Networks with PyTorch & PyTorch Geometric](https://towardsdatascience.com/hands-on-graph-neural-networks-with-pytorch-pytorch-geometric-359487e221a8)
3. [RecSys Challenge 2015 - Challenge](https://2015.recsyschallenge.com/challenge.html?source=post_page)

## Task
给出电商网站的用户点击序列，预测用户是否会购买，如果购买，他会买什么？

1. 用户是否会购买？
2. 用户如果购买，他会买什么？

## Data
### Training
- yoochoose-clicks.dat - 点击记录. 
    - Session ID – 每个 session 可能有一个或者多个点击记录.
    - Timestamp – 时间戳
    - Item ID – item的标识.
    - Category – item的类别.
- yoochoose-buys.dat - 购买记录:
    - Session ID - 每个 session 可能有一个或者多个购买记录.
    - Timestamp - 时间戳.
    - Item ID – item的标识.
    - Price – item的价格.
    - Quantity – item被购买的次数.
- yoochoose-buys.dat 中的 Session ID 一定会出现在 yoochoose-clicks.dat， 具有相同会话ID的记录一起构成会话期间某个用户的单击事件序列， 会话可以很短(几分钟)或很长(几小时)，可以单击一次或数百次，一切都取决于用户的活动。

### Test
- yoochoose-test.dat - 和 yoochoose-clicks.dat 数据的结构相同
    - Session ID
    - Timestamp
    - Item ID
    - Category

### Solution

任务是预测测试文件中的每个会话，这个会话中是否有购买事件，如果有，将购买哪些项。不需要预测数量

- solution.dat
    - Session ID
    - 用逗号分割在这个 session中会购买的 item ID
    - 用 ';' 分割不同的域
- 如果测试文件中存在会话ID，但解决方案文件中不存在，这意味着挑战者预测的会话不会以购买事件结束。

In [2]:
# export
import os, gc
import torch
import pickle
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
np.random.seed(123)

In [3]:
# export
root = Path('../data/yoochoose-data/')

In [4]:
os.listdir(root)

['dataset-README.txt',
 'processed',
 'clicks_pro.csv',
 'yoochoose-clicks.dat',
 'clicks_pro_100m.csv',
 'yoochoose-test.dat',
 'data.h5',
 'clicks_dataset.pkl',
 'yoochoose-buys.dat',
 '.ipynb_checkpoints']

In [5]:
click_columns=['session_id', 'timestamp', 'item_id', 'category']
buy_columns=['session_id', 'timestamp', 'item_id', 'price', 'quantity']

In [6]:
# Load the raw click and purchase logs. The .dat files are read without a header row
# (header=None), so the column names defined above are supplied explicitly.
# low_memory=False asks pandas to parse each file in one pass rather than in chunks;
# presumably chosen because `category` mixes numeric codes with strings such as 'S'.
clicks = pd.read_csv(root/'yoochoose-clicks.dat', header=None, names=click_columns, low_memory=False)
buys = pd.read_csv(root/'yoochoose-buys.dat', header=None, names=buy_columns, low_memory=False)

### 数据

In [7]:
clicks.sample(10)

Unnamed: 0,session_id,timestamp,item_id,category
14350187,4834644,2014-06-15T17:07:12.299Z,214587317,0
7348567,2625326,2014-05-08T13:36:33.640Z,214699633,0
17161170,5660749,2014-07-05T18:13:47.308Z,214849040,6
17090065,5672639,2014-07-06T07:47:33.648Z,214845405,4
27329543,9474893,2014-09-01T10:17:48.382Z,214509013,S
22672083,7551453,2014-08-06T08:54:37.650Z,214567404,S
28748896,10175682,2014-09-07T08:35:58.247Z,214854155,S
21567501,7431274,2014-08-04T17:13:49.795Z,214537967,0
25264004,8933501,2014-08-22T20:53:32.377Z,214848302,S
13336902,4475823,2014-06-12T12:09:26.139Z,214553565,0


In [8]:
clicks.nunique()

session_id     9249729
timestamp     32937845
item_id          52739
category           339
dtype: int64

In [9]:
# with pd.HDFStore(root/'data.h5', 'w') as data:
#     data['clicks'] = clicks
#     data['buys'] = buys

In [10]:
buys.sample(10)

Unnamed: 0,session_id,timestamp,item_id,price,quantity
1129238,11445978,2014-09-27T10:21:30.900Z,214690775,1570,1
557492,5789688,2014-07-04T15:35:54.405Z,214839956,0,0
760398,7615309,2014-08-07T07:25:37.493Z,214587328,0,0
269394,2873916,2014-05-15T09:00:28.363Z,214835165,0,0
742293,8002044,2014-08-06T09:35:38.562Z,214587028,0,0
474160,5157217,2014-06-23T06:47:14.637Z,214839617,0,0
374486,3670649,2014-06-01T12:26:18.565Z,214839984,0,0
604516,6520068,2014-07-16T12:23:20.937Z,214826565,0,0
89424,499127,2014-04-09T08:58:21.114Z,214829336,313,3
393849,4241381,2014-06-09T18:08:43.776Z,214748338,0,0


In [11]:
buys.nunique()

session_id     509696
timestamp     1136477
item_id         19949
price             735
quantity           28
dtype: int64

In [12]:
clicks.category.unique().shape

(339,)

In [13]:
clicks.shape  # roughly 33 million click records

(33003944, 4)

**计算每个session的点击数量，当点击数量大于2 的时候，认为是一个有效的 session，同时删除无效的session**

```
pandas.core.groupby.generic.SeriesGroupBy.size()
Compute group sizes.

Returns:	
Series
Number of rows in each group.
```

In [14]:
# Flag each click row with whether its session has more than 2 click rows:
# groupby(...).size() gives rows per session_id, and .map() broadcasts the
# resulting boolean back onto every row of that session.
clicks['valid_session'] = clicks.session_id.map(clicks.groupby('session_id')['item_id'].size() > 2)

In [15]:
clicks.groupby('valid_session').size()

valid_session
False     8375885
True     24628059
dtype: int64

In [16]:
# Keep only the rows of valid sessions, then drop the helper flag column.
# NOTE: this rebinds `clicks`, so re-running the cell requires `valid_session` to be recomputed first.
clicks = clicks.loc[clicks.valid_session].drop('valid_session',axis=1)
clicks.nunique()

session_id     4431931
timestamp     24590089
item_id          48255
category           330
dtype: int64

In [17]:
# Check for missing values: number of NaN entries per column.
clicks.isna().sum()

session_id    0
timestamp     0
item_id       0
category      0
dtype: int64

In [18]:
# Average number of click records per session.
clicks.groupby('session_id')['item_id'].size().mean()

5.556959032078794

- item id 要做 embedding ，其属于稀疏变量，我们对其进行编码

In [19]:
# Replace the raw item ids with contiguous integer codes (0 .. n_items - 1) so they can
# later be used as indices into an embedding table. The fitted encoder is kept in
# `item_encoder` so codes can be mapped back to the original ids.
item_encoder = LabelEncoder()
clicks['item_id'] = item_encoder.fit_transform(clicks['item_id'])

In [20]:
clicks.head()

Unnamed: 0,session_id,timestamp,item_id,category
0,1,2014-04-07T10:51:09.277Z,1909,0
1,1,2014-04-07T10:54:09.868Z,1908,0
2,1,2014-04-07T10:54:46.998Z,1910,0
3,1,2014-04-07T10:57:00.306Z,9038,0
4,2,2014-04-07T13:56:37.614Z,17503,0


- 要确定标签，即给定会话是否有任何购买事件，我们只需检查 yoochoose-clicks.dat 中的 session_id 是否也出现在 yoochoose-buys.dat 中

In [21]:
# Session-level label copied onto every click row: True when the row's session_id
# also appears in the purchase log, False otherwise.
clicks['label'] = clicks.session_id.isin(buys.session_id)

- 统计 session id 的标签

In [22]:
clicks.drop_duplicates('session_id')['label'].mean()

0.08514934009577316

- 上述结果说明 样本存在严重的不平衡问题

保存处理好的数据

In [41]:
clicks.to_csv(root/'clicks_pro.csv', index=False, encoding='utf-8')

In [42]:
gc.collect()

260

In [44]:
clicks.item_id.nunique()

48255

构造小型数据集

In [45]:
# Number of sessions to keep in the reduced dataset.
lenx = 1_000_000

# Reload the pre-processed click log that was written to disk earlier.
clicks = pd.read_csv(root / 'clicks_pro.csv', low_memory=False, encoding='utf-8')

# Draw `lenx` distinct session ids (uses NumPy's global RNG state),
# then keep every click row that belongs to one of the drawn sessions.
samples = np.random.choice(clicks['session_id'].unique(), size=lenx, replace=False)
clicks_pro_100m = clicks[clicks['session_id'].isin(samples)]

# Persist the sample; the row index is not written.
clicks_pro_100m.to_csv(root / 'clicks_pro_100m.csv', encoding='utf-8', index=False)

In [46]:
clicks_pro_100m.nunique()

session_id    1000000
timestamp     5546820
item_id         37494
category          257
label               2
dtype: int64

**为避免内存超出，上述过程运行一次即可， 导入必要的包和变量**

我们将会话中的每个item视为node，因此同一会话中的所有item形成一个graph,即可将每个session id中的每次点击当做一个节点，这个session构成一个graph

### 构造数据集

- 每个 session id 构成一个图
- 每个 session中的序列从 0 开始计数当做节点
- 每个 session id 中的 item id 要重新进行编码（节点）,用于确定顺序，节点的特征使用 item id。不使用 category 作为节点特征的原因是：
    category 对应的是节点的组别信息，多个 item id 可能属于同一个类别，无法区分具体的 item；而我们的目标是对整个图（即 session）进行分类，判断该 session 是否包含购买
- 图的流向是从前一个节点流向后一个节点的链表形式连接  0-->1-->2-->3-->4
- 这里也可以从 Dataset 继承，构造更大的数据集，但是要事先对 clicks 进行分块处理

In [5]:
# export
from torch_geometric.data import InMemoryDataset, Data
from tqdm import tqdm

class YooChooseBinaryDataset(InMemoryDataset):
    """Session graphs built from the pre-processed YooChoose click log.

    Every session becomes one graph:

    * nodes -- the distinct items clicked in the session; the single node
      feature is the (globally label-encoded) ``item_id``.
    * edges -- one directed edge per pair of consecutive click rows
      (``click[i] -> click[i + 1]``), in the row order of the CSV.
    * ``y`` -- one float per graph, taken from the session's ``label`` column.

    Args:
        root: Dataset root directory. The CSV named by ``clicks_file`` is read
            from here and the processed tensors are cached below it.
        transform: Forwarded unchanged to ``InMemoryDataset``.
        pre_transform: Forwarded unchanged to ``InMemoryDataset``.
    """

    # CSV (inside ``root``) consumed by ``process``. 'clicks_pro_100m.csv' is the
    # 1,000,000-session sample; point this at 'clicks_pro.csv' to use every session.
    # The cached file name below does not depend on this value, so delete the
    # processed file after changing it.
    clicks_file = 'clicks_pro_100m.csv'

    def __init__(self, root, transform=None, pre_transform=None):
        super().__init__(root, transform, pre_transform)
        # Load the collated tensors written by ``process``.
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        """No raw files are declared, so nothing is checked or downloaded."""
        return []

    @property
    def processed_file_names(self):
        """File name (relative to ``processed_dir``) of the cached dataset.

        Only the bare name is returned: ``processed_paths`` already prefixes
        ``processed_dir``, so returning a full path would duplicate the
        directory for a relative ``root``.
        """
        return ['click_binary.pt']

    def download(self):
        """Nothing to download; the CSV is expected to exist under ``root``."""
        pass

    def process(self):
        """Convert the click CSV into one ``Data`` graph per session and cache it."""
        # Read from this dataset's own root rather than a notebook-level global.
        clicks = pd.read_csv(Path(self.root) / self.clicks_file, encoding='utf-8', low_memory=False)

        sessions = clicks.groupby('session_id')
        data_list = []

        for _, group in tqdm(sessions, total=sessions.ngroups):
            # ``node_items``: sorted distinct item ids of the session (one per node).
            # ``click_nodes``: for every click row, the index of its item in ``node_items``.
            # Repeated views of the same item therefore map onto a single node.
            node_items, click_nodes = np.unique(group['item_id'].to_numpy(), return_inverse=True)
            x = torch.tensor(node_items, dtype=torch.long).unsqueeze(1)

            # Consecutive clicks form the directed edges. Stacking first yields a
            # (2, num_clicks - 1) array, which is also well-formed for a single click.
            edge_index = torch.tensor(np.stack([click_nodes[:-1], click_nodes[1:]]), dtype=torch.long)

            # The label is constant within a session, so the first row is enough.
            y = torch.tensor([float(group['label'].iloc[0])], dtype=torch.float)

            # Each session is one graph.
            data_list.append(Data(x=x, edge_index=edge_index, y=y))

        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

In [None]:
dataset  =  YooChooseBinaryDataset(root=root)

 **为了避免再次处理数据，对上述生成的数据进行保存，方便下次使用**

In [None]:
# Serialize the constructed dataset object so it can be reloaded later without rebuilding.
# NOTE(review): pickle executes arbitrary code on load; only unpickle this file from a trusted location.
with open(root/'clicks_dataset.pkl', 'wb') as f:
    pickle.dump(dataset, f)

分割训练集、测试集、验证集，先随机打乱数据

存在样本不均衡问题，因为大量session没有购买记录，使用 auc作为评估指标

In [None]:
# Shuffle first so that each split is a random sample of the sessions.
dataset = dataset.shuffle()

# 80% / 10% / 10% boundaries derived from the dataset size instead of being hard-coded
# for exactly 1,000,000 graphs (for which they evaluate to 800000 and 900000).
# Integer arithmetic avoids floating-point rounding at the boundaries.
n_graphs = len(dataset)
train_end = n_graphs * 8 // 10
val_end = n_graphs * 9 // 10

train_dataset = dataset[:train_end]
val_dataset = dataset[train_end:val_end]
test_dataset = dataset[val_end:]
len(train_dataset), len(val_dataset), len(test_dataset)

- 构建模型

In [None]:
embed_dim = 128
import torch
import torch.nn.functional as F
from torch import nn
from torch_geometric.nn import (SAGEConv, TopKPooling, global_max_pool, global_mean_pool)


class Net(nn.Module):
    """Graph-level binary classifier for session graphs.

    Item ids are embedded, passed through three SAGEConv + TopKPooling stages,
    and the max/mean readouts of the stages are summed and fed to an MLP head
    ending in a sigmoid.

    Note that the second and third stages reuse the same ``conv2`` module, and
    all three stages reuse the same ``pool`` module, so their weights are shared.

    Args:
        sparse_sz: Number of rows in the item embedding table.
        emb_sz: Embedding dimension (input width of the first convolution).
        p: Dropout probability applied before the output layer.
    """

    def __init__(self, sparse_sz, emb_sz=128, p=0.5):
        super().__init__()
        width = 128  # hidden channel count of the graph layers

        self.item_embedding = nn.Embedding(num_embeddings=sparse_sz, embedding_dim=emb_sz)
        self.conv1 = SAGEConv(emb_sz, width)
        self.pool = TopKPooling(width, ratio=0.8)
        self.conv2 = SAGEConv(width, width)
        # Readout concatenates max and mean pooling, hence 2 * width input features.
        self.fc1 = nn.Sequential(nn.Linear(2 * width, 128), nn.BatchNorm1d(128), nn.ReLU())
        self.fc2 = nn.Sequential(nn.Linear(128, 64), nn.BatchNorm1d(64), nn.ReLU())
        self.drop = nn.Dropout(p)
        self.lin = nn.Linear(64, 1)

    def forward(self, data):
        """Return one sigmoid score per graph in the batch ``data``."""
        edges, graph_ids = data.edge_index, data.batch
        # ``data.x`` carries one item index per node in a trailing axis of size 1;
        # drop that axis after the embedding lookup.
        h = self.item_embedding(data.x).squeeze(1)

        readout_sum = None
        for conv in (self.conv1, self.conv2, self.conv2):
            h = F.relu(conv(h, edges))
            h, edges, _, graph_ids, _ = self.pool(h, edges, None, graph_ids)
            readout = torch.cat(
                [global_max_pool(h, graph_ids), global_mean_pool(h, graph_ids)],
                dim=1,
            )
            # Accumulate left to right: (stage1 + stage2) + stage3.
            readout_sum = readout if readout_sum is None else readout_sum + readout

        hidden = self.drop(self.fc2(self.fc1(readout_sum)))
        return torch.sigmoid(self.lin(hidden)).squeeze(1)

### 训练与评估

[yooChoose_train.py](./yooChoose_train.py)