In [1]:
%matplotlib inline
# Reload edited local modules automatically before each cell execution.
%reload_ext autoreload
%autoreload 2
# Display the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 

# GCN 推荐系统
1. [AutuanLiu/PyTorch-Geometric-YooChoose: This is a tutorial for PyTorch Geometric on the YooChoose dataset](https://github.com/AutuanLiu/PyTorch-Geometric-YooChoose)
2. [Hands on Graph Neural Networks with PyTorch & PyTorch Geometric](https://towardsdatascience.com/hands-on-graph-neural-networks-with-pytorch-pytorch-geometric-359487e221a8)
3. [RecSys Challenge 2015 - Challenge](https://2015.recsyschallenge.com/challenge.html?source=post_page)

## Task
给出电商网站的用户点击序列，预测用户是否会购买，如果购买，他会买什么？

1. 用户是否会购买？
2. 用户如果购买，他会买什么？

## Data
### Training
- yoochoose-clicks.dat - 点击记录. 
    - Session ID – 每个 session 可能有一个或者多个点击记录.
    - Timestamp – 时间戳
    - Item ID – item的标识.
    - Category – item的类别.
- yoochoose-buys.dat - 购买记录:
    - Session ID - 每个 session 可能有一个或者多个购买记录.
    - Timestamp - 时间戳.
    - Item ID – item的标识.
    - Price – item的价格.
    - Quantity – item被购买的次数.
- yoochoose-buys.dat 中的 Session ID 一定会出现在 yoochoose-clicks.dat， 具有相同会话ID的记录一起构成会话期间某个用户的单击事件序列， 会话可以很短(几分钟)或很长(几小时)，可以单击一次或数百次，一切都取决于用户的活动。

### Test
- yoochoose-test.dat - 和 yoochoose-clicks.dat 数据的结构相同
    - Session ID
    - Timestamp
    - Item ID
    - Category

### Solution

任务是预测测试文件中的每个会话，这个会话中是否有购买事件，如果有，将购买哪些项。不需要预测数量

- solution.dat
    - Session ID
    - 用逗号分割在这个 session中会购买的 item ID
    - 用 ';' 分割不同的域
- 如果测试文件中存在某个会话ID，但解决方案文件中不存在，则表示挑战者预测该会话不会以购买事件结束。

In [2]:
# export
import os, gc
import torch
import pickle
from pathlib import Path
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [3]:
# export
# Directory holding the YooChoose files; the path is relative to the notebook's working directory.
root = Path('../data/yoochoose-data/')

In [4]:
# List what is available under the data directory.
os.listdir(root)

['dataset-README.txt',
 'processed',
 'yoochoose-clicks.dat',
 'yoochoose-test.dat',
 'data.h5',
 'yoochoose-buys.dat',
 '.ipynb_checkpoints']

In [5]:
# Both raw files are header-less and share their first three columns.
click_columns = ['session_id', 'timestamp', 'item_id', 'category']
buy_columns = click_columns[:3] + ['price', 'quantity']

In [6]:
# The raw files carry no header row, so the column names are supplied explicitly.
# low_memory=False parses each file in one pass instead of in chunks.
clicks, buys = (
    pd.read_csv(root / file_name, header=None, names=column_names, low_memory=False)
    for file_name, column_names in (
        ('yoochoose-clicks.dat', click_columns),
        ('yoochoose-buys.dat', buy_columns),
    )
)

### 数据

In [7]:
# Peek at a random sample of click rows (unseeded, so the rows differ between runs).
clicks.sample(10)

Unnamed: 0,session_id,timestamp,item_id,category
5918405,1797391,2014-05-03T03:37:32.665Z,214829855,0
19405979,6414712,2014-07-20T09:57:53.541Z,214845435,3
27034841,9370293,2014-08-30T07:54:35.410Z,214711284,2
368496,121007,2014-04-07T09:58:22.237Z,214549950,0
16680947,5627108,2014-07-01T12:06:23.520Z,214848997,S
18339004,5910052,2014-07-12T10:04:16.332Z,214850612,6
27706363,9626101,2014-08-31T18:14:32.628Z,214854125,S
6732793,2007893,2014-05-01T06:16:04.941Z,214711657,0
15713769,5538089,2014-06-25T09:11:09.462Z,214539527,2
1197480,361061,2014-04-07T08:50:04.474Z,214538951,0


In [8]:
# Optional one-off cache (left disabled): store both raw tables in a single HDF5 file.
# with pd.HDFStore(root/'data.h5', 'w') as data:
#     data['clicks'] = clicks
#     data['buys'] = buys

In [9]:
# Peek at a random sample of purchase rows (unseeded, so the rows differ between runs).
buys.sample(10)

Unnamed: 0,session_id,timestamp,item_id,price,quantity
452115,4842482,2014-06-15T19:12:39.431Z,214839967,0,0
38974,52026,2014-04-02T13:59:48.327Z,214826835,1674,1
74068,519397,2014-04-11T14:14:52.051Z,214837483,523,1
995024,10090984,2014-09-07T07:54:07.303Z,214853700,837,1
459527,4957258,2014-06-20T19:58:18.870Z,214837288,0,0
608319,6486388,2014-07-20T08:49:46.128Z,214836924,0,0
1074418,11193231,2014-09-17T17:33:59.808Z,214854227,3141,1
13842,404642,2014-04-05T11:35:10.381Z,214821277,1046,6
1000893,9772747,2014-09-06T19:35:44.366Z,214828957,5654,1
456558,5095898,2014-06-22T14:34:07.664Z,214819468,0,0


In [10]:
# Number of distinct category values, shown as a 1-tuple.
(len(clicks['category'].unique()),)

(339,)

- item id 要做 embedding ，其属于稀疏变量，我们对其进行编码

In [11]:
# Map the sparse raw item ids onto dense integer codes so they can be embedded later.
# fit_transform learns the mapping and applies it in a single step, instead of a
# separate fit() and transform() over the full click log; `item_encoder` stays
# available for translating codes back to raw ids.
item_encoder = LabelEncoder()
clicks['item_id'] = item_encoder.fit_transform(clicks['item_id'])

LabelEncoder()

In [12]:
# Check the encoded item ids.
clicks.head()

Unnamed: 0,session_id,timestamp,item_id,category
0,1,2014-04-07T10:51:09.277Z,2053,0
1,1,2014-04-07T10:54:09.868Z,2052,0
2,1,2014-04-07T10:54:46.998Z,2054,0
3,1,2014-04-07T10:57:00.306Z,9876,0
4,2,2014-04-07T13:56:37.614Z,19448,0


- 要确定标签，即给定会话是否有任何购买事件，我们只需检查 yoochoose-clicks.dat 中的 session_id 是否也出现在 yoochoose-buys.dat 中

In [13]:
# A session is positive when its id shows up at least once among the purchase records.
clicks['label'] = clicks['session_id'].isin(buys['session_id'])

In [14]:
# Confirm the new label column.
clicks.head()

Unnamed: 0,session_id,timestamp,item_id,category,label
0,1,2014-04-07T10:51:09.277Z,2053,0,False
1,1,2014-04-07T10:54:09.868Z,2052,0,False
2,1,2014-04-07T10:54:46.998Z,2054,0,False
3,1,2014-04-07T10:57:00.306Z,9876,0,False
4,2,2014-04-07T13:56:37.614Z,19448,0,False


保存处理好的数据

In [15]:
# Persist the encoded and labelled clicks; the positional row index is left out of the file.
clicks.to_csv(root / 'clicks_pro.csv', encoding='utf-8', index=False)

In [16]:
# Ask the garbage collector to reclaim unreachable objects now.
# NOTE(review): `clicks` and `buys` are still referenced at this point, so this call
# does not release their memory — confirm whether a `del` was intended first.
gc.collect()

0

**为避免内存超出，上述预处理过程只需运行一次；之后重新开始时，只需导入必要的包并定义必要的变量（如 `root`）即可**

我们将会话中的每个 item 视为 node，因此同一会话中的所有 item 形成一个 graph，即将每个 session 中被点击过的每个 item（重复点击只算一个节点）当做一个节点，整个 session 构成一个 graph

### 构造数据集

- 每个 session id 构成一个图
- 每个 session中的序列从 0 开始计数当做节点
- 每个 session id 中的 item id 要重新进行编码（节点）,用于确定顺序，节点的特征使用 item id，不使用 category 的原因是因为，我们的目标是对节点进行分类，
    category对应的是节点的组别信息，多个item id 可能属于同一个类别
- 图的流向是从前一个节点流向后一个节点的链表形式连接  0-->1-->2-->3-->4

In [5]:
# export
from torch_geometric.data import InMemoryDataset, Data
from tqdm import tqdm

class YooChooseBinaryDataset(InMemoryDataset):
    """Graph dataset with one graph per YooChoose click session.

    Every session found in ``clicks_pro.csv`` (read from the dataset root)
    becomes one ``Data`` object:

    * ``x``: long tensor of shape ``(n_distinct_items, 1)`` holding the
      session's distinct ``item_id`` values in ascending order;
    * ``edge_index``: long tensor of shape ``(2, n_clicks - 1)`` linking each
      click to the following one, expressed in session-local node indices;
    * ``y``: one-element float tensor built from the session's ``label``.
    """

    def __init__(self, root, transform=None, pre_transform=None):
        """Set up the dataset under ``root`` and load the collated tensors.

        Args:
            root: Directory that contains ``clicks_pro.csv``; forwarded to
                ``InMemoryDataset``.
            transform: Forwarded unchanged to ``InMemoryDataset``.
            pre_transform: Forwarded unchanged to ``InMemoryDataset``.
        """
        super().__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        """No raw files are declared; ``process`` reads the CSV directly."""
        return []

    @property
    def processed_file_names(self):
        """File name of the collated dataset, relative to ``processed_dir``.

        Only the bare name is returned: ``processed_paths`` already prefixes it
        with ``processed_dir``, so returning a prefixed path would apply the
        prefix twice whenever the root is a relative path.
        """
        return ['click_binary.pt']

    def download(self):
        """Nothing to download; the input CSV is expected to exist locally."""
        pass

    def process(self):
        """Build one graph per session and save the collated result.

        Reads ``clicks_pro.csv`` from ``self.root`` and writes the
        ``(data, slices)`` pair to ``self.processed_paths[0]``.
        """
        # Read from this dataset's own root rather than from a notebook global,
        # so the class does not depend on a variable defined in another cell.
        clicks = pd.read_csv(Path(self.root) / 'clicks_pro.csv', encoding='utf-8', low_memory=False)

        data_list = []
        for _, group in tqdm(clicks.groupby('session_id')):
            # codes[i] is the session-local node index of the i-th click and
            # distinct_items[k] is the item id carried by node k (ascending),
            # so repeated views of an item collapse into a single node.
            codes, distinct_items = pd.factorize(group['item_id'].values, sort=True)

            # Node feature: the item id itself, one column per node.
            x = torch.tensor(distinct_items, dtype=torch.long).unsqueeze(1)

            # Chain consecutive clicks 0 --> 1 --> 2 ...; a one-click session
            # yields an empty (2, 0) edge index.
            # NOTE(review): relies on the rows of a session being stored in click order — confirm.
            click_nodes = torch.tensor(codes, dtype=torch.long)
            edge_index = torch.stack([click_nodes[:-1], click_nodes[1:]], dim=0)

            # Only the first row's label is read, exactly as before.
            # NOTE(review): assumes `label` is constant within a session — confirm against the CSV.
            y = torch.tensor([float(group['label'].iloc[0])], dtype=torch.float)

            # Each session is one graph.
            data_list.append(Data(x=x, edge_index=edge_index, y=y))

        # Drop the source frame and collect once here; forcing a full garbage
        # collection inside the loop costs one collection per session.
        del clicks
        gc.collect()

        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

In [None]:
# Instantiate the dataset rooted at the data directory.
dataset = YooChooseBinaryDataset(root=root)

 **为了避免再次处理数据，对上述生成的数据进行保存，方便下次使用**

In [None]:
# Serialise the dataset object next to the raw data so it can be reloaded later.
with (root / 'clicks_dataset.pkl').open('wb') as handle:
    pickle.dump(dataset, handle)