In [1]:
!pip install torch
import torch
print(torch.__version__)

!pip install pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
1.12.1+cu113
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Change torch.__version__ of following installation command.


In [2]:
!pip install torch-geometric torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-{torch.__version__}.html

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.pyg.org/whl/torch-1.12.1+cu113.html


Preprocess data

In [3]:
import pandas as pd

# Load files. (You should change file name according to your storage path)
clicks_df = pd.read_csv('clicks.dat', header=None)
clicks_df.columns=['session_id','timestamp','item_id','category']
print(clicks_df.head())
 
buy_df = pd.read_csv('transaction.dat', header=None)
buy_df.columns=['session_id','timestamp','item_id','price','quantity']
print(buy_df.head())


  exec(code_obj, self.user_global_ns, self.user_ns)


   session_id                 timestamp      item_id category
0           1  2014-04-07T10:51:09.277Z  214536502.0        0
1           1  2014-04-07T10:54:09.868Z  214536500.0        0
2           1  2014-04-07T10:54:46.998Z  214536506.0        0
3           1  2014-04-07T10:57:00.306Z  214577561.0        0
4           2  2014-04-07T13:56:37.614Z  214662742.0        0
   session_id                 timestamp    item_id  price  quantity
0      420374  2014-04-06T18:44:58.314Z  214537888  12462         1
1      420374  2014-04-06T18:44:58.325Z  214537850  10471         1
2      281626  2014-04-06T09:40:13.032Z  214535653   1883         1
3      420368  2014-04-04T06:13:28.848Z  214530572   6073         1
4      420368  2014-04-04T06:13:28.858Z  214835025   2617         1


Sample data: Randomly sample 100K sessions (due to huge file size)

In [4]:
import numpy as np
sampled_session_id = np.random.choice(clicks_df.session_id.unique(), 100000, replace=False)
clicks_df = clicks_df.loc[clicks_df.session_id.isin(sampled_session_id)]
clicks_df.nunique()

session_id    100000
timestamp     360627
item_id        19809
category         128
dtype: int64

Encode labels with values between 0 to (num_classes - 1)

In [5]:
from sklearn.preprocessing import LabelEncoder
item_encoder = LabelEncoder()
category_encoder = LabelEncoder()
clicks_df['item_id']=item_encoder.fit_transform(clicks_df.item_id)
clicks_df['category']=category_encoder.fit_transform(clicks_df.category.apply(str))
clicks_df.head()
 
buy_df=buy_df.loc[buy_df.session_id.isin(clicks_df.session_id)]
buy_df['item_id']=item_encoder.transform(buy_df.item_id)
buy_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Unnamed: 0,session_id,timestamp,item_id,price,quantity
17,489756,2014-04-05T16:51:59.947Z,12793,6177,1
84,281719,2014-04-04T17:04:56.685Z,12812,2408,2
135,210344,2014-04-03T17:02:23.016Z,15297,2408,2
136,210344,2014-04-03T17:02:23.078Z,6202,6282,1
461,490243,2014-04-06T08:07:06.180Z,14561,523,1


Create session_id - Item dictionary (Who bought which item?)

In [6]:
buy_item_dict = dict(buy_df.groupby('session_id')['item_id'].apply(list))

Define custom dataset class for train GNN
Treat each item in a session as node → All items in the same session form a graph
Items will be connected by edge.


In [7]:
from torch_geometric.data import InMemoryDataset
from torch_geometric.data import Data

from tqdm import tqdm
class CustomDataset(InMemoryDataset):
   def __init__(self, root, transform=None, pre_transform=None):
       super(CustomDataset, self).__init__(root, transform, pre_transform)
       self.data, self.slices = torch.load(self.processed_paths[0])
 
   @property
   def raw_file_names(self):
       return []
 
   @property
   def processed_file_names(self):
       return ['save.dataset']
 
   def download(self):
       pass
  
   def process(self):
      
       data_list = []
       # process by session_id
       grouped = clicks_df.groupby('session_id')
       for session_id, group in tqdm(grouped):
           le = LabelEncoder()
           sess_item_id = le.fit_transform(group.item_id)
           group = group.reset_index(drop=True)
           group['sess_item_id'] = sess_item_id
           node_features = group.loc[group.session_id==session_id,['sess_item_id','item_id','category']].sort_values('sess_item_id')[['item_id','category']].drop_duplicates().values
           node_features = torch.LongTensor(node_features).unsqueeze(1)
           target_nodes = group.sess_item_id.values[1:]
           source_nodes = group.sess_item_id.values[:-1]
           edge_index = torch.tensor([source_nodes,
                                  target_nodes], dtype=torch.long)
           x = node_features
 
           if session_id in buy_item_dict:
               positive_indices = le.transform(buy_item_dict[session_id])
               label = np.zeros(len(node_features))
               label[positive_indices] = 1
           else:
               label = [0] * len(node_features)
 
           y = torch.FloatTensor(label)
           data = Data(x=x, edge_index=edge_index, y=y)
 
           data_list.append(data)
      
       data, slices = self.collate(data_list)
       torch.save((data, slices), self.processed_paths[0])
 
dataset = CustomDataset('../')


Model

In [8]:
import torch
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool

num_items = clicks_df.item_id.max() +1
num_categories = clicks_df.category.max()+1
embed_dim = 128

class Net(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.item_embedding = torch.nn.Embedding(num_embeddings=num_items, embedding_dim=embed_dim)
        self.category_embedding = torch.nn.Embedding(num_embeddings=num_categories, embedding_dim=embed_dim)        
        self.conv1 = GCNConv(embed_dim * 2, embed_dim) # You can use other graph convolution layer
        self.lin1 = Linear(embed_dim, 1) # You can multiple linear layers
        self.act1 = torch.nn.ReLU()

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch  
        item_id = x[:,:,0]
        category = x[:,:,1]

        # Use item embedding & category embedding to solve the problem.
        emb_item = self.item_embedding(item_id).squeeze(1)
        emb_category = self.category_embedding(category).squeeze(1)
        x = torch.cat([emb_item, emb_category], dim=1)

        ### You can add layers or alter the model structure. See geometric documents which layer or model you can use.
        x = self.conv1(x, edge_index) # x.shape = (number of edges in the batch of graphs, representation dimension)
       


        ### Prediction score for each edge, range 0~1
        x = torch.sigmoid(self.lin1(x)).squeeze(1)
        
        return x  

Dataloader

In [9]:
from torch_geometric.loader import DataLoader

In [10]:
### add train, val, test loader.
training_dataset = dataset[:80000]
val_test_dataset = dataset[80000:]
val_dataset = val_test_dataset[:10000]
test_dataset = val_test_dataset[10000:]

In [11]:
train_loader = DataLoader(training_dataset, batch_size=640, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=640, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=640, shuffle=True)

In [12]:
for batch in train_loader:
  print(batch)
  print(batch.num_graphs)

DataBatch(x=[1912, 1, 2], edge_index=[2, 1827], y=[1912], batch=[1912], ptr=[641])
640
DataBatch(x=[2137, 1, 2], edge_index=[2, 2087], y=[2137], batch=[2137], ptr=[641])
640
DataBatch(x=[1905, 1, 2], edge_index=[2, 1919], y=[1905], batch=[1905], ptr=[641])
640
DataBatch(x=[1826, 1, 2], edge_index=[2, 1846], y=[1826], batch=[1826], ptr=[641])
640
DataBatch(x=[1946, 1, 2], edge_index=[2, 1968], y=[1946], batch=[1946], ptr=[641])
640
DataBatch(x=[1948, 1, 2], edge_index=[2, 1920], y=[1948], batch=[1948], ptr=[641])
640
DataBatch(x=[1852, 1, 2], edge_index=[2, 1838], y=[1852], batch=[1852], ptr=[641])
640
DataBatch(x=[1867, 1, 2], edge_index=[2, 1771], y=[1867], batch=[1867], ptr=[641])
640
DataBatch(x=[1908, 1, 2], edge_index=[2, 1816], y=[1908], batch=[1908], ptr=[641])
640
DataBatch(x=[1925, 1, 2], edge_index=[2, 1922], y=[1925], batch=[1925], ptr=[641])
640
DataBatch(x=[1799, 1, 2], edge_index=[2, 1757], y=[1799], batch=[1799], ptr=[641])
640
DataBatch(x=[2030, 1, 2], edge_index=[2, 20

Train

In [13]:
def train():
    model.train()
    loss_all = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data)

        label = data.y.to(device)
        loss = torch.nn.BCELoss()(output, label)
        loss.backward()
        loss_all += data.num_graphs * loss.item()
        optimizer.step()
    return loss_all / len(training_dataset)

In [14]:
def val():
    model.eval()
    loss_all = 0
    for data in val_loader:
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data)

        label = data.y.to(device)
        loss = torch.nn.BCELoss()(output, label)
        loss.backward()
        loss_all += data.num_graphs * loss.item()
        optimizer.step()
    return loss_all / len(training_dataset)

In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

In [16]:
print(device)
print(model)
print(optimizer)

cpu
Net(
  (item_embedding): Embedding(19809, 128)
  (category_embedding): Embedding(127, 128)
  (conv1): GCNConv(256, 128)
  (lin1): Linear(in_features=128, out_features=1, bias=True)
  (act1): ReLU()
)
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    eps: 1e-08
    foreach: None
    lr: 0.01
    maximize: False
    weight_decay: 0.0005
)


Test

In [17]:
from sklearn.metrics import roc_auc_score
def test(loader):
    model.eval()

    predictions = []
    labels = []

    with torch.no_grad():
        for data in loader:

            data = data.to(device)
            pred = model(data).detach().cpu().numpy()

            label = data.y.detach().cpu().numpy()
            predictions.append(pred)
            labels.append(label)

    predictions = np.hstack(predictions)
    labels = np.hstack(labels)

    return roc_auc_score(labels, predictions)

In [18]:
model.train()

number_of_epochs = 20 # You can change.

lowest_val_loss = float('inf')
best_model = None
for epoch in range(number_of_epochs):
    train_loss = train()
    val_loss = val()

    # Choose the lowest validation loss checkpoint (you can implement early stopping as well)
    if val_loss < lowest_val_loss:
      lowest_val_loss = val_loss
      torch.save(model, "./model/model.pt")

model = torch.load("./model/model.pt")
# Load the lowest validation loss checkpoint and check the performance.
test_acc = test(test_loader)
print(test_acc)

0.6501818919851923
