### Requirements

In [None]:
!pip install arm-mango

In [None]:
!pip install torcheck #### Check for potential issue in models

In [None]:
# check mlflow
!mlflow --version

### GNN
Using hierachical pooling to reduce the graph until it's only one embedding
- Hierachical pooling: reduce the nodes by distribute node to neighbor nodes

In [None]:
!mlflow ui

In [1]:
import pandas as pd

DATA_PATH = "pre-processed/fixtures_full.csv"
data = pd.read_csv(DATA_PATH)
data['result'].value_counts()

2    1760
0    1405
1    1099
Name: result, dtype: int64

In [2]:
import numpy as np
np.where(data.applymap(lambda x: x == ''))

(array([], dtype=int64), array([], dtype=int64))

In [3]:
np.where(pd.isnull(data))

(array([], dtype=int64), array([], dtype=int64))

#### General Version

In [4]:
%%writefile model.py
import torch
import torch.nn.functional as F 
from torch.nn import Sequential, Linear, BatchNorm1d, ReLU
from torch_geometric.nn import TransformerConv, GATConv, TopKPooling, BatchNorm
from torch_geometric.nn import global_mean_pool as gap, global_max_pool as gmp
torch.manual_seed(42)

class GNN(torch.nn.Module):
    def __init__(self, feature_size):
        super(GNN, self).__init__()
        num_classes = 3
        embedding_size = 1024

        # GNN layers
        self.conv1 = GATConv(feature_size, embedding_size, heads=3, dropout=0.3)
        self.head_transform1 = Linear(embedding_size*3, embedding_size)
        self.pool1 = TopKPooling(embedding_size, ratio=0.8)
        self.conv2 = GATConv(embedding_size, embedding_size, heads=3, dropout=0.3)
        self.head_transform2 = Linear(embedding_size*3, embedding_size)
        self.pool2 = TopKPooling(embedding_size, ratio=0.5)
        self.conv3 = GATConv(embedding_size, embedding_size, heads=3, dropout=0.3)
        self.head_transform3 = Linear(embedding_size*3, embedding_size)
        self.pool3 = TopKPooling(embedding_size, ratio=0.2)

        # Linear layers
        self.linear1 = Linear(embedding_size*2, 1024) # 1024 dense neurons
        self.linear2 = Linear(1024, num_classes)   

    def forward(self, x, edge_index, batch_index):
        # def forward(self, x, edge_attr, edge_index, batch_index):
        # First block
        x = self.conv1(x, edge_index)
        x = self.head_transform1(x)

        x, edge_index, edge_attr, batch_index, _, _ = self.pool1(x, 
                                                                 edge_index,
                                                                 None,
                                                                 batch_index)
        x1 = torch.cat([gmp(x, batch_index), gap(x, batch_index)], dim=1)

        # Second block
        x = self.conv2(x, edge_index)
        x = self.head_transform2(x)
        x, edge_index, edge_attr, batch_index, _, _ = self.pool2(x,
                                                                edge_index,
                                                                None,
                                                                batch_index)
        x2 = torch.cat([gmp(x, batch_index), gap(x, batch_index)], dim=1)

        # Third block
        x = self.conv3(x, edge_index)
        x = self.head_transform3(x)
        x, edge_index, edge_attr, batch_index, _, _ = self.pool3(x,
                                                                edge_index,
                                                                None,
                                                                batch_index)
        x3 = torch.cat([gmp(x, batch_index), gap(x, batch_index)], dim=1)

        # Concat pooled vectors
        x = x1 + x2 + x3

        # Output block
        x = self.linear1(x).relu()
        #x = F.dropout(x, p=0.5, training=self.training)
        x = self.linear2(x)

        return x

Overwriting model.py


##### Training

In [5]:
# imports 
import torch 
from torch_geometric.data import DataLoader
from sklearn.metrics import confusion_matrix, f1_score, \
    accuracy_score, precision_score, recall_score, roc_auc_score
import numpy as np
from tqdm import tqdm
from processedDataset import ProcessedDataset
from model import GNN
import mlflow.pytorch
import matplotlib.pyplot as plt 
import seaborn as sns
import pandas as pd 

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


In [6]:
full_dataset = ProcessedDataset(root = "data/", filename = "fixtures_full.csv")

train_size = int(0.8 * len(full_dataset))
test_size = len(full_dataset) - train_size
#print("size of train and test: ", train_size, test_size)
train_dataset, test_dataset = torch.utils.data.random_split(full_dataset, [train_size, test_size])

# train_dataset = ProcessedDataset(root = "data/", filename = "fixtures_train.csv")
# test_dataset = ProcessedDataset(root = "data/", filename = "fixtures_test.csv", test=True)

In [7]:
model = GNN(feature_size=train_dataset[0].x.shape[1])
model = model.to(device)

print(f"Number of parameters: {count_parameters(model)}")
model

Number of parameters: 17986563


GNN(
  (conv1): GATConv(40, 1024, heads=3)
  (head_transform1): Linear(in_features=3072, out_features=1024, bias=True)
  (pool1): TopKPooling(1024, ratio=0.8, multiplier=1.0)
  (conv2): GATConv(1024, 1024, heads=3)
  (head_transform2): Linear(in_features=3072, out_features=1024, bias=True)
  (pool2): TopKPooling(1024, ratio=0.5, multiplier=1.0)
  (conv3): GATConv(1024, 1024, heads=3)
  (head_transform3): Linear(in_features=3072, out_features=1024, bias=True)
  (pool3): TopKPooling(1024, ratio=0.2, multiplier=1.0)
  (linear1): Linear(in_features=2048, out_features=1024, bias=True)
  (linear2): Linear(in_features=1024, out_features=3, bias=True)
)

In [8]:
# loss and optimizer, could try some more to see which is better
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9) # or ADAM...etc
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95) #

In [9]:
# prepare training
NUM_GRAPHS_PER_BATCH = 256
train_loader = DataLoader(train_dataset, 
                         batch_size=NUM_GRAPHS_PER_BATCH, shuffle=True)
test_loader = DataLoader(test_dataset, 
                        batch_size=NUM_GRAPHS_PER_BATCH, shuffle=True)

def train(epoch):
    # Enumerate over the data
    all_preds = []
    all_labels = []
    #
    running_loss = 0.0
    step = 0
    for _, batch in enumerate(tqdm(train_loader)):
        # use CPU(or GPU)
        batch.to(device)
        # Reset gradients
        optimizer.zero_grad()
        # passing the node features and the connection info
        pred = model(batch.x.float(), 
                   # batch.edge_attr.float(), 
                    batch.edge_index, 
                    batch.batch)
        # calculating the loss and gradients
        loss = torch.sqrt(loss_fn(pred, batch.y.long()))
        loss.backward()
        # update using the gradients
        optimizer.step()
        
        # 我自己加上general version的，不然loss好像會爆掉
        #Update tracking
        running_loss += loss.item()
        step += 1
        
        
        all_preds.append(np.argmax(pred.cpu().detach().numpy(), axis=1))
        all_labels.append(batch.y.cpu().detach().numpy())
    all_preds = np.concatenate(all_preds).ravel()
    all_labels = np.concatenate(all_labels).ravel()
    calculate_metrics(all_preds, all_labels, epoch, "train")
    #return loss
    return running_loss/step

def test(epoch):
    all_preds = []
    all_labels = []
    #
    running_loss = 0.0
    step = 0
    for batch in test_loader:
        batch.to(device)
        pred = model(batch.x.float(), 
                   # batch.edge_attr.float(), 
                    batch.edge_index, 
                    batch.batch)
        loss = torch.sqrt(loss_fn(pred, batch.y.long()))
        
        # 我自己加上general version的，不然loss好像會爆掉
        #Update tracking
        running_loss += loss.item()
        step += 1
        
        all_preds.append(np.argmax(pred.cpu().detach().numpy(), axis=1))
        all_labels.append(batch.y.cpu().detach().numpy())
        
    all_preds = np.concatenate(all_preds).ravel()
    all_labels = np.concatenate(all_labels).ravel()
    calculate_metrics(all_preds, all_labels, epoch, "test")
    #return loss
    return running_loss/step

def calculate_metrics(y_pred, y_true, epoch, type):
    print(f"\n Confusion matrix: \n {confusion_matrix(y_pred, y_true)}")
    print(f"F1 Score: {f1_score(y_pred, y_true, average='macro')}")
    print(f"Accuracy: {accuracy_score(y_pred, y_true)}")
    print(f"Precision: {precision_score(y_pred, y_true, average='macro')}")
    print(f"Recall: {recall_score(y_pred, y_true, average='macro')}")
    try:
        roc = roc_auc_score(y_pred, y_true)
        print(f"ROC AUC: {roc}")
        mlflow.log_metric(key=f"ROC-AUC-{type}", value=float(roc), step=epoch)
    except:
        mlflow.log_metric(key=f"ROC-AUC-{type}", value=float(0), step=epoch)
        print(f"ROC AUC: notdefined")



In [10]:
# run the training
# Run the training
with mlflow.start_run() as run:
    for epoch in range(500): 
        # Training
        model.train()
        loss = train(epoch=epoch)
        #loss = loss.detach().cpu().numpy() # this somehow triggers error?
        print(f"Epoch {epoch} | Train Loss {loss}")
        mlflow.log_metric(key="Train loss", value=float(loss), step=epoch)

        # Testing
        model.eval()
        if epoch % 5 == 0:
            loss = test(epoch=epoch)
            #loss = loss.detach().cpu().numpy()
            print(f"Epoch {epoch} | Test Loss {loss}")
            mlflow.log_metric(key="Test loss", value=float(loss), step=epoch)

        scheduler.step()

    print("Done.")


100%|██████████| 14/14 [01:01<00:00,  4.43s/it]



 Confusion matrix: 
 [[ 869  661 1046]
 [  97   95  131]
 [ 165  124  223]]
F1 Score: 0.2866816051374102
Accuracy: 0.34799179126355906
Precision: 0.3451956185576875
Recall: 0.3556697475185726
ROC AUC: notdefined
Epoch 0 | Train Loss nan


  _warn_prf(average, modifier, msg_start, len(result))



 Confusion matrix: 
 [[274 219 360]
 [  0   0   0]
 [  0   0   0]]
F1 Score: 0.16208222419402543
Accuracy: 0.3212192262602579
Precision: 0.3333333333333333
Recall: 0.10707307542008597
ROC AUC: notdefined
Epoch 0 | Test Loss nan


100%|██████████| 14/14 [01:00<00:00,  4.32s/it]
  _warn_prf(average, modifier, msg_start, len(result))



 Confusion matrix: 
 [[1131  880 1400]
 [   0    0    0]
 [   0    0    0]]
F1 Score: 0.16600616468516075
Accuracy: 0.33157431838170626
Precision: 0.3333333333333333
Recall: 0.11052477279390209
ROC AUC: notdefined
Epoch 1 | Train Loss nan


100%|██████████| 14/14 [01:00<00:00,  4.32s/it]
  _warn_prf(average, modifier, msg_start, len(result))



 Confusion matrix: 
 [[1131  880 1400]
 [   0    0    0]
 [   0    0    0]]
F1 Score: 0.16600616468516075
Accuracy: 0.33157431838170626
Precision: 0.3333333333333333
Recall: 0.11052477279390209
ROC AUC: notdefined
Epoch 2 | Train Loss nan


100%|██████████| 14/14 [01:00<00:00,  4.32s/it]
  _warn_prf(average, modifier, msg_start, len(result))



 Confusion matrix: 
 [[1131  880 1400]
 [   0    0    0]
 [   0    0    0]]
F1 Score: 0.16600616468516075
Accuracy: 0.33157431838170626
Precision: 0.3333333333333333
Recall: 0.11052477279390209
ROC AUC: notdefined
Epoch 3 | Train Loss nan


100%|██████████| 14/14 [00:57<00:00,  4.12s/it]
  _warn_prf(average, modifier, msg_start, len(result))



 Confusion matrix: 
 [[1131  880 1400]
 [   0    0    0]
 [   0    0    0]]
F1 Score: 0.16600616468516075
Accuracy: 0.33157431838170626
Precision: 0.3333333333333333
Recall: 0.11052477279390209
ROC AUC: notdefined
Epoch 4 | Train Loss nan


100%|██████████| 14/14 [00:59<00:00,  4.28s/it]
  _warn_prf(average, modifier, msg_start, len(result))



 Confusion matrix: 
 [[1131  880 1400]
 [   0    0    0]
 [   0    0    0]]
F1 Score: 0.16600616468516075
Accuracy: 0.33157431838170626
Precision: 0.3333333333333333
Recall: 0.11052477279390209
ROC AUC: notdefined
Epoch 5 | Train Loss nan


  _warn_prf(average, modifier, msg_start, len(result))



 Confusion matrix: 
 [[274 219 360]
 [  0   0   0]
 [  0   0   0]]
F1 Score: 0.16208222419402543
Accuracy: 0.3212192262602579
Precision: 0.3333333333333333
Recall: 0.10707307542008597
ROC AUC: notdefined
Epoch 5 | Test Loss nan


100%|██████████| 14/14 [01:00<00:00,  4.31s/it]
  _warn_prf(average, modifier, msg_start, len(result))



 Confusion matrix: 
 [[1131  880 1400]
 [   0    0    0]
 [   0    0    0]]
F1 Score: 0.16600616468516075
Accuracy: 0.33157431838170626
Precision: 0.3333333333333333
Recall: 0.11052477279390209
ROC AUC: notdefined
Epoch 6 | Train Loss nan


100%|██████████| 14/14 [00:59<00:00,  4.22s/it]
  _warn_prf(average, modifier, msg_start, len(result))



 Confusion matrix: 
 [[1131  880 1400]
 [   0    0    0]
 [   0    0    0]]
F1 Score: 0.16600616468516075
Accuracy: 0.33157431838170626
Precision: 0.3333333333333333
Recall: 0.11052477279390209
ROC AUC: notdefined
Epoch 7 | Train Loss nan


 86%|████████▌ | 12/14 [00:56<00:09,  4.70s/it]


KeyboardInterrupt: 

In [None]:
# Save the model
mlflow.pytorch.log_model(model, "model")

In [None]:
!mlflow ui