In [7]:
%%capture
!pip install torch-geometric

In [111]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm.auto import trange
from catboost import CatBoostClassifier,Pool, cv
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import f1_score, accuracy_score
import networkx as nx
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

In [112]:
import warnings

warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [113]:
from torch_geometric.datasets import Planetoid

# Import dataset from PyTorch Geometric
dataset = Planetoid(root=".", name="CiteSeer")

# Print information about the dataset
print(f'Number of graphs: {len(dataset)}')
print(f'Number of nodes: {dataset[0].x.shape[0]}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')
print(f'Has isolated nodes: {dataset[0].has_isolated_nodes()}')

Number of graphs: 1
Number of nodes: 3327
Number of features: 3703
Number of classes: 6
Has isolated nodes: True


In [114]:
test_mask = dataset.test_mask
val_mask = dataset.val_mask
train_mask = ~(test_mask + val_mask)

In [116]:
train_idxes = torch.arange(0,len(train_mask))[train_mask]
train_gnn_idxes, train_catboost_idxes = train_test_split(train_idxes,test_size=0.9,random_state=56,shuffle=True)

train_gnn_mask = torch.tensor([x in train_gnn_idxes for x in torch.arange(0,len(train_mask))])
train_catboost_mask = torch.tensor([x in train_catboost_idxes for x in torch.arange(0,len(train_mask))])

# GNN

In [117]:
class ModelCFG:
        model ='GCN'
        num_labels = 6
        scheduler= False
        warnap = False
        max_epoches=175
        lr = 3e-4
        eps=1e-6
        betas=(0.9, 0.999)
        drop = 0.1
        depths = [3703,64]
        act = nn.GELU()
        _act = 'gelu'
        weight_decay = 1e-4
        
cfg = ModelCFG()

In [118]:
from torch_geometric.nn import GCNConv, GATConv, GATv2Conv

class GCNModel(nn.Module):
    def __init__(self,CFG):
        super().__init__()
        self.convs = nn.ModuleList()
        self.act = CFG.act
        self.drop = nn.Dropout(CFG.drop)
        for i in range(len(CFG.depths)-1):
            self.convs.append(GCNConv(CFG.depths[i],CFG.depths[i+1]))
        self.head = nn.Linear(CFG.depths[-1],CFG.num_labels)
    
    def forward(self, x, edge_index, return_feats=False):
        x = self.drop(x)
        for conv in self.convs:
            x = conv(x,edge_index)
            x = self.act(x)
        
        if return_feats:
            return x
        x = self.head(x).softmax(dim=-1)
        return x

In [119]:
def train(model, data, criterion, optimizer, epochs):
    

    model.train()
    for epoch in trange(epochs):
        
        pl.seed_everything(56)
        optimizer.zero_grad()
        logits = model(data.x.cuda(), data.edge_index.cuda())
        loss = criterion(logits[train_gnn_mask], data.y[train_gnn_mask].cuda())
        loss.backward()
        optimizer.step()
        
        if epoch % 10 == 0:
            val_acc = accuracy_score(logits[val_mask].argmax(dim=1).cpu().detach().numpy(),
                                     data.y[val_mask].cpu().detach().numpy())
            print(val_acc)
    
    test_acc = accuracy_score(logits[test_mask].argmax(dim=1).cpu().detach().numpy(),
                             data.y[test_mask].cpu().detach().numpy())
    
    print(f'Test Score {test_acc}')
    return val_acc, test_acc

In [120]:
model = GCNModel(cfg).cuda()

optimizer = torch.optim.AdamW(model.parameters(),
                              lr = cfg.lr,
                              eps = cfg.eps,
                              betas = cfg.betas,
                              weight_decay = cfg.weight_decay)

criterion = nn.CrossEntropyLoss()

In [121]:
train(model, dataset, criterion, optimizer, cfg.max_epoches)

  0%|          | 0/175 [00:00<?, ?it/s]

0.172
0.204
0.326
0.512
0.584
0.616
0.636
0.63
0.63
0.632
0.636
0.646
0.656
0.658
0.656
0.662
0.666
0.664
Test Score 0.644


(0.664, 0.644)

In [141]:
logits = model(dataset.x.cuda(), dataset.edge_index.cuda(),False)#[train_catboost_mask].cpu().detach().numpy()

In [142]:
train_gnn_embeds = logits[train_catboost_mask].cpu().detach().numpy()
val_gnn_embeds = logits[val_mask].cpu().detach().numpy()
test_gnn_embeds = logits[test_mask].cpu().detach().numpy()

# CatBoost

In [143]:
train_feats = dataset.x[train_catboost_mask].numpy()
train_label = dataset.y[train_catboost_mask].numpy()

val_feats = dataset.x[val_mask].numpy()
val_label = dataset.y[val_mask].numpy()

test_feats = dataset.x[test_mask].numpy()
test_label = dataset.y[test_mask].numpy()

In [144]:
train_idx = torch.arange(0,len(train_mask))[train_catboost_mask]
val_idx = torch.arange(0,len(val_mask))[val_mask]
test_idx = torch.arange(0,len(test_mask))[test_mask]

In [145]:
train_id2_label = {}

for i,(mask,label) in enumerate(zip(train_mask,dataset.y)):
    if mask:
        train_id2_label[i] = label.item()
    else:
        train_id2_label[i] = -1

In [146]:
graph = {}
used_nodes = []

for node1,node2 in zip(dataset.edge_index[0], dataset.edge_index[1]):
    node1,node2 = node1.item(), node2.item()
    if node1 not in used_nodes:
        graph[node1] = [node2]
        used_nodes.append(node1)
    else:
        graph[node1].append(node2)
        
for i in range(dataset[0].x.shape[0]):
    if i not in used_nodes:
        graph[i] = 'None'

In [147]:
svd_params = {'n_components':64,
              'algorithm':'randomized',
              'n_iter':5,
              'n_oversamples':10,
              'power_iteration_normalizer':'auto',
              'random_state':56}

svd = TruncatedSVD(**svd_params).fit(dataset.x[train_mask].numpy())

train_svd_feats = svd.transform(train_feats)
test_svd_feats = svd.transform(test_feats)
val_svd_feats = svd.transform(val_feats)
all_feats = svd.transform(dataset.x.numpy())

In [148]:
train_svd_feats.shape

(1645, 64)

In [149]:
def calc_feature_aggregation(neighbours,nfeats=64,all_feats=all_feats,agg=np.mean,null_value=-100):
    feats = []
    if neighbours == 'None':
        return [null_value] * nfeats
    for n in neighbours:
        feats.append(all_feats[n])
    return agg(feats,axis=0).tolist()
    

def calc_label_features(neighbours,train_id2_label=train_id2_label,num_classes=6):
    feats = [0] * (num_classes + 2)
    if neighbours == 'None':
        return feats
    for n in neighbours:
        n = train_id2_label[n]
        feats[n+1] += 1
        feats[-1] += 1
        
    for i in range(len(feats)-1):
        feats[i] /= feats[-1]

    return tuple(feats)

def make_df(feats, feats2, idxes, graph, num_classes=6,agg_type='mean'):
    df = pd.DataFrame()
    df['id'] = idxes
    for i in range(feats.shape[1]):
        df[f'svd_feature_{i}'] = feats[:,i]
    
    for i in range(feats2.shape[1]):
        df[f'gnn_feature_{i}'] = feats2[:,i]


    nearest_feature_cols = [f'target_{i}_percent' for i in range(-1,num_classes)] + ['count_neighbours']
    svd_agg_features = [f'svd_{agg_type}_feature_{i}' for i in range(feats.shape[1])] 
    embeds_features = [f'gnn_embeds_feature_{i}' for i in range(cfg.depths[-1])] 
    
    df[nearest_feature_cols] = [calc_label_features(graph[x]) for x in df['id']]
    df[svd_agg_features] = [calc_feature_aggregation(graph[x]) for x in df['id']]
    return df.drop(['id'],axis=1)

train_df = make_df(train_svd_feats, train_gnn_embeds, train_idx, graph)
val_df = make_df(val_svd_feats, val_gnn_embeds, val_idx, graph)
test_df = make_df(test_svd_feats, test_gnn_embeds, test_idx, graph)

In [150]:
svd_features = [f'svd_feature_{i}' for i in range(64)]
nearest_feature_cols = [f'target_{i}_percent' for i in range(-1,6)] + ['count_neighbours']
svd_agg_features = [f'svd_mean_feature_{i}' for i in range(64)] 

## Without Embeds

In [151]:
train_pool = Pool(data=train_df[svd_features+nearest_feature_cols+svd_agg_features],
                  label=train_label)

eval_pool = Pool(data=val_df[svd_features+nearest_feature_cols+svd_agg_features],
                  label=val_label)

test_pool = Pool(data=test_df[svd_features+nearest_feature_cols+svd_agg_features],
                  label=test_label)

In [152]:
params = {'iterations':1000,
          'learning_rate':0.05,
          'loss_function':'MultiClass',
          'max_depth':5,
          'eval_metric':'Accuracy',
          'random_seed':56}

cbm = CatBoostClassifier(**params)
cbm.fit(train_pool,eval_set=eval_pool,verbose=100)

0:	learn: 0.6133739	test: 0.6300000	best: 0.6300000 (0)	total: 55.4ms	remaining: 55.4s
100:	learn: 0.7507599	test: 0.7320000	best: 0.7340000 (92)	total: 3.68s	remaining: 32.8s
200:	learn: 0.8109422	test: 0.7460000	best: 0.7460000 (179)	total: 7.32s	remaining: 29.1s
300:	learn: 0.8553191	test: 0.7560000	best: 0.7560000 (300)	total: 10.9s	remaining: 25.3s
400:	learn: 0.8887538	test: 0.7540000	best: 0.7600000 (376)	total: 14.4s	remaining: 21.6s
500:	learn: 0.9148936	test: 0.7520000	best: 0.7600000 (376)	total: 18s	remaining: 18s
600:	learn: 0.9337386	test: 0.7540000	best: 0.7600000 (376)	total: 21.6s	remaining: 14.3s
700:	learn: 0.9477204	test: 0.7480000	best: 0.7600000 (376)	total: 25.5s	remaining: 10.9s
800:	learn: 0.9604863	test: 0.7520000	best: 0.7600000 (376)	total: 29.2s	remaining: 7.26s
900:	learn: 0.9702128	test: 0.7520000	best: 0.7600000 (376)	total: 32.8s	remaining: 3.6s
999:	learn: 0.9787234	test: 0.7500000	best: 0.7600000 (376)	total: 36.3s	remaining: 0us

bestTest = 0.76
best

<catboost.core.CatBoostClassifier at 0x7b754439b400>

In [153]:
cbm.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,svd_feature_1,11.660475
1,svd_mean_feature_4,6.409880
2,svd_mean_feature_1,6.074300
3,svd_feature_3,5.851792
4,svd_feature_4,5.161317
...,...,...
131,svd_feature_39,0.062355
132,svd_feature_59,0.051130
133,svd_mean_feature_57,0.044834
134,svd_feature_46,0.038934


In [154]:
y_p = cbm.predict(test_pool)
accuracy_score(test_label,y_p)

0.786

## With Embeds

In [155]:
train_pool = Pool(data=train_df,
                  label=train_label)

eval_pool = Pool(data=val_df,
                  label=val_label)

test_pool = Pool(data=test_df,
                  label=test_label)

In [156]:
params = {'iterations':1000,
          'learning_rate':0.05,
          'loss_function':'MultiClass',
          'max_depth':5,
          'eval_metric':'Accuracy',
          'random_seed':56}

cbm = CatBoostClassifier(**params)
cbm.fit(train_pool,eval_set=eval_pool,verbose=100)

0:	learn: 0.6151976	test: 0.6560000	best: 0.6560000 (0)	total: 61ms	remaining: 1m
100:	learn: 0.7659574	test: 0.7680000	best: 0.7700000 (95)	total: 3.88s	remaining: 34.5s
200:	learn: 0.8127660	test: 0.7780000	best: 0.7800000 (155)	total: 7.68s	remaining: 30.5s
300:	learn: 0.8480243	test: 0.7800000	best: 0.7800000 (155)	total: 11.4s	remaining: 26.6s
400:	learn: 0.8778116	test: 0.7740000	best: 0.7820000 (302)	total: 15.2s	remaining: 22.7s
500:	learn: 0.9063830	test: 0.7680000	best: 0.7820000 (302)	total: 18.9s	remaining: 18.8s
600:	learn: 0.9264438	test: 0.7720000	best: 0.7820000 (302)	total: 23s	remaining: 15.3s
700:	learn: 0.9428571	test: 0.7740000	best: 0.7820000 (302)	total: 26.7s	remaining: 11.4s
800:	learn: 0.9604863	test: 0.7760000	best: 0.7820000 (302)	total: 30.5s	remaining: 7.58s
900:	learn: 0.9689970	test: 0.7780000	best: 0.7820000 (302)	total: 34.3s	remaining: 3.77s
999:	learn: 0.9762918	test: 0.7820000	best: 0.7820000 (302)	total: 38s	remaining: 0us

bestTest = 0.782
bestIte

<catboost.core.CatBoostClassifier at 0x7b7572dec6d0>

In [159]:
cbm.get_feature_importance(prettified=True)[:20]

Unnamed: 0,Feature Id,Importances
0,svd_feature_1,9.126613
1,gnn_feature_3,6.370174
2,gnn_feature_1,5.374682
3,gnn_feature_5,5.004287
4,svd_feature_3,4.938674
5,gnn_feature_4,4.368504
6,target_2_percent,4.355
7,svd_mean_feature_1,3.97641
8,svd_feature_4,3.871679
9,svd_mean_feature_4,3.239949


In [160]:
y_p = cbm.predict(test_pool)
accuracy_score(test_label,y_p)

0.77