In [1]:
%%capture
!pip install torch-geometric

In [142]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm.auto import trange
from catboost import CatBoostClassifier,Pool, cv
from sklearn.decomposition import TruncatedSVD
from tqdm.auto import tqdm
from sklearn.metrics import f1_score, accuracy_score
import networkx as nx
import torch

In [143]:
from torch_geometric.datasets import TUDataset

# Load the IMDB-BINARY dataset through PyTorch Geometric's TUDataset,
# using the current directory as the dataset root.
DATASET_ROOT = "."
DATASET_NAME = "IMDB-BINARY"

dataset = TUDataset(root=DATASET_ROOT, name=DATASET_NAME)

# Simple CatBoost

In [144]:
def create_data(dataset):
    """Convert an iterable of graph samples into a two-column DataFrame.

    Parameters
    ----------
    dataset : iterable
        Each item must expose ``y`` (a single-element tensor read with
        ``.item()``) and ``edge_index`` (a 2-D tensor that is transposed and
        converted to a nested list).

    Returns
    -------
    pandas.DataFrame
        One row per sample with columns ``'label'`` (the scalar from ``y``)
        and ``'graph'`` (the transposed ``edge_index`` as a list of pairs).
    """
    labels = []
    edge_lists = []
    # Single pass: every sample is read once, and one-shot iterables
    # (generators) work as well as re-iterable datasets.
    for sample in dataset:
        labels.append(sample.y.item())
        edge_lists.append(sample.edge_index.T.tolist())
    return pd.DataFrame({'label': labels, 'graph': edge_lists})

# Build the label / edge-list frame from the loaded dataset and display it.
data = create_data(dataset)
data

Unnamed: 0,label,graph
0,0,"[[0, 2], [0, 4], [0, 5], [0, 9], [0, 10], [1, ..."
1,0,"[[0, 6], [0, 7], [0, 14], [0, 23], [1, 2], [1,..."
2,0,"[[0, 2], [0, 3], [0, 5], [0, 6], [0, 7], [0, 1..."
3,0,"[[0, 2], [0, 8], [0, 9], [0, 15], [0, 17], [0,..."
4,0,"[[0, 1], [0, 2], [0, 3], [0, 4], [0, 5], [0, 6..."
...,...,...
995,1,"[[0, 1], [0, 2], [0, 3], [0, 7], [0, 8], [0, 1..."
996,1,"[[0, 1], [0, 2], [0, 3], [0, 4], [0, 5], [0, 6..."
997,1,"[[0, 1], [0, 2], [0, 3], [0, 4], [0, 5], [0, 6..."
998,1,"[[0, 1], [0, 20], [0, 21], [0, 29], [0, 30], [..."


In [145]:
def make_features(data):
    """Append hand-crafted structural features to a frame of edge lists.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'graph'`` column whose entries are non-empty edge
        lists (sequences of ``[u, v]`` pairs).

    Returns
    -------
    pandas.DataFrame
        ``data`` with the feature columns concatenated on the right:

        * ``num_edges``  - length of the edge list;
        * ``num_nodes``  - number of distinct node ids in the edge list;
        * ``designity``  - ``num_edges / num_nodes / (num_nodes - 1)``;
        * ``{prefix}_{sum,mean,max,min,std}`` for the four prefixes
          ``neighbours``, ``neighbours_per``, ``neighbours_per_sq`` and
          ``neighbours_per_qr``: statistics over the per-node row sums of
          ``A``, ``A.A^T``, ``A.A^T.A^T`` and ``A.A^T.A^T.A^T``, where ``A``
          is the adjacency matrix of the undirected ``nx.Graph`` built from
          the edge list.
    """
    feature_df = pd.DataFrame()
    feature_df['num_edges'] = data['graph'].apply(len)
    feature_df['num_nodes'] = data['graph'].apply(lambda x: len(np.unique(np.concatenate(x))))
    # Column name kept verbatim (sic): it is part of the model's feature names.
    feature_df['designity'] = feature_df['num_edges'] / feature_df['num_nodes'] / (feature_df['num_nodes'] - 1)

    prefixes = ('neighbours', 'neighbours_per', 'neighbours_per_sq', 'neighbours_per_qr')
    stats = (('sum', np.sum), ('mean', np.mean), ('max', np.max), ('min', np.min), ('std', np.std))

    # Row sums of A.(A^T)^k equal A @ ((A^T)^k @ 1), so a chain of
    # matrix-vector products gives the same values as forming the matrix
    # powers, without recomputing A.A^T for every higher order.
    row_sums = {prefix: [] for prefix in prefixes}
    for edges in tqdm(data['graph']):
        adj = nx.adjacency_matrix(nx.Graph(edges))
        vec = np.ones(adj.shape[0], dtype=adj.dtype)
        for prefix in prefixes:
            row_sums[prefix].append(adj @ vec)
            vec = adj.T @ vec

    # Same column names, in the same order, as the hand-written assignments.
    for prefix in prefixes:
        for stat_name, stat_fn in stats:
            feature_df[f'{prefix}_{stat_name}'] = [stat_fn(v) for v in row_sums[prefix]]

    return pd.concat([data, feature_df], axis=1)

# Rebinds `data` to the frame with the feature columns appended. Not
# idempotent: make_features concatenates its columns onto its input, so
# running this cell again on the already-augmented frame would append them twice.
data = make_features(data)

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

In [146]:
# Drop the raw edge lists so only the label and numeric features remain.
# errors='ignore' keeps the cell safe to re-run once 'graph' is already gone.
data = data.drop(columns=['graph'], errors='ignore')

In [147]:
# Two-stage split with the same seed: 65% train / 35% hold-out, then the
# hold-out is divided 70% / 30% into the test and eval sets.
train_df, holdout_df = train_test_split(
    data, test_size=0.35, random_state=56, shuffle=True
)
test_df, eval_df = train_test_split(
    holdout_df, test_size=0.3, random_state=56, shuffle=True
)

In [148]:
def _make_pool(frame):
    """Wrap a frame in a CatBoost Pool: 'label' is the target, the rest are features."""
    return Pool(data=frame.drop(columns=['label']), label=frame['label'])


train_pool = _make_pool(train_df)
eval_pool = _make_pool(eval_df)
test_pool = _make_pool(test_df)

In [149]:
# Hyper-parameters for the baseline CatBoost classifier.
params = dict(
    iterations=1000,
    loss_function='CrossEntropy',
    learning_rate=0.05,
    max_depth=4,
    eval_metric='Accuracy',
    random_seed=56,
)

# Fit on the train pool, tracking the metric on the eval pool; log every 100 iterations.
cbm = CatBoostClassifier(**params)
cbm.fit(train_pool, eval_set=eval_pool, verbose=100)

0:	learn: 0.6969231	test: 0.6952381	best: 0.6952381 (0)	total: 2.77ms	remaining: 2.77s
100:	learn: 0.7461538	test: 0.7047619	best: 0.7142857 (97)	total: 187ms	remaining: 1.66s
200:	learn: 0.8123077	test: 0.7238095	best: 0.7238095 (111)	total: 366ms	remaining: 1.46s
300:	learn: 0.8661538	test: 0.7333333	best: 0.7428571 (254)	total: 550ms	remaining: 1.28s
400:	learn: 0.8784615	test: 0.7142857	best: 0.7428571 (254)	total: 734ms	remaining: 1.1s
500:	learn: 0.8830769	test: 0.7142857	best: 0.7428571 (254)	total: 921ms	remaining: 917ms
600:	learn: 0.8876923	test: 0.7142857	best: 0.7428571 (254)	total: 1.1s	remaining: 733ms
700:	learn: 0.8876923	test: 0.7333333	best: 0.7428571 (254)	total: 1.28s	remaining: 548ms
800:	learn: 0.8876923	test: 0.7428571	best: 0.7428571 (254)	total: 1.47s	remaining: 365ms
900:	learn: 0.8876923	test: 0.7333333	best: 0.7428571 (254)	total: 1.65s	remaining: 181ms
999:	learn: 0.8876923	test: 0.7333333	best: 0.7428571 (254)	total: 1.83s	remaining: 0us

bestTest = 0.7428

<catboost.core.CatBoostClassifier at 0x79a8b17bd300>

In [150]:
# Show the fitted model's feature importances as a table of feature ids and scores.
cbm.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,neighbours_mean,9.372838
1,neighbours_per_qr_sum,7.846107
2,neighbours_per_sq_mean,7.670496
3,neighbours_per_mean,6.744232
4,neighbours_per_std,5.864179
5,designity,5.292018
6,neighbours_per_qr_mean,5.275187
7,neighbours_per_min,5.144845
8,neighbours_per_sq_std,4.65434
9,neighbours_per_qr_min,4.639359


In [151]:
# Accuracy on the test split, which was used neither for fitting nor as the eval set.
y_p = cbm.predict(test_pool)
accuracy_score(test_df['label'],y_p)

0.746938775510204