In [1]:
#%load_ext autoreload
#%autoreload 1
#%aimport graph_description
import networkx as nx
import pysubgroup as ps
import numpy as np
import pandas as pd
# Do not truncate long cell contents when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

In [2]:
from graph_description.datasets import nx_read_attributed_graph
from graph_description.utils import prune_sparse_selectors
from graph_description.networkx_aggregation import SumAggregator, MeanAggregator, apply_aggregator    

In [3]:
# Load the attributed "pubmed" graph together with its node-attribute frame,
# then report the size of the graph.
G, df = nx_read_attributed_graph("pubmed")
edge_count = G.number_of_edges()
node_count = G.number_of_nodes()
print("n_edges=", edge_count, "n_nodes", node_count)

n_edges= 88648 n_nodes 19717


In [4]:
# Build one selector per attribute value (the 'label' column is excluded),
# then keep only the selectors whose string form does not contain "==0".
all_selectors = ps.create_selectors(df, ignore=['label'])
searchspace = [
    selector
    for selector in all_selectors
    if "==0" not in str(selector)
]

In [5]:
# Number of selectors that remain after the "==0" filter.
len(searchspace)

2500

In [2]:
# NOTE(review): range(0) is empty, so the body below never runs and nothing is
# printed. This looks like a leftover scratch cell (its execution count is also
# out of sequence) — consider deleting it.
for i in range(0):
    print(i)

In [6]:
%%time
# do the actual propagation
# NOTE(review): `apply_aggregator` is project code that is not visible here;
# presumably each call aggregates node attributes over the edges of G — confirm.
# First pass: SumAggregator on `df`, with the selector list `searchspace`.
df1 = apply_aggregator(SumAggregator, df, G, searchspace)
print("A")  # progress marker between the two passes
# Second pass: SumAggregator and MeanAggregator on the first pass's result
# (no selector list is passed this time).
df2 = apply_aggregator((SumAggregator, MeanAggregator), df1, G)
# NOTE(review): df1 and df2 are not referenced by any later cell visible in this
# notebook — confirm whether they were meant to be part of `total_df`.

init
prep done
A
init
prep done
CPU times: total: 4.7 s
Wall time: 4.79 s


In [7]:
# Assemble the frame that the models below are trained on.
# NOTE(review): only `df` is concatenated here; the propagated frames from the
# previous cell are not included — confirm that this raw-attribute setup is intended.
total_df = pd.concat(objs=[df], axis="columns")

#total_searchspace = searchspace+searchspace1+searchspace2
#ss2 = prune_sparse_selectors(total_searchspace, total_df)
#print(len(total_searchspace), len(ss2))

In [8]:
# Peek at the 'label' column, which later cells use as the prediction target.
total_df["label"]

0        1
1        1
2        0
3        2
4        1
        ..
19712    2
19713    0
19714    2
19715    0
19716    2
Name: label, Length: 19717, dtype: int64

In [9]:
def fix_column_name(name):
    """Return ``str(name)`` with ``<``, ``[`` and ``]`` substituted.

    ``<`` becomes `` smaller ``, ``[`` becomes ``{`` and ``]`` becomes ``}``;
    all other characters are kept unchanged.

    NOTE(review): presumably needed because XGBoost rejects these characters in
    feature names — confirm.
    """
    substitutions = str.maketrans({"<": " smaller ", "[": "{", "]": "}"})
    return str(name).translate(substitutions)

In [10]:
# Count the rows that contain at least one missing value.
total_df.isna().any(axis=1).sum()

0

In [11]:
# Feature matrix: everything except the target column, with column names
# passed through fix_column_name.
train_df = total_df.drop(columns="label")
train_df.columns = [fix_column_name(column) for column in train_df.columns]

In [12]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
# Hold out 20% of the rows. The seed is fixed so that the split is identical on
# every run (without `random_state` a fresh random split is drawn each time).
X_train, X_test, y_train, y_test = train_test_split(
    train_df, total_df['label'], test_size=.2, random_state=0
)

In [13]:
from xgboost import XGBClassifier, XGBRFClassifier

In [14]:
#bst = XGBClassifier(n_estimators=10, max_depth=2, learning_rate=1, objective='binary:logistic')
## fit model
#bst.fit(X_train, y_train)

In [15]:
#bst.score(X_test, y_test)

In [16]:
#from sklearn.linear_model import LogisticRegression
#clf_LR = LogisticRegression()
#cross_val_score(clf_LR, train_df, total_df['label'], cv=5)

In [17]:
# Feature column names after the 'label' column was dropped and the names sanitised.
train_df.columns

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '490', '491', '492', '493', '494', '495', '496', '497', '498', '499'],
      dtype='object', length=500)

In [18]:
def do_cross_validate(clf, X=None, y=None, cv=5):
    """Cross-validate ``clf`` and print the per-fold accuracy and micro-F1.

    Parameters
    ----------
    clf
        Estimator handed to ``cross_validate``.
    X : optional
        Feature matrix. Defaults to the notebook-level ``train_df``.
    y : optional
        Target values. Defaults to ``total_df['label']``.
    cv : optional
        Forwarded to ``cross_validate`` as ``cv``; defaults to 5.

    Returns
    -------
    The object returned by ``cross_validate``; the per-fold scores are read
    from its ``"test_accuracy"`` and ``"test_f1_micro"`` entries.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if X is None:
        X = train_df
    if y is None:
        y = total_df['label']
    results = cross_validate(clf, X, y, cv=cv, scoring=('accuracy', "f1_micro"))
    print("test_accuracy", results["test_accuracy"])
    # The second metric is micro-averaged F1, so label it accordingly
    # (it used to be printed under the misleading name "test_f2").
    print("test_f1_micro", results["test_f1_micro"])
    return results

## For comparison

Reference results reported in the prediction-instability paper (accuracy in %, mean ± standard deviation):

### PubMed
GAT: accuracy 75.69 ± 0.69

GCN: accuracy 76.78 ± 0.55


## xgboost

In [19]:
# Keyword arguments shared by all XGBoost estimators in this section.
default_params = {
    "learning_rate": 1,
    # The label column has three classes (0, 1, 2), so a multi-class objective
    # is the right setting. XGBoost's scikit-learn wrapper switches to a
    # multi-class objective on its own when it sees more than two classes, so the
    # previously configured 'binary:logistic' was never actually in effect.
    "objective": 'multi:softprob',
    "n_jobs": 4,
}

In [28]:
%%time
# Gradient-boosted trees: 10 estimators, depth at most 3, plus the shared defaults.
clf = XGBClassifier(**default_params, max_depth=3, n_estimators=10)
result_clf1 = do_cross_validate(clf)

test_accuracy [0.87271805 0.8775355  0.88130865 0.88612731 0.88562009]
test_f2 [0.87271805 0.8775355  0.88130865 0.88612731 0.88562009]
CPU times: total: 27 s
Wall time: 9.1 s


In [21]:
%%time
# Gradient-boosted trees: 15 estimators, depth at most 3, plus the shared defaults.
clf = XGBClassifier(**default_params, max_depth=3, n_estimators=15)
result_clf2 = do_cross_validate(clf)

test_accuracy [0.88235294 0.88235294 0.88917068 0.887649   0.88891707]
test_f2 [0.88235294 0.88235294 0.88917068 0.887649   0.88891707]
CPU times: total: 31.3 s
Wall time: 10.1 s


In [22]:
%%time
# Gradient-boosted trees: 20 estimators, depth at most 3, plus the shared defaults.
clf = XGBClassifier(**default_params, max_depth=3, n_estimators=20)
result_clf3 = do_cross_validate(clf)

test_accuracy [0.88590264 0.89097363 0.89348212 0.89475019 0.88840984]
test_f2 [0.88590264 0.89097363 0.89348212 0.89475019 0.88840984]
CPU times: total: 35.2 s
Wall time: 11.2 s


In [23]:
%%time
# XGBoost random-forest variant: 20 estimators, depth at most 3, plus the shared defaults.
clf_RF = XGBRFClassifier(**default_params, max_depth=3, n_estimators=20)
result_clf_RF = do_cross_validate(clf_RF)

test_accuracy [0.81059838 0.82074037 0.81968045 0.81689069 0.81410094]
test_f2 [0.81059838 0.82074037 0.81968045 0.81689069 0.81410094]
CPU times: total: 36.7 s
Wall time: 11.5 s


## sklearn

In [24]:
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import GaussianNB
import warnings

In [25]:
%%time
# Baseline: scikit-learn's DummyClassifier with its default settings, scored
# with the same cross-validation routine as the real models.
dummy_clf = DummyClassifier()
result_dummy = do_cross_validate(dummy_clf)

test_accuracy [0.39934077 0.39934077 0.39944205 0.39944205 0.39944205]
test_f2 [0.39934077 0.39934077 0.39944205 0.39944205 0.39944205]
CPU times: total: 3.7 s
Wall time: 3.49 s


In [26]:
%%time
# Gaussian naive Bayes on the same features.
clf_NB = GaussianNB()
# Silence the sparse-to-dense conversion warning for the duration of this run
# only; the filter is undone when the `with` block exits.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", message="pandas.DataFrame with sparse columns found.It will be converted to a dense numpy array.")
    result_clf_NB = do_cross_validate(clf_NB)
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so
# `result_clf_NB` may never have been assigned — re-run or remove this cell.


KeyboardInterrupt

