In [1]:
#%load_ext autoreload
#%autoreload 1
#%aimport graph_description
import networkx as nx
import pysubgroup as ps
import numpy as np
import pandas as pd
# Never truncate long cell contents when a DataFrame is displayed
# (max_colwidth=None removes the column-width limit).
pd.set_option('display.max_colwidth', None)

In [2]:
from graph_description.datasets import nx_read_attributed_graph
from graph_description.utils import prune_sparse_selectors
from graph_description.networkx_aggregation import SumAggregator, MeanAggregator, apply_aggregator    

In [3]:
# Load the "citeseer" dataset: G is the graph, df the accompanying attribute table.
G, df = nx_read_attributed_graph("citeseer")
# Report the graph size; both counts use the same "name=value" layout
# (the previous message labelled them inconsistently).
print(f"n_edges={G.number_of_edges()}, n_nodes={G.number_of_nodes()}")

In [4]:
# Build selectors from every column of df except the target column 'label'.
all_selectors = ps.create_selectors(df, ignore=['label'])
# Discard every selector whose textual form contains "==0".
searchspace = [
    selector
    for selector in all_selectors
    if "==0" not in str(selector)
]
print(len(searchspace))

3703


In [5]:
%%time
# do the actual propagation
# First pass: apply SumAggregator to df over the graph G, restricted to the
# selectors in `searchspace`. Returns a new frame and a new list of selectors.
df1, searchspace1 = apply_aggregator(SumAggregator, df, G, searchspace)
# Second pass: apply both SumAggregator and MeanAggregator to the first-pass
# result df1. No searchspace is passed here -- presumably apply_aggregator
# then works on all columns of df1; TODO confirm against its implementation.
df2, searchspace2 = apply_aggregator((SumAggregator, MeanAggregator), df1, G)

init
prep done
init
prep done
CPU times: total: 5.67 s
Wall time: 5.66 s


In [6]:
# Assemble the final dataframe and search space.
# Columns: the original attributes followed by both rounds of aggregated features.
frames = [df, df1, df2]
total_df = pd.concat(frames, axis=1)

# Selectors: the same three sources in the same order, then pruned against total_df.
total_searchspace = [*searchspace, *searchspace1, *searchspace2]
ss2 = prune_sparse_selectors(total_searchspace, total_df)
print(len(total_searchspace), len(ss2))

47037 35928


In [7]:
# Target handed to the discovery task: built from the 'label' column and the value 1.
target = ps.BinaryTarget('label', 1)

In [8]:
# Sanity check: report the density of every sparse column that is more than 20% filled.
for col in total_df.columns:
    column = total_df[col]
    # Only sparse-typed columns expose the `.sparse` accessor; skip the rest.
    if not isinstance(column.dtype, pd.SparseDtype):
        continue
    density = column.sparse.density
    if density > 0.2:
        print(density)

0.20229468599033817
0.21256038647342995


In [9]:
def print_stats(result):
    """Print a one-line summary of the task settings that produced `result`.

    The line has the form
    ``depth=<depth>, a=<a>, <quality function class>, min_size=<min_support>``.

    Parameters
    ----------
    result
        Object with a ``task`` attribute; the task must provide ``depth``,
        ``qf`` (with an ``a`` attribute) and ``constraints``.

    Notes
    -----
    The class name is taken from ``type(qf).__name__`` rather than by slicing
    the ``repr`` of the type with a hard-coded module prefix, so it is correct
    no matter which module defines the quality function.
    ``min_size`` is the ``min_support`` of the first constraint that has one,
    or ``None`` when no constraint carries a minimum support.
    """
    task = result.task
    qf = task.qf
    cls = type(qf).__name__
    # Do not assume the min-support constraint is the first (or only) constraint.
    min_size = next(
        (c.min_support for c in task.constraints if hasattr(c, "min_support")),
        None,
    )
    print(f"depth={task.depth}, a={qf.a}, {cls}, min_size={min_size}")

In [10]:
#%%snakeviz -t
# Discovery task: search total_df for `target` over the pruned selectors ss2,
# keeping 20 results, with depth 2 and a minimum support of 30.
task = ps.SubgroupDiscoveryTask(
    total_df,
    target,
    ss2,
    result_set_size=20,
    depth=2,
    qf=ps.GeneralizationAware_StandardQF(0.5),
    constraints=[ps.MinSupportConstraint(30)],
)

# Apriori on the NumPy set representation, with vectorization switched off.
algorithm = ps.Apriori(ps.NumpySetRepresentation)
algorithm.use_vectorization = False
result = algorithm.execute(task)

  tau_diff = pos / (pos + delta_n)


In [11]:
# Print the task settings, then display the key columns of the result table.
print_stats(result)
shown_columns = ["quality", "subgroup", "size_sg", "positives_sg"]
result.to_dataframe()[shown_columns]

depth=2, a=0.5, GeneralizationAware_StandardQF, min_size=30


Unnamed: 0,quality,subgroup,size_sg,positives_sg
0,0.243973,65==1.0,670,484
1,0.108667,2185==1.0,261,148
2,0.103526,neigh_sum(65==1.0): [1.0:2.0[,291,154
3,0.103102,2186==1.0,87,71
4,0.08951,247==1.0,168,97
5,0.079337,277==1.0,31,31
6,0.077021,neigh_sum(65==1.0): [2.0:3.0[,87,57
7,0.076191,3016==1.0,209,101
8,0.075359,neigh_sum(2186==1.0)==1.0,68,48
9,0.071824,neigh_mean(neigh_sum(65==1.0)): [1.0:1.50[,84,53


In [12]:
# depth=3, a=0.1, GeneralizationAware_StandardQF, min_size=30

# 0 	0.514004 	277==1.0 	31 	31
# 1 	0.462325 	65==1.0 	670 	484
# 2 	0.442075 	2186==1.0 	87 	71
# 3 	0.402550 	1616==1.0 	33 	27
# 4 	0.397740 	1700==1.0 	32 	26
# 5 	0.366548 	3027==1.0 	30 	23

In [13]:
# depth=3, a=0.2, GeneralizationAware_StandardQF, min_size=30

# 0 	0.394045 	65==1.0 	670 	484
# 1 	0.322176 	277==1.0 	31 	31
# 2 	0.307212 	2186==1.0 	87 	71
# 3 	0.253900 	1616==1.0 	33 	27
# 4 	0.250095 	1700==1.0 	32 	26
# 5 	0.241775 	neigh_sum(2186==1.0)==1.0 	68 	48

In [14]:
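# depth=2, a=0.3, GeneralizationAware_StandardQF
# NOTE(review): the qualities below are identical to those of the a=0.5 runs;
# a different exponent should change them, so this header may be stale -- verify.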

# 0 	0.243973 	65==1.0 	670 	484
# 1 	0.108667 	2185==1.0 	261 	148
# 2 	0.103526 	neigh_sum(65==1.0): [1.0:2.0[ 	291 	154
# 3 	0.103102 	2186==1.0 	87 	71
# 4 	0.089510 	247==1.0 	168 	97
# 5 	0.077021 	neigh_sum(65==1.0): [2.0:3.0[ 	87 	57

In [15]:
# depth=1, a=0.5, GeneralizationAware_StandardQF

# 0 	0.243973 	65==1.0 	670 	484
# 1 	0.108667 	2185==1.0 	261 	148
# 2 	0.103526 	neigh_sum(65==1.0): [1.0:2.0[ 	291 	154
# 3 	0.103102 	2186==1.0 	87 	71
# 4 	0.089510 	247==1.0 	168 	97
# 5 	0.077021 	neigh_sum(65==1.0): [2.0:3.0[ 	87 	57

In [16]:
from pysubgroup.visualization import supportSetVisualization
import matplotlib.pyplot as plt

In [17]:
#plt.imshow(supportSetVisualization(result))