### Construct graph

In [2]:
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def construct_graph(data, file_name="mygraph.graphml", top_k=5):
    """Build a nearest-neighbour graph over the rows of ``data`` and save it.

    Every row becomes a node named by its index label.  Each node is linked
    to the ``top_k`` other rows with the highest cosine similarity.  The graph
    is undirected, so a node can end up with more than ``top_k`` neighbours
    when other rows select it as well.

    Args:
        data: ``pandas.DataFrame`` of numeric features, one row per node.
        file_name: Path of the GraphML file to write.
        top_k: Number of most-similar rows each node is connected to.
    """
    nodes = list(data.index)
    G = nx.Graph()
    G.add_nodes_from(nodes)

    n_nodes = len(nodes)
    n_neighbours = max(0, min(top_k, n_nodes - 1))
    if n_neighbours:
        # One vectorised call replaces a separate sklearn call per ordered
        # pair of rows.
        similarity = cosine_similarity(data.values)
        # Send each row's own entry to the end of its ranking so that a node
        # never selects itself, even when duplicates tie with it.
        np.fill_diagonal(similarity, -np.inf)
        # Stable sort on the negated scores: descending similarity, ties
        # resolved by row position.
        ranking = np.argsort(-similarity, axis=1, kind="stable")
        for position, neighbours in enumerate(ranking[:, :n_neighbours]):
            # Translate row positions back to index labels; otherwise a
            # non-default index would create nodes that were never declared.
            G.add_edges_from(
                (nodes[position], nodes[other]) for other in neighbours
            )

    # To inspect the result interactively:
    #   nx.draw(G, nx.spring_layout(G), with_labels=True,
    #           font_weight='bold', node_size=500)
    #   plt.show()

    # write graphml file
    nx.write_graphml(G, file_name)

def construct_graph2(data, file_name="graph.graphml", top_k=5):
    """Build a nearest-neighbour graph from a feature matrix and save it.

    Every row becomes a node named by its index label (``0..n-1`` when a
    NumPy array is passed).  Each node is linked to the ``top_k`` other rows
    with the highest cosine similarity.  The graph is undirected, so a node
    can end up with more than ``top_k`` neighbours.

    Args:
        data: ``pandas.DataFrame`` or 2-D ``numpy.ndarray`` of numeric
            features, one row per node.
        file_name: Path of the GraphML file to write.
        top_k: Number of most-similar rows each node is connected to.
    """
    if isinstance(data, np.ndarray):
        data = pd.DataFrame(data)
    nodes = list(data.index)

    G = nx.Graph()
    G.add_nodes_from(nodes)

    n_nodes = len(nodes)
    n_neighbours = max(0, min(top_k, n_nodes - 1))
    if n_neighbours:
        similarity = cosine_similarity(data.values)
        # Exclude each row from its own ranking explicitly.  Dropping the
        # first entry of the sorted order is not reliable: a duplicate row
        # can outrank the row itself, and an all-zero row has similarity 0
        # with itself.
        np.fill_diagonal(similarity, -np.inf)
        # Stable sort on the negated scores: descending similarity, ties
        # resolved by row position.
        ranking = np.argsort(-similarity, axis=1, kind="stable")
        for position, neighbours in enumerate(ranking[:, :n_neighbours]):
            # The similarity matrix is addressed by row position, the graph
            # by index label; map between the two so a non-default index
            # neither mis-indexes the matrix nor creates stray nodes.
            G.add_edges_from(
                (nodes[position], nodes[other]) for other in neighbours
            )

    nx.write_graphml(G, file_name)
    
    

### Calculate feature vector

In [3]:
from catboost import CatBoostClassifier, CatBoostRegressor
import json

def feature_vector(X, y, path):
    """Fit a CatBoost classifier on the training split and featurise all rows.

    The training rows are the positions stored under split ``'0'`` /
    ``'train'`` of ``{path}/masks.json``.  The shapes of the selected
    features and labels are printed before fitting.

    Args:
        X: Feature table; training rows are picked with ``.iloc``.
        y: Labels, indexed with the same positions as ``X``.
        path: Directory that contains ``masks.json``.

    Returns:
        A ``(prediction, leaf_index)`` tuple holding the results of
        ``predict_proba`` and ``calc_leaf_indexes`` evaluated on all of ``X``.
    """
    classifier = CatBoostClassifier(
        iterations=100,
        depth=6,
        learning_rate=0.1,
        loss_function='MultiClass',
        random_seed=0,
        nan_mode='Min',
        allow_const_label=True,
    )

    # Only the training positions of the first split are needed.
    with open(f'{path}/masks.json') as mask_file:
        train_rows = json.load(mask_file)['0']['train']

    train_features, train_labels = X.iloc[train_rows], y.iloc[train_rows]
    for part in (train_features, train_labels):
        print(part.shape)

    classifier.fit(train_features, train_labels, verbose=False)
    class_probabilities = classifier.predict_proba(X)
    return class_probabilities, classifier.calc_leaf_indexes(X)


### Generate masks

In [20]:
import random
import json

def gen_masks(data, path):
    """Create five random 60/20/20 train/val/test splits and save them as JSON.

    The output maps the keys ``"0"``..``"4"`` to dicts with ``"train"``,
    ``"val"`` and ``"test"`` lists.  The lists hold row positions
    (``0..len(data)-1``), which is what ``feature_vector`` expects when it
    selects rows with ``.iloc``.  Index labels are deliberately not used:
    they are lost when the table is written with ``index=False`` and may not
    be valid positions at all.

    Args:
        data: Table (anything supporting ``len``) whose rows are split.
        path: Destination JSON file.
    """
    n_rows = len(data)
    n_train = int(0.6 * n_rows)
    n_val = int(0.2 * n_rows)

    masks = {}
    for split in range(5):
        ids = list(range(n_rows))
        random.shuffle(ids)
        masks[str(split)] = {
            "train": ids[:n_train],
            "val": ids[n_train:n_train + n_val],
            # The test split takes whatever is left so every row is used.
            "test": ids[n_train + n_val:],
        }

    write_json(masks, path)


def write_json(data, path):
    """Serialise ``data`` as JSON to ``path``, overwriting an existing file."""
    with open(path, "w") as f:
        json.dump(data, f)


In [34]:
import pandas as pd

path1 = 'slap/X.csv'
path2 = 'slap/y.csv'

# The first line of each CSV used to be read as data and then sliced off,
# i.e. it is a header.  Parsing it as data forces every column to
# object/string dtype, so let pandas consume it as the header instead and
# keep the columns numeric.
data = pd.read_csv(path1)
label = pd.read_csv(path2)

# Build one similarity graph from the class probabilities and one from the
# leaf indices returned by the classifier.
prediction, leaf_index = feature_vector(data, label, 'slap_s4')
construct_graph2(prediction, "pred_graph.graphml")
construct_graph2(leaf_index, "leaf_graph.graphml")

(60, 2701)
(60, 1)


### Optical recognition of handwritten digits dataset

In [6]:
import pandas as pd

path1 = 'optdigit/train_data.csv'
path2 = 'optdigit/test_data.csv'

# Both files are read without a header row.
train_data = pd.read_csv(path1, header=None)
test_data = pd.read_csv(path2, header=None)

# Stack train and test into one table with a fresh 0..n-1 index, then split
# off the last column as the label and keep the rest as features.
data = pd.concat([train_data, test_data], axis=0, ignore_index=True)
label = data.iloc[:, -1]
data = data.iloc[:, :-1]
print(data.shape)

# The commented-out steps below create the dataset files; only the feature
# extraction is currently executed.

# create X.csv
# data.to_csv("X.csv", index=False)

# create y.csv
# label.to_csv("y.csv", index=False, header=["class"])

# create graph.graphml
# construct_graph2(data, "graph.graphml")

# create masks.json
# gen_masks(data, "masks.json")

# Reads optdigit/masks.json, so that file must already exist.
prediction, leaf_index = feature_vector(data, label, "optdigit")
# construct_graph2(prediction, "pred_graph.graphml")
# construct_graph2(leaf_index, "leaf_graph.graphml")

(5620, 64)
(3372, 64)
(3372,)


### Wine recognition dataset

In [28]:
import pandas as pd
import random

path = 'wine/wine.data.csv'

# The file is read without a header row.
data = pd.read_csv(path, header=None)
# Shuffle all rows reproducibly (fixed random_state); each row keeps its
# original index label.
data = data.sample(frac=1, random_state=42)
# Column 0 is the class; it is shifted down by one
# (presumably 1-based -> 0-based labels; confirm against the data file).
label = data.iloc[:, 0]-1
data = data.iloc[:,1:]

# The commented-out steps below create the dataset files; only the feature
# extraction is currently executed.

# create X.csv
# data.to_csv("X.csv", index=False)

# create y.csv
# label.to_csv("y.csv", index=False, header=["class"])

# create graph.graphml
# construct_graph2(data, "graph.graphml")

# create masks.json
# gen_masks(data, "masks.json")

# create feature vector
# Reads wine_s4/masks.json, so that file must already exist.
prediction, leaf_index = feature_vector(data, label, 'wine_s4')
# print(prediction.shape)
# print(leaf_index.shape)
# construct_graph2(prediction, "pred_graph.graphml")
# construct_graph2(leaf_index, "leaf_graph.graphml")



(12, 13)
(12,)
0.9382022471910112


### Breast cancer wisconsin (diagnostic) dataset

In [15]:
import pandas as pd

path = 'wdbc/wdbc.data.csv'

# The file is read without a header row.  Column 1 holds the "B"/"M" code
# used as the label and columns 2 onwards are the features; column 0 is not
# used.
data = pd.read_csv(path, header=None)
# Encode the label with an explicit mapping.  ``map`` yields a numeric
# column directly; any code other than "B"/"M" becomes NaN instead of
# silently staying a string.
label = data.iloc[:, 1].map({"B": 1, "M": 0})
data = data.iloc[:, 2:]

# create X.csv
data.to_csv("X.csv", index=False)
# create y.csv
label.to_csv("y.csv", index=False, header=["class"])
# create graph.graphml
# construct_graph2 needs the output file name; calling it with the data
# alone raises TypeError.
construct_graph2(data, "graph.graphml")
# create masks.json
gen_masks(data, "masks.json")