# Setting

Python and compiler versions.

In [0]:
import sys
# sys.version holds the interpreter version together with build and compiler information.
print(sys.version)

Import PersLay.

In [0]:
from PersLay import *

# Reading data

In this section, persistence diagrams, features and labels are read from files.

In [0]:
# Name of the dataset; it is used below to build the paths of the input files and of the saved results.
dataset = "MUTAG"

### Persistence diagrams

Persistence diagrams are assumed to be in **hdf5** file format with filtrations given as keys (i.e. "alpha", "rips"...) and homological dimensions given as sub-keys (i.e. "0", "1"...). These keys and sub-keys lead to dictionaries, whose keys correspond to data indexes starting at 0. This **hdf5** file is then read with the **diag_to_dict** function defined in *PersLay.py*. For instance, the 10th 0-dimensional persistence diagram computed with the Rips filtration is accessed with **diagrams["rips"]["0"]["9"]**, where **diagrams** is the output of **diag_to_dict**.

In [0]:
import h5py

# Filtrations whose persistence diagrams are read from the hdf5 file
filts = ["Ord0_10.0-hks", "Rel1_10.0-hks", "Ext0_10.0-hks", "Ext1_10.0-hks"]
path_to_diag = dataset + "/" + dataset + ".hdf5"

# Open the file in a context manager so that the handle is closed as soon as
# the diagrams have been read.
# NOTE(review): assumes diag_to_dict copies the diagrams into memory instead of
# returning lazy h5py datasets -- confirm against PersLay.py.
with h5py.File(path_to_diag, "r") as diag_file:
    diag = diag_to_dict(diag_file, filts=filts)

Print, for each filtration, its name and the maximum number of points among its persistence diagrams.

In [0]:
# Report, for every filtration, the size of its largest persistence diagram.
for filt, dgms in diag.items():
    max_cardinal = max(len(dgm) for dgm in dgms)
    print("Max cardinal of filtration " + filt + " = " + str(max_cardinal))

Visualize the persistence diagrams (optional, requires **sklearn_tda**).

In [0]:
import sklearn_tda as tda
from sklearn.preprocessing import MinMaxScaler

filtration   = "Ord0_10.0-hks"    # Name of filtration
idx_diagram  = 0                  # Index of the persistence diagram to visualize

# Retrieve points with finite coordinates
finite_selector = tda.DiagramSelector(limit=np.inf)
diag_example = finite_selector.fit_transform(diag[filtration])

# Fit a min-max scaler on both coordinates of all diagrams in the filtration,
# only to read off the minimum and maximum coordinates it recorded
pre = tda.DiagramPreprocessor(use=True, scalers=[([0,1], MinMaxScaler())]).fit(diag_example)
fitted_scaler = pre.scalers_[0][1]
[mx,my],[Mx,My] = fitted_scaler.data_min_, fitted_scaler.data_max_
print(", ".join(["Minimum x = " + str(mx), "Maximum x = " + str(Mx),
                 "Minimum y = " + str(my), "Maximum y = " + str(My)]))

# Scatter plot of the diagram with the given index, with the diagonal drawn over its x-range
example_pts = diag_example[idx_diagram]
xs, ys = example_pts[:,0], example_pts[:,1]
x_lo, x_hi = min(xs), max(xs)
plt.scatter(xs,ys)
plt.plot([x_lo,x_hi],[x_lo,x_hi])
plt.axis([x_lo,x_hi,min(ys),max(ys)])
plt.show()

### Features and labels

Features are given in **csv** file format. The first column is assumed to be called *label* and to contain the labels.

In [0]:
import pandas as pd

# Read the features from the dataset folder, i.e. the same folder the persistence
# diagrams are read from, instead of a hard-coded Google Drive location.
path_to_feat = dataset + "/" + dataset + ".csv"
# The first column of the file is used as the row index; the first row holds the column names.
feat = pd.read_csv(path_to_feat, index_col=0, header=0)

Read and encode labels and features.

In [0]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Extract labels and encode them with integers 0, ..., num_labels-1
label_encoder = LabelEncoder()
int_labels = label_encoder.fit_transform(np.array(feat["label"]))

# One-hot encode the integer labels. Indexing an identity matrix yields the same
# dense float array as OneHotEncoder(categories="auto") would, without relying on
# its `sparse` keyword, which recent scikit-learn releases no longer accept.
num_labels = len(label_encoder.classes_)
L = np.eye(num_labels)[int_labels]

# Extract features: drop the first column of the table (assumed to hold the labels)
F = np.array(feat)[:,1:]
[num_pts, num_features] = F.shape

Print summary.

In [0]:
# Number of data points and number of features per point
print("{} instances and {} features.".format(num_pts, num_features))

# (Optional) Preprocessing

In this section, persistence diagrams are preprocessed with **sklearn_tda** (see https://github.com/MathieuCarriere/sklearn_tda). If you already preprocessed your diagrams, you can skip this section.

In [0]:
import sklearn_tda as tda

The preprocessing is written as a scikit-learn pipeline, i.e. a sequence of transformers that can be fitted and applied with **fit** and **fit_transform**. For example, Selector selects finite or essential points, ProminentPts (or Quantizer) enforces a uniform upper bound on the number of points, and Scaler (with MinMaxScaler) rescales diagrams to have coordinates in [0,1] x [0,1]. At the end, all diagrams are represented by a (Npts x Ndgm x dim+1) array, where Npts is the number of instances, Ndgm is the upper bound on the number of points, and dim is the dimension of the diagrams (usually 2 for finite persistence diagrams and 1 for essential ones). The last dimension is dim+1 because each point in the diagrams gets an additional coordinate specifying whether the point is relevant (in which case the value of this coordinate is 1) or was added by padding (in which case the value of this coordinate is 0).

*Remark:* The **nu_separator** operation is only used for the baseline experiment on graphs from the article "Deep Learning with Topological Signatures". Use it only if you wish to work with their architecture, which is implemented in the next section as a special case of PersLay channels.

In [0]:
from sklearn.pipeline import Pipeline
# Imported in this cell as well: the visualization cell that also imports it is
# optional, so this cell must not depend on it having been run.
from sklearn.preprocessing import MinMaxScaler

# Definition of our scaler for the persistence diagrams:
# 1. transform the diagram points with (x,y) ---> (x, y-x) 
# 2. rescale the diagrams to the unit square [0,1] x [0,1]
scaler = [([0,1],  Pipeline([("1", tda.BirthPersistenceTransform()), ("2", MinMaxScaler()) ]))]

# Whole pipeline. Steps created with use=False (here NuSeparator) are disabled by default;
# any step parameter can be changed later with set_params("<step name>__<parameter>").
preprocess = Pipeline([
    ("Selector",      tda.DiagramSelector(use=True, point_type="finite")),
    ("ProminentPts",  tda.ProminentPoints(use=True, num_pts=400, point_type="finite")),
    ("Scaler",        tda.DiagramPreprocessor(use=True,  scalers=scaler)),
    ("NuSeparator",   tda.DiagramPreprocessor(use=False, scalers=[([0,1],nu_separator(nu=.1))])),
    ("Padding",       tda.Padding(use=True)),
                      ])

Apply the previous pipeline on the different filtrations. 

In [0]:
# Pipeline parameters for each filtration (here, the number of points to keep)
prm = {
    "Ord0_10.0-hks":  {"ProminentPts__num_pts": 400},
    "Rel1_10.0-hks":  {"ProminentPts__num_pts": 400},
    "Ext0_10.0-hks":  {"ProminentPts__num_pts": 400},
    "Ext1_10.0-hks":  {"ProminentPts__num_pts": 400},
}

# Configure the pipeline for each filtration in turn, then fit and apply it.
# D collects one preprocessed set of diagrams per filtration, in the order of prm.
D = []
for dt, param in prm.items():
    preprocess.set_params(**param)
    D.append(preprocess.fit_transform(diag[dt]))

For each filtration, concatenate all diagrams in a single array.

In [0]:
# For each filtration, stack its preprocessed diagrams along a new leading axis
# and print the shape of the resulting array.
D_pad = []
for dt in range(len(prm)):
    stacked = np.stack(D[dt], axis=0)
    D_pad.append(stacked)
    print(stacked.shape)

Visualize the preprocessed persistence diagrams.

In [0]:
from sklearn.preprocessing import MinMaxScaler

filtration   = 1    # Position of the filtration in D
idx_diagram  = 2    # Index of the preprocessed diagram to plot

diag_example = D[filtration]

# Fit a min-max scaler on the three columns of the preprocessed diagrams,
# only to read off the minimum and maximum values it recorded
pre = tda.DiagramPreprocessor(use=True, scalers=[([0,1,2], MinMaxScaler())]).fit(diag_example)
fitted_scaler = pre.scalers_[0][1]
[mx,my,mz],[Mx,My,Mz] = fitted_scaler.data_min_, fitted_scaler.data_max_
print(", ".join(["Minimum x = " + str(mx), "Maximum x = " + str(Mx),
                 "Minimum y = " + str(my), "Maximum y = " + str(My)]))

# Scatter plot of the first two columns of the chosen diagram
example_pts = diag_example[idx_diagram]
xs, ys = example_pts[:,0], example_pts[:,1]
plt.scatter(xs,ys)
plt.axis([min(xs),max(xs),min(ys),max(ys)])
plt.show()

# Classification

In this section, we classify persistence diagrams with PersLay.

 **Warning:** persistence diagrams are assumed to be preprocessed, i.e., they should all have the same number of points, and the points in the persistence diagrams should have a third coordinate, which is either 1 or 0, and which indicates if the point is meaningful (1) or if it has been added after padding (0).

### Architecture of neural network
In this subsection, we define the neural network architecture with several **PersLay** channels. We also provide a specific PersLay architecture that implements the network defined in https://arxiv.org/pdf/1707.04041.pdf

Neural network with several **PersLay** channels.

In [0]:
from tensorflow import random_uniform_initializer as rui

def model(indxs, feats, diags):
    """Build the network: four identically configured PersLay channels plus the extra features.

    Args:
        indxs: accepted for the calling convention; not used in this function.
        feats: additional features, batch-normalized and appended to the channel outputs.
        diags: indexable collection of preprocessed persistence diagrams; entries 0 to 3 are used.

    Returns:
        The result of post_processing applied to a final dense layer with num_labels units.
    """
    num_channels = 4
    list_v = []

    # Channels for persistence diagrams. There is one per filtration.
    # list_v is handed to every PersLay call and concatenated afterwards
    # (presumably each call appends its output to it -- confirm in PersLay.py).
    for channel in range(num_channels):
        PersLay(list_v, "fin-" + str(channel), diags[channel],
            #layer="ls", num_samples=25, perm_op="topk", keep=2,
            #layer="gs", num_gaussians=25, perm_op="sum", mean_init=rui(0.,1.), variance_init=rui(1.,1.), mean_const=False, variance_const=False,
            layer="pm", peq=[(2,None)], perm_op="sum", keep=5, weight_init=rui(0.,1.), bias_init=rui(-1.,1.),
            #layer="im", image_size=[5,5], perm_op="sum", image_bnds=[[0.,1.],[0.,1.]], variance_init=rui(1.,1.),
            persistence_weight="linear", grid_size=[5,5], grid_bnds=[[0.,1.],[0.,1.]], grid_init=rui(1.,1.),
            fc_layers=[],
            cv_layers=[(10,2,"bdr")],
            tensor=True)

    # Concatenate all channels, normalize them, and add the (normalized) other features
    diag_vector = tf.concat(list_v, 1)
    with tf.variable_scope("norm_diag"):
        diag_vector = tf.layers.batch_normalization(diag_vector)
    with tf.variable_scope("norm_feat"):
        feat = tf.layers.batch_normalization(feats)
    vector = tf.concat([diag_vector, feat[:,:]], 1)

    # Final fully-connected layer
    with tf.variable_scope("final-dense-3"):
        dense_out = tf.layers.dense(vector, num_labels)
        vector = post_processing(dense_out, "")

    return vector

**Example of neural network from "Deep Learning with Topological Signatures".** Don't run this if you want to run your own model (defined in cell above). This cell implements the architecture defined in https://arxiv.org/pdf/1707.04041.pdf

In [0]:
def baseline_model(feats, diags):
    """Build the baseline network: three PersLay channels followed by four dense layers.

    Args:
        feats: accepted but not used in this function.
        diags: indexable collection of preprocessed persistence diagrams; entries 0 to 2 are used.

    Returns:
        The result of post_processing applied to the last dense layer, which has num_labels units.
    """
    # NOTE(review): this takes (feats, diags) whereas `model` takes (indxs, feats, diags);
    # confirm the calling convention before passing it to evaluate_nn_model.

    # (channel name, number of gaussians, width of both fully-connected layers)
    channel_specs = [("fin0", 150, 75), ("ess0", 50, 25), ("ess1", 50, 25)]
    list_v = []
    for position, (name, num_gaussians, width) in enumerate(channel_specs):
        PersLay(list_v, name, diags[position],
            layer="gs", num_gaussians=num_gaussians, perm_op="sum",
            persistence_weight=None,
            fc_layers=[(width,"bd"),(width,"rd")])
    vector = tf.concat(list_v, 1)

    # (number of units, post-processing string) of the dense layers, applied in order
    dense_specs = [(200,"br"), (100,"bdr"), (50,"br"), (num_labels,"b")]
    for position, (units, ops) in enumerate(dense_specs):
        with tf.variable_scope("final-dense-" + str(position)):
            vector = post_processing(tf.layers.dense(vector, units), ops)
    return vector

### Train and test data
In this subsection, we finally train and test the network on the data. 

Specify here how the data is split into train and test sets: either with K-folds ("KF") or with random shuffle splits ("RP"), in which the test set is redrawn at random for each split.

In [0]:
# Evaluation protocol
mode        = "KF"     # Either "KF" (k-fold) or "RP" (random shuffle splits)
num_scores  = 10        # Number of score generations (each repetition seeds the splitter with its own index)
num_splits  = 10      # Number of splits
test_size   = 0.3      # Size of test set in case of "RP"

Specify here if you have one or several GPUs or CPUs, as well as number of epochs, batch size and validation size. If you do not want to use validation sets for early stopping, set **valid_size** to 0.

In [0]:
# Computing resources and training schedule
num_tower   = 1        # Number of computing units
tower_type  = "gpu"    # Type of computing units ("cpu" or "gpu")
batch_size  = 128      # Batch size for each tower
num_epochs  = 1000      # Number of epochs
valid_size  = 0.       # Fraction of the training indices held out as validation set (0 means no validation set)

Specify here the decay of the Exponential Moving Average, the learning rate of the optimizer and the verbosity of training.

In [0]:
# Optimization settings passed to evaluate_nn_model
decay       = 0.9       # Decay of Exponential Moving Average
learn_rate  = 0.01     # Learning rate of optimizer
verbose     = True     # Verbosity flag for training

Train and test data.

In [0]:
from sklearn.model_selection import KFold, ShuffleSplit

# Fail early on an unknown mode: otherwise `folds` would be undefined, or left over
# (and already exhausted) from a previous run, and the splits would be silently skipped.
if mode not in ("KF", "RP"):
    raise ValueError('mode must be either "KF" or "RP", got ' + repr(mode))

for idx_score in range(num_scores):

    # idx_score seeds the splitter, so each repetition gets its own reproducible splits
    if mode == "KF": # Evaluation with k-fold on test set
        folds = KFold(n_splits=num_splits, random_state=idx_score, shuffle=True).split(np.empty([num_pts]))
    else: # mode == "RP": evaluation with random test set
        folds = ShuffleSplit(n_splits=num_splits, test_size=test_size, random_state=idx_score).split(np.empty([num_pts]))

    # One row per split and one column per epoch
    train_accs = np.zeros([num_splits, num_epochs])
    valid_accs = np.zeros([num_splits, num_epochs])
    test_accs  = np.zeros([num_splits, num_epochs])

    for idx, (train_sub, test_sub) in enumerate(folds):

        # Hold out the first valid_size fraction of the training indices for validation
        num_valid = int(valid_size*len(train_sub))
        valid_sub = train_sub[:num_valid]
        train_sub = train_sub[num_valid:]

        print(str(len(train_sub)) + " train points and " + str(len(test_sub)) + " test points")

        # Start from an empty TensorFlow graph for this split
        tf.reset_default_graph()

        # Evaluation of neural network
        # (ltrain, lvalid and ltest are stored below as rows of length num_epochs)
        ltrain, lvalid, ltest = evaluate_nn_model(L,F,D_pad,train_sub,valid_sub,test_sub,
                                                  model,num_tower,tower_type,num_epochs,
                                                  decay,learn_rate,batch_size,verbose)

        train_accs[idx,:],valid_accs[idx,:],test_accs[idx,:] = np.array(ltrain),np.array(lvalid),np.array(ltest)

        # Write results
        #np.save(dataset + "_train_score" + str(idx_score) + "_split"  + str(idx), np.array(ltrain))
        #np.save(dataset + "_valid_score" + str(idx_score) + "_split"  + str(idx), np.array(lvalid))
        # NOTE(review): the split number in the file name is offset by 90, unlike the
        # commented-out lines above which use idx -- confirm that this offset is intended.
        np.save(dataset + "/" + dataset + "_test_score"  + str(idx_score) + "_split"  + str(90+idx), np.array(ltest))