In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
import networkx as nx
import numpy as np
from pyvis.network import Network
import random
import math
import sklearn.metrics
from matplotlib.pyplot import figure
from grakel.kernels import VertexHistogram,WeisfeilerLehmanOptimalAssignment,ShortestPath,EdgeHistogram
from sklearn.model_selection import train_test_split
from grakel.utils import graph_from_networkx
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold 
from sklearn.model_selection import StratifiedKFold

In [8]:
def add_weights_edges(graph, root='All'):
    """Build a directed graph whose edges carry a depth-based weight.

    Every edge ``(u, v)`` receives a ``'weight'`` attribute equal to the mean
    number of edges over all simple paths from its source node ``u`` to
    ``root`` (the 'All' node of the HPO graph by default).

    Parameters
    ----------
    graph : networkx graph
        Graph to weight. It is passed to ``nx.DiGraph`` to build the graph
        that is returned.
    root : hashable, optional
        Node the path lengths are measured to. Defaults to ``'All'``.

    Returns
    -------
    networkx.DiGraph
        The ``nx.DiGraph`` built from ``graph`` with a ``'weight'`` attribute
        set on every edge.

    Raises
    ------
    ValueError
        If the source node of an edge is not ``root`` and has no simple path
        to ``root`` (this used to surface as a bare ``ZeroDivisionError``).
    """
    G = nx.DiGraph(graph)

    # The weight depends only on the source node of an edge, so the
    # (potentially expensive) path enumeration is done once per source node
    # instead of once per edge.
    depth_of = {}
    for u, v in G.edges():
        if u not in depth_of:
            n_paths = 0
            n_nodes = 0
            for path in nx.all_simple_paths(G, u, root):
                n_paths += 1
                n_nodes += len(path)

            if n_paths:
                # Mean node count per path minus one == mean edge count.
                # Same arithmetic as before so the weights stay bit-identical.
                depth_of[u] = n_nodes / n_paths - 1
            elif u == root:
                # NOTE(review): all_simple_paths may yield nothing when source
                # and target coincide (appears to depend on the networkx
                # version - confirm). The root is at distance 0 from itself.
                depth_of[u] = 0.0
            else:
                raise ValueError(
                    "node {!r} has no path to root node {!r}".format(u, root)
                )

        G[u][v]['weight'] = depth_of[u]

    return G
    
def add_weights_graphs(list_graphs):
    """Add edge weights to every graph of a collection.

    Parameters
    ----------
    list_graphs : iterable of networkx graphs
        Graphs to weight, e.g. a list or a pandas Series of graphs.

    Returns
    -------
    list
        One weighted graph, as returned by ``add_weights_edges``, per input
        graph, in iteration order.
    """
    # Iterate over the values directly instead of ``list_graphs[i]``: on a
    # pandas Series ``[i]`` is a label lookup, which raises KeyError or picks
    # the wrong rows as soon as the index is not exactly 0..n-1.
    return [add_weights_edges(graph) for graph in list_graphs]

In [9]:
def graph_kernel_classification(path_to_dataframe, kernel_name, k, kfold_type):
    """Cross-validated SVM classification of HPO graphs with a GraKeL kernel.

    Parameters
    ----------
    path_to_dataframe : pandas.DataFrame or str or path-like
        Either the dataframe itself or the path of a pickled dataframe.
        The dataframe must provide a ``'graphs'`` column (networkx graphs
        with a ``'name'`` node attribute) and a ``'label'`` column.
    kernel_name : type
        GraKeL kernel class (not an instance), as documented on
        https://ysig.github.io/GraKeL/0.1a8/kernels.html. It is instantiated
        with ``normalize=True``.
    k : int
        The number of folds for cross validation.
    kfold_type : str
        ``'stratified'`` selects ``StratifiedKFold``; any other value selects
        plain ``KFold``.

    Returns
    -------
    NoneType
        Prints the accuracy for each fold and the average accuracy over all
        folds.
    """
    # The parameter is documented as a path but callers also hand in an
    # already loaded dataframe, so both are accepted.
    if isinstance(path_to_dataframe, pd.DataFrame):
        frame = path_to_dataframe
    else:
        # NOTE: unpickling can execute arbitrary code - only load trusted files.
        frame = pd.read_pickle(path_to_dataframe)

    # Get labels for graphs
    y = np.array(list(frame['label']))

    # Initialize kernel
    kernel = kernel_name(normalize=True)

    # Initialize kfold/stratified kfold (no shuffling, no random state)
    if kfold_type == 'stratified':
        kf = StratifiedKFold(n_splits=k, random_state=None)
    else:
        kf = KFold(n_splits=k, random_state=None)

    graphs_with_weights = add_weights_graphs(frame['graphs'])
    # Convert graphs from networkx to GraKeL graphs
    graphs_grakel = list(
        graph_from_networkx(
            graphs_with_weights,
            node_labels_tag='name',
            edge_labels_tag='weight',
        )
    )

    # Compute kernel matrices
    print("Computing kernel matrices")
    X = kernel.fit_transform(graphs_grakel)

    # Initialize SVM
    clf = SVC(kernel='precomputed')

    acc_score = []

    # Split the kernel matrices into training and testing sets based on the
    # given k/stratified k.
    print("Classifying patients")
    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X[train_index, :], X[test_index, :]
        y_train, y_test = y[train_index], y[test_index]

        # NOTE(review): only the rows of X are selected, so every graph is
        # described by its kernel values against all graphs (test fold
        # included) and the SVM is fitted on dot products of those rows, not
        # on the sliced kernel matrix X[train][:, train]. Behaviour is kept
        # unchanged so results stay comparable - confirm this is intended.
        kernel_train = np.dot(X_train, X_train.T)  # Linear kernel
        clf.fit(kernel_train, y_train)  # Fit the support vector classification model
        kernel_test = np.dot(X_test, X_train.T)
        pred_values = clf.predict(kernel_test)  # Test the model

        # accuracy_score expects (y_true, y_pred).
        acc_score.append(accuracy_score(y_test, pred_values))

    # Average over the folds that were actually evaluated.
    avg_acc_score = sum(acc_score) / len(acc_score)
    print('accuracy of each fold - {}'.format(acc_score))
    print('Avg accuracy : {}'.format(avg_acc_score))

In [11]:
# Run the classification. Arguments: the pickled patient dataframe, the GraKeL
# kernel class, the number of folds, and the fold type ('stratified' selects
# StratifiedKFold; any other value, such as 'kfolds', selects plain KFold).
# NOTE(review): k=553 presumably equals the number of patients, i.e.
# leave-one-out cross validation - TODO confirm.
graph_kernel_classification(
    path_to_dataframe=pd.read_pickle('C:/Users/niels/Downloads/patienten.pkl'),
    kernel_name=VertexHistogram,
    k=553,
    kfold_type='kfolds',
)

Computing kernel matrices
Classifying patients
accuracy of each fold - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0