In [95]:
import pickle as pkl 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import utils
import models
import kernels.KShortestPathKernel as KShortestPathKernel
import kernels.ShortestPathKernel as  ShortestPathKernel
import kernels.RandomWalkKernel as RandomWalkKernel
import kernels.WalkKernel as WalkKernel

In [96]:
path = 'data/'
with open(path + 'training_data.pkl', 'rb') as file: 
    train_graphs = pkl.load(file) 

with open(path + 'test_data.pkl', 'rb') as file: 
    test_graphs = pkl.load(file) 
    
with open(path + 'training_labels.pkl', 'rb') as file: 
    train_labels = pkl.load(file) 

In [97]:
for G in train_graphs:
    for e in G.nodes:
        G.nodes[e]['labels'] = [G.nodes[e]['labels'][0], 1]

for G in test_graphs:
    for e in G.nodes:
        G.nodes[e]['labels'] = [G.nodes[e]['labels'][0], 1]

for G in train_graphs:
    for e in G.edges:
        G.edges[e]['labels'] = G.edges[e]['labels'][0] + 1

for G in test_graphs:
    for e in G.edges:
        G.edges[e]['labels'] = G.edges[e]['labels'][0] + 1
        

In [98]:
import copy
def morgan_index(graphs):
    
    for (i,G) in enumerate(graphs):
        K = copy.deepcopy(G)
        for node in G.nodes:
            K.nodes[node]['labels'][1] = 0
            for x in G.neighbors(node):
                K.nodes[node]['labels'][1]  += G.nodes[x]['labels'][1]
        graphs[i] = K 

In [99]:
for i in range(1):
    morgan_index(train_graphs)
    morgan_index(test_graphs)

In [100]:
one_train = []
zero_train = []
for (i,G) in enumerate(train_graphs):
    if(train_labels[i] == 0):
        zero_train.append(G)
    else:
        one_train.append(G)    

In [101]:
print(len(zero_train))
print(len(one_train))

5445
555


In [102]:
n = len(zero_train)//9
  
# using list comprehension
chunck_train = [zero_train[i:i + n] for i in range(0, len(zero_train), n)]
#chunck_train[-2] += chunck_train[-1]
#chunck_train = chunck_train[:-1]

In [103]:
train_graphs = []
train_labels = []

for G in chunck_train:
    train_graphs.append(one_train + G)
    train_labels.append([1]*len(one_train) + [0]*len(G))

In [105]:
np.unique(train_labels[-1], return_counts= True)

(array([0, 1]), array([605, 555], dtype=int64))

In [59]:
randomWalk = WalkKernel.WalkKernel(maxK = 100)  
shortestPath = ShortestPathKernel.ShortestPath()
K_train = shortestPath.compute_kernel(train_graphs,train_graphs)  + randomWalk.compute_kernel(train_graphs,train_graphs)

  0%|          | 0/8 [00:00<?, ?it/s]


AttributeError: 'list' object has no attribute 'is_multigraph'

In [60]:
from scipy import optimize
from scipy.linalg import cho_factor, cho_solve
import numpy as np
import cvxopt
import cvxopt.solvers
from sklearn.base import BaseEstimator
def to_binary(y):
    return ((y + 1) / 2).astype(int)

def sigmoid(x):
    return 1/(1 + np.exp(-x))

class KernelSVC(BaseEstimator):
    
    def __init__(self, C, epsilon = 1e-3):
        self.type = 'non-linear'
        self.C = C                                     
        self.alpha = None
        self.epsilon = epsilon
        self.norm_f = None
        self.a = None
       
    
    def fit(self, K_train, y):
        

        y = np.array(y)
       
        #### You might define here any variable needed for the rest of the code
        N = len(y)
        
        K_train += 1   
        
        # Set up quadratic programming problem
        P = cvxopt.matrix(np.outer(y, y) * K_train)
        q = cvxopt.matrix(-1 * np.ones(N))
        G = cvxopt.matrix(np.vstack((-1 * np.eye(N), np.eye(N))))
        h = cvxopt.matrix(np.hstack((np.zeros(N), self.C * np.ones(N))))
        A = cvxopt.matrix(y.reshape(1, -1)) * 1.0
        b = cvxopt.matrix(np.zeros(1))
        # Solve the quadratic program using cvxopt       
        cvxopt.solvers.options['show_progress'] = True
        solution = cvxopt.solvers.qp(P, q, G, h, A, b)
        # Lagrange multipliers
        self.alpha = np.ravel(solution['x'])
        
        
        #clip
        self.alpha[self.alpha < 1e-5] = 0
        ## Assign the required attributes
        self.a = np.diag(y)@self.alpha 
        f = K_train@self.a
        mask = ((self.alpha < self.C) & (self.alpha > 0))
        self.b =  np.median((1 - y[mask]*f[mask])/y[mask]) #''' -----------------offset of the classifier------------------ '''
        self.norm_f = self.a.T@K_train@self.a   #'''------------------------RKHS norm of the function f ------------------------------'''
       

    ### Implementation of the separting function $f$ 
    def separating_function(self, K_test):
        # Input : matrix x of shape N data points times d dimension
        # Output: vector of size N
        K_test += 1
        return K_test@self.a
    
    
    def predict(self, K_test):
        """ Predict y values in {-1, 1} """
        d = self.separating_function(K_test)
        return 2 * ((d+self.b)> 0) - 1
    
    def predict_proba(self, X):
        d = self.separating_function(X)
        return sigmoid(d + self.b)

In [21]:
y_train = np.array(train_labels)
y_train = 2*y_train-1

In [24]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from tqdm import tqdm

c = np.arange(1,1.5,0.01)
scores = []
for C in tqdm(c):
    for i in range(2):
        s = []
        skf = KFold(n_splits=5, random_state=i, shuffle=True)
        score = cross_val_score(SVC(C = C, kernel='precomputed',  probability=True, class_weight='balanced'), K_train, train_labels, cv=skf, scoring = 'roc_auc')
        s.append(score.reshape(-1))
    scores.append(np.mean(s))

C =  c[np.argsort(scores)[-1]]
print(scores)
print(C,np.sort(scores)[-1])

  0%|          | 0/50 [00:00<?, ?it/s]


(800,) (800, 1000)


ValueError: operands could not be broadcast together with shapes (800,800) (800,1000) 

In [12]:
print(C,np.sort(scores)[-1])

1.4900000000000004 0.910652782919979


## Make Prediction

In [13]:
K_test = shortestPath.compute_kernel(test_graphs,train_graphs)  + randomWalk.compute_kernel(test_graphs,train_graphs)    

  7%|▋         | 147/2000 [00:02<00:27, 66.77it/s]


KeyboardInterrupt: 

In [None]:
y_train = train_labels
y_train = np.array(y_train).reshape(-1)
y_train = 2*y_train - 1 

In [None]:
np.unique(y_train,return_counts=True)

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
clf = SVC(C = C, kernel='precomputed',  probability=True)
#clf = models.KernelSVC(C=1)
# Fit on the train Kernel
clf.fit(K_train, y_train)

# Predict and test.
y_pred = clf.predict(K_train)

In [None]:
print("Classification accuracy: %0.2f" % accuracy_score(y_train, y_pred))

In [None]:
np.unique(y_pred,return_counts=True)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn import metrics
y_pred = clf.predict(K_train)
#tn, fp, fn, tp = confusion_matrix(y_train, y_pred,normalize='true').ravel()
confusion_matrix = confusion_matrix(y_train, y_pred,normalize='true')
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = [False, True])
cm_display.plot()
plt.show()

In [None]:
from sklearn.metrics import plot_roc_curve, auc 
plot_roc_curve(clf, K_train, y_train)

In [None]:
y_pred = clf.predict_proba(K_test)[:,1]
y_pred = np.log(y_pred/(1-y_pred))
y_pred

In [None]:
sub = pd.DataFrame()
sub['Id']= np.arange(1,len(y_pred)+1)
sub['Predicted'] = y_pred
sub.to_csv("submissions/sub.csv",index=False)

In [None]:
y_sub = pd.read_csv("submissions/sub.csv")
y_sub = np.array(y_sub['Predicted'])
y_sub[y_sub< 0 ] = 0
y_sub[y_sub>0] = 1
np.sum(y_sub)/len(y_sub)

In [None]:
y_sub = pd.read_csv("submissions/sub_089.csv")
y_sub = np.array(y_sub['Predicted'])
y_sub[y_sub< 0 ] = 0
y_sub[y_sub>0] = 1
np.sum(y_sub)/len(y_sub)