In [1]:
import pickle as pkl 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import utils
import models
import kernels.KShortestPathKernel as KShortestPathKernel
import kernels.ShortestPathKernel as  ShortestPathKernel
import kernels.RandomWalkKernel as RandomWalkKernel
import kernels.WalkKernel as WalkKernel

In [96]:
path = 'data/'
with open(path + 'training_data.pkl', 'rb') as file: 
    train_graphs = pkl.load(file) 

with open(path + 'test_data.pkl', 'rb') as file: 
    test_graphs = pkl.load(file) 
    
with open(path + 'training_labels.pkl', 'rb') as file: 
    train_labels = pkl.load(file) 

In [97]:
for G in train_graphs:
    for e in G.nodes:
        G.nodes[e]['labels'] = [G.nodes[e]['labels'][0], 1]

for G in test_graphs:
    for e in G.nodes:
        G.nodes[e]['labels'] = [G.nodes[e]['labels'][0], 1]

for G in train_graphs:
    for e in G.edges:
        G.edges[e]['labels'] = G.edges[e]['labels'][0] + 1

for G in test_graphs:
    for e in G.edges:
        G.edges[e]['labels'] = G.edges[e]['labels'][0] + 1
        

In [98]:
import copy
def morgan_index(graphs):
    
    for (i,G) in enumerate(graphs):
        K = copy.deepcopy(G)
        for node in G.nodes:
            K.nodes[node]['labels'][1] = 0
            for x in G.neighbors(node):
                K.nodes[node]['labels'][1]  += G.nodes[x]['labels'][1]
        graphs[i] = K 

In [99]:
for i in range(1):
    morgan_index(train_graphs)
    morgan_index(test_graphs)

In [100]:
one_train = []
zero_train = []
for (i,G) in enumerate(train_graphs):
    if(train_labels[i] == 0):
        zero_train.append(G)
    else:
        one_train.append(G)    

In [101]:
print(len(zero_train))
print(len(one_train))

5445
555


In [102]:
n = len(zero_train)//9
  
# using list comprehension
chunck_train = [zero_train[i:i + n] for i in range(0, len(zero_train), n)]
#chunck_train[-2] += chunck_train[-1]
#chunck_train = chunck_train[:-1]

In [103]:
train_graphs = []
train_labels = []

for G in chunck_train:
    train_graphs.append(one_train + G)
    train_labels.append([1]*len(one_train) + [0]*len(G))

In [105]:
np.unique(train_labels[-1], return_counts= True)

(array([0, 1]), array([605, 555], dtype=int64))

In [106]:
randomWalk = WalkKernel.WalkKernel(maxK = 100)  
shortestPath = ShortestPathKernel.ShortestPath()
K_train = []
for i in range(len(train_graphs)):
    K_train.append(shortestPath.compute_kernel(train_graphs[i],train_graphs[i])  + randomWalk.compute_kernel(train_graphs[i],train_graphs[i]))

100%|██████████| 1160/1160 [00:11<00:00, 101.23it/s]
100%|██████████| 1160/1160 [00:10<00:00, 114.03it/s]
100%|██████████| 1160/1160 [03:30<00:00,  5.52it/s]
100%|██████████| 1160/1160 [04:56<00:00,  3.92it/s]
100%|██████████| 1160/1160 [00:07<00:00, 147.65it/s]
100%|██████████| 1160/1160 [00:07<00:00, 145.72it/s]
100%|██████████| 1160/1160 [03:16<00:00,  5.89it/s]
100%|██████████| 1160/1160 [04:03<00:00,  4.76it/s]
100%|██████████| 1160/1160 [00:05<00:00, 193.93it/s]
100%|██████████| 1160/1160 [00:07<00:00, 151.04it/s]
100%|██████████| 1160/1160 [03:12<00:00,  6.01it/s]
100%|██████████| 1160/1160 [02:37<00:00,  7.37it/s]
100%|██████████| 1160/1160 [00:03<00:00, 327.26it/s]
100%|██████████| 1160/1160 [00:03<00:00, 332.18it/s]
100%|██████████| 1160/1160 [01:34<00:00, 12.28it/s]
100%|██████████| 1160/1160 [02:11<00:00,  8.79it/s]
100%|██████████| 1160/1160 [00:02<00:00, 406.84it/s]
100%|██████████| 1160/1160 [00:02<00:00, 407.63it/s]
100%|██████████| 1160/1160 [01:19<00:00, 14.55it/s]
10

In [107]:
from scipy import optimize
from scipy.linalg import cho_factor, cho_solve
import numpy as np
import cvxopt
import cvxopt.solvers
from sklearn.base import BaseEstimator
def to_binary(y):
    return ((y + 1) / 2).astype(int)

def sigmoid(x):
    return 1/(1 + np.exp(-x))

class KernelSVC(BaseEstimator):
    
    def __init__(self, C, epsilon = 1e-3):
        self.type = 'non-linear'
        self.C = C                                     
        self.alpha = None
        self.epsilon = epsilon
        self.norm_f = None
        self.a = None
       
    
    def fit(self, K_train, y):
        

        y = np.array(y)
       
        #### You might define here any variable needed for the rest of the code
        N = len(y)
        
        K_train += 1   
        
        # Set up quadratic programming problem
        P = cvxopt.matrix(np.outer(y, y) * K_train)
        q = cvxopt.matrix(-1 * np.ones(N))
        G = cvxopt.matrix(np.vstack((-1 * np.eye(N), np.eye(N))))
        h = cvxopt.matrix(np.hstack((np.zeros(N), self.C * np.ones(N))))
        A = cvxopt.matrix(y.reshape(1, -1)) * 1.0
        b = cvxopt.matrix(np.zeros(1))
        # Solve the quadratic program using cvxopt       
        cvxopt.solvers.options['show_progress'] = True
        solution = cvxopt.solvers.qp(P, q, G, h, A, b)
        # Lagrange multipliers
        self.alpha = np.ravel(solution['x'])
        
        
        #clip
        self.alpha[self.alpha < 1e-5] = 0
        ## Assign the required attributes
        self.a = np.diag(y)@self.alpha 
        f = K_train@self.a
        mask = ((self.alpha < self.C) & (self.alpha > 0))
        self.b =  np.median((1 - y[mask]*f[mask])/y[mask]) #''' -----------------offset of the classifier------------------ '''
        self.norm_f = self.a.T@K_train@self.a   #'''------------------------RKHS norm of the function f ------------------------------'''
       

    ### Implementation of the separting function $f$ 
    def separating_function(self, K_test):
        # Input : matrix x of shape N data points times d dimension
        # Output: vector of size N
        K_test += 1
        return K_test@self.a
    
    
    def predict(self, K_test):
        """ Predict y values in {-1, 1} """
        d = self.separating_function(K_test)
        return 2 * ((d+self.b)> 0) - 1
    
    def predict_proba(self, X):
        d = self.separating_function(X)
        return sigmoid(d + self.b)

In [21]:
y_train = np.array(train_labels)
y_train = 2*y_train-1

In [118]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from tqdm import tqdm
list_c = []
for l in range(len(K_train)):
    c = np.arange(0.1,1.5,0.1)
    scores = []
    for C in tqdm(c):
        for i in range(2):
            s = []
            skf = KFold(n_splits=5, random_state=i, shuffle=True)
            score = cross_val_score(SVC(C = C, kernel='precomputed',  probability=True, class_weight='balanced'), K_train[l], train_labels[l], cv=skf, scoring = 'roc_auc')
            s.append(score.reshape(-1))
        scores.append(np.mean(s))
    print(c[np.argsort(scores)[-1]])
    list_c.append(c[np.argsort(scores)[-1]])
    print(list_c,np.sort(scores)[-1])

100%|██████████| 14/14 [00:10<00:00,  1.35it/s]


0.9
[0.8909623095733232, 0.9025810982317983, 0.9096969013300319, 0.9147955154474771, 0.9185342694224173, 0.9200289750265094, 0.921283262395562, 0.92146344191315, 0.92174792522001, 0.9215206941585775, 0.9211009990596821, 0.9204729741385064, 0.9196650259321826, 0.9193084448432429]
[0.9] 0.92174792522001


100%|██████████| 14/14 [00:08<00:00,  1.68it/s]


0.8
[0.907620464119347, 0.9248090791784946, 0.9329742279797006, 0.938269564549078, 0.9416949928604001, 0.9433247014253799, 0.943607359534448, 0.9440843545162773, 0.9438293618905547, 0.9436316501066596, 0.9434495324567076, 0.9434176089699413, 0.9434019665749844, 0.9434328722489779]
[0.9, 0.8] 0.9440843545162773


100%|██████████| 14/14 [00:08<00:00,  1.69it/s]


0.8
[0.8683816770707686, 0.8862686471215167, 0.8945688115790815, 0.8977265826266312, 0.8992211109234738, 0.9007487404843608, 0.9007199979758209, 0.9009587495566803, 0.9007054375587009, 0.9005579716148988, 0.9003808842073493, 0.9004271503774361, 0.8996362304018861, 0.9000434629874279]
[0.9, 0.8, 0.8] 0.9009587495566803


100%|██████████| 14/14 [00:08<00:00,  1.58it/s]


0.6
[0.8688688173384085, 0.8885923035099046, 0.8944303124381576, 0.8961734935046655, 0.8968392000897424, 0.8976372044009511, 0.8968094155749551, 0.8969247533132879, 0.8960525350979118, 0.8954812233485716, 0.8948060967635734, 0.894250568447789, 0.8938596924565996, 0.8938727060453642]
[0.9, 0.8, 0.8, 0.6] 0.8976372044009511


100%|██████████| 14/14 [00:08<00:00,  1.61it/s]


0.4
[0.9058923577744536, 0.9205263764712901, 0.9243216974670553, 0.9251433212445827, 0.924933035220253, 0.9248112885600344, 0.9239744954277025, 0.923750074549089, 0.9234496100115006, 0.9227895842382287, 0.9220988516763693, 0.9221277908208017, 0.9218737480023128, 0.9215299254644437]
[0.9, 0.8, 0.8, 0.6, 0.4] 0.9251433212445827


100%|██████████| 14/14 [00:08<00:00,  1.60it/s]


1.0
[0.8988503874343365, 0.9167435730075008, 0.9226654706101998, 0.9252808056653746, 0.9269571642806467, 0.9283760944517567, 0.9283902909171102, 0.9290189057684974, 0.9292410723677783, 0.9295092673279501, 0.9294020360252508, 0.9294174330372298, 0.9288344383880757, 0.9285066824630073]
[0.9, 0.8, 0.8, 0.6, 0.4, 1.0] 0.9295092673279501


100%|██████████| 14/14 [00:07<00:00,  1.77it/s]


0.6
[0.9043726120879411, 0.9206562790498841, 0.9252386085208342, 0.9258236769103148, 0.9256926610879594, 0.925840412664732, 0.9252867299940288, 0.9240001152047199, 0.9231599830390657, 0.9227108462376432, 0.9221576982751843, 0.9217249546323002, 0.9223207327920298, 0.9219312823319885]
[0.9, 0.8, 0.8, 0.6, 0.4, 1.0, 0.6] 0.925840412664732


100%|██████████| 14/14 [00:07<00:00,  1.87it/s]


0.9
[0.9566041315326178, 0.9624177225300977, 0.9649383201432562, 0.9660886579038973, 0.966371332536945, 0.9669087646117702, 0.9671324915599422, 0.9670419860646968, 0.9674897046269837, 0.9673692346203063, 0.9671004465487998, 0.9671295981546411, 0.9670549598643067, 0.9670398935651109]
[0.9, 0.8, 0.8, 0.6, 0.4, 1.0, 0.6, 0.9] 0.9674897046269837


100%|██████████| 14/14 [00:07<00:00,  1.77it/s]

0.9
[0.9187481577156784, 0.9304454693665507, 0.9366706653555577, 0.9399433135480122, 0.9417815480721708, 0.9426911012511244, 0.943720962680531, 0.9441974179054384, 0.9444647044475708, 0.944387891941029, 0.944075077220975, 0.9438340177801001, 0.9437122889961314, 0.9433690635378292]
[0.9, 0.8, 0.8, 0.6, 0.4, 1.0, 0.6, 0.9, 0.9] 0.9444647044475708





In [120]:
print(C,np.sort(scores)[-1])

1.4000000000000001 0.9444647044475708


In [119]:
list_c

[0.9, 0.8, 0.8, 0.6, 0.4, 1.0, 0.6, 0.9, 0.9]

## Make Prediction

In [121]:
K_test = []
for i in  range(len(train_graphs)):
    K_test.append(shortestPath.compute_kernel(test_graphs,train_graphs[i])  + randomWalk.compute_kernel(test_graphs,train_graphs[i]))    

100%|██████████| 2000/2000 [00:21<00:00, 94.59it/s] 
100%|██████████| 1160/1160 [00:08<00:00, 142.08it/s]
100%|██████████| 2000/2000 [06:19<00:00,  5.27it/s]
100%|██████████| 2000/2000 [08:36<00:00,  3.87it/s]
100%|██████████| 2000/2000 [00:18<00:00, 106.41it/s]
100%|██████████| 1160/1160 [00:07<00:00, 158.63it/s]
100%|██████████| 2000/2000 [06:10<00:00,  5.39it/s]
100%|██████████| 2000/2000 [08:35<00:00,  3.88it/s]
100%|██████████| 2000/2000 [00:19<00:00, 101.92it/s]
100%|██████████| 1160/1160 [00:04<00:00, 266.61it/s]
100%|██████████| 2000/2000 [05:32<00:00,  6.02it/s]
100%|██████████| 2000/2000 [07:34<00:00,  4.40it/s]
100%|██████████| 2000/2000 [00:18<00:00, 110.67it/s]
100%|██████████| 1160/1160 [00:03<00:00, 341.24it/s]
100%|██████████| 2000/2000 [04:58<00:00,  6.70it/s]
100%|██████████| 2000/2000 [06:33<00:00,  5.08it/s]
100%|██████████| 2000/2000 [00:18<00:00, 109.94it/s]
100%|██████████| 1160/1160 [00:02<00:00, 420.14it/s]
100%|██████████| 2000/2000 [04:48<00:00,  6.93it/s]
10

In [138]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
y_pred = []
for i in range(len(train_graphs)):
    clf = SVC(C = list_c[i], kernel='precomputed',  probability=True)
    #clf = models.KernelSVC(C=1)
    # Fit on the train Kernel
    clf.fit(K_train[i], train_labels[i])

    # Predict and test.
    y = clf.predict_proba(K_test[i])[:,1]
    #y = np.log(y/(1-y))
    y_pred.append(y)

In [139]:
y_pred = np.array(y_pred)
y_pred = np.mean(y_pred,axis=0)
y_pred.shape

(2000,)

In [143]:
y_pred = np.log(y_pred/(1-y_pred))

In [144]:
sub = pd.DataFrame()
sub['Id']= np.arange(1,len(y_pred)+1)
sub['Predicted'] = y_pred
sub.to_csv("submissions/sub.csv",index=False)

In [145]:
y_sub = pd.read_csv("submissions/sub.csv")
y_sub = np.array(y_sub['Predicted'])
y_sub[y_sub< 0 ] = 0
y_sub[y_sub>0] = 1
np.sum(y_sub)/len(y_sub)

0.4745

In [146]:
y_sub = pd.read_csv("submissions/sub_089.csv")
y_sub = np.array(y_sub['Predicted'])
y_sub[y_sub< 0 ] = 0
y_sub[y_sub>0] = 1
np.sum(y_sub)/len(y_sub)

0.111

In [32]:
y_bootstrap = pd.read_csv("submissions/sub_bootstrap.csv")
y_best_weights = pd.read_csv("submissions/sub_best_weights.csv")
y_best_no_weights = pd.read_csv("submissions/sub_best_no_weights.csv")

In [33]:
y_bootstrap = np.array(y_bootstrap['Predicted'])
y_best_weights = np.array(y_best_weights['Predicted'])
y_best_no_weights = np.array(y_best_no_weights['Predicted'])

In [46]:
y_bootstrap_labels = np.where(y_bootstrap<2.0556, 0, 1)
y_best_weights_labels = np.where(y_best_weights<0, 0, 1)
y_best_no_weights_labels = np.where(y_best_no_weights<0, 0, 1)

In [47]:
print(np.unique(y_bootstrap_labels,return_counts=True))
print(np.unique(y_best_weights_labels,return_counts=True))
print(np.unique(y_best_weights_labels,return_counts=True))

(array([0, 1]), array([1776,  224], dtype=int64))
(array([0, 1]), array([1777,  223], dtype=int64))
(array([0, 1]), array([1777,  223], dtype=int64))
