In [1]:
import pickle as pkl 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import utils
import models
import kernels.KShortestPathKernel as KShortestPathKernel
import kernels.ShortestPathKernel as  ShortestPathKernel
import kernels.RandomWalkKernel as RandomWalkKernel
import kernels.WalkKernel as WalkKernel

In [2]:
path = 'data/'
with open(path + 'training_data.pkl', 'rb') as file: 
    train_graphs = pkl.load(file) 

with open(path + 'test_data.pkl', 'rb') as file: 
    test_graphs = pkl.load(file) 
    
with open(path + 'training_labels.pkl', 'rb') as file: 
    train_labels = pkl.load(file) 

In [3]:
for G in train_graphs:
    for e in G.nodes:
        G.nodes[e]['labels'] = [G.nodes[e]['labels'][0], 1]

for G in test_graphs:
    for e in G.nodes:
        G.nodes[e]['labels'] = [G.nodes[e]['labels'][0], 1]

for G in train_graphs:
    for e in G.edges:
        G.edges[e]['labels'] = G.edges[e]['labels'][0] + 1

for G in test_graphs:
    for e in G.edges:
        G.edges[e]['labels'] = G.edges[e]['labels'][0] + 1
        

In [4]:
import copy
def morgan_index(graphs):
    
    for (i,G) in enumerate(graphs):
        K = copy.deepcopy(G)
        for node in G.nodes:
            K.nodes[node]['labels'][1] = 0
            for x in G.neighbors(node):
                K.nodes[node]['labels'][1]  += G.nodes[x]['labels'][1]
        graphs[i] = K 

In [5]:
for i in range(1):
    morgan_index(train_graphs)
    morgan_index(test_graphs)

In [6]:
one_train = []
zero_train = []
for (i,G) in enumerate(train_graphs):
    if(train_labels[i] == 0):
        zero_train.append(G)
    else:
        one_train.append(G)    

In [7]:
n = 6000
train_graphs = one_train
train_labels = [1]*len(one_train)
for G in zero_train:
    if(len(train_graphs) == n):
        break
    train_graphs.append(G)
    train_labels.append(0)


In [8]:
zero_train = zero_train[n:]
y_zeros = np.zeros(len(zero_train))

In [9]:
len(zero_train)

0

In [10]:
randomWalk = WalkKernel.WalkKernel(maxK = 100)  
shortestPath = ShortestPathKernel.ShortestPath()

#K_train = shortestPath.compute_kernel(train_graphs,train_graphs)  + randomWalk.compute_kernel(train_graphs,train_graphs)


In [11]:
from numpy import loadtxt
K_train = loadtxt('precomputed_kernels/train_morgan_index_1.csv', delimiter=',')
K_test = loadtxt('precomputed_kernels/test_morgan_index_1.csv', delimiter=',')

In [12]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from tqdm import tqdm

c = np.arange(1,1.5,0.1)
scores = []
for C in tqdm(c):
    for i in range(2):
        s = []
        skf = KFold(n_splits=5, random_state=i, shuffle=True)
        score = cross_val_score(SVC(C = C, kernel='precomputed',  probability=True, class_weight='balanced'), K_train, train_labels, cv=skf, scoring = 'roc_auc')
        s.append(score.reshape(-1))
    scores.append(np.mean(s))

C =  c[np.argsort(scores)[-1]]
print(scores)
print(C,np.sort(scores)[-1])

100%|██████████| 5/5 [00:55<00:00, 11.09s/it]

[0.9205736576664465, 0.920352571352263, 0.920143916517009, 0.9197528703370784, 0.9198265260461825]
1.0 0.9205736576664465





In [13]:
print(C,np.sort(scores)[-1])

1.0 0.9205736576664465


## Make Prediction

In [14]:
#K_test = shortestPath.compute_kernel(test_graphs,train_graphs)  + randomWalk.compute_kernel(test_graphs,train_graphs) 

In [15]:
y_train = train_labels
y_train = np.array(y_train).reshape(-1)
y_train = 2*y_train - 1 

In [16]:
np.unique(y_train,return_counts=True)

(array([-1,  1]), array([5445,  555], dtype=int64))

In [17]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
#clf = SVC(C = C, kernel='precomputed',  probability=True, class_weight='balanced')
clf = models.KernelSVC(C = 1)
# Fit on the train Kernel
clf.fit(K_train, y_train)

# Predict and test.
y_pred = clf.predict(K_train)

print("Classification accuracy: %0.2f" % accuracy_score(y_train, y_pred))

     pcost       dcost       gap    pres   dres
 0: -4.9040e+02 -1.0925e+04  5e+04  2e+00  2e-13
 1: -4.1242e+02 -5.6474e+03  7e+03  2e-01  3e-13
 2: -4.0959e+02 -1.3966e+03  1e+03  2e-02  2e-13
 3: -4.4506e+02 -7.6572e+02  3e+02  4e-03  2e-13
 4: -4.6622e+02 -5.9521e+02  1e+02  9e-04  2e-13
 5: -4.7798e+02 -5.2102e+02  4e+01  7e-14  3e-13
 6: -4.8309e+02 -4.9817e+02  2e+01  5e-14  3e-13
 7: -4.8532e+02 -4.9011e+02  5e+00  1e-13  3e-13
 8: -4.8620e+02 -4.8749e+02  1e+00  3e-13  2e-13
 9: -4.8646e+02 -4.8683e+02  4e-01  8e-13  3e-13
10: -4.8655e+02 -4.8661e+02  6e-02  6e-13  3e-13
11: -4.8657e+02 -4.8657e+02  2e-03  6e-14  3e-13
12: -4.8657e+02 -4.8657e+02  2e-04  3e-13  3e-13
Optimal solution found.
Classification accuracy: 0.99


In [18]:
y_pred = clf.predict_proba(K_test)
y_pred = np.log(y_pred/(1-y_pred))
y_pred

array([-0.90894634, -0.24840027, -0.89220016, ..., -0.92396126,
        0.75024512, -1.029548  ])

In [19]:
sub = pd.DataFrame()
sub['Id']= np.arange(1,len(y_pred)+1)
sub['Predicted'] = y_pred
sub.to_csv("submissions/sub_test.csv",index=False)

In [23]:
y_sub = pd.read_csv("new_submissions/sub_full_dataset.csv")
y_sub = np.array(y_sub['Predicted'])
y_sub[y_sub< 0 ] = 0
y_sub[y_sub>0] = 1
np.sum(y_sub)/len(y_sub)

0.015

In [21]:
y_sub = pd.read_csv("submissions/sub_test.csv")
y_sub = np.array(y_sub['Predicted'])
y_sub[y_sub< 0 ] = 0
y_sub[y_sub>0] = 1
np.sum(y_sub)/len(y_sub)

0.1255

In [22]:
%pip freeze > requirement.txt

Note: you may need to restart the kernel to use updated packages.
