In [6]:
import numpy as np
import matplotlib.pyplot as plt
import pickle
from scipy import optimize
import networkx as nx
from hashlib import blake2b
from collections import Counter, defaultdict
import itertools
import time
import pandas as pd
from sklearn.metrics import accuracy_score

In [2]:
with open("../data-challenge-kernel-methods-2022-2023/training_data.pkl", "rb") as f:
    train_data = pickle.load(f)

with open("../data-challenge-kernel-methods-2022-2023/training_labels.pkl", "rb") as f:
    train_labels = pickle.load(f)

with open("../data-challenge-kernel-methods-2022-2023/test_data.pkl", "rb") as f:
    test_data = pickle.load(f)

In [3]:
train_data = np.array(train_data, dtype=object)
train_labels = np.array(train_labels, dtype=float)
test_data = np.array(test_data, dtype=object)

In [4]:
train_labels_svm = train_labels.copy()
train_labels_svm[train_labels_svm == 0] = -1
np.unique(train_labels_svm)

array([-1.,  1.])

In [5]:
class WLKernel:
    def __init__(self, edge_attr="labels", node_attr="labels", iterations=3):
        self.edge_attr = edge_attr
        self.node_attr = node_attr
        self.n_iter = iterations
    
    def _hash_label(self, label, digest_size):
        return blake2b(label.encode("ascii"), digest_size=digest_size).hexdigest()

    def _neighborhood_aggregate(self, G, node, node_labels):
        """
        Compute new labels for given node by aggregating
        the labels of each node's neighbors.
        """
        label_list = []
        for nbr in G.neighbors(node):
            prefix = "" if self.edge_attr is None else str(G[node][nbr][self.edge_attr])
            label_list.append(prefix + node_labels[nbr])
        return node_labels[node] + "".join(sorted(label_list))

    def weisfeiler_lehman_graph_hash(self, G, digest_size=16):
        def weisfeiler_lehman_step(G, labels):
            """
            Apply neighborhood aggregation to each node
            in the graph.
            Computes a dictionary with labels for each node.
            """
            new_labels = {}
            for node in G.nodes():
                label = self._neighborhood_aggregate(G, node, labels)
                new_labels[node] = self._hash_label(label, digest_size)
            return new_labels

        # set initial node labels
        node_labels = {u: str(dd[self.node_attr]) for u, dd in G.nodes(data=True)}

        subgraph_hash_counts = {}
        for it in range(self.n_iter):
            node_labels = weisfeiler_lehman_step(G, node_labels)
            counter = Counter(node_labels.values())
            # normalize counter
            total = np.sum(list(counter.values()))
            for k in counter:
                counter[k] /= total

            # sort the counter, extend total counts
            subgraph_hash_counts[it] = sorted(counter.items(), key=lambda x: x[0])

        # return _hash_label(str(tuple(subgraph_hash_counts)), digest_size)
        return subgraph_hash_counts
    
    
    def compute_phi(self, Z):
        phi_list = []
        for g in Z:
            phi_list.append(self.weisfeiler_lehman_graph_hash(g))
        return phi_list
    
    def compute_kernel(self, wl1, wl2):
        k = 0
        for i in range(self.n_iter):
            dict1 = dict(wl1[i])
            dict2 = dict(wl2[i])
            # take scalar product only on common keys
            common_keys = set(dict1.keys()).intersection(set(dict2.keys()))
            k += np.sum([dict1[c]*dict2[c] for c in common_keys])
        return k

    def compute_kernel_matrix(self, X, Y):
        # Precompute phi to deal only with dot products
        phi_X = self.compute_phi(X)
        if np.array_equal(X, Y):
            print("Not computing phi again as X=Y")
            phi_Y = phi_X.copy()
        else:
            phi_Y = self.compute_phi(Y)
        ker = np.zeros((len(X), len(Y)))
        count_iter = 0
        if len(X) == len(Y):
            for i in range(len(X)):
                for j in range(i, len(Y)):
                    ker[i, j] = self.compute_kernel(phi_X[i], phi_Y[j])
                    ker[j, i] = ker[i,j]
                count_iter += 1
                if count_iter % 100 == 0:
                    print(f"Iteration {count_iter}")
        else:
            for (i,j) in itertools.product(range(len(X)), range(len(Y))):
                ker[i,j] = self.compute_kernel(phi_X[i], phi_Y[j])
        print("Kernel computed")
        return ker

In [100]:
class KernelPerceptron:
    
    def __init__(self, kernel_mat, epsilon = 1e-3, n_iter=100):
        self.kernel = kernel_mat        
        self.norm_f = None
        self.alpha = None
        self.n_iter = n_iter
        self.training_data = None
        self.accuracy = 0
        self.opt_alpha = None
        self.acc_list = []
        
    def fit(self, X, y, K=None, valX=None, valY=None, K_val=None):
       #### You might define here any variable needed for the rest of the code
        N = len(y)
        if K is None:
            K = self.kernel(X, X)
        self.alpha = np.zeros(N)
        
        for it in range(self.n_iter):
            y_pred = np.sign(self.alpha.T@K)
            for i in range(N):
                if y_pred[i] == y[i]:
                    continue
                self.alpha[i] += y[i]
            if valX is not None:
                y_pred_val = np.sign(self.alpha.T@K_val)
                acc = accuracy_score(y_pred_val, valY)
                self.acc_list.append(acc)
                if acc > self.accuracy:
                    self.accuracy = acc
                    self.opt_alpha = self.alpha.copy()
                    
        self.training_data = X
        
    
    def predict(self, X, K=None):
        """ Predict y values in {-1, 1} """
        if K is None:
            K = self.kernel(self.training_data, X)
        return np.sign(self.opt_alpha.T@K)

In [None]:
class KernelLogisticRegression:
    def __init__(self, kernel_mat, n_iter=100, regul=None):
        self.n_iter = n_iter
        self.regul = regul
        self.alpha = None
        self.kernel = kernel_mat        

    
    def loss(self, k, y):
        prob = 1/(1 + np.exp(-self.alpha.T@k))
        loss = 1/len(y)*((prob >= .5) - y)**2
        if self.regul is not None:
            loss += self.regul*np.linalg.norm(self.alpha)
        return pred, loss
    
    def grad_loss(self, pred, real, k):
        back = -2*pred*(pred - real)*k*np.exp(self.alpha.T@k)
        if self.regul is not None:
            back += 2*self.regul*self.alpha
        
        return back
        
        
        
    
    def fit(self, X, y, K=None, batch_size=16):
        # Initialize params
        self.alpha = np.zeros(len(X))
        
        # Init K in case
        if K is None:
            K = self.kernel(X, X)
        
        for it in range(self.n_iter):
            b in range(len(X)//batch_size):
                X_batch = X[b*batch_size: (b+1)*batch_size]
                y_batch = y[b*batch_size: (b+1)*batch_size]
                K_batch = K[:, b*batch_size: (b+1)*batch_size]
                pred_batch, loss_batch = self.loss(K_batch ,y_batch)
                self.alpha = self.alpha
                
                
                
            
        

In [101]:
kernel_mat = WLKernel(iterations=5).compute_kernel_matrix

In [102]:
K = np.load("../matrices/WL_kernel_train_5it.npy")

In [103]:
idx_train = np.sort(np.random.choice(np.arange(len(train_data)), size=len(train_data)*80//100, replace=False))
idx_val = np.setdiff1d(np.arange(len(train_data)), idx_train)
train_data[idx_train].shape, train_data[idx_val].shape

((4800,), (1200,))

In [117]:
K_train = K[idx_train, :][:, idx_train]
K_val = K[idx_train, :][:, idx_val]
K_val.shape, K_train.shape

((4800, 1200), (4800, 4800))

In [118]:
model = KernelPerceptron(kernel_mat=kernel_mat, n_iter=1000)
model.fit(train_data[idx_train], train_labels_svm[idx_train], K_train, train_data[idx_val], train_labels_svm[idx_val], K_val)

In [119]:
model.acc_list

[0.9075,
 0.9075,
 0.9075,
 0.9058333333333334,
 0.8725,
 0.9125,
 0.8508333333333333,
 0.905,
 0.8766666666666667,
 0.9083333333333333,
 0.8241666666666667,
 0.905,
 0.9233333333333333,
 0.8633333333333333,
 0.9066666666666666,
 0.8408333333333333,
 0.905,
 0.9008333333333334,
 0.9275,
 0.8408333333333333,
 0.9066666666666666,
 0.8891666666666667,
 0.9208333333333333,
 0.7991666666666667,
 0.9058333333333334,
 0.9083333333333333,
 0.85,
 0.9075,
 0.8425,
 0.9066666666666666,
 0.8625,
 0.9125,
 0.8316666666666667,
 0.9058333333333334,
 0.91,
 0.9166666666666666,
 0.905,
 0.9225,
 0.8583333333333333,
 0.9066666666666666,
 0.8,
 0.9058333333333334,
 0.9133333333333333,
 0.83,
 0.905,
 0.8841666666666667,
 0.9191666666666667,
 0.8158333333333333,
 0.9066666666666666,
 0.9233333333333333,
 0.9025,
 0.9241666666666667,
 0.8491666666666666,
 0.9058333333333334,
 0.8408333333333333,
 0.9058333333333334,
 0.875,
 0.9183333333333333,
 0.8066666666666666,
 0.9066666666666666,
 0.9233333333333333

In [121]:
np.sum(model.opt_alpha != model.alpha)

1656

In [122]:
val = model.predict(train_data[idx_val], K_val)

In [123]:
np.unique(val, return_counts=True)

(array([-1.,  0.,  1.]), array([1132,    6,   62]))

In [124]:
from sklearn.metrics import classification_report
print(classification_report(train_labels_svm[idx_val], val))

              precision    recall  f1-score   support

        -1.0       0.95      0.98      0.96      1095
         0.0       0.00      0.00      0.00         0
         1.0       0.69      0.41      0.51       105

    accuracy                           0.93      1200
   macro avg       0.55      0.46      0.49      1200
weighted avg       0.92      0.93      0.92      1200



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [125]:
K_test = np.load('../matrices/WL_kernel_test_10it.npy')
K_test.shape

(2000, 6000)

In [126]:
test = model.predict(test_data, K_test.T[idx_train, :])

In [127]:
np.unique(test, return_counts=True)

(array([-1.,  0.,  1.]), array([1832,    5,  163]))

In [101]:
test_preds = test.copy()
test_preds[test_preds == -1] = 0
Yte = {'Predicted' : test_preds}
dataframe = pd.DataFrame(Yte) 
dataframe.index += 1 
dataframe.to_csv('test_pred.csv',index_label='Id')

In [102]:
dataframe

Unnamed: 0,Predicted
1,0.0
2,0.0
3,0.0
4,0.0
5,1.0
...,...
1996,0.0
1997,0.0
1998,0.0
1999,1.0
