In [1]:
import numpy as np
import random
import cv2
import os
from imutils import paths
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
import pandas as pd

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Activation, Flatten, Dense, Dropout, SimpleRNN
from tensorflow.keras.optimizers import SGD
from tensorflow.keras import backend as K
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
#binarize the labels
lb = LabelBinarizer()

import pandas as pd

data_files = ['cle_train.csv','cle_test.csv','hun_train.csv','hun_test.csv','swi_train.csv','swi_test.csv','vir_train.csv','vir_test.csv']

datasets = {}

for file in data_files:
    data = pd.read_csv('../TrainTestData/' + file)
    
    X = data.iloc[:, :-1]
    Y = data.iloc[:, -1]
    
    Y_binary = Y.apply(lambda x: 1 if x > 0 else 0)
    
    # Extract the name from the file path
    name = file.split('.')[0]
    
    # Store the dataset components in a dictionary
    datasets[name] = {'X': X, 'Y': Y, 'Y_binary': Y_binary}

# Unpack the dictionary values in a loop
variables = ['cle', 'hun', 'swi', 'vir']
train_test = ['train', 'test']

for var in variables:
    for tt in train_test:
        X, Y, Y_binary = datasets[f'{var}_{tt}'].values()
        globals()[f'{var}_X_{tt}'] = X
        globals()[f'{var}_Y_{tt}'] = Y
        globals()[f'{var}_Y_{tt}_binary'] = Y_binary

In [3]:
X_test = pd.concat([cle_X_test,hun_X_test,swi_X_test,vir_X_test])
y_test = pd.concat([cle_Y_test_binary,hun_Y_test_binary,swi_Y_test_binary,vir_Y_test_binary])

X_train = pd.concat([cle_X_train,hun_X_train,swi_X_train,vir_X_train])
y_train = pd.concat([cle_Y_train_binary,hun_Y_train_binary,swi_Y_train_binary,vir_Y_train_binary])

In [4]:
def create_clients():
    cle_zip = list(zip(cle_X_train.values,cle_Y_train_binary))
    hun_zip = list(zip(hun_X_train.values,hun_Y_train_binary))
    vir_zip = list(zip(vir_X_train.values,vir_Y_train_binary))
    swi_zip = list(zip(swi_X_train.values,swi_Y_train_binary))
    
    shards = [cle_zip, hun_zip, vir_zip,swi_zip]
    client_names = ["client_1","client_2","client_3","client_4"]
    dic = {client_names[i] : shards[i] for i in range(len(client_names))}
    return dic


def batch_data(data_shard, bs=32):
    '''Takes in a clients data shard and create a tfds object off it
    args:
        shard: a data, label constituting a client's data shard
        bs:batch size
    return:
        tfds object'''
    #seperate shard into data and labels lists
    data, label = zip(*data_shard)
    dataset = tf.data.Dataset.from_tensor_slices((list(data), list(label)))
    return dataset.shuffle(len(label)).batch(bs)


class DNN:
    @staticmethod
    def build(shape, classes):
        model = Sequential()
        model.add(Dense(64, input_shape=(20,), activation='relu'))
        model.add(Dense(128, activation='relu'))
        model.add(Dense(256, activation='relu'))
        model.add(Dense(2, activation='sigmoid'))

        model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model
    

def weight_scalling_factor(clients_trn_data, client_name):
    client_names = list(clients_trn_data.keys())
    #get the bs
    bs = list(clients_trn_data[client_name])[0][0].shape[0]
    #first calculate the total training data points across clinets
    global_count = sum([tf.data.experimental.cardinality(clients_trn_data[client_name]).numpy() for client_name in client_names])*bs
    # get the total number of data points held by a client
    local_count = tf.data.experimental.cardinality(clients_trn_data[client_name]).numpy()*bs
    return local_count/global_count


def scale_model_weights(weight, scalar):
    '''function for scaling a models weights'''
    weight_final = []
    steps = len(weight)
    for i in range(steps):
        weight_final.append(scalar * weight[i])
    return weight_final


def sum_scaled_weights(scaled_weight_list):
    '''Return the sum of the listed scaled weights. The is equivalent to scaled avg of the weights'''
    avg_grad = list()
    #get the average grad accross all client gradients
    for grad_list_tuple in zip(*scaled_weight_list):
        layer_mean = tf.math.reduce_sum(grad_list_tuple, axis=0)
        avg_grad.append(layer_mean)
        
    return avg_grad


def test_model(X_test, Y_test,  model, comm_round):
    cce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    #logits = model.predict(X_test, batch_size=100)
    logits = model.predict(X_test)
    length = len(y_test)
    Y_test = tf.reshape(Y_test,(length,1))
    loss = cce(Y_test, logits)
    acc = accuracy_score(tf.argmax(logits, axis=1), Y_test)
    print('comm_round: {} | global_acc: {:.3%} | global_loss: {}'.format(comm_round, acc, loss))
    return acc, loss

In [5]:
#create clients
clients = create_clients()

#process and batch the training data for each client
clients_batched = dict()
for (client_name, data) in clients.items():
    clients_batched[client_name] = batch_data(data)
    
#process and batch the test set  
test_batched = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(len(y_test))

comms_round = 100
    
#create optimizer
lr = 0.01 
loss='sparse_categorical_crossentropy'
metrics = ['accuracy']
optimizer = tf.keras.optimizers.legacy.SGD(lr=lr, decay=lr / comms_round, momentum=0.9) 

#initialize global model
smlp_global = DNN()
global_model = smlp_global.build(20, 2)
        
#commence global training loop
for comm_round in range(comms_round):
            
    # get the global model's weights - will serve as the initial weights for all local models
    global_weights = global_model.get_weights()
    
    #initial list to collect local model weights after scalling
    scaled_local_weight_list = list()

    #randomize client data - using keys
    client_names= list(clients_batched.keys())
    random.shuffle(client_names)
    
    #loop through each client and create new local model
    for client in client_names:
        smlp_local = DNN()
        local_model = smlp_local.build(20, 2)
        local_model.compile(loss=loss, 
                      optimizer=optimizer, 
                      metrics=metrics)
        
        #set local model weight to the weight of the global model
        local_model.set_weights(global_weights)
        
        #fit local model with client's data
        local_model.fit(clients_batched[client], epochs=1, verbose=0)
        
        #scale the model weights and add to list
        scaling_factor = weight_scalling_factor(clients_batched, client)
        scaled_weights = scale_model_weights(local_model.get_weights(), scaling_factor)
        scaled_local_weight_list.append(scaled_weights)
        
        #clear session to free memory after each communication round
        K.clear_session()
        
    #to get the average over all the local model, we simply take the sum of the scaled weights
    average_weights = sum_scaled_weights(scaled_local_weight_list)
    
    #update global model 
    global_model.set_weights(average_weights)

    #test global model and print out metrics after each communications round
    for(X_test, Y_test) in test_batched:
        global_acc, global_loss = test_model(X_test, Y_test, global_model, comm_round)
        SGD_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(len(y_train)).batch(250)
        smlp_SGD = DNN()
        SGD_model = smlp_SGD.build(20, 2) 

        SGD_model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

# fit the SGD training data to model
_ = SGD_model.fit(SGD_dataset, epochs=100, verbose=0)

#test the SGD global model and print out metrics
for(X_test, Y_test) in test_batched:
        SGD_acc, SGD_loss = test_model(X_test, Y_test, SGD_model, 1)

  super().__init__(name, **kwargs)


comm_round: 0 | global_acc: 60.302% | global_loss: 0.6735559701919556
comm_round: 1 | global_acc: 65.651% | global_loss: 0.6621425151824951
comm_round: 2 | global_acc: 68.195% | global_loss: 0.6533271074295044
comm_round: 3 | global_acc: 69.672% | global_loss: 0.6505371928215027
comm_round: 4 | global_acc: 70.890% | global_loss: 0.6401727199554443
comm_round: 5 | global_acc: 72.624% | global_loss: 0.6345158815383911
comm_round: 6 | global_acc: 74.348% | global_loss: 0.6318967342376709
comm_round: 7 | global_acc: 72.467% | global_loss: 0.6241269707679749
comm_round: 8 | global_acc: 74.635% | global_loss: 0.6334661841392517
comm_round: 9 | global_acc: 73.835% | global_loss: 0.6266323328018188
comm_round: 10 | global_acc: 75.275% | global_loss: 0.613282322883606
comm_round: 11 | global_acc: 73.826% | global_loss: 0.6163196563720703
comm_round: 12 | global_acc: 75.204% | global_loss: 0.618182897567749
comm_round: 13 | global_acc: 75.180% | global_loss: 0.6136580109596252
comm_round: 14 | g

comm_round: 63 | global_acc: 75.553% | global_loss: 0.6076052784919739
comm_round: 64 | global_acc: 75.636% | global_loss: 0.6061744093894958
comm_round: 65 | global_acc: 75.649% | global_loss: 0.608026921749115
comm_round: 66 | global_acc: 75.442% | global_loss: 0.6106443405151367
comm_round: 67 | global_acc: 75.469% | global_loss: 0.6090624928474426
comm_round: 68 | global_acc: 75.525% | global_loss: 0.6083202362060547
comm_round: 69 | global_acc: 75.633% | global_loss: 0.6086216568946838
comm_round: 70 | global_acc: 75.643% | global_loss: 0.6074455380439758
comm_round: 71 | global_acc: 75.462% | global_loss: 0.6077463030815125
comm_round: 72 | global_acc: 75.647% | global_loss: 0.6073095798492432
comm_round: 73 | global_acc: 75.654% | global_loss: 0.6076278686523438
comm_round: 74 | global_acc: 75.532% | global_loss: 0.6096605658531189
comm_round: 75 | global_acc: 75.689% | global_loss: 0.6090200543403625
comm_round: 76 | global_acc: 75.682% | global_loss: 0.6071950793266296
comm_ro

In [6]:
Y_predictions = np.argmax(SGD_model.predict(X_test),axis = 1)



In [7]:
cm = confusion_matrix(Y_predictions, Y_test)
cm

array([[30018,  9652],
       [ 4195, 12871]], dtype=int64)

In [8]:
print(classification_report(Y_test, Y_predictions, digits=4))

              precision    recall  f1-score   support

           0     0.7567    0.8774    0.8126     34213
           1     0.7542    0.5715    0.6502     22523

    accuracy                         0.7559     56736
   macro avg     0.7554    0.7244    0.7314     56736
weighted avg     0.7557    0.7559    0.7481     56736



# Testing on each dataset

In [9]:
Y_cle = np.argmax(SGD_model.predict(cle_X_test),axis = 1)
cm_cle = confusion_matrix(Y_cle, cle_Y_test_binary)
print(cm_cle)
print(classification_report(Y_cle, cle_Y_test_binary, digits=4))

[[7472 2406]
 [1063 3243]]
              precision    recall  f1-score   support

           0     0.8755    0.7564    0.8116      9878
           1     0.5741    0.7531    0.6515      4306

    accuracy                         0.7554     14184
   macro avg     0.7248    0.7548    0.7316     14184
weighted avg     0.7840    0.7554    0.7630     14184



In [10]:
mismatch = [i for i, (a,b) in enumerate(zip(Y_cle, cle_Y_test_binary)) if a != b]
print(mismatch)

[1, 2, 7, 14, 16, 26, 28, 40, 44, 52, 61, 63, 65, 66, 67, 69, 70, 84, 95, 104, 112, 116, 118, 119, 120, 121, 133, 134, 135, 136, 148, 149, 170, 172, 173, 174, 177, 185, 192, 195, 196, 200, 205, 206, 208, 212, 213, 215, 216, 217, 222, 224, 225, 226, 227, 231, 233, 234, 235, 238, 248, 252, 253, 259, 266, 269, 271, 275, 279, 282, 288, 290, 293, 299, 304, 307, 308, 309, 315, 324, 328, 342, 344, 349, 351, 354, 362, 371, 384, 388, 389, 390, 391, 393, 394, 406, 407, 414, 425, 427, 430, 433, 452, 453, 458, 459, 462, 467, 468, 469, 475, 476, 488, 490, 491, 504, 507, 515, 516, 518, 519, 520, 521, 522, 524, 527, 528, 533, 542, 550, 555, 558, 564, 569, 572, 576, 580, 588, 590, 592, 595, 596, 599, 605, 606, 610, 613, 617, 619, 627, 631, 636, 642, 650, 653, 665, 669, 672, 675, 696, 698, 714, 723, 727, 731, 732, 733, 734, 737, 744, 746, 751, 753, 761, 762, 770, 777, 778, 782, 790, 793, 801, 811, 814, 827, 830, 835, 836, 838, 853, 858, 861, 863, 864, 866, 876, 878, 881, 884, 887, 896, 902, 905, 911, 9

In [11]:
Y_vir = np.argmax(SGD_model.predict(vir_X_test),axis = 1)
cm_vir = confusion_matrix(Y_vir, vir_Y_test_binary)
print(cm_vir)
print(classification_report(Y_vir, vir_Y_test_binary, digits=4))

[[7591 2354]
 [1056 3183]]
              precision    recall  f1-score   support

           0     0.8779    0.7633    0.8166      9945
           1     0.5749    0.7509    0.6512      4239

    accuracy                         0.7596     14184
   macro avg     0.7264    0.7571    0.7339     14184
weighted avg     0.7873    0.7596    0.7672     14184



In [12]:
mismatch = [i for i, (a,b) in enumerate(zip(Y_vir, vir_Y_test_binary)) if a != b]
print(mismatch)

[10, 14, 19, 24, 25, 26, 30, 32, 36, 38, 53, 56, 65, 68, 73, 75, 81, 88, 93, 97, 98, 108, 114, 115, 119, 120, 124, 127, 132, 134, 140, 142, 146, 162, 163, 166, 169, 176, 187, 188, 194, 198, 199, 201, 202, 207, 208, 213, 214, 219, 231, 234, 237, 248, 252, 253, 256, 261, 262, 267, 275, 280, 284, 286, 291, 293, 299, 307, 309, 314, 323, 325, 326, 334, 348, 349, 352, 354, 356, 361, 377, 387, 394, 398, 400, 408, 409, 410, 411, 413, 414, 425, 428, 437, 439, 440, 443, 446, 448, 455, 460, 462, 469, 470, 471, 472, 474, 479, 480, 482, 484, 485, 494, 495, 505, 507, 509, 511, 518, 520, 523, 524, 531, 533, 538, 539, 546, 548, 549, 551, 557, 564, 566, 568, 571, 585, 587, 592, 604, 607, 612, 616, 620, 623, 632, 640, 646, 647, 661, 663, 666, 670, 673, 674, 675, 677, 683, 689, 696, 704, 709, 710, 716, 720, 723, 724, 729, 732, 733, 737, 740, 743, 752, 753, 760, 766, 767, 776, 782, 785, 796, 800, 801, 802, 809, 815, 823, 824, 827, 829, 834, 839, 843, 845, 852, 853, 854, 858, 863, 866, 867, 868, 878, 890, 

In [13]:
Y_hun = np.argmax(SGD_model.predict(hun_X_test),axis = 1)
cm_hun = confusion_matrix(Y_hun, hun_Y_test_binary)
print(cm_hun)
print(classification_report(Y_hun, hun_Y_test_binary, digits=4))

[[7517 2427]
 [1024 3216]]
              precision    recall  f1-score   support

           0     0.8801    0.7559    0.8133      9944
           1     0.5699    0.7585    0.6508      4240

    accuracy                         0.7567     14184
   macro avg     0.7250    0.7572    0.7321     14184
weighted avg     0.7874    0.7567    0.7647     14184



In [14]:
mismatch = [i for i, (a,b) in enumerate(zip(Y_hun, hun_Y_test_binary)) if a != b]
print(mismatch)

[2, 7, 9, 18, 20, 25, 29, 32, 48, 57, 64, 65, 67, 71, 74, 83, 96, 111, 115, 119, 122, 124, 126, 129, 133, 134, 135, 145, 152, 153, 154, 156, 161, 165, 173, 174, 178, 185, 187, 190, 194, 200, 202, 207, 208, 212, 226, 232, 238, 242, 250, 251, 253, 254, 255, 258, 262, 267, 270, 272, 280, 284, 286, 290, 296, 301, 305, 306, 309, 310, 312, 315, 318, 319, 320, 328, 329, 341, 344, 357, 358, 359, 360, 366, 368, 369, 377, 381, 383, 385, 397, 398, 401, 414, 418, 420, 421, 426, 438, 439, 442, 445, 450, 460, 466, 469, 470, 472, 478, 479, 485, 489, 490, 492, 498, 502, 504, 505, 507, 511, 514, 516, 517, 525, 539, 540, 542, 548, 553, 559, 561, 563, 565, 572, 578, 586, 590, 597, 598, 599, 606, 608, 609, 631, 633, 636, 637, 650, 652, 655, 656, 667, 669, 685, 686, 697, 699, 704, 709, 710, 711, 720, 723, 727, 729, 731, 736, 739, 742, 748, 753, 766, 771, 776, 777, 783, 787, 795, 796, 799, 806, 808, 809, 810, 827, 829, 830, 841, 843, 845, 861, 862, 863, 864, 867, 868, 871, 875, 876, 877, 887, 893, 896, 897,

In [15]:
Y_swi = np.argmax(SGD_model.predict(swi_X_test),axis = 1)
cm_swi = confusion_matrix(Y_swi, swi_Y_test_binary)
print(cm_swi)
print(classification_report(Y_swi, swi_Y_test_binary, digits=4))

[[7438 2465]
 [1052 3229]]
              precision    recall  f1-score   support

           0     0.8761    0.7511    0.8088      9903
           1     0.5671    0.7543    0.6474      4281

    accuracy                         0.7520     14184
   macro avg     0.7216    0.7527    0.7281     14184
weighted avg     0.7828    0.7520    0.7601     14184



In [16]:
mismatch = [i for i, (a,b) in enumerate(zip(Y_swi, swi_Y_test_binary)) if a != b]
print(mismatch)

[7, 10, 12, 20, 25, 26, 28, 38, 46, 47, 50, 51, 69, 75, 78, 86, 88, 95, 97, 100, 112, 113, 115, 116, 120, 127, 129, 150, 151, 173, 174, 177, 188, 191, 193, 197, 200, 201, 202, 205, 218, 222, 229, 232, 251, 253, 254, 258, 261, 264, 265, 268, 269, 271, 272, 273, 277, 282, 289, 294, 295, 299, 300, 303, 305, 308, 314, 315, 329, 331, 333, 350, 351, 352, 354, 360, 361, 362, 369, 370, 371, 374, 378, 379, 389, 392, 393, 395, 400, 403, 404, 410, 416, 417, 432, 437, 441, 442, 447, 448, 449, 461, 463, 477, 478, 480, 482, 487, 490, 498, 499, 500, 503, 504, 505, 509, 512, 513, 514, 520, 521, 526, 527, 529, 547, 549, 559, 560, 562, 578, 589, 590, 591, 592, 594, 597, 603, 611, 613, 619, 622, 625, 626, 631, 638, 642, 647, 648, 653, 658, 662, 671, 672, 674, 676, 680, 681, 686, 687, 688, 695, 697, 705, 713, 718, 721, 722, 723, 724, 726, 728, 730, 732, 733, 736, 740, 746, 748, 750, 751, 753, 755, 761, 767, 769, 775, 778, 779, 780, 783, 787, 814, 816, 819, 822, 823, 825, 830, 832, 833, 837, 838, 841, 847,