<a href="https://colab.research.google.com/github/ElahehJafarigol/Federated-Learning/blob/Research-paper-1/1_Federated_Base_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### **Deep Imbalanced Learning for Weather Data: A Federated Learning Approach**

In [None]:
# Mount Google Drive at /content/drive so the per-client CSV files stored
# under /content/drive/MyDrive can be read by the data-loading cell below.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install --quiet keras==2.9.0
!pip install --quiet tensorflow==2.9.2

In [None]:
!pip install --quiet imbalanced-learn
!pip install --quiet git+https://github.com/tensorflow/docs

In [None]:
import os
from  IPython import display
import pathlib
import shutil
import tempfile
import warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
warnings.filterwarnings('ignore')
from pandas.core.common import random_state
import pandas as pd
import numpy as np
import random
import itertools
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from collections import Counter


from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix

import tensorflow as tf
from tensorflow import keras
from keras import datasets, layers, models, metrics
from keras.models import Sequential
from keras.layers import Conv2D
from keras.layers import MaxPooling2D
from keras.layers import Activation
from keras.layers import Flatten
from keras.layers import Dense
from keras.optimizers import SGD
from keras import backend as K
from keras.layers import GaussianNoise
from keras import regularizers
from keras.layers import BatchNormalization
from keras.layers import LeakyReLU
from keras.layers import ReLU
from keras.layers import Dropout
from keras.layers import Input
from keras.models import Model
from keras.layers.activation import ReLU
from keras.layers.serialization import activation
from pandas.core.indexes.datetimes import Resolution
from keras.layers import Concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard

import imblearn
from imblearn.over_sampling import SMOTE

# One seed shared by every source of randomness so runs are repeatable.
# NOTE: `random_state` rebinds the name imported from pandas.core.common above.
random_state = 42
random_seed = 42

# Seed Python's `random` (used to shuffle the client order during federated
# training) and NumPy in addition to TensorFlow; seeding TensorFlow alone
# leaves the client ordering different on every run.
random.seed(random_seed)
np.random.seed(random_seed)
tf.random.set_seed(random_seed)

import tensorflow_docs.plots as tfplots

# Use matplotlib's generic 'serif' font family for all plots. This selects a
# family, not a specific typeface such as Times New Roman.
plt.rcParams['font.family'] = 'serif'

In [None]:
def load_and_process_data(client_num):
    """
    Loads and preprocesses the weather data of one client from Google Drive.

    Steps: read ``Client{client_num}.csv``, encode the two Yes/No columns as
    0/1, drop unused columns, impute missing values with the column mean,
    drop any rows that still contain NaN, and min-max scale every feature
    to the range [0, 1].

    Args:
        client_num (int): The client number; selects ``Client{client_num}.csv``.

    Returns:
        tuple[pd.DataFrame, pd.Series]: The scaled feature frame ``X`` and the
        binary ``RainTomorrow`` label series ``y``, both indexed 0..n-1.
    """
    # Load data
    file_path = f'/content/drive/MyDrive/Colab_Notebooks/Federated_Imbalanced_Learning/Stations/Federated_Learning/Client{client_num}.csv'
    df = pd.read_csv(file_path)

    # Encode 'RainToday' and 'RainTomorrow' as binary: "Yes" -> 1, anything
    # else (including missing values) -> 0
    for col in ('RainToday', 'RainTomorrow'):
        df[col] = (df[col] == "Yes").astype(int)

    # Drop columns that are not used as features
    df = df.drop(['Sunshine','Evaporation','Cloud3pm','Cloud9am','RISK_MM','Location','Date','WindGustDir',
                  'WindDir9am', 'WindDir3pm'], axis=1)

    # Replace NaN values with the respective column means
    fill_feat = ['MinTemp', 'MaxTemp', 'Rainfall', 'WindGustSpeed','WindSpeed9am', 'WindSpeed3pm',
                 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm','Temp9am', 'Temp3pm',
                 'RainToday', 'RainTomorrow']
    for col in fill_feat:
        # Assign the result back instead of calling fillna(inplace=True) on
        # df[col]: the chained in-place form acts on a temporary object and
        # is deprecated / ineffective under pandas copy-on-write.
        df[col] = df[col].fillna(df[col].mean())

    # Remove rows that still hold NaN (e.g. a column that was entirely empty)
    df = df.dropna()

    # Separate the features and labels; give the labels a fresh 0..n-1 index
    # so they line up with the re-built feature frame below
    X = df.drop('RainTomorrow', axis=1)
    y = df['RainTomorrow'].reset_index(drop=True)

    # Normalize the features. Building the DataFrame from the scaler's array
    # output already yields a fresh 0..n-1 index.
    # NOTE(review): the scaler is fitted on the client's full data before the
    # train/val/test split, so held-out statistics influence the scaling.
    scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

    return X, y

def calculate_class_ratio(y):
    """
    Prints the class counts of a binary label vector and returns the
    minority-class fraction.

    Args:
        y (array-like): Labels in which 1 is counted as the minority class
            and 0 as the majority class. May be a pandas Series, a NumPy
            array, or a plain list/tuple.

    Returns:
        float: ``minority_count / len(y)``, i.e. the share of samples
        labelled 1.

    Raises:
        ZeroDivisionError: If ``y`` is empty.
    """
    # Convert once so the comparisons are vectorized and plain Python
    # sequences are accepted too
    labels = np.asarray(y)

    # Calculate the counts for minority and majority classes
    minority_count = int(np.count_nonzero(labels == 1))
    majority_count = int(np.count_nonzero(labels == 0))

    # Fraction of all samples that belong to the minority class
    class_ratio = minority_count / len(labels)

    # Print the results
    print("Minority class:", minority_count)
    print("Majority class:", majority_count)
    print("Imbalance ratio:", class_ratio)

    return class_ratio

def split_client_data(X, y, test_size=0.2, val_size=0.1, random_state=42):
    """
    Splits data into training, validation, and test sets.

    Args:
        X (pd.DataFrame): Features DataFrame.
        y (pd.Series): Labels aligned with ``X``.
        test_size (float): Proportion of the data to include in the test set.
        val_size (float): Proportion of the data to include in the validation set.
        random_state (int): Seed passed to both ``train_test_split`` calls so
            the split is reproducible.

    Returns:
        dict: Dictionary with the keys ``X_train``, ``y_train``, ``X_val``,
        ``y_val``, ``X_test`` and ``y_test``.

    Raises:
        ValueError: If the two proportions are not each in (0, 1) or leave
            no data for training.
    """
    # Reject invalid proportions up front; otherwise test_size == 1 fails
    # with an opaque ZeroDivisionError in the adjustment below
    if not (0 < test_size < 1 and 0 < val_size < 1 and test_size + val_size < 1):
        raise ValueError(
            "test_size and val_size must each lie in (0, 1) and sum to less than 1; "
            f"got test_size={test_size}, val_size={val_size}"
        )

    # First, split into train + val and test
    X_temp, X_test, y_temp, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Next, split train + val into train and val sets. val_size is a share of
    # the full data, so rescale it to a share of what remains after the test
    # split.
    val_actual_size = val_size / (1 - test_size)
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp, test_size=val_actual_size, random_state=random_state
    )

    return {
        "X_train": X_train,
        "y_train": y_train,
        "X_val": X_val,
        "y_val": y_val,
        "X_test": X_test,
        "y_test": y_test
    }

# Load every client's data and split it right away, keeping both the raw
# per-client frames and the train/val/test splits keyed by client number
clients = range(1, 10)
all_client_data = {}
split_data = {}

for client_num in clients:
    X, y = load_and_process_data(client_num)

    # Per-client entry holding the processed features and labels
    data = {f"X_{client_num}": X, f"y_{client_num}": y}
    all_client_data[client_num] = data

    # Train/validation/test split for this client
    split_data[client_num] = split_client_data(X, y)

In [None]:
# Number of samples per batch
Batch_size = 64  # example batch size

def batch_data(data_shard, batchsize=Batch_size):
    """
    Turns a (features, labels) pair into a shuffled, batched tf.data.Dataset.

    Args:
        data_shard (tuple): Pair ``(X, y)`` of features and labels.
        batchsize (int): Number of samples per batch.

    Returns:
        tf.data.Dataset: Dataset yielding ``(features, labels)`` batches,
        shuffled with a buffer as large as the number of labels.
    """
    features, labels = data_shard

    # One dataset element per sample
    samples = tf.data.Dataset.from_tensor_slices((features, labels))

    # A buffer covering every sample shuffles over the whole shard
    buffer_size = len(labels)
    shuffled = samples.shuffle(buffer_size)

    return shuffled.batch(batchsize)

# Batch the training data for all clients and store the results in a new dictionary
# (client number -> shuffled, batched tf.data.Dataset built by batch_data).
# NOTE(review): batch_data shuffles without an explicit seed, so the order in
# which these datasets are created may influence their shuffle order - confirm
# before reordering or merging the loops below.
batched_train_data = {}

for client_num, data_splits in split_data.items():
    X_train = data_splits["X_train"]
    y_train = data_splits["y_train"]
    batched_train_data[client_num] = batch_data((X_train, y_train), Batch_size)

# Example of accessing the batched training dataset for all clients
# (after the loop, train_dataset refers to the last client's dataset, client 9)
for client_number in range(1, 10):
    train_dataset = batched_train_data[client_number]


# Batch the validation data for all clients and store the results in a new dictionary
# (batch_data always shuffles, so the validation batches are shuffled as well)
batched_validation_data = {}

for client_num, data_splits in split_data.items():
    X_val = data_splits["X_val"]
    y_val = data_splits["y_val"]
    batched_validation_data[client_num] = batch_data((X_val, y_val), Batch_size)

# Example of accessing the batched validation dataset for all clients
# (after the loop, validation_dataset refers to the last client's dataset, client 9)
for client_number in range(1, 10):
    validation_dataset = batched_validation_data[client_number]

In [None]:
# Pool the held-out test split of every client into one global test set
combined_X_test = pd.concat([splits["X_test"] for splits in split_data.values()])
combined_y_test = pd.concat([splits["y_test"] for splits in split_data.values()])

print("Test data shape:", combined_X_test.shape)
print("Test data labels shape:", combined_y_test.shape)

# Serve the whole test set as a single batch
n_test_samples = len(combined_y_test)
test_slices = tf.data.Dataset.from_tensor_slices((combined_X_test, combined_y_test))
test_batched = test_slices.batch(n_test_samples)


Test data shape: (28139, 13)
Test data labels shape: (28139,)


In [None]:
# Federated Averaging: aggregation method
def weight_scalling_factor(clients_trn_data, client_name):
    """
    Computes the Federated Averaging weight of one client.

    The weight is the client's share of all training batches. Every client is
    batched with the same batch size, so that factor cancels out of the ratio
    and no dataset has to be iterated to look it up. Because the last batch of
    a client may be partial, this is an approximation of the client's share
    of training samples.

    Args:
        clients_trn_data (dict): Maps client name -> batched tf.data.Dataset.
        client_name: Key of the client whose weight is requested.

    Returns:
        float: ``batches(client_name) / total batches over all clients``.

    Raises:
        KeyError: If ``client_name`` is not in ``clients_trn_data``.
        ValueError: If a dataset has unknown or infinite cardinality, or if
            no client holds any batch.
    """
    # Number of batches held by each client
    batch_counts = {
        name: int(tf.data.experimental.cardinality(dataset).numpy())
        for name, dataset in clients_trn_data.items()
    }

    # Negative cardinalities are TensorFlow's markers for infinite/unknown size
    if any(count < 0 for count in batch_counts.values()):
        raise ValueError("every client dataset must have a known, finite cardinality")

    # Total number of training batches across clients
    global_count = sum(batch_counts.values())
    if global_count == 0:
        raise ValueError("no client holds any training data")

    # Number of training batches held by the requested client
    local_count = batch_counts[client_name]
    return local_count / global_count


def scale_model_weights(weight, scalar):
    '''Return a new list in which every entry of `weight` is multiplied by `scalar`.'''
    return [scalar * layer_weights for layer_weights in weight]

def sum_scaled_weights(scaled_weight_list):
    """
    Sums the clients' scaled weights position by position.

    Each element of `scaled_weight_list` is one client's list of already
    scaled weight arrays; summing them position-wise gives the scaled
    (weighted) average of the weights.

    Returns:
        list: One summed tensor per weight position.
    """
    # zip(*...) groups the entries at the same position from every client
    return [
        tf.math.reduce_sum(per_client_layer, axis=0)
        for per_client_layer in zip(*scaled_weight_list)
    ]

def test_model(X_test, Y_test, model, communication_round):
    """
    Evaluates a model on the test set and prints the metrics for one round.

    Args:
        X_test: Test features, passed to ``model.predict``.
        Y_test: Binary ground-truth labels for ``X_test``.
        model: Model whose ``predict`` returns one probability per sample in
            an output of shape (n, 1).
        communication_round (int): Round index, used only in the printed line.

    Returns:
        tuple: ``(accuracy, loss, auc, g_mean)``, where ``g_mean`` is the
        geometric mean of sensitivity and specificity.
    """
    # MyModel.build ends in a sigmoid layer, so predictions are probabilities,
    # not logits; from_logits=True would apply a second sigmoid and report a
    # wrong loss.
    bce = tf.keras.losses.BinaryCrossentropy(from_logits=False)
    probs = model.predict(X_test, batch_size=Batch_size)
    probs = tf.squeeze(probs, axis=1)  # (n, 1) -> (n,)
    loss = bce(Y_test, probs)

    # Hard 0/1 predictions for the threshold-based metrics
    predicted = tf.round(probs)
    acc = accuracy_score(Y_test, predicted)  # signature is (y_true, y_pred)
    auc = roc_auc_score(Y_test, probs)
    tn, fp, fn, tp = confusion_matrix(Y_test, predicted).ravel()

    # G-mean = sqrt(sensitivity * specificity)
    g_mean = np.sqrt(tp/(tp+fn)*tn/(tn+fp))
    print('communication_round: {} | global_accuracy: {:.3%} | global_loss: {} | global_AUC: {:.3%} | global_G-mean: {:.3%}'.format(communication_round, acc, loss, auc, g_mean))
    return acc, loss, auc, g_mean

In [None]:
# Per-sample input shape and number of target classes
input_shape = (13,)
data_shape = (13,)
num_classes = 2

# Training hyperparameters
Batch_size = 64
Learning_rate = 0.001
Momentum = 0.9
Epochs = 30
stddev = 0.01  # standard deviation given to the GaussianNoise layer
communication_rounds = 10

# Lowercase aliases of the hyperparameters above
epochs, learning_rate, momentum, batch_size = Epochs, Learning_rate, Momentum, Batch_size


# Define the optimizer and loss function used to compile the local models:
# plain SGD with momentum (no Nesterov) and binary cross-entropy.
optimizer = keras.optimizers.SGD(learning_rate=learning_rate,
                                 momentum = momentum,
                                 nesterov = False)
loss = "binary_crossentropy"

# Metrics tracked during fit/evaluate.
# NOTE: this rebinds the name `metrics`, hiding the `metrics` module imported
# from keras at the top of the file.
# The list order matters: Model.evaluate() returns the loss followed by these
# metrics in exactly this order, so positional unpacking of its result
# depends on it.
metrics = [
      keras.metrics.BinaryAccuracy(name='accuracy'),
      keras.metrics.Precision(name='precision'),
      keras.metrics.Recall(name='recall'),
      keras.metrics.AUC(name='auc'),
      keras.metrics.AUC(name='prc', curve='PR'), # precision-recall curve
      keras.metrics.TruePositives(name='tp'),
      keras.metrics.FalsePositives(name='fp'),
      keras.metrics.TrueNegatives(name='tn'),
      keras.metrics.FalseNegatives(name='fn'),
      ]

class MyModel:
    """Factory for the fully connected binary classifier used by the global
    model and by every local (client) model."""

    @staticmethod
    def build(shape, classes):
        """
        Builds an uncompiled multilayer perceptron with a sigmoid output.

        Args:
            shape (tuple): Shape of one input sample, e.g. ``(13,)``.
            classes (int): Number of target classes. Not used by the
                architecture, which always ends in a single sigmoid unit for
                binary classification; kept so existing call sites stay valid.

        Returns:
            tf.keras.Sequential: The model, not yet compiled.
        """
        model = tf.keras.Sequential([
            # Take the input shape from the argument instead of hard-coding it
            layers.Dense(256, activation='relu', input_shape=shape),
            # Additive Gaussian noise; std comes from the module-level `stddev`
            layers.GaussianNoise(stddev),
            layers.Dense(128, activation='relu'),
            layers.Dense(64, activation='relu'),
            layers.Dense(32, activation='relu'),
            # Single probability output for the positive class
            layers.Dense(1, activation='sigmoid')
        ])
        return model


**Federated Training with Local Validation:**

In [None]:
# Build the global model (build is a staticmethod, so no instance is needed)
global_model = MyModel.build(shape=data_shape, classes=num_classes)
global_model.summary()

# Initialize lists to store global metrics (one entry per communication round)
global_acc_list = []
global_loss_list = []
global_auc_list = []
global_gmean_list = []

# Commence global training loop
for communication_round in range(communication_rounds):

    # Get the global model's weights to initialize local models
    global_weights = global_model.get_weights()

    # Initialize list to collect scaled local model weights
    scaled_local_weight_list = []

    # Visit the clients in random order
    client_names = list(batched_train_data.keys())
    random.shuffle(client_names)

    # Loop through each client and train local models
    for client in client_names:
        local_model = MyModel.build(shape=data_shape, classes=num_classes)

        # Give every local model its own optimizer, cloned from the configured
        # one, so optimizer state (step counter, momentum slots) is not
        # carried over from one client's model to the next
        local_optimizer = optimizer.__class__.from_config(optimizer.get_config())
        local_model.compile(loss=loss, optimizer=local_optimizer, metrics=metrics)

        # Set local model weights to global model weights
        local_model.set_weights(global_weights)

        # Fit local model with client's data
        model_history = local_model.fit(batched_train_data[client], epochs=epochs, verbose=0)

        # Validate the local model on associated validation set
        X_val = split_data[client]["X_val"]
        y_val = split_data[client]["y_val"]

        # Look metrics up by name: positionally, entries 2 and 3 of the
        # evaluate() result are precision and recall, not AUC and G-mean
        val_metrics = local_model.evaluate(X_val, y_val, verbose=0, return_dict=True)
        val_loss = val_metrics['loss']
        val_accuracy = val_metrics['accuracy']
        val_auc = val_metrics['auc']

        # G-mean = sqrt(sensitivity * specificity), from the confusion counts
        positives = val_metrics['tp'] + val_metrics['fn']
        negatives = val_metrics['tn'] + val_metrics['fp']
        sensitivity = val_metrics['tp'] / positives if positives else 0.0
        specificity = val_metrics['tn'] / negatives if negatives else 0.0
        val_gmean = np.sqrt(sensitivity * specificity)

        print(f'Local model {client} - val_loss: {val_loss} - val_accuracy: {val_accuracy} - val_auc: {val_auc} - val_gmean: {val_gmean}')

        # Scale the model weights and add to list
        scaling_factor = weight_scalling_factor(batched_train_data, client)
        scaled_weights = scale_model_weights(local_model.get_weights(), scaling_factor)
        scaled_local_weight_list.append(scaled_weights)

        # Clear session to free memory after each client's local training
        K.clear_session()

    # Sum the scaled weights, which gives the weighted average over clients
    avg_grad = sum_scaled_weights(scaled_local_weight_list)

    # Update global model weights
    global_model.set_weights(avg_grad)

    # Test global model and print metrics after each communication round
    # (test_batched holds the whole test set as a single batch)
    for X_test, y_test in test_batched:
        global_acc, global_loss, global_auc, global_gmean = test_model(X_test, y_test, global_model, communication_round)
        global_acc_list.append(global_acc)
        global_loss_list.append(global_loss)
        global_auc_list.append(global_auc)
        global_gmean_list.append(global_gmean)
