# Centralized Learning to Federated Learning

COGONI Guillaume (p1810070)

# Import libraries

In [None]:
# Torch
import torch
from torch.utils.data import DataLoader, random_split, Subset
from torch.optim import Adam, SGD
import torch.nn as nn

# Time
import time

# Random
import random

# Tqdm
from tqdm import tqdm
from tqdm import trange

# Copy
import copy

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

# Pandas
import pandas as pd

# Matplotlib
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Importation of the dataset

In [None]:
# Load the trip data from a CSV file located in the working directory
dataset = pd.read_csv("./202207-divvy-tripdata.csv") # Replace by your dataset

In [None]:
# Work on a copy so that the freshly loaded `dataset` is never modified
data = dataset.copy() # Copy

# Visualisation

In [None]:
# Print columns, dtypes and the 3 first observation
print(data.columns)
print(data.dtypes)
data.head(3)

In [None]:
# Check if there is missing values and how many values are in the dataset
print(data.isna().sum().sort_values(ascending=False))
#print("---------------------------------------------")
#print(data.count().sort_values(ascending=True))

# Cleaning the Data

In [None]:
def convert_to_seconds(value):
    """Convert an "H:M:S" duration string into a number of seconds.

    Args:
        value: A string formatted as "Hour(s):Minute(s):Second(s)".
            Anything that is not a string (for instance the float NaN that
            pandas stores in an empty cell, or None) is treated as missing.

    Returns:
        The duration in seconds as an int, or ``pd.NaT`` when ``value`` is
        not a string or contains no digit at all.

    Raises:
        ValueError: If ``value`` contains digits but does not split on ':'
            into exactly three integer fields.
    """
    # A missing cell is not a string: iterating over it would raise a
    # TypeError, so it is reported as missing instead.
    if not isinstance(value, str) or not any(char.isdigit() for char in value):
        return pd.NaT
    hour, minute, second = map(int, value.split(':'))
    # NOTE(review): an hour field of 15 or more is reset to 0 -- presumably
    # an outlier filter for this dataset; confirm the threshold.
    if hour >= 15:
        hour = 0
    # Out-of-range minutes/seconds are zeroed and carried into the next unit.
    if minute >= 60:
        minute = 0
        hour += 1
    if second >= 60:
        second = 0
        minute += 1
    return hour*3600+minute*60+second

# Build the cleaned frame: "ride_length" is converted to seconds on a new
# frame (the raw `data` is left untouched), then every row holding a missing
# value is dropped.
ride_length_seconds = data["ride_length"].apply(convert_to_seconds)
data_prep = data.assign(ride_length=ride_length_seconds).dropna()

In [None]:
# Check if there is missing values and how many values are in the dataset
print(data_prep.isna().sum().sort_values(ascending=False))
print("---------------------------------------------")
print(data_prep.count().sort_values(ascending=False))

In [None]:
# Check if ride_length correctly change
data_prep['ride_length'].sort_values(ascending=False)

In [None]:
# Store the ride length (in seconds) as a 64-bit integer column
data_prep['ride_length'] = data_prep['ride_length'].astype('int64')
# NOTE(review): this prints the shape of the raw `data`, not of `data_prep` -- confirm this is intended
print(data.shape)
data_prep.dtypes

# Selection of the features and the target

In [None]:
# Columns used as model inputs
features = [
    'start_lat', 'start_lng',
    'end_lat', 'end_lng',
    'member_casual',
    'ride_length',
    'day_of_week',
]

# Column the model has to predict
target = ["rideable_type"]

In [None]:
# Keep only the two bike classes the model distinguishes, then separate
# the input columns from the target column.
kept_bike_types = ["classic_bike", "electric_bike"]
data_prep = data_prep.loc[data_prep["rideable_type"].isin(kept_bike_types)]
X, y = data_prep[features], data_prep[target]

In [None]:
plt.hist(y, bins=3)
plt.show()

In [None]:
print("Information on X (shape and dtype)")
print(X.shape)
print(X.dtypes)
print("\n")
print("Information on y (shape and dtype)")
print(y.shape)
print(y.dtypes)

# Preprocessing on the data 

In [None]:
# Encode the target labels as ordinal values
y_OE = OrdinalEncoder().fit_transform(y)

# Columns that are standardized
num_cols = ["start_lat", "start_lng", "end_lat", "end_lng", "day_of_week", "ride_length"]
# Categorical columns that are ordinal-encoded
categorical_cols_less_values = ["member_casual"]

# One (name, transformer, columns) step per column group
numeric_step = ('num', StandardScaler(), num_cols)
categorical_step = ('cat_less_values', OrdinalEncoder(), categorical_cols_less_values)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Fit the preprocessing on the selected features and transform them
X_prep = preprocessor.fit_transform(X)

# Creation of the dataset

In [None]:
# Convert the preprocessed features and encoded targets to float32 tensors
X_tensor = torch.tensor(X_prep, dtype=torch.float32)
y_tensor = torch.tensor(y_OE, dtype=torch.float32)

In [None]:
print(X.shape)
print(X_prep.shape)
print(X_tensor.shape)

print(y_OE.shape)
print(y_tensor.shape)

In [None]:
# Pair every feature row with its target so the data can be split and batched
dataset_tensor = torch.utils.data.TensorDataset(X_tensor, y_tensor)

In [None]:
len(dataset_tensor)

# Models

In [None]:
class MyNet(nn.Module):
    """Fully connected network: input -> 64 -> 32 -> output.

    ReLU is applied after the two hidden layers; the last layer returns raw
    (unactivated) values.

    Args:
        _Input: Number of input features.
        _Output: Number of output units.
    """

    def __init__(self, _Input, _Output):
        super().__init__()
        self.fc1 = nn.Linear(_Input, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, _Output)

    def forward(self, x):
        """Return the output of the last linear layer for the batch ``x``."""
        hidden = torch.relu(self.fc1(x))
        hidden = torch.relu(self.fc2(hidden))
        return self.fc3(hidden)

# Centralized Version

## Separation of the data (train and validation)

In [None]:
# 60 % of the samples are used for training, the remainder for validation
n_samples = len(dataset_tensor)
train_size = int(0.6 * n_samples)
val_size = n_samples - train_size
train_dataset, val_dataset = random_split(dataset_tensor, [train_size, val_size])

In [None]:
# Both loaders serve batches of 3200 samples and reshuffle on every pass
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=3200, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=3200, shuffle=True)

# Per-phase lookup tables: the loader to iterate and the number of samples it holds
dataloaders = {'train': train_loader, 'val': val_loader}
dataset_sizes= {'train': len(train_dataset), 'val': len(val_dataset)}

# Train and Test function

In [None]:
def train_and_test_nn(model, criterion, optimizer, dataloaders, batch_size, num_epochs=25):
    """Train ``model`` and validate it after every epoch, keeping the best weights.

    Each epoch runs a 'train' phase (with backward pass and optimizer step)
    followed by a 'val' phase (no gradients). The average loss per batch and
    the accuracy of both phases are printed. At the end the weights that
    reached the highest validation accuracy are loaded back into ``model``.

    Args:
        model: Network to train. Batches are moved to the device of its
            parameters.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        dataloaders: Dict with a 'train' and a 'val' iterable of
            ``(inputs, labels)`` batches.
        batch_size: Unused; kept so that existing calls keep working.
        num_epochs: Number of epochs to run. Defaults to 25.

    Returns:
        ``model`` itself, holding the best weights seen during validation
        (or its initial weights when no epoch improved on an accuracy of 0).
    """
    since = time.time()

    # Batches have to live on the same device as the model's weights.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    best_acc_avg = 0.0
    # Start from the current weights so there is always something to restore,
    # even when the validation accuracy never rises above 0.
    best_model_wts = copy.deepcopy(model.state_dict())

    for epoch in range(num_epochs):
        print("\n")
        print("_________________________Epoch %d / %d ____________________" % (epoch+1, num_epochs))
        print("\n")
        for phase in ['train', 'val']:
            is_training = phase == 'train'
            model.train(is_training)  # train mode for 'train', eval mode for 'val'

            running_loss = 0.0
            correct = 0
            nb_samples = 0
            nb_batches = 0

            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(run_device)
                labels = labels.to(run_device)

                optimizer.zero_grad()

                # Gradients are only tracked during the training phase
                with torch.set_grad_enabled(is_training):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)

                if is_training:
                    loss.backward()
                    optimizer.step()

                if outputs.dim() > 1 and outputs.shape[1] > 1:
                    # One score per class: pick the highest one
                    preds = outputs.argmax(dim=1)
                else:
                    # Single logit: a positive logit means sigmoid > 0.5,
                    # i.e. class 1 (an argmax over one column is always 0)
                    preds = (outputs > 0).long()

                running_loss += loss.item()
                correct += (preds.reshape(-1) == labels.reshape(-1)).sum().item()
                nb_samples += labels.shape[0]
                nb_batches += 1

            # Average over what was actually iterated; guard against empty loaders
            loss_avg = running_loss / nb_batches if nb_batches else 0.0
            accuracy_avg = correct / nb_samples if nb_samples else 0.0

            print('PHASE %s:  [AVG loss: %.3f || AVG Accuracy: %.4f] ' % 
                (phase, loss_avg, accuracy_avg))

            # Remember the weights of the best validation epoch
            if phase == 'val' and accuracy_avg > best_acc_avg:
                best_acc_avg = accuracy_avg
                best_model_wts = copy.deepcopy(model.state_dict())

    time_elapsed = time.time() - since
    print("\n")
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc_avg))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model


# Starting training and validation

In [None]:
# One input per preprocessed feature column, one output per target column
model = MyNet(X_tensor.shape[1], y_tensor.shape[1])
# NOTE(review): `epochs` is not used by the visible code -- confirm it is still needed
epochs = 20
model = model.to(device)

# Loss working on raw logits, optimized with Adam
criterion = nn.BCEWithLogitsLoss()
optimizer_ft = Adam(model.parameters(), lr=0.001)

# The centralized training run is currently disabled
#model_ft = train_and_test_nn(model, criterion, optimizer_ft, dataloaders, 1, num_epochs=10)

# Federated Version

In [None]:
# The global model receives the aggregated weights; `local_model` is the
# template that is deep-copied into every node.
global_model = MyNet(X_tensor.shape[1], y_tensor.shape[1])
local_model = MyNet(X_tensor.shape[1], y_tensor.shape[1])

In [None]:
def params_models_equals(model_1,model_2):
    """Print and return whether two PyTorch models hold the same parameters.

    Two models are considered equal when their state dicts have the same
    entry names, every entry has the same shape, and all values are close
    according to ``torch.allclose``. Values are compared on the CPU so the
    two models may live on different devices.

    Args:
        model_1: A PyTorch neural network.
        model_2: A PyTorch neural network.

    Returns:
        bool: True when the parameters match, False otherwise.
    """
    params_1 = model_1.state_dict()
    params_2 = model_2.state_dict()

    # Entry names and shapes are checked first: `torch.allclose` raises on
    # tensors that cannot be broadcast together instead of returning False.
    equal = params_1.keys() == params_2.keys() and all(
        params_1[key].shape == params_2[key].shape
        and torch.allclose(params_1[key].cpu(), params_2[key].cpu())
        for key in params_1
    )

    if equal:
        print("The models have the same parameters.")
    else:
        print("The models have differents parameters.")
    return equal

## Creating the nodes

In [None]:
# Draw four disjoint random subsets of 10000 samples, one per node; the
# remaining samples are not assigned to any node.
node_1_data, node_2_data, node_3_data, node_4_data, _ = torch.utils.data.random_split(dataset_tensor, [10000, 10000, 10000, 10000, len(dataset_tensor) - 40000])
#node_1_data, node_2_data, _ = torch.utils.data.random_split(dataset_tensor, [300000, 300000, len(dataset_tensor) - 600000])

### Splitting node_i_data into train_data, valid_data and test_data

In [None]:
# Every node owns an independent copy of the local model and its own data subset
nodes = {
        'node_1': {'model': copy.deepcopy(local_model), 'data': node_1_data},
        'node_2': {'model': copy.deepcopy(local_model), 'data': node_2_data},
        'node_3': {'model': copy.deepcopy(local_model), 'data': node_3_data},
        'node_4': {'model': copy.deepcopy(local_model), 'data': node_4_data}
}
# The string below is a disabled two-node variant of the same dictionary
""" nodes = {
        'node_1': {'model': copy.deepcopy(local_model), 'data': node_1_data},
        'node_2': {'model': copy.deepcopy(local_model), 'data': node_2_data},

} """

In [None]:
def train_valid_test(dataset, train_size=0.6, valid_size=0.2, test_size=0.2):
    """Randomly split ``dataset`` into train, validation and test subsets.

    Args:
        dataset: Dataset to split.
        train_size: Size of the training subset. Defaults to 0.6.
        valid_size: Size of the validation subset. Defaults to 0.2.
        test_size: Size of the test subset. Defaults to 0.2.

    Returns:
        The three subsets produced by ``torch.utils.data.random_split``, in
        the order train, validation, test.
    """
    # The sizes are handed to random_split unchanged
    sizes = [train_size, valid_size, test_size]
    subsets = torch.utils.data.random_split(dataset, sizes)
    return subsets

In [None]:
def split_data_nodes(nodes):
    """Replace every node's dataset by its train / validation / test split.

    The split is delegated to ``train_valid_test`` with the proportions
    0.6 / 0.2 / 0.2. The dictionary is modified in place.

    Args:
        nodes: Dictionary of nodes shaped as
            {
                'node_1': {'model': #, 'data': #},
                'node_2': {'model': #, 'data': #},
                ...
            }

    Returns:
        The same dictionary, where each 'data' entry has become
            {
                'train_data': #,
                'valid_data': #,
                'test_data': #
            }
    """
    for node_content in nodes.values():
        train_part, valid_part, test_part = train_valid_test(node_content['data'], 0.6, 0.2, 0.2)
        node_content['data'] = {
            'train_data': train_part,
            'valid_data': valid_part,
            'test_data': test_part,
        }
    return nodes

In [None]:
nodes = split_data_nodes(nodes)

### Copying global model parameters into local model of nodes

In [None]:
""" for node in nodes.keys():
  params_models_equals(nodes[node]['model'], global_model) """

In [None]:
def send_global_model_to_node(global_model, node):
    """Copy the parameters of the global model into a node's local model.

    Args:
        global_model: Model whose parameters are sent.
        node: Node dictionary; only its 'model' entry is used and updated.

    Returns:
        The same ``node`` dictionary, whose model now holds a copy of the
        global parameters.
    """
    # Snapshot the global weights before loading them into the local model
    global_weights = copy.deepcopy(global_model.state_dict())
    local_model = node['model']
    local_model.load_state_dict(global_weights)
    return node

In [None]:
""" # We send the main model to the nodes. 
for node in nodes.keys():
    nodes[node] = send_global_model_to_node(global_model, nodes[node]) """

In [None]:
""" for node in nodes.keys():
  params_models_equals(nodes[node]['model'], global_model) """

## Function to select the nodes that will train the global model

In [None]:
def selection_nodes(nb_nodes, nodes):
    """Randomly pick the nodes that take part in the training.

    Args:
        nb_nodes: Requested number of nodes. A value above the number of
            available nodes is lowered to that number; otherwise a value
            below 1 is raised to 1.
        nodes: Dictionary of nodes shaped as
            {
                'node_1': {'model': #, 'data': #},
                'node_2': {'model': #, 'data': #},
                ...
            }

    Returns:
        A list with the names of the nodes that will participate in the
        training, drawn without replacement.
    """
    candidates = list(nodes.keys())
    available = len(candidates)

    if nb_nodes > available:
        how_many = available
    elif nb_nodes < 1:
        how_many = 1
    else:
        how_many = nb_nodes

    return random.sample(candidates, how_many)

## Training node

In [None]:
def train_and_test_node(node, criterion, optimizer, batch_size_train, batch_size_test, num_epochs=25):
    """Train a node's local model on the node's own data and keep the best weights.

    Each epoch runs a 'train' phase on ``node['data']['train_data']``
    followed by a 'val' phase on ``node['data']['valid_data']``. The average
    loss per batch and the accuracy of both phases are printed. At the end
    the weights that reached the highest validation accuracy are loaded back
    into the node's model.

    Args:
        node: Dict with a 'model' entry (the local network) and a 'data'
            entry holding the 'train_data' and 'valid_data' datasets.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer already bound to the node model's parameters.
        batch_size_train: Batch size of the training loader.
        batch_size_test: Batch size of the validation loader.
        num_epochs (int, optional): Number of epochs. Defaults to 25.

    Returns:
        dict: ``{'model': <the node's model>, 'node_best_acc_avg': <float>}``
        where the float is the best validation accuracy (0.0 when no epoch
        improved on it).
    """
    train_loader = torch.utils.data.DataLoader(node['data']['train_data'], batch_size=batch_size_train, shuffle=True)
    val_loader = torch.utils.data.DataLoader(node['data']['valid_data'], batch_size=batch_size_test, shuffle=True)
    dataloaders = {'train': train_loader, 'val': val_loader}

    model = node['model']
    # Batches have to live on the same device as the model's weights; the
    # node models are not necessarily on the globally selected device.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    best_acc_avg = 0.0
    # Start from the current weights so there is always something to restore,
    # even when the validation accuracy never rises above 0.
    best_model_wts = copy.deepcopy(model.state_dict())

    for epoch in range(num_epochs):
        for phase in ['train', 'val']:
            is_training = phase == 'train'
            model.train(is_training)  # train mode for 'train', eval mode for 'val'

            running_loss = 0.0
            correct = 0
            nb_samples = 0
            nb_batches = 0

            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(run_device)
                labels = labels.to(run_device)

                optimizer.zero_grad()

                # Gradients are only tracked during the training phase
                with torch.set_grad_enabled(is_training):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)

                if is_training:
                    loss.backward()
                    optimizer.step()

                if outputs.dim() > 1 and outputs.shape[1] > 1:
                    # One score per class: pick the highest one
                    preds = outputs.argmax(dim=1)
                else:
                    # Single logit: a positive logit means sigmoid > 0.5,
                    # i.e. class 1 (an argmax over one column is always 0)
                    preds = (outputs > 0).long()

                running_loss += loss.item()
                correct += (preds.reshape(-1) == labels.reshape(-1)).sum().item()
                nb_samples += labels.shape[0]
                nb_batches += 1

            # Average over what was actually iterated in this phase, so the
            # validation accuracy is relative to the validation samples.
            loss_avg = running_loss / nb_batches if nb_batches else 0.0
            accuracy_avg = correct / nb_samples if nb_samples else 0.0

            print('PHASE %s:  [AVG loss: %.3f || AVG Accuracy: %.4f] '% 
                (phase, loss_avg, accuracy_avg))

            # Remember the weights of the best validation epoch
            if phase == 'val' and accuracy_avg > best_acc_avg:
                best_acc_avg = accuracy_avg
                best_model_wts = copy.deepcopy(model.state_dict())

    # load best model weights
    model.load_state_dict(best_model_wts)
    return {'model': model, 'node_best_acc_avg': best_acc_avg}


In [None]:
# Test Node

In [None]:
""" nodes_best_avg = {}

for node in nodes:
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(nodes[node]['model'].parameters(), lr=0.001)
    model_best_acc_avg = train_and_test_node(nodes[node], criterion, optimizer, 5000, 5000, num_epochs=10)
    nodes[node]['model'] = model_best_acc_avg['model']
    nodes_best_avg[node] = model_best_acc_avg['node_best_acc_avg']
print(nodes_best_avg) """

In [None]:
def send_local_model_for_agg(global_model, nodes, node_in_training_mode):
    """Load the unweighted mean of the selected nodes' parameters into ``global_model``.

    Only the entries listed by ``named_parameters()`` are averaged; any other
    state-dict entry is taken from the first selected node.

    Args:
        global_model: Model that receives the averaged parameters.
        nodes: Dictionary mapping a node name to a dict with a 'model' entry.
        node_in_training_mode: Names of the nodes to aggregate. It is not
            modified.

    Returns:
        ``global_model``, updated in place.

    Raises:
        ValueError: If ``node_in_training_mode`` is empty.
    """
    if not node_in_training_mode:
        raise ValueError("node_in_training_mode must contain at least one node name")

    # Build every state dict once instead of once per parameter and per node
    local_states = [nodes[name]['model'].state_dict() for name in node_in_training_mode]
    first_model = nodes[node_in_training_mode[0]]['model']
    param_names = [name for name, _ in first_model.named_parameters()]

    # The first node's state dict is reused as the container: assigning new
    # tensors to its keys does not touch the node's own parameters.
    averaged = local_states[0]
    for name in param_names:
        averaged[name] = sum(state[name] for state in local_states) / len(local_states)

    global_model.load_state_dict(averaged)
    return global_model

In [None]:
""" node_selectioned = selection_nodes(3, nodes)
global_model = send_local_model_for_agg(global_model, nodes, node_in_training_mode) """

In [None]:
def plan_training_global_model(nodes, global_model, nb_round, nb_epoch, nb_nodes_selectioned):
    """Run the federated training rounds and print a per-round summary.

    The participating nodes are drawn once, before the first round, and
    receive the global model. Every round then trains each participant
    locally, evaluates it, aggregates the local models into the global
    model, sends the result back and evaluates each participant again.

    Args:
        nodes: Dictionary of nodes (name -> {'model': #, 'data': #}).
        global_model: Model that receives the aggregated parameters.
        nb_round: Number of rounds.
        nb_epoch: Number of local epochs per node and per round.
        nb_nodes_selectioned: Number of nodes asked to participate.
    """
    nodes_selectioned = selection_nodes(nb_nodes_selectioned, nodes)
    best_acc_by_round = {}
    agg_report_by_round = {}

    # We send the main model to the selectioned nodes.
    for node in nodes_selectioned:
        send_global_model_to_node(global_model, nodes[node])

    for k in range(nb_round):
        round_best = best_acc_by_round[k] = {}
        round_report = agg_report_by_round[k] = {}
        print("\n")
        print("############################################################")
        print("_________________________Round %d / %d ____________________" % (k+1, nb_round))
        print("############################################################")
        print("\n")

        # Local training of every participant
        for node in nodes_selectioned:
            print(f"_________________________TRAINING PHASE of {node}____________________")
            criterion = nn.BCEWithLogitsLoss()
            optimizer = SGD(nodes[node]['model'].parameters(), lr=0.01)
            training_outcome = train_and_test_node(nodes[node], criterion, optimizer, 3200, 3200, num_epochs=nb_epoch)
            round_best[node] = training_outcome['node_best_acc_avg']

        # NOTE(review): `test` is not defined in the visible part of this
        # file -- confirm it exists before running.
        for node in nodes_selectioned:
            round_report[node] = {"before_agg": test(nodes[node])}

        send_local_model_for_agg(global_model, nodes, nodes_selectioned)

        # Push the aggregated model back and evaluate it on every participant
        for node in nodes_selectioned:
            send_global_model_to_node(global_model, nodes[node])
            round_report[node]["after_agg"] = test(nodes[node])

    # Summary, printed once all the rounds are over
    for k in range(nb_round):
        print("_____________________________________________________________________")
        print(f"_________________________Results for round {k+1} ____________________")
        print("_____________________________________________________________________")
        for node in nodes_selectioned:
            print(f'Results for {node}')
            print("Best Accuracy")
            print(best_acc_by_round[k][node])
            print("\n")
            print("Comparaison before and after aggregation")
            print(agg_report_by_round[k][node])
            print("\n")


In [None]:
plan_training_global_model(nodes,global_model, 4, 10, 4)