## Load packages

In [None]:
# Start the CodeCarbon tracker before anything else so the energy estimate covers
# the whole run; it is stopped by `tracker.stop()` in the notebook's final cell.
from codecarbon import EmissionsTracker
tracker = EmissionsTracker(project_name="federated_learning_model_emissions2")
tracker.start()

  import pynvml
[codecarbon INFO @ 01:01:13] [setup] RAM Tracking...
[codecarbon INFO @ 01:01:13] [setup] CPU Tracking...
 Windows OS detected: Please install Intel Power Gadget to measure CPU

[codecarbon INFO @ 01:01:16] CPU Model on constant consumption mode: Intel(R) Core(TM) i7-4600U CPU @ 2.10GHz
[codecarbon INFO @ 01:01:16] [setup] GPU Tracking...
[codecarbon INFO @ 01:01:16] No GPU found.
[codecarbon INFO @ 01:01:16] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: global constant
                GPU Tracking Method: Unspecified
            
[codecarbon INFO @ 01:01:16] >>> Tracker's metadata:
[codecarbon INFO @ 01:01:16]   Platform system: Windows-11-10.0.22631-SP0
[codecarbon INFO @ 01:01:16]   Python version: 3.13.5
[codecarbon INFO @ 01:01:16]   CodeCarbon version: 3.0.5
[codecarbon INFO @ 01:01:16]   Available RAM : 7.900 GB
[codecarbon INFO @ 01:01:16]   CPU count: 4 thread(s)

In [2]:
import numpy as np
import pandas as pd
import torch, random
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import os, re, gc
from glob import glob
from sklearn.model_selection import train_test_split, KFold
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from typing import Literal, Optional
from torch.utils.data import Dataset, DataLoader
import copy, logging
import joblib
import matplotlib.pyplot as plt
from modelling_utils import FeatureScaler, normalise_counts, LogisticRegression, PLSLatentTransformer
from modelling_utils import test_loop, train_loop, print_info, set_seed
from fl_utils import Client, Server, PreprocessClientData, create_dataloader, ClientDataset
import visual_utils
import time

In [3]:
# Wall-clock reference used by the final cell to report the total run time
start = time.time()
# Project helper from modelling_utils; presumably seeds the RNGs in use - TODO confirm which ones
set_seed(42)

In [4]:
# Message-only log format: INFO records appear in the cell output without level/timestamp prefixes
logging.basicConfig(level=logging.INFO, format='%(message)s')
logger = logging.getLogger(__name__)


# Use the GPU when torch can see one, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device}')

Using cpu


## Helper functions

In [5]:
def load_datasets(base_dir):
    """
    Load every client parquet file found below ``base_dir`` into one frame.

    :param base_dir: Base directory where client data are stored. Files are
        matched with the glob pattern ``<base_dir>/**/*.parquet``.
    :returns: pd.DataFrame. All datasets concatenated column-wise
        (in kmers x sampleIDs).
    :raises FileNotFoundError: if no parquet file matches the pattern.
    :raises Exception: any error raised while reading or concatenating the
        files is logged and then re-raised unchanged.
    """
    logger.info(f'Loading Dataset from {base_dir} directory')

    # NOTE: glob() is called without recursive=True, so '**' behaves like '*':
    # only files exactly one sub-directory below base_dir are matched.
    files = glob(f'{base_dir}/**/*.parquet')
    if not files:
        # Fail with an actionable message instead of pandas'
        # "No objects to concatenate".
        raise FileNotFoundError(f'No .parquet files found under {base_dir}')

    try:
        all_data = pd.concat([pd.read_parquet(file) for file in files], axis=1)
    except Exception as err:
        # Log for the notebook reader, then propagate: continuing with a
        # partially loaded dataset would silently drop samples.
        logger.exception(f'Error loading datasets.\n{err}')
        raise

    logger.info('Datasets loaded')
    return all_data


In [6]:
class GlobalModel:
    """
    Inference wrapper around a trained federated server.

    ``model_fn`` must expose ``global_scaler`` and ``global_model`` attributes
    (the notebook passes the ``Server`` object returned by ``server_fn``).
    The wrapper offers ``fit`` / ``transform`` / ``predict_proba`` / ``predict``
    so it can be handed to the evaluation helpers in ``visual_utils``.
    """

    def __init__(self, model_fn):
        self.model_fn = model_fn
        self.model = None   # set by fit(): the global model moved to `device`
        self.scaler = None  # set by fit() when the server provides a scaler

    def fit(self, X):
        """
        Bind the server's scaler and global model to this wrapper.

        :param X: Feature matrix the scaler (if any) is fitted on.
        :returns: self
        """
        if self.model_fn.global_scaler:
            self.scaler = self.model_fn.global_scaler
        # Only fit when a scaler exists; previously a server without a scaler
        # crashed here with AttributeError on None, although transform()
        # already treats the scaler as optional.
        # NOTE(review): this (re)fits the scaler on whatever X is passed, and
        # predict_proba() calls fit() on its own input the first time - confirm
        # that fitting on prediction data is intended.
        if self.scaler:
            self.scaler.fit(X)
        self.model = self.model_fn.global_model.to(device)
        return self

    def transform(self, X):
        """
        Scale ``X`` (when a scaler is set) and wrap it in a data loader.

        :param X: Feature matrix.
        :returns: result of ``create_dataloader`` with batch size 128, unshuffled
        """
        if self.scaler:
            X = self.scaler.transform(X)
        # shuffle=False keeps predictions aligned with the row order of X
        return create_dataloader(X, batch_size=128, shuffle=False)

    def predict_proba(self, X):
        """
        Return softmax class probabilities for every row of ``X``.

        :param X: Feature matrix.
        :returns: numpy array of probabilities (softmax over dim 1 of the model output)
        """
        if self.model is None:
            # Lazy initialisation on first use
            self.fit(X)
        X_loader = self.transform(X)

        self.model.eval()
        probs = []
        with torch.no_grad():
            # The loader yields pairs; the second element is not needed here
            for inputs, _ in X_loader:
                outputs = self.model(inputs.to(device))
                probs.append(F.softmax(outputs, dim=1))
        return torch.cat(probs).cpu().numpy()

    def predict(self, X):
        """Return the index of the most probable class for every row of ``X``."""
        return self.predict_proba(X).argmax(axis=1)


### Loading datasets

In [None]:
train_labels = pd.read_csv('../data/Train.csv')

# Sample IDs are the file names without the '.mgb' extension.
# regex=False makes the literal match explicit: pandas < 2.0 defaulted to a
# regex, in which '.' matches any character.
sample_ids = train_labels['filename'].str.replace('.mgb', '', regex=False).str.strip()

# Subject IDs ordered by how many samples each subject contributes (most first)
subjects = train_labels.groupby('SubjectID').size().sort_values(ascending=False).index
# sample ID -> subject ID lookup
subject_ids = dict(zip(sample_ids, train_labels['SubjectID']))

# Per-sample metadata indexed by sample ID. Built as a new frame rather than
# assigning into a column slice via .loc, which triggers SettingWithCopyWarning.
subject_info = (
    train_labels[['SampleType', 'SubjectID']]
    .assign(ID=sample_ids)
    .rename(columns={'SampleType': 'label', 'SubjectID': 'subject_id'})
    .set_index('ID')
)

__Loading client data__

In [None]:
# Load all client parquet files into one frame (kmers x sample IDs, per load_datasets)
client_data = load_datasets('../data/fl_data/')
print()
# NOTE: bare expression in the middle of a cell - it is evaluated but not displayed
client_data.shape


# transpose to ID vs kmer
client_data = client_data.T

Loading Dataset from data/fl_data/ directory
[codecarbon INFO @ 01:01:35] Energy consumed for RAM : 0.000042 kWh. RAM Power : 10.0 W
[codecarbon INFO @ 01:01:35] Delta energy consumed for CPU with constant : 0.000126 kWh, power : 30.0 W
[codecarbon INFO @ 01:01:35] Energy consumed for All CPU : 0.000126 kWh
[codecarbon INFO @ 01:01:35] 0.000167 kWh of electricity used since the beginning.
Datasets loaded





__Load client datasets and extract their embeddings__

In [9]:
# After the transpose above, the columns of client_data are the k-mer features
kmers = client_data.columns
# Display the first three k-mer labels as a sanity check
kmers[:3]

Index(['AAAAAAAA', 'AAAAAAAC', 'AAAAAAAG'], dtype='object', name='kmer')

In [None]:
# PLS artefacts (scaler, selected k-mers, fitted PLS model) - loaded here, not trained.
# NOTE: joblib.load unpickles the file, so only load artefacts from a trusted source.
pls_scaler = joblib.load('../data/models/pls_scaler.pkl')
selected_features = pd.read_csv(glob('../data/select*_canon*_kmers.csv')[0], index_col=0).index
selected_features_idx = kmers.get_indexer(selected_features).tolist()

# Index.get_indexer marks labels it cannot find with -1. Used as a positional
# index, -1 would silently pick the LAST column, so fail loudly instead.
_missing_features = [
    feature
    for feature, position in zip(selected_features, selected_features_idx)
    if position == -1
]
if _missing_features:
    raise KeyError(
        f'{len(_missing_features)} selected k-mers are missing from client_data, '
        f'e.g. {_missing_features[:5]}'
    )

pls_model = joblib.load(glob('../data/models/canon*pls_model.pkl')[0])

## Data Preprocessing

Prepare client data by:

- Splitting data into 4 clients based on subject ID, where a subject is assigned to only one client to maintain privacy

- Extracting and scaling PLS embeddings for modelling (using the PLS scaler and model loaded above)

In [11]:
# Prepare the client data: split the samples across 4 clients.
# The embeddings are extracted from these splits in a later cell.
prep = PreprocessClientData(pls_scaler, client_data, subject_info)
# test_fraction=None is passed through to the project helper; presumably no
# per-client hold-out split is created - TODO confirm against fl_utils.
client_splits = prep.split_data(label_id='label', unique_id='subject_id', 
                                num_clients=4, random_state=42, test_fraction=None)

Splitting data into 4 clients...
Normalising and scaling kmer counts...
[codecarbon INFO @ 01:01:50] Energy consumed for RAM : 0.000084 kWh. RAM Power : 10.0 W
[codecarbon INFO @ 01:01:50] Delta energy consumed for CPU with constant : 0.000129 kWh, power : 30.0 W
[codecarbon INFO @ 01:01:50] Energy consumed for All CPU : 0.000254 kWh
[codecarbon INFO @ 01:01:50] 0.000338 kWh of electricity used since the beginning.


In [12]:
def get_embeddings(pls_model, X, cols=None):
    """
    Transform ``X`` with ``pls_model``, optionally on a subset of columns.

    :param pls_model: Fitted model exposing ``transform``.
    :param X: numpy array or pandas DataFrame whose columns are the features.
        Any type is passed straight through when ``cols`` is None.
    :param cols: Optional column selection. Integer positions (Python or
        NumPy integers) for arrays and DataFrames, or column labels (strings)
        for DataFrames. ``None`` uses all columns.
    :returns: Whatever ``pls_model.transform`` returns for the selected data.
    :raises ValueError: if ``cols`` mixes types or does not suit the type of ``X``.
    :raises TypeError: if ``cols`` is given and ``X`` is neither an array nor a DataFrame.
    """
    if cols is None:
        return pls_model.transform(X)

    if not isinstance(X, (np.ndarray, pd.DataFrame)):
        raise TypeError("X must be a numpy array or pandas DataFrame.")

    cols = list(cols)
    # Accept NumPy integers too (e.g. the raw output of Index.get_indexer);
    # they are not instances of the built-in int and used to be rejected.
    all_ints = all(isinstance(col, (int, np.integer)) for col in cols)

    if isinstance(X, np.ndarray):
        if not all_ints:
            raise ValueError("For numpy arrays, 'cols' must be a list of integers.")
        return pls_model.transform(X[:, cols])

    # X is a DataFrame: positional selection for integers, label-based for strings
    if all_ints:
        return pls_model.transform(X.iloc[:, cols])
    if all(isinstance(col, str) for col in cols):
        return pls_model.transform(X.loc[:, cols])
    raise ValueError("For DataFrames, 'cols' must be all integers or all strings.")

In [13]:
def extract_client_embeddings(client_data_splits, cols=None):
    """
    Extract PLS features for every client.

    Uses the module-level ``pls_model`` through ``get_embeddings``. The input
    is not modified: each client gets a new list in the returned mapping, so
    re-running the cell does not try to embed already-embedded data.

    :param client_data_splits: Dictionary. Client ID key with their respective
        datasets as a sequence whose first two entries are the train and
        validation/test feature matrices (the second may be None).
    :param cols: Columns used for dimensionality reduction if not all features were used
    :returns: Dict[list] of client embeddings for train and validation sets;
        entries after the first two are carried over unchanged.
    """
    logger.info('Extracting client data embeddings....')
    # Shallow copy keeps the container type. The per-client lists are rebuilt
    # below, so the caller's lists (and the raw feature matrices they hold)
    # are left intact.
    client_embeddings = copy.copy(client_data_splits)

    # Iterate over the argument; the previous version read the notebook
    # global `client_splits` and ignored its own parameter.
    for cid, val in client_data_splits.items():
        embedded = list(val)
        if val[1] is not None:
            embedded[:2] = [get_embeddings(pls_model, x, cols) for x in val[:2]]
        else:
            # No validation/test split for this client: embed the training set only
            embedded[0] = get_embeddings(pls_model, val[0], cols)
        client_embeddings[cid] = embedded
    return client_embeddings

In [14]:
# Embed each client's split using only the selected k-mer columns (positional indices)
client_embeddings = extract_client_embeddings(client_splits, selected_features_idx)

Extracting client data embeddings....


In [15]:
# get input shape: number of embedding columns in client '0's first (training) matrix
input_dim = client_embeddings['0'][0].shape[1]
# Hard-coded number of target classes; matches the four labels
# (Mouth, Nasal, Skin, Stool) printed with the label mapping further down
num_classes = 4
input_dim

15

## Modelling

- Provide configuration settings for both the clients and the server (where the global model's weights are updated via federated averaging)

__Configuration settings for both client and server__

In [16]:
# Configuration settings for the clients and the server (global model)
class ClientConfig:
    """Settings passed to every ``Client`` via ``ClientConfig.config``."""

    config = dict(
        local_epochs=5,                 # number of local training rounds
        loss_fn=nn.CrossEntropyLoss(),
        optimiser=optim.SGD,            # alternative: optim.AdamW
        lr=1e-1,
        weight_decay=0,
        random_state=42,
        n_epoch_print=250,
        verbose=False,
        validation_fraction=None,       # fraction for validation
        batch_size=128,
        shuffle=True,                   # shuffle during training
    )

class StrategyConfig:
    """Server-side settings, read through ``StrategyConfig.config``."""

    config = dict(
        fit_fraction=1.,        # train using 100% of the clients
        num_rounds=200,         # number of training rounds to update global model's weights
        fraction_evaluate=1.,   # evaluate on test data of all clients
        early_stopping=True,
        patience=50,
        verbose=True,
        eval_metric='loss',
        random_state=42,
        n_epoch_print=20,
    )

In [17]:
# Grab the client and server settings.
# NOTE: these are references to the class-level dicts, not copies - mutating
# one of them changes the settings for everything that shares it.
client_config = ClientConfig.config
strategy_config = StrategyConfig.config

__Instantiating global model__

- Assigning clients with global model parameters and their respective client configurations

In [18]:
# instantiate global model and clients
# (LogisticRegression here is the project class imported from modelling_utils)
global_model = LogisticRegression(input_dim=input_dim, num_classes=num_classes)

# instantiate clients models: one Client per entry of client_embeddings; every
# client receives the same global_model object and the same config dict
clients = [Client(global_model, client_embeddings.get(key), config=client_config) for key in client_embeddings.keys()]

In [19]:
def server_fn(global_model, clients_models, strategy_config):
    """
    Build the federated server and run its training rounds.

    :param global_model: Global model handed to ``Server``
    :param clients_models: Client models handed to ``Server``
    :param strategy_config: Configuration settings for the server
    :returns: The ``Server`` object after ``train_rounds`` has been called
    """
    fl_server = Server(global_model, clients_models, strategy_config)
    # The server checks its own stored configuration; the returned mapping
    # supplies the keyword arguments for the training rounds.
    checked_kwargs = fl_server.check_server_config(fl_server.strategy_config)
    fl_server.train_rounds(**checked_kwargs)
    return fl_server

In [20]:
# Run federated training; keep the returned server for building the prediction wrapper below
logger.info('Performing Federated Learning\n===============================')
fc_server = server_fn(global_model, clients, strategy_config)

Performing Federated Learning
Total Clients: 4
4 clients selected for training and 0 for evaluation

Training started in server
[codecarbon INFO @ 01:02:05] Energy consumed for RAM : 0.000124 kWh. RAM Power : 10.0 W
[codecarbon INFO @ 01:02:05] Delta energy consumed for CPU with constant : 0.000121 kWh, power : 30.0 W
[codecarbon INFO @ 01:02:05] Energy consumed for All CPU : 0.000375 kWh
[codecarbon INFO @ 01:02:05] 0.000499 kWh of electricity used since the beginning.
Round 1 Train: Loss: 0.31055197638014087, Accuracy: 0.9927611168562565
Round 20 Train: Loss: 0.018199650206319664, Accuracy: 0.9986211651154774
[codecarbon INFO @ 01:02:20] Energy consumed for RAM : 0.000165 kWh. RAM Power : 10.0 W
[codecarbon INFO @ 01:02:20] Delta energy consumed for CPU with constant : 0.000125 kWh, power : 30.0 W
[codecarbon INFO @ 01:02:20] Energy consumed for All CPU : 0.000500 kWh
[codecarbon INFO @ 01:02:20] 0.000666 kWh of electricity used since the beginning.
Round 40 Train: Loss: 0.0128993542

In [21]:
# define global model: wrap the trained server in the predict/predict_proba interface
model = GlobalModel(fc_server) 

# get label IDs and names (label name -> integer id mapping, taken from the first client)
lab_ids = clients[0].label_ids
lab_names = list(lab_ids.keys())

print(lab_ids, lab_names)

{'Mouth': 0, 'Nasal': 1, 'Skin': 2, 'Stool': 3} ['Mouth', 'Nasal', 'Skin', 'Stool']


In [None]:
# Visualising global model's performance on the test data from all clients
if clients[0].test_data is not None:
    # Pool the test split of every client into a single evaluation set
    all_test = [(client.test_data, client.test_labels) for client in clients]
    X = np.vstack([features for features, _ in all_test])
    y = np.concatenate([labels for _, labels in all_test])
    # Encode the labels with the name -> id mapping taken from the clients
    y = [lab_ids.get(label) for label in y]

    logger.info('Classification Report')
    visual_utils.print_classification_report(model, X, y, lab_names)
    print()

    logger.info('Classification Evaluation Metrics')
    metrics_summary = visual_utils.classification_eval_metrics(model, X, y)
    print(metrics_summary)
    print()

    logger.info('Saving Classification Performance Chart Report')
    visual_utils.classification_performance_chart_report(model, X, y, display_names=lab_names)
    # Saved right after the chart helper has run
    plt.savefig('classification_performance_chart_report2.png')
    print()

### Test predictions

In [None]:
# Load test data
logger.info('Obtaining Test Predictions for submission\n')

test_data = pd.read_parquet('../data/test_8kmer.parquet')
# NOTE: bare expression in the middle of a cell - it is evaluated but not displayed
test_data.shape
# transpose to samples x features
test_data = test_data.T

# get their sample IDs (the row index after the transpose)
test_idx = test_data.index.tolist()

Obtaining Test Predictions for submission



In [24]:
# prepare test data
def prepare_test(X, encoder_model, encoder_scaler, **kwargs):
    """
    Turn raw counts into embeddings for prediction.

    :param X: Raw count matrix, passed to ``normalise_counts``
    :param encoder_model: Model handed to ``get_embeddings``
    :param encoder_scaler: Fitted scaler applied to the normalised counts
    :param kwargs: Forwarded to ``get_embeddings`` (e.g. ``cols``)
    :returns: Embeddings returned by ``get_embeddings``
    """
    # normalise first, then scale with the encoder's scaler
    scaled_counts = encoder_scaler.transform(normalise_counts(X))
    return get_embeddings(encoder_model, scaled_counts, **kwargs)


def save_file(probs, ids, filename, cols=None, path='preds'):
    """
    Write the predicted class probabilities to ``<path>/<filename>.csv``.

    :param probs: 2-D array-like of probabilities, one row per sample and one
        column per class.
    :param ids: Sample IDs, one per row of ``probs``; written as the ``ID`` column.
    :param filename: Output file name without the ``.csv`` extension.
    :param cols: Class names used as column headers. Defaults to the
        notebook-level ``lab_names``.
    :param path: Output directory, created if missing. Defaults to ``'preds'``.
    :returns: None. The frame is printed and written to disk.
    """
    if cols is None:
        cols = lab_names
    os.makedirs(path, exist_ok=True)

    df = pd.DataFrame(probs, columns=list(cols))
    # Use the `ids` argument; the previous version ignored it and read the
    # global `test_idx`, so the IDs could not be chosen by the caller.
    df.insert(0, 'ID', list(ids))
    print(df)

    filepath = os.path.join(path, f'{filename}.csv')
    df.to_csv(filepath, index=False)

In [25]:
# get test embeddings: same scaler, PLS model and selected columns as used for the client data
logger.info('Preparing test embeddings')
test_embs = prepare_test(test_data, pls_model, pls_scaler, cols=selected_features_idx)

Preparing test embeddings
[codecarbon INFO @ 01:04:05] Energy consumed for RAM : 0.000457 kWh. RAM Power : 10.0 W
[codecarbon INFO @ 01:04:05] Delta energy consumed for CPU with constant : 0.000125 kWh, power : 30.0 W
[codecarbon INFO @ 01:04:05] Energy consumed for All CPU : 0.001376 kWh
[codecarbon INFO @ 01:04:05] 0.001833 kWh of electricity used since the beginning.


In [26]:
# test probabilities: one row per test sample, columns ordered by class id
logger.info('Predicting test probabilities')
test_probs = model.predict_proba(test_embs)

Predicting test probabilities


In [None]:
# save file (written by save_file to preds/<name>.csv)
# NOTE(review): 'learnin' in the file name looks like a typo for 'learning';
# left unchanged because downstream steps may expect this exact name.
logger.info('Saving test predictions')
save_file(test_probs, test_idx, 'federated_learnin_preds2')

Saving test predictions


             ID     Mouth     Nasal      Skin     Stool
0     ID_UOIPKJ  0.000026  0.000010  0.000032  0.999931
1     ID_XHBQPF  0.994854  0.000312  0.003335  0.001499
2     ID_KYILXT  0.996740  0.000154  0.002301  0.000805
3     ID_UFGHMX  0.000115  0.000192  0.000233  0.999460
4     ID_URMZQG  0.000020  0.999897  0.000047  0.000036
...         ...       ...       ...       ...       ...
1063  ID_FUMEPV  0.001301  0.997365  0.000367  0.000967
1064  ID_RWUAEX  0.000165  0.000521  0.000391  0.998922
1065  ID_PLZYXW  0.995092  0.004114  0.000223  0.000572
1066  ID_TJNQXM  0.000064  0.000020  0.999878  0.000038
1067  ID_DSBIZA  0.000025  0.000016  0.000042  0.999916

[1068 rows x 5 columns]


In [28]:
# total time since `start` was recorded near the top of the notebook
end = time.time()
mins = (end - start)/60
logger.info(f'\nTotal Time taken : {mins:.4f} Mins')

# Stop the CodeCarbon tracker started in the first cell. As the last expression
# of the cell, its return value is displayed (presumably the estimated
# emissions in kg CO2-eq - confirm against the CodeCarbon docs).
tracker.stop()


Total Time taken : 2.6945 Mins
[codecarbon INFO @ 01:04:14] Energy consumed for RAM : 0.000482 kWh. RAM Power : 10.0 W
[codecarbon INFO @ 01:04:14] Delta energy consumed for CPU with constant : 0.000077 kWh, power : 30.0 W
[codecarbon INFO @ 01:04:14] Energy consumed for All CPU : 0.001452 kWh
[codecarbon INFO @ 01:04:14] 0.001935 kWh of electricity used since the beginning.


0.0008061949567610684