In [None]:
# load libraries
from codecarbon import EmissionsTracker
tracker = EmissionsTracker(project_name="central_model_emissions1")
tracker.start()

  import pynvml
[codecarbon INFO @ 20:58:00] [setup] RAM Tracking...
[codecarbon INFO @ 20:58:00] [setup] CPU Tracking...
 Windows OS detected: Please install Intel Power Gadget to measure CPU

[codecarbon INFO @ 20:58:04] CPU Model on constant consumption mode: Intel(R) Core(TM) i7-4600U CPU @ 2.10GHz
[codecarbon INFO @ 20:58:04] [setup] GPU Tracking...
[codecarbon INFO @ 20:58:04] No GPU found.
[codecarbon INFO @ 20:58:04] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: global constant
                GPU Tracking Method: Unspecified
            
[codecarbon INFO @ 20:58:04] >>> Tracker's metadata:
[codecarbon INFO @ 20:58:04]   Platform system: Windows-11-10.0.22631-SP0
[codecarbon INFO @ 20:58:04]   Python version: 3.13.5
[codecarbon INFO @ 20:58:04]   CodeCarbon version: 3.0.5
[codecarbon INFO @ 20:58:04]   Available RAM : 7.900 GB
[codecarbon INFO @ 20:58:04]   CPU count: 4 thread(s)

In [2]:
import numpy as np
import pandas as pd
import torch, random
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


import os, re, gc
from glob import glob
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from typing import Literal, Optional, Union
import copy, logging, warnings
import joblib
from functools import partial
from modelling_utils import KmerAutoEncoder, KmerClassifier, AutoEncoderScaler, normalise_counts, load_encoder_model
from fl_utils import create_dataloader, get_embeddings, train_loop, test_loop, print_info
import visual_utils
import time

[codecarbon INFO @ 20:58:19] Energy consumed for RAM : 0.000042 kWh. RAM Power : 10.0 W
[codecarbon INFO @ 20:58:19] Delta energy consumed for CPU with constant : 0.000125 kWh, power : 30.0 W
[codecarbon INFO @ 20:58:19] Energy consumed for All CPU : 0.000125 kWh
[codecarbon INFO @ 20:58:19] 0.000167 kWh of electricity used since the beginning.


In [3]:
start = time.time()

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {device}')

Using device: cpu


In [4]:
# for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.use_deterministic_algorithms(False)
    os.environ["PYTHONHASHSEED"] = str(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        torch.backends.cuda.matmul.allow_tf32 = False
        torch.backends.cudnn.allow_tf32 = False

set_seed(42)

In [None]:
# load (files in kmer x sampleIDs)
train = pd.read_parquet('../data/train_8kmer.parquet')
test = pd.read_parquet('../data/test_8kmer.parquet')
train_labels = pd.read_csv('../data/Train.csv')

In [6]:
train_labels = train_labels.assign(ID = train_labels.filename.str.replace('.mgb', '').str.strip())

# rename and select ID and target
train_labels = train_labels.rename(columns={'SampleType': 'target'})[['ID', 'target']]

# set train labels to match with train columns arrangement
train_labels = train_labels.set_index('ID').reindex(train.columns)
train_labels.shape

(2901, 1)

In [7]:
class_map = dict(zip(
    np.sort(train_labels['target'].unique()), 
    range(train_labels['target'].nunique())
))

train_labels['class_int'] = train_labels['target'].map(class_map)

target = train_labels.class_int

## Data Preprocessing

### Normalising kmer counts

In [8]:
print('Normalising kmer counts')
train_norm = normalise_counts(train.T)
test_norm = normalise_counts(test.T)

train_norm.shape, test_norm.shape

Normalising kmer counts


[codecarbon INFO @ 20:58:34] Energy consumed for RAM : 0.000083 kWh. RAM Power : 10.0 W
[codecarbon INFO @ 20:58:34] Delta energy consumed for CPU with constant : 0.000126 kWh, power : 30.0 W
[codecarbon INFO @ 20:58:34] Energy consumed for All CPU : 0.000251 kWh
[codecarbon INFO @ 20:58:34] 0.000334 kWh of electricity used since the beginning.
[codecarbon INFO @ 20:58:49] Energy consumed for RAM : 0.000125 kWh. RAM Power : 10.0 W
[codecarbon INFO @ 20:58:49] Delta energy consumed for CPU with constant : 0.000125 kWh, power : 30.0 W
[codecarbon INFO @ 20:58:49] Energy consumed for All CPU : 0.000376 kWh
[codecarbon INFO @ 20:58:49] 0.000501 kWh of electricity used since the beginning.


((2901, 65536), (1068, 65536))

### Extracting Autoencoders

We will extract autoencoder embeddings from saved autoencoder. The trained autoencoder was developed such that the input data were standardised using standardscaler. Hence, we will first scale and extract their embeddings. After that, we will load our saved autoencoder model. This saved model depends on the loaded `KmerAutoEncoder` class. We will use the load_encoder_model function to do that by passing the file path. 

This was done as a dimensionality reduction strategy for fast compute and modelling as against the 66k kmers from ($4^8$ possible 8-kmer sequences). The about 66k kmer features were reduced to 64 embeddings.


In [9]:
encoder_scaler = AutoEncoderScaler(scale=True, scale_type='ss')
encoder_scaler.fit(train_norm)

# joblib.dump(encoder_scaler, 'data/models/encoder_scaler.pkl') # saving for use

[codecarbon INFO @ 20:59:04] Energy consumed for RAM : 0.000166 kWh. RAM Power : 10.0 W
[codecarbon INFO @ 20:59:04] Delta energy consumed for CPU with constant : 0.000124 kWh, power : 30.0 W
[codecarbon INFO @ 20:59:04] Energy consumed for All CPU : 0.000500 kWh
[codecarbon INFO @ 20:59:04] 0.000666 kWh of electricity used since the beginning.


__Scale and transform autoencoders__

In [None]:
print('Scaling and transforming train and test data\n')
scaled_train_norm = encoder_scaler.transform(train_norm)
scaled_test_norm = encoder_scaler.transform(test_norm)

# load autoencoder model
encoder_path = glob('../data/models/*latent64*.pth')[0]
encoder_path

encoder_model = load_encoder_model(encoder_path)

Scaling and transforming train and test data



In [11]:
# extract embeddings
print('Extracting embeddings for train and test data')
train_embeddings = get_embeddings(encoder_model, scaled_train_norm)
test_embeddings = get_embeddings(encoder_model, scaled_test_norm)

Extracting embeddings for train and test data


[codecarbon INFO @ 20:59:19] Energy consumed for RAM : 0.000208 kWh. RAM Power : 10.0 W
[codecarbon INFO @ 20:59:19] Delta energy consumed for CPU with constant : 0.000125 kWh, power : 30.0 W
[codecarbon INFO @ 20:59:19] Energy consumed for All CPU : 0.000626 kWh
[codecarbon INFO @ 20:59:19] 0.000833 kWh of electricity used since the beginning.
[codecarbon INFO @ 20:59:34] Energy consumed for RAM : 0.000249 kWh. RAM Power : 10.0 W
[codecarbon INFO @ 20:59:34] Delta energy consumed for CPU with constant : 0.000125 kWh, power : 30.0 W
[codecarbon INFO @ 20:59:34] Energy consumed for All CPU : 0.000751 kWh
[codecarbon INFO @ 20:59:34] 0.001000 kWh of electricity used since the beginning.


## Modelling

In [12]:
class KmerPipeline:
    def __init__(self, model_fn, criterion, optimiser_fn, preprocessor):
        super().__init__()
        self.model_fn = model_fn # callable
        self.optimiser_fn = optimiser_fn # callable
        self.model = None
        self.preprocessor = preprocessor
        self.criterion = criterion
        self.optimiser = None
        self.best_model_state = None
    
    def train_model(self, X, y, Xval=None, yval=None, epochs=50, batch_size=32, 
                    shuffle=True, early_stopping_rounds=None, print_rounds=5, 
                    seed=None, verbose=False):
        set_seed(seed)
        y = np.array(y)
        # preprocessor
        self.preprocessor.fit(X, y)
        X = self.preprocessor.transform(X) 

        # build model
        input_dim = X.shape[1]
        num_classes = len(np.unique(y))
        self.model = self.model_fn(input_dim, num_classes).to(device)
        self.optimiser = self.optimiser_fn(self.model)
        
        # convert to tensor and dataloader
        train_loader = create_dataloader(X, y, batch_size=128, shuffle=shuffle)
        
        if Xval is not None and yval is not None:
            yval = np.array(yval)
            Xval = self.preprocessor.transform(Xval)
            val_loader = create_dataloader(Xval, yval, batch_size=64, shuffle=True)
        else:
            val_loader = None
        # fit model
        best_loss = float('inf'); wait = 0; best_epoch = None; best_model_train_loss = float('inf')
        for epoch in range(1, epochs+1):
            train_loss = train_loop(self.model, self.criterion, self.optimiser, train_loader)
            if val_loader is not None:
                val_loss = test_loop(self.model, self.criterion, val_loader)['loss']
                if verbose:
                    if print_info(epoch, epochs, print_rounds):
                        print(f"Epoch {epoch}: Train Loss: {train_loss:.7f}, Val Loss: {val_loss:.7f}")
                if val_loss < best_loss:
                    best_loss = val_loss
                    best_epoch = epoch
                    best_model_train_loss = train_loss
                    self.best_model_state = copy.deepcopy(self.model.state_dict())
                    wait = 0
                else:
                    wait += 1
            else:
                if verbose:
                    if print_info(epoch, epochs, print_rounds):
                        print(f"Epoch {epoch}: Train Loss: {train_loss:.7f}")
                if train_loss < best_loss:
                    best_loss = train_loss
                    best_epoch = epoch
                    self.best_model_state = copy.deepcopy(self.model.state_dict())
                    wait = 0
                else:
                    wait += 1
            # early stopping
            if early_stopping_rounds and wait >= early_stopping_rounds:
                # print(f"\nEarly stopping triggered at epoch {epoch}. No improvement after {early_stopping_rounds} epochs.")
                break
        # print best model
        if val_loader is not None:
            print(f'Best Model: Epoch: {best_epoch}, Train Loss: {best_model_train_loss:.7f}, Val Loss: {best_loss:.7f}')
        else:
            print(f'Best Model: Epoch: {best_epoch}, Train Loss: {best_loss:.7f}')
        if self.best_model_state:
            self.model.load_state_dict(self.best_model_state)
    
    def predict_proba(self, X):
        self.model.eval()
        X = self.preprocessor.transform(X)
        test_loader = create_dataloader(X, batch_size=64, shuffle=False)
        probabilities = []
        with torch.no_grad():
            for inputs, _ in test_loader:
                outputs = self.model(inputs.to(device))
                probs = F.softmax(outputs, dim=1)
                probabilities.append(probs.cpu().numpy())
        return np.vstack(probabilities)
    
    def predict(self, X):
        return self.predict_proba(X).argmax(axis=1)

In [13]:
def save_file(probs, ids, filename):
    cols = list(class_map.keys())
    path = './'
    os.makedirs(path, exist_ok=True)
    df = pd.DataFrame()
    df['ID'] = ids
    df[cols] = probs
    print(df)
    filepath = os.path.join(path, f'{filename}.csv')
    df.to_csv(filepath, index=False)

In [14]:
preprocessor = StandardScaler() # preprocessor
rms_optimiser = lambda model, lr=0.0005: torch.optim.RMSprop(model.parameters(), lr=lr) # optimiser
criterion = nn.CrossEntropyLoss() # loss function

# instantiate model
model = partial(KmerClassifier, layer_mult=3)

In [15]:
X = train_embeddings
y = train_labels.class_int.values

In [16]:
# define model pipeline and train model
print('Modelling\n===============')
print('Training started..')
model_pipe = KmerPipeline(model, criterion, rms_optimiser, preprocessor)
model_pipe.train_model(X, y, epochs=2000, early_stopping_rounds=100, 
                       print_rounds=150, batch_size=256, seed=42, verbose=True)

print('Training Completed!\n')

Modelling
Training started..


[codecarbon INFO @ 20:59:49] Energy consumed for RAM : 0.000291 kWh. RAM Power : 10.0 W
[codecarbon INFO @ 20:59:49] Delta energy consumed for CPU with constant : 0.000125 kWh, power : 30.0 W
[codecarbon INFO @ 20:59:49] Energy consumed for All CPU : 0.000875 kWh
[codecarbon INFO @ 20:59:49] 0.001166 kWh of electricity used since the beginning.


Epoch 1: Train Loss: 0.3520152


[codecarbon INFO @ 21:00:04] Energy consumed for RAM : 0.000332 kWh. RAM Power : 10.0 W
[codecarbon INFO @ 21:00:04] Delta energy consumed for CPU with constant : 0.000125 kWh, power : 30.0 W
[codecarbon INFO @ 21:00:04] Energy consumed for All CPU : 0.001001 kWh
[codecarbon INFO @ 21:00:04] 0.001333 kWh of electricity used since the beginning.
[codecarbon INFO @ 21:00:04] 0.004619 g.CO2eq/s mean an estimation of 145.66201113854103 kg.CO2eq/year
[codecarbon INFO @ 21:00:19] Energy consumed for RAM : 0.000374 kWh. RAM Power : 10.0 W
[codecarbon INFO @ 21:00:19] Delta energy consumed for CPU with constant : 0.000125 kWh, power : 30.0 W
[codecarbon INFO @ 21:00:19] Energy consumed for All CPU : 0.001125 kWh
[codecarbon INFO @ 21:00:19] 0.001499 kWh of electricity used since the beginning.


Epoch 150: Train Loss: 0.0011327


[codecarbon INFO @ 21:00:34] Energy consumed for RAM : 0.000416 kWh. RAM Power : 10.0 W
[codecarbon INFO @ 21:00:34] Delta energy consumed for CPU with constant : 0.000125 kWh, power : 30.0 W
[codecarbon INFO @ 21:00:34] Energy consumed for All CPU : 0.001250 kWh
[codecarbon INFO @ 21:00:34] 0.001666 kWh of electricity used since the beginning.


Epoch 300: Train Loss: 0.0000335


[codecarbon INFO @ 21:00:49] Energy consumed for RAM : 0.000457 kWh. RAM Power : 10.0 W
[codecarbon INFO @ 21:00:49] Delta energy consumed for CPU with constant : 0.000125 kWh, power : 30.0 W
[codecarbon INFO @ 21:00:49] Energy consumed for All CPU : 0.001375 kWh
[codecarbon INFO @ 21:00:49] 0.001833 kWh of electricity used since the beginning.
[codecarbon INFO @ 21:01:04] Energy consumed for RAM : 0.000499 kWh. RAM Power : 10.0 W
[codecarbon INFO @ 21:01:04] Delta energy consumed for CPU with constant : 0.000125 kWh, power : 30.0 W
[codecarbon INFO @ 21:01:04] Energy consumed for All CPU : 0.001500 kWh
[codecarbon INFO @ 21:01:04] 0.001999 kWh of electricity used since the beginning.


Epoch 450: Train Loss: 0.0000834


[codecarbon INFO @ 21:01:19] Energy consumed for RAM : 0.000541 kWh. RAM Power : 10.0 W
[codecarbon INFO @ 21:01:19] Delta energy consumed for CPU with constant : 0.000125 kWh, power : 30.0 W
[codecarbon INFO @ 21:01:19] Energy consumed for All CPU : 0.001625 kWh
[codecarbon INFO @ 21:01:19] 0.002166 kWh of electricity used since the beginning.


Best Model: Epoch: 441, Train Loss: 0.0000076
Training Completed!



### Test Predictions

In [17]:
test_idx = test.columns
len(test_idx)

1068

In [None]:
# model_predictions
print('Predicting test probabilities..')
test_probs = model_pipe.predict_proba(test_embeddings)

os.makedirs('preds', exist_ok=True)
print('Saving test predictions\n')
save_file(test_probs, test_idx, 'preds/centralised_LB_score1')

tracker.stop() # stop carbon emission tracking

Predicting test probabilities..
Saving test predictions

             ID         Mouth         Nasal          Skin         Stool
0     ID_UOIPKJ  6.081013e-09  4.005314e-09  4.161660e-09  1.000000e+00
1     ID_XHBQPF  9.999944e-01  1.627582e-06  3.748464e-06  1.815283e-07
2     ID_KYILXT  9.999998e-01  2.386512e-07  4.160470e-08  2.857757e-08
3     ID_UFGHMX  1.425549e-08  1.812382e-09  1.009929e-08  1.000000e+00
4     ID_URMZQG  2.947394e-07  9.999998e-01  1.123976e-08  1.493887e-09
...         ...           ...           ...           ...           ...
1063  ID_FUMEPV  7.672558e-06  9.999915e-01  6.516010e-07  2.106426e-07
1064  ID_RWUAEX  8.611288e-08  1.936774e-06  1.670381e-08  9.999980e-01
1065  ID_PLZYXW  9.999894e-01  1.027714e-05  2.130637e-07  7.681781e-08
1066  ID_TJNQXM  4.870396e-08  8.864474e-09  9.999998e-01  1.886305e-07
1067  ID_DSBIZA  5.625661e-08  2.241479e-09  3.627793e-10  1.000000e+00

[1068 rows x 5 columns]


[codecarbon INFO @ 21:01:21] Energy consumed for RAM : 0.000546 kWh. RAM Power : 10.0 W
[codecarbon INFO @ 21:01:21] Delta energy consumed for CPU with constant : 0.000015 kWh, power : 30.0 W
[codecarbon INFO @ 21:01:21] Energy consumed for All CPU : 0.001640 kWh
[codecarbon INFO @ 21:01:21] 0.002186 kWh of electricity used since the beginning.


0.0009108297831768457

## Cross-validation

In [19]:
print('\nPerforming cross-validation\n==============================')
folds = np.zeros(len(train_norm))
skfold = StratifiedKFold(shuffle=True, random_state=42)
for i, (_, val_idx) in enumerate(skfold.split(train_norm, train_labels.class_int)):
    folds[val_idx] = i

def cross_validation(clf, optimiser, criterion, train_emb, y=target, cv_folds:int=5, verbose=False):
    lloss = []
    
    skfold = StratifiedKFold(cv_folds, shuffle=True, random_state=42)
    for i in range(len(np.unique(folds))):
        val_idx = folds == i
        xtrain, ytrain = train_emb[~val_idx], y.loc[~val_idx]
        xval, yval = train_emb[val_idx], y.loc[val_idx]
        
        model_pipe = KmerPipeline(model, criterion, rms_optimiser, preprocessor)
        model_pipe.train_model(xtrain, ytrain, xval, yval, epochs=2000, early_stopping_rounds=100, 
                               print_rounds=500, batch_size=256, seed=42, verbose=False)
        
        # evaluate
        res = visual_utils.classification_eval_metrics(model_pipe, xval, yval)
        if verbose:
            print(f'\nFold {i+1}\tLogLoss: {np.array(res.LLoss).squeeze()}')
            print('=='*30)
        lloss.append(np.array(res.LLoss).squeeze())
    avg_lloss = np.mean(lloss)
    ci95_l, ci95_h = np.quantile(lloss, [0.025, 0.975])
    print(f'\nAvg LLoss: {avg_lloss:.8f}')
    print(f'95th CI: [{ci95_l:.8f}, {ci95_h:.8f}]\n')


Performing cross-validation


In [20]:
cross_validation(KmerClassifier, rms_optimiser, criterion, train_embeddings, target, verbose=True)

Best Model: Epoch: 142, Train Loss: 0.0012119, Val Loss: 0.0054802

Fold 1	LogLoss: 0.005919387427609668
Best Model: Epoch: 151, Train Loss: 0.0029829, Val Loss: 0.0088834

Fold 2	LogLoss: 0.009789484053319778
Best Model: Epoch: 79, Train Loss: 0.0035035, Val Loss: 0.0056103

Fold 3	LogLoss: 0.006163785992986322
Best Model: Epoch: 82, Train Loss: 0.0023723, Val Loss: 0.0064833

Fold 4	LogLoss: 0.007132724746008334
Best Model: Epoch: 25, Train Loss: 0.0093308, Val Loss: 0.0195971

Fold 5	LogLoss: 0.021105233866455096

Avg LLoss: 0.01002212
95th CI: [0.00594383, 0.01997366]



In [21]:
end = time.time()
mins = (end - start)/60
print(f'Total Time taken : {mins:.4f} Mins')

Total Time taken : 6.8935 Mins
