In [11]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
import h5py
import numpy as np
import pandas as pd
import torch
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from lightning import Trainer
from lightning.pytorch.loggers import WandbLogger, TensorBoardLogger
from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import compute_class_weight
from sklearn.metrics import (
    average_precision_score,
    classification_report,
    confusion_matrix,
)
from sklearn.metrics import matthews_corrcoef as mcc
from sklearn.metrics import f1_score as f1
from sklearn.model_selection import StratifiedGroupKFold, cross_val_predict
from torch.utils.data import DataLoader, Dataset
from torch.nn.functional import cross_entropy
from tqdm import tqdm

from src.data.lightning_glyco import ImpGroupedBatchSampler
from src.models import *


In [13]:
def prepare_glyco_data(glyco_type: str):
    glyco_to_class = {
        'N': 1,
        'O': 2
        }
    glyco_class = glyco_type
    glyco_sites = pd.read_csv(f'data/glyco/{glyco_type}/{glyco_type}_train_RR.csv')
    val_glyco_sites = pd.read_csv(f'data/glyco/{glyco_type}/{glyco_type}_val_RR.csv')
    glyco_sites['split'] = 'train'
    val_glyco_sites['split'] = 'train'
    
    glyco_sites = pd.concat([glyco_sites, val_glyco_sites])
    #glyco_sites = glyco_sites[(glyco_sites['label'] == glyco_to_class[glyco_class]) | (glyco_sites['label'] == 0)]

    #glyco_sites["label"] = glyco_sites["label"].apply(lambda x: 1 if x >= 1 else 0)
    if glyco_class == 'N':
        glyco_sites = glyco_sites[glyco_sites["AA"] == 'N']
    else:
        glyco_sites = glyco_sites[(glyco_sites["AA"] == 'S') | (glyco_sites["AA"] == 'T')]
    #glyco_sites, labels = ros.fit_resample(glyco_sites, labels)
    glyco_sites.reset_index(drop=True, inplace=True)
    input_features = np.empty((len(glyco_sites), 2304))
    for idx, (pid, pos) in tqdm(enumerate(zip(glyco_sites['PID'], glyco_sites['Position']))):
        input_feature = np.empty(2304)
        with h5py.File(f'data/glyco/glyco_embeddings.h5', 'r') as p5, h5py.File(f'data/glyco/glyco_esm_embeddings.h5', 'r') as esm:
            try:
                #processed_pids = [pid.replace("-", "_").replace(".", "_") for pid in pids] 
                input_feature = np.concatenate([p5[pid.replace('-', '_').replace('.','_')][()][pos - 1], esm[pid.replace("_", "-")][()][pos - 1]])
            except:
                continue
            input_features[idx] = input_feature
    mask = np.all(input_features != 0, axis=1)
    input_features = input_features[mask]
    labels = np.array(glyco_sites['label'])[mask]
    print(np.sum(~mask))
    print(glyco_sites['label'][mask].value_counts())
    print(glyco_sites['label'].value_counts())
    input_features = input_features.astype(np.float32)
    labels = labels.astype(np.float16)
    return input_features, labels, glyco_sites[mask].reset_index(drop=True)

In [14]:
def prepare_glyco_data_3class():
    glyco_sites = pd.read_csv(f'data/glyco/combined/train_RR.csv')
    glyco_sites['split'] = 'train'
    glyco_sites.loc[(glyco_sites['label'] == 0) & (glyco_sites['AA'] != 'N'), 'label'] = 3
    glyco_sites.reset_index(drop=True, inplace=True)
    input_features = np.empty((len(glyco_sites), 2304))
    for idx, (pid, pos) in tqdm(enumerate(zip(glyco_sites['PID'], glyco_sites['Position']))):
        input_feature = np.empty(2304)
        with h5py.File(f'data/glyco/glyco_embeddings.h5', 'r') as p5, h5py.File(f'data/glyco/glyco_esm_embeddings.h5', 'r') as esm:
            try:
                #processed_pids = [pid.replace("-", "_").replace(".", "_") for pid in pids] 
                input_feature = np.concatenate([p5[pid.replace('-', '_').replace('.','_')][()][pos - 1], esm[pid.replace("_", "-")][()][pos - 1]])
            except:
                continue
            input_features[idx] = input_feature
    mask = np.all(input_features != 0, axis=1)
    input_features = input_features[mask]
    labels = np.array(glyco_sites['label'])[mask]
    print(np.sum(~mask))
    print(glyco_sites['label'][mask].value_counts())
    print(glyco_sites['label'].value_counts())
    input_features = input_features.astype(np.float32)
    labels = labels.astype(np.float16)
    return input_features, labels, glyco_sites[mask].reset_index(drop=True)

In [15]:
def prepare_data(sasa_or_bfactor: str):
    if sasa_or_bfactor == 'sasa':
        data_type = 'sasa'
    else:
        data_type = 'bfactor'
    train = pd.read_csv(f'data/e_prsa/{data_type}/train.csv')
    input_features = []
    ys = []
    pids = np.array(train['PID'].unique())
    for idx, pid in tqdm(enumerate(train['PID'].unique())):
        train_protein = train[train['PID'] == pid]
        y = train_protein['label'].values[0].astype(np.float32)
        input_feature = None
        with h5py.File(f'data/e_prsa/prott5_sasa_bfactor.h5', 'r') as p5, h5py.File(f'data/e_prsa/esm_sasa_bfactor.h5', 'r') as esm:
            try:
                #processed_pids = [pid.replace("-", "_").replace(".", "_") for pid in pids] 
                input_feature = np.concatenate([p5[pid.replace('-', '_').replace('.','_')][()], esm[pid.replace("_", "-")][()]])
            except:
                
                continue
            ys.append(y)
            input_features.append(input_feature)
    input_features = np.array(input_features, dtype=object)
    ys = np.array(ys, dtype=object)
    return input_features, ys, pids
    

In [16]:
class Glycodataset(Dataset):
    def __init__(self, X, y, pids):
        super().__init__()
        self.X = X
        self.y = y
        self.pids = pids
    def __len__(self):
        return len(self.y)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx], self.pids[idx]

class CVDataset(Dataset):
    def __init__(self, input_features, ys):
        self.input_features = input_features
        self.ys = ys

    def __len__(self):
        return len(self.ys)

    def __getitem__(self, idx):
        return self.input_features[idx], self.ys[idx]

In [17]:
def cv_train_glyco_no_oversamp(glyco_class: str, 
                   input_features, 
                   labels, 
                   glyco_sites, 
                   undersample,
                   batch_size,
                   hidden_size,
                   folds=5):
    train_idx = list(glyco_sites[glyco_sites['split'] == 'train'].index)
    pids = np.array(glyco_sites['PID'].values)
    
    train_X_o, train_y_o, train_pids_o = input_features[train_idx], labels[train_idx], pids[train_idx]
    if undersample:
        rus = RandomUnderSampler(random_state=42)
        train_idx_o, train_y_o = rus.fit_resample(np.arange(len(train_y_o)).reshape((-1, 1)), train_y_o)
        train_idx_o = train_idx_o.squeeze()
        train_X_o = train_X_o[train_idx_o]
        train_pids_o = train_pids_o[train_idx_o]
    
    cv_metrics = {}
    
    sfold = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=11442)
    for idx, (train_idx, val_idx) in enumerate(sfold.split(train_X_o, train_y_o, groups=train_pids_o)):
        train_X, train_y, train_pids = train_X_o[train_idx], train_y_o[train_idx], train_pids_o[train_idx]
        val_X, val_y, val_pids = train_X_o[val_idx], train_y_o[val_idx], train_pids_o[val_idx]

        
        train_dataset = Glycodataset(train_X, train_y, train_pids)
        val_dataset = Glycodataset(val_X, val_y, val_pids)
        train_dl = DataLoader(train_dataset, batch_sampler=ImpGroupedBatchSampler(train_pids, batch_size=batch_size))
        val_dl = DataLoader(val_dataset, 1, shuffle=False)
        glyco_model = GlycoModel(num_classes=2, 
                                 lr=0.0001,
                                 input_dim=2304, 
                                 num_hidden=hidden_size, 
                                 num_layers=2,
                                 class_weights=torch.tensor(train_y[train_y == 0].shape[0] / train_y[train_y == 1].shape[0]).to('cuda'))
        checkpoint_callback = ModelCheckpoint(monitor='val_loss', mode='min', save_top_k=1)
        tensor_b = TensorBoardLogger('tb_logs', 
                                     name=f'glyco_{glyco_class}_cv_{idx}', 
                                     default_hp_metric=False)
        #tensor_b = WandbLogger(name=f'glyco_{glyco_class}_cv_{idx}', project='protein_properties_cv')
        trainer = Trainer(max_epochs=30, 
                          enable_progress_bar=False, 
                          num_sanity_val_steps=1, 
                          logger=tensor_b,
                          callbacks=[EarlyStopping(monitor='val_loss', mode='min', patience=4), checkpoint_callback])        
        trainer.fit(glyco_model, train_dl, val_dl)
        
        y_pred_val = trainer.predict(ckpt_path='best', 
                                     dataloaders=DataLoader(val_dataset, 1, shuffle=False))
        val_loss = np.array([loss[1] for loss in y_pred_val]).mean()
        y_pred_val = np.array([pred[0] for pred in y_pred_val], dtype=np.float16)
        matt_val = mcc(val_y, y_pred_val)
        f1_val = f1(val_y, y_pred_val)
        acc_val = np.mean(val_y == y_pred_val)
        #loss_val = cross_entropy(torch.tensor(y_pred_val), torch.tensor(val_y).long())
        
        y_pred_train = trainer.predict(ckpt_path='best', 
                                       dataloaders=DataLoader(train_dataset, batch_size=1, shuffle=False))
        train_loss = np.array([loss[1] for loss in y_pred_train]).mean()
        y_pred_train = np.array([pred[0] for pred in y_pred_train], dtype=np.float16)
        matt_train = mcc(train_y, y_pred_train)
        f1_train = f1(train_y, y_pred_train)
        acc_train = np.mean(train_y == y_pred_train)
        #loss_train = cross_entropy(torch.tensor(y_pred_train), torch.tensor(train_y).long())
        
        cv_metrics[f'fold_{idx}'] = {
            'matt_train': matt_train,
            'f1_train': f1_train,
            'acc_train': acc_train,
            'loss_train': train_loss,
            'matt_val': matt_val,
            'f1_val': f1_val,
            'acc_val': acc_val,
            'loss_val': val_loss,
            'model': checkpoint_callback.best_model_path,
            'train_pred': y_pred_train,
            'val_pred': y_pred_val,
            'train_true': train_y,
            'val_true': val_y
        }
        
    return cv_metrics

In [18]:
def cv_train_glyco(glyco_class: str, 
                   input_features, 
                   labels, 
                   glyco_sites, 
                   us_ratio, 
                   os_ratio, 
                   batch_size,
                   hidden_size,
                   folds=7):
    train_idx = list(glyco_sites[glyco_sites['split'] == 'train'].index)
    pids = np.array(glyco_sites['PID'].values)
    
    train_X_o, train_y_o, train_pids_o = input_features[train_idx], labels[train_idx], pids[train_idx]
    
    
    cv_metrics = {}
    
    sfold = StratifiedGroupKFold(n_splits=folds, shuffle=True, random_state=13442)
    for idx, (train_idx, val_idx) in enumerate(sfold.split(train_X_o, train_y_o, groups=train_pids_o)):
        train_X, train_y, train_pids = train_X_o[train_idx], train_y_o[train_idx], train_pids_o[train_idx]
        val_X, val_y, val_pids = train_X_o[val_idx], train_y_o[val_idx], train_pids_o[val_idx]
        rus = RandomUnderSampler(random_state=42)
        rus_r = RandomUnderSampler(sampling_strategy={0: int(train_y[train_y == 1].shape[0] * us_ratio), 
                                                    1: train_y[train_y == 1].shape[0]}, random_state=42)
        train_idx, train_y = rus_r.fit_resample(np.arange(len(train_y)).reshape((-1, 1)), train_y)
        train_idx = train_idx.squeeze()
        train_X = train_X[train_idx]
        train_pids = train_pids[train_idx]
        
        ros = RandomOverSampler(sampling_strategy={0: train_y[train_y == 0].shape[0], 
                                                    1: int(train_y[train_y == 1].shape[0] * os_ratio)}, random_state=42)
        train_idx, train_y = ros.fit_resample(np.arange(len(train_y)).reshape((-1, 1)), train_y)
        train_idx = train_idx.squeeze()
        train_X = train_X[train_idx]
        train_pids = train_pids[train_idx]
        
        val_idx, val_y = rus.fit_resample(np.arange(len(val_y)).reshape((-1, 1)), val_y)
        val_idx = val_idx.squeeze()
        val_X = val_X[val_idx]
        val_pids = val_pids[val_idx]
        
        train_dataset = Glycodataset(train_X, train_y, train_pids)
        val_dataset = Glycodataset(val_X, val_y, val_pids)
        train_dl = DataLoader(train_dataset, batch_sampler=ImpGroupedBatchSampler(train_pids, batch_size=batch_size))
        val_dl = DataLoader(val_dataset, 1, shuffle=False)
        glyco_model = GlycoModel(num_classes=2, 
                                 lr=0.0001,
                                 input_dim=2304, 
                                 num_hidden=hidden_size, 
                                 num_layers=2,
                                 class_weights=torch.tensor(train_y[train_y == 0].shape[0] / train_y[train_y == 1].shape[0]).to('cuda'))
        checkpoint_callback = ModelCheckpoint(monitor='val_loss', mode='min', save_top_k=1)
        tensor_b = TensorBoardLogger('tb_logs', 
                                     name=f'glyco_{glyco_class}_cv_{idx}', 
                                     default_hp_metric=False)
        #tensor_b = WandbLogger(name=f'glyco_{glyco_class}_cv_{idx}', project='protein_properties_cv')
        trainer = Trainer(max_epochs=30, 
                          enable_progress_bar=False, 
                          num_sanity_val_steps=1, 
                          logger=tensor_b,
                          callbacks=[EarlyStopping(monitor='val_loss', mode='min', patience=4), checkpoint_callback])        
        trainer.fit(glyco_model, train_dl, val_dl)
        
        y_pred_val = trainer.predict(ckpt_path='best', 
                                     dataloaders=DataLoader(val_dataset, 1, shuffle=False))
        val_loss = np.array([loss[1] for loss in y_pred_val]).mean()
        y_pred_val = np.array([pred[0] for pred in y_pred_val], dtype=np.float16)
        matt_val = mcc(val_y, y_pred_val)
        f1_val = f1(val_y, y_pred_val)
        acc_val = np.mean(val_y == y_pred_val)
        #loss_val = cross_entropy(torch.tensor(y_pred_val), torch.tensor(val_y).long())
        
        y_pred_train = trainer.predict(ckpt_path='best', 
                                       dataloaders=DataLoader(train_dataset, batch_size=1, shuffle=False))
        train_loss = np.array([loss[1] for loss in y_pred_train]).mean()
        y_pred_train = np.array([pred[0] for pred in y_pred_train], dtype=np.float16)
        matt_train = mcc(train_y, y_pred_train)
        f1_train = f1(train_y, y_pred_train)
        acc_train = np.mean(train_y == y_pred_train)
        #loss_train = cross_entropy(torch.tensor(y_pred_train), torch.tensor(train_y).long())
        
        cv_metrics[f'fold_{idx}'] = {
            'matt_train': matt_train,
            'f1_train': f1_train,
            'acc_train': acc_train,
            'loss_train': train_loss,
            'matt_val': matt_val,
            'f1_val': f1_val,
            'acc_val': acc_val,
            'loss_val': val_loss,
            'model': checkpoint_callback.best_model_path,
            'train_pred': y_pred_train,
            'val_pred': y_pred_val,
            'train_true': train_y,
            'val_true': val_y
        }
        
    return cv_metrics


In [19]:
def cv_train_glyco_3class(
                   input_features, 
                   labels, 
                   glyco_sites, 
                   undersample,
                   batch_size,
                   hidden_size,
                   folds=5):
    train_idx = list(glyco_sites[glyco_sites['split'] == 'train'].index)
    pids = np.array(glyco_sites['PID'].values)
    
    train_X_o, train_y_o, train_pids_o = input_features[train_idx], labels[train_idx], pids[train_idx]
    if undersample:
        rus = RandomUnderSampler(random_state=42, sampling_strategy={0: train_y_o[train_y_o == 0].shape[0], 
                                                                    1: train_y_o[train_y_o == 1].shape[0],
                                                                    2: train_y_o[train_y_o == 2].shape[0],
                                                                    3: train_y_o[train_y_o == 2].shape[0]})
        train_idx_o, train_y_o = rus.fit_resample(np.arange(len(train_y_o)).reshape((-1, 1)), train_y_o)
        train_idx_o = train_idx_o.squeeze()
        train_X_o = train_X_o[train_idx_o]
        train_pids_o = train_pids_o[train_idx_o]
    
    cv_metrics = {}
    
    train_y_o = torch.tensor(train_y_o).long()
    sfold = StratifiedGroupKFold(n_splits=folds, shuffle=True, random_state=11442)
    for idx, (train_idx, val_idx) in enumerate(sfold.split(train_X_o, train_y_o, groups=train_pids_o)):
        train_X, train_y, train_pids = train_X_o[train_idx], train_y_o[train_idx], train_pids_o[train_idx]
        val_X, val_y, val_pids = train_X_o[val_idx], train_y_o[val_idx], train_pids_o[val_idx]
        train_y[train_y == 3] = 0
        val_y[val_y == 3] = 0
        
        train_dataset = Glycodataset(train_X, train_y, train_pids)
        val_dataset = Glycodataset(val_X, val_y, val_pids)
        train_dl = DataLoader(train_dataset, batch_sampler=ImpGroupedBatchSampler(train_pids, batch_size=batch_size))
        val_dl = DataLoader(val_dataset, 1, shuffle=False)
        class_weights = torch.tensor(compute_class_weight(class_weight='balanced', classes=np.unique(np.array(train_y)), y=np.array(train_y))).float().cuda()
        glyco_model = GlycoModel(num_classes=3, 
                                 lr=0.0001,
                                 input_dim=2304, 
                                 num_hidden=hidden_size, 
                                 num_layers=2,
                                 class_weights=class_weights
                                 )
        checkpoint_callback = ModelCheckpoint(monitor='val_loss', mode='min', save_top_k=1)
        tensor_b = TensorBoardLogger('tb_logs', 
                                     name=f'glyco_3class_cv_{idx}', 
                                     default_hp_metric=False)
        #tensor_b = WandbLogger(name=f'glyco_{glyco_class}_cv_{idx}', project='protein_properties_cv')
        trainer = Trainer(max_epochs=30, 
                          enable_progress_bar=False, 
                          num_sanity_val_steps=1, 
                          logger=tensor_b,
                          callbacks=[EarlyStopping(monitor='val_loss', mode='min', patience=4), checkpoint_callback])        
        trainer.fit(glyco_model, train_dl, val_dl)
        
        y_pred_val = trainer.predict(ckpt_path='best', 
                                     dataloaders=DataLoader(val_dataset, 1, shuffle=False))
        val_loss = np.array([loss[1] for loss in y_pred_val]).mean()
        y_pred_val = np.array([pred[0] for pred in y_pred_val], dtype=np.float16)
        matt_val = mcc(val_y, y_pred_val)
        f1_val = f1(val_y, y_pred_val, average='micro')
        acc_val = np.mean(val_y == y_pred_val)
        #loss_val = cross_entropy(torch.tensor(y_pred_val), torch.tensor(val_y).long())
        
        y_pred_train = trainer.predict(ckpt_path='best', 
                                       dataloaders=DataLoader(train_dataset, batch_size=1, shuffle=False))
        train_loss = np.array([loss[1] for loss in y_pred_train]).mean()
        y_pred_train = np.array([pred[0] for pred in y_pred_train], dtype=np.float16)
        matt_train = mcc(train_y, y_pred_train)
        f1_train = f1(train_y, y_pred_train, average='micro')
        acc_train = np.mean(train_y == y_pred_train)
        #loss_train = cross_entropy(torch.tensor(y_pred_train), torch.tensor(train_y).long())
        
        cv_metrics[f'fold_{idx}'] = {
            'matt_train': matt_train,
            'f1_train': f1_train,
            'acc_train': acc_train,
            'loss_train': train_loss,
            'matt_val': matt_val,
            'f1_val': f1_val,
            'acc_val': acc_val,
            'loss_val': val_loss,
            'model': checkpoint_callback.best_model_path,
            'train_pred': y_pred_train,
            'val_pred': y_pred_val,
            'train_true': train_y,
            'val_true': val_y
        }
        
    return cv_metrics

### N glyco

In [10]:
input_features, labels, glyco_sites  = prepare_glyco_data('N')

15292it [02:15, 112.69it/s]


670
label
0    8815
1    5807
Name: count, dtype: int64
label
0    9228
1    6064
Name: count, dtype: int64


In [20]:
cv_metrics_n_nosampling = cv_train_glyco_no_oversamp('N', input_features, labels, glyco_sites, False, 64, [84])

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type       | Params
---------------------------------------
0 | model   | Sequential | 193 K 
1 | softmax | Softmax    | 0     
2 | sigmoid | Sigmoid    | 0     
---------------------------------------
193 K     Trainable params
0         Non-trainable params
193 K     Total params
0.775     Total estimated model params size (MB)
2024-11-26 15:18:41.277102: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-26 15:18:41.285015: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-26 15:18:41.299563: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to

TypeError: 'NoneType' object is not iterable

In [None]:
cv_metrics = cv_train_glyco('N', input_features, labels, glyco_sites, 0.8, 1.5, 128, [84])

In [None]:
cv_metrics

In [None]:
# save results to disk
import pickle   
with open('glyco_N_cv_nosamp_results.pkl', 'wb') as f:
    pickle.dump(cv_metrics_n_nosampling, f)

### O glyco

In [21]:
input_features, labels, glyco_sites  = prepare_glyco_data('O')

88282it [03:51, 381.68it/s]


8636
label
0    75727
1     3919
Name: count, dtype: int64
label
0    84087
1     4195
Name: count, dtype: int64


In [None]:
cv_metrics_o_nosampling = cv_train_glyco_no_oversamp('O', input_features, labels, glyco_sites, True, 32, [46], 7)

In [None]:
cv_metrics_o = cv_train_glyco('O', input_features, labels, glyco_sites, 1.0, 1.7, 16, [46], 7)

In [20]:
import pickle
with open('glyco_O_cv_nosamp_results.pkl', 'wb') as f:
    pickle.dump(cv_metrics_o_nosampling, f)

### 3 class Glyco

In [10]:
input_features, labels, glyco_sites = prepare_glyco_data_3class()

105497it [05:57, 294.88it/s]


4396
label
3    79243
0    11284
1     6906
2     3668
Name: count, dtype: int64
label
3    82624
0    11819
1     7211
2     3843
Name: count, dtype: int64


In [13]:
cv_metric_3class = cv_train_glyco_3class(input_features, labels, glyco_sites, True, 64, [84], 5)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type       | Params
---------------------------------------
0 | model   | Sequential | 193 K 
1 | softmax | Softmax    | 0     
2 | sigmoid | Sigmoid    | 0     
---------------------------------------
193 K     Trainable params
0         Non-trainable params
193 K     Total params
0.775     Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(
`Trainer.fit` stopped: `max_epochs=30` reached.
Restoring states from the checkpoint path at tb_logs/glyco_3class_cv_0/version_18/checkpoints/epoch=29-step=9570.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at tb_logs/glyco_3class_cv_0/version_18/checkpoints/epoch=29-step=9570.ckpt
  rank_zero_warn(
Restoring states from the checkpoint path at tb_logs/glyco_3class_cv_0/version_1

In [14]:
cv_metric_3class

{'fold_0': {'matt_train': 0.7468104236460702,
  'f1_train': 0.8360543217139775,
  'acc_train': 0.0,
  'loss_train': 0.7187252880522051,
  'matt_val': 0.6185779479153861,
  'f1_val': 0.7555078962760772,
  'acc_val': 0.0,
  'loss_val': 0.7896809572131342,
  'model': 'tb_logs/glyco_3class_cv_0/version_18/checkpoints/epoch=29-step=9570.ckpt',
  'train_pred': array([0., 0., 0., ..., 0., 2., 0.], dtype=float16),
  'val_pred': array([0., 1., 0., ..., 2., 0., 0.], dtype=float16),
  'train_true': tensor([0, 0, 0,  ..., 0, 0, 0]),
  'val_true': tensor([0, 0, 0,  ..., 0, 0, 0])},
 'fold_1': {'matt_train': 0.7362472274954647,
  'f1_train': 0.825373427672956,
  'acc_train': 0.0,
  'loss_train': 0.7310795588038111,
  'matt_val': 0.6544667484302495,
  'f1_val': 0.7709702357943564,
  'acc_val': 0.0,
  'loss_val': 0.7734390697656196,
  'model': 'tb_logs/glyco_3class_cv_1/version_0/checkpoints/epoch=28-step=9222.ckpt',
  'train_pred': array([0., 0., 1., ..., 0., 2., 0.], dtype=float16),
  'val_pred': ar

In [16]:
import pickle
with open('glyco_3class_cv_results.pkl', 'wb') as f:
    pickle.dump(cv_metric_3class, f)

### Calculate CV metrics 

In [17]:
import pickle
with open('glyco_N_cv_results.pkl', 'rb') as f:
    cv_metrics_N = pickle.load(f)
with open('glyco_O_cv_results.pkl', 'rb') as f:
    cv_metrics_O = pickle.load(f)
with open('glyco_N_cv_nosamp_results.pkl', 'rb') as f:
    cv_metrics_N_nosamp = pickle.load(f)
with open('glyco_O_cv_nosamp_results.pkl', 'rb') as f:
    cv_metrics_O_nosamp = pickle.load(f)
with open('glyco_3class_cv_results.pkl', 'rb') as f:
    cv_metric_3class = pickle.load(f)

In [31]:
cv_metrics_O_nosamp

{'fold_0': {'matt_train': 0.7160192522171801,
  'f1_train': 0.8493195074530137,
  'acc_train': 0.8556348959950326,
  'loss_train': 0.5656767462914274,
  'matt_val': 0.6129905200359977,
  'f1_val': 0.7670068027210885,
  'acc_val': 0.8037249283667621,
  'loss_val': 0.5978884983882522,
  'model': 'tb_logs/glyco_O_cv_0/version_1/checkpoints/epoch=26-step=5454.ckpt',
  'train_pred': array([0., 0., 0., ..., 1., 1., 1.], dtype=float16),
  'val_pred': array([0., 0., 0., ..., 0., 1., 1.], dtype=float16),
  'train_true': array([0., 0., 0., ..., 1., 1., 1.], dtype=float16),
  'val_true': array([0., 0., 0., ..., 1., 1., 1.], dtype=float16)},
 'fold_1': {'matt_train': 0.735558639539423,
  'f1_train': 0.8591668072187553,
  'acc_train': 0.8662502002242511,
  'loss_train': 0.572299803905374,
  'matt_val': 0.593143400514179,
  'f1_val': 0.7831245880026367,
  'acc_val': 0.793730407523511,
  'loss_val': 0.5983842329545455,
  'model': 'tb_logs/glyco_O_cv_1/version_1/checkpoints/epoch=28-step=5684.ckpt',
 

In [18]:
import numpy as np 
import scipy.stats
def mean_confidence_interval(data, confidence=0.95):
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)
    return m, h

In [19]:
def calculate_cv_metrics(cv_metrics_N, title):
    # dicts to arrays
    matt_val_N = np.array([cv_metrics_N[f'fold_{i}']['matt_val'] for i in range(5)])
    f1_val_N = np.array([cv_metrics_N[f'fold_{i}']['f1_val'] for i in range(5)])
    acc_val_N = np.array([cv_metrics_N[f'fold_{i}']['acc_val'] for i in range(5)])
    loss_val_N = np.array([cv_metrics_N[f'fold_{i}']['loss_val'] for i in range(5)])
    matt_train_N = np.array([cv_metrics_N[f'fold_{i}']['matt_train'] for i in range(5)])
    f1_train_N = np.array([cv_metrics_N[f'fold_{i}']['f1_train'] for i in range(5)])
    acc_train_N = np.array([cv_metrics_N[f'fold_{i}']['acc_train'] for i in range(5)])
    loss_train_N = np.array([cv_metrics_N[f'fold_{i}']['loss_train'] for i in range(5)])
    # display results
    print(f'{title}')
    print('Validation')
    print(f'MCC: {mean_confidence_interval(matt_val_N)}')
    print(f'F1: {mean_confidence_interval(f1_val_N)}')
    print(f'Accuracy: {mean_confidence_interval(acc_val_N)}')
    print(f'Loss: {mean_confidence_interval(loss_val_N)}')
    print('Training')
    print(f'MCC: {mean_confidence_interval(matt_train_N)}')
    print(f'F1: {mean_confidence_interval(f1_train_N)}')
    print(f'Accuracy: {mean_confidence_interval(acc_train_N)}')
    print(f'Loss: {mean_confidence_interval(loss_train_N)}')


In [25]:
calculate_cv_metrics(cv_metrics_N, 'N Glycosylation with resampling')

N Glycosylation with resampling
Validation
MCC: (0.48723930199084303, 0.03325045947600155)
F1: (0.7269751001751958, 0.01795409639741133)
Accuracy: (0.7419858227182345, 0.0164121145917134)
Loss: (0.5256116059249847, 0.007316343052187019)
Training
MCC: (0.6001168182009299, 0.0164731138961925)
F1: (0.8298044726863629, 0.014753099569902648)
Accuracy: (0.7971249746143145, 0.013567001106316949)
Loss: (0.41664553938649596, 0.0015515123913180083)


In [28]:
calculate_cv_metrics(cv_metrics_N_nosamp, 'N Glycosylation without resampling')

N Glycosylation without resampling
Validation
MCC: (0.4946204404846449, 0.03053684150374917)
F1: (0.6982733902361147, 0.01780999910349806)
Accuracy: (0.7565729284402753, 0.015485499720937693)
Loss: (0.7501042101337616, 0.00783195992301314)
Training
MCC: (0.6492416766538454, 0.01087751331179394)
F1: (0.7896217750018247, 0.007966082128577554)
Accuracy: (0.8313197923153611, 0.005179743300540183)
Loss: (0.715584298354519, 0.003264926927211513)


In [26]:
calculate_cv_metrics(cv_metrics_O, 'O Glycosylation with resampling')

O Glycosylation with resampling
Validation
MCC: (0.55529108898251, 0.047172140978392514)
F1: (0.7529535652643737, 0.018501649912094843)
Accuracy: (0.7732222918133042, 0.02119787519557048)
Loss: (0.5177197247839391, 0.014986831916968405)
Training
MCC: (0.6946359157654367, 0.07330761598211553)
F1: (0.8661113654773402, 0.04267501466033582)
Accuracy: (0.8442043961398153, 0.04377269814898921)
Loss: (0.42413555231828737, 0.010886699785268199)


In [27]:
calculate_cv_metrics(cv_metrics_O_nosamp, 'O Glycosylation without resampling')

O Glycosylation without resampling
Validation
MCC: (0.5936016922897025, 0.06630466510668313)
F1: (0.7743130763169941, 0.04518225804070069)
Accuracy: (0.7928299533036512, 0.039354007240398406)
Loss: (0.5996544030130541, 0.009976605335670066)
Training
MCC: (0.7256647689250747, 0.012788211561977469)
F1: (0.8539772557837072, 0.006334160090041874)
Accuracy: (0.8610938404802905, 0.006941082078840988)
Loss: (0.5716556784436829, 0.0067526845637055615)


In [20]:
calculate_cv_metrics(cv_metric_3class, '3 class Glycosylation')

3 class Glycosylation
Validation
MCC: (0.6261037401998674, 0.02138254270338311)
F1: (0.7593156963960432, 0.008238817732687692)
Accuracy: (0.0, 0.0)
Loss: (0.785141260207179, 0.008451956430241537)
Training
MCC: (0.7409497314222058, 0.005359801578878789)
F1: (0.8286345150865279, 0.006040548758827899)
Accuracy: (0.0, 0.0)
Loss: (0.7265588308830223, 0.0064850974439327105)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# plot cv metric for all 4 cross validation runs compare O against O and N against N
# use the mean confidence interval for the error bars
def plot_cv_nosamp_vs_samp(cv_metrics, cv_metrics_nosamp):
    fig, axs = plt.subplots(2, 2, figsize=(14, 14))
    for i, (glyco_class, title) in enumerate(zip(['N', 'O'], ['N Glycosylation', 'O Glycosylation'])):
        # dicts to arrays
        matt_val = np.array([cv_metrics[f'fold_{i}']['matt_val'] for i in range(5)])
        f1_val = np.array([cv_metrics[f'fold_{i}']['f1_val'] for i in range(5)])
        acc_val = np.array([cv_metrics[f'fold_{i}']['acc_val'] for i in range(5)])
        loss_val = np.array([cv_metrics[f'fold_{i}']['loss_val'] for i in range(5)])
        matt_train = np.array([cv_metrics[f'fold_{i}']['matt_train'] for i in range(5)])
        f1_train = np.array([cv_metrics[f'fold_{i}']['f1_train'] for i in range(5)])
        acc_train = np.array([cv_metrics[f'fold_{i}']['acc_train'] for i in range(5)])
        loss_train = np.array([cv_metrics[f'fold_{i}']['loss_train'] for i in range(5)])
        # display results
        print(f'{title}')
        print('Validation')
        print(f'MCC: {mean_confidence_interval(matt_val)}')
        print(f'F1: {mean_confidence_interval(f1_val)}')
        print(f'Accuracy: {mean_confidence_interval(acc_val)}')
        print(f'Loss: {mean_confidence_interval(loss_val)}')
        print('Training')
        print(f'MCC: {mean_confidence_interval(matt_train)}')
        print(f'F1: {mean_confidence_interval(f1_train)}')
        print(f'Accuracy: {mean_confidence_interval(acc_train)}')
        print(f'Loss: {mean_confidence_interval(loss_train)}')
        # dicts to arrays
        matt_val_nosamp = np.array([cv_metrics_nosamp[f'fold_{i}']['matt_val'] for i in range(5)])
        f1_val_nosamp = np.array([cv_metrics_nosamp[f'fold_{i}']['f1_val'] for i in range(5)])
        acc_val_nosamp = np.array([cv_metrics_nosamp[f'fold_{i}']['acc_val'] for i in range(5)])
    