In [20]:
import sys

# Simulate the command-line arguments
sys.argv = [
    'notebook',  # argv[0] is the script name, can be anything
    '--npy_file', '../masif_features/new_train.npy',
    '--cv_fold', '5',
    '--cv_fold_idx', '0'  # or 1, 2, 3, 4 for other folds
]

In [21]:
import torch
import numpy as np
from metrics import *
from data_prepare import testloader
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GATConv, global_max_pool as gmp, global_add_pool as gap,global_mean_pool as gep,global_sort_pool
from torch_geometric.utils import dropout_adj
from torch.optim.lr_scheduler import MultiStepLR


In [22]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [23]:
## TEST 28 APRIL ## 

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GATConv, global_max_pool as gmp, global_add_pool as gap,global_mean_pool as gep,global_sort_pool
from torch_geometric.utils import dropout_adj
from torch.optim.lr_scheduler import MultiStepLR

class GCNN_mutual_attention(nn.Module):
    def __init__(self, n_output=1, num_features_pro=1024, output_dim=128, dropout=0.2, descriptor_dim=80, transformer_dim=32, nhead=4, num_layers=1, dim_feedforward=128): ## change number of layers 
        super(GCNN_mutual_attention, self).__init__()
        print('GCNN Loaded')
        self.n_output = n_output
        self.pro1_conv1 = GCNConv(num_features_pro, num_features_pro)
        self.pro1_fc1 = nn.Linear(num_features_pro, output_dim)
        self.pro2_conv1 = GCNConv(num_features_pro, num_features_pro)
        self.pro2_fc1 = nn.Linear(num_features_pro, output_dim)
        
        self.descriptor_dim = descriptor_dim
        self.transformer_dim = transformer_dim
        self.reducer = nn.Linear(self.descriptor_dim, self.transformer_dim - 2)

        # Transformer parameters
        
        self.nhead = nhead
        self.num_layers = num_layers

        # Transformer encoders for masif descriptors
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=self.transformer_dim,
                nhead=self.nhead,
                dim_feedforward=dim_feedforward,
                dropout=dropout
            ),
            num_layers=self.num_layers
        )

        # Output processing
        self.relu = nn.LeakyReLU()
        self.dropout = nn.Dropout(dropout)
        self.sigmoid = nn.Sigmoid()
        
        # Final layers
        combined_dim = 2 * output_dim + self.transformer_dim
        self.final_fc = nn.Linear(combined_dim, self.n_output)

    def forward(self, pro1_data, pro2_data, mas1_straight, mas1_flipped, mas2_straight, mas2_flipped):
        # Process protein 1 with GNN
        pro1_x, pro1_edge_index, pro1_batch = pro1_data.x, pro1_data.edge_index, pro1_data.batch
        x = self.pro1_conv1(pro1_x, pro1_edge_index)
        x = self.relu(x)
        x = gep(x, pro1_batch)
        x = self.relu(self.pro1_fc1(x))
        x = self.dropout(x)

        # Process protein 2 with GNN
        pro2_x, pro2_edge_index, pro2_batch = pro2_data.x, pro2_data.edge_index, pro2_data.batch
        xt = self.pro2_conv1(pro2_x, pro2_edge_index)
        xt = self.relu(xt)
        xt = gep(xt, pro2_batch)
        xt = self.relu(self.pro2_fc1(xt))
        xt = self.dropout(xt)

        # Process masif descriptors with transformers
        # Add indicator (0/1) for straight/flipped
        batch_size = mas1_straight.size(0)
        
        # Prepare indicators
        straight_indicator_mas1 = torch.ones((*mas1_straight.shape[:-1], 1), device=mas1_straight.device)
        flipped_indicator_mas1 = torch.zeros((*mas1_flipped.shape[:-1], 1), device=mas1_flipped.device)

        straight_indicator_mas2 = torch.ones((*mas2_straight.shape[:-1], 1), device=mas2_straight.device)
        flipped_indicator_mas2 = torch.zeros((*mas2_flipped.shape[:-1], 1), device=mas2_flipped.device)
        
        first_indicator = torch.ones((*mas1_straight.shape[:-1], 1), device=mas1_straight.device)
        second_indicator = torch.zeros((*mas2_flipped.shape[:-1], 1), device=mas2_flipped.device)

        mas1_straight = self.reducer(mas1_straight)
        mas1_flipped = self.reducer(mas1_flipped)
        mas2_straight = self.reducer(mas2_straight)
        mas2_flipped = self.reducer(mas2_flipped)

        # Concatenate descriptors with indicators
        mas1_straight = torch.cat([mas1_straight, straight_indicator_mas1, first_indicator], dim=-1)
        mas1_flipped = torch.cat([mas1_flipped, flipped_indicator_mas1, first_indicator], dim=-1)
        mas2_straight = torch.cat([mas2_straight, straight_indicator_mas2, second_indicator], dim=-1)
        mas2_flipped = torch.cat([mas2_flipped, flipped_indicator_mas2, second_indicator], dim=-1)
        
        # Process through transformers
        # Combine straight and flipped for each protein
        mas = torch.cat([mas1_straight, mas1_flipped, mas2_straight, mas2_flipped], dim=1)
        
        # Transform sequences (B, L, D) -> (L, B, D) for transformer
        mas = mas.transpose(0, 1)

        # Apply transformers
        mas_out = self.transformer(mas)
        
        # Get mean of transformer outputs for global representation
        mas_out = mas_out.mean(dim=0)

        # Concatenate all features
        combined = torch.cat([x, xt, mas_out], dim=1)
        
        # Final prediction (logits)
        out = self.final_fc(combined)
        return out


In [24]:
model = GCNN_mutual_attention()
model.load_state_dict(torch.load("/workspace/masif_features/GCN_simple_run_5.pth"))
model.to(device)
model.eval()

GCNN Loaded




GCNN_mutual_attention(
  (pro1_conv1): GCNConv(1024, 1024)
  (pro1_fc1): Linear(in_features=1024, out_features=128, bias=True)
  (pro2_conv1): GCNConv(1024, 1024)
  (pro2_fc1): Linear(in_features=1024, out_features=128, bias=True)
  (reducer): Linear(in_features=80, out_features=30, bias=True)
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=32, out_features=32, bias=True)
        )
        (linear1): Linear(in_features=32, out_features=128, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=128, out_features=32, bias=True)
        (norm1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
      )
    )
  )
  (relu)

In [34]:
predictions = torch.Tensor().to(device)
labels = torch.Tensor().to(device)

In [27]:
import sys
import importlib
import torch
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score
from tqdm.auto import tqdm

fold_accuracies = []
fold_aucs = []

for fold_idx in range(5):
    print(f"\n===== Evaluating Fold {fold_idx} =====")

    # Simulate command-line arguments for data_prepare.py
    sys.argv = [
        'notebook',
        '--npy_file', '../masif_features/new_train.npy',
        '--cv_fold', '5',
        '--cv_fold_idx', str(fold_idx)
    ]

    # Reload data_prepare.py to apply fold arguments
    import data_prepare
    importlib.reload(data_prepare)

    # Access testloader from data_prepare
    testloader = data_prepare.testloader  # make sure this exists

    # Load the model for the fold
    model_path = f"/workspace/masif_features/GCN_simple_run_5.pth"
    model.load_state_dict(torch.load(model_path))
    model.to(device)
    model.eval()

    # Store predictions and labels
    predictions = torch.Tensor().to(device)
    labels = torch.Tensor().to(device)

    # Run evaluation
    with torch.no_grad():
        for prot_1, prot_2, label, mas1_straight, mas1_flipped, mas2_straight, mas2_flipped in tqdm(testloader):
            prot_1 = prot_1.to(device)
            prot_2 = prot_2.to(device)
            mas1_straight = mas1_straight.to(device)
            mas1_flipped = mas1_flipped.to(device)
            mas2_straight = mas2_straight.to(device)
            mas2_flipped = mas2_flipped.to(device)
            label = label.to(device)

            output = model(prot_1, prot_2, mas1_straight, mas1_flipped, mas2_straight, mas2_flipped)
            predictions = torch.cat((predictions, output), dim=0)
            labels = torch.cat((labels, label.view(-1, 1)), dim=0)

    # Convert to numpy
    y_true = labels.cpu().numpy().flatten()
    y_pred = predictions.cpu().numpy().flatten()
    y_pred_bin = (y_pred > 0.5).astype(int)

    # Compute metrics
    acc = accuracy_score(y_true, y_pred_bin)
    auc = roc_auc_score(y_true, y_pred)

    print(f"Fold {fold_idx} — Accuracy: {acc:.4f}, AUC: {auc:.4f}")
    fold_accuracies.append(acc)
    fold_aucs.append(auc)

# Final CV summary
print("\n===== Cross-Validation Summary =====")
print(f"Mean Accuracy: {np.mean(fold_accuracies):.4f} ± {np.std(fold_accuracies):.4f}")
print(f"Mean AUC: {np.mean(fold_aucs):.4f} ± {np.std(fold_aucs):.4f}")



===== Evaluating Fold 0 =====


  0%|          | 0/327 [00:00<?, ?it/s]

Fold 0 — Accuracy: 0.8845, AUC: 0.9377

===== Evaluating Fold 1 =====


  0%|          | 0/327 [00:00<?, ?it/s]

Fold 1 — Accuracy: 0.8944, AUC: 0.9772

===== Evaluating Fold 2 =====


  0%|          | 0/327 [00:00<?, ?it/s]

Fold 2 — Accuracy: 0.8989, AUC: 0.9845

===== Evaluating Fold 3 =====


  0%|          | 0/327 [00:00<?, ?it/s]

Fold 3 — Accuracy: 0.9028, AUC: 0.9808

===== Evaluating Fold 4 =====


  0%|          | 0/327 [00:00<?, ?it/s]

Fold 4 — Accuracy: 0.8982, AUC: 0.9816

===== Cross-Validation Summary =====
Mean Accuracy: 0.8957 ± 0.0062
Mean AUC: 0.9724 ± 0.0175


In [36]:
import sys
import importlib
import torch
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score
from metrics import get_mse, get_accuracy, precision, sensitivity, specificity, f_score, mcc, auroc, auprc  
from tqdm.auto import tqdm

all_metrics = {
    'loss': [],
    'accuracy': [],
    'precision': [],
    'sensitivity': [],
    'specificity': [],
    'f1': [],
    'mcc': [],
    'auroc': [],
    'auprc': []
}


for fold_idx in range(5):
    print(f"\n=== Evaluating Fold {fold_idx} ===")

    # Simulate command-line args for data_prepare
    sys.argv = [
        'notebook',
        '--npy_file', '../masif_features/new_train.npy',
        '--cv_fold', '5',
        '--cv_fold_idx', str(fold_idx)
    ]
    
    import data_prepare
    importlib.reload(data_prepare)

    # Access testloader from data_prepare
    testloader = data_prepare.testloader  # make sure this exists

    # Load the model for the fold
    model_path = f"/workspace/masif_features/GCN_simple_run_5.pth"
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()

    # Store predictions and labels
    predictions = torch.Tensor().to(device)
    labels = torch.Tensor().to(device)

    # Evaluation loop
    with torch.no_grad():
        for prot_1, prot_2, label, mas1_straight, mas1_flipped, mas2_straight, mas2_flipped in tqdm(testloader):
            prot_1 = prot_1.to(device)
            prot_2 = prot_2.to(device)
            mas1_straight = mas1_straight.to(device)
            mas1_flipped = mas1_flipped.to(device)
            mas2_straight = mas2_straight.to(device)
            mas2_flipped = mas2_flipped.to(device)
            label = label.to(device)

            # Forward pass
            output = model(prot_1, prot_2, mas1_straight, mas1_flipped, mas2_straight, mas2_flipped)
            predictions = torch.cat((predictions, output), dim=0)
            labels = torch.cat((labels, label.view(-1, 1)), dim=0)

    # Convert to numpy
    labels_np = labels.cpu().numpy().flatten()
    preds_np = predictions.cpu().numpy().flatten()

    # Threshold for classification
    threshold = 0.5

    # Compute metrics
    loss = get_mse(labels_np, preds_np)
    acc = get_accuracy(labels_np, preds_np, threshold)
    prec = precision(labels_np, preds_np, threshold)
    sens = sensitivity(labels_np, preds_np, threshold)
    spec = specificity(labels_np, preds_np, threshold)
    f1 = f_score(labels_np, preds_np, threshold)
    mcc_val = mcc(labels_np, preds_np, threshold)
    auc = auroc(labels_np, preds_np)
    aupr = auprc(labels_np, preds_np)

    # Print per-fold results
    print(f"Loss        : {loss:.4f}")
    print(f"Accuracy    : {acc:.4f}")
    print(f"Precision   : {prec:.4f}")
    print(f"Sensitivity : {sens:.4f}")
    print(f"Specificity : {spec:.4f}")
    print(f"F1 Score    : {f1:.4f}")
    print(f"MCC         : {mcc_val:.4f}")
    print(f"AUROC       : {auc:.4f}")
    print(f"AUPRC       : {aupr:.4f}")

    # Accumulate
    all_metrics['loss'].append(loss)
    all_metrics['accuracy'].append(acc)
    all_metrics['precision'].append(prec)
    all_metrics['sensitivity'].append(sens)
    all_metrics['specificity'].append(spec)
    all_metrics['f1'].append(f1)
    all_metrics['mcc'].append(mcc_val)
    all_metrics['auroc'].append(auc)
    all_metrics['auprc'].append(aupr)

# --- Print average and std of metrics ---
print("\n=== Average Metrics Across All Folds ===")
for name, values in all_metrics.items():
    values = np.array(values)
    print(f"{name.capitalize():<12}: Mean = {values.mean():.4f} | Std = {values.std():.4f}")



=== Evaluating Fold 0 ===


  0%|          | 0/327 [00:00<?, ?it/s]

Loss        : 12.8722
Accuracy    : 88.4468
Precision   : 0.9719
Sensitivity : 0.7924
Specificity : 0.9770
F1 Score    : 0.8730
MCC         : 0.7826
AUROC       : 0.9377
AUPRC       : 0.9557

=== Evaluating Fold 1 ===


  0%|          | 0/327 [00:00<?, ?it/s]

Loss        : 14.3883
Accuracy    : 89.4415
Precision   : 0.9827
Sensitivity : 0.7988
Specificity : 0.9865
F1 Score    : 0.8812
MCC         : 0.8017
AUROC       : 0.9772
AUPRC       : 0.9799

=== Evaluating Fold 2 ===


  0%|          | 0/327 [00:00<?, ?it/s]

Loss        : 14.0090
Accuracy    : 89.8928
Precision   : 0.9848
Sensitivity : 0.8072
Specificity : 0.9879
F1 Score    : 0.8872
MCC         : 0.8102
AUROC       : 0.9845
AUPRC       : 0.9860

=== Evaluating Fold 3 ===


  0%|          | 0/327 [00:00<?, ?it/s]

Loss        : 13.8478
Accuracy    : 90.2757
Precision   : 0.9875
Sensitivity : 0.8220
Specificity : 0.9889
F1 Score    : 0.8972
MCC         : 0.8186
AUROC       : 0.9808
AUPRC       : 0.9844

=== Evaluating Fold 4 ===


  0%|          | 0/327 [00:00<?, ?it/s]

Loss        : 14.2386
Accuracy    : 89.8162
Precision   : 0.9890
Sensitivity : 0.8093
Specificity : 0.9906
F1 Score    : 0.8902
MCC         : 0.8109
AUROC       : 0.9816
AUPRC       : 0.9850

=== Average Metrics Across All Folds ===
Loss        : Mean = 13.8712 | Std = 0.5329
Accuracy    : Mean = 89.5746 | Std = 0.6230
Precision   : Mean = 0.9832 | Std = 0.0060
Sensitivity : Mean = 0.8059 | Std = 0.0101
Specificity : Mean = 0.9862 | Std = 0.0048
F1          : Mean = 0.8858 | Std = 0.0082
Mcc         : Mean = 0.8048 | Std = 0.0123
Auroc       : Mean = 0.9724 | Std = 0.0175
Auprc       : Mean = 0.9782 | Std = 0.0114


In [None]:
### test seminar 

In [54]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCNN(nn.Module):
    def __init__(self, n_output=1, num_features_pro=1024, output_dim=128, dropout=0.2):
        super(GCNN, self).__init__()
        print('GCNN Loaded')
        self.n_output = n_output
        self.pro1_conv1 = GCNConv(num_features_pro, num_features_pro)
        self.pro1_fc1 = nn.Linear(num_features_pro, output_dim)
        self.pro2_conv1 = GCNConv(num_features_pro, num_features_pro)
        self.pro2_fc1 = nn.Linear(num_features_pro, output_dim)
        
        self.descriptor_dim = 80
        self.transformer_dim = 32 - 1
        self.reducer = nn.Linear(self.descriptor_dim, self.transformer_dim)

        # Transformer parameters
        
        self.nhead = 4
        self.num_layers = 2

        # Transformer encoders for masif descriptors
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=self.transformer_dim + 1,  # +1 for the indicator
                nhead=self.nhead,
                dim_feedforward=128,
                dropout=dropout
            ),
            num_layers=self.num_layers
        )
        
        # Output processing
        self.relu = nn.LeakyReLU()
        self.dropout = nn.Dropout(dropout)
        self.sigmoid = nn.Sigmoid()
        
        # Final layers
        combined_dim = 2 * output_dim + 2 * (self.transformer_dim + 1)
        self.final_fc = nn.Linear(combined_dim, self.n_output)

    def forward(self, pro1_data, pro2_data, mas1_straight, mas1_flipped, mas2_straight, mas2_flipped):
        # Process protein 1 with GNN
        pro1_x, pro1_edge_index, pro1_batch = pro1_data.x, pro1_data.edge_index, pro1_data.batch
        x = self.pro1_conv1(pro1_x, pro1_edge_index)
        x = self.relu(x)
        x = gep(x, pro1_batch)
        x = self.relu(self.pro1_fc1(x))
        x = self.dropout(x)

        # Process protein 2 with GNN
        pro2_x, pro2_edge_index, pro2_batch = pro2_data.x, pro2_data.edge_index, pro2_data.batch
        xt = self.pro2_conv1(pro2_x, pro2_edge_index)
        xt = self.relu(xt)
        xt = gep(xt, pro2_batch)
        xt = self.relu(self.pro2_fc1(xt))
        xt = self.dropout(xt)

        # Process masif descriptors with transformers
        # Add indicator (0/1) for straight/flipped
        batch_size = mas1_straight.size(0)
        
        # Prepare indicators
        straight_indicator = torch.ones((*mas1_straight.shape[:-1], 1), device=mas1_straight.device)
        flipped_indicator = torch.zeros((*mas1_flipped.shape[:-1], 1), device=mas1_flipped.device)
        
        mas1_straight = self.reducer(mas1_straight)
        mas1_flipped = self.reducer(mas1_flipped)
        mas2_straight = self.reducer(mas2_straight)
        mas2_flipped = self.reducer(mas2_flipped)

        # Concatenate descriptors with indicators
        mas1_straight = torch.cat([mas1_straight, straight_indicator], dim=-1)
        mas1_flipped = torch.cat([mas1_flipped, flipped_indicator], dim=-1)
        mas2_straight = torch.cat([mas2_straight, straight_indicator], dim=-1)
        mas2_flipped = torch.cat([mas2_flipped, flipped_indicator], dim=-1)
        
        # Process through transformers
        # Combine straight and flipped for each protein
        mas1 = torch.cat([mas1_straight, mas1_flipped], dim=1)
        mas2 = torch.cat([mas2_straight, mas2_flipped], dim=1)
        
        # Transform sequences (B, L, D) -> (L, B, D) for transformer
        mas1 = mas1.transpose(0, 1)
        mas2 = mas2.transpose(0, 1)

        # Apply transformers
        mas1_out = self.transformer(mas1)
        mas2_out = self.transformer(mas2)
        
        # Get mean of transformer outputs for global representation
        mas1_out = mas1_out.mean(dim=0)
        mas2_out = mas2_out.mean(dim=0)

        # Concatenate all features
        combined = torch.cat([x, xt, mas1_out, mas2_out], dim=1)
        
        # Final prediction (logits)
        out = self.final_fc(combined)
        return out

In [55]:
model = GCNN()
model.load_state_dict(torch.load("/workspace/masif_features/GCN_old.pth"))
model.to(device)
model.eval()

GCNN Loaded




GCNN(
  (pro1_conv1): GCNConv(1024, 1024)
  (pro1_fc1): Linear(in_features=1024, out_features=128, bias=True)
  (pro2_conv1): GCNConv(1024, 1024)
  (pro2_fc1): Linear(in_features=1024, out_features=128, bias=True)
  (reducer): Linear(in_features=80, out_features=31, bias=True)
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=32, out_features=32, bias=True)
        )
        (linear1): Linear(in_features=32, out_features=128, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=128, out_features=32, bias=True)
        (norm1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
      )
    )
  )
  (relu): LeakyReLU

In [56]:
predictions = torch.Tensor().to(device)
labels = torch.Tensor().to(device)

In [12]:
from tqdm.auto import tqdm

with torch.no_grad():
    for prot_1, prot_2, label, mas1_straight, mas1_flipped, mas2_straight, mas2_flipped in tqdm(testloader):
        prot_1 = prot_1.to(device)
        prot_2 = prot_2.to(device)
        mas1_straight = mas1_straight.to(device)
        mas1_flipped = mas1_flipped.to(device)
        mas2_straight = mas2_straight.to(device)
        mas2_flipped = mas2_flipped.to(device)
        label = label.to(device)
        # Forward pass
        output = model(prot_1, prot_2, mas1_straight, mas1_flipped, mas2_straight, mas2_flipped)
        # Collect predictions and labels
        predictions = torch.cat((predictions, output), dim=0)
        labels = torch.cat((labels, label.view(-1, 1)), dim=0)
            
# Convert to numpy for metrics
labels = labels.cpu().numpy().flatten()
predictions = predictions.cpu().numpy().flatten()

  0%|          | 0/327 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [58]:
threshold = 0.5 

In [59]:
loss = get_mse(labels, predictions)
acc = get_accuracy(labels, predictions, threshold)
prec = precision(labels, predictions, threshold)
sensitivity = sensitivity(labels, predictions, threshold)
specificity = specificity(labels, predictions, threshold)
f1 = f_score(labels, predictions, threshold)
mcc = mcc(labels, predictions, threshold)
auroc = auroc(labels, predictions)
auprc = auprc(labels, predictions)


In [60]:
loss

44.943645

In [61]:
acc

88.77941981390258

In [62]:
prec

0.9108352144469526

In [63]:
sensitivity 


0.864951768488746

In [64]:
specificity 


0.9116331096196868

In [65]:
f1


0.8873007146783948

In [66]:
mcc 

0.776759973403508

In [67]:
auroc


0.9404359418872033

In [69]:
auprc 

0.9498247976709834

In [None]:
## Plots 

In [79]:
import os
os.makedirs("plots_seminar", exist_ok=True)

In [81]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, precision_recall_curve, confusion_matrix, ConfusionMatrixDisplay

# ROC Curve
fpr, tpr, _ = roc_curve(labels, predictions)
plt.figure(figsize=(6, 5))
plt.plot(fpr, tpr, label=f"AUROC = {auroc:.2f}")
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig("plots_seminar/roc_curve.png")
plt.close()

In [82]:
# Precision-Recall Curve
precision_vals, recall_vals, _ = precision_recall_curve(labels, predictions)
plt.figure(figsize=(6, 5))
plt.plot(recall_vals, precision_vals, label=f"AUPRC = {auprc:.2f}")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig("plots_seminar/precision_recall_curve.png")
plt.close()

In [84]:
# Confusion Matrix
pred_binary = (predictions >= threshold).astype(int)
cm = confusion_matrix(labels, pred_binary)
plt.figure(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["Pred 0", "Pred 1"], yticklabels=["True 0", "True 1"])
plt.title("Confusion Matrix")
plt.ylabel("True Label")
plt.xlabel("Predicted Label")
plt.tight_layout()
plt.savefig("plots_seminar/confusion_matrix.png")
plt.close()

In [85]:
# Distribution of prediction scores
plt.figure(figsize=(6, 5))
plt.hist(predictions[labels == 0], bins=30, alpha=0.6, label="Negative")
plt.hist(predictions[labels == 1], bins=30, alpha=0.6, label="Positive")
plt.axvline(x=threshold, color='red', linestyle='--', label=f"Threshold = {threshold}")
plt.title("Prediction Score Distribution")
plt.xlabel("Predicted Score")
plt.ylabel("Count")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig("plots_seminar/prediction_distribution.png")
plt.close()