In [6]:
import pandas as pd
import numpy as np
import torch
import optuna

import sys
sys.path.append('../utils')

from initialize_results_df import initialize_results_df
from load_sequence_data import load_sequence_data
from optuna_cnn_kmer_utils import *
from k_mer_data_loader import *


In [7]:
# K-MER CNN

In [None]:
# --- Tokenisation / model-input configuration ---
k = 5               # k-mer length
stride = 1          # step between consecutive k-mers
embedding_dim = 64
# Tokens kept per sequence. A length-101 sequence yields
# (101 - k) // stride + 1 = 97 k-mers, so 97 would be the alternative here.
max_len = 50
batch_size = 32

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Forward slashes work on Windows as well as POSIX systems, unlike the
# backslash-separated form, and match the paths used by the later cells.
data_dir = '../Data'
excel_dir = '../Outputs/excel_results.xlsx'

results_df, excel_df = initialize_results_df(data_dir, excel_dir)

# Tune on a single dataset: the entry labelled 1 in the results table.
train_df = load_sequence_data(results_df['train_path'][1])
test_df = load_sequence_data(results_df['test_path'][1])

# Build the k-mer vocabulary once; one extra id is reserved on top of it
# (presumably the padding id -- confirm against the data-loader helper).
vocab = build_kmer_vocab(k)
vocab_size = len(vocab) + 1

train_loader, valid_loader, test_loader = prepare_kmer_loaders(
    train_df['sequence'].tolist(), train_df['label'].values,
    test_df['sequence'].tolist(), test_df['label'].values,
    vocab, k, stride, max_len, batch_size
)

In [6]:

# Hyperparameter ranges handed to the Optuna pipeline. Apart from
# "num_layers" and "embedding_dim", the trial log shows each entry being
# sampled once per layer (units_0, kernel_size_0, ...).
search_space = {
    "num_layers": {"type": "int", "low": 4, "high": 8},
    # Single-choice categorical: keeps the value fixed while still recording
    # it in the trial parameters.
    "embedding_dim": {"type": "categorical", "choices": [64]},
    "units": {"type": "categorical", "choices": [32, 64, 128]},
    "kernel_size": {"type": "categorical", "choices": [5, 7, 11]},
    "activation": {"type": "categorical", "choices": ["relu", "gelu", "silu"]},
    "dropout": {"type": "float", "low": 0.1, "high": 0.5},
}

# Reuse the values from the configuration cell instead of repeating literals,
# so the model always matches the data loaders, and fall back to the CPU when
# no GPU is present instead of failing on a hard-coded 'cuda'.
best_model, best_params, acc, study = run_optuna_pipeline(
    train_loader, valid_loader,
    vocab_size=vocab_size,
    device=str(device),
    epochs=10,
    n_trials=2,
    max_len=max_len,
    save_path='best_model_kmer.pt',
    search_space=search_space
)

print(best_params)
print(acc)


[I 2025-05-02 10:21:33,636] A new study created in memory with name: no-name-9c6968a9-8f11-4a66-b11c-a0ba0d56ae10
[I 2025-05-02 10:21:39,682] Trial 0 finished with value: 0.5951492786407471 and parameters: {'num_layers': 4, 'embedding_dim': 64, 'units_0': 128, 'kernel_size_0': 11, 'activation_0': 'gelu', 'dropout_0': 0.3096533241889008, 'units_1': 128, 'kernel_size_1': 7, 'activation_1': 'silu', 'dropout_1': 0.396843340121502, 'units_2': 32, 'kernel_size_2': 5, 'activation_2': 'gelu', 'dropout_2': 0.3770255953635342, 'units_3': 128, 'kernel_size_3': 7, 'activation_3': 'gelu', 'dropout_3': 0.2490459672114775}. Best is trial 0 with value: 0.5951492786407471.
[I 2025-05-02 10:21:47,042] Trial 1 finished with value: 0.611940324306488 and parameters: {'num_layers': 8, 'embedding_dim': 64, 'units_0': 128, 'kernel_size_0': 7, 'activation_0': 'gelu', 'dropout_0': 0.2517571691264483, 'units_1': 32, 'kernel_size_1': 7, 'activation_1': 'silu', 'dropout_1': 0.42124381299903235, 'units_2': 32, 'ker

{'num_layers': 8, 'embedding_dim': 64, 'units_0': 128, 'kernel_size_0': 7, 'activation_0': 'gelu', 'dropout_0': 0.2517571691264483, 'units_1': 32, 'kernel_size_1': 7, 'activation_1': 'silu', 'dropout_1': 0.42124381299903235, 'units_2': 32, 'kernel_size_2': 11, 'activation_2': 'relu', 'dropout_2': 0.2151194903695739, 'units_3': 64, 'kernel_size_3': 7, 'activation_3': 'gelu', 'dropout_3': 0.4150442815358266, 'units_4': 128, 'kernel_size_4': 7, 'activation_4': 'silu', 'dropout_4': 0.3880892389484897, 'units_5': 128, 'kernel_size_5': 7, 'activation_5': 'relu', 'dropout_5': 0.13444223875666297, 'units_6': 64, 'kernel_size_6': 11, 'activation_6': 'silu', 'dropout_6': 0.38751685998232377, 'units_7': 32, 'kernel_size_7': 5, 'activation_7': 'relu', 'dropout_7': 0.14052723641779222}
0.5858209133148193


In [None]:
# Save the best hyperparameters found by the study (the weights themselves
# are written separately via the pipeline's save_path).
import json

with open("final_model_hparams.json", "w", encoding="utf-8") as f:
    # indent keeps the file human-readable; json.load reads it back unchanged.
    json.dump(study.best_params, f, indent=2)


In [29]:
# ✅ Load the saved best model weights.
# map_location remaps tensors that were saved from a GPU so the checkpoint
# also loads on a CPU-only machine.
best_model.load_state_dict(torch.load('best_model_kmer.pt', map_location=device))
best_model.to(device)
best_model.eval()

# ✅ Evaluate on test_loader
acc_test, preds_test, labels_test = evaluate(best_model, test_loader, device)

print(f"Test Accuracy: {acc_test:.4f}")


Test Accuracy: 0.7341


# LOOPING THROUGH FOLDERS

In [6]:
import pandas as pd
import numpy as np
import torch
import optuna
import json

import sys
sys.path.append('../utils')

from initialize_results_df import initialize_results_df
from load_sequence_data import load_sequence_data
from optuna_cnn_kmer_utils import *
from k_mer_data_loader import *


In [None]:
# Where the dataset folders live and where the results workbook is kept.
data_dir, excel_path = '../Data', '../Outputs/50_CNN_KM.xlsx'

# Build the results table (one row per dataset folder, with its train/test
# paths) together with the frame that collects the metrics.
results_df, excel_df = initialize_results_df(data_dir, excel_path)

Excel file saved at: ../Outputs/50_CNN_KM.xlsx


In [14]:
# Read the tuned hyperparameters back from disk.
with open('../Outputs/optuna_cnn_k_mers_hypparams.json', 'r') as hp_file:
    hp = json.load(hp_file)

# If the file does not record an embedding size, default it to 64
# (or whatever value was tuned).
hp.setdefault('embedding_dim', 64)


In [15]:
vocab = build_kmer_vocab(k=5)
vocab_size = len(vocab) + 1  # +1 for padding

# The network has to be rebuilt exactly as it was when the checkpoint was
# written, otherwise load_state_dict fails with missing/unexpected
# conv_layers.* keys. The tuning run used max_len=50, so use the same value.
# NOTE(review): assumes this checkpoint was produced with max_len=50; if it
# came from a different run, set max_len (and hp) to that run's values.
max_len = 50

# Resolve the device first so the checkpoint can be mapped onto it; without
# map_location a checkpoint saved from a GPU cannot be loaded on a CPU-only
# machine.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = DynamicCNN(vocab_size, hp, max_len=max_len)
model.load_state_dict(torch.load('../Outputs/optuna_cnn_k_mers.pt', map_location=device))
model.to(device)

print("✅ Model loaded and ready!")


RuntimeError: Error(s) in loading state_dict for DynamicCNN:
	Missing key(s) in state_dict: "conv_layers.12.weight", "conv_layers.12.bias". 
	Unexpected key(s) in state_dict: "conv_layers.11.weight", "conv_layers.11.bias". 

In [4]:
# Imported once per cell run rather than on every loop iteration.
from sklearn.metrics import average_precision_score, roc_auc_score

# The loss holds no per-folder state, so it is built once. It is referenced
# through `torch.nn` because `nn` is never imported explicitly in this notebook.
criterion = torch.nn.BCEWithLogitsLoss()

# Fine-tune the shared model on each dataset folder in turn (first two rows
# only) and record its metrics in excel_df.
for idx, row in results_df.iloc[:2].iterrows():
    train_path = row['train_path']
    test_path = row['test_path']
    folder_name = row['folder_name']

    print(f"🔄 Processing {folder_name}")

    # --- Load data ---
    train_df = load_sequence_data(train_path)
    test_df = load_sequence_data(test_path)

    # --- Tokenize ---
    # NOTE(review): stride=2 here, whereas the tuning cell used stride=1 --
    # confirm this difference is intended.
    X_train = [tokenize_sequence(seq, vocab, k=5, stride=2) for seq in train_df['sequence']]
    X_test = [tokenize_sequence(seq, vocab, k=5, stride=2) for seq in test_df['sequence']]
    y_train = train_df['label'].tolist()
    y_test = test_df['label'].tolist()

    # --- Longest tokenized sequence in this folder ---
    # Kept in its own name so the global `max_len` the model was built with
    # is not overwritten by the loop.
    folder_max_len = max(len(seq) for seq in X_train + X_test)

    # --- Prepare datasets/loaders ---
    train_dataset = PreTokenizedDataset(X_train, y_train, max_len=folder_max_len)
    test_dataset = PreTokenizedDataset(X_test, y_test, max_len=folder_max_len)

    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32)

    # --- Fine-tune same model ---
    # A fresh optimizer per folder: Adam's running statistics are reset for
    # each dataset while the model weights carry over.
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    train_one_epoch(model, train_loader, optimizer, criterion, device)

    # --- Evaluate ---
    train_acc, _, _ = evaluate(model, train_loader, device)
    test_acc, test_preds, test_labels = evaluate(model, test_loader, device)

    # --- Ranking metrics on the test split ---
    # NOTE(review): assumes evaluate() returns scores/probabilities rather
    # than thresholded 0/1 predictions -- confirm, otherwise the AUCs are
    # computed from hard labels.
    # .detach().cpu() makes the conversion safe if the tensors are on the GPU.
    test_probs = test_preds.detach().cpu().numpy()
    test_targets = test_labels.detach().cpu().numpy()

    test_pr_auc = average_precision_score(test_targets, test_probs)
    test_roc_auc = roc_auc_score(test_targets, test_probs)

    # ✅ Log metrics
    # NOTE(review): the 'pr-roc' column receives the test ROC AUC; the column
    # name is kept as-is because the sheet layout is defined elsewhere.
    excel_df.at[idx, 'train_accuracy'] = train_acc
    excel_df.at[idx, 'test_accuracy'] = test_acc
    excel_df.at[idx, 'pr-roc'] = test_roc_auc
    excel_df.at[idx, 'pr-auc'] = test_pr_auc

    print(f"✅ {folder_name}: train_acc={train_acc:.4f}, test_acc={test_acc:.4f}")

# # ✅ Save final model
# torch.save(model.state_dict(), "outputs/final_model.pt")
# print("✅ Final model saved!")


🔄 Processing wgEncodeAwgTfbsBroadDnd41CtcfUniPk


NameError: name 'model' is not defined