# Hyperparameter Tuning with Optuna

In [None]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import silhouette_samples
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import umap
from tqdm import tqdm
import optuna

from utils import prepare_data
from build_model import Autoencoder_5_Layers, Autoencoder_4_Layers, Autoencoder_3_Layers
from train_model import train_autoencoder

### Prepare Data

In [9]:
# Use the GPU when torch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Pre-split train/test tables for each omics layer
DNA_meth_train = prepare_data("../Data/DNAMethylation_train.csv")
DNA_meth_test = prepare_data("../Data/DNAMethylation_test.csv")
RNA_seq_train = prepare_data("../Data/RNAseq_train.csv")
RNA_seq_test = prepare_data("../Data/RNAseq_test.csv")

# Full tables for the k-means step, loaded with transpose=True and normalise=True
dna_methylation = prepare_data("../Data/DNAMethylation.csv", transpose=True, normalise=True)
rna_seq = prepare_data("../Data/RNAseq.csv", transpose=True, normalise=True)

# Every merge below pairs rows through the index of both tables
_INDEX_JOIN = {"left_index": True, "right_index": True}

X_train = pd.merge(DNA_meth_train, RNA_seq_train, **_INDEX_JOIN)
X_test = pd.merge(DNA_meth_test, RNA_seq_test, **_INDEX_JOIN)
data = pd.merge(dna_methylation, rna_seq, **_INDEX_JOIN)

# Replace the train/test frames with float32 tensors living on `device`
X_train = torch.tensor(X_train.to_numpy(), dtype=torch.float32, device=device)
X_test = torch.tensor(X_test.to_numpy(), dtype=torch.float32, device=device)

### Define the objective

In [None]:
def _silhouette_of_encoding(model, n_clusters):
    """Encode the module-level ``data`` with ``model`` and score a k-means fit.

    Args:
        model: Trained autoencoder exposing ``eval()`` and ``encode()``.
        n_clusters: Number of clusters passed to ``KMeans``.

    Returns:
        The silhouette score of the k-means labels on the encoded data.
    """
    tensor_data = torch.tensor(data.to_numpy(), dtype=torch.float32).to(device)
    model.eval()
    with torch.inference_mode():
        # Bring the encoding back to host memory as a NumPy array: a tensor
        # that lives on CUDA cannot be converted by pandas/scikit-learn.
        # NOTE(review): assumes model.encode returns a torch.Tensor - confirm.
        encoded_data = model.encode(tensor_data).cpu().numpy()

    encoded_dataframe = pd.DataFrame(encoded_data)
    cluster_labels = KMeans(n_clusters=n_clusters).fit_predict(encoded_dataframe)
    return silhouette_score(encoded_dataframe, cluster_labels)


def objective(trial):
    """Optuna objective: train one autoencoder and score its latent clustering.

    The trial picks the architecture ("model_layers"), the number of training
    epochs ("epochs") and the latent size ("latent_space"). The chosen model is
    trained on ``X_train``/``X_test`` with an L1 loss and RMSprop, then the
    module-level ``data`` is encoded and clustered with k-means.

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        The silhouette score of the k-means clustering of the encoded data.
    """
    # Architecture label -> model class; the key order is the order of the
    # categorical choices offered to Optuna.
    autoencoders = {
        "3 Layers": Autoencoder_3_Layers,
        "4 Layers": Autoencoder_4_Layers,
        "5 Layers": Autoencoder_5_Layers,
    }

    # Model and model hyperparameters
    model_layers = trial.suggest_categorical("model_layers", list(autoencoders))
    epochs = trial.suggest_int("epochs", 10, 400, step=10)
    latent_space = trial.suggest_int("latent_space", 20, 200)

    # k-means hyperparameters: the cluster count is held fixed rather than
    # suggested, so it does not appear in trial.params.
    n_clusters = 2

    model = autoencoders[model_layers](input_features=X_train.shape[1],
                                       hidden_features=latent_space)
    # The returned train/test losses are not used by the objective.
    model, _, _ = train_autoencoder(model=model,
                                    loss_fn=nn.L1Loss(),
                                    optimizer=torch.optim.RMSprop(model.parameters(), lr=0.00001),
                                    epochs=epochs,
                                    X_train=X_train,
                                    X_test=X_test,
                                    updates=False)

    return _silhouette_of_encoding(model, n_clusters)

In [None]:
# Search settings
N_TRIALS = 1000
TRIALS_CSV = "../Data/optuna_trials_3.csv"

# Maximise the value returned by `objective` over N_TRIALS trials
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=N_TRIALS)

# Report the best trial
print("Best hyperparameters:", study.best_params)
print("Best silhouette score:", study.best_value)

# Persist every trial as a CSV table (row index omitted)
df = study.trials_dataframe()
df.to_csv(TRIALS_CSV, index=False)

# Visualise the Results

In [None]:
# Show which tuned hyperparameters affect the objective value the most.
# Kept as the cell's final bare expression so the notebook renders the result.
optuna.visualization.plot_param_importances(study)

In [None]:
# Plot the optimisation history of the study.
# Kept as the cell's final bare expression so the notebook renders the result.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Parallel-coordinates plot of the study's trials.
# Kept as the cell's final bare expression so the notebook renders the result.
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Completed trials only, with the two plotted columns given readable names
df = study.trials_dataframe()
df = df.loc[df["state"] == "COMPLETE"].rename(
    columns={"value": "Silhouette Score", "params_model_layers": "Model Layers"}
)

# One box per architecture showing the spread of its silhouette scores
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(x="Model Layers", y="Silhouette Score", data=df, palette="Set2", ax=ax)

# Titles, axis labels and a light dashed grid
ax.set_title("Silhouette Score Distribution per Model Layer")
ax.set_xlabel("Number of Layers")
ax.set_ylabel("Silhouette Score")
ax.grid(True, linestyle="--", alpha=0.6)

plt.show()


In [None]:
import matplotlib.pyplot as plt

# Keep completed trials that actually carry an "n_clusters" parameter.
# Optuna only stores suggested values in trial.params, so when the objective
# uses a fixed cluster count the key is missing and indexing it would raise
# KeyError.
completed_trials = [
    t for t in study.trials
    if t.state == optuna.trial.TrialState.COMPLETE and "n_clusters" in t.params
]

# Parallel lists, one entry per kept trial
trial_numbers = [t.number for t in completed_trials]
silhouette_scores = [t.value for t in completed_trials]
n_clusters = [t.params["n_clusters"] for t in completed_trials]

if not completed_trials:
    print("No completed trial has an 'n_clusters' parameter; nothing to plot.")
else:
    # Scatter plot, points coloured by their silhouette score
    plt.figure(figsize=(8, 6))
    sc = plt.scatter(n_clusters, silhouette_scores, c=silhouette_scores)

    # Customize plot
    plt.xlabel("Number of Clusters")
    plt.ylabel("Silhouette Score")
    plt.title("Silhouette Score per Number of Clusters Across Optuna Trials")

    # Show plot
    plt.show()


In [None]:
# Display the current contents of `df`; as the cell's final bare expression it
# is rendered by the notebook.
df

# Statistical tests

In [None]:
from scipy.stats import mannwhitneyu

# Two-sided Mann-Whitney U test comparing the silhouette scores of the
# "4 Layers" trials against those of the "5 Layers" trials.
scores_4_layers = df.loc[df["Model Layers"] == "4 Layers", "Silhouette Score"]
scores_5_layers = df.loc[df["Model Layers"] == "5 Layers", "Silhouette Score"]
u_stat, p_value = mannwhitneyu(scores_4_layers, scores_5_layers, alternative="two-sided")
print(f"Mann-Whitney U: {u_stat}, p-value: {p_value}")

In [None]:
# Two-sided Mann-Whitney U test comparing silhouette scores of trials run with
# 2 clusters against trials run with 3 clusters.
# The "params_n_clusters" column only exists when n_clusters was a suggested
# (tuned) parameter of the study, so check for it instead of raising KeyError.
if "params_n_clusters" not in df.columns:
    print("No 'params_n_clusters' column in df; skipping the n_clusters comparison.")
else:
    scores_2_clusters = df.loc[df["params_n_clusters"] == 2, "Silhouette Score"]
    scores_3_clusters = df.loc[df["params_n_clusters"] == 3, "Silhouette Score"]
    if scores_2_clusters.empty or scores_3_clusters.empty:
        # The test needs at least one observation in each group.
        print("Need trials with both 2 and 3 clusters; skipping the n_clusters comparison.")
    else:
        u_stat, p_value = mannwhitneyu(scores_2_clusters, scores_3_clusters, alternative="two-sided")
        print(f"Mann-Whitney U: {u_stat}, p-value: {p_value}")