# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc, precision_score, recall_score, f1_score

# Load the dataset
file_path = "mCH_on_off_calc_params.xlsx"
data = pd.read_excel(file_path)

# Separate features (X) and experimental values (Y) by column position:
# the first five columns are the targets, every later column is a feature.
X = data.iloc[:, 5:]
Y = data.iloc[:, :5]

target_columns = ["OFF AVG", "ON AVG Truncated", "ON AVG Full", "ON OFF Truncated", "ON OFF Full"]

# The targets are sliced by position above but looked up by name later on.
# Check the two agree now, so a renamed or reordered sheet fails immediately
# with a clear message instead of part-way through the analysis.
missing_targets = [name for name in target_columns if name not in Y.columns]
if missing_targets:
    raise KeyError(
        f"Target column(s) {missing_targets} not found among the first five "
        f"columns of {file_path!r}: {list(Y.columns)}"
    )

# Quantile levels tried as the "High" cut-off: 0.50, 0.55, ..., 0.80.
# linspace fixes the number of points explicitly (a float-step arange relies on
# an end value chosen to make the last point land inside the range), and the
# rounding removes floating-point noise such as 0.6000000000000001 so the
# levels are reported as clean two-decimal values.
threshold_high_values = np.round(np.linspace(0.50, 0.80, 7), 2)
# Quantile level of the lower reference cut-off.
threshold_low_value = 0.35

# Dictionary to collect all data to export to Excel
excel_data = {}
# One dict of summary metrics per evaluated (target, threshold) combination.
summary_data = []

# Sweep every target against every "High" quantile cut-off. For each pair:
#   1. binarise the target at the cut-off (High = 1, Low = 0),
#   2. keep 20 features via recursive feature elimination,
#   3. choose the number of PLS components by 5-fold CV accuracy,
#   4. fit logistic regression on the PLS scores and evaluate on the test rows,
#   5. append the metrics to ``summary_data``.

# The train/test partition is computed once, on row positions. Without
# stratification the shuffle depends only on the number of rows and the seed,
# so this is the same partition for every target and threshold.
train_idx, test_idx = train_test_split(
    np.arange(len(X)), test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, so that test-set means and
# variances never leak into the features the models are trained on. The
# features do not change between iterations, so this is done once.
scaler = StandardScaler()
X_train = scaler.fit_transform(X.iloc[train_idx])
X_test = scaler.transform(X.iloc[test_idx])

for target in target_columns:
    target_values = Y[target]
    # The low cut-off does not depend on the high threshold being swept.
    threshold_low = target_values.quantile(threshold_low_value)

    for threshold_high_value in threshold_high_values:
        threshold_high = target_values.quantile(threshold_high_value)

        # Skip combinations where the low and high cut-offs coincide or cross.
        if threshold_low >= threshold_high:
            continue

        # Encode the classes explicitly: High (strictly above the cut-off) is
        # the positive class 1, Low is 0. Encoding the string labels with
        # LabelEncoder would sort them alphabetically ("High" -> 0,
        # "Low" -> 1), making precision/recall/F1 describe the Low class.
        # NOTE(review): a missing target value compares as False and is
        # therefore labelled Low; assumes the targets are complete - confirm.
        Y_encoded = (target_values > threshold_high).to_numpy().astype(int)
        Y_train = Y_encoded[train_idx]
        Y_test = Y_encoded[test_idx]

        # Recursive feature elimination down to 20 features, fitted on the
        # training rows only.
        estimator = LogisticRegression()
        selector = RFE(estimator, n_features_to_select=20)
        X_train_rfe = selector.fit_transform(X_train, Y_train)
        X_test_rfe = selector.transform(X_test)

        # Names of the retained features. Not written to the summary; kept
        # available for inspection after the loop.
        selected_feature_indices = selector.support_
        selected_feature_names = X.columns[selected_feature_indices].tolist()

        # Try 1..20 PLS components (capped by the number of selected features)
        # and keep the count with the best mean 5-fold CV accuracy.
        # NOTE(review): the PLS projection is fitted on all training rows
        # before cross-validation, so the held-out folds have already
        # influenced the latent space and the CV accuracy is optimistic.
        n_components_range = range(1, min(21, X_train_rfe.shape[1] + 1))
        scores = []
        for n in n_components_range:
            pls = PLSRegression(n_components=n)
            X_latent = pls.fit_transform(X_train_rfe, Y_train)[0]
            classifier = LogisticRegression()
            score = cross_val_score(
                classifier, X_latent, Y_train, cv=5, scoring='accuracy'
            ).mean()
            scores.append(score)
        best_position = int(np.argmax(scores))
        best_n_components = n_components_range[best_position]
        best_cv_accuracy = scores[best_position]
        cv_error = 1 - best_cv_accuracy

        # Refit PLS with the chosen number of components and project both sets.
        pls = PLSRegression(n_components=best_n_components)
        X_train_latent = pls.fit_transform(X_train_rfe, Y_train)[0]
        X_test_latent = pls.transform(X_test_rfe)

        classifier = LogisticRegression()
        classifier.fit(X_train_latent, Y_train)
        Y_pred = classifier.predict(X_test_latent)
        # Column 1 is the probability of class 1, i.e. "High".
        Y_prob = classifier.predict_proba(X_test_latent)[:, 1]

        # Record NaN instead of aborting the sweep if the ROC computation
        # raises ValueError.
        try:
            fpr, tpr, _ = roc_curve(Y_test, Y_prob)
            roc_auc = auc(fpr, tpr)
        except ValueError:
            roc_auc = np.nan

        # zero_division=0 reports 0 when a metric is undefined (e.g. no
        # positive predictions).
        precision = precision_score(Y_test, Y_pred, zero_division=0)
        recall = recall_score(Y_test, Y_pred, zero_division=0)
        f1 = f1_score(Y_test, Y_pred, zero_division=0)

        # Save summary stats
        summary_data.append({
            "Target": target,
            "Threshold High": threshold_high_value,
            "AUROC": roc_auc,
            "Precision": precision,
            "Recall": recall,
            "F1": f1,
            "CV Error": cv_error
        })

# Write the collected metrics (one row per evaluated target/threshold
# combination) to an Excel workbook, without the DataFrame index column.
output_path = "threshold_high_sweep_summary.xlsx"
summary_df = pd.DataFrame(data=summary_data)
summary_df.to_excel(excel_writer=output_path, index=False)