# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import os
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, accuracy_score, f1_score

# Target variables for classification. These names must be the leading
# columns of the spreadsheet; everything after them is used as a feature.
target_columns = ["OFF AVG", "ON AVG Truncated", "ON AVG Full", "ON OFF Truncated", "ON OFF Full"]
n_targets = len(target_columns)

# Load the dataset
file_path = "mCH_on_off_calc_params.xlsx"
data = pd.read_excel(file_path)

# Fail fast if the sheet layout does not match the positional split below.
# Otherwise a target column could silently end up among the features (label
# leakage) or the per-target lookup would fail later, deep inside the loop.
missing_targets = [name for name in target_columns if name not in data.columns[:n_targets]]
if missing_targets:
    raise KeyError(
        f"Expected the first {n_targets} columns of {file_path!r} to be the target "
        f"columns; not found there: {missing_targets}"
    )

# Separate features (X) and experimental values (Y)
X = data.iloc[:, n_targets:]  # Features: all columns after the target columns
Y = data.iloc[:, :n_targets]  # Targets: the leading target columns

# Dictionary to store model parameters (filled per target by the training loop)
model_params = {}

# Dictionary to collect all data to export to Excel
excel_data = {}

# Initialize subplots: one column per target, four rows of diagnostics
# (row 0: CV accuracy, row 1: latent scatter, row 2: loadings, row 3: ROC)
fig, axes = plt.subplots(nrows=4, ncols=n_targets, figsize=(5 * n_targets, 15))
plt.subplots_adjust(hspace=0.5, wspace=0.5)

# Font settings are global, so apply them once before any axis text is created
plt.rcParams['font.family'] = 'Arial'
plt.rcParams['font.size'] = 12

# Loop through each target variable: binarize it, select features, fit a
# PLS-DA style model (PLS scores -> logistic regression) and plot diagnostics.
for i, target in enumerate(target_columns):
    # Ignore rows whose target value is missing; they cannot be labelled
    labelled = Y[target].notna()
    target_values = Y[target][labelled]
    X_target = X.loc[labelled]

    # Define class threshold (65th percentile of the observed target values)
    threshold_high = target_values.quantile(0.65)

    # Create binary labels: "High" for values above the 65th percentile, all others "Low"
    Y_classes = pd.cut(target_values, bins=[-float("inf"), threshold_high, float("inf")], labels=["Low", "High"])

    # Encode via the categorical codes so that Low=0 and High=1.
    # LabelEncoder sorts class names alphabetically (High=0, Low=1), which made
    # predict_proba[:, 1] the probability of "Low" instead of "High".
    class_names = list(Y_classes.cat.categories)
    Y_encoded = Y_classes.cat.codes.to_numpy()

    # Split data into training and testing sets
    X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(
        X_target, Y_encoded, test_size=0.2, random_state=42
    )

    # Standardize the features using statistics from the training rows only,
    # so no information from the held-out test rows leaks into preprocessing
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # Recursive Feature Elimination (RFE) with Logistic Regression
    n_selected = min(20, X_train.shape[1])  # cannot keep more features than exist
    selector = RFE(LogisticRegression(), n_features_to_select=n_selected)
    X_train_rfe = selector.fit_transform(X_train, Y_train)
    X_test_rfe = selector.transform(X_test)

    selected_feature_indices = selector.support_  # boolean mask over X's columns
    selected_feature_names = X.columns[selected_feature_indices].tolist()

    # Perform cross-validation to determine the optimal number of components.
    # The component count is capped by both the feature and the sample count.
    # NOTE(review): RFE and PLS are fitted on the whole training split before
    # cross_val_score runs, so these CV accuracies are optimistic; nest the
    # steps inside each fold if an unbiased estimate is needed.
    max_components = min(20, X_train_rfe.shape[1], X_train_rfe.shape[0])
    n_components_range = range(1, max_components + 1)
    scores = []

    for n in n_components_range:
        pls = PLSRegression(n_components=n)
        X_latent = pls.fit_transform(X_train_rfe, Y_train)[0]  # [0] = X scores
        score = cross_val_score(LogisticRegression(), X_latent, Y_train, cv=5, scoring='accuracy').mean()
        scores.append(score)

    best_cv_accuracy = max(scores)
    cv_error = 1 - best_cv_accuracy

    # Get the best number of components
    best_n_components = n_components_range[np.argmax(scores)]

    # Plot cross-validation accuracy vs. number of components
    axes[0, i].plot(n_components_range, scores, marker='o', linestyle='-')
    axes[0, i].set_title(f"Cross-Validation Accuracy: {target}")
    axes[0, i].set_xlabel("Number of Components")
    axes[0, i].set_ylabel("Accuracy")
    axes[0, i].grid()

    # Fit the final PLS model with optimal components
    pls = PLSRegression(n_components=best_n_components)
    X_train_latent = pls.fit_transform(X_train_rfe, Y_train)[0]
    X_test_latent = pls.transform(X_test_rfe)

    # Train logistic regression classifier on latent variables
    classifier = LogisticRegression()
    classifier.fit(X_train_latent, Y_train)
    Y_pred = classifier.predict(X_test_latent)

    # Scatter plot of PLS-DA latent variables. A single-component model has no
    # second latent variable, so fall back to a flat axis instead of crashing.
    lv1 = X_train_latent[:, 0]
    if X_train_latent.shape[1] > 1:
        lv2 = X_train_latent[:, 1]
        lv2_label = "Latent Variable 2"
    else:
        lv2 = np.zeros_like(lv1)
        lv2_label = "Latent Variable 2 (not fitted)"
    scatter = axes[1, i].scatter(lv1, lv2, c=Y_train, cmap='bwr', edgecolor='k')
    axes[1, i].set_title(f"PLS-DA Latent Variables: {target}")
    axes[1, i].set_xlabel("Latent Variable 1")
    axes[1, i].set_ylabel(lv2_label)
    plt.colorbar(scatter, ax=axes[1, i])

    # Get PLS regression loadings (rows: selected features, columns: latent variables)
    loadings = pls.x_loadings_

    # Create DataFrame for visualization
    loadings_df = pd.DataFrame(
        loadings,
        index=selected_feature_names,
        columns=[f"LV{k + 1}" for k in range(best_n_components)],
    )

    # Plot feature importance (loadings) for the first latent variable
    loadings_df.iloc[:, 0].sort_values().plot(kind='barh', ax=axes[2, i], color='blue')
    axes[2, i].set_title(f"Feature Importance: {target}")
    axes[2, i].set_xlabel("Loading Value")
    axes[2, i].set_ylabel("Feature")

    # Compute accuracy and classification metrics. Passing labels explicitly
    # keeps the report well-formed even if the test split lacks one class.
    accuracy = accuracy_score(Y_test, Y_pred)
    conf_matrix = confusion_matrix(Y_test, Y_pred, labels=[0, 1])
    class_report = classification_report(Y_test, Y_pred, labels=[0, 1], target_names=class_names)

    print(f"Accuracy for {target}: {accuracy}")
    print(f"Confusion Matrix for {target}:\n{conf_matrix}")
    print(f"Classification Report for {target}:\n{class_report}")

    # Save model parameters. scaler_mean / scaler_scale now describe the
    # training rows only; the last three keys are additions.
    model_params[target] = {
        "pls_loadings": pls.x_loadings_,
        "logreg_coefficients": classifier.coef_,
        "scaler_mean": scaler.mean_,
        "scaler_scale": scaler.scale_,
        "selected_feature_indices": selected_feature_indices,  # Save selected feature indices
        "selected_feature_names": selected_feature_names,     # Save selected feature names
        "logreg_intercept": classifier.intercept_,
        "threshold_high": threshold_high,
        "class_names": class_names,  # position in the list == encoded label
    }

    # Compute ROC Curve & AUC
    Y_prob = classifier.predict_proba(X_test_latent)[:, 1]  # Probabilities for "High" class (label 1)
    fpr, tpr, _ = roc_curve(Y_test, Y_prob)
    roc_auc = auc(fpr, tpr)
    print(roc_auc)

    roc_data = pd.DataFrame({
        'False Positive Rate': fpr,
        'True Positive Rate': tpr})
    print(roc_data)

    # Plot ROC Curve (bottom row)
    ax1 = axes[3, i]
    ax1.plot(fpr, tpr, color='blue', lw=2, label=f'AUC = {roc_auc:.2f}')
    ax1.plot([0, 1], [0, 1], color='gray', linestyle='--')  # Random guess line
    ax1.set_title(f'ROC Curve ({target})')
    ax1.set_xlabel('False Positive Rate')
    ax1.set_ylabel('True Positive Rate')
    ax1.legend(loc="lower right")

# Persist the per-target parameter dictionary as a single pickle file
params_path = "all_trained_model_params.pkl"
with open(params_path, "wb") as params_file:
    pickle.dump(model_params, params_file)

# Tighten the layout, write the figure to disk, then display it
figure_path = "pls_DA_accuracy_loading_feature.png"
plt.tight_layout()
plt.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()
