In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.metrics import roc_curve, auc, precision_recall_curve

def save_combined_roc_efficiency_plot(y_true, y_proba, class_names, save_path):
    """Save a two-panel figure of per-class ROC and precision-recall curves.

    The left panel holds one-vs-rest ROC curves (AUC shown in the legend); the
    right panel plots precision ("efficiency") against recall ("completeness")
    for every class.

    Parameters
    ----------
    y_true : array-like, shape (n_samples,)
        True labels encoded as integers ``0 .. len(class_names) - 1``.
    y_proba : array-like, shape (n_samples, n_classes)
        Predicted score per class; column ``i`` is scored against class ``i``.
    class_names : sequence
        Display name for each class, indexed by the integer label.
    save_path : str or path-like
        Where the figure is written (passed straight to ``Figure.savefig``).

    Raises
    ------
    ValueError
        If ``y_proba`` is not 2-D, has a different number of rows than
        ``y_true``, or has fewer columns than there are classes.
    """
    y_true = np.asarray(y_true)
    y_proba = np.asarray(y_proba)
    n_classes = len(class_names)

    if (
        y_proba.ndim != 2
        or y_proba.shape[0] != y_true.shape[0]
        or y_proba.shape[1] < n_classes
    ):
        raise ValueError(
            f"y_proba has shape {y_proba.shape}; expected "
            f"({y_true.shape[0]}, {n_classes}) to match y_true and class_names"
        )

    # One-vs-rest indicator matrix with exactly one column per class.
    # label_binarize collapses the two-class case to a single column, which
    # made y_true_bin[:, 1] fail; building it by comparison avoids that.
    y_true_bin = (y_true.reshape(-1, 1) == np.arange(n_classes)).astype(int)

    fig, (roc_ax, pr_ax) = plt.subplots(1, 2, figsize=(16, 6))
    try:
        for i in range(n_classes):
            class_truth = y_true_bin[:, i]
            class_score = y_proba[:, i]

            # ROC curve
            fpr, tpr, _ = roc_curve(class_truth, class_score)
            roc_auc = auc(fpr, tpr)
            roc_ax.plot(fpr, tpr, lw=2, label=f"{class_names[i]} (AUC = {roc_auc:.2f})")

            # Completeness vs efficiency curve
            precision_vals, recall_vals, _ = precision_recall_curve(class_truth, class_score)
            pr_ax.plot(recall_vals, precision_vals, lw=2, label=class_names[i])

        # Chance-level diagonal for reference.
        roc_ax.plot([0, 1], [0, 1], "k--", lw=1)
        roc_ax.set_xlabel("False Positive Rate")
        roc_ax.set_ylabel("True Positive Rate (Recall)")
        roc_ax.set_title("ROC Curves by Class")
        roc_ax.legend(loc="lower right")

        pr_ax.set_xlabel("Completeness (Recall)")
        pr_ax.set_ylabel("Efficiency (Precision)")
        pr_ax.set_title("Completeness vs Efficiency by Class")
        pr_ax.legend(loc="lower left")

        fig.tight_layout()
        fig.savefig(save_path)
    finally:
        # Close this specific figure even if plotting/saving failed, so
        # repeated calls in a notebook do not accumulate open figures.
        plt.close(fig)

def run_rf_cv_on_tess_features(
    csv_path,
    label_column='Variable_Type',
    title='',
    save_dir="/home/devika/PhD/S1/Astroinformatics/Project/Results/",
    n_splits=10,
    random_state=42,
):
    """Cross-validate a random forest on a feature table and save ROC/PR plots.

    Reads the CSV, keeps the numeric columns as features, integer-encodes the
    label column, collects out-of-fold class probabilities from a stratified
    K-fold random forest, and writes the combined ROC / efficiency figure to
    ``<save_dir>/<title with spaces replaced by '_'>_ROC_Efficiency.pdf``.

    Parameters
    ----------
    csv_path : str or path-like
        CSV file containing the feature columns and the label column.
    label_column : str
        Name of the column holding the class labels.
    title : str
        Run title; used (with spaces replaced by underscores) as the file
        name prefix of the saved figure.
    save_dir : str or path-like
        Directory the figure is written to; created if missing.
    n_splits : int
        Number of stratified folds.
    random_state : int
        Seed used for both the fold shuffling and the random forest.

    Raises
    ------
    KeyError
        If ``label_column`` is not a column of the CSV.
    """
    df = pd.read_csv(csv_path)
    if label_column not in df.columns:
        raise KeyError(f"Label column {label_column!r} not found in {csv_path!r}")

    # Remove non-numeric, non-label columns BEFORE dropping incomplete rows,
    # so a missing value in an unused text column does not discard a row
    # whose features and label are all present.
    non_numeric_cols = [
        col
        for col in df.select_dtypes(exclude=[np.number]).columns
        if col != label_column
    ]
    df = df.drop(columns=non_numeric_cols).dropna()

    # Features and integer-encoded labels (np.unique returns sorted classes,
    # and the inverse indices are each row's position in that sorted list).
    X = df.drop(columns=[label_column]).to_numpy()
    unique_labels, y_encoded = np.unique(df[label_column].to_numpy(), return_inverse=True)
    class_names = unique_labels.tolist()
    n_classes = len(class_names)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    actual_targets = []
    predicted_probas = []

    for train_ix, test_ix in skf.split(X, y_encoded):
        train_y, test_y = y_encoded[train_ix], y_encoded[test_ix]

        # Fit the scaler on the training fold only to avoid leaking test
        # statistics into the model.
        scaler = StandardScaler()
        train_x = scaler.fit_transform(X[train_ix])
        test_x = scaler.transform(X[test_ix])

        clf = RandomForestClassifier(
            n_estimators=100, random_state=random_state, class_weight='balanced'
        )
        clf.fit(train_x, train_y)

        # predict_proba only has columns for classes seen in this training
        # fold. Scatter them into a full-width array (via clf.classes_, which
        # holds the encoded integer labels) so every fold has n_classes
        # columns in the same order; unseen classes get probability 0.
        probs = np.zeros((len(test_ix), n_classes))
        probs[:, clf.classes_] = clf.predict_proba(test_x)

        predicted_probas.append(probs)
        actual_targets.append(test_y)

    predicted_probas = np.vstack(predicted_probas)
    actual_targets = np.concatenate(actual_targets)

    # Save plot
    os.makedirs(save_dir, exist_ok=True)
    save_path = os.path.join(save_dir, f"{title.replace(' ', '_')}_ROC_Efficiency.pdf")
    save_combined_roc_efficiency_plot(actual_targets, predicted_probas, class_names, save_path)


In [2]:
# Evaluate the random-forest cross-validation once per feature table.
# Each entry pairs a feature CSV with the title passed to the run.
feature_tables = [
    (
        "/home/devika/PhD/S1/Astroinformatics/Project/Results/TESS_features_raw.csv",
        "Raw TESS Features",
    ),
    (
        "/home/devika/PhD/S1/Astroinformatics/Project/Results/TESS_features_median_after_detrended.csv",
        "Median-Detrended TESS Features",
    ),
    (
        "/home/devika/PhD/S1/Astroinformatics/Project/Results/TESS_features_outliercleaned.csv",
        "Outlier-Cleaned TESS Features",
    ),
]

for feature_csv, run_title in feature_tables:
    run_rf_cv_on_tess_features(
        feature_csv,
        label_column="Variable_Type",
        title=run_title,
    )


  axes[0].legend(loc="lower right")
  axes[1].legend(loc="lower left")
