In [1]:

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from pathlib import Path
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from statsmodels.stats.outliers_influence import variance_inflation_factor
pd.set_option('display.max_rows', 25990)
pd.set_option('display.max_columns', 200)

In [3]:
# Load datasets
df  = pd.read_csv("/content/drive/MyDrive/CP_UMBC/Feature_Engneering/Merged/merged_clean_dedup.csv")


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25926 entries, 0 to 25925
Columns: 157 entries, EmployeeID to AP_STEM_ratio
dtypes: float64(74), int64(21), object(62)
memory usage: 31.1+ MB


In [5]:
#  Check if each EmployeeID is unique
unique_ids = df['EmployeeID'].nunique()
total_rows = len(df)
print(f"Unique EmployeeIDs: {unique_ids} / Total rows: {total_rows}")

if unique_ids == total_rows:
    print(" Each EmployeeID is unique — 1 row per student.")
else:
    print(" WARNING: Some EmployeeIDs appear more than once!")

#  Inspect target distribution
print("\nGraduated value counts (including missing):")
print(df['Graduated'].value_counts(dropna=False))

# Convert target to numeric binary (Yes=1, No=0)
df['Graduated'] = df['Graduated'].map({'Yes': 1, 'No': 0})

#  Confirm no NaNs and show class balance
missing_graduated = df['Graduated'].isna().sum()
print(f"\nMissing target values: {missing_graduated}")

grad_rate = df['Graduated'].mean()
print(f" Graduation rate: {grad_rate:.3%}")


Unique EmployeeIDs: 25926 / Total rows: 25926
 Each EmployeeID is unique — 1 row per student.

Graduated value counts (including missing):
Graduated
No     13104
Yes    12822
Name: count, dtype: int64

Missing target values: 0
 Graduation rate: 49.456%


In [6]:
#  Print total number of columns
print("="*70)
print(f" Total Columns in DataFrame: {len(df.columns)}")
print("="*70)

#  Print each column numbered
print("\n Numbered Column View:\n")
for i, col in enumerate(df.columns, start=1):
    print(f"{i:3d}. {col}")



 Total Columns in DataFrame: 157

 Numbered Column View:

  1. EmployeeID
  2. StudentKey
  3. MatricTermdescription
  4. MatricAcademicYear
  5. MatricStatusOfficialDescr
  6. MatricGenderIPEDS
  7. MatricIPEDSEthnicity
  8. Zipcode
  9. MatricResidence
 10. Graduated
 11. Yrs_To_Deg
 12. Sem_To_Deg
 13. Plan1_1
 14. Plan1_2
 15. Plan1_3
 16. Plan1_4
 17. Plan1_5
 18. Plan1_6
 19. Plan1_7
 20. Plan1_8
 21. Plan1_9
 22. Plan1_10
 23. Plan1_11
 24. Plan1_12
 25. Plan1_13
 26. Plan1_14
 27. Plan1_15
 28. Plan1_16
 29. Deg_8
 30. Deg_10
 31. Deg_12
 32. LastSTEMType
 33. LastrptgPlanShortDescr
 34. Sem1_FTPT
 35. Sem2_FTPT
 36. Sem3_FTPT
 37. Sem4_FTPT
 38. Sem5_FTPT
 39. Sem6_FTPT
 40. Sem7_FTPT
 41. Sem8_FTPT
 42. Sem9_FTPT
 43. Sem10_FTPT
 44. Sem11_FTPT
 45. Sem12_FTPT
 46. Sem13_FTPT
 47. Sem14_FTPT
 48. Sem15_FTPT
 49. Sem16_FTPT
 50. Same_STEM_1_2
 51. Same_STEM_1_21
 52. Same_STEM_1_3
 53. Same_STEM_1_4
 54. Same_STEM_1_5
 55. Same_STEM_1_6
 56. Same_STEM_1_7
 57. Same_STEM_1_8
 5

# === Justification for column drops ===

    "Degree progress / future info":
        "Dropped all columns describing years or semesters to degree (e.g., Yrs_To_Deg, Deg_10). "
        "These are post-outcome indicators available only after graduation, which would cause data leakage.",

    "Academic plans (Plan1_x)":
        "Removed secondary academic plans beyond the first. They are often incomplete or declared after first-term enrollment, "
        "and add noise rather than predictive power for early graduation prediction.",

    "Future-term enrollment info":
        "Dropped Sem2+ FT/PT status variables, which reflect later enrollment behavior and thus leak future outcomes. "
        "Kept only Sem1_FTPT to capture initial enrollment intensity.",

    "Financial aid (need, grants, merit, scholar, program)":
        "Removed year-by-year aid details (Y1–Y6) to avoid leakage "
        "We have created the total support ans supported column which tells about finance",

    "Later-declared STEM or plan descriptors":
        "Dropped LastSTEMType and LastrptgPlanShortDescr since they describe post-matriculation major changes. "
        "Including them would leak future academic outcomes.",

    "Post-enrollment similarity flags (Same_STEM_1_x)":
        "Removed derived boolean flags comparing later terms to the first. These are constructed retrospectively and create target leakage.",

    "Redundant indicator flags":
        "Dropped boolean indicators (HasSAT..., HasEngScore, etc.) since actual score columns provide richer numeric information. "
        "Indicators duplicate data already represented numerically.",

    "Redundant SAT metrics":
        "Dropped SAT1600Score because it is a linear sum of SATMathScore + SATReadingWritingScore. "
        "Keeping section-level scores separately preserves interpretability across quantitative and verbal readiness domains "
        "and avoids perfect multicollinearity.",

    "AP redundant  metrics":
        "Removed AP aggregates (num_tests, avg_score, derived STEM ratios). Individual course or credit variables better represent AP strength. "
        "Aggregates were strongly collinear and sparsely populated.",

    "Identifiers text fields":
        "Dropped direct identifiers (EmployeeID, StudentKey) and high-cardinality text features (Zipcode, HighSchoolDescription). "
        "These have no predictive value and could violate privacy or overfit on location-specific patterns."


In [7]:

drop_cols_all = [
    # Degree progress  info
    "Yrs_To_Deg", "Sem_To_Deg",
    "Deg_8", "Deg_10", "Deg_12","MatricAcademicYear",
    "Degr4Sem", "Degr5Sem", "Degr6Sem", "Degr7Sem", "Degr8Sem", "Degr9Sem",
    "Degr10Sem", "Degr11Sem", "Degr12Sem", "Degr13Sem", "Degr14Sem", "Degr15Sem", "Degr16Sem",

    # Academic plans
    "Plan1_1","Plan1_2","Plan1_3","Plan1_4","Plan1_5","Plan1_6","Plan1_7",
    "Plan1_8","Plan1_9","Plan1_10","Plan1_11","Plan1_12","Plan1_13",
    "Plan1_14","Plan1_15","Plan1_16",

    # Future-term enrollment info
    "Sem2_FTPT","Sem3_FTPT","Sem4_FTPT","Sem5_FTPT","Sem6_FTPT","Sem7_FTPT",
    "Sem8_FTPT","Sem9_FTPT","Sem10_FTPT","Sem11_FTPT","Sem12_FTPT",
    "Sem13_FTPT","Sem14_FTPT","Sem15_FTPT","Sem16_FTPT",

    #  aid info (need, grants, merit, scholar, program)
    "Y1Need","Y2Need","Y3Need","Y4Need","Y5Need","Y6Need",
    "Y1GrantAmount","Y2GrantAmount","Y3GrantAmount","Y4GrantAmount","Y5GrantAmount","Y6GrantAmount",
    "Y1MeritAmount","Y2MeritAmount","Y3MeritAmount","Y4MeritAmount","Y5MeritAmount","Y6MeritAmount",
    "Y1ScholarProgram","Y1ProgramAmount","Y2ScholarProgram","Y2ProgramAmount",
    "Y3ScholarProgram","Y3ProgramAmount","Y4ScholarProgram","Y4ProgramAmount",
    "Y5ScholarProgram","Y5ProgramAmount","Y6ScholarProgram","Y6ProgramAmount",

    # Later-declared STEM type or plan
    "LastSTEMType", "LastrptgPlanShortDescr",
    # term discription
    "MatricTermdescription",
    # Post-enrollment similarity flags
    *[f"Same_STEM_1_{i}" for i in [2,21,3,4,5,6,7,8,9,10,11,12,13,14,15,16]],

    # Redundant indicator flags (we keep numeric scores instead)
    "HasSATMathScore","HasSATReadingWritingScore","HasHighSchoolRankPercentile",
    "HasAlgScore","HasCalScore","HasALEKSscore","HasEngScore",

    # Redundant SAT metrics
    "SAT1600Score",

    # AP redundant or derived
    "AP_num_tests","AP_unique_codes","AP_any_credit","AP_avg_score",
    *[f"AP_credit_ct_{x}" for x in ["art","computer","english","language","math","social","stem"]],
    "AP_ct_STEM_like","AP_STEM_ratio",

    # Non-predictive identifiers / sparse text
    "EmployeeID","StudentKey","Zipcode","HighSchoolDescription","HighSchoolState","HighSchoolCounty",
]

# Intersect with actual columns
existing = set(df.columns)
to_drop = [c for c in drop_cols_all if c in existing]

print(f"Requested to drop: {len(drop_cols_all)} columns")
print(f"Actually present & dropped: {len(to_drop)} columns")

before_shape = df.shape
df_clean = df.drop(columns=to_drop, errors="ignore")
after_shape = df_clean.shape

print(f"\nShape before: {before_shape}")
print(f"Shape after : {after_shape}")

# Show what remains (and split by dtype)
num_cols = df_clean.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = df_clean.select_dtypes(exclude=[np.number]).columns.tolist()

print(f"\nRemaining columns: {len(df_clean.columns)}")
print(f"  • Numeric    : {len(num_cols)}")
print(f"  • Categorical: {len(cat_cols)}")



Requested to drop: 126 columns
Actually present & dropped: 126 columns

Shape before: (25926, 157)
Shape after : (25926, 31)

Remaining columns: 31
  • Numeric    : 20
  • Categorical: 11


In [8]:
from datetime import datetime

# Define your output folder in Google Drive
out_dir = Path("/content/drive/MyDrive/CP_UMBC/ Feature Engineering/Merged")

# Make sure folder exists
out_dir.mkdir(parents=True, exist_ok=True)

# Create a versioned filename (with date & time stamp)
version_tag = "vA1"
timestamp = datetime.now().strftime("%Y%m%d_%H%M")
file_name = f"merged_clean_{version_tag}_{timestamp}.csv"
out_path = out_dir / file_name

# Save the frozen dataset
df_clean.to_csv(out_path, index=False)

# Also, save just the column schema
schema_path = out_dir / f"merged_clean_{version_tag}_schema_{timestamp}.txt"
with open(schema_path, "w") as f:
    f.write(f"Dataset shape: {df_clean.shape}\n\n")
    f.write("Column list:\n")
    for i, col in enumerate(df_clean.columns, start=1):
        f.write(f"{i:3d}. {col}\n")

print(" Dataset version frozen successfully!")
print(f" Saved data:   {out_path}")
print(f" Saved schema: {schema_path}")
print(f" Shape: {df_clean.shape}")

 Dataset version frozen successfully!
 Saved data:   /content/drive/MyDrive/CP_UMBC/ Feature Engineering/Merged/merged_clean_vA1_20251202_1602.csv
 Saved schema: /content/drive/MyDrive/CP_UMBC/ Feature Engineering/Merged/merged_clean_vA1_schema_20251202_1602.txt
 Shape: (25926, 31)


In [9]:
#  Identify column types
num_cols = df_clean.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = df_clean.select_dtypes(exclude=[np.number]).columns.tolist()

print("="*70)
print(f" Dataset shape: {df_clean.shape}")
print(f" Numeric columns: {len(num_cols)}")
print(f" Categorical columns: {len(cat_cols)}")
print("="*70)

#  Display column lists
print("\n Numeric Columns:")
for i, c in enumerate(num_cols, 1):
    print(f"{i:2d}. {c}")

print("\ Categorical Columns:")
for i, c in enumerate(cat_cols, 1):
    print(f"{i:2d}. {c}")


 Dataset shape: (25926, 31)
 Numeric columns: 20
 Categorical columns: 11

 Numeric Columns:
 1. Graduated
 2. HighSchoolGpa
 3. SATMathScore
 4. SATReadingWritingScore
 5. HighSchoolWeightedRankPercentile
 6. AlgSCORE
 7. CalScore
 8. ALEKSScore
 9. EngSCORE
10. AP_CRDS
11. TotalSupport
12. AP_total_transfer_credits
13. AP_max_score
14. AP_ct_art
15. AP_ct_computer
16. AP_ct_english
17. AP_ct_language
18. AP_ct_math
19. AP_ct_social
20. AP_ct_stem
\ Categorical Columns:
 1. MatricStatusOfficialDescr
 2. MatricGenderIPEDS
 3. MatricIPEDSEthnicity
 4. MatricResidence
 5. Sem1_FTPT
 6. MatricResidencyTuitionDescript
 7. HighSchoolGPABandDescription
 8. HS_PecentileDesc
 9. Supported
10. NeedStatus
11. SupportBin


  print("\ Categorical Columns:")


In [10]:
print("="*80)
print(f" Dataset shape: {df_clean.shape}")
print("="*80)


missing = (
    df_clean.isna().sum()
    .reset_index()
    .rename(columns={"index": "Feature", 0: "MissingCount"})
)
missing["MissingPct"] = (missing["MissingCount"] / len(df_clean) * 100).round(2)

print("\n Missingness Summary (Top 20 features by % missing):")
print(missing.sort_values("MissingPct", ascending=False).head(20).to_string(index=False))



num_cols = df_clean.select_dtypes(include=[np.number]).columns
num_summary = df_clean[num_cols].describe().T
num_summary["missing_%"] = df_clean[num_cols].isna().mean().round(3) * 100

print("\n Numeric Feature Summary:")
print(num_summary[["count", "mean", "std", "min", "25%", "50%", "75%", "max", "missing_%"]])



cat_cols = df_clean.select_dtypes(exclude=[np.number]).columns
print("\n Categorical Feature Frequency Overview:")
for c in cat_cols:
    vc = df_clean[c].value_counts(dropna=False)
    top5 = vc.head(5)
    print(f"\n-- {c} ({df_clean[c].isna().mean()*100:.2f}% missing) --")
    print(top5.to_string())


 Dataset shape: (25926, 31)

 Missingness Summary (Top 20 features by % missing):
                         Feature  MissingCount  MissingPct
       MatricStatusOfficialDescr             0         0.0
               MatricGenderIPEDS             0         0.0
            MatricIPEDSEthnicity             0         0.0
                 MatricResidence             0         0.0
                       Graduated             0         0.0
                       Sem1_FTPT             0         0.0
  MatricResidencyTuitionDescript             0         0.0
                   HighSchoolGpa             0         0.0
    HighSchoolGPABandDescription             0         0.0
                    SATMathScore             0         0.0
          SATReadingWritingScore             0         0.0
HighSchoolWeightedRankPercentile             0         0.0
                HS_PecentileDesc             0         0.0
                        AlgSCORE             0         0.0
                        CalScore 

In [11]:

#  Separate target + features

X = df_clean.drop(columns=["Graduated"])
y = df_clean["Graduated"].astype(int)   # ensure numeric target


#  Train/Validation/Test Split

# 70/15/15 split with stratified sampling to preserve class ratio
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42
)

print(" Dataset Split Summary:")
print(f"Train: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}")
print(f"Graduation ratio - Train: {y_train.mean():.3f}, Val: {y_val.mean():.3f}, Test: {y_test.mean():.3f}")


#  Identify Column Types

num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

print("\nFeature Type Breakdown:")
print(f"Numeric: {len(num_cols)} | Categorical: {len(cat_cols)}")


# Define Preprocessing Pipelines

numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value=0)),
    ("scaler", StandardScaler())
])

categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
    ("encoder", OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, num_cols),
        ("cat", categorical_pipeline, cat_cols)
    ],
    remainder="drop",
    verbose_feature_names_out=False
)


#  Fit on Train & Verify Output

X_train_prep = preprocessor.fit_transform(X_train)
feature_names = preprocessor.get_feature_names_out()

print("\n Preprocessing Summary:")
print(f"Transformed TRAIN shape: {X_train_prep.shape}")
print(f"Total features after OHE: {len(feature_names)}")
print(f"Any missing after transform? {np.isnan(X_train_prep).sum()}")




 Dataset Split Summary:
Train: (18148, 30), Validation: (3889, 30), Test: (3889, 30)
Graduation ratio - Train: 0.495, Val: 0.494, Test: 0.495

Feature Type Breakdown:
Numeric: 19 | Categorical: 11

 Preprocessing Summary:
Transformed TRAIN shape: (18148, 57)
Total features after OHE: 57
Any missing after transform? 0


In [12]:
# Select numeric block (exclude target)
num_cols = [c for c in df_clean.select_dtypes(include=[np.number]).columns if c != "Graduated"]
X_num = df_clean[num_cols].copy()

print(f"Numeric features considered: {len(num_cols)}\n")


# Pairwise correlation

CORR_THRESH = 0.95
corr = X_num.corr().abs()

# Upper triangle only
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
high_pairs = (
    upper.stack()
         .reset_index()
         .rename(columns={"level_0":"Feature1", "level_1":"Feature2", 0:"Correlation"})
         .query("Correlation > @CORR_THRESH")
         .sort_values("Correlation", ascending=False)
         .reset_index(drop=True)
)

print(f"[Correlation] Pairs with |r| > {CORR_THRESH}: {len(high_pairs)}")
if not high_pairs.empty:
    print(high_pairs.head(20).to_string(index=False))
else:
    print(" No strong pairwise correlations found above threshold.")


#  VIF on numeric block

def compute_vif(df):
    vif_vals = []
    for i in range(df.shape[1]):
        vif_vals.append(variance_inflation_factor(df.values, i))
    return pd.DataFrame({"Feature": df.columns, "VIF": vif_vals}).sort_values("VIF", ascending=False)

vif_table = compute_vif(X_num)
print("\n[VIF] Results (sorted descending):")
print(vif_table.head(25).to_string(index=False))

# Identify potential concern features
high_vif = vif_table[vif_table["VIF"] > 10]
if not high_vif.empty:
    print("\n High multicollinearity detected:")
    print(high_vif.to_string(index=False))
else:
    print("\n All VIF values ≤ 10 — numeric features are stable for linear models.")


Numeric features considered: 19

[Correlation] Pairs with |r| > 0.95: 2
    Feature1                  Feature2  Correlation
SATMathScore    SATReadingWritingScore     0.992447
     AP_CRDS AP_total_transfer_credits     0.980542

[VIF] Results (sorted descending):
                         Feature       VIF
                    SATMathScore 98.313864
          SATReadingWritingScore 97.754986
       AP_total_transfer_credits 47.799766
                         AP_CRDS 42.590131
                        AlgSCORE  6.342961
                    AP_max_score  5.912487
                        CalScore  5.223977
                   HighSchoolGpa  5.191062
                    AP_ct_social  4.450252
                      AP_ct_math  2.911515
                      ALEKSScore  2.720970
                      AP_ct_stem  2.207276
                        EngSCORE  2.048714
                   AP_ct_english  1.990519
HighSchoolWeightedRankPercentile  1.642389
                    TotalSupport  1.610497
     

In [13]:
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support, roc_auc_score,
    confusion_matrix, classification_report
)
from sklearn.inspection import permutation_importance

# Models
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier



In [14]:
class Scorer:
    def __init__(self, positive_label=1):
        self.positive_label = positive_label

    def score(self, y_true, y_pred, y_score=None):
        acc = accuracy_score(y_true, y_pred)
        prec, rec, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average="binary", zero_division=0
        )

        auc = np.nan
        if y_score is not None:
            try:
                auc = roc_auc_score(y_true, y_score)
            except Exception:
                pass

        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
        cm_df = pd.DataFrame(
            cm, index=["True 0 (No)", "True 1 (Yes)"], columns=["Pred 0 (No)", "Pred 1 (Yes)"]
        )

        return {
            "accuracy": acc, "precision": prec, "recall": rec, "f1": f1, "auc": auc,
            "confusion_matrix": cm_df
        }


In [15]:
def safe_score_vector(model, X):
    """Return scores for AUC. Prefer predict_proba -> decision_function -> hard preds."""
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X)[:, 1]
    if hasattr(model, "decision_function"):
        return model.decision_function(X)
    return model.predict(X)

def show_logreg_coeffs(pipeline, top_k=15):
    try:
        pre = pipeline.named_steps["pre"]
        clf = pipeline.named_steps["clf"]
        names = pre.get_feature_names_out()
        coefs = clf.coef_.ravel()
        df = pd.DataFrame({"feature": names, "coef": coefs})
        print("\n[LogReg] Top positive:")
        print(df.sort_values("coef", ascending=False).head(top_k).to_string(index=False))
        print("\n[LogReg] Top negative:")
        print(df.sort_values("coef").head(top_k).to_string(index=False))
    except Exception as e:
        print(f"[LogReg] Coeff inspection skipped: {e}")

def show_tree_importances(pipeline, top_k=15):
    try:
        pre = pipeline.named_steps["pre"]
        clf = pipeline.named_steps["clf"]
        if getattr(clf, "feature_importances_", None) is None:
            print("[Tree] No native feature_importances_.")
            return
        names = pre.get_feature_names_out()
        imp = pd.DataFrame({"feature": names, "importance": clf.feature_importances_})
        print("\n[Tree] Top features:")
        print(imp.sort_values("importance", ascending=False).head(top_k).to_string(index=False))
    except Exception as e:
        print(f"[Tree] Importance inspection skipped: {e}")

def show_permutation_importance(pipeline, X_val, y_val, n_repeats=5, top_k=15):
    """Compute and print only top feature names )."""
    try:
        pre = pipeline.named_steps["pre"]
        clf = pipeline.named_steps["clf"]
        Xv = pre.transform(X_val)
        r = permutation_importance(
            clf, Xv, y_val,
            n_repeats=n_repeats,
            random_state=42,
            scoring="f1"
        )
        names = pre.get_feature_names_out()
        pim = pd.DataFrame({
            "feature": names,
            "imp_mean": r.importances_mean
        }).sort_values("imp_mean", ascending=False)

        print("\nTop features:")
        for i, feat in enumerate(pim.head(top_k)["feature"], 1):
            print(f"{i:2d}. {feat}")

    except Exception as e:
        print(f"[Permutation Importance] Skipped: {e}")


In [16]:
class Evaluator:
    def __init__(self, preprocessor, scorer=None):
        self.pre = preprocessor
        self.scorer = scorer or Scorer()

    def run(self, name, clf, X_train, y_train, X_val, y_val,
            print_importances=True, perm_importance_for=("SVM_RBF", "GaussianNB"),
            n_perm_repeats=5):
        pipe = Pipeline([("pre", self.pre), ("clf", clf)])
        pipe.fit(X_train, y_train)

        # Predictions and scores
        y_pred = pipe.predict(X_val)
        y_score = safe_score_vector(pipe, X_val)

        # Core metrics
        m = self.scorer.score(y_val, y_pred, y_score)
        print(f"\n=== {name} — Validation Metrics ===")
        print(f"Accuracy : {m['accuracy']:.4f}")
        print(f"Precision: {m['precision']:.4f}")
        print(f"Recall   : {m['recall']:.4f}")
        print(f"F1-score : {m['f1']:.4f}")
        print(f"ROC-AUC  : {m['auc']:.4f}")

        # Confusion matrix
        print("\nConfusion Matrix (rows=true, cols=pred):")
        print(m["confusion_matrix"])

        # Classification report
        print("\nClassification Report:")
        print(classification_report(y_val, y_pred, target_names=["No Graduate", "Graduate"]))

        # Feature importance section
        if print_importances:
            if name in ("RandomForest", "GradientBoosting", "XGBoost"):
                show_tree_importances(pipe, top_k=15)
            elif name in perm_importance_for or name in ("LogisticRegression", "LinearSVM"):
                show_permutation_importance(pipe, X_val, y_val, n_repeats=n_perm_repeats, top_k=15)

        return {"name": name, "pipeline": pipe, **m}


In [None]:
def build_logreg():
    return LogisticRegression(max_iter=1000, class_weight="balanced", solver="lbfgs")

def build_linear_svm():
    return SGDClassifier(loss="hinge", class_weight="balanced", max_iter=2000, random_state=42)

def build_gaussian_nb():
    return GaussianNB()

def build_random_forest():
    return RandomForestClassifier(
        n_estimators=400, max_depth=None, min_samples_split=5,
        class_weight="balanced_subsample", n_jobs=-1, random_state=42
    )

def build_gradient_boosting():
    return GradientBoostingClassifier(random_state=42)


In [None]:
evaluator = Evaluator(preprocessor=preprocessor)


Accuracy provides an overall sense of correctness but can be misleading if one class (e.g., “Graduated”) dominates. It does not show which students are being misclassified.

Precision focuses on correctness of positive predictions in this case, how many students predicted to graduate actually do. High precision avoids wasting interventions on students who are already on track.

Recall measures coverage how many true graduates or non-graduates the model successfully identifies. High recall for “No Graduate” is critical for early warning and retention strategies.

F1-Score (Macro Average) balances precision and recall equally for both classes.

It is preferred because it treats “Graduate” and “No Graduate” as equally important outcomes.

Unlike micro averaging, it prevents overemphasizing the majority class.

Unlike weighted averaging, it does not let class proportions dominate the metric.

ROC-AUC is used to evaluate how well the model ranks students by graduation likelihood.

It’s threshold-independent, meaning it measures ranking quality rather than a single cutoff.

A high AUC means the model can effectively distinguish between likely graduates and non-graduates.

Together, Macro F1 and AUC provide a fair, interpretable, and policy-relevant view of model performance  ensuring decisions benefit both successful and at-risk students.

In [None]:
res_logreg = evaluator.run("LogisticRegression", build_logreg(), X_train, y_train, X_val, y_val)



=== LogisticRegression — Validation Metrics ===
Accuracy : 0.7687
Precision: 0.7645
Recall   : 0.7663
F1-score : 0.7654
ROC-AUC  : 0.8514

Confusion Matrix (rows=true, cols=pred):
              Pred 0 (No)  Pred 1 (Yes)
True 0 (No)          1024           304
True 1 (Yes)          301           987

Classification Report:
              precision    recall  f1-score   support

 No Graduate       0.77      0.77      0.77      1328
    Graduate       0.76      0.77      0.77      1288

    accuracy                           0.77      2616
   macro avg       0.77      0.77      0.77      2616
weighted avg       0.77      0.77      0.77      2616


Top features:
 1. MatricStatusOfficialDescr_New Transfer
 2. AP_total_transfer_credits
 3. EngSCORE
 4. ALEKSScore
 5. SATMathScore
 6. AP_CRDS
 7. AP_max_score
 8. HighSchoolWeightedRankPercentile
 9. SupportBin_>20K
10. AP_ct_computer
11. TotalSupport
12. SupportBin_<5K
13. AP_ct_math
14. HS_PecentileDesc_Unknown
15. MatricIPEDSEthnicity_Black

In [None]:
res_rf = evaluator.run("RandomForest", build_random_forest(), X_train, y_train, X_val, y_val)



=== RandomForest — Validation Metrics ===
Accuracy : 0.7787
Precision: 0.7667
Recall   : 0.7911
F1-score : 0.7788
ROC-AUC  : 0.8642

Confusion Matrix (rows=true, cols=pred):
              Pred 0 (No)  Pred 1 (Yes)
True 0 (No)          1018           310
True 1 (Yes)          269          1019

Classification Report:
              precision    recall  f1-score   support

 No Graduate       0.79      0.77      0.78      1328
    Graduate       0.77      0.79      0.78      1288

    accuracy                           0.78      2616
   macro avg       0.78      0.78      0.78      2616
weighted avg       0.78      0.78      0.78      2616


[Tree] Top features:
                               feature  importance
                          TotalSupport    0.194402
                            ALEKSScore    0.106995
                              EngSCORE    0.092053
                         HighSchoolGpa    0.054209
                          SATMathScore    0.040610
                SATReading

In [None]:
res_rf = evaluator.run("RandomForest", build_random_forest(), X_train, y_train, X_val, y_val)



=== RandomForest — Validation Metrics ===
Accuracy : 0.7787
Precision: 0.7667
Recall   : 0.7911
F1-score : 0.7788
ROC-AUC  : 0.8642

Confusion Matrix (rows=true, cols=pred):
              Pred 0 (No)  Pred 1 (Yes)
True 0 (No)          1018           310
True 1 (Yes)          269          1019

Classification Report:
              precision    recall  f1-score   support

 No Graduate       0.79      0.77      0.78      1328
    Graduate       0.77      0.79      0.78      1288

    accuracy                           0.78      2616
   macro avg       0.78      0.78      0.78      2616
weighted avg       0.78      0.78      0.78      2616


[Tree] Top features:
                               feature  importance
                          TotalSupport    0.194402
                            ALEKSScore    0.106995
                              EngSCORE    0.092053
                         HighSchoolGpa    0.054209
                          SATMathScore    0.040610
                SATReading

In [None]:
res_gb = evaluator.run("GradientBoosting", build_gradient_boosting(), X_train, y_train, X_val, y_val)



=== GradientBoosting — Validation Metrics ===
Accuracy : 0.7813
Precision: 0.7767
Recall   : 0.7803
F1-score : 0.7785
ROC-AUC  : 0.8726

Confusion Matrix (rows=true, cols=pred):
              Pred 0 (No)  Pred 1 (Yes)
True 0 (No)          1039           289
True 1 (Yes)          283          1005

Classification Report:
              precision    recall  f1-score   support

 No Graduate       0.79      0.78      0.78      1328
    Graduate       0.78      0.78      0.78      1288

    accuracy                           0.78      2616
   macro avg       0.78      0.78      0.78      2616
weighted avg       0.78      0.78      0.78      2616


[Tree] Top features:
                                                      feature  importance
                                                 TotalSupport    0.282946
                                                   ALEKSScore    0.245625
                                                     EngSCORE    0.221206
                       MatricSta

In [None]:
res_gnb = evaluator.run("GaussianNB", build_gaussian_nb(), X_train, y_train, X_val, y_val)



=== GaussianNB — Validation Metrics ===
Accuracy : 0.6510
Precision: 0.5969
Recall   : 0.8967
F1-score : 0.7167
ROC-AUC  : 0.7652

Confusion Matrix (rows=true, cols=pred):
              Pred 0 (No)  Pred 1 (Yes)
True 0 (No)           548           780
True 1 (Yes)          133          1155

Classification Report:
              precision    recall  f1-score   support

 No Graduate       0.80      0.41      0.55      1328
    Graduate       0.60      0.90      0.72      1288

    accuracy                           0.65      2616
   macro avg       0.70      0.65      0.63      2616
weighted avg       0.70      0.65      0.63      2616


Top features:
 1. EngSCORE
 2. SupportBin_<5K
 3. AlgSCORE
 4. CalScore
 5. ALEKSScore
 6. AP_ct_computer
 7. SupportBin_5K-10K
 8. TotalSupport
 9. MatricIPEDSEthnicity_Hispanic/Latino
10. AP_ct_language
11. SupportBin_15K-20K
12. Sem1_FTPT_PT
13. AP_ct_art
14. SATMathScore
15. HighSchoolGPABandDescription_4.0 - 4.49


In [None]:
def build_xgboost():
    from xgboost import XGBClassifier
    return XGBClassifier(
        n_estimators=400,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        objective="binary:logistic",
        eval_metric="logloss",
        tree_method="hist",       # fast and scalable
        random_state=42,
        n_jobs=-1
    )


In [None]:
res_xgb = evaluator.run("XGBoost", build_xgboost(), X_train, y_train, X_val, y_val)



=== XGBoost — Validation Metrics ===
Accuracy : 0.7997
Precision: 0.7821
Recall   : 0.8222
F1-score : 0.8017
ROC-AUC  : 0.8816

Confusion Matrix (rows=true, cols=pred):
              Pred 0 (No)  Pred 1 (Yes)
True 0 (No)          1033           295
True 1 (Yes)          229          1059

Classification Report:
              precision    recall  f1-score   support

 No Graduate       0.82      0.78      0.80      1328
    Graduate       0.78      0.82      0.80      1288

    accuracy                           0.80      2616
   macro avg       0.80      0.80      0.80      2616
weighted avg       0.80      0.80      0.80      2616


[Tree] Top features:
                                                      feature  importance
                                                     EngSCORE    0.146715
                       MatricStatusOfficialDescr_New Transfer    0.063279
                                                   ALEKSScore    0.060936
                                         

In [None]:
pipe = Pipeline([("pre", preprocessor), ("clf", res_xgb)])


In [None]:
import joblib


In [None]:
res_xgb = evaluator.run("XGBoost", build_xgboost(), X_train, y_train, X_val, y_val)



=== XGBoost — Validation Metrics ===
Accuracy : 0.7997
Precision: 0.7821
Recall   : 0.8222
F1-score : 0.8017
ROC-AUC  : 0.8816

Confusion Matrix (rows=true, cols=pred):
              Pred 0 (No)  Pred 1 (Yes)
True 0 (No)          1033           295
True 1 (Yes)          229          1059

Classification Report:
              precision    recall  f1-score   support

 No Graduate       0.82      0.78      0.80      1328
    Graduate       0.78      0.82      0.80      1288

    accuracy                           0.80      2616
   macro avg       0.80      0.80      0.80      2616
weighted avg       0.80      0.80      0.80      2616


[Tree] Top features:
                                                      feature  importance
                                                     EngSCORE    0.146715
                       MatricStatusOfficialDescr_New Transfer    0.063279
                                                   ALEKSScore    0.060936
                                         

In [None]:
import joblib
from datetime import datetime
from pathlib import Path

# Define save directory
model_dir = Path("/content/drive/MyDrive/CP_UMBC/Models/Base_Models")
model_dir.mkdir(parents=True, exist_ok=True)

timestamp = datetime.now().strftime("%Y%m%d_%H%M")

#  Dictionary of models to save
models_to_save = {
    "LogisticRegression": res_logreg["pipeline"],
    "RandomForest": res_rf["pipeline"],
    "GradientBoosting": res_gb["pipeline"],
    "GaussianNB": res_gnb["pipeline"],
    "XGBoost": res_xgb["pipeline"]
}

#  Save each model as .joblib
for name, pipe in models_to_save.items():
    filename = model_dir / f"{name}_base_{timestamp}.joblib"
    joblib.dump(pipe, filename)
    print(f" Saved: {filename}")

print("\nAll base models saved successfully.")


 Saved: /content/drive/MyDrive/CP_UMBC/Models/Base_Models/LogisticRegression_base_20251201_0522.joblib
 Saved: /content/drive/MyDrive/CP_UMBC/Models/Base_Models/RandomForest_base_20251201_0522.joblib
 Saved: /content/drive/MyDrive/CP_UMBC/Models/Base_Models/GradientBoosting_base_20251201_0522.joblib
 Saved: /content/drive/MyDrive/CP_UMBC/Models/Base_Models/GaussianNB_base_20251201_0522.joblib
 Saved: /content/drive/MyDrive/CP_UMBC/Models/Base_Models/XGBoost_base_20251201_0522.joblib

All base models saved successfully.


In [None]:

results_summary = pd.DataFrame([
    {"Model": "Logistic Regression", **{k: res_logreg[k] for k in ["accuracy","precision","recall","f1","auc"]}},
    {"Model": "Random Forest", **{k: res_rf[k] for k in ["accuracy","precision","recall","f1","auc"]}},
    {"Model": "Gradient Boosting", **{k: res_gb[k] for k in ["accuracy","precision","recall","f1","auc"]}},
    {"Model": "GaussianNB", **{k: res_gnb[k] for k in ["accuracy","precision","recall","f1","auc"]}},
    {"Model": "XGBoost", **{k: res_xgb[k] for k in ["accuracy","precision","recall","f1","auc"]}},
])

# Round for clean display
results_summary = results_summary.round(4)

# Sort primarily by AUC (then F1, then Accuracy)
results_summary = results_summary.sort_values(["auc","f1","accuracy"], ascending=False)

print("\n===  Base Model Comparison  ===")
print(results_summary.to_string(index=False))



===  Base Model Comparison  ===
              Model  accuracy  precision  recall     f1    auc
            XGBoost    0.7997     0.7821  0.8222 0.8017 0.8816
  Gradient Boosting    0.7813     0.7767  0.7803 0.7785 0.8726
      Random Forest    0.7787     0.7667  0.7911 0.7788 0.8642
Logistic Regression    0.7687     0.7645  0.7663 0.7654 0.8514
         GaussianNB    0.6510     0.5969  0.8967 0.7167 0.7652


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint



def tune_model(name, base_estimator, param_distributions, n_iter=20, cv=3, random_state=42, n_jobs=-1):
    """Wraps base estimator in pipeline(preprocessor -> clf), runs RandomizedSearchCV on TRAIN only.
       Returns fitted randomized search object."""
    pipe = Pipeline([("pre", preprocessor), ("clf", base_estimator)])
    search = RandomizedSearchCV(
        estimator=pipe,
        param_distributions=param_distributions,
        n_iter=n_iter,
        scoring="roc_auc",
        cv=cv,
        verbose=1,
        random_state=random_state,
        n_jobs=n_jobs,
        refit=True
    )
    search.fit(X_train, y_train)
    print(f"\n>>> {name} best AUC (CV): {search.best_score_:.4f}")
    print(f">>> {name} best params: {search.best_params_}")
    return search

def evaluate_best(name, search):
    """Evaluate best_estimator_ from a fitted RandomizedSearchCV on the validation set."""
    best_pipe = search.best_estimator_
    res = evaluator.run(name + " (tuned)", best_pipe.named_steps["clf"], X_train, y_train, X_val, y_val)
    return res, best_pipe


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

gb_base = GradientBoostingClassifier(random_state=42)

gb_space = {
    "clf__n_estimators": randint(150, 500),
    "clf__learning_rate": uniform(0.01, 0.19),
    "clf__max_depth": randint(2, 5),
    "clf__min_samples_leaf": randint(1, 20),
    "clf__subsample": uniform(0.6, 0.4)
}

gb_search = tune_model("GradientBoosting", gb_base, gb_space, n_iter=25)
gb_res, gb_best_pipe = evaluate_best("GradientBoosting", gb_search)


Fitting 3 folds for each of 25 candidates, totalling 75 fits

>>> GradientBoosting best AUC (CV): 0.8779
>>> GradientBoosting best params: {'clf__learning_rate': np.float64(0.058652981324651556), 'clf__max_depth': 4, 'clf__min_samples_leaf': 9, 'clf__n_estimators': 356, 'clf__subsample': np.float64(0.7710164073434198)}

=== GradientBoosting (tuned) — Validation Metrics ===
Accuracy : 0.7878
Precision: 0.7762
Recall   : 0.7997
F1-score : 0.7878
ROC-AUC  : 0.8799

Confusion Matrix (rows=true, cols=pred):
              Pred 0 (No)  Pred 1 (Yes)
True 0 (No)          1031           297
True 1 (Yes)          258          1030

Classification Report:
              precision    recall  f1-score   support

 No Graduate       0.80      0.78      0.79      1328
    Graduate       0.78      0.80      0.79      1288

    accuracy                           0.79      2616
   macro avg       0.79      0.79      0.79      2616
weighted avg       0.79      0.79      0.79      2616



In [None]:
from xgboost import XGBClassifier

xgb_base = XGBClassifier(
    objective="binary:logistic", eval_metric="logloss",
    tree_method="hist", random_state=42, n_jobs=-1
)

xgb_space = {
    "clf__n_estimators": randint(200, 800),
    "clf__learning_rate": uniform(0.01, 0.19),

    "clf__max_depth": randint(3, 8),
    "clf__subsample": uniform(0.6, 0.4),

    "clf__colsample_bytree": uniform(0.6, 0.4),

    "clf__reg_lambda": uniform(0.0, 2.0)

}

xgb_search = tune_model("XGBoost", xgb_base, xgb_space, n_iter=30)
xgb_res, xgb_best_pipe = evaluate_best("XGBoost", xgb_search)


Fitting 3 folds for each of 30 candidates, totalling 90 fits

>>> XGBoost best AUC (CV): 0.8783
>>> XGBoost best params: {'clf__colsample_bytree': np.float64(0.662397808134481), 'clf__learning_rate': np.float64(0.021035886311957897), 'clf__max_depth': 7, 'clf__n_estimators': 299, 'clf__reg_lambda': np.float64(0.28573363584388156), 'clf__subsample': np.float64(0.8603553891795411)}

=== XGBoost (tuned) — Validation Metrics ===
Accuracy : 0.7963
Precision: 0.7832
Recall   : 0.8106
F1-score : 0.7966
ROC-AUC  : 0.8819

Confusion Matrix (rows=true, cols=pred):
              Pred 0 (No)  Pred 1 (Yes)
True 0 (No)          1039           289
True 1 (Yes)          244          1044

Classification Report:
              precision    recall  f1-score   support

 No Graduate       0.81      0.78      0.80      1328
    Graduate       0.78      0.81      0.80      1288

    accuracy                           0.80      2616
   macro avg       0.80      0.80      0.80      2616
weighted avg       0.80

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf_base = RandomForestClassifier(
    class_weight="balanced_subsample", n_jobs=-1, random_state=42
)

rf_space = {
    "clf__n_estimators": randint(300, 800),
    "clf__max_depth": randint(4, 20),
    "clf__min_samples_split": randint(2, 20),
    "clf__min_samples_leaf": randint(1, 10),
    "clf__max_features": ["sqrt", "log2", 0.5, 0.7, None]
}

rf_search = tune_model("RandomForest", rf_base, rf_space, n_iter=25)
rf_res, rf_best_pipe = evaluate_best("RandomForest", rf_search)


Fitting 3 folds for each of 25 candidates, totalling 75 fits

>>> RandomForest best AUC (CV): 0.8746
>>> RandomForest best params: {'clf__max_depth': 11, 'clf__max_features': 0.5, 'clf__min_samples_leaf': 6, 'clf__min_samples_split': 3, 'clf__n_estimators': 643}

=== RandomForest (tuned) — Validation Metrics ===
Accuracy : 0.7859
Precision: 0.7716
Recall   : 0.8028
F1-score : 0.7869
ROC-AUC  : 0.8755

Confusion Matrix (rows=true, cols=pred):
              Pred 0 (No)  Pred 1 (Yes)
True 0 (No)          1022           306
True 1 (Yes)          254          1034

Classification Report:
              precision    recall  f1-score   support

 No Graduate       0.80      0.77      0.78      1328
    Graduate       0.77      0.80      0.79      1288

    accuracy                           0.79      2616
   macro avg       0.79      0.79      0.79      2616
weighted avg       0.79      0.79      0.79      2616



In [None]:
from sklearn.linear_model import LogisticRegression

log_base = LogisticRegression(max_iter=2000, solver="lbfgs")

log_space = {
    "clf__C": uniform(0.01, 9.99),  # 0.01–10
    "clf__class_weight": [None, "balanced"]
}

log_search = tune_model("LogisticRegression", log_base, log_space, n_iter=20)
log_res, log_best_pipe = evaluate_best("LogisticRegression", log_search)


Fitting 3 folds for each of 20 candidates, totalling 60 fits

>>> LogisticRegression best AUC (CV): 0.8452
>>> LogisticRegression best params: {'clf__C': np.float64(0.5735516744807316), 'clf__class_weight': 'balanced'}

=== LogisticRegression (tuned) — Validation Metrics ===
Accuracy : 0.7680
Precision: 0.7633
Recall   : 0.7663
F1-score : 0.7648
ROC-AUC  : 0.8512

Confusion Matrix (rows=true, cols=pred):
              Pred 0 (No)  Pred 1 (Yes)
True 0 (No)          1022           306
True 1 (Yes)          301           987

Classification Report:
              precision    recall  f1-score   support

 No Graduate       0.77      0.77      0.77      1328
    Graduate       0.76      0.77      0.76      1288

    accuracy                           0.77      2616
   macro avg       0.77      0.77      0.77      2616
weighted avg       0.77      0.77      0.77      2616



In [None]:
import joblib
from datetime import datetime
from pathlib import Path

#  Define directory for tuned models
tuned_dir = Path("/content/drive/MyDrive/CP_UMBC/Models/Tuned_Models")
tuned_dir.mkdir(parents=True, exist_ok=True)

timestamp = datetime.now().strftime("%Y%m%d_%H%M")

#Dictionary of tuned pipelines to save
tuned_models = {
    "GradientBoosting_tuned": gb_best_pipe,
    "XGBoost_tuned": xgb_best_pipe,
    "RandomForest_tuned": rf_best_pipe,
    "LogisticRegression_tuned": log_best_pipe
}

# Save all tuned models
for name, model in tuned_models.items():
    filename = tuned_dir / f"{name}_{timestamp}.joblib"
    joblib.dump(model, filename)
    print(f" Saved tuned model: {filename}")

print("\nAll tuned models saved successfully.")


 Saved tuned model: /content/drive/MyDrive/CP_UMBC/Models/Tuned_Models/GradientBoosting_tuned_20251201_0543.joblib
 Saved tuned model: /content/drive/MyDrive/CP_UMBC/Models/Tuned_Models/XGBoost_tuned_20251201_0543.joblib
 Saved tuned model: /content/drive/MyDrive/CP_UMBC/Models/Tuned_Models/RandomForest_tuned_20251201_0543.joblib
 Saved tuned model: /content/drive/MyDrive/CP_UMBC/Models/Tuned_Models/LogisticRegression_tuned_20251201_0543.joblib

All tuned models saved successfully.


In [None]:
summary_tuned = pd.DataFrame([
    {"model": "GB (tuned)", **{k:v for k,v in gb_res.items() if k in ("accuracy","precision","recall","f1","auc")}},
    {"model": "XGB (tuned)", **{k:v for k,v in xgb_res.items() if k in ("accuracy","precision","recall","f1","auc")}},
    {"model": "RF (tuned)", **{k:v for k,v in rf_res.items() if k in ("accuracy","precision","recall","f1","auc")}},
    {"model": "LogReg (tuned)", **{k:v for k,v in log_res.items() if k in ("accuracy","precision","recall","f1","auc")}},
]).sort_values(["auc","f1","accuracy"], ascending=False)

print("\n=== Tuned Models — Validation Summary ===")
print(summary_tuned.to_string(index=False))



=== Tuned Models — Validation Summary ===
         model  accuracy  precision   recall       f1      auc
   XGB (tuned)  0.796254   0.783196 0.810559 0.796643 0.881864
    GB (tuned)  0.787844   0.776187 0.799689 0.787763 0.879905
    RF (tuned)  0.785933   0.771642 0.802795 0.786910 0.875516
LogReg (tuned)  0.767966   0.763341 0.766304 0.764820 0.851235


In [None]:
# Build combined comparison
compare_df = pd.DataFrame([
    {"Phase": "Base", "Model": "Gradient Boosting", **{k: res_gb[k] for k in ("accuracy","precision","recall","f1","auc")}},
    {"Phase": "Tuned", "Model": "Gradient Boosting", **{k: gb_res[k] for k in ("accuracy","precision","recall","f1","auc")}},
    {"Phase": "Base", "Model": "XGBoost", **{k: res_xgb[k] for k in ("accuracy","precision","recall","f1","auc")}},
    {"Phase": "Tuned", "Model": "XGBoost", **{k: xgb_res[k] for k in ("accuracy","precision","recall","f1","auc")}},
    {"Phase": "Base", "Model": "Random Forest", **{k: res_rf[k] for k in ("accuracy","precision","recall","f1","auc")}},
    {"Phase": "Tuned", "Model": "Random Forest", **{k: rf_res[k] for k in ("accuracy","precision","recall","f1","auc")}},
    {"Phase": "Base", "Model": "Logistic Regression", **{k: res_logreg[k] for k in ("accuracy","precision","recall","f1","auc")}},
    {"Phase": "Tuned", "Model": "Logistic Regression", **{k: log_res[k] for k in ("accuracy","precision","recall","f1","auc")}},
])

compare_df = compare_df.round(4)
print("\n===Base vs Tuned Model Comparison ===")
print(compare_df.sort_values(["Model","Phase"]).to_string(index=False))



===Base vs Tuned Model Comparison ===
Phase               Model  accuracy  precision  recall     f1    auc
 Base   Gradient Boosting    0.7813     0.7767  0.7803 0.7785 0.8726
Tuned   Gradient Boosting    0.7878     0.7762  0.7997 0.7878 0.8799
 Base Logistic Regression    0.7687     0.7645  0.7663 0.7654 0.8514
Tuned Logistic Regression    0.7680     0.7633  0.7663 0.7648 0.8512
 Base       Random Forest    0.7787     0.7667  0.7911 0.7788 0.8642
Tuned       Random Forest    0.7859     0.7716  0.8028 0.7869 0.8755
 Base             XGBoost    0.7997     0.7821  0.8222 0.8017 0.8816
Tuned             XGBoost    0.7963     0.7832  0.8106 0.7966 0.8819


In [None]:
from sklearn.decomposition import PCA

numeric_pipeline_pca = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value=0)),
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=0.95, random_state=42))  # keep ~95% variance
])


In [None]:
preprocessor_pca = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline_pca, num_cols),
        ("cat", categorical_pipeline, cat_cols)
    ],
    remainder="drop",
    verbose_feature_names_out=False
)


In [None]:
evaluator_pca = Evaluator(preprocessor=preprocessor_pca)

In [None]:
res_logreg_pca = evaluator_pca.run("LogisticRegression_PCA", build_logreg(),
                                   X_train, y_train, X_val, y_val)
res_rf_pca  = evaluator_pca.run("RandomForest_PCA", build_random_forest(),
                                X_train, y_train, X_val, y_val)
res_gb_pca  = evaluator_pca.run("GradientBoosting_PCA", build_gradient_boosting(),
                                X_train, y_train, X_val, y_val)
res_gnb_pca = evaluator_pca.run("GaussianNB_PCA", build_gaussian_nb(),
                                X_train, y_train, X_val, y_val)
res_xgb_pca = evaluator_pca.run("XGBoost_PCA", build_xgboost(),
                                X_train, y_train, X_val, y_val)


=== LogisticRegression_PCA — Validation Metrics ===
Accuracy : 0.7710
Precision: 0.7677
Recall   : 0.7671
F1-score : 0.7674
ROC-AUC  : 0.8480

Confusion Matrix (rows=true, cols=pred):
              Pred 0 (No)  Pred 1 (Yes)
True 0 (No)          1029           299
True 1 (Yes)          300           988

Classification Report:
              precision    recall  f1-score   support

 No Graduate       0.77      0.77      0.77      1328
    Graduate       0.77      0.77      0.77      1288

    accuracy                           0.77      2616
   macro avg       0.77      0.77      0.77      2616
weighted avg       0.77      0.77      0.77      2616


=== RandomForest_PCA — Validation Metrics ===
Accuracy : 0.7710
Precision: 0.7608
Recall   : 0.7803
F1-score : 0.7704
ROC-AUC  : 0.8541

Confusion Matrix (rows=true, cols=pred):
              Pred 0 (No)  Pred 1 (Yes)
True 0 (No)          1012           316
True 1 (Yes)          283          1005

Classification Report:
              precisi

In [None]:
import pandas as pd

results_pca = [
    res_logreg_pca, res_rf_pca, res_gb_pca,
    res_gnb_pca, res_xgb_pca
]

summary_pca = pd.DataFrame([
    {k: v for k, v in r.items() if k not in ("pipeline", "confusion_matrix")}
    for r in results_pca
])

summary_pca = summary_pca.sort_values(["auc", "f1", "accuracy"], ascending=False)
print("\n=== PCA Branch Validation Metrics ===")
print(summary_pca.to_string(index=False))



=== PCA Branch Validation Metrics ===
                  name  accuracy  precision   recall       f1      auc
           XGBoost_PCA  0.779052   0.765719 0.794255 0.779726 0.867107
  GradientBoosting_PCA  0.776758   0.762295 0.794255 0.777947 0.857592
      RandomForest_PCA  0.771024   0.760787 0.780280 0.770410 0.854128
LogisticRegression_PCA  0.771024   0.767677 0.767081 0.767379 0.847969
        GaussianNB_PCA  0.636086   0.583416 0.912267 0.711690 0.757037


In [None]:
import joblib
from datetime import datetime
from pathlib import Path

#  Define PCA model directory
pca_dir = Path("/content/drive/MyDrive/CP_UMBC/Models/PCA_Models")
pca_dir.mkdir(parents=True, exist_ok=True)

timestamp = datetime.now().strftime("%Y%m%d_%H%M")

#  Dictionary of PCA pipelines to save
pca_models = {
    "LogisticRegression_PCA": res_logreg_pca["pipeline"],
    "RandomForest_PCA": res_rf_pca["pipeline"],
    "GradientBoosting_PCA": res_gb_pca["pipeline"],
    "GaussianNB_PCA": res_gnb_pca["pipeline"],
    "XGBoost_PCA": res_xgb_pca["pipeline"]
}

#Save PCA models
for name, pipe in pca_models.items():
    filename = pca_dir / f"{name}_{timestamp}.joblib"
    joblib.dump(pipe, filename)
    print(f" Saved PCA model: {filename}")

print("\nAll PCA models saved successfully.")


 Saved PCA model: /content/drive/MyDrive/CP_UMBC/Models/PCA_Models/LogisticRegression_PCA_20251201_0544.joblib
 Saved PCA model: /content/drive/MyDrive/CP_UMBC/Models/PCA_Models/RandomForest_PCA_20251201_0544.joblib
 Saved PCA model: /content/drive/MyDrive/CP_UMBC/Models/PCA_Models/GradientBoosting_PCA_20251201_0544.joblib
 Saved PCA model: /content/drive/MyDrive/CP_UMBC/Models/PCA_Models/GaussianNB_PCA_20251201_0544.joblib
 Saved PCA model: /content/drive/MyDrive/CP_UMBC/Models/PCA_Models/XGBoost_PCA_20251201_0544.joblib

All PCA models saved successfully.


In [None]:
import pandas as pd

def extract_metrics(phase, model_name, result_dict):
    if result_dict is None:
        return None
    return {
        "Phase": phase,
        "Model": model_name,
        "Accuracy": result_dict.get("accuracy", None),
        "Precision": result_dict.get("precision", None),
        "Recall": result_dict.get("recall", None),
        "F1": result_dict.get("f1", None),
        "AUC": result_dict.get("auc", None)
    }

-
results_summary = []

# Phase: BASE
results_summary.append(extract_metrics("Base", "LogisticRegression", res_logreg))
results_summary.append(extract_metrics("Base", "RandomForest", res_rf))
results_summary.append(extract_metrics("Base", "GradientBoosting", res_gb))
results_summary.append(extract_metrics("Base", "GaussianNB", res_gnb))
results_summary.append(extract_metrics("Base", "XGBoost", res_xgb))

# Phase: PCA
results_summary.append(extract_metrics("PCA", "LogisticRegression", res_logreg_pca))
results_summary.append(extract_metrics("PCA", "RandomForest", res_rf_pca))
results_summary.append(extract_metrics("PCA", "GradientBoosting", res_gb_pca))
results_summary.append(extract_metrics("PCA", "GaussianNB", res_gnb_pca))
results_summary.append(extract_metrics("PCA", "XGBoost", res_xgb_pca))

# Phase: TUNED
results_summary.extend([
    extract_metrics("Tuned", "LogisticRegression", log_res),
    extract_metrics("Tuned", "RandomForest", rf_res),
    extract_metrics("Tuned", "GradientBoosting", gb_res),
    extract_metrics("Tuned", "XGBoost", xgb_res)
])
# Combine into one DataFrame
results_summary = [r for r in results_summary if r is not None]
results_df = pd.DataFrame(results_summary)

results_df = results_df.sort_values(by=["Model", "Phase"], key=lambda x: x.map({"Base":1, "Tuned":2, "PCA":3}))

print("\n===  Combined Model Performance Summary ===")
print(results_df.to_string(index=False))





===  Combined Model Performance Summary ===
Phase              Model  Accuracy  Precision   Recall       F1      AUC
 Base LogisticRegression  0.768731   0.764524 0.766304 0.765413 0.851408
 Base       RandomForest  0.778670   0.766742 0.791149 0.778754 0.864196
 Base   GradientBoosting  0.781346   0.776662 0.780280 0.778466 0.872627
 Base         GaussianNB  0.650994   0.596899 0.896739 0.716724 0.765177
 Base            XGBoost  0.799694   0.782127 0.822205 0.801665 0.881634
Tuned LogisticRegression  0.767966   0.763341 0.766304 0.764820 0.851235
Tuned       RandomForest  0.785933   0.771642 0.802795 0.786910 0.875516
Tuned   GradientBoosting  0.787844   0.776187 0.799689 0.787763 0.879905
Tuned            XGBoost  0.796254   0.783196 0.810559 0.796643 0.881864
  PCA LogisticRegression  0.771024   0.767677 0.767081 0.767379 0.847969
  PCA       RandomForest  0.771024   0.760787 0.780280 0.770410 0.854128
  PCA   GradientBoosting  0.776758   0.762295 0.794255 0.777947 0.857592
  PCA 

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC

def build_linear_svm_base():
    # fast linear margin model; decision_function is used for AUC
    return SGDClassifier(loss="hinge", alpha=1e-3, penalty="l2",
                         class_weight="balanced", max_iter=3000, random_state=42)

def build_rbf_svm_base():
    # full SVC with probability for ROC/AUC; class_weight balances labels
    return SVC(kernel="rbf", C=1.0, gamma="scale",
               probability=True, class_weight="balanced", random_state=42)


res_linSVM_base = evaluator.run("LinearSVM_Base", build_linear_svm_base(),
                                X_train, y_train, X_val, y_val,
                                print_importances=True,  # will do permutation importance
                                n_perm_repeats=5)

res_rbfSVM_base = evaluator.run("SVM_RBF_Base", build_rbf_svm_base(),
                                X_train, y_train, X_val, y_val,
                                print_importances=True,  # permutation importance
                                n_perm_repeats=5)



=== LinearSVM_Base — Validation Metrics ===
Accuracy : 0.7672
Precision: 0.7570
Recall   : 0.7764
F1-score : 0.7666
ROC-AUC  : 0.8417

Confusion Matrix (rows=true, cols=pred):
              Pred 0 (No)  Pred 1 (Yes)
True 0 (No)          1007           321
True 1 (Yes)          288          1000

Classification Report:
              precision    recall  f1-score   support

 No Graduate       0.78      0.76      0.77      1328
    Graduate       0.76      0.78      0.77      1288

    accuracy                           0.77      2616
   macro avg       0.77      0.77      0.77      2616
weighted avg       0.77      0.77      0.77      2616


=== SVM_RBF_Base — Validation Metrics ===
Accuracy : 0.7779
Precision: 0.7575
Recall   : 0.8075
F1-score : 0.7817
ROC-AUC  : 0.8603

Confusion Matrix (rows=true, cols=pred):
              Pred 0 (No)  Pred 1 (Yes)
True 0 (No)           995           333
True 1 (Yes)          248          1040

Classification Report:
              precision    recall

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import loguniform

# Tune Linear SVM (SGDClassifier)
pipe_lin = Pipeline([("pre", preprocessor),
                     ("clf", SGDClassifier(loss="hinge", penalty="l2",
                                           class_weight="balanced",
                                           max_iter=3000, random_state=42))])

param_grid_lin = {
    "clf__loss": ["hinge", "modified_huber"],  # modified_huber can be more forgiving
    "clf__alpha": [1e-4, 3e-4, 1e-3, 3e-3, 1e-2],
}

gs_lin = GridSearchCV(pipe_lin, param_grid=param_grid_lin,
                      scoring="f1", cv=3, n_jobs=-1, verbose=1)
gs_lin.fit(X_train, y_train)

best_lin = gs_lin.best_params_
print("Best Linear SVM params:", best_lin)

# rebuild a clean classifier with best params and evaluate
clf_lin_tuned = SGDClassifier(loss=best_lin["clf__loss"],
                              alpha=best_lin["clf__alpha"],
                              penalty="l2",
                              class_weight="balanced",
                              max_iter=3000, random_state=42)

res_linSVM_tuned = evaluator.run("LinearSVM_Tuned", clf_lin_tuned,
                                 X_train, y_train, X_val, y_val,
                                 print_importances=True, n_perm_repeats=5)

#  Tune RBF SVM (SVC)
pipe_rbf = Pipeline([("pre", preprocessor),
                     ("clf", SVC(kernel="rbf", probability=True,
                                 class_weight="balanced", random_state=42))])

param_dist_rbf = {
    "clf__C": loguniform(1e-2, 1e2),        # ~[0.01, 100]
    "clf__gamma": loguniform(1e-4, 1e1),    # ~[1e-4, 10]
}

rs_rbf = RandomizedSearchCV(pipe_rbf, param_distributions=param_dist_rbf,
                            n_iter=25, scoring="f1", cv=3, n_jobs=-1,
                            random_state=42, verbose=1)
rs_rbf.fit(X_train, y_train)

best_rbf = rs_rbf.best_params_
print("Best RBF SVM params:", best_rbf)

clf_rbf_tuned = SVC(kernel="rbf",
                    C=best_rbf["clf__C"],
                    gamma=best_rbf["clf__gamma"],
                    probability=True,
                    class_weight="balanced",
                    random_state=42)

res_rbfSVM_tuned = evaluator.run("SVM_RBF_Tuned", clf_rbf_tuned,
                                 X_train, y_train, X_val, y_val,
                                 print_importances=True, n_perm_repeats=5)


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Linear SVM params: {'clf__alpha': 0.01, 'clf__loss': 'hinge'}

=== LinearSVM_Tuned — Validation Metrics ===
Accuracy : 0.7588
Precision: 0.7351
Recall   : 0.7974
F1-score : 0.7650
ROC-AUC  : 0.8439

Confusion Matrix (rows=true, cols=pred):
              Pred 0 (No)  Pred 1 (Yes)
True 0 (No)           958           370
True 1 (Yes)          261          1027

Classification Report:
              precision    recall  f1-score   support

 No Graduate       0.79      0.72      0.75      1328
    Graduate       0.74      0.80      0.76      1288

    accuracy                           0.76      2616
   macro avg       0.76      0.76      0.76      2616
weighted avg       0.76      0.76      0.76      2616

Fitting 3 folds for each of 25 candidates, totalling 75 fits
Best RBF SVM params: {'clf__C': np.float64(5.456725485601478), 'clf__gamma': np.float64(0.015876781526923997)}

=== SVM_RBF_Tuned — Validation Metrics ===
Accurac

In [None]:
from sklearn.decomposition import PCA

# identify columns once
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

# find k components for ~95% variance on the numeric block
num_pipe_for_k = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value=0)),
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=0.95, svd_solver="full"))
])
num_pipe_for_k.fit(X_train[num_cols])
k = num_pipe_for_k.named_steps["pca"].n_components_
print(f"PCA numeric components (≈95% variance): k = {k}")

# build the actual PCA preprocessor with the chosen k
numeric_pca_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value=0)),
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=k, svd_solver="full"))
])

categorical_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
    ("ohe", OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False))
])

pca_preprocessor = ColumnTransformer(
    [("num", numeric_pca_pipe, num_cols),
     ("cat", categorical_pipe, cat_cols)],
    remainder="drop",
    verbose_feature_names_out=False
)

# PCA evaluator
evaluator_pca = Evaluator(preprocessor=pca_preprocessor)

#  run SVMs on PCA features
res_linSVM_pca = evaluator_pca.run("LinearSVM_PCA", build_linear_svm_base(),
                                   X_train, y_train, X_val, y_val,
                                   print_importances=True, n_perm_repeats=5)

res_rbfSVM_pca = evaluator_pca.run("SVM_RBF_PCA", build_rbf_svm_base(),
                                   X_train, y_train, X_val, y_val,
                                   print_importances=True, n_perm_repeats=5)


PCA numeric components (≈95% variance): k = 13

=== LinearSVM_PCA — Validation Metrics ===
Accuracy : 0.7622
Precision: 0.7508
Recall   : 0.7741
F1-score : 0.7622
ROC-AUC  : 0.8403

Confusion Matrix (rows=true, cols=pred):
              Pred 0 (No)  Pred 1 (Yes)
True 0 (No)           997           331
True 1 (Yes)          291           997

Classification Report:
              precision    recall  f1-score   support

 No Graduate       0.77      0.75      0.76      1328
    Graduate       0.75      0.77      0.76      1288

    accuracy                           0.76      2616
   macro avg       0.76      0.76      0.76      2616
weighted avg       0.76      0.76      0.76      2616


=== SVM_RBF_PCA — Validation Metrics ===
Accuracy : 0.7764
Precision: 0.7556
Recall   : 0.8067
F1-score : 0.7803
ROC-AUC  : 0.8587

Confusion Matrix (rows=true, cols=pred):
              Pred 0 (No)  Pred 1 (Yes)
True 0 (No)           992           336
True 1 (Yes)          249          1039

Classificat

In [None]:
import joblib
from datetime import datetime
from pathlib import Path

#Define target directories
base_dir = Path("/content/drive/MyDrive/CP_UMBC/Models/Base_Models")
tuned_dir = Path("/content/drive/MyDrive/CP_UMBC/Models/Tuned_Models")
pca_dir   = Path("/content/drive/MyDrive/CP_UMBC/Models/PCA_Models")

# Ensure folders exist
for d in [base_dir, tuned_dir, pca_dir]:
    d.mkdir(parents=True, exist_ok=True)

timestamp = datetime.now().strftime("%Y%m%d_%H%M")

# Save Base SVM Models
svm_base_models = {
    "LinearSVM_Base": res_linSVM_base["pipeline"],
    "SVM_RBF_Base": res_rbfSVM_base["pipeline"]
}
for name, model in svm_base_models.items():
    path = base_dir / f"{name}_{timestamp}.joblib"
    joblib.dump(model, path)
    print(f" Saved Base SVM model: {path}")

# Save Tuned SVM Models
svm_tuned_models = {
    "LinearSVM_Tuned": res_linSVM_tuned["pipeline"],
    "SVM_RBF_Tuned": res_rbfSVM_tuned["pipeline"]
}
for name, model in svm_tuned_models.items():
    path = tuned_dir / f"{name}_{timestamp}.joblib"
    joblib.dump(model, path)
    print(f" Saved Tuned SVM model: {path}")

# Save PCA SVM Models
svm_pca_models = {
    "LinearSVM_PCA": res_linSVM_pca["pipeline"],
    "SVM_RBF_PCA": res_rbfSVM_pca["pipeline"]
}
for name, model in svm_pca_models.items():
    path = pca_dir / f"{name}_{timestamp}.joblib"
    joblib.dump(model, path)
    print(f" Saved PCA SVM model: {path}")

print("\n All SVM models saved successfully to:")
print(f" - Base models:  {base_dir}")
print(f" - Tuned models: {tuned_dir}")
print(f" - PCA models:   {pca_dir}")


 Saved Base SVM model: /content/drive/MyDrive/CP_UMBC/Models/Base_Models/LinearSVM_Base_20251201_0632.joblib
 Saved Base SVM model: /content/drive/MyDrive/CP_UMBC/Models/Base_Models/SVM_RBF_Base_20251201_0632.joblib
 Saved Tuned SVM model: /content/drive/MyDrive/CP_UMBC/Models/Tuned_Models/LinearSVM_Tuned_20251201_0632.joblib
 Saved Tuned SVM model: /content/drive/MyDrive/CP_UMBC/Models/Tuned_Models/SVM_RBF_Tuned_20251201_0632.joblib
 Saved PCA SVM model: /content/drive/MyDrive/CP_UMBC/Models/PCA_Models/LinearSVM_PCA_20251201_0632.joblib
 Saved PCA SVM model: /content/drive/MyDrive/CP_UMBC/Models/PCA_Models/SVM_RBF_PCA_20251201_0632.joblib

 All SVM models saved successfully to:
 - Base models:  /content/drive/MyDrive/CP_UMBC/Models/Base_Models
 - Tuned models: /content/drive/MyDrive/CP_UMBC/Models/Tuned_Models
 - PCA models:   /content/drive/MyDrive/CP_UMBC/Models/PCA_Models


In [None]:
import pandas as pd

svm_results = pd.DataFrame([
    {"Phase":"Base", "Model":"LinearSVM", **{k:res_linSVM_base[k] for k in ["accuracy","precision","recall","f1","auc"]}},
    {"Phase":"Base", "Model":"RBF SVM",  **{k:res_rbfSVM_base[k] for k in ["accuracy","precision","recall","f1","auc"]}},
    {"Phase":"Tuned","Model":"LinearSVM", **{k:res_linSVM_tuned[k] for k in ["accuracy","precision","recall","f1","auc"]}},
    {"Phase":"Tuned","Model":"RBF SVM",  **{k:res_rbfSVM_tuned[k] for k in ["accuracy","precision","recall","f1","auc"]}},
    {"Phase":"PCA", "Model":"LinearSVM", **{k:res_linSVM_pca[k] for k in ["accuracy","precision","recall","f1","auc"]}},
    {"Phase":"PCA", "Model":"RBF SVM",  **{k:res_rbfSVM_pca[k] for k in ["accuracy","precision","recall","f1","auc"]}},
])

print("\n=== SVM Comparison: Base vs Tuned vs PCA (Validation) ===")
print(svm_results.to_string(index=False))



=== SVM Comparison: Base vs Tuned vs PCA (Validation) ===
Phase     Model  accuracy  precision   recall       f1      auc
 Base LinearSVM  0.767202   0.757002 0.776398 0.766577 0.841747
 Base   RBF SVM  0.777905   0.757465 0.807453 0.781661 0.860281
Tuned LinearSVM  0.758792   0.735147 0.797360 0.764991 0.843918
Tuned   RBF SVM  0.778670   0.760854 0.802795 0.781262 0.863421
  PCA LinearSVM  0.762232   0.750753 0.774068 0.762232 0.840279
  PCA   RBF SVM  0.776376   0.755636 0.806677 0.780323 0.858723


In [None]:
# FINAL TEST EVALUATION (Base + Tuned + PCA)
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix
)

#   Collect all models (Base, Tuned, PCA)
all_models = {
    #  Base models
    "LogisticRegression_Base": res_logreg["pipeline"],
    "RandomForest_Base": res_rf["pipeline"],
    "GradientBoosting_Base": res_gb["pipeline"],
    "GaussianNB_Base": res_gnb["pipeline"],
    "XGBoost_Base": res_xgb["pipeline"],

    # === Tuned models ===
    "LogisticRegression_Tuned": log_best_pipe,
    "RandomForest_Tuned": rf_best_pipe,
    "GradientBoosting_Tuned": gb_best_pipe,
    "XGBoost_Tuned": xgb_best_pipe,

    # PCA models
    "LogisticRegression_PCA": res_logreg_pca["pipeline"],
    "RandomForest_PCA": res_rf_pca["pipeline"],
    "GradientBoosting_PCA": res_gb_pca["pipeline"],
    "GaussianNB_PCA": res_gnb_pca["pipeline"],
    "XGBoost_PCA": res_xgb_pca["pipeline"],
}

#   Evaluate each model on the TEST set
test_results = []
for name, model in all_models.items():
    print(f"\n=== Evaluating {name} on TEST data ===")
    try:
        y_pred = model.predict(X_test)
        if hasattr(model, "predict_proba"):
            y_score = model.predict_proba(X_test)[:, 1]
        elif hasattr(model, "decision_function"):
            y_score = model.decision_function(X_test)
        else:
            y_score = y_pred

        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred)
        rec = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_score)
        cm = confusion_matrix(y_test, y_pred)

        test_results.append({
            "Model": name,
            "Accuracy": acc,
            "Precision": prec,
            "Recall": rec,
            "F1": f1,
            "AUC": auc
        })

        print(f"Accuracy: {acc:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f} | F1: {f1:.4f} | AUC: {auc:.4f}")
        print("Confusion Matrix:\n", cm)

    except Exception as e:
        print(f" Skipped {name} due to error: {e}")

#  Create comparison summary
test_summary = (
    pd.DataFrame(test_results)
      .sort_values(by=["AUC", "F1", "Accuracy"], ascending=False)
      .reset_index(drop=True)
)

print("\n===  FINAL TEST PERFORMANCE SUMMARY (Base + Tuned + PCA) ===")
print(test_summary.to_string(index=False))

#  Save summary to Drive
out_dir = Path("/content/drive/MyDrive/CP_UMBC/Results")
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / "final_test_results_all_models.csv"
test_summary.to_csv(out_path, index=False)

print(f"\n All test results saved to: {out_path}")



=== Evaluating LogisticRegression_Base on TEST data ===
Accuracy: 0.7673 | Precision: 0.7638 | Recall: 0.7632 | F1: 0.7635 | AUC: 0.8471
Confusion Matrix:
 [[1025  304]
 [ 305  983]]

=== Evaluating RandomForest_Base on TEST data ===
Accuracy: 0.7700 | Precision: 0.7548 | Recall: 0.7888 | F1: 0.7715 | AUC: 0.8554
Confusion Matrix:
 [[ 999  330]
 [ 272 1016]]

=== Evaluating GradientBoosting_Base on TEST data ===
Accuracy: 0.7856 | Precision: 0.7777 | Recall: 0.7904 | F1: 0.7840 | AUC: 0.8697
Confusion Matrix:
 [[1038  291]
 [ 270 1018]]

=== Evaluating GaussianNB_Base on TEST data ===
Accuracy: 0.6580 | Precision: 0.6018 | Recall: 0.9022 | F1: 0.7220 | AUC: 0.7727
Confusion Matrix:
 [[ 560  769]
 [ 126 1162]]

=== Evaluating XGBoost_Base on TEST data ===
Accuracy: 0.7864 | Precision: 0.7785 | Recall: 0.7911 | F1: 0.7848 | AUC: 0.8730
Confusion Matrix:
 [[1039  290]
 [ 269 1019]]

=== Evaluating LogisticRegression_Tuned on TEST data ===
Accuracy: 0.7677 | Precision: 0.7636 | Recall: 0.