In [1]:
"""
improved_with_smote_and_tuning.py

Improved emotion classification pipeline for your dataset (same-folder usage).
Includes SMOTE (in-pipeline), feature cleaning, RF-based selection, PCA,
RandomizedSearchCV tuning for base models, and a stacked ensemble.

Run in the folder containing:
  20231225_dfall_obs_data_and_spectral_features_revision1_n469.csv
"""

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from functools import partial
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold, SelectFromModel
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.base import BaseEstimator, TransformerMixin
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import scipy.stats as stats
import time
import os

# -------------------------------
# Custom transformer: correlation filter
# -------------------------------
class CorrelationFilter(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.95):
        self.threshold = threshold
        self.to_drop_ = []

    def fit(self, X, y=None):
        # expect X as DataFrame or convertable
        X_df = pd.DataFrame(X)
        corr = X_df.corr().abs()
        upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
        self.to_drop_ = [col for col in upper.columns if any(upper[col] > self.threshold)]
        return self

    def transform(self, X):
        X_df = pd.DataFrame(X)
        return X_df.drop(columns=self.to_drop_, errors='ignore')

# -------------------------------
# File/target
# -------------------------------
CSV = "20231225_dfall_obs_data_and_spectral_features_revision1_n469.csv"
if not os.path.exists(CSV):
    raise FileNotFoundError(f"Place the CSV named '{CSV}' in this folder and run this script here.")

df = pd.read_csv(CSV)
if "Context2" not in df.columns:
    raise ValueError("Expected column 'Context2' not found!")

# target encode
df["Context2"] = df["Context2"].astype("category").cat.codes
y = df["Context2"]
X = df.select_dtypes(include=[np.number]).copy()
if "Context2" in X.columns:
    X = X.drop(columns=["Context2"])

print("Loaded data:", X.shape, "Target class counts:", np.bincount(y))

# -------------------------------
# Train/test split
# -------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

# -------------------------------
# Preprocessing + selection (fit on training folds only inside pipeline)
# We'll build an imbalanced-learn pipeline that includes:
# imputer -> variance threshold -> corr filter -> SMOTE -> scaler -> SelectFromModel -> PCA -> model
# Note: SelectFromModel uses RandomForest for importance and threshold="median".
# -------------------------------

imputer = SimpleImputer(strategy="mean")
var_thresh = VarianceThreshold(threshold=1e-4)
corr_filter = CorrelationFilter(threshold=0.95)
scaler = StandardScaler()
feat_selector = SelectFromModel(RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1), threshold="median")
pca = PCA(n_components=0.95, svd_solver="full", random_state=42)
smote = SMOTE(random_state=42, k_neighbors=3)

# -------------------------------
# Base estimator search spaces (moderate)
# -------------------------------
rf = RandomForestClassifier(random_state=42, n_jobs=-1, class_weight=None)
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss', n_jobs=1)
svm = SVC(probability=True, random_state=42, class_weight='balanced')

rf_param_dist = {
    "model__n_estimators": stats.randint(150, 500),
    "model__max_depth": stats.randint(4, 50),
    "model__min_samples_split": stats.randint(2, 8),
    "model__min_samples_leaf": stats.randint(1, 6),
    "model__max_features": ['sqrt', 'log2', None, 0.5]
}

xgb_param_dist = {
    "model__n_estimators": stats.randint(100, 600),
    "model__learning_rate": stats.uniform(0.01, 0.2),
    "model__max_depth": stats.randint(3, 12),
    "model__subsample": stats.uniform(0.6, 0.4),
    "model__colsample_bytree": stats.uniform(0.5, 0.5)
}

svm_param_dist = {
    "model__C": stats.loguniform(1e-1, 10),
    "model__gamma": ['scale', 'auto'],
    "model__kernel": ['rbf', 'poly'],
    "model__degree": stats.randint(2, 5)
}

# -------------------------------
# Helper: pipeline builder for RandomizedSearchCV
# -------------------------------
def build_imb_pipeline(base_model):
    """
    Build an imblearn pipeline up to 'model' step. Steps:
    imputer -> var_thresh -> corr_filter -> smote -> scaler -> feat_selector -> pca -> model
    """
    return ImbPipeline([
        ("imputer", imputer),
        ("var_thresh", var_thresh),
        ("corr_filter", corr_filter),
        ("smote", smote),
        ("scaler", scaler),
        ("feat_selector", feat_selector),
        ("pca", pca),
        ("model", base_model)
    ])

# -------------------------------
# Cross-validation and search config
# -------------------------------
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = "f1_weighted"
n_iter_search = 30   # increase this for better tuning (more time)

random_searches = []

# -------------------------------
# Randomized search for RF
# -------------------------------
print("Tuning Random Forest (this may take a while)...")
start = time.time()
pipe_rf = build_imb_pipeline(rf)
rs_rf = RandomizedSearchCV(pipe_rf, rf_param_dist, n_iter=n_iter_search, scoring=scoring,
                           cv=cv, random_state=42, n_jobs=-1, verbose=1)
rs_rf.fit(X_train, y_train)
print("RF best score:", rs_rf.best_score_, "best params:", rs_rf.best_params_)
random_searches.append(("rf", rs_rf))
print("RF tuning time: {:.1f}s".format(time.time()-start))

# -------------------------------
# Randomized search for XGB
# -------------------------------
print("\nTuning XGBoost...")
start = time.time()
pipe_xgb = build_imb_pipeline(xgb)
rs_xgb = RandomizedSearchCV(pipe_xgb, xgb_param_dist, n_iter=n_iter_search, scoring=scoring,
                            cv=cv, random_state=42, n_jobs=1, verbose=1)  # XGB can conflict with n_jobs=-1, set 1
rs_xgb.fit(X_train, y_train)
print("XGB best score:", rs_xgb.best_score_, "best params:", rs_xgb.best_params_)
random_searches.append(("xgb", rs_xgb))
print("XGB tuning time: {:.1f}s".format(time.time()-start))

# -------------------------------
# Randomized search for SVM
# -------------------------------
print("\nTuning SVM...")
start = time.time()
pipe_svm = build_imb_pipeline(svm)
rs_svm = RandomizedSearchCV(pipe_svm, svm_param_dist, n_iter=max(15, int(n_iter_search/2)),
                            scoring=scoring, cv=cv, random_state=42, n_jobs=-1, verbose=1)
rs_svm.fit(X_train, y_train)
print("SVM best score:", rs_svm.best_score_, "best params:", rs_svm.best_params_)
random_searches.append(("svm", rs_svm))
print("SVM tuning time: {:.1f}s".format(time.time()-start))

# -------------------------------
# Collect best tuned estimators (extract fitted 'model' from pipeline)
# -------------------------------
best_rf_pipe = rs_rf.best_estimator_
best_xgb_pipe = rs_xgb.best_estimator_
best_svm_pipe = rs_svm.best_estimator_

# We will use the full pipelines (including preprocessing + SMOTE) when fitting individually,
# but for stacking we want the underlying estimators (they must be unfitted clones that accept the transformed data).
# We'll build a stacking pipeline where preprocessing (imputer..pca) happens once, then stack on the transformed features.
print("\nBuilding final pipelines and stacking ensemble...")

# build a preprocessing-only pipeline (no SMOTE here) to transform raw X -> features used by meta models
# NOTE: SMOTE is only applied during training within RandomizedSearchCV above. For final stacking training we will:
# - Option A: include SMOTE inside a pipeline for each base estimator (but sklearn's StackingClassifier expects estimators working on the same input)
# - Simpler and safer approach: replicate the preprocessing (IMPUTE->VAR->CORR->SCALE->SELECT->PCA) deterministically,
#   then apply SMOTE to the transformed training set before fitting base estimators and the meta model.
preproc_pipeline = Pipeline([
    ("imputer", imputer),
    ("var_thresh", var_thresh),
    ("corr_filter", corr_filter),
    ("scaler", scaler),
    ("feat_selector", feat_selector),
    ("pca", pca)
])

# Fit preprocessing on train and transform
X_train_trans = preproc_pipeline.fit_transform(X_train, y_train)
X_test_trans = preproc_pipeline.transform(X_test)

# Apply SMOTE to the transformed training set
sm = SMOTE(random_state=42, k_neighbors=3)
X_train_bal, y_train_bal = sm.fit_resample(X_train_trans, y_train)
print("Transformed features shape:", X_train_trans.shape, " => after SMOTE:", X_train_bal.shape)

# Now create tuned base estimators (cloned with best params from RandomizedSearchCV)
from sklearn.base import clone

def extract_best_model_from_search(rs_obj):
    # Best estimator is a pipeline; extract its 'model' step and clone it with found params
    best_pipe = rs_obj.best_estimator_
    best_model = best_pipe.named_steps["model"]
    # clone to get fresh estimator with same hyperparams
    return clone(best_model)

best_rf = extract_best_model_from_search(rs_rf)
best_xgb = extract_best_model_from_search(rs_xgb)
best_svm = extract_best_model_from_search(rs_svm)

# Fit base estimators on the balanced transformed train set
print("\nFitting tuned base estimators on transformed + SMOTE data...")
best_rf.fit(X_train_bal, y_train_bal)
best_xgb.fit(X_train_bal, y_train_bal)
best_svm.fit(X_train_bal, y_train_bal)

# Evaluate base estimators on test (preproc -> predict)
print("\nBase estimator performance on hold-out test (after preprocessing pipeline):")
for name, est in [("RF-tuned", best_rf), ("XGB-tuned", best_xgb), ("SVM-tuned", best_svm)]:
    preds = est.predict(X_test_trans)
    acc = accuracy_score(y_test, preds)
    f1w = f1_score(y_test, preds, average="weighted")
    print(f"{name:12s} - Acc: {acc:.3f}, F1w: {f1w:.3f}")

# -------------------------------
# Build stacking meta-model: train on balanced transformed data, test on transformed test set
# -------------------------------
meta_clf = LogisticRegression(max_iter=3000, random_state=42)
stack = StackingClassifier(
    estimators=[("rf", best_rf), ("xgb", best_xgb), ("svm", best_svm)],
    final_estimator=meta_clf,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    passthrough=False,
    n_jobs=-1
)

print("\nTraining stacking classifier on balanced transformed train set...")
stack.fit(X_train_bal, y_train_bal)

# Evaluate stacking
stack_preds = stack.predict(X_test_trans)
acc_stack = accuracy_score(y_test, stack_preds)
f1w_stack = f1_score(y_test, stack_preds, average="weighted")
print(f"\nStacking Ensemble - Acc: {acc_stack:.3f}, F1w: {f1w_stack:.3f}")
print("\nClassification report (stack):\n", classification_report(y_test, stack_preds, digits=3))

# Also try a soft VotingClassifier (averages probabilities)
voting = VotingClassifier(estimators=[("rf", best_rf), ("xgb", best_xgb), ("svm", best_svm)], voting="soft", n_jobs=-1)
voting.fit(X_train_bal, y_train_bal)
v_preds = voting.predict(X_test_trans)
print("Voting Ensemble - Acc: {:.3f}, F1w: {:.3f}".format(accuracy_score(y_test, v_preds), f1_score(y_test, v_preds, average="weighted")))

# Cross-validated estimate of final stack (on balanced transformed training set)
cv_scores = cross_val_score(stack, X_train_bal, y_train_bal, cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42), scoring="f1_weighted", n_jobs=-1)
print("Stack CV on balanced data f1_weighted: {:.3f} ± {:.3f}".format(cv_scores.mean(), cv_scores.std()))

print("\nDone.")


Loaded data: (469, 109) Target class counts: [102 178 189]
Tuning Random Forest (this may take a while)...
Fitting 5 folds for each of 30 candidates, totalling 150 fits


Traceback (most recent call last):
  File [35m"/home/easwer/.local/share/mise/installs/python/3.13.7/lib/python3.13/multiprocessing/resource_tracker.py"[0m, line [35m295[0m, in [35mmain[0m
    raise ValueError(
        f'Cannot register {name} for automatic cleanup: '
        f'unknown resource type {rtype}')
[1;35mValueError[0m: [35mCannot register /dev/shm/joblib_memmapping_folder_7310_ca16bfb5427140e1b098511c459f6c84_a67c8970fc804bb98ef4c8fcd37f5118 for automatic cleanup: unknown resource type folder[0m
Traceback (most recent call last):
  File [35m"/home/easwer/.local/share/mise/installs/python/3.13.7/lib/python3.13/multiprocessing/resource_tracker.py"[0m, line [35m295[0m, in [35mmain[0m
    raise ValueError(
        f'Cannot register {name} for automatic cleanup: '
        f'unknown resource type {rtype}')
[1;35mValueError[0m: [35mCannot register /loky-7310-ehhhu0dn for automatic cleanup: unknown resource type semlock[0m
Traceback (most recent call last):
  Fil

XGB best score: 0.5836988255151401 best params: {'model__colsample_bytree': 0.9402339195076288, 'model__learning_rate': 0.13487080962675865, 'model__max_depth': 11, 'model__n_estimators': 233, 'model__subsample': 0.782613828193164}
XGB tuning time: 336.8s

Tuning SVM...
Fitting 5 folds for each of 15 candidates, totalling 75 fits


Traceback (most recent call last):
  File [35m"/home/easwer/.local/share/mise/installs/python/3.13.7/lib/python3.13/multiprocessing/resource_tracker.py"[0m, line [35m295[0m, in [35mmain[0m
    raise ValueError(
        f'Cannot register {name} for automatic cleanup: '
        f'unknown resource type {rtype}')
[1;35mValueError[0m: [35mCannot register /dev/shm/joblib_memmapping_folder_7310_623c1117fd7d4811844554543df9d8c5_5ae505fb63de4208b6fafb1bdaa47d7b for automatic cleanup: unknown resource type folder[0m
Traceback (most recent call last):
  File [35m"/home/easwer/.local/share/mise/installs/python/3.13.7/lib/python3.13/multiprocessing/resource_tracker.py"[0m, line [35m295[0m, in [35mmain[0m
    raise ValueError(
        f'Cannot register {name} for automatic cleanup: '
        f'unknown resource type {rtype}')
[1;35mValueError[0m: [35mCannot register /dev/shm/joblib_memmapping_folder_7310_ca16bfb5427140e1b098511c459f6c84_67fb605aa39e4886a061c0e7df80184b for automati

SVM best score: 0.5847863361344396 best params: {'model__C': 4.138040112561013, 'model__degree': 2, 'model__gamma': 'auto', 'model__kernel': 'rbf'}
SVM tuning time: 30.5s

Building final pipelines and stacking ensemble...


NameError: name 'Pipeline' is not defined