In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

# ======================================
# LOAD DATA
# ======================================
# Switch between cepstral/spectral files here
# df = pd.read_csv("20240106_dfall_obs_data_and_cepstral_features_revision1_n469.csv")
df = pd.read_csv("20231225_dfall_obs_data_and_spectral_features_revision1_n469.csv")

# ======================================
# TARGET SETUP
# ======================================
if "Context2" not in df.columns:
    raise ValueError("Expected column 'Context2' not found in dataset!")

target_col = "Context2"

# Encode target (emotion labels → numeric codes)
df[target_col] = df[target_col].astype("category").cat.codes

# ======================================
# FEATURE SELECTION
# ======================================
# Keep only numeric columns (drop text/object cols)
X = df.select_dtypes(include=[np.number]).copy()
y = df[target_col]

# Drop target from features if it’s numeric
X = X.drop(columns=[target_col], errors='ignore')

# Fill missing values with column means
X = X.fillna(X.mean())

# ======================================
# SPLIT DATA
# ======================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# ======================================
# PIPELINE STAGES
# ======================================
scaler = StandardScaler()
selector = SelectKBest(score_func=f_classif, k=min(60, X_train.shape[1]))
pca = PCA(n_components=min(25, X_train.shape[1]))

# ======================================
# MODELS
# ======================================
models = {
    "Random Forest": RandomForestClassifier(
        n_estimators=400, max_depth=25, min_samples_split=3, min_samples_leaf=1, random_state=42
    ),
    "XGBoost": XGBClassifier(
        n_estimators=400, learning_rate=0.04, max_depth=8, subsample=0.85, colsample_bytree=0.85, random_state=42
    ),
    "SVM": SVC(C=2.5, kernel='rbf', gamma='auto', probability=True, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=300, learning_rate=0.06, max_depth=6, random_state=42
    ),
    "Logistic Regression": LogisticRegression(
        C=3.0, max_iter=2000, solver='lbfgs', random_state=42
    )
}

# ======================================
# STACKED ENSEMBLE
# ======================================
estimators = [
    ('rf', models['Random Forest']),
    ('xgb', models['XGBoost']),
    ('svm', models['SVM'])
]

stacked = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(max_iter=1500, random_state=42),
    cv=5
)

# ======================================
# TRAIN + EVALUATE
# ======================================
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("=== CLASSIFYING EMOTION (Context2) ===")
for name, model in {**models, "Stacked Ensemble": stacked}.items():
    pipe = Pipeline([
        ('scaler', scaler),
        ('selector', selector),
        ('pca', pca),
        ('model', model)
    ])

    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    acc = accuracy_score(y_test, preds)
    scores = cross_val_score(pipe, X_train, y_train, cv=cv, scoring='accuracy')

    print(f"{name:20s} - Acc: {acc:.3f}, CV: {scores.mean():.3f} ± {scores.std():.3f}")

print("Done.")

=== CLASSIFYING EMOTION (Context2) ===


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


Random Forest        - Acc: 0.638, CV: 0.536 ± 0.036


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


XGBoost              - Acc: 0.617, CV: 0.528 ± 0.052


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


SVM                  - Acc: 0.713, CV: 0.563 ± 0.018


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


Gradient Boosting    - Acc: 0.681, CV: 0.520 ± 0.037


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


Logistic Regression  - Acc: 0.777, CV: 0.565 ± 0.048


  f = msb / msw
  f = msb / msw
  f = msb / msw
Traceback (most recent call last):
  File [35m"/home/easwer/.local/share/mise/installs/python/3.13.7/lib/python3.13/multiprocessing/resource_tracker.py"[0m, line [35m295[0m, in [35mmain[0m
    raise ValueError(
        f'Cannot register {name} for automatic cleanup: '
        f'unknown resource type {rtype}')
[1;35mValueError[0m: [35mCannot register /loky-3320-37eu9mgi for automatic cleanup: unknown resource type semlock[0m
Traceback (most recent call last):
  File [35m"/home/easwer/.local/share/mise/installs/python/3.13.7/lib/python3.13/multiprocessing/resource_tracker.py"[0m, line [35m295[0m, in [35mmain[0m
    raise ValueError(
        f'Cannot register {name} for automatic cleanup: '
        f'unknown resource type {rtype}')
[1;35mValueError[0m: [35mCannot register /loky-3320-hdn4e6id for automatic cleanup: unknown resource type semlock[0m
Traceback (most recent call last):
  File [35m"/home/easwer/.local/share/mi

Stacked Ensemble     - Acc: 0.723, CV: 0.555 ± 0.054
Done.
