# Constants

In [185]:
import numpy as np
from sklearn.metrics import make_scorer

DROPPED_LEXICAL_COLUMNS = [
    "Swear",
    "Numbers",
    "Inhibition",
    "Preceptual",
    "Anxiety",
    "Anger",
    "Sadness",
    "Work",
    "Articles",
    "Verbs",
    "Adverbs",
    "Prepositions",
    "Conjunctions",
    "Negations",
]

facial_features = [
    "average_inner_brow_height",
    "average_outer_brow_height",
    "eye_open",
    "inner_lip_height",
    "lip_corner_distance",
    "outer_lip_height",
    "smile",
    "pitch",
    "roll",
    "yaw",
]
stats = ["max", "median", "min", "std", "mean"]
DROPPED_FACIAL_FEATURES = [
    f"{feature}_{stat}" for feature in facial_features for stat in stats
]

ALREADY_NORMALIZED_FEATURES = [
    "average_outer_brow_height_mean",
    "average_inner_brow_height_mean",
    "eye_open_mean",
    "inner_lip_height_mean",
    "inner_lip_height_mean",
    "lip_corner_distance_mean",
    "average_outer_brow_height_std",
    "average_inner_brow_height_std",
    "eye_open_std",
    "outer_lip_height_std",
    "inner_lip_height_std",
    "lip_corner_distance_std",
    "average_outer_brow_height_min",
    "average_inner_brow_height_min",
    "eye_open_min",
    "outer_lip_height_min",
    "inner_lip_height_min",
    "lip_corner_distance_min",
    "average_outer_brow_height_max",
    "average_inner_brow_height_max",
    "eye_open_max",
    "outer_lip_height_max",
    "inner_lip_height_max",
    "lip_corner_distance_max",
    "average_outer_brow_height_median",
    "average_inner_brow_height_median",
    "eye_open_median",
    "outer_lip_height_median",
    "inner_lip_height_median",
    "lip_corner_distance_median",
]  # these are already in [0, 1]

DROPPED_PROSODIC_COLUMNS = []
MUST_KEEP_FEATURES = [
    "pause_duration_avg",
    "average_outer_brow_height_mean",
    "average_inner_brow_height_mean",
    "outer_lip_height_mean",
    "Duration/Filler Words",
]

TARGET_COLUMN = "RecommendHiring"
GROUPS_COLUMN = "cleaned_ids"
INDEX_COLUMN = "participant_id"


def pearson_corr(y_true, y_pred):
    return np.corrcoef(y_true, y_pred)[0, 1]


SCORING_METRICS = {
    "r2": "r2",
    "mae": "neg_mean_absolute_error",
    "pearson": make_scorer(pearson_corr),  # Pearson Correlation Coefficient
}


MUST_KEEP_FEATURES = [
    # "pause_duration_avg",
    # "average_outer_brow_height_mean",
    # "average_inner_brow_height_mean",
    # "outer_lip_height_mean",
    "Duration/Filler Words",
]

PIPELINE_PARAMS = {
    "feature_selection__threshold": None,
    "svr__C": 0.1,
    "svr__coef0": 1.0,
    "svr__degree": 3,
    "svr__epsilon": 0.5,
    "svr__gamma": 0.1,
    "svr__kernel": "poly",
    "svr__shrinking": True,
    # "feature_selection__threshold": None,
    # "svr__C": 10.0,
    # "svr__coef0": 0.0,
    # "svr__degree": 2,
    # "svr__epsilon": 0.01,
    # "svr__gamma": 0.001,
    # "svr__kernel": "rbf",
    # "svr__shrinking": True,
}
HYPERPARAMETER_TUNING_ENABLED = True

# Data Preprocessing

## Import Datasets

In [186]:
import pandas as pd
import os

datasets_folder_path = "../datasets"

features_df = pd.read_csv(os.path.join(datasets_folder_path, "add.csv"))
features_df = features_df.set_index("participant_id")

labels_df = pd.read_csv(
    os.path.join(datasets_folder_path, "turker_scores_full_interview.csv")
)
labels_df = labels_df.set_index("Participant")
labels_df = labels_df.loc[labels_df["Worker"] == "AGGR"]

features_df.index = features_df.index.str.lower()
labels_df.index = labels_df.index.str.lower()
indexed_combined_df = features_df.join(labels_df[[TARGET_COLUMN]], how="left")

## Drop Unnecessary Columns

In [187]:
combined_df = indexed_combined_df.reset_index(drop=True)
combined_df = combined_df.drop(
    columns=DROPPED_FACIAL_FEATURES + DROPPED_LEXICAL_COLUMNS + DROPPED_PROSODIC_COLUMNS
)

# Model

## Split Data

In [188]:
X = combined_df.drop(columns=[TARGET_COLUMN, GROUPS_COLUMN])
y = combined_df[TARGET_COLUMN]

## Pipeline Creation

In [189]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, LassoCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.model_selection import GroupKFold, cross_val_score, cross_validate
import sys

sys.path.append("..")
from models.domain_aware_selector import DomainAwareSelector

groups_column = combined_df[GROUPS_COLUMN].astype(str).values

lasso_feature_selection_model = LassoCV(
    cv=5,
    random_state=42,
    max_iter=30000,
    alphas=np.logspace(-3, 0, 30),
)
pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="mean")),  # NaN imputation
        ("scaler", StandardScaler()),
        ("feature_selection", SelectFromModel(estimator=Lasso(max_iter=10000))),
        # ('feature_selection', DomainAwareSelector(
        #     must_keep_features=MUST_KEEP_FEATURES,
        #     selector=SelectFromModel(lasso_feature_selection_model, max_features=10),
        # )),
        ("svr", SVR(kernel="rbf")),
    ]
)

## Hyperparameter Tuning

In [None]:
import optuna
from sklearn.base import clone
from sklearn.model_selection import GroupShuffleSplit, cross_val_score
import numpy as np

def objective(trial):
    """Optimized objective function for Optuna."""
    pipeline_clone = clone(pipeline)  # Clone pipeline for thread safety
    
    params = {
        "feature_selection__estimator__alpha": trial.suggest_float(
            "feature_selection__estimator__alpha", 1e-3, 0.3, log=True
        ),
        "svr__C": trial.suggest_float("svr__C", 0.01, 100, log=True),
        "svr__gamma": trial.suggest_float("svr__gamma", 1e-3, 1e1, log=True),
        "svr__epsilon": trial.suggest_float("svr__epsilon", 0.01, 0.5),
        "svr__kernel": trial.suggest_categorical("svr__kernel", ["rbf", "poly"]),
        "svr__degree": trial.suggest_int("svr__degree", 2, 3),
        "svr__coef0": trial.suggest_float("svr__coef0", 0.0, 1.0),
    }
    pipeline_clone.set_params(**params)
    
    mc_cv_tuning = GroupShuffleSplit(n_splits=20, test_size=0.2, random_state=42)
    scores = cross_val_score(
        pipeline_clone, X, y, cv=mc_cv_tuning, groups=groups_column, n_jobs=-1
    )
    return np.mean(scores)

if HYPERPARAMETER_TUNING_ENABLED:
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=100, n_jobs=-1)
    
    print("Best hyperparameters:", study.best_params)
    print(f"Best R² score: {study.best_value:.4f}")

[I 2025-04-01 21:57:02,702] A new study created in memory with name: no-name-751a5453-8058-4f69-95a7-20db0f6f164f
[I 2025-04-01 21:57:03,650] Trial 1 finished with value: -0.018087695478939285 and parameters: {'feature_selection__estimator__alpha': 0.011470995511572721, 'svr__C': 9.962059528861575, 'svr__gamma': 0.019267353830607663, 'svr__epsilon': 0.4746906967314501, 'svr__kernel': 'poly', 'svr__degree': 2, 'svr__coef0': 0.467576464509749}. Best is trial 1 with value: -0.018087695478939285.
[I 2025-04-01 21:57:03,709] Trial 5 finished with value: -1.958142242305787 and parameters: {'feature_selection__estimator__alpha': 0.008103591266739273, 'svr__C': 1.7672242157903946, 'svr__gamma': 6.936119661678552, 'svr__epsilon': 0.021596793429983024, 'svr__kernel': 'poly', 'svr__degree': 2, 'svr__coef0': 0.19697676646155138}. Best is trial 1 with value: -0.018087695478939285.
[I 2025-04-01 21:57:03,770] Trial 0 finished with value: -0.9362318657763582 and parameters: {'feature_selection__estim

ValueError: 
All the 20 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/bassel27/personal_projects/hireverse/myenv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/bassel27/personal_projects/hireverse/myenv/lib/python3.12/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bassel27/personal_projects/hireverse/myenv/lib/python3.12/site-packages/sklearn/pipeline.py", line 660, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/Users/bassel27/personal_projects/hireverse/myenv/lib/python3.12/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bassel27/personal_projects/hireverse/myenv/lib/python3.12/site-packages/sklearn/svm/_base.py", line 196, in fit
    X, y = validate_data(
           ^^^^^^^^^^^^^^
  File "/Users/bassel27/personal_projects/hireverse/myenv/lib/python3.12/site-packages/sklearn/utils/validation.py", line 2961, in validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bassel27/personal_projects/hireverse/myenv/lib/python3.12/site-packages/sklearn/utils/validation.py", line 1370, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "/Users/bassel27/personal_projects/hireverse/myenv/lib/python3.12/site-packages/sklearn/utils/validation.py", line 1139, in check_array
    raise ValueError(
ValueError: Found array with 0 feature(s) (shape=(110, 0)) while a minimum of 1 is required by SVR.

--------------------------------------------------------------------------------
16 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/bassel27/personal_projects/hireverse/myenv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/bassel27/personal_projects/hireverse/myenv/lib/python3.12/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bassel27/personal_projects/hireverse/myenv/lib/python3.12/site-packages/sklearn/pipeline.py", line 660, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/Users/bassel27/personal_projects/hireverse/myenv/lib/python3.12/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bassel27/personal_projects/hireverse/myenv/lib/python3.12/site-packages/sklearn/svm/_base.py", line 196, in fit
    X, y = validate_data(
           ^^^^^^^^^^^^^^
  File "/Users/bassel27/personal_projects/hireverse/myenv/lib/python3.12/site-packages/sklearn/utils/validation.py", line 2961, in validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bassel27/personal_projects/hireverse/myenv/lib/python3.12/site-packages/sklearn/utils/validation.py", line 1370, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "/Users/bassel27/personal_projects/hireverse/myenv/lib/python3.12/site-packages/sklearn/utils/validation.py", line 1139, in check_array
    raise ValueError(
ValueError: Found array with 0 feature(s) (shape=(109, 0)) while a minimum of 1 is required by SVR.


## Store Trained Model

In [None]:
from sklearn import clone

if HYPERPARAMETER_TUNING_ENABLED:
    pipeline.set_params(**study.best_params)
else:
    pipeline.set_params(**PIPELINE_PARAMS)
model = clone(pipeline)
model.fit(X, y)

## Feature Selection Results

In [None]:
selected_mask = model.named_steps["feature_selection"].get_support()

selected_features_bool_mask = model.named_steps[
    "feature_selection"
].get_support()  # get_support returns a boolean mask
selected_feature_names = X.columns[selected_features_bool_mask]
unselected_feature_names = X.columns[~selected_features_bool_mask]
print(f"Number of Selected features ({len(selected_feature_names)}):")
print(f"Selected features ({selected_feature_names}):")
print(f"Unselected features :{unselected_feature_names}):")

Number of Selected features (8):
Selected features (Index(['intensity_mean', 'f3_sd', 'f2_f1_mean', 'percent_unvoiced',
       'percent_breaks', 'Duration/Total Words', 'They', 'Cognitive'],
      dtype='object')):
Unselected features :Index(['f0_mean', 'f0_min', 'f0_max', 'f0_range', 'f0_sd', 'intensity_min',
       'intensity_max', 'intensity_range', 'intensity_sd', 'f1_mean', 'f1_sd',
       'f2_mean', 'f2_sd', 'f3_mean', 'f3_f1_mean', 'f2_f1_sd', 'f3_f1_sd',
       'jitter', 'shimmer', 'pause_duration_max', 'pause_duration_avg',
       'duration', 'Total Words', 'Unique Words', 'Filler Words',
       'Audio Duration (s)', 'Duration/Unique Words', 'Duration/Filler Words',
       'Individual', 'We', 'Non-Fluences', 'PosEmotion', 'NegEmotion',
       'Relativity', 'Quantifiers'],
      dtype='object')):


# Monte Carlo Cross Validation

In [None]:

from sklearn.model_selection import cross_validate

scoring = {
    "r2": make_scorer(r2_score),
    "pearson": make_scorer(pearson_corr)
}

results = cross_validate(
    pipeline,
    X,
    y,
    cv=GroupShuffleSplit(n_splits=1000, test_size=0.2, random_state=42),
    groups=groups_column,
    scoring=scoring,
    n_jobs=-1,
)

r2_scores = results["test_r2"]
pearson_scores = results["test_pearson"]
# Report the average performance and variability
print(
    f"Mean R² Score: {np.mean(r2_scores):.2f} (±{np.std(r2_scores):.2f})"
)
print(
    f"Mean Pearson Correlation: {np.mean(pearson_scores):.2f} (±{np.std(pearson_scores):.2f})"
)

Mean R² Score over 1000 trials: 0.17 (±0.28)
Mean Pearson Correlation over 1000 trials: 0.53 (±0.16)
