# Constants

In [109]:
import numpy as np
from sklearn.metrics import make_scorer

# Lexical feature columns removed from the dataset before modelling.
# NOTE(review): "Preceptual" is kept exactly as spelled here; it presumably has
# to match the column header in the CSV -- confirm before "fixing" the spelling.
DROPPED_LEXICAL_COLUMNS = [
    # miscellaneous categories
    "Swear", "Numbers", "Inhibition", "Preceptual",
    # emotion-related categories
    "Anxiety", "Anger", "Sadness",
    # topic category
    "Work",
    # function-word / part-of-speech categories
    "Articles", "Verbs", "Adverbs", "Prepositions", "Conjunctions", "Negations",
]

# Base names of the facial measurements; each one appears in the dataset once
# per summary statistic listed in `stats`.
facial_features = [
    "average_inner_brow_height", "average_outer_brow_height",
    "eye_open",
    "inner_lip_height", "lip_corner_distance", "outer_lip_height",
    "smile",
    "pitch", "roll", "yaw",
]
stats = ["max", "median", "min", "std", "mean"]

# Every "<feature>_<stat>" combination (feature-major order) is dropped,
# i.e. all facial columns are excluded from the model input.
DROPPED_FACIAL_FEATURES = [
    "_".join((feature, stat)) for feature in facial_features for stat in stats
]

# Facial columns that must NOT be re-scaled: these are already in [0, 1].
#
# The list is generated (statistic-major, same order as the original hand-written
# list) instead of being typed out, because the hand-written version listed
# "inner_lip_height_mean" twice and silently omitted "outer_lip_height_mean".
ALREADY_NORMALIZED_FEATURES = [
    f"{feature}_{stat}"
    for stat in ("mean", "std", "min", "max", "median")
    for feature in (
        "average_outer_brow_height",
        "average_inner_brow_height",
        "eye_open",
        "outer_lip_height",
        "inner_lip_height",
        "lip_corner_distance",
    )
]

# No prosodic columns are dropped at the moment.
DROPPED_PROSODIC_COLUMNS = []

# Features that a domain-aware selector should always retain.
# Each name needs its own trailing comma: without it Python concatenates
# adjacent string literals into one (non-existent) column name.
# NOTE: this list is reassigned further down in this same cell; the later
# assignment is the one that is actually in effect.
MUST_KEEP_FEATURES = [
    "pause_duration_avg",
    "average_outer_brow_height_mean",
    "average_inner_brow_height_mean",
    "outer_lip_height_mean",
    "Duration/Filler Words",
]

# Column holding the regression target.
TARGET_COLUMN = "RecommendHiring"
# Column whose values define the groups for group-aware cross-validation.
GROUPS_COLUMN = "cleaned_ids"


def pearson_corr(y_true, y_pred):
    """Return the Pearson correlation coefficient between two 1-D sequences.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, same length as ``y_true``.

    Returns:
        The off-diagonal entry of the 2x2 correlation matrix computed by
        ``np.corrcoef``, i.e. the correlation between ``y_true`` and ``y_pred``.
    """
    correlation_matrix = np.corrcoef(y_true, y_pred)
    # Row 0 is y_true, column 1 is y_pred.
    return correlation_matrix[0, 1]


# Scorers handed to scikit-learn's cross-validation helpers, keyed by the short
# name used when reporting results.
# Note: "neg_mean_absolute_error" follows scikit-learn's "higher is better"
# convention, so the reported MAE values are negated.
SCORING_METRICS = dict(
    r2="r2",
    mae="neg_mean_absolute_error",
    pearson=make_scorer(pearson_corr),  # Pearson Correlation Coefficient
)


# Effective value of MUST_KEEP_FEATURES (overrides the earlier assignment in
# this cell). Only one feature is currently forced; the other candidates are
# left here, disabled, for experimentation:
#   "pause_duration_avg",
#   "average_outer_brow_height_mean",
#   "average_inner_brow_height_mean",
#   "outer_lip_height_mean",
MUST_KEEP_FEATURES = ["Duration/Filler Words"]

# When False, the grid search is skipped and PIPELINE_PARAMS is applied instead.
HYPERPARAMETER_TUNING_ENABLED = False

# Fixed pipeline parameters, addressed with the "<step>__<param>" convention.
# The original annotation on this dict was "0.3032" -- presumably the score
# obtained with this configuration; TODO confirm which metric it refers to.
PIPELINE_PARAMS = dict(
    feature_selection__threshold=None,
    svr__C=0.1,
    svr__coef0=1.0,
    svr__degree=3,
    svr__epsilon=0.5,
    svr__gamma=0.1,
    svr__kernel="poly",
    svr__shrinking=True,
)

# Data Preprocessing

## Import Datasets

In [110]:
import pandas as pd
import os

datasets_folder_path = "../datasets"
features_csv_path = os.path.join(datasets_folder_path, "add.csv")
labels_csv_path = os.path.join(
    datasets_folder_path, "turker_scores_full_interview.csv"
)

# Features: one row per participant, indexed by participant id.
features_df = pd.read_csv(features_csv_path).set_index("participant_id")

# Labels: keep only the rows whose Worker is "AGGR"
# (presumably the aggregated score across raters -- TODO confirm).
labels_df = pd.read_csv(labels_csv_path).set_index("Participant")
labels_df = labels_df.loc[labels_df["Worker"] == "AGGR"]

# Lower-case both indexes so that the join keys match regardless of casing.
features_df.index = features_df.index.str.lower()
labels_df.index = labels_df.index.str.lower()

# Left join: every feature row is kept and the target column is appended last.
indexed_combined_df = features_df.join(labels_df[[TARGET_COLUMN]], how="left")

## Fill NaN Values

In [111]:
# Print number of missing values before any cleaning
print(indexed_combined_df.isna().sum().sum())

# Rows without a label cannot be used for supervised training or evaluation.
# The features/labels join is a left join, so unlabelled participants would
# otherwise receive the *mean* target below, i.e. a fabricated label.
# Drop them instead (no-op when every participant has a label).
missing_target_mask = indexed_combined_df[TARGET_COLUMN].isna()
if missing_target_mask.any():
    print(
        f"Dropping {int(missing_target_mask.sum())} row(s) with missing {TARGET_COLUMN}"
    )
    # .copy() so the column assignment below operates on an independent frame.
    indexed_combined_df = indexed_combined_df.loc[~missing_target_mask].copy()

# Fill remaining missing values (features only at this point) with the column
# mean; only numeric columns are touched.
numeric_cols = indexed_combined_df.select_dtypes(include=["number"]).columns
indexed_combined_df[numeric_cols] = indexed_combined_df[numeric_cols].fillna(
    indexed_combined_df[numeric_cols].mean()
)

# Print number of missing values after filling
print(indexed_combined_df.isna().sum().sum())

# Reset index so the participant id becomes the first regular column
combined_df = indexed_combined_df.reset_index()

16
0


## Drop Unnecessary Columns

In [112]:
# Gather every column scheduled for removal (facial, lexical, prosodic) and
# drop them in a single call. A missing column raises, which surfaces typos.
columns_to_drop = [
    *DROPPED_FACIAL_FEATURES,
    *DROPPED_LEXICAL_COLUMNS,
    *DROPPED_PROSODIC_COLUMNS,
]
combined_df = combined_df.drop(columns=columns_to_drop)

# Model

## Split Data

In [113]:
# Positional split: the first two columns are skipped (identifier columns --
# they do not appear in the printed feature list), the last column is the
# target, and everything in between is a model feature.
X, y = combined_df.iloc[:, 2:-1], combined_df.iloc[:, -1]

print(X.columns)

Index(['f0_mean', 'f0_min', 'f0_max', 'f0_range', 'f0_sd', 'intensity_mean',
       'intensity_min', 'intensity_max', 'intensity_range', 'intensity_sd',
       'f1_mean', 'f1_sd', 'f2_mean', 'f2_sd', 'f3_mean', 'f3_sd',
       'f2_f1_mean', 'f3_f1_mean', 'f2_f1_sd', 'f3_f1_sd', 'jitter', 'shimmer',
       'percent_unvoiced', 'percent_breaks', 'pause_duration_max',
       'pause_duration_avg', 'duration', 'Total Words', 'Unique Words',
       'Filler Words', 'Audio Duration (s)', 'Duration/Total Words',
       'Duration/Unique Words', 'Duration/Filler Words', 'Individual', 'We',
       'They', 'Non-Fluences', 'PosEmotion', 'NegEmotion', 'Cognitive',
       'Relativity', 'Quantifiers'],
      dtype='object')


## Pipeline Creation

In [114]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.model_selection import GroupKFold, cross_val_score, cross_validate
import sys

sys.path.append("..")
from models.domain_aware_selector import DomainAwareSelector

# Group label of every row, as strings, for the group-aware CV splitters.
groups_column = combined_df[GROUPS_COLUMN].astype(str).values

# Alternative preprocessing step (currently NOT part of the pipeline below):
# standardise every column except those already known to be normalised.
columns_to_scale = [
    column for column in X.columns if column not in ALREADY_NORMALIZED_FEATURES
]
preprocessor = ColumnTransformer(
    transformers=[("scale", StandardScaler(), columns_to_scale)],
    remainder="passthrough",
)

# Lasso with built-in cross-validation over alpha; its coefficients drive the
# SelectFromModel feature selection step.
lasso_feature_selection_model = LassoCV(
    alphas=np.logspace(-3, 0, 30),
    cv=5,
    max_iter=30000,
    random_state=42,
)

# Scale -> Lasso-based feature selection -> support vector regression.
pipeline = Pipeline(
    steps=[
        # ("preprocessor", preprocessor),
        ("scaler", StandardScaler()),
        (
            "feature_selection",
            SelectFromModel(estimator=lasso_feature_selection_model),
        ),
        # ('feature_selection', DomainAwareSelector(
        #     must_keep_features=MUST_KEEP_FEATURES,
        #     selector=SelectFromModel(lasso_feature_selection_model, max_features=10),
        # )),
        ("svr", SVR(kernel="rbf")),
    ]
)

## Hyperparameter Tuning

In [115]:
import time
from sklearn.model_selection import GridSearchCV
import sys


# Parameters that matter for every kernel.
shared_param_grid = {
    # Feature selection parameters
    "feature_selection__threshold": [None, "median"],
    # 'feature_selection__selector__threshold': [None, 'median'],
    # SVR parameters
    "svr__C": np.logspace(-2, 2, 5),  # [0.01, 0.1, 1, 10, 100]
    "svr__gamma": ["scale", "auto"] + list(np.logspace(-3, 1, 5)),  # More gamma options
    "svr__epsilon": [0.01, 0.1, 0.5, 1.0],
    "svr__shrinking": [
        True,
        False,
    ],  # the difference may be subtle—often the default (shrinking=True) works well
}

# One sub-grid per kernel. `degree` and `coef0` are ignored by the rbf kernel,
# so crossing them with it only refits identical models (4480 -> 2800
# combinations). GridSearchCV accepts a list of grids and searches their union.
param_grid = [
    {**shared_param_grid, "svr__kernel": ["rbf"]},
    {
        **shared_param_grid,
        "svr__kernel": ["poly"],
        "svr__degree": [2, 3],  # Only used for poly kernel
        "svr__coef0": [0.0, 1.0],  # Important for poly kernel
    },
]

if HYPERPARAMETER_TUNING_ENABLED:
    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=GroupKFold(n_splits=5),  # group-aware folds, split by `groups_column`
        scoring="r2",
        verbose=1,
        n_jobs=-1,  # Uses all available CPU cores for faster execution
    )
    # Trains models on different hyperparameter combinations using cross-validation.
    grid_search.fit(X, y, groups=groups_column)
    print("Best hyperparameters:", grid_search.best_params_)
    print(f"Best R² score from GridSearchCV: {grid_search.best_score_:.4f}")

## Store Trained Model

In [116]:
if not HYPERPARAMETER_TUNING_ENABLED:
    # No search was run: apply the fixed parameters and fit on the full data.
    pipeline.set_params(**PIPELINE_PARAMS).fit(X, y)
else:
    # Reuse the best pipeline found (and already refitted) by the grid search.
    pipeline = grid_search.best_estimator_

## Feature Selection Results

In [117]:
# Fitted feature-selection step of the trained pipeline.
feature_selector = pipeline.named_steps["feature_selection"]

# get_support returns a boolean mask (True = feature kept)
selected_mask = feature_selector.get_support()
selected_features_bool_mask = feature_selector.get_support()

# Translate the mask back to column names of the original feature frame.
selected_feature_names = X.columns[selected_features_bool_mask]
unselected_feature_names = X.columns[~selected_features_bool_mask]

print(f"Number of Selected features ({len(selected_feature_names)}):")
print(f"Selected features ({selected_feature_names}):")
print(f"Unselected features :{unselected_feature_names}):")

Number of Selected features (12):
Selected features (Index(['f0_sd', 'intensity_mean', 'f2_mean', 'f3_mean', 'f3_sd', 'f2_f1_mean',
       'percent_unvoiced', 'percent_breaks', 'Duration/Total Words', 'They',
       'NegEmotion', 'Cognitive'],
      dtype='object')):
Unselected features :Index(['f0_mean', 'f0_min', 'f0_max', 'f0_range', 'intensity_min',
       'intensity_max', 'intensity_range', 'intensity_sd', 'f1_mean', 'f1_sd',
       'f2_sd', 'f3_f1_mean', 'f2_f1_sd', 'f3_f1_sd', 'jitter', 'shimmer',
       'pause_duration_max', 'pause_duration_avg', 'duration', 'Total Words',
       'Unique Words', 'Filler Words', 'Audio Duration (s)',
       'Duration/Unique Words', 'Duration/Filler Words', 'Individual', 'We',
       'Non-Fluences', 'PosEmotion', 'Relativity', 'Quantifiers'],
      dtype='object')):


## Model Evaluation Using Cross Validation

In [None]:
from sklearn.model_selection import KFold

# Evaluate every metric in SCORING_METRICS with group-aware 5-fold
# cross-validation; groups are never split across train and test folds.
group_kfold = GroupKFold(n_splits=5)
cv_results = cross_validate(
    estimator=pipeline,
    X=X,
    y=y,
    groups=groups_column,
    scoring=SCORING_METRICS,
    cv=group_kfold,
    return_train_score=False,
    return_estimator=True,  # To access feature selection details
)

R2 (MC CV): 0.227 (±0.226)
MAE (MC CV): -0.492 (±0.061)
PEARSON (MC CV): 0.519 (±0.180)


In [119]:
# Mean and standard deviation across folds for each scorer.
# "mae" comes from a negated scorer, so its value is printed with a minus sign.
for metric in SCORING_METRICS:
    fold_scores = cv_results[f"test_{metric}"]
    mean_score, std_score = fold_scores.mean(), fold_scores.std()
    print(f"{metric.upper()} (CV): {mean_score:.3f} (±{std_score:.3f})")

R2 (CV): 0.227 (±0.226)
MAE (CV): -0.492 (±0.061)
PEARSON (CV): 0.519 (±0.180)


In [None]:
from sklearn import clone
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import r2_score
import numpy as np
from scipy.stats import pearsonr

N_TRIALS = 1000
TEST_SIZE = 0.2  # 80% training, 20% testing

# GroupShuffleSplit keeps all rows of one group (participant) on the same side
# of every split; the fixed random_state makes the splits reproducible.
gss = GroupShuffleSplit(n_splits=N_TRIALS, test_size=TEST_SIZE, random_state=42)

r2_scores = []
pearson_scores = []

for train_idx, test_idx in gss.split(X, y, groups=groups_column):  # runs n_splits times
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # Fresh, unfitted copy of the pipeline so trials do not share state.
    model = clone(pipeline)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Coefficient of determination on the held-out groups.
    r2_scores.append(r2_score(y_test, y_pred))

    # Pearson correlation on the held-out groups (p-value discarded).
    pearson_coef, _ = pearsonr(y_test, y_pred)
    pearson_scores.append(pearson_coef)

# Arrays make the summary statistics below one-liners.
r2_scores = np.array(r2_scores)
pearson_scores = np.array(pearson_scores)

# Average performance and its variability over all trials.
print(f"Mean R² Score over {N_TRIALS} trials: {r2_scores.mean():.2f} (±{r2_scores.std():.2f})")
print(f"Mean Pearson Correlation over {N_TRIALS} trials: {pearson_scores.mean():.2f} (±{pearson_scores.std():.2f})")