# Constants

In [None]:
import numpy as np
from sklearn.metrics import make_scorer

HYPERPARAMETER_TUNING = TRUE

DROPPED_LEXICAL_COLUMNS = [
    "Swear",
    "Numbers",
    "Inhibition",
    "Preceptual",
    "Anxiety",
    "Anger",
    "Sadness",
    "Work",
    "Articles",
    "Verbs",
    "Adverbs",
    "Prepositions",
    "Conjunctions",
    "Negations",
]
DROPPED_FACIAL_COLUMNS = [
    "average_inner_brow_height_max",
    "average_inner_brow_height_median",
    "average_inner_brow_height_min",
    "average_outer_brow_height_max",
    "average_outer_brow_height_median",
    "average_outer_brow_height_min",
    "eye_open_max",
    "eye_open_median",
    "eye_open_min",
    "inner_lip_height_max",
    "inner_lip_height_median",
    "inner_lip_height_min",
    "lip_corner_distance_max",
    "lip_corner_distance_median",
    "lip_corner_distance_min",
    "outer_lip_height_max",
    "outer_lip_height_median",
    "outer_lip_height_min",
    "smile_max",
    "smile_median",
    "smile_min",
    "pitch_max",
    "pitch_median",
    "pitch_min",
    "roll_max",
    "roll_median",
    "roll_min",
    "yaw_max",
    "yaw_median",
    "yaw_min",
]
DROPPED_PROSODIC_COLUMNS = []
MUST_KEEP_FEATURES = [
    "pause_duration_avg" "average_outer_brow_height_mean",
    "average_inner_brow_height_mean" "outer_lip_height_mean" "Duration/Filler Words",
]
TARGET_COLUMN = "RecommendHiring"
GROUPS_COLUMN = "cleaned_ids"

def pearson_corr(y_true, y_pred):
    return np.corrcoef(y_true, y_pred)[0, 1]
SCORING_METRICS = {
    "r2": "r2",
    "mae": "neg_mean_absolute_error",
    "pearson": make_scorer(pearson_corr),  # Pearson Correlation Coefficient
}

ALREADY_NORMALIZED_FEATURES = [
    "average_outer_brow_height_mean",
    "average_inner_brow_height_mean",
    "eye_open_mean",
    "inner_lip_height_mean",
    "inner_lip_height_mean",
    "lip_corner_distance_mean",
    "average_outer_brow_height_std",
    "average_inner_brow_height_std",
    "eye_open_std",
    "outer_lip_height_std",
    "inner_lip_height_std",
    "lip_corner_distance_std",
    "average_outer_brow_height_min",
    "average_inner_brow_height_min",
    "eye_open_min",
    "outer_lip_height_min",
    "inner_lip_height_min",
    "lip_corner_distance_min",
    "average_outer_brow_height_max",
    "average_inner_brow_height_max",
    "eye_open_max",
    "outer_lip_height_max",
    "inner_lip_height_max",
    "lip_corner_distance_max",
    "average_outer_brow_height_median",
    "average_inner_brow_height_median",
    "eye_open_median",
    "outer_lip_height_median",
    "inner_lip_height_median",
    "lip_corner_distance_median",
]  # these are already in [0, 1]

# Data Preprocessing

## Import Datasets

In [2]:
import pandas as pd
import os

datasets_folder_path = "../datasets"

features_df = pd.read_csv(os.path.join(datasets_folder_path, 'add.csv'))
features_df = features_df.set_index('participant_id')

labels_df = pd.read_csv(os.path.join(datasets_folder_path, 'turker_scores_full_interview.csv'))
labels_df = labels_df.set_index('Participant')
labels_df = labels_df.loc[labels_df['Worker'] == 'AGGR']

features_df.index = features_df.index.str.lower()
labels_df.index = labels_df.index.str.lower()
indexed_combined_df = features_df.join(labels_df[[TARGET_COLUMN]], how='left')

## Fill NaN Values

In [3]:
# Print number of missing values before filling
print(indexed_combined_df.isna().sum().sum())

# Fill missing values only in numeric columns
numeric_cols = indexed_combined_df.select_dtypes(include=['number']).columns
indexed_combined_df[numeric_cols] = indexed_combined_df[numeric_cols].fillna(indexed_combined_df[numeric_cols].mean())

# Print number of missing values after filling
print(indexed_combined_df.isna().sum().sum())

# Reset index
combined_df = indexed_combined_df.reset_index()

16
0


## Drop Unnecessary Columns

In [4]:
combined_df= combined_df.drop(columns=DROPPED_FACIAL_COLUMNS + DROPPED_LEXICAL_COLUMNS + DROPPED_PROSODIC_COLUMNS)

# Split Data

In [46]:
X = combined_df.iloc[:, 2:-1]
y = combined_df.iloc[:, -1]

# Train SVR Model

In [None]:
# import numpy as np
# from sklearn.compose import ColumnTransformer
# from sklearn.feature_selection import SelectFromModel
# from sklearn.linear_model import Lasso, LassoCV
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import StandardScaler
# from sklearn.svm import SVR
# from sklearn.model_selection import GroupKFold, cross_val_score, cross_validate

# import sys
# # sys.path.append('..')
# # from models.domain_aware_selector import DomainAwareSelector

# groups_column = combined_df[GROUPS_COLUMN].astype(str).values  

# lasso_feature_selection_model = LassoCV(cv=5, random_state=42, max_iter=30000, alphas=np.logspace(-5, 2, 100),)

# # preprocessor = ColumnTransformer(
# #     transformers=[("scale", StandardScaler(),[column for column in X.columns if column not in ALREADY_NORMALIZED_FEATURES])],
# #     remainder="passthrough", 
# # )

# pipeline = Pipeline([
#     # ('preprocessor', preprocessor),
#     ('scaler', StandardScaler()), 
#     ('feature_selection', SelectFromModel(estimator=lasso_feature_selection_model)),
#     ('svr', SVR(kernel='rbf',)) 
# ])


import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.model_selection import GroupKFold, cross_val_score, cross_validate

groups_column = combined_df[GROUPS_COLUMN].astype(str).values

lasso_feature_selection_model = LassoCV(cv=5, random_state=42, max_iter=30000, alphas=np.logspace(-3, 2, 20),)
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_selection', SelectFromModel(estimator=lasso_feature_selection_model)),
    ('svr', SVR(kernel='rbf'))
])

MUST_KEEP_FEATURES = ['pause_duration_avg' 'average_outer_brow_height_mean', 'average_inner_brow_height_mean' 'outer_lip_height_mean'  'Duration/Filler Words']


# Hyperparameter Tuning

In [52]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    # Feature selection parameters
    'feature_selection__threshold': [None, 'median'],  # Control feature retention

    # SVR parameters
    'svr__C': np.logspace(-2, 2, 5),  # [0.01, 0.1, 1, 10, 100]
    'svr__gamma': ['scale', 'auto'] + list(np.logspace(-3, 1, 5)),  # More gamma options
    'svr__epsilon': [0.01, 0.1, 0.5, 1.0],
    'svr__kernel': ['rbf', 'poly'],  # Added poly kernel
    'svr__degree': [2, 3],  # Only used for poly kernel
    'svr__coef0': [0.0, 1.0],  # Important for poly kernel

    # Experimental parameters
    'svr__shrinking': [True, False]  # Sometimes helps with complex datasets
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=GroupKFold(n_splits=3),  # Faster with 3 folds
    scoring='r2',
    verbose= 1,
    n_jobs=-1   # Uses all available CPU cores for faster execution
)
grid_search.fit(X, y, groups=groups_column) # Trains models on different hyperparameter combinations using cross-validation.
print("Best hyperparameters:", grid_search.best_params_)
print(f"Best R² score from GridSearchCV: {grid_search.best_score_:.4f}")

Fitting 3 folds for each of 4480 candidates, totalling 13440 fits


KeyboardInterrupt: 

In [None]:
best_pipeline = grid_search.best_estimator_

selected_mask = best_pipeline.named_steps['feature_selection'].get_support()

selected_features_bool_mask = best_pipeline.named_steps['feature_selection'].get_support()  # get_support returns a boolean mask
selected_feature_names = X.columns[selected_features_bool_mask]
unselected_feature_names = X.columns[~selected_features_bool_mask]
print(f"Number of Selected features ({len(selected_feature_names)}):")
print(f"Selected features ({selected_feature_names}):")
print(f"Unselected features :{unselected_feature_names}):")

Number of Selected features (27):
Selected features (Index(['f0_sd', 'intensity_mean', 'f3_mean', 'f3_sd', 'f2_f1_mean', 'jitter',
       'percent_unvoiced', 'percent_breaks', 'eye_open_mean',
       'inner_lip_height_mean', 'lip_corner_distance_mean', 'smile_mean',
       'pitch_mean', 'yaw_mean', 'roll_mean', 'eye_open_std',
       'inner_lip_height_std', 'lip_corner_distance_std', 'yaw_std',
       'roll_std', 'Duration/Total Words', 'Individual', 'They', 'PosEmotion',
       'NegEmotion', 'Cognitive', 'Quantifiers'],
      dtype='object')):
Unselected features :Index(['f0_mean', 'f0_min', 'f0_max', 'f0_range', 'intensity_min',
       'intensity_max', 'intensity_range', 'intensity_sd', 'f1_mean', 'f1_sd',
       'f2_mean', 'f2_sd', 'f3_f1_mean', 'f2_f1_sd', 'f3_f1_sd', 'shimmer',
       'pause_duration_max', 'pause_duration_avg', 'duration',
       'average_outer_brow_height_mean', 'average_inner_brow_height_mean',
       'outer_lip_height_mean', 'average_outer_brow_height_std',
   

# Model Evaluation

In [None]:
cv_results = cross_validate(
    best_pipeline,
    X,
    y,
    cv=GroupKFold(n_splits=5),
    groups=groups_column,
    scoring=SCORING_METRICS,
    return_train_score=False,
    return_estimator=True  # To access feature selection details
)

In [None]:
for metric in SCORING_METRICS.keys():
    mean_score = cv_results[f'test_{metric}'].mean()
    std_score = cv_results[f'test_{metric}'].std()
    print(f"{metric.upper()} (CV): {mean_score:.3f} (±{std_score:.3f})")

R2 (CV): 0.137 (±0.213)
MAE (CV): -0.522 (±0.054)
PEARSON (CV): 0.441 (±0.172)
