# Input

In [1]:
# Name of the label column to predict: it is selected from the scores table
# when joining labels onto features and then used as the regression target `y`.
target_column = "RecommendHiring"

In [2]:
# Parameters
# NOTE(review): this repeats the assignment from the previous cell. It looks
# like an injected parameters cell (papermill-style) — confirm before removing.
target_column = "RecommendHiring"


# Constants

In [3]:
import os
import numpy as np
from sklearn.metrics import make_scorer

# Base directory used to resolve the dataset paths. Despite the name, this is
# the current working directory of the kernel, not the notebook's location.
SCRIPT_DIR = os.getcwd()

# Relative path of the saved-models directory.
SAVED_MODELS_PATH = "./regression_models/saved_models"

# Lexical feature columns removed by the pipeline's "dropper" step.
# NOTE(review): "Preceptual" is kept exactly as written; presumably it matches
# the dataset's column header — verify before "correcting" the spelling.
DROPPED_LEXICAL_COLUMNS = [
    "Swear", "Numbers", "Inhibition", "Preceptual",
    "Anxiety", "Anger", "Sadness", "Work",
    "Articles", "Verbs", "Adverbs",
    "Prepositions", "Conjunctions", "Negations",
]

# Base names of the facial measurements; each one appears in the feature table
# once per summary statistic listed in `stats`.
facial_features = [
    "average_inner_brow_height", "average_outer_brow_height",
    "eye_open",
    "inner_lip_height", "lip_corner_distance", "outer_lip_height",
    "smile",
    "pitch", "roll", "yaw",
]
stats = ["max", "median", "min", "std", "mean"]

# Every "<feature>_<stat>" combination, feature-major (all stats of the first
# feature, then all stats of the second, ...). These columns are removed by
# the pipeline's "dropper" step.
DROPPED_FACIAL_FEATURES = [
    "_".join((feature, stat)) for feature in facial_features for stat in stats
]


# No prosodic columns are dropped at the moment.
DROPPED_PROSODIC_COLUMNS = []

# Facial feature columns that are already scaled to [0, 1].
# Each statistic group lists the same six measurements in the same order:
# outer brow, inner brow, eye open, outer lip, inner lip, lip corner distance.
# Fix: the "mean" group previously listed "inner_lip_height_mean" twice and
# omitted "outer_lip_height_mean".
ALREADY_NORMALIZED_FEATURES = [
    # mean
    "average_outer_brow_height_mean",
    "average_inner_brow_height_mean",
    "eye_open_mean",
    "outer_lip_height_mean",
    "inner_lip_height_mean",
    "lip_corner_distance_mean",
    # std
    "average_outer_brow_height_std",
    "average_inner_brow_height_std",
    "eye_open_std",
    "outer_lip_height_std",
    "inner_lip_height_std",
    "lip_corner_distance_std",
    # min
    "average_outer_brow_height_min",
    "average_inner_brow_height_min",
    "eye_open_min",
    "outer_lip_height_min",
    "inner_lip_height_min",
    "lip_corner_distance_min",
    # max
    "average_outer_brow_height_max",
    "average_inner_brow_height_max",
    "eye_open_max",
    "outer_lip_height_max",
    "inner_lip_height_max",
    "lip_corner_distance_max",
    # median
    "average_outer_brow_height_median",
    "average_inner_brow_height_median",
    "eye_open_median",
    "outer_lip_height_median",
    "inner_lip_height_median",
    "lip_corner_distance_median",
]  # these are already in [0, 1]

# Features intended to be retained regardless of automatic feature selection.
# NOTE(review): this name is assigned again further down in this same cell
# with a shorter list, so this five-item value is overwritten before any later
# cell can read it. Keep only one definition once the intended set is decided.
MUST_KEEP_FEATURES = [
    "pause_duration_avg",
    "average_outer_brow_height_mean",
    "average_inner_brow_height_mean",
    "outer_lip_height_mean",
    "Duration/Filler Words",
]


# Column holding the group label of each row: it is excluded from the feature
# matrix and passed as `groups` to the grouped cross-validation splitter.
GROUPS_COLUMN = "cleaned_ids"
# Name of the participant identifier column.
INDEX_COLUMN = "participant_id"


def pearson_corr(y_true, y_pred):
    """Return the Pearson correlation coefficient between targets and predictions.

    Args:
        y_true: Array-like of observed target values.
        y_pred: Array-like of predicted values, aligned with ``y_true``.

    Returns:
        The off-diagonal entry ``[0, 1]`` of ``np.corrcoef(y_true, y_pred)``.
    """
    correlation_matrix = np.corrcoef(y_true, y_pred)
    return correlation_matrix[0, 1]


# Scorers keyed by short metric name: two built-in scikit-learn scorer strings
# plus a custom scorer wrapping `pearson_corr` (Pearson correlation coefficient).
SCORING_METRICS = dict(
    r2="r2",
    mae="neg_mean_absolute_error",
    pearson=make_scorer(pearson_corr),
)


# Effective definition of MUST_KEEP_FEATURES: this assignment replaces the
# longer list assigned earlier in this cell. The commented-out entries are the
# features from that earlier list that are currently disabled.
MUST_KEEP_FEATURES = [
    # "pause_duration_avg",
    # "average_outer_brow_height_mean",
    # "average_inner_brow_height_mean",
    # "outer_lip_height_mean",
    "Duration/Filler Words",
]

# Fixed pipeline hyperparameters, keyed in scikit-learn's
# "<step>__<param>" form (same keys the tuning objective samples).
PIPELINE_PARAMS = {
    "feature_selection__estimator__alpha": 0.057376790661083456,
    "svr__C": 0.655379988356498,
    "svr__gamma": 0.02784736494309893,
    "svr__epsilon": 0.2617249201838037,
    "svr__kernel": "rbf",
}

# When True, the hyperparameter-tuning cell runs the Optuna study.
HYPERPARAMETER_TUNING_ENABLED = True

# Data Preprocessing

## Import Datasets

In [4]:
import pandas as pd
import os

# Feature table, indexed by the participant identifier column. The shared
# INDEX_COLUMN constant is used instead of repeating the literal column name.
features_df = pd.read_csv(os.path.join(SCRIPT_DIR, "datasets", "add.csv"))
features_df = features_df.set_index(INDEX_COLUMN)

# Score table, indexed by participant; only the rows whose "Worker" value is
# "AGGR" are kept (presumably the aggregated rating per participant — confirm).
labels_df = pd.read_csv(
    os.path.join(SCRIPT_DIR, "datasets", "turker_scores_full_interview.csv")
)
labels_df = labels_df.set_index("Participant")
labels_df = labels_df.loc[labels_df["Worker"] == "AGGR"]

# Lower-case both indexes so the join is case-insensitive on participant ids.
features_df.index = features_df.index.str.lower()
labels_df.index = labels_df.index.str.lower()

# Left join: every feature row is kept, and rows without a matching label get
# NaN in the target column.
# NOTE(review): the downstream pipeline does not impute or drop missing
# targets — confirm every feature row has a label.
indexed_combined_df = features_df.join(labels_df[[target_column]], how="left")
# Positional index for modelling; the participant id index is discarded here.
combined_df = indexed_combined_df.reset_index(drop=True)

# Model

## Split Data

In [5]:
# Target vector: the column named by `target_column`.
y = combined_df[target_column]
# Feature matrix: every remaining column except the target and the group id.
X = combined_df.drop([target_column, GROUPS_COLUMN], axis="columns")

## Pipeline Creation

In [6]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, LassoCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.model_selection import GroupKFold, cross_val_score, cross_validate
import sys

sys.path.append("..")
from models.domain_aware_selector import DomainAwareSelector

# Group label of every row, as strings; passed as `groups` to the grouped CV splitter.
groups_column = combined_df[GROUPS_COLUMN].astype(str).values


# Step 1: drop the configured facial, lexical and prosodic columns and pass
# every other column through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        (
            "dropper",
            "drop",
            [
                *DROPPED_FACIAL_FEATURES,
                *DROPPED_LEXICAL_COLUMNS,
                *DROPPED_PROSODIC_COLUMNS,
            ],
        ),
    ],
    remainder="passthrough",
)

# Full modelling pipeline (unfitted; clone it before fitting):
#   preprocessor      -> column dropping (above)
#   imputer           -> replace NaN with the column mean
#   scaler            -> standardize features
#   feature_selection -> Lasso-based SelectFromModel
#   svr               -> RBF-kernel support vector regressor
# A DomainAwareSelector(must_keep_features=MUST_KEEP_FEATURES, selector=...)
# variant of the feature_selection step exists but is currently disabled.
unfitted_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
        ("feature_selection", SelectFromModel(Lasso(max_iter=50000))),
        ("svr", SVR(kernel="rbf")),
    ]
)

## Hyperparameter Tuning

In [7]:
import optuna
from sklearn.base import clone
from sklearn.model_selection import GroupShuffleSplit, cross_val_score
import numpy as np

def objective(trial):
    """Optuna objective: mean grouped cross-validation score for one trial.

    Samples the Lasso ``alpha`` used for feature selection and the SVR
    hyperparameters from ``trial``, applies them to a fresh clone of
    ``unfitted_pipeline`` and evaluates it on ``X``/``y`` with 20 grouped
    shuffle splits (20% held out per split, fixed ``random_state``).

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters.

    Returns:
        The mean of the per-split scores. No ``scoring`` argument is passed,
        so the pipeline's default ``score`` is used.
    """
    # (name, low, high, log-scale) for each float hyperparameter, in the order
    # they are sampled.
    float_search_space = (
        ("feature_selection__estimator__alpha", 1e-4, 0.1, True),
        ("svr__C", 0.01, 100, True),
        ("svr__gamma", 1e-3, 1e1, True),
        ("svr__epsilon", 0.01, 0.5, False),
    )

    # Work on a clone so concurrently running trials never share an estimator.
    candidate = clone(unfitted_pipeline)

    sampled = {
        name: trial.suggest_float(name, low, high, log=use_log)
        for name, low, high, use_log in float_search_space
    }
    # Only the RBF kernel is searched; the polynomial kernel (and its
    # degree/coef0 parameters) is currently disabled.
    sampled["svr__kernel"] = trial.suggest_categorical("svr__kernel", ["rbf"])

    candidate.set_params(**sampled)

    splitter = GroupShuffleSplit(n_splits=20, test_size=0.2, random_state=42)

    # n_jobs=1: the cross-validation itself runs sequentially inside a trial.
    fold_scores = cross_val_score(
        candidate, X, y, cv=splitter, groups=groups_column, n_jobs=1
    )

    return fold_scores.mean()

if HYPERPARAMETER_TUNING_ENABLED:
    # Seed the sampler (same seed as the CV splitter) so hyperparameter
    # suggestions are not drawn from an unseeded RNG.
    # NOTE: trials still run concurrently (n_jobs=-1) and the study stops on a
    # wall-clock timeout, so exact run-to-run reproducibility additionally
    # requires n_jobs=1 and a fixed number of trials.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    # At most 100 trials or 4 minutes, whichever comes first.
    study.optimize(objective, n_trials=100, n_jobs=-1, timeout=4 * 60)

    print("Best hyperparameters:", study.best_params)
    print(f"Best R² score: {study.best_value:.4f}")

  from .autonotebook import tqdm as notebook_tqdm
[I 2025-04-08 00:45:18,455] A new study created in memory with name: no-name-4d1a2985-45bc-490a-ad86-f952f12c6778


[I 2025-04-08 00:45:19,306] Trial 0 finished with value: 0.17350901171998703 and parameters: {'feature_selection__estimator__alpha': 0.015567668163072177, 'svr__C': 3.2737829675206935, 'svr__gamma': 0.018634431126422882, 'svr__epsilon': 0.36439877643924773, 'svr__kernel': 'rbf'}. Best is trial 0 with value: 0.17350901171998703.


[I 2025-04-08 00:45:19,318] Trial 6 finished with value: 0.049136390489087724 and parameters: {'feature_selection__estimator__alpha': 0.07993352056589773, 'svr__C': 0.05025331686876881, 'svr__gamma': 0.11033591525742865, 'svr__epsilon': 0.42770551739421897, 'svr__kernel': 'rbf'}. Best is trial 0 with value: 0.17350901171998703.


[I 2025-04-08 00:45:19,319] Trial 7 finished with value: 0.24887720948256264 and parameters: {'feature_selection__estimator__alpha': 0.09830661370366535, 'svr__C': 3.1560331817736675, 'svr__gamma': 0.0018701524065659792, 'svr__epsilon': 0.3146965258650922, 'svr__kernel': 'rbf'}. Best is trial 7 with value: 0.24887720948256264.


[I 2025-04-08 00:45:19,320] Trial 4 finished with value: 0.016455632630669376 and parameters: {'feature_selection__estimator__alpha': 0.060078203388261815, 'svr__C': 39.87758176128221, 'svr__gamma': 0.29906070135818663, 'svr__epsilon': 0.01311962815896581, 'svr__kernel': 'rbf'}. Best is trial 7 with value: 0.24887720948256264.


[I 2025-04-08 00:45:19,351] Trial 2 finished with value: -0.07539867161004057 and parameters: {'feature_selection__estimator__alpha': 0.04764989546441475, 'svr__C': 0.03243407289552119, 'svr__gamma': 0.0030585310140017964, 'svr__epsilon': 0.22976368426061564, 'svr__kernel': 'rbf'}. Best is trial 7 with value: 0.24887720948256264.


[I 2025-04-08 00:45:19,419] Trial 1 finished with value: -0.10268241951006987 and parameters: {'feature_selection__estimator__alpha': 0.0021455653736297633, 'svr__C': 0.21828485867764905, 'svr__gamma': 3.1331337345447468, 'svr__epsilon': 0.17083165604420944, 'svr__kernel': 'rbf'}. Best is trial 7 with value: 0.24887720948256264.


[I 2025-04-08 00:45:19,427] Trial 3 finished with value: -0.09722088979822842 and parameters: {'feature_selection__estimator__alpha': 0.002719312815457394, 'svr__C': 0.2083424547361178, 'svr__gamma': 1.228204695854617, 'svr__epsilon': 0.4181441882184514, 'svr__kernel': 'rbf'}. Best is trial 7 with value: 0.24887720948256264.


[I 2025-04-08 00:45:20,140] Trial 5 finished with value: 0.22367976675078555 and parameters: {'feature_selection__estimator__alpha': 0.0005874559948561746, 'svr__C': 1.0154421429498972, 'svr__gamma': 0.021798428010753455, 'svr__epsilon': 0.21924858416162413, 'svr__kernel': 'rbf'}. Best is trial 7 with value: 0.24887720948256264.


[I 2025-04-08 00:45:20,353] Trial 11 finished with value: 0.23363698018949183 and parameters: {'feature_selection__estimator__alpha': 0.013592318949064437, 'svr__C': 0.7726989997763725, 'svr__gamma': 0.00865517365772045, 'svr__epsilon': 0.30886229071828225, 'svr__kernel': 'rbf'}. Best is trial 7 with value: 0.24887720948256264.


[I 2025-04-08 00:45:20,377] Trial 9 finished with value: 0.2279223434544789 and parameters: {'feature_selection__estimator__alpha': 0.020715474711806216, 'svr__C': 0.5446302546863867, 'svr__gamma': 0.014970858376175355, 'svr__epsilon': 0.19366666765759802, 'svr__kernel': 'rbf'}. Best is trial 7 with value: 0.24887720948256264.


[I 2025-04-08 00:45:20,381] Trial 14 finished with value: 0.15822179323738456 and parameters: {'feature_selection__estimator__alpha': 0.08323797021989544, 'svr__C': 3.093319101800204, 'svr__gamma': 0.1838491679442266, 'svr__epsilon': 0.2528421063091114, 'svr__kernel': 'rbf'}. Best is trial 7 with value: 0.24887720948256264.


[I 2025-04-08 00:45:20,941] Trial 12 finished with value: -0.0840694623829581 and parameters: {'feature_selection__estimator__alpha': 0.0010998424735936178, 'svr__C': 0.0186123212272048, 'svr__gamma': 0.0023353859419117085, 'svr__epsilon': 0.4749867020476553, 'svr__kernel': 'rbf'}. Best is trial 7 with value: 0.24887720948256264.


[I 2025-04-08 00:45:21,067] Trial 8 finished with value: 0.14390417986461188 and parameters: {'feature_selection__estimator__alpha': 0.0009244557306497062, 'svr__C': 63.02218536234399, 'svr__gamma': 0.04021048979080345, 'svr__epsilon': 0.07168509721674264, 'svr__kernel': 'rbf'}. Best is trial 7 with value: 0.24887720948256264.


[I 2025-04-08 00:45:21,146] Trial 15 finished with value: 0.26074293293293743 and parameters: {'feature_selection__estimator__alpha': 0.08955972112217186, 'svr__C': 0.6442178929775443, 'svr__gamma': 0.030488035456915637, 'svr__epsilon': 0.19679127032814667, 'svr__kernel': 'rbf'}. Best is trial 15 with value: 0.26074293293293743.


[I 2025-04-08 00:45:21,422] Trial 16 finished with value: 0.02057717878066322 and parameters: {'feature_selection__estimator__alpha': 0.024364961539798734, 'svr__C': 0.0508215213050401, 'svr__gamma': 0.01925986799999699, 'svr__epsilon': 0.20211798586354998, 'svr__kernel': 'rbf'}. Best is trial 15 with value: 0.26074293293293743.


[I 2025-04-08 00:45:21,484] Trial 18 finished with value: 0.19824212011042383 and parameters: {'feature_selection__estimator__alpha': 0.008583861459370115, 'svr__C': 22.13340093378217, 'svr__gamma': 0.0010965422142834125, 'svr__epsilon': 0.3265417346862306, 'svr__kernel': 'rbf'}. Best is trial 15 with value: 0.26074293293293743.


[I 2025-04-08 00:45:21,491] Trial 17 finished with value: 0.20972457214334503 and parameters: {'feature_selection__estimator__alpha': 0.012946198296099365, 'svr__C': 20.088588185412743, 'svr__gamma': 0.001101422189907239, 'svr__epsilon': 0.33849148602675294, 'svr__kernel': 'rbf'}. Best is trial 15 with value: 0.26074293293293743.


[I 2025-04-08 00:45:21,728] Trial 13 finished with value: 0.04016231883349029 and parameters: {'feature_selection__estimator__alpha': 0.0003104499645143583, 'svr__C': 98.3611017905012, 'svr__gamma': 0.014346528868329298, 'svr__epsilon': 0.02537905519982132, 'svr__kernel': 'rbf'}. Best is trial 15 with value: 0.26074293293293743.


[I 2025-04-08 00:45:21,918] Trial 20 finished with value: 0.2308645172968215 and parameters: {'feature_selection__estimator__alpha': 0.012147557820421144, 'svr__C': 11.122525819264325, 'svr__gamma': 0.0010979573721388062, 'svr__epsilon': 0.31222038418313125, 'svr__kernel': 'rbf'}. Best is trial 15 with value: 0.26074293293293743.


[I 2025-04-08 00:45:22,285] Trial 25 finished with value: -0.09205683356075203 and parameters: {'feature_selection__estimator__alpha': 0.005631794006497526, 'svr__C': 5.678005002952588, 'svr__gamma': 0.5448941336503803, 'svr__epsilon': 0.12331502431070288, 'svr__kernel': 'rbf'}. Best is trial 15 with value: 0.26074293293293743.


[I 2025-04-08 00:45:22,638] Trial 10 finished with value: -0.03763222054716604 and parameters: {'feature_selection__estimator__alpha': 0.00015598518612062673, 'svr__C': 0.3940736210960707, 'svr__gamma': 0.09447539544440962, 'svr__epsilon': 0.4941271925420992, 'svr__kernel': 'rbf'}. Best is trial 15 with value: 0.26074293293293743.


[I 2025-04-08 00:45:22,820] Trial 27 finished with value: 0.27179886845497075 and parameters: {'feature_selection__estimator__alpha': 0.03504949569744124, 'svr__C': 2.0055069383926756, 'svr__gamma': 0.005038071956551019, 'svr__epsilon': 0.13661425141299813, 'svr__kernel': 'rbf'}. Best is trial 27 with value: 0.27179886845497075.


[I 2025-04-08 00:45:23,284] Trial 28 finished with value: 0.2723073977701415 and parameters: {'feature_selection__estimator__alpha': 0.03605035186219133, 'svr__C': 1.4757334196384273, 'svr__gamma': 0.0053953095047143945, 'svr__epsilon': 0.28537327993033, 'svr__kernel': 'rbf'}. Best is trial 28 with value: 0.2723073977701415.


[I 2025-04-08 00:45:23,530] Trial 29 finished with value: 0.2668598797908909 and parameters: {'feature_selection__estimator__alpha': 0.03674066993894598, 'svr__C': 1.836580263454697, 'svr__gamma': 0.0051126897397481596, 'svr__epsilon': 0.13832930130717921, 'svr__kernel': 'rbf'}. Best is trial 28 with value: 0.2723073977701415.


[I 2025-04-08 00:45:24,169] Trial 30 finished with value: 0.2631806742879436 and parameters: {'feature_selection__estimator__alpha': 0.03332949535437492, 'svr__C': 2.231251079948694, 'svr__gamma': 0.005487133993495437, 'svr__epsilon': 0.1445512405476591, 'svr__kernel': 'rbf'}. Best is trial 28 with value: 0.2723073977701415.


[I 2025-04-08 00:45:24,335] Trial 31 finished with value: 0.24005716055659257 and parameters: {'feature_selection__estimator__alpha': 0.03263311948166365, 'svr__C': 1.598709402119262, 'svr__gamma': 0.004651482070866006, 'svr__epsilon': 0.12552352918106235, 'svr__kernel': 'rbf'}. Best is trial 28 with value: 0.2723073977701415.


[I 2025-04-08 00:45:24,675] Trial 21 finished with value: 0.23091930346171136 and parameters: {'feature_selection__estimator__alpha': 0.0001830078164053614, 'svr__C': 10.020805049660401, 'svr__gamma': 0.0011453173925596377, 'svr__epsilon': 0.1294519700695383, 'svr__kernel': 'rbf'}. Best is trial 28 with value: 0.2723073977701415.


[I 2025-04-08 00:45:24,812] Trial 19 finished with value: 0.2320856268479458 and parameters: {'feature_selection__estimator__alpha': 0.0001589414206459391, 'svr__C': 16.817123737641037, 'svr__gamma': 0.0010463246298411987, 'svr__epsilon': 0.32530999461763, 'svr__kernel': 'rbf'}. Best is trial 28 with value: 0.2723073977701415.


[I 2025-04-08 00:45:25,082] Trial 32 finished with value: 0.2303520844844515 and parameters: {'feature_selection__estimator__alpha': 0.005583809967846833, 'svr__C': 1.2673668236064697, 'svr__gamma': 0.005605730706128647, 'svr__epsilon': 0.09773833485816741, 'svr__kernel': 'rbf'}. Best is trial 28 with value: 0.2723073977701415.


[I 2025-04-08 00:45:25,471] Trial 33 finished with value: 0.04300818752672092 and parameters: {'feature_selection__estimator__alpha': 0.005395204586218458, 'svr__C': 0.21884899514916625, 'svr__gamma': 0.05585408429977601, 'svr__epsilon': 0.0788649151009968, 'svr__kernel': 'rbf'}. Best is trial 28 with value: 0.2723073977701415.


[I 2025-04-08 00:45:25,561] Trial 22 finished with value: 0.23399078996120207 and parameters: {'feature_selection__estimator__alpha': 0.00012502778357121244, 'svr__C': 8.774064078955332, 'svr__gamma': 0.0014323012903375708, 'svr__epsilon': 0.12530673966035133, 'svr__kernel': 'rbf'}. Best is trial 28 with value: 0.2723073977701415.


[I 2025-04-08 00:45:25,657] Trial 24 finished with value: 0.25553534725812443 and parameters: {'feature_selection__estimator__alpha': 0.0001316689450412096, 'svr__C': 6.7621993500696975, 'svr__gamma': 0.00539520960110983, 'svr__epsilon': 0.12496835230475062, 'svr__kernel': 'rbf'}. Best is trial 28 with value: 0.2723073977701415.


[I 2025-04-08 00:45:25,740] Trial 34 finished with value: 0.08157650748619331 and parameters: {'feature_selection__estimator__alpha': 0.0063079541016861146, 'svr__C': 0.18585274581105798, 'svr__gamma': 0.006237097814167646, 'svr__epsilon': 0.08284445109842473, 'svr__kernel': 'rbf'}. Best is trial 28 with value: 0.2723073977701415.


[I 2025-04-08 00:45:25,839] Trial 23 finished with value: 0.26907238024728675 and parameters: {'feature_selection__estimator__alpha': 0.0001209026391939519, 'svr__C': 4.478776609335474, 'svr__gamma': 0.00462511644280001, 'svr__epsilon': 0.13209310811300312, 'svr__kernel': 'rbf'}. Best is trial 28 with value: 0.2723073977701415.


[I 2025-04-08 00:45:26,013] Trial 35 finished with value: 0.053025816035774255 and parameters: {'feature_selection__estimator__alpha': 0.006279670932769966, 'svr__C': 0.21183734415737301, 'svr__gamma': 0.05331537490078901, 'svr__epsilon': 0.07784073596247812, 'svr__kernel': 'rbf'}. Best is trial 28 with value: 0.2723073977701415.


[I 2025-04-08 00:45:26,153] Trial 36 finished with value: 0.1678490973863242 and parameters: {'feature_selection__estimator__alpha': 0.04304481324251985, 'svr__C': 5.229298141910233, 'svr__gamma': 0.04791965313095649, 'svr__epsilon': 0.2538940577987965, 'svr__kernel': 'rbf'}. Best is trial 28 with value: 0.2723073977701415.


[I 2025-04-08 00:45:26,296] Trial 26 finished with value: -0.09303231439453269 and parameters: {'feature_selection__estimator__alpha': 0.00010734038207377075, 'svr__C': 4.026577364606203, 'svr__gamma': 0.5266091349684497, 'svr__epsilon': 0.10552525721313369, 'svr__kernel': 'rbf'}. Best is trial 28 with value: 0.2723073977701415.


[I 2025-04-08 00:45:26,559] Trial 37 finished with value: 0.2922372006245864 and parameters: {'feature_selection__estimator__alpha': 0.03809262818881697, 'svr__C': 5.603017587925503, 'svr__gamma': 0.009511931361900896, 'svr__epsilon': 0.25942708450623053, 'svr__kernel': 'rbf'}. Best is trial 37 with value: 0.2922372006245864.


[I 2025-04-08 00:45:26,682] Trial 40 finished with value: 0.25730565398529043 and parameters: {'feature_selection__estimator__alpha': 0.03970752246941649, 'svr__C': 1.6342240170104354, 'svr__gamma': 0.0034919477053734632, 'svr__epsilon': 0.27697694806084616, 'svr__kernel': 'rbf'}. Best is trial 37 with value: 0.2922372006245864.


[I 2025-04-08 00:45:26,686] Trial 38 finished with value: 0.2795842068655224 and parameters: {'feature_selection__estimator__alpha': 0.03467376732816953, 'svr__C': 2.088485945754797, 'svr__gamma': 0.007374739063041242, 'svr__epsilon': 0.26831040277613843, 'svr__kernel': 'rbf'}. Best is trial 37 with value: 0.2922372006245864.


[I 2025-04-08 00:45:26,776] Trial 39 finished with value: 0.2823879443007433 and parameters: {'feature_selection__estimator__alpha': 0.03600190873809946, 'svr__C': 1.8009883442021892, 'svr__gamma': 0.008566424352019781, 'svr__epsilon': 0.1636553687406465, 'svr__kernel': 'rbf'}. Best is trial 37 with value: 0.2922372006245864.


[I 2025-04-08 00:45:26,852] Trial 41 finished with value: 0.29458643344169455 and parameters: {'feature_selection__estimator__alpha': 0.04816492123997186, 'svr__C': 4.831663222675531, 'svr__gamma': 0.0029594905363585456, 'svr__epsilon': 0.2594141411052615, 'svr__kernel': 'rbf'}. Best is trial 41 with value: 0.29458643344169455.


[I 2025-04-08 00:45:27,038] Trial 42 finished with value: 0.29004398928022157 and parameters: {'feature_selection__estimator__alpha': 0.04541105016617079, 'svr__C': 4.51039777238906, 'svr__gamma': 0.003082035222796581, 'svr__epsilon': 0.27292473333525646, 'svr__kernel': 'rbf'}. Best is trial 41 with value: 0.29458643344169455.


[I 2025-04-08 00:45:27,417] Trial 43 finished with value: -0.09245422954484261 and parameters: {'feature_selection__estimator__alpha': 0.001954949198204999, 'svr__C': 1.704060959038676, 'svr__gamma': 4.8033113226199555, 'svr__epsilon': 0.16092701533735854, 'svr__kernel': 'rbf'}. Best is trial 41 with value: 0.29458643344169455.


[I 2025-04-08 00:45:27,427] Trial 44 finished with value: 0.29261761935917663 and parameters: {'feature_selection__estimator__alpha': 0.04910450469577028, 'svr__C': 1.6129452391326087, 'svr__gamma': 0.00972206590589264, 'svr__epsilon': 0.16535911021612604, 'svr__kernel': 'rbf'}. Best is trial 41 with value: 0.29458643344169455.


[I 2025-04-08 00:45:27,530] Trial 45 finished with value: -0.09026479513079126 and parameters: {'feature_selection__estimator__alpha': 0.05699813621135941, 'svr__C': 2.618239783062492, 'svr__gamma': 5.381642234877095, 'svr__epsilon': 0.2896738221699172, 'svr__kernel': 'rbf'}. Best is trial 41 with value: 0.29458643344169455.


[I 2025-04-08 00:45:27,603] Trial 46 finished with value: 0.22390622416403927 and parameters: {'feature_selection__estimator__alpha': 0.05566571687229608, 'svr__C': 0.333131082830325, 'svr__gamma': 0.010631188514530893, 'svr__epsilon': 0.36792792630286386, 'svr__kernel': 'rbf'}. Best is trial 41 with value: 0.29458643344169455.


[I 2025-04-08 00:45:27,650] Trial 47 finished with value: -0.08919783011687063 and parameters: {'feature_selection__estimator__alpha': 0.06290983174347371, 'svr__C': 2.5071552822796646, 'svr__gamma': 5.6487572315744945, 'svr__epsilon': 0.36982368546673994, 'svr__kernel': 'rbf'}. Best is trial 41 with value: 0.29458643344169455.


[I 2025-04-08 00:45:27,787] Trial 48 finished with value: 0.3024393638389276 and parameters: {'feature_selection__estimator__alpha': 0.05774318982740513, 'svr__C': 2.335728480983109, 'svr__gamma': 0.011297313875558258, 'svr__epsilon': 0.16845815529430316, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:27,950] Trial 49 finished with value: 0.28970167475368236 and parameters: {'feature_selection__estimator__alpha': 0.05682056032003833, 'svr__C': 0.9599874331683528, 'svr__gamma': 0.010477131528071615, 'svr__epsilon': 0.2815986325849442, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:28,096] Trial 50 finished with value: 0.2928642761090795 and parameters: {'feature_selection__estimator__alpha': 0.06213828817088402, 'svr__C': 2.8772217234719255, 'svr__gamma': 0.009552580063778097, 'svr__epsilon': 0.36193273863745296, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:28,358] Trial 52 finished with value: 0.2845972014118701 and parameters: {'feature_selection__estimator__alpha': 0.06175577206455025, 'svr__C': 0.893234860222278, 'svr__gamma': 0.011964610029829516, 'svr__epsilon': 0.2291440968617115, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:28,463] Trial 51 finished with value: 0.05910474789180011 and parameters: {'feature_selection__estimator__alpha': 0.020976884850980617, 'svr__C': 33.854365832944005, 'svr__gamma': 0.011217724442901504, 'svr__epsilon': 0.3660951569949223, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:28,645] Trial 54 finished with value: 0.2194555056854725 and parameters: {'feature_selection__estimator__alpha': 0.020561275565605407, 'svr__C': 36.97566979531203, 'svr__gamma': 0.0021069348087189026, 'svr__epsilon': 0.2211534541649706, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:28,656] Trial 53 finished with value: 0.030889019458214356 and parameters: {'feature_selection__estimator__alpha': 0.020288500731012474, 'svr__C': 30.320902876410315, 'svr__gamma': 0.01078849104897118, 'svr__epsilon': 0.23306496395877638, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:28,757] Trial 55 finished with value: 0.1898598916091898 and parameters: {'feature_selection__estimator__alpha': 0.020499137637353428, 'svr__C': 0.8789754559540632, 'svr__gamma': 0.0029434397342944572, 'svr__epsilon': 0.22804427791819915, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:28,815] Trial 56 finished with value: 0.18292658927098385 and parameters: {'feature_selection__estimator__alpha': 0.019770432102002338, 'svr__C': 0.9467209615886191, 'svr__gamma': 0.002478600656418037, 'svr__epsilon': 0.22389245153201484, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:28,922] Trial 57 finished with value: 0.22395153593910844 and parameters: {'feature_selection__estimator__alpha': 0.018698892972105755, 'svr__C': 11.9402418665342, 'svr__gamma': 0.0021231345118091754, 'svr__epsilon': 0.22776977019815003, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:29,134] Trial 58 finished with value: 0.2557637242545803 and parameters: {'feature_selection__estimator__alpha': 0.022408798023974733, 'svr__C': 3.6846740852187327, 'svr__gamma': 0.0024955035327438977, 'svr__epsilon': 0.22587273029379873, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:29,349] Trial 59 finished with value: 0.15556251586726147 and parameters: {'feature_selection__estimator__alpha': 0.019610129411331202, 'svr__C': 34.31143048880764, 'svr__gamma': 0.002544442960885979, 'svr__epsilon': 0.39566031577995114, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:29,507] Trial 60 finished with value: 0.2297309747021011 and parameters: {'feature_selection__estimator__alpha': 0.09995687234112426, 'svr__C': 13.54373823778407, 'svr__gamma': 0.0023432527820797895, 'svr__epsilon': 0.4332788056588776, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:29,678] Trial 62 finished with value: 0.2566297371085054 and parameters: {'feature_selection__estimator__alpha': 0.09804624968008911, 'svr__C': 3.3884208905746895, 'svr__gamma': 0.0033352579657320745, 'svr__epsilon': 0.4117045553170473, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:29,696] Trial 61 finished with value: 0.23213810034636037 and parameters: {'feature_selection__estimator__alpha': 0.08067805747526519, 'svr__C': 3.6809697305681857, 'svr__gamma': 0.029670422469564642, 'svr__epsilon': 0.42817955294518195, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:29,761] Trial 63 finished with value: 0.2304733576127725 and parameters: {'feature_selection__estimator__alpha': 0.09750494942293553, 'svr__C': 3.421336171379714, 'svr__gamma': 0.021950770660603504, 'svr__epsilon': 0.38703153510764576, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:29,900] Trial 64 finished with value: 0.16064119209828856 and parameters: {'feature_selection__estimator__alpha': 0.07723062323626918, 'svr__C': 12.97050463459374, 'svr__gamma': 0.02817937139100368, 'svr__epsilon': 0.43474946501216954, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:29,959] Trial 65 finished with value: 0.2358406702633873 and parameters: {'feature_selection__estimator__alpha': 0.07779883035249105, 'svr__C': 3.7017376142233847, 'svr__gamma': 0.028827521991343898, 'svr__epsilon': 0.4032039437837868, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:30,226] Trial 66 finished with value: 0.16798091855092895 and parameters: {'feature_selection__estimator__alpha': 0.07890551537549408, 'svr__C': 8.701877434251855, 'svr__gamma': 0.02762322995246683, 'svr__epsilon': 0.3925029787978213, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:30,438] Trial 67 finished with value: 0.16342689821554254 and parameters: {'feature_selection__estimator__alpha': 0.0987046884585413, 'svr__C': 7.639839404476183, 'svr__gamma': 0.023987628531267572, 'svr__epsilon': 0.18446019893813045, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:30,440] Trial 68 finished with value: 0.18296139296378885 and parameters: {'feature_selection__estimator__alpha': 0.07339136301096354, 'svr__C': 7.482313204217089, 'svr__gamma': 0.021420964183841616, 'svr__epsilon': 0.18765392263824723, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:30,737] Trial 69 finished with value: 0.2665751642408808 and parameters: {'feature_selection__estimator__alpha': 0.07103167810808467, 'svr__C': 0.5175138916779864, 'svr__gamma': 0.02827294797445443, 'svr__epsilon': 0.18985083798647195, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:30,770] Trial 70 finished with value: 0.2181093800378438 and parameters: {'feature_selection__estimator__alpha': 0.07129386242677176, 'svr__C': 6.458357014162559, 'svr__gamma': 0.018685505757530987, 'svr__epsilon': 0.1802045610415236, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:30,866] Trial 71 finished with value: 0.24202165238831846 and parameters: {'feature_selection__estimator__alpha': 0.04841166237635747, 'svr__C': 7.084778119977998, 'svr__gamma': 0.01681611196502239, 'svr__epsilon': 0.2980859223542834, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:31,046] Trial 72 finished with value: 0.16077489226389727 and parameters: {'feature_selection__estimator__alpha': 0.02687715824928005, 'svr__C': 9.101015996018532, 'svr__gamma': 0.01706651720449777, 'svr__epsilon': 0.17878938439371894, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:31,051] Trial 73 finished with value: 0.21614856381126182 and parameters: {'feature_selection__estimator__alpha': 0.027640850998601255, 'svr__C': 7.532881026705849, 'svr__gamma': 0.013544854779644375, 'svr__epsilon': 0.18187081070424604, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:31,325] Trial 74 finished with value: 0.19392519198198938 and parameters: {'feature_selection__estimator__alpha': 0.02829487109589449, 'svr__C': 6.99713224922668, 'svr__gamma': 0.015108653309382629, 'svr__epsilon': 0.18232369168920456, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:31,459] Trial 75 finished with value: 0.06326799608769214 and parameters: {'feature_selection__estimator__alpha': 0.02598089047050764, 'svr__C': 0.4903805577008772, 'svr__gamma': 0.0015988798506032433, 'svr__epsilon': 0.3020486933077639, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:31,509] Trial 76 finished with value: 0.26302992306699036 and parameters: {'feature_selection__estimator__alpha': 0.048116563466281084, 'svr__C': 0.4841817133848147, 'svr__gamma': 0.015865837935097223, 'svr__epsilon': 0.24431064795444912, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:31,818] Trial 78 finished with value: 0.2634341123616849 and parameters: {'feature_selection__estimator__alpha': 0.027671344563963935, 'svr__C': 0.7123861415357676, 'svr__gamma': 0.014150333292361122, 'svr__epsilon': 0.3406284971250109, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:31,849] Trial 77 finished with value: 0.27211810006150977 and parameters: {'feature_selection__estimator__alpha': 0.02667140269831158, 'svr__C': 1.147235051181524, 'svr__gamma': 0.016352210890663257, 'svr__epsilon': 0.3462250359508561, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:31,869] Trial 79 finished with value: 0.27552710747972087 and parameters: {'feature_selection__estimator__alpha': 0.02619398838998633, 'svr__C': 1.2148900519403243, 'svr__gamma': 0.013040243165097744, 'svr__epsilon': 0.3425777881653533, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:32,100] Trial 81 finished with value: 0.10630454042392859 and parameters: {'feature_selection__estimator__alpha': 0.04971569503528298, 'svr__C': 0.7260254968073669, 'svr__gamma': 0.0016396532502982615, 'svr__epsilon': 0.3456518554444967, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:32,169] Trial 80 finished with value: 0.2503017828907447 and parameters: {'feature_selection__estimator__alpha': 0.0502103383418362, 'svr__C': 0.6909909595958103, 'svr__gamma': 0.007817323971101705, 'svr__epsilon': 0.2476768119345844, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:32,285] Trial 82 finished with value: 0.21309636206714036 and parameters: {'feature_selection__estimator__alpha': 0.04814466987715254, 'svr__C': 0.6649955814829718, 'svr__gamma': 0.08514520241321197, 'svr__epsilon': 0.2457368695635766, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:32,551] Trial 84 finished with value: 0.29032974334553246 and parameters: {'feature_selection__estimator__alpha': 0.05162942050680658, 'svr__C': 1.274938077859782, 'svr__gamma': 0.007346579011056605, 'svr__epsilon': 0.3440557749731954, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:32,555] Trial 83 finished with value: 0.18823958159743412 and parameters: {'feature_selection__estimator__alpha': 0.050277455663777434, 'svr__C': 1.148591078197152, 'svr__gamma': 0.1077357871676859, 'svr__epsilon': 0.3431754263848209, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:32,807] Trial 85 finished with value: 0.2889528189131694 and parameters: {'feature_selection__estimator__alpha': 0.05229925742902182, 'svr__C': 1.165473596346174, 'svr__gamma': 0.008579964051540247, 'svr__epsilon': 0.2665665333635833, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:32,911] Trial 87 finished with value: 0.14302352121201048 and parameters: {'feature_selection__estimator__alpha': 0.05075773034744476, 'svr__C': 4.9790291146381405, 'svr__gamma': 0.0742692968241129, 'svr__epsilon': 0.2591077603909902, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:32,932] Trial 86 finished with value: 0.21419170466698456 and parameters: {'feature_selection__estimator__alpha': 0.05153191539319532, 'svr__C': 1.1494260723395213, 'svr__gamma': 0.08306022664101563, 'svr__epsilon': 0.2070962222446549, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:33,125] Trial 88 finished with value: 0.30053441533832903 and parameters: {'feature_selection__estimator__alpha': 0.05771343434684922, 'svr__C': 2.256945007992333, 'svr__gamma': 0.007348886081130886, 'svr__epsilon': 0.20886520581499868, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:33,191] Trial 89 finished with value: 0.29687951677368213 and parameters: {'feature_selection__estimator__alpha': 0.06288264840083996, 'svr__C': 4.762417998439058, 'svr__gamma': 0.009216623346718198, 'svr__epsilon': 0.15641376125206072, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:33,352] Trial 90 finished with value: 0.2943555840891846 and parameters: {'feature_selection__estimator__alpha': 0.06054053901692061, 'svr__C': 2.7697027879753877, 'svr__gamma': 0.008614832217945925, 'svr__epsilon': 0.16046927013159124, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:33,608] Trial 92 finished with value: 0.28517393564732246 and parameters: {'feature_selection__estimator__alpha': 0.060586179073257446, 'svr__C': 5.214085910496992, 'svr__gamma': 0.003974796684069671, 'svr__epsilon': 0.27155994438713094, 'svr__kernel': 'rbf'}. Best is trial 48 with value: 0.3024393638389276.


[I 2025-04-08 00:45:33,613] Trial 91 finished with value: 0.30336125573220407 and parameters: {'feature_selection__estimator__alpha': 0.06352016982789091, 'svr__C': 2.7830525069909835, 'svr__gamma': 0.007432112632355197, 'svr__epsilon': 0.20885547184188877, 'svr__kernel': 'rbf'}. Best is trial 91 with value: 0.30336125573220407.


[I 2025-04-08 00:45:33,750] Trial 93 finished with value: 0.2903920857906193 and parameters: {'feature_selection__estimator__alpha': 0.06127874996824762, 'svr__C': 4.449470859557808, 'svr__gamma': 0.003907892040127075, 'svr__epsilon': 0.26336100854773825, 'svr__kernel': 'rbf'}. Best is trial 91 with value: 0.30336125573220407.


[I 2025-04-08 00:45:33,906] Trial 95 finished with value: 0.28186834363360125 and parameters: {'feature_selection__estimator__alpha': 0.04136567189416738, 'svr__C': 2.1428107661394806, 'svr__gamma': 0.003945873606734564, 'svr__epsilon': 0.3211044702251088, 'svr__kernel': 'rbf'}. Best is trial 91 with value: 0.30336125573220407.


[I 2025-04-08 00:45:33,911] Trial 94 finished with value: 0.25575286122024227 and parameters: {'feature_selection__estimator__alpha': 0.010187862383314391, 'svr__C': 2.610315564662616, 'svr__gamma': 0.004065389103169334, 'svr__epsilon': 0.20873886494842828, 'svr__kernel': 'rbf'}. Best is trial 91 with value: 0.30336125573220407.


[I 2025-04-08 00:45:33,996] Trial 96 finished with value: 0.28351695437549534 and parameters: {'feature_selection__estimator__alpha': 0.062101306842862566, 'svr__C': 2.8299744179903503, 'svr__gamma': 0.003846973643221383, 'svr__epsilon': 0.32001277185506793, 'svr__kernel': 'rbf'}. Best is trial 91 with value: 0.30336125573220407.


[I 2025-04-08 00:45:34,025] Trial 97 finished with value: 0.23733023914299162 and parameters: {'feature_selection__estimator__alpha': 0.014940315341691511, 'svr__C': 2.6403049850537985, 'svr__gamma': 0.004246728089348708, 'svr__epsilon': 0.14782597681236923, 'svr__kernel': 'rbf'}. Best is trial 91 with value: 0.30336125573220407.


[I 2025-04-08 00:45:34,057] Trial 98 finished with value: 0.2731713016333882 and parameters: {'feature_selection__estimator__alpha': 0.04086206995690584, 'svr__C': 2.3876496237856735, 'svr__gamma': 0.0037972443954974624, 'svr__epsilon': 0.15613099445909046, 'svr__kernel': 'rbf'}. Best is trial 91 with value: 0.30336125573220407.


[I 2025-04-08 00:45:34,088] Trial 99 finished with value: 0.2861250651235435 and parameters: {'feature_selection__estimator__alpha': 0.039749574941484817, 'svr__C': 2.6272428770161276, 'svr__gamma': 0.006564441762319138, 'svr__epsilon': 0.15453777770414487, 'svr__kernel': 'rbf'}. Best is trial 91 with value: 0.30336125573220407.


Best hyperparameters: {'feature_selection__estimator__alpha': 0.06352016982789091, 'svr__C': 2.7830525069909835, 'svr__gamma': 0.007432112632355197, 'svr__epsilon': 0.20885547184188877, 'svr__kernel': 'rbf'}
Best R² score: 0.3034


## Store Trained Model

In [8]:
from sklearn import clone

# Choose the configuration for the final model: the study's best parameters
# when tuning was enabled, otherwise the fixed PIPELINE_PARAMS.
final_params = study.best_params if HYPERPARAMETER_TUNING_ENABLED else PIPELINE_PARAMS

# set_params updates unfitted_pipeline in place, so any later use of
# unfitted_pipeline sees this same configuration.
unfitted_pipeline.set_params(**final_params)

# Fit a clone on all of X, y; unfitted_pipeline itself is not fitted here.
fitted_pipeline = clone(unfitted_pipeline)
fitted_pipeline.fit(X, y)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



## Feature Selection Results

In [9]:
# Pull the two fitted steps of interest out of the pipeline.
pipeline_steps = fitted_pipeline.named_steps
preprocessor = pipeline_steps['preprocessor']
feature_selector = pipeline_steps['feature_selection']

# Column names as they leave the preprocessing step; the selector's
# boolean support mask is indexed against these names.
feature_names = preprocessor.get_feature_names_out()
selected_mask = feature_selector.get_support()

# Split the names into the ones the selector kept and the ones it dropped.
selected_features = feature_names[selected_mask]
unselected_features = feature_names[np.logical_not(selected_mask)]

print(
    f"Number of Selected features ({len(selected_features)}):",
    f"Selected features: {selected_features}",
    f"Unselected features: {unselected_features}",
    sep="\n",
)

Number of Selected features (8):
Selected features: ['remainder__intensity_mean' 'remainder__f3_sd' 'remainder__f2_f1_mean'
 'remainder__percent_unvoiced' 'remainder__percent_breaks'
 'remainder__Duration/Total Words' 'remainder__They'
 'remainder__Cognitive']
Unselected features: ['remainder__f0_mean' 'remainder__f0_min' 'remainder__f0_max'
 'remainder__f0_range' 'remainder__f0_sd' 'remainder__intensity_min'
 'remainder__intensity_max' 'remainder__intensity_range'
 'remainder__intensity_sd' 'remainder__f1_mean' 'remainder__f1_sd'
 'remainder__f2_mean' 'remainder__f2_sd' 'remainder__f3_mean'
 'remainder__f3_f1_mean' 'remainder__f2_f1_sd' 'remainder__f3_f1_sd'
 'remainder__jitter' 'remainder__shimmer' 'remainder__pause_duration_max'
 'remainder__pause_duration_avg' 'remainder__duration'
 'remainder__Total Words' 'remainder__Unique Words'
 'remainder__Filler Words' 'remainder__Audio Duration (s)'
 'remainder__Duration/Unique Words' 'remainder__Duration/Filler Words'
 'remainder__Individu

# Monte Carlo Cross Validation

In [10]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import r2_score

# Metrics computed on the held-out part of every split.
scoring = dict(
    r2=make_scorer(r2_score),
    pearson=make_scorer(pearson_corr),
)

# Monte Carlo cross-validation: 1000 random group-wise splits with 20% of the
# groups held out each time; random_state fixes the sequence of splits.
mc_splitter = GroupShuffleSplit(n_splits=1000, test_size=0.2, random_state=42)

# unfitted_pipeline is cloned and refitted for each split by cross_validate.
# NOTE(review): if its hyperparameters were tuned on this same X, y, the
# scores below may be optimistic -- confirm against the tuning cell.
results = cross_validate(
    unfitted_pipeline,
    X,
    y,
    groups=groups_column,
    scoring=scoring,
    cv=mc_splitter,
    n_jobs=-1,
)
r2_scores, pearson_scores = results["test_r2"], results["test_pearson"]

# Summarise each metric as mean (± standard deviation) over all splits.
avg_r2_score = np.mean(r2_scores)
print(f"Mean R² Score: {avg_r2_score:.2f} (±{np.std(r2_scores):.2f})")

# NOTE(review): "perason" is a typo for "pearson"; the name is kept as-is
# because other cells may read it.
avg_perason_score = np.mean(pearson_scores)
print(f"Mean Pearson Correlation: {avg_perason_score:.2f} (±{np.std(pearson_scores):.2f})")

# Figures noted by hand, presumably from an earlier run (they differ slightly
# from the output this cell currently prints):
# Mean R² Score: 0.20 (±0.23)
# Mean Pearson Correlation: 0.53 (±0.16)

Mean R² Score: 0.20 (±0.27)
Mean Pearson Correlation: 0.54 (±0.16)


# Save the Model

In [11]:
from joblib import dump, load

# Persist the pipeline fitted on the full dataset as <target_column>.joblib
# inside SAVED_MODELS_PATH.
# Create the output directory first: dump() opens the target path for writing
# and raises FileNotFoundError when the directory does not exist (e.g. on a
# fresh checkout). exist_ok=True makes this a no-op when it is already there.
os.makedirs(SAVED_MODELS_PATH, exist_ok=True)
dump(fitted_pipeline, os.path.join(SAVED_MODELS_PATH, f'{target_column}.joblib'))

['./regression_models/saved_models/RecommendHiring.joblib']