# Input

In [1]:
# Default name of the label column to predict. The next cell reassigns
# `target_column`, so this value only matters if that cell is absent.
target_column = "RecommendHiring"

In [2]:
# Parameters
# Overrides the default assigned in the previous cell; this is the value the
# rest of the notebook reads when selecting the label column.
# NOTE(review): looks like a papermill-style injected parameters cell — confirm.
target_column = "Colleague"


# Constants

In [3]:
import os
import numpy as np
from sklearn.metrics import make_scorer

# Directory the notebook is executed from; dataset paths are resolved against it.
SCRIPT_DIR = os.getcwd()

SAVED_MODELS_PATH = "./regression_models/saved_models"

# Lexical columns removed by the pipeline's "dropper" step.
# NOTE(review): "Preceptual" is kept verbatim — presumably it matches the
# dataset's column header; verify before correcting the spelling.
DROPPED_LEXICAL_COLUMNS = [
    "Swear", "Numbers", "Inhibition", "Preceptual",
    "Anxiety", "Anger", "Sadness", "Work",
    "Articles", "Verbs", "Adverbs", "Prepositions",
    "Conjunctions", "Negations",
]

# Base facial measurements and the summary statistics computed for each.
facial_features = [
    "average_inner_brow_height", "average_outer_brow_height",
    "eye_open",
    "inner_lip_height", "lip_corner_distance", "outer_lip_height",
    "smile",
    "pitch", "roll", "yaw",
]
stats = ["max", "median", "min", "std", "mean"]

# Every "<feature>_<stat>" combination (feature-major order), i.e. all
# facial statistics built from the two lists above are dropped.
DROPPED_FACIAL_FEATURES = [
    "_".join((base_name, aggregate))
    for base_name in facial_features
    for aggregate in stats
]


# No prosodic columns are dropped at the moment.
DROPPED_PROSODIC_COLUMNS = list()

# Facial statistics that are already in [0, 1].
# Each statistic group (mean, std, min, max, median) lists the same six base
# measurements in the same order. The mean group previously listed
# "inner_lip_height_mean" twice and omitted "outer_lip_height_mean"; the
# duplicate has been replaced so all five groups are consistent.
ALREADY_NORMALIZED_FEATURES = [
    "average_outer_brow_height_mean",
    "average_inner_brow_height_mean",
    "eye_open_mean",
    "outer_lip_height_mean",
    "inner_lip_height_mean",
    "lip_corner_distance_mean",
    "average_outer_brow_height_std",
    "average_inner_brow_height_std",
    "eye_open_std",
    "outer_lip_height_std",
    "inner_lip_height_std",
    "lip_corner_distance_std",
    "average_outer_brow_height_min",
    "average_inner_brow_height_min",
    "eye_open_min",
    "outer_lip_height_min",
    "inner_lip_height_min",
    "lip_corner_distance_min",
    "average_outer_brow_height_max",
    "average_inner_brow_height_max",
    "eye_open_max",
    "outer_lip_height_max",
    "inner_lip_height_max",
    "lip_corner_distance_max",
    "average_outer_brow_height_median",
    "average_inner_brow_height_median",
    "eye_open_median",
    "outer_lip_height_median",
    "inner_lip_height_median",
    "lip_corner_distance_median",
]  # these are already in [0, 1]

# NOTE: MUST_KEEP_FEATURES was previously also assigned at this point, but that
# value was dead: the assignment later in this cell (after SCORING_METRICS)
# overwrites it before anything reads it. Only the later definition remains so
# the active list has a single source of truth.


# Column holding the group id of each row; it is removed from the feature
# matrix and passed as `groups` to the group-aware cross-validation splitter.
GROUPS_COLUMN = "cleaned_ids"
# Name of the participant id column that the features table is indexed by.
INDEX_COLUMN = "participant_id"


def pearson_corr(y_true, y_pred):
    """Pearson correlation coefficient between targets and predictions.

    Parameters
    ----------
    y_true, y_pred : array-like
        Equal-length sequences of numbers; they are flattened to 1-D first.

    Returns
    -------
    float
        The correlation coefficient. ``0.0`` is returned when the correlation
        is undefined — fewer than two samples, or either input is constant
        (for example a regressor that predicts a single value on a fold).
        ``np.corrcoef`` would yield NaN (or float-noise garbage) there, and a
        single NaN fold turns an averaged cross-validation score into NaN.
    """
    true_values = np.asarray(y_true, dtype=float).ravel()
    predicted_values = np.asarray(y_pred, dtype=float).ravel()

    if true_values.size < 2 or predicted_values.size < 2:
        return 0.0
    # max == min is an exact constant-input test; comparing std to 0 is not,
    # because rounding in the mean can leave a tiny non-zero spread.
    if true_values.max() == true_values.min():
        return 0.0
    if predicted_values.max() == predicted_values.min():
        return 0.0

    return float(np.corrcoef(true_values, predicted_values)[0, 1])


# Scorers keyed by a short metric name: two built-in scikit-learn scorer
# strings plus a custom scorer wrapping pearson_corr.
SCORING_METRICS = dict(
    r2="r2",
    mae="neg_mean_absolute_error",
    pearson=make_scorer(pearson_corr),  # Pearson correlation coefficient
)


# Features meant to be handed to the (currently commented-out)
# DomainAwareSelector step as `must_keep_features`.
# Entries that are switched off for now:
#   "pause_duration_avg"
#   "average_outer_brow_height_mean"
#   "average_inner_brow_height_mean"
#   "outer_lip_height_mean"
MUST_KEEP_FEATURES = ["Duration/Filler Words"]

# Fixed pipeline parameters in `set_params` key format.
# NOTE(review): presumably the result of an earlier tuning run; not applied
# anywhere in the cells visible here — confirm where it is consumed.
PIPELINE_PARAMS = dict(
    feature_selection__estimator__alpha=0.057376790661083456,
    svr__C=0.655379988356498,
    svr__gamma=0.02784736494309893,
    svr__epsilon=0.2617249201838037,
    svr__kernel="rbf",
)
# Gates the Optuna search in the hyperparameter-tuning cell.
HYPERPARAMETER_TUNING_ENABLED = True

# Data Preprocessing

## Import Datasets

In [4]:
import pandas as pd
import os

# Feature table, indexed by participant id. INDEX_COLUMN is used instead of a
# repeated string literal so the column name is defined in one place.
features_df = pd.read_csv(os.path.join(SCRIPT_DIR, "datasets", "add.csv"))
features_df = features_df.set_index(INDEX_COLUMN)

# Score table; only rows whose "Worker" is "AGGR" are kept
# (presumably the aggregated rating per participant — confirm).
labels_df = pd.read_csv(
    os.path.join(SCRIPT_DIR, "datasets", "turker_scores_full_interview.csv")
)
labels_df = labels_df.set_index("Participant")
labels_df = labels_df.loc[labels_df["Worker"] == "AGGR"]

# Lower-case both indexes so the join keys line up regardless of casing.
features_df.index = features_df.index.str.lower()
labels_df.index = labels_df.index.str.lower()
# Left join keeps every feature row; rows without a matching label get NaN
# in the target column.
indexed_combined_df = features_df.join(labels_df[[target_column]], how="left")
# Positional index for modelling; the participant-indexed frame is kept too.
combined_df = indexed_combined_df.reset_index(drop=True)

# Model

## Split Data

In [5]:
# Target vector: the column named by `target_column`.
y = combined_df[target_column]
# Feature matrix: every remaining column except the grouping id, which is
# only used to build cross-validation splits.
X = combined_df.drop(columns=[GROUPS_COLUMN, target_column])

## Pipeline Creation

In [6]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, LassoCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.model_selection import GroupKFold, cross_val_score, cross_validate
import sys

sys.path.append("..")
from models.domain_aware_selector import DomainAwareSelector

# Group label of every row (as strings), passed as `groups` to the CV splitter.
groups_column = combined_df[GROUPS_COLUMN].astype(str).values


# Removes the configured facial / lexical / prosodic columns and lets every
# other column through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        (
            "dropper",
            "drop",
            [
                *DROPPED_FACIAL_FEATURES,
                *DROPPED_LEXICAL_COLUMNS,
                *DROPPED_PROSODIC_COLUMNS,
            ],
        ),
    ],
    remainder="passthrough",
)

# drop columns -> mean-impute NaNs -> standardise -> Lasso-based feature
# selection -> RBF support vector regression.
# A DomainAwareSelector(must_keep_features=MUST_KEEP_FEATURES, selector=
# SelectFromModel(..., max_features=10)) variant of the selection step
# exists but is currently disabled.
unfitted_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
        ("feature_selection", SelectFromModel(estimator=Lasso(max_iter=50000))),
        ("svr", SVR(kernel="rbf")),
    ]
)

## Hyperparameter Tuning

In [7]:
import optuna
from sklearn.base import clone
from sklearn.model_selection import GroupShuffleSplit, cross_val_score
import numpy as np

def objective(trial):
    """Optuna objective: mean grouped cross-validation score for one trial.

    Samples the Lasso selector's ``alpha`` and the SVR's ``C``, ``gamma``,
    ``epsilon`` and ``kernel`` from ``trial``, applies them to a fresh clone
    of ``unfitted_pipeline`` and evaluates it on ``X``/``y`` with 20
    group-aware shuffle splits (20% test size, fixed seed).

    Returns the mean of the fold scores. No ``scoring`` is given to
    ``cross_val_score``, so the pipeline's default score is used (reported
    as R² by the tuning driver).
    """
    # Only the RBF kernel is searched; the "poly" option (with its degree and
    # coef0 parameters) is currently disabled.
    kernel_choices = ["rbf"]

    sampled = {
        "feature_selection__estimator__alpha": trial.suggest_float(
            "feature_selection__estimator__alpha", 1e-4, 0.1, log=True
        ),
        "svr__C": trial.suggest_float("svr__C", 0.01, 100, log=True),
        "svr__gamma": trial.suggest_float("svr__gamma", 1e-3, 1e1, log=True),
        "svr__epsilon": trial.suggest_float("svr__epsilon", 0.01, 0.5),
        "svr__kernel": trial.suggest_categorical("svr__kernel", kernel_choices),
    }

    # Each trial works on its own clone so parallel trials never share state.
    candidate = clone(unfitted_pipeline)
    candidate.set_params(**sampled)

    splitter = GroupShuffleSplit(n_splits=20, test_size=0.2, random_state=42)
    fold_scores = cross_val_score(
        candidate,
        X,
        y,
        groups=groups_column,
        cv=splitter,
        n_jobs=1,
    )
    return np.mean(fold_scores)

if HYPERPARAMETER_TUNING_ENABLED:
    # Maximise the mean CV score; stop after 100 trials or 4 minutes,
    # whichever comes first, running trials on all available cores.
    study = optuna.create_study(direction="maximize")
    study.optimize(
        objective,
        n_trials=100,
        timeout=4 * 60,
        n_jobs=-1,
    )

    print("Best hyperparameters:", study.best_params)
    print(f"Best R² score: {study.best_value:.4f}")

  from .autonotebook import tqdm as notebook_tqdm
[I 2025-04-08 00:45:45,495] A new study created in memory with name: no-name-17051f0c-4483-4676-8323-6f1550ec9d41


[I 2025-04-08 00:45:46,427] Trial 3 finished with value: 0.07845514124275874 and parameters: {'feature_selection__estimator__alpha': 0.0020662523858685565, 'svr__C': 1.8656989848912469, 'svr__gamma': 0.014023063763186953, 'svr__epsilon': 0.07600777242830688, 'svr__kernel': 'rbf'}. Best is trial 3 with value: 0.07845514124275874.


[I 2025-04-08 00:45:46,429] Trial 4 finished with value: 0.10705542655939064 and parameters: {'feature_selection__estimator__alpha': 0.0808499083806953, 'svr__C': 2.7140548097470516, 'svr__gamma': 0.025180540423057517, 'svr__epsilon': 0.3520361852418756, 'svr__kernel': 'rbf'}. Best is trial 4 with value: 0.10705542655939064.


[I 2025-04-08 00:45:46,500] Trial 2 finished with value: -0.14502044698656943 and parameters: {'feature_selection__estimator__alpha': 0.003476005102927311, 'svr__C': 0.05807546909287261, 'svr__gamma': 4.491644364897645, 'svr__epsilon': 0.08128361665408501, 'svr__kernel': 'rbf'}. Best is trial 4 with value: 0.10705542655939064.


[I 2025-04-08 00:45:46,546] Trial 0 finished with value: 0.07716002194844226 and parameters: {'feature_selection__estimator__alpha': 0.000837558133035314, 'svr__C': 99.31980011402402, 'svr__gamma': 0.04581147904428183, 'svr__epsilon': 0.36091783389565285, 'svr__kernel': 'rbf'}. Best is trial 4 with value: 0.10705542655939064.


[I 2025-04-08 00:45:46,569] Trial 6 finished with value: 0.03852064402545172 and parameters: {'feature_selection__estimator__alpha': 0.0007520829781754286, 'svr__C': 3.4802423948383088, 'svr__gamma': 0.02907534210507567, 'svr__epsilon': 0.04080459321591228, 'svr__kernel': 'rbf'}. Best is trial 4 with value: 0.10705542655939064.


[I 2025-04-08 00:45:47,193] Trial 1 finished with value: 0.09590797010649565 and parameters: {'feature_selection__estimator__alpha': 0.000415374438409209, 'svr__C': 0.23696468280344546, 'svr__gamma': 0.009154261778814463, 'svr__epsilon': 0.2019603809579679, 'svr__kernel': 'rbf'}. Best is trial 4 with value: 0.10705542655939064.


[I 2025-04-08 00:45:47,285] Trial 5 finished with value: -0.09844169796435674 and parameters: {'feature_selection__estimator__alpha': 0.0002673150850936222, 'svr__C': 0.014138287938608808, 'svr__gamma': 0.015295864662112192, 'svr__epsilon': 0.38608666086915044, 'svr__kernel': 'rbf'}. Best is trial 4 with value: 0.10705542655939064.


[I 2025-04-08 00:45:47,291] Trial 7 finished with value: -0.05628542645560151 and parameters: {'feature_selection__estimator__alpha': 0.00023199416118410777, 'svr__C': 30.257838337049204, 'svr__gamma': 0.01299727257722064, 'svr__epsilon': 0.23100336812153693, 'svr__kernel': 'rbf'}. Best is trial 4 with value: 0.10705542655939064.


[I 2025-04-08 00:45:47,339] Trial 9 finished with value: -0.10325268420021946 and parameters: {'feature_selection__estimator__alpha': 0.022246852595868526, 'svr__C': 0.021718165392590798, 'svr__gamma': 0.15543021843772195, 'svr__epsilon': 0.46363139225418776, 'svr__kernel': 'rbf'}. Best is trial 4 with value: 0.10705542655939064.


[I 2025-04-08 00:45:47,443] Trial 8 finished with value: -0.12586124228426462 and parameters: {'feature_selection__estimator__alpha': 0.003367462355774206, 'svr__C': 6.906599401117313, 'svr__gamma': 1.6737879723979403, 'svr__epsilon': 0.3960844772038607, 'svr__kernel': 'rbf'}. Best is trial 4 with value: 0.10705542655939064.


[I 2025-04-08 00:45:47,492] Trial 10 finished with value: -0.12959456701273853 and parameters: {'feature_selection__estimator__alpha': 0.012363294905823935, 'svr__C': 0.011165162317155532, 'svr__gamma': 0.1818279499818538, 'svr__epsilon': 0.24270038826010415, 'svr__kernel': 'rbf'}. Best is trial 4 with value: 0.10705542655939064.


[I 2025-04-08 00:45:47,564] Trial 11 finished with value: 0.0001955908204577861 and parameters: {'feature_selection__estimator__alpha': 0.026767137702785625, 'svr__C': 0.04356817090273986, 'svr__gamma': 0.024051462140612014, 'svr__epsilon': 0.2329538106093522, 'svr__kernel': 'rbf'}. Best is trial 4 with value: 0.10705542655939064.


[I 2025-04-08 00:45:48,256] Trial 13 finished with value: 0.000680325821227401 and parameters: {'feature_selection__estimator__alpha': 0.0010480661372680982, 'svr__C': 0.5062784704773362, 'svr__gamma': 0.07028780051033068, 'svr__epsilon': 0.46362688742497576, 'svr__kernel': 'rbf'}. Best is trial 4 with value: 0.10705542655939064.


[I 2025-04-08 00:45:48,257] Trial 14 finished with value: -0.12832451202376713 and parameters: {'feature_selection__estimator__alpha': 0.0029612221636104307, 'svr__C': 67.31996190509068, 'svr__gamma': 5.920066330850177, 'svr__epsilon': 0.22635566990969283, 'svr__kernel': 'rbf'}. Best is trial 4 with value: 0.10705542655939064.


[I 2025-04-08 00:45:48,264] Trial 16 finished with value: 0.10463306869123998 and parameters: {'feature_selection__estimator__alpha': 0.07540264899878275, 'svr__C': 0.10242995796542265, 'svr__gamma': 0.09343675473986773, 'svr__epsilon': 0.12210964937230306, 'svr__kernel': 'rbf'}. Best is trial 4 with value: 0.10705542655939064.


[I 2025-04-08 00:45:48,381] Trial 18 finished with value: -0.06417829038325698 and parameters: {'feature_selection__estimator__alpha': 0.08360993648771277, 'svr__C': 0.14718130004040308, 'svr__gamma': 0.002000016019812911, 'svr__epsilon': 0.17428436662051694, 'svr__kernel': 'rbf'}. Best is trial 4 with value: 0.10705542655939064.


[I 2025-04-08 00:45:48,383] Trial 17 finished with value: -0.07417772120111729 and parameters: {'feature_selection__estimator__alpha': 0.08719322450765153, 'svr__C': 0.2535820246071425, 'svr__gamma': 0.0010538742996090446, 'svr__epsilon': 0.3085511518186652, 'svr__kernel': 'rbf'}. Best is trial 4 with value: 0.10705542655939064.


[I 2025-04-08 00:45:48,483] Trial 19 finished with value: -0.03435520530412671 and parameters: {'feature_selection__estimator__alpha': 0.056448884477767025, 'svr__C': 0.23499713834692865, 'svr__gamma': 0.001495818466472867, 'svr__epsilon': 0.15335844376016888, 'svr__kernel': 'rbf'}. Best is trial 4 with value: 0.10705542655939064.


[I 2025-04-08 00:45:48,561] Trial 12 finished with value: -0.12910281062172507 and parameters: {'feature_selection__estimator__alpha': 0.00013417016050448378, 'svr__C': 1.4027484429754091, 'svr__gamma': 1.987338597641507, 'svr__epsilon': 0.1397649323272254, 'svr__kernel': 'rbf'}. Best is trial 4 with value: 0.10705542655939064.


[I 2025-04-08 00:45:48,915] Trial 15 finished with value: -0.13681967053676258 and parameters: {'feature_selection__estimator__alpha': 0.0002412487573084409, 'svr__C': 0.03891963223932818, 'svr__gamma': 0.46096222690404925, 'svr__epsilon': 0.30188658227088194, 'svr__kernel': 'rbf'}. Best is trial 4 with value: 0.10705542655939064.


[I 2025-04-08 00:45:49,157] Trial 20 finished with value: -0.06001895432101832 and parameters: {'feature_selection__estimator__alpha': 0.08870645252360554, 'svr__C': 0.18249576513519936, 'svr__gamma': 0.0016125329127454928, 'svr__epsilon': 0.13333918571265416, 'svr__kernel': 'rbf'}. Best is trial 4 with value: 0.10705542655939064.


[I 2025-04-08 00:45:49,214] Trial 21 finished with value: -0.060497364809019294 and parameters: {'feature_selection__estimator__alpha': 0.07906721113457271, 'svr__C': 0.22265789076290934, 'svr__gamma': 0.001194438068858752, 'svr__epsilon': 0.1310450381136202, 'svr__kernel': 'rbf'}. Best is trial 4 with value: 0.10705542655939064.


[I 2025-04-08 00:45:49,253] Trial 22 finished with value: -0.02959971014458503 and parameters: {'feature_selection__estimator__alpha': 0.0823284027726391, 'svr__C': 0.1906870170102992, 'svr__gamma': 0.9031790096647132, 'svr__epsilon': 0.15092531792014288, 'svr__kernel': 'rbf'}. Best is trial 4 with value: 0.10705542655939064.


[I 2025-04-08 00:45:49,278] Trial 24 finished with value: -0.12371288159817082 and parameters: {'feature_selection__estimator__alpha': 0.009803515372521124, 'svr__C': 11.4508312645903, 'svr__gamma': 0.5557935752581757, 'svr__epsilon': 0.15664919838436492, 'svr__kernel': 'rbf'}. Best is trial 4 with value: 0.10705542655939064.


[I 2025-04-08 00:45:49,403] Trial 23 finished with value: -0.035949451694675096 and parameters: {'feature_selection__estimator__alpha': 0.044611851247943785, 'svr__C': 10.509026307822925, 'svr__gamma': 0.4629356608117655, 'svr__epsilon': 0.13846200155426222, 'svr__kernel': 'rbf'}. Best is trial 4 with value: 0.10705542655939064.


[I 2025-04-08 00:45:49,496] Trial 26 finished with value: -0.09318144974373545 and parameters: {'feature_selection__estimator__alpha': 0.010823408014205838, 'svr__C': 16.694798664373266, 'svr__gamma': 0.29565359773350025, 'svr__epsilon': 0.30121457811192826, 'svr__kernel': 'rbf'}. Best is trial 4 with value: 0.10705542655939064.


[I 2025-04-08 00:45:49,834] Trial 27 finished with value: 0.002375892766268273 and parameters: {'feature_selection__estimator__alpha': 0.012280993988506298, 'svr__C': 10.275790369800296, 'svr__gamma': 0.005140944367374624, 'svr__epsilon': 0.29310247806143486, 'svr__kernel': 'rbf'}. Best is trial 4 with value: 0.10705542655939064.


[I 2025-04-08 00:45:50,131] Trial 28 finished with value: 0.01681717277022819 and parameters: {'feature_selection__estimator__alpha': 0.008354163229885261, 'svr__C': 7.368337521749835, 'svr__gamma': 0.005470166836179754, 'svr__epsilon': 0.31402165958454387, 'svr__kernel': 'rbf'}. Best is trial 4 with value: 0.10705542655939064.


[I 2025-04-08 00:45:50,137] Trial 29 finished with value: 0.13656239166656206 and parameters: {'feature_selection__estimator__alpha': 0.008503506328151215, 'svr__C': 0.648943031726313, 'svr__gamma': 0.004784295850211031, 'svr__epsilon': 0.2856172409489674, 'svr__kernel': 'rbf'}. Best is trial 29 with value: 0.13656239166656206.


[I 2025-04-08 00:45:50,192] Trial 30 finished with value: 0.15811228525168133 and parameters: {'feature_selection__estimator__alpha': 0.036519579515496815, 'svr__C': 0.7254617892062591, 'svr__gamma': 0.005366613615819708, 'svr__epsilon': 0.29944161632456456, 'svr__kernel': 'rbf'}. Best is trial 30 with value: 0.15811228525168133.


[I 2025-04-08 00:45:50,235] Trial 31 finished with value: 0.151005383853772 and parameters: {'feature_selection__estimator__alpha': 0.035389287989981755, 'svr__C': 0.6773230575229429, 'svr__gamma': 0.005172125843202971, 'svr__epsilon': 0.3025111446558388, 'svr__kernel': 'rbf'}. Best is trial 30 with value: 0.15811228525168133.


[I 2025-04-08 00:45:50,386] Trial 32 finished with value: 0.13364409435347466 and parameters: {'feature_selection__estimator__alpha': 0.007495094708547463, 'svr__C': 0.604477532007495, 'svr__gamma': 0.005047248933637475, 'svr__epsilon': 0.28824273020769753, 'svr__kernel': 'rbf'}. Best is trial 30 with value: 0.15811228525168133.


[I 2025-04-08 00:45:50,470] Trial 33 finished with value: 0.1229613880505724 and parameters: {'feature_selection__estimator__alpha': 0.006627134540691801, 'svr__C': 0.4577424669240775, 'svr__gamma': 0.00540382899462992, 'svr__epsilon': 0.1950818645801625, 'svr__kernel': 'rbf'}. Best is trial 30 with value: 0.15811228525168133.


[I 2025-04-08 00:45:50,530] Trial 25 finished with value: -0.12348345265580574 and parameters: {'feature_selection__estimator__alpha': 0.00010170581886559485, 'svr__C': 9.341874922417928, 'svr__gamma': 0.3075516590291665, 'svr__epsilon': 0.3042300670547752, 'svr__kernel': 'rbf'}. Best is trial 30 with value: 0.15811228525168133.


[I 2025-04-08 00:45:50,756] Trial 34 finished with value: 0.15701578548700315 and parameters: {'feature_selection__estimator__alpha': 0.020264790403631903, 'svr__C': 0.5141551513961609, 'svr__gamma': 0.0061340799213565975, 'svr__epsilon': 0.1838055475764079, 'svr__kernel': 'rbf'}. Best is trial 30 with value: 0.15811228525168133.


[I 2025-04-08 00:45:51,139] Trial 35 finished with value: 0.17522052253130413 and parameters: {'feature_selection__estimator__alpha': 0.027009191698832615, 'svr__C': 0.7529524894967893, 'svr__gamma': 0.05179385307261992, 'svr__epsilon': 0.34586741997565446, 'svr__kernel': 'rbf'}. Best is trial 35 with value: 0.17522052253130413.


[I 2025-04-08 00:45:51,182] Trial 37 finished with value: 0.11250197214285787 and parameters: {'feature_selection__estimator__alpha': 0.02781555689895506, 'svr__C': 0.629327710243367, 'svr__gamma': 0.0034849768908370854, 'svr__epsilon': 0.3535975652473174, 'svr__kernel': 'rbf'}. Best is trial 35 with value: 0.17522052253130413.


[I 2025-04-08 00:45:51,214] Trial 38 finished with value: 0.1085265379125087 and parameters: {'feature_selection__estimator__alpha': 0.02848116227132529, 'svr__C': 0.5998834243153072, 'svr__gamma': 0.003576402252382906, 'svr__epsilon': 0.3494660137065388, 'svr__kernel': 'rbf'}. Best is trial 35 with value: 0.17522052253130413.


[I 2025-04-08 00:45:51,247] Trial 36 finished with value: 0.1649335504003845 and parameters: {'feature_selection__estimator__alpha': 0.03091869231730841, 'svr__C': 0.6891891398380363, 'svr__gamma': 0.053276932265335086, 'svr__epsilon': 0.3572487074448042, 'svr__kernel': 'rbf'}. Best is trial 35 with value: 0.17522052253130413.


[I 2025-04-08 00:45:51,352] Trial 39 finished with value: 0.11337778803357694 and parameters: {'feature_selection__estimator__alpha': 0.005861575544259088, 'svr__C': 0.6461921979952385, 'svr__gamma': 0.003161056821882363, 'svr__epsilon': 0.27218844595978053, 'svr__kernel': 'rbf'}. Best is trial 35 with value: 0.17522052253130413.


[I 2025-04-08 00:45:51,516] Trial 40 finished with value: 0.11444011259439987 and parameters: {'feature_selection__estimator__alpha': 0.024296232001588154, 'svr__C': 0.6490136650683848, 'svr__gamma': 0.002770941704968215, 'svr__epsilon': 0.26892765658586704, 'svr__kernel': 'rbf'}. Best is trial 35 with value: 0.17522052253130413.


[I 2025-04-08 00:45:51,643] Trial 41 finished with value: 0.12380528693162929 and parameters: {'feature_selection__estimator__alpha': 0.025112512721805053, 'svr__C': 0.7979001894241962, 'svr__gamma': 0.0029188662990815677, 'svr__epsilon': 0.3457315949813898, 'svr__kernel': 'rbf'}. Best is trial 35 with value: 0.17522052253130413.


[I 2025-04-08 00:45:51,780] Trial 42 finished with value: 0.13415772815059968 and parameters: {'feature_selection__estimator__alpha': 0.026362943632864975, 'svr__C': 0.8971872081954471, 'svr__gamma': 0.002981881534998121, 'svr__epsilon': 0.34680117411866074, 'svr__kernel': 'rbf'}. Best is trial 35 with value: 0.17522052253130413.


[I 2025-04-08 00:45:52,163] Trial 43 finished with value: 0.14680120016187176 and parameters: {'feature_selection__estimator__alpha': 0.02478424871334316, 'svr__C': 1.0876860716158252, 'svr__gamma': 0.002732435997560487, 'svr__epsilon': 0.34595177428868484, 'svr__kernel': 'rbf'}. Best is trial 35 with value: 0.17522052253130413.


[I 2025-04-08 00:45:52,238] Trial 45 finished with value: 0.19908125774961855 and parameters: {'feature_selection__estimator__alpha': 0.017556867388719956, 'svr__C': 1.150981198042654, 'svr__gamma': 0.009291684061102375, 'svr__epsilon': 0.4191372222991985, 'svr__kernel': 'rbf'}. Best is trial 45 with value: 0.19908125774961855.


[I 2025-04-08 00:45:52,240] Trial 44 finished with value: 0.12900621570896412 and parameters: {'feature_selection__estimator__alpha': 0.043475540283621085, 'svr__C': 1.1123857157139747, 'svr__gamma': 0.0490993968929462, 'svr__epsilon': 0.42534758129282935, 'svr__kernel': 'rbf'}. Best is trial 45 with value: 0.19908125774961855.


[I 2025-04-08 00:45:52,244] Trial 46 finished with value: 0.06715418819191424 and parameters: {'feature_selection__estimator__alpha': 0.017473231432249627, 'svr__C': 3.077824875412997, 'svr__gamma': 0.05091720450729214, 'svr__epsilon': 0.40515202986485727, 'svr__kernel': 'rbf'}. Best is trial 45 with value: 0.19908125774961855.


[I 2025-04-08 00:45:52,376] Trial 47 finished with value: 0.08550694038326674 and parameters: {'feature_selection__estimator__alpha': 0.017742251282457856, 'svr__C': 2.14044641945624, 'svr__gamma': 0.043362763218410205, 'svr__epsilon': 0.41569681981848017, 'svr__kernel': 'rbf'}. Best is trial 45 with value: 0.19908125774961855.


[I 2025-04-08 00:45:52,624] Trial 48 finished with value: 0.1433782829845744 and parameters: {'feature_selection__estimator__alpha': 0.05006527693445455, 'svr__C': 1.2935327368380594, 'svr__gamma': 0.037742380250844094, 'svr__epsilon': 0.4107163924033669, 'svr__kernel': 'rbf'}. Best is trial 45 with value: 0.19908125774961855.


[I 2025-04-08 00:45:52,695] Trial 49 finished with value: 0.12185189151753309 and parameters: {'feature_selection__estimator__alpha': 0.016650637046494082, 'svr__C': 2.9500360834222863, 'svr__gamma': 0.009166387808092771, 'svr__epsilon': 0.4027392193854863, 'svr__kernel': 'rbf'}. Best is trial 45 with value: 0.19908125774961855.


[I 2025-04-08 00:45:52,712] Trial 50 finished with value: 0.025868950306469275 and parameters: {'feature_selection__estimator__alpha': 0.04127863724861081, 'svr__C': 3.3760732269744396, 'svr__gamma': 0.03880804688236637, 'svr__epsilon': 0.3993540574567805, 'svr__kernel': 'rbf'}. Best is trial 45 with value: 0.19908125774961855.


[I 2025-04-08 00:45:53,142] Trial 51 finished with value: 0.04706800877968147 and parameters: {'feature_selection__estimator__alpha': 0.0156405594606062, 'svr__C': 3.8082194317817097, 'svr__gamma': 0.03871672714921027, 'svr__epsilon': 0.4081092423857725, 'svr__kernel': 'rbf'}. Best is trial 45 with value: 0.19908125774961855.


[I 2025-04-08 00:45:53,235] Trial 53 finished with value: 0.1644049857345496 and parameters: {'feature_selection__estimator__alpha': 0.016972888642512902, 'svr__C': 1.9179486329744941, 'svr__gamma': 0.011252197747151268, 'svr__epsilon': 0.4150877919331666, 'svr__kernel': 'rbf'}. Best is trial 45 with value: 0.19908125774961855.


[I 2025-04-08 00:45:53,274] Trial 54 finished with value: 0.168039957836096 and parameters: {'feature_selection__estimator__alpha': 0.017903452629466788, 'svr__C': 2.048631978273916, 'svr__gamma': 0.009144487428054352, 'svr__epsilon': 0.49407486590101424, 'svr__kernel': 'rbf'}. Best is trial 45 with value: 0.19908125774961855.


[I 2025-04-08 00:45:53,356] Trial 52 finished with value: 0.1777130720463304 and parameters: {'feature_selection__estimator__alpha': 0.004617236987186118, 'svr__C': 1.6600159791297309, 'svr__gamma': 0.009975268945785763, 'svr__epsilon': 0.43484759614289037, 'svr__kernel': 'rbf'}. Best is trial 45 with value: 0.19908125774961855.


[I 2025-04-08 00:45:53,439] Trial 55 finished with value: 0.1468730656199801 and parameters: {'feature_selection__estimator__alpha': 0.017773860242993877, 'svr__C': 0.3485059135592976, 'svr__gamma': 0.009854452360060264, 'svr__epsilon': 0.3798008196395655, 'svr__kernel': 'rbf'}. Best is trial 45 with value: 0.19908125774961855.


[I 2025-04-08 00:45:53,618] Trial 56 finished with value: 0.17795460220527787 and parameters: {'feature_selection__estimator__alpha': 0.017303757173799596, 'svr__C': 1.60295262521683, 'svr__gamma': 0.0098903142779309, 'svr__epsilon': 0.4898748593512251, 'svr__kernel': 'rbf'}. Best is trial 45 with value: 0.19908125774961855.


[I 2025-04-08 00:45:53,697] Trial 57 finished with value: 0.15943585442443356 and parameters: {'feature_selection__estimator__alpha': 0.03384726679568365, 'svr__C': 0.3478272893648552, 'svr__gamma': 0.020400154920088, 'svr__epsilon': 0.38128218201485553, 'svr__kernel': 'rbf'}. Best is trial 45 with value: 0.19908125774961855.


[I 2025-04-08 00:45:53,705] Trial 58 finished with value: 0.14582861939803976 and parameters: {'feature_selection__estimator__alpha': 0.03466324476082537, 'svr__C': 0.30896977111020935, 'svr__gamma': 0.017853681363679572, 'svr__epsilon': 0.37375665485482035, 'svr__kernel': 'rbf'}. Best is trial 45 with value: 0.19908125774961855.


[I 2025-04-08 00:45:54,207] Trial 59 finished with value: 0.1644642182732993 and parameters: {'feature_selection__estimator__alpha': 0.03322563662338652, 'svr__C': 0.4136731898488436, 'svr__gamma': 0.017206407380183898, 'svr__epsilon': 0.3759060428215236, 'svr__kernel': 'rbf'}. Best is trial 45 with value: 0.19908125774961855.


[I 2025-04-08 00:45:54,231] Trial 60 finished with value: 0.1312467593271262 and parameters: {'feature_selection__estimator__alpha': 0.06136268786444559, 'svr__C': 0.3638999957667297, 'svr__gamma': 0.014366882789802358, 'svr__epsilon': 0.4388324484232589, 'svr__kernel': 'rbf'}. Best is trial 45 with value: 0.19908125774961855.


[I 2025-04-08 00:45:54,366] Trial 61 finished with value: 0.1460248178322 and parameters: {'feature_selection__estimator__alpha': 0.004262999628653724, 'svr__C': 0.36814745015476513, 'svr__gamma': 0.018615167376006866, 'svr__epsilon': 0.48493530555850234, 'svr__kernel': 'rbf'}. Best is trial 45 with value: 0.19908125774961855.


[I 2025-04-08 00:45:54,518] Trial 62 finished with value: 0.04665823950984147 and parameters: {'feature_selection__estimator__alpha': 0.0021215179368708256, 'svr__C': 5.026495498137283, 'svr__gamma': 0.02056474043859237, 'svr__epsilon': 0.4817622348634377, 'svr__kernel': 'rbf'}. Best is trial 45 with value: 0.19908125774961855.


[I 2025-04-08 00:45:54,560] Trial 63 finished with value: 0.10801418916799584 and parameters: {'feature_selection__estimator__alpha': 0.0047313178176025626, 'svr__C': 1.8805474314474988, 'svr__gamma': 0.016558373595988923, 'svr__epsilon': 0.4926495026790284, 'svr__kernel': 'rbf'}. Best is trial 45 with value: 0.19908125774961855.


[I 2025-04-08 00:45:54,747] Trial 64 finished with value: 0.04429537345097775 and parameters: {'feature_selection__estimator__alpha': 0.004874227148098988, 'svr__C': 4.878706504365954, 'svr__gamma': 0.023095250590779173, 'svr__epsilon': 0.4939173283107252, 'svr__kernel': 'rbf'}. Best is trial 45 with value: 0.19908125774961855.


[I 2025-04-08 00:45:54,779] Trial 65 finished with value: -0.041145533046554696 and parameters: {'feature_selection__estimator__alpha': 0.0019315759924599335, 'svr__C': 4.295567797291263, 'svr__gamma': 0.12245180391659681, 'svr__epsilon': 0.49107498938572386, 'svr__kernel': 'rbf'}. Best is trial 45 with value: 0.19908125774961855.


[I 2025-04-08 00:45:54,786] Trial 66 finished with value: -0.027277010916429072 and parameters: {'feature_selection__estimator__alpha': 0.0021584457974884613, 'svr__C': 4.643523539088689, 'svr__gamma': 0.11278108235208523, 'svr__epsilon': 0.4910347610837326, 'svr__kernel': 'rbf'}. Best is trial 45 with value: 0.19908125774961855.


[I 2025-04-08 00:45:55,312] Trial 69 finished with value: 0.05653055384598464 and parameters: {'feature_selection__estimator__alpha': 0.013205947934208507, 'svr__C': 5.018772474641461, 'svr__gamma': 0.09356417888272855, 'svr__epsilon': 0.49480831372825007, 'svr__kernel': 'rbf'}. Best is trial 45 with value: 0.19908125774961855.


[I 2025-04-08 00:45:55,315] Trial 67 finished with value: 0.09039561893204509 and parameters: {'feature_selection__estimator__alpha': 0.0008989211208514437, 'svr__C': 4.7948522418119115, 'svr__gamma': 0.02639337058209368, 'svr__epsilon': 0.48772562266111724, 'svr__kernel': 'rbf'}. Best is trial 45 with value: 0.19908125774961855.


[I 2025-04-08 00:45:55,316] Trial 68 finished with value: 0.009741587483316267 and parameters: {'feature_selection__estimator__alpha': 0.0008857098087501899, 'svr__C': 4.878784297882648, 'svr__gamma': 0.07909263169180301, 'svr__epsilon': 0.4941549180041254, 'svr__kernel': 'rbf'}. Best is trial 45 with value: 0.19908125774961855.


[I 2025-04-08 00:45:55,494] Trial 70 finished with value: 0.0729037956520264 and parameters: {'feature_selection__estimator__alpha': 0.013410833979604622, 'svr__C': 1.6023234876552814, 'svr__gamma': 0.07846633160016692, 'svr__epsilon': 0.4510018019276927, 'svr__kernel': 'rbf'}. Best is trial 45 with value: 0.19908125774961855.


[I 2025-04-08 00:45:55,733] Trial 71 finished with value: -0.04216247099866297 and parameters: {'feature_selection__estimator__alpha': 0.0007764617254348525, 'svr__C': 1.3981816991471077, 'svr__gamma': 0.11297987790154086, 'svr__epsilon': 0.45013395046141286, 'svr__kernel': 'rbf'}. Best is trial 45 with value: 0.19908125774961855.


[I 2025-04-08 00:45:55,740] Trial 73 finished with value: 0.07799822571469164 and parameters: {'feature_selection__estimator__alpha': 0.012341592677952179, 'svr__C': 1.4872967454194796, 'svr__gamma': 0.06964148515807027, 'svr__epsilon': 0.4522837906189984, 'svr__kernel': 'rbf'}. Best is trial 45 with value: 0.19908125774961855.


[I 2025-04-08 00:45:55,757] Trial 74 finished with value: 0.20018591692548365 and parameters: {'feature_selection__estimator__alpha': 0.013283965040613592, 'svr__C': 1.4089438846239357, 'svr__gamma': 0.0074454702031749075, 'svr__epsilon': 0.4607329249643162, 'svr__kernel': 'rbf'}. Best is trial 74 with value: 0.20018591692548365.


[I 2025-04-08 00:45:55,777] Trial 72 finished with value: -0.04737033228843831 and parameters: {'feature_selection__estimator__alpha': 0.0008881316212269955, 'svr__C': 1.6495678501514235, 'svr__gamma': 0.11700679079046668, 'svr__epsilon': 0.4594704407684992, 'svr__kernel': 'rbf'}. Best is trial 74 with value: 0.20018591692548365.


[I 2025-04-08 00:45:56,259] Trial 76 finished with value: -0.02566489574175565 and parameters: {'feature_selection__estimator__alpha': 0.06352643175778609, 'svr__C': 0.09776565849369032, 'svr__gamma': 0.007398028408472742, 'svr__epsilon': 0.4449333905509723, 'svr__kernel': 'rbf'}. Best is trial 74 with value: 0.20018591692548365.


[I 2025-04-08 00:45:56,343] Trial 78 finished with value: 0.15006439555445006 and parameters: {'feature_selection__estimator__alpha': 0.020917122111944214, 'svr__C': 2.220140856848917, 'svr__gamma': 0.010719464592585447, 'svr__epsilon': 0.46430678572240286, 'svr__kernel': 'rbf'}. Best is trial 74 with value: 0.20018591692548365.


[I 2025-04-08 00:45:56,584] Trial 77 finished with value: 0.20564019278484028 and parameters: {'feature_selection__estimator__alpha': 0.000564518732465574, 'svr__C': 1.472352503409597, 'svr__gamma': 0.007489108275176515, 'svr__epsilon': 0.4545848194830384, 'svr__kernel': 'rbf'}. Best is trial 77 with value: 0.20564019278484028.


[I 2025-04-08 00:45:56,688] Trial 79 finished with value: 0.11766120096632884 and parameters: {'feature_selection__estimator__alpha': 0.06101728619106554, 'svr__C': 2.4729821494927027, 'svr__gamma': 0.011140114900445539, 'svr__epsilon': 0.4310374351033165, 'svr__kernel': 'rbf'}. Best is trial 77 with value: 0.20564019278484028.


[I 2025-04-08 00:45:56,689] Trial 80 finished with value: -0.03317045556074068 and parameters: {'feature_selection__estimator__alpha': 0.05926502145045031, 'svr__C': 0.09769673542630991, 'svr__gamma': 0.007765512664826436, 'svr__epsilon': 0.46707479166722415, 'svr__kernel': 'rbf'}. Best is trial 77 with value: 0.20564019278484028.


[I 2025-04-08 00:45:56,753] Trial 81 finished with value: 0.11061336827676368 and parameters: {'feature_selection__estimator__alpha': 0.06253554078494006, 'svr__C': 2.398937224315305, 'svr__gamma': 0.007286587907098587, 'svr__epsilon': 0.47250587719836035, 'svr__kernel': 'rbf'}. Best is trial 77 with value: 0.20564019278484028.


[I 2025-04-08 00:45:56,762] Trial 82 finished with value: 0.14004019130768966 and parameters: {'feature_selection__estimator__alpha': 0.009277237759524083, 'svr__C': 2.3514243702743514, 'svr__gamma': 0.008268311147476502, 'svr__epsilon': 0.4700405870248879, 'svr__kernel': 'rbf'}. Best is trial 77 with value: 0.20564019278484028.


[I 2025-04-08 00:45:56,768] Trial 75 finished with value: 0.19350375713043586 and parameters: {'feature_selection__estimator__alpha': 0.0004841016545555085, 'svr__C': 1.449040498263282, 'svr__gamma': 0.008481107291041923, 'svr__epsilon': 0.4492897647827078, 'svr__kernel': 'rbf'}. Best is trial 77 with value: 0.20564019278484028.


[I 2025-04-08 00:45:57,215] Trial 83 finished with value: 0.1618276455334064 and parameters: {'feature_selection__estimator__alpha': 0.02125569684854375, 'svr__C': 2.400652023866527, 'svr__gamma': 0.008001268642109854, 'svr__epsilon': 0.4314516932444654, 'svr__kernel': 'rbf'}. Best is trial 77 with value: 0.20564019278484028.


[I 2025-04-08 00:45:57,216] Trial 84 finished with value: 0.1948725690131225 and parameters: {'feature_selection__estimator__alpha': 0.009834312745432975, 'svr__C': 0.9392909902916499, 'svr__gamma': 0.007399858093487023, 'svr__epsilon': 0.4312614073148613, 'svr__kernel': 'rbf'}. Best is trial 77 with value: 0.20564019278484028.


[I 2025-04-08 00:45:57,636] Trial 86 finished with value: 0.21335510004719574 and parameters: {'feature_selection__estimator__alpha': 0.0012693428109479302, 'svr__C': 1.08933256186536, 'svr__gamma': 0.007383880324670445, 'svr__epsilon': 0.4747279409436811, 'svr__kernel': 'rbf'}. Best is trial 86 with value: 0.21335510004719574.


[I 2025-04-08 00:45:58,136] Trial 87 finished with value: -0.10702553828624128 and parameters: {'feature_selection__estimator__alpha': 0.0005161144587459318, 'svr__C': 0.9999805529245117, 'svr__gamma': 0.20451285472802141, 'svr__epsilon': 0.32260005613637593, 'svr__kernel': 'rbf'}. Best is trial 86 with value: 0.21335510004719574.


[I 2025-04-08 00:45:58,200] Trial 89 finished with value: -0.10681236814363544 and parameters: {'feature_selection__estimator__alpha': 0.00046617152692542526, 'svr__C': 0.9627649822591567, 'svr__gamma': 0.19564478978631955, 'svr__epsilon': 0.01825732210742964, 'svr__kernel': 'rbf'}. Best is trial 86 with value: 0.21335510004719574.


[I 2025-04-08 00:45:58,365] Trial 85 finished with value: 0.12961866119691162 and parameters: {'feature_selection__estimator__alpha': 0.0002856637901498154, 'svr__C': 2.5360182843987804, 'svr__gamma': 0.007152360305723216, 'svr__epsilon': 0.47332015210276346, 'svr__kernel': 'rbf'}. Best is trial 86 with value: 0.21335510004719574.


[I 2025-04-08 00:45:58,403] Trial 90 finished with value: 0.18912756588904972 and parameters: {'feature_selection__estimator__alpha': 0.00036074726329911756, 'svr__C': 0.9176438713794788, 'svr__gamma': 0.004315971949687006, 'svr__epsilon': 0.4369311427164688, 'svr__kernel': 'rbf'}. Best is trial 86 with value: 0.21335510004719574.


[I 2025-04-08 00:45:58,622] Trial 88 finished with value: 0.16505009683804225 and parameters: {'feature_selection__estimator__alpha': 0.00027609263420491534, 'svr__C': 1.0191659515310814, 'svr__gamma': 0.004059453949346249, 'svr__epsilon': 0.32556967325533737, 'svr__kernel': 'rbf'}. Best is trial 86 with value: 0.21335510004719574.


[I 2025-04-08 00:45:58,787] Trial 92 finished with value: 0.18181715256029027 and parameters: {'feature_selection__estimator__alpha': 0.0004791967907025735, 'svr__C': 0.9050421203121947, 'svr__gamma': 0.0037796303894011515, 'svr__epsilon': 0.4239775772953648, 'svr__kernel': 'rbf'}. Best is trial 86 with value: 0.21335510004719574.


[I 2025-04-08 00:45:58,912] Trial 91 finished with value: 0.1745760477832891 and parameters: {'feature_selection__estimator__alpha': 0.0004066133472858601, 'svr__C': 0.9159708635597538, 'svr__gamma': 0.012632285470474075, 'svr__epsilon': 0.32397128779741297, 'svr__kernel': 'rbf'}. Best is trial 86 with value: 0.21335510004719574.


[I 2025-04-08 00:45:59,375] Trial 93 finished with value: 0.18655436717920515 and parameters: {'feature_selection__estimator__alpha': 0.000432975376257357, 'svr__C': 0.9176334575233349, 'svr__gamma': 0.004002712582121286, 'svr__epsilon': 0.42903296503335925, 'svr__kernel': 'rbf'}. Best is trial 86 with value: 0.21335510004719574.


[I 2025-04-08 00:45:59,404] Trial 97 finished with value: 0.20538944375595908 and parameters: {'feature_selection__estimator__alpha': 0.0014802219350996532, 'svr__C': 1.2959696323411798, 'svr__gamma': 0.004365131696798142, 'svr__epsilon': 0.42420203495725584, 'svr__kernel': 'rbf'}. Best is trial 86 with value: 0.21335510004719574.


[I 2025-04-08 00:45:59,451] Trial 96 finished with value: 0.14930346059346458 and parameters: {'feature_selection__estimator__alpha': 0.0011492368185143497, 'svr__C': 1.1631069258141085, 'svr__gamma': 0.0018217757664494856, 'svr__epsilon': 0.4213248816613282, 'svr__kernel': 'rbf'}. Best is trial 86 with value: 0.21335510004719574.


[I 2025-04-08 00:45:59,511] Trial 95 finished with value: 0.20347476302934756 and parameters: {'feature_selection__estimator__alpha': 0.0005815630962529162, 'svr__C': 1.0921814108091665, 'svr__gamma': 0.003965685792645633, 'svr__epsilon': 0.4307127290867892, 'svr__kernel': 'rbf'}. Best is trial 86 with value: 0.21335510004719574.


[I 2025-04-08 00:45:59,690] Trial 99 finished with value: 0.17520814232544688 and parameters: {'feature_selection__estimator__alpha': 0.0006386634694190589, 'svr__C': 1.1988825353540022, 'svr__gamma': 0.0022414481939009444, 'svr__epsilon': 0.42289617454698314, 'svr__kernel': 'rbf'}. Best is trial 86 with value: 0.21335510004719574.


[I 2025-04-08 00:45:59,850] Trial 94 finished with value: 0.19201041131532925 and parameters: {'feature_selection__estimator__alpha': 0.00029285914074015854, 'svr__C': 0.899408003893454, 'svr__gamma': 0.004541595702917719, 'svr__epsilon': 0.4359661688264824, 'svr__kernel': 'rbf'}. Best is trial 86 with value: 0.21335510004719574.


[I 2025-04-08 00:45:59,961] Trial 98 finished with value: 0.16705275648207601 and parameters: {'feature_selection__estimator__alpha': 0.0003485643784588591, 'svr__C': 1.2154509285976414, 'svr__gamma': 0.002137330576625067, 'svr__epsilon': 0.4239952300218971, 'svr__kernel': 'rbf'}. Best is trial 86 with value: 0.21335510004719574.


Best hyperparameters: {'feature_selection__estimator__alpha': 0.0012693428109479302, 'svr__C': 1.08933256186536, 'svr__gamma': 0.007383880324670445, 'svr__epsilon': 0.4747279409436811, 'svr__kernel': 'rbf'}
Best R² score: 0.2134


## Store Trained Model

In [8]:
from sklearn import clone

# Use the study's best parameters when tuning was enabled, otherwise the preset ones.
chosen_params = study.best_params if HYPERPARAMETER_TUNING_ENABLED else PIPELINE_PARAMS
# set_params updates unfitted_pipeline in place, so any later cell that reuses
# unfitted_pipeline sees these parameters as well.
unfitted_pipeline.set_params(**chosen_params)

# Fit a clone so that unfitted_pipeline itself is never fitted.
fitted_pipeline = clone(unfitted_pipeline)
fitted_pipeline.fit(X, y)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



## Feature Selection Results

In [9]:
# Fetch the fitted steps by their pipeline step names.
preprocessor = fitted_pipeline["preprocessor"]
feature_selector = fitted_pipeline["feature_selection"]

# Column names as they leave the preprocessor (the selector's input columns).
feature_names = preprocessor.get_feature_names_out()
# Mask over those columns marking the features the selector kept.
selected_mask = feature_selector.get_support()

selected_features = feature_names[selected_mask]
unselected_features = feature_names[np.logical_not(selected_mask)]

print(f"Number of Selected features ({len(selected_features)}):")
print(f"Selected features: {selected_features}")
print(f"Unselected features: {unselected_features}")

Number of Selected features (39):
Selected features: ['remainder__f0_mean' 'remainder__f0_min' 'remainder__f0_range'
 'remainder__f0_sd' 'remainder__intensity_mean' 'remainder__intensity_min'
 'remainder__intensity_max' 'remainder__intensity_sd' 'remainder__f1_mean'
 'remainder__f1_sd' 'remainder__f2_mean' 'remainder__f2_sd'
 'remainder__f3_mean' 'remainder__f3_sd' 'remainder__f2_f1_mean'
 'remainder__f3_f1_mean' 'remainder__f2_f1_sd' 'remainder__f3_f1_sd'
 'remainder__jitter' 'remainder__shimmer' 'remainder__percent_unvoiced'
 'remainder__percent_breaks' 'remainder__pause_duration_max'
 'remainder__pause_duration_avg' 'remainder__duration'
 'remainder__Total Words' 'remainder__Unique Words'
 'remainder__Filler Words' 'remainder__Audio Duration (s)'
 'remainder__Duration/Total Words' 'remainder__Duration/Unique Words'
 'remainder__Individual' 'remainder__We' 'remainder__They'
 'remainder__PosEmotion' 'remainder__NegEmotion' 'remainder__Cognitive'
 'remainder__Relativity' 'remainder__Qu

# Monte Carlo Cross Validation

In [10]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import r2_score

# Repeated group-aware random hold-out: 1000 shuffles, 20% held out each time,
# seeded so the splits are reproducible.
monte_carlo_cv = GroupShuffleSplit(n_splits=1000, test_size=0.2, random_state=42)

# Metrics evaluated on every split; the keys become "test_<key>" in the results.
scoring = {"r2": make_scorer(r2_score), "pearson": make_scorer(pearson_corr)}

results = cross_validate(
    unfitted_pipeline,
    X,
    y,
    groups=groups_column,
    cv=monte_carlo_cv,
    scoring=scoring,
    n_jobs=-1,  # run the splits on all available cores
)

r2_scores, pearson_scores = results["test_r2"], results["test_pearson"]

# Summarise each metric as mean (± standard deviation) across all splits.
avg_r2_score = np.mean(r2_scores)
print(f"Mean R² Score: {avg_r2_score:.2f} (±{np.std(r2_scores):.2f})")

avg_perason_score = np.mean(pearson_scores)
print(f"Mean Pearson Correlation: {avg_perason_score:.2f} (±{np.std(pearson_scores):.2f})")

# NOTE(review): the figures below were recorded in the source and differ from
# this cell's current output; presumably from an earlier run. Confirm or remove.
# Mean R² Score: 0.20 (±0.23)
# Mean Pearson Correlation: 0.53 (±0.16)

Mean R² Score: 0.14 (±0.22)
Mean Pearson Correlation: 0.47 (±0.17)


# Save the Model

In [11]:
from joblib import dump, load

# joblib.dump does not create missing parent directories; make sure the target
# folder exists so the save cannot fail after the long training/CV run above.
# This is a no-op when the directory is already present.
os.makedirs(SAVED_MODELS_PATH, exist_ok=True)

# One model file per target column; dump's return value (the list of written
# file names) is left as the cell's displayed output.
dump(fitted_pipeline, os.path.join(SAVED_MODELS_PATH, f'{target_column}.joblib'))

['./regression_models/saved_models/Colleague.joblib']