# Constants

In [134]:
DROPPED_LEXICAL_COLUMNS = ['Swear', 'Numbers', 'Inhibition', 'Preceptual','Anxiety', 'Anger', 'Sadness', 'Work', 'Articles',]
DROPPED_FACIAL_COLUMNS = [
'average_inner_brow_height_max' , 'average_inner_brow_height_median', 'average_inner_brow_height_min',
'average_outer_brow_height_max', 	'average_outer_brow_height_median', 	'average_outer_brow_height_min',
'eye_open_max', 'eye_open_median', 'eye_open_min',
'inner_lip_height_max', 'inner_lip_height_median', 'inner_lip_height_min',
'lip_corner_distance_max', 'lip_corner_distance_median', 'lip_corner_distance_min',
'outer_lip_height_max', 'outer_lip_height_median', 'outer_lip_height_min',
'smile_max', 'smile_median', 'smile_min',
'pitch_max', 'pitch_median', 'pitch_min',
'roll_max', 'roll_median', 'roll_min',
'yaw_max', 'yaw_median', 'yaw_min'
                           ] 
DROPPED_PROSODIC_COLUMNS = []
TARGET_COLUMN = 'RecommendHiring'
GROUPS_COLUMN = 'cleaned_ids'
SCORING_METRICS = {
    'r2': 'r2',              
    'mae': 'neg_mean_absolute_error', 
    # 'mse': 'neg_mean_squared_error'   
}

# Data Preprocessing

## Import Datasets

In [135]:
import pandas as pd
import os

datasets_folder_path = "../datasets"

features_df = pd.read_csv(os.path.join(datasets_folder_path, 'add.csv'))
features_df = features_df.set_index('participant_id')

labels_df = pd.read_csv(os.path.join(datasets_folder_path, 'turker_scores_full_interview.csv'))
labels_df = labels_df.set_index('Participant')
labels_df = labels_df.loc[labels_df['Worker'] == 'AGGR']

features_df.index = features_df.index.str.lower()
labels_df.index = labels_df.index.str.lower()
indexed_combined_df = features_df.join(labels_df[[TARGET_COLUMN]], how='left')

## Fill NaN Values

In [136]:
# Print number of missing values before filling
print(indexed_combined_df.isna().sum().sum())

# Fill missing values only in numeric columns
numeric_cols = indexed_combined_df.select_dtypes(include=['number']).columns
indexed_combined_df[numeric_cols] = indexed_combined_df[numeric_cols].fillna(indexed_combined_df[numeric_cols].mean())

# Print number of missing values after filling
print(indexed_combined_df.isna().sum().sum())

# Reset index
combined_df = indexed_combined_df.reset_index()

16
0


## Drop Unnecessary Columns

In [137]:
combined_df= combined_df.drop(columns=DROPPED_FACIAL_COLUMNS + DROPPED_LEXICAL_COLUMNS + DROPPED_PROSODIC_COLUMNS)

# Split Data

In [138]:
X = combined_df.iloc[:, 2:].values
y = combined_df.iloc[:, -1].values

# Feature Selection

In [139]:
# import numpy as np
# import pandas as pd
# from sklearn.linear_model import LassoCV

# lasso_cv = LassoCV(cv=5, random_state=42, max_iter=20000)
# lasso_cv.fit(X_train_scaled, y_train)

# coefficients = lasso_cv.coef_
# selected_features_indices = np.where(coefficients != 0)[0]

# feature_names = list(indexed_combined_df.columns[:-1])
# selected_feature_names = [feature_names[i] for i in selected_features_indices]

# selected_feature_set = set(selected_features_indices)
# unselected_feature_names = [feature_names[i] for i in range(len(feature_names)) if i not in selected_feature_set]

# print(f"Remove {len(unselected_feature_names)} features")
# print("Unselected features:", unselected_feature_names)
# print("Selected features:", selected_feature_names)
# print(f"After removal, you'll have {len(selected_feature_names)} features")

# X_train_scaled = X_train_scaled[:, selected_features_indices]
# X_test_scaled = X_test_scaled[:, selected_features_indices]

# Train SVR Model

In [140]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.model_selection import GroupKFold, cross_val_score, cross_validate

groups_column = combined_df[GROUPS_COLUMN].astype(str).values  

pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Scaling applied within each fold
    ('svr', SVR(kernel='rbf'))
])

cv_results = cross_validate(
    pipeline,
    X,
    y,
    cv=GroupKFold(n_splits=5),
    groups=groups_column,
    scoring=SCORING_METRICS,
    return_train_score=False  # Set to True if you also want training scores
)
for metric in SCORING_METRICS.keys():
    mean_score = cv_results[f'test_{metric}'].mean()
    std_score = cv_results[f'test_{metric}'].std()
    print(f"{metric.upper()} (CV): {mean_score:.3f} (±{std_score:.3f})")

    
trained_model = pipeline.fit(X, y)

R2 (CV): 0.534 (±0.104)
MAE (CV): -0.370 (±0.057)
