## Saving the final all_trait_predictor for the HuggingFace classifier

In [4]:
'''
>>> Script for freezing the weights of the random forests used for the final classification of each trait.
>>> Each trait is predicted separately. Because of the skewed label distribution and the
large differences between traits, especially between the training and validation sets, different
hyperparameters have been set for each random forest classifier, so that the best prediction
for each trait is obtained.
>>> This procedure serves the purpose of adapting the project for predicting personality traits
in job interview scenarios.

'''

import os
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Locations of the training / validation CSVs that are read below.
train_path = "/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/train/comb_train_liwc_embed.csv"
val_path = "/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/validation/comb_val_liwc_embed.csv"

# Folder that receives the saved artifacts; created up front if it is missing.
output_dir = "models"
os.makedirs(output_dir, exist_ok=True)

train_df = pd.read_csv(train_path)
val_df = pd.read_csv(val_path)

# Feature columns: every "embed_" / "liwc_" column of the training frame
# (kept in training-column order) that also exists in the validation frame.
feature_cols = [
    col
    for col in train_df.columns
    if col.startswith(("embed_", "liwc_")) and col in val_df.columns
]

# Trait-specific random-forest hyperparameters, one entry per trait:
# (trait name, n_estimators, max_depth). A max_depth of None is passed through as-is.
trait_params = {
    trait_name: {"n_estimators": n_trees, "max_depth": depth}
    for trait_name, n_trees, depth in (
        ("Openness", 14, 30),
        ("Conscientiousness", 20, None),
        ("Extraversion", 80, 30),
        ("Agreeableness", 10, 12),
        ("Emotional stability", 50, 7),
    )
}

# Traits to classify, in the same order as the hyperparameter table above
# (dicts preserve insertion order).
traits = list(trait_params)

# Standardise the training features: the scaler is fitted on the training
# matrix only and then applied to that same matrix.
scaler = StandardScaler()
X_train = train_df[feature_cols].values
X_train_scaled = scaler.fit(X_train).transform(X_train)

# Persist the fitted scaler alongside the classifiers.
# NOTE(review): the scaler is fitted on a bare array, so the order of
# feature_cols is not stored with it here — confirm that inference code
# rebuilds the columns in the same order.
scaler_path = os.path.join(output_dir, "feature_scaler.pkl")
joblib.dump(scaler, scaler_path)

# === Fit one random forest per trait and write each to its own pickle ===
for trait in traits:
    print(f"Training model for {trait}...")

    # Labels come from the column named after the trait; the forest size and
    # depth come from that trait's entry in trait_params.
    params = trait_params[trait]
    y_train = train_df[trait].values

    # random_state is fixed so repeated runs build the same forest;
    # n_jobs=-1 lets scikit-learn use all available cores.
    clf = RandomForestClassifier(
        max_depth=params["max_depth"],
        n_estimators=params["n_estimators"],
        n_jobs=-1,
        random_state=42,
    ).fit(X_train_scaled, y_train)

    # File name: lower-cased trait with spaces replaced by underscores,
    # e.g. "Emotional stability" -> "emotional_stability_classifier.pkl".
    trait_filename = f"{trait.lower().replace(' ', '_')}_classifier.pkl"
    model_path = os.path.join(output_dir, trait_filename)
    joblib.dump(clf, model_path)

print("\nAll models and scaler saved in 'models/' folder.")

Training model for Openness...
Training model for Conscientiousness...
Training model for Extraversion...
Training model for Agreeableness...
Training model for Emotional stability...

All models and scaler saved in 'models/' folder.
