In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report
import joblib


# Baseline (no resampling) RandomForest for a single personality trait.
# Fits a StandardScaler and a small RandomForest on the training split,
# reports accuracy / macro-F1 on the validation split, then persists both
# the classifier and the scaler with joblib.
from pathlib import Path  # local import: only this cell needs it

# --- Configuration ---
# The trait is named once so the label column, the printed report title and
# the saved file name cannot drift apart (the report used to be titled
# "Agreeableness" although the Openness column was being trained on).
TRAIT = "Openness"

# Paths are relative to the project root, matching the relative layout that
# the all-trait training cell in this notebook already reads from.
# NOTE(review): assumes the notebook is run from the project root - confirm.
TRAIN_CSV = Path("processed_data/train/comb_train_liwc_embed.csv")
VAL_CSV = Path("processed_data/validation/comb_val_liwc_embed.csv")
MODEL_DIR = Path("models")

# --- Load data ---
train_df = pd.read_csv(TRAIN_CSV)
val_df = pd.read_csv(VAL_CSV)

# --- Select features ---
# Every column whose name starts with "embed_" or "liwc_" is a model input.
feature_cols = [col for col in train_df.columns if col.startswith(("embed_", "liwc_"))]
if not feature_cols:
    # Fail with a clear message instead of an opaque scikit-learn shape error.
    raise ValueError(f"No 'embed_*' or 'liwc_*' feature columns found in {TRAIN_CSV}")

X_train = train_df[feature_cols].values
X_val = val_df[feature_cols].values

# --- Labels for the selected trait ---
y_train = train_df[TRAIT].values
y_val = val_df[TRAIT].values

# --- Scale features ---
# The scaler is fitted on the training split only and then applied to the
# validation split, so no validation statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# --- Train classifier (no resampling) ---
clf = RandomForestClassifier(n_estimators=10, random_state=42, n_jobs=-1)
clf.fit(X_train_scaled, y_train)

# --- Predict and evaluate ---
y_pred = clf.predict(X_val_scaled)

print(f"\nClassification report for {TRAIT} (No Resampling):")
print(f"Accuracy: {accuracy_score(y_val, y_pred):.4f}")
print(f"F1 (macro): {f1_score(y_val, y_pred, average='macro'):.4f}")
print(classification_report(y_val, y_pred, zero_division=0))

# --- Persist model and scaler ---
# Create the output directory first so a fresh checkout does not fail on dump.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

joblib.dump(clf, MODEL_DIR / f"{TRAIT.lower()}_rf.pkl")

joblib.dump(scaler, MODEL_DIR / "scaler.pkl")


Classification report for Agreeableness (No Resampling):
Accuracy: 0.5000
F1 (macro): 0.3456
              precision    recall  f1-score   support

        high       0.67      0.70      0.68        20
         low       0.50      0.12      0.20         8
      medium       0.11      0.25      0.15         4

    accuracy                           0.50        32
   macro avg       0.43      0.36      0.35        32
weighted avg       0.56      0.50      0.50        32



['models/scaler.pkl']

## Saving the final all_trait_predictor for the HuggingFace classifier

In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
import joblib

# Train one soft-voting ensemble (RF + GB + MLP + linear SVM) per personality
# trait on the scaled training features, and persist every ensemble plus the
# shared feature scaler with joblib.
from pathlib import Path  # local import: only this cell needs it

MODEL_DIR = Path("models")
# Create the output directory up front so a missing folder is not discovered
# only after the first (slow) ensemble has finished training.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

train_df = pd.read_csv("processed_data/train/comb_train_liwc_embed.csv")

# === Define columns and traits ===
# Every column whose name starts with "embed_" or "liwc_" is a model input.
feature_cols = [col for col in train_df.columns if col.startswith(("embed_", "liwc_"))]
traits = ["Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Emotional stability"]

# Fail fast on schema problems before any expensive fitting starts.
if not feature_cols:
    raise ValueError("No 'embed_*' or 'liwc_*' feature columns found in the training data")
missing_traits = [trait for trait in traits if trait not in train_df.columns]
if missing_traits:
    raise KeyError(f"Trait columns missing from the training data: {missing_traits}")

# === Scale features ===
X_train = train_df[feature_cols].values
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# === Define classifiers ===
# These are unfitted templates. VotingClassifier.fit trains clones of them,
# so sharing the same instances across traits does not leak fitted state.
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, class_weight="balanced")
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)
# probability=True is needed because soft voting averages predicted probabilities.
svm = SVC(kernel="linear", C=1.0, probability=True, random_state=42, class_weight="balanced")

# === Train one classifier per trait and save ===
for trait in traits:
    print(f"Training classifier for {trait}...")

    y_train = train_df[trait].values
    ensemble = VotingClassifier(
        estimators=[('rf', rf), ('gb', gb), ('mlp', mlp), ('svm', svm)],
        voting='soft',
        n_jobs=-1
    )

    ensemble.fit(X_train_scaled, y_train)

    # Save classifier.
    # NOTE(review): "Emotional stability" produces a file name containing a
    # space ("emotional stability_classifier.pkl"). It is kept unchanged here
    # because loaders elsewhere may depend on it - confirm before renaming.
    joblib.dump(ensemble, MODEL_DIR / f"{trait.lower()}_classifier.pkl")

# Save scaler (must be applied to features at inference time before predicting)
joblib.dump(scaler, MODEL_DIR / "feature_scaler.pkl")

print("All classifiers and scaler saved.")

Training classifier for Openness...
Training classifier for Conscientiousness...
Training classifier for Extraversion...
Training classifier for Agreeableness...
Training classifier for Emotional stability...
All classifiers and scaler saved.
