## Model Comparison
In this notebook we compare several different models for predicting personality traits. We also build voting ensembles of SVM, RF, MLP and GradientBoostingClassifier. The best result was given by the random forest. We also tested SMOTE, because a few dominant labels are strongly over-represented in the training set, but it did not improve the results, since the label distribution in the validation set differs from that of the training set.

### SVM

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Linear-SVM baseline: one independent classifier per personality trait,
# fitted on the train split and scored on the validation split.
train_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/train/comb_train_liwc_embed.csv")
val_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/validation/comb_val_liwc_embed.csv")

# Features are all columns prefixed "embed_" or "liwc_" that exist in BOTH
# splits, so the two matrices are built from the same columns in the same order.
feature_cols = [col for col in train_df.columns if col.startswith("embed_") or col.startswith("liwc_")]
feature_cols = [col for col in feature_cols if col in val_df.columns]

traits = ["Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Emotional stability"]

# The feature matrices do not depend on the trait, so they are built and
# standardised once here instead of refitting an identical scaler per trait.
# The scaler is fitted on the training rows only; validation rows are only
# transformed with the training statistics.
X_train = train_df[feature_cols].values
X_test = val_df[feature_cols].values
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# One (trait, accuracy, macro-F1, text report) tuple per trait.
reports = []

for trait in traits:
    print(f"\n==== Trait: {trait.capitalize()} ====")

    # Only the target column changes between iterations.
    y_train = train_df[trait].values
    y_test = val_df[trait].values

    svm = SVC(kernel="linear", C=1.0, random_state=42)
    svm.fit(X_train_scaled, y_train)

    y_pred = svm.predict(X_test_scaled)

    acc = accuracy_score(y_test, y_pred)
    # zero_division=0 matches classification_report below: a class that is
    # never predicted still scores 0, but without an UndefinedMetricWarning.
    f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)
    report = classification_report(y_test, y_pred, zero_division=0)

    print("Accuracy:", acc)
    print("F1-score (macro):", f1)

    reports.append((trait, acc, f1, report))

# Persist every per-trait report to a single text file (overwritten on each run).
report_path = "/Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/svm.txt"
with open(report_path, "w") as f:
    for trait, acc, f1, rep in reports:
        f.write(f"=== Trait: {trait.capitalize()} ===\n")
        f.write(f"Accuracy: {acc:.4f}\n")
        f.write(f"F1-score (macro): {f1:.4f}\n")
        f.write(rep + "\n\n")


==== Trait: Openness ====
Accuracy: 0.25
F1-score (macro): 0.22873900293255134

==== Trait: Conscientiousness ====
Accuracy: 0.25
F1-score (macro): 0.25250544662309365

==== Trait: Extraversion ====
Accuracy: 0.40625
F1-score (macro): 0.2905982905982906

==== Trait: Agreeableness ====
Accuracy: 0.1875
F1-score (macro): 0.135632183908046

==== Trait: Emotional stability ====
Accuracy: 0.375
F1-score (macro): 0.33745654458727464
Saved all SVM reports to /Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/svm.txt


### Random Forest

In [89]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Random-forest baseline: one independent 100-tree forest per personality
# trait, fitted on the train split and scored on the validation split.
train_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/train/comb_train_liwc_embed.csv")
val_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/validation/comb_val_liwc_embed.csv")

# Keep only "embed_" / "liwc_" columns that are present in both splits.
feature_cols = [col for col in train_df.columns if col.startswith("embed_") or col.startswith("liwc_")]
feature_cols = [col for col in feature_cols if col in val_df.columns]

traits = ["Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Emotional stability"]

# Feature extraction and standardisation are identical for every trait, so
# they are done once up front rather than repeated inside the loop.
# The scaler only ever sees training rows when it is fitted.
X_train = train_df[feature_cols].values
X_test = val_df[feature_cols].values
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# One (trait, accuracy, macro-F1, text report) tuple per trait.
reports = []

for trait in traits:
    print(f"\n==== Trait: {trait.capitalize()} ====")

    # Only the target column changes between iterations.
    y_train = train_df[trait].values
    y_test = val_df[trait].values

    # random_state pins the forest; n_jobs=-1 uses every available core.
    clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    clf.fit(X_train_scaled, y_train)

    y_pred = clf.predict(X_test_scaled)

    acc = accuracy_score(y_test, y_pred)
    # zero_division=0 matches classification_report below: a class that is
    # never predicted still scores 0, but without an UndefinedMetricWarning.
    f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)
    report = classification_report(y_test, y_pred, zero_division=0)

    print("Accuracy:", acc)
    print("F1-score (macro):", f1)

    reports.append((trait, acc, f1, report))

# Persist every per-trait report to a single text file (overwritten on each run).
report_path = "/Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/random_forest.txt"
with open(report_path, "w") as f:
    for trait, acc, f1, rep in reports:
        f.write(f"=== Trait: {trait.capitalize()} ===\n")
        f.write(f"Accuracy: {acc:.4f}\n")
        f.write(f"F1-score (macro): {f1:.4f}\n")
        f.write(rep + "\n\n")


==== Trait: Openness ====
Accuracy: 0.625
F1-score (macro): 0.25641025641025644

==== Trait: Conscientiousness ====
Accuracy: 0.65625
F1-score (macro): 0.44805194805194803

==== Trait: Extraversion ====
Accuracy: 0.40625
F1-score (macro): 0.35555555555555557

==== Trait: Agreeableness ====
Accuracy: 0.1875
F1-score (macro): 0.20915032679738563

==== Trait: Emotional stability ====
Accuracy: 0.3125
F1-score (macro): 0.2924867724867725


### Gradient Boosting

In [None]:
"""
Gradient Boosting Script for predicting the traits separately 
based on the combined CLS-BERT-embeddings + LIWC-features
Evaluates on validation set.
"""

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Load the pre-computed train / validation feature tables.
train_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/train/comb_train_liwc_embed.csv")
val_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/validation/comb_val_liwc_embed.csv")

# Use the "embed_" / "liwc_" columns, restricted to those present in both splits.
feature_cols = [col for col in train_df.columns if col.startswith("embed_") or col.startswith("liwc_")]
feature_cols = [col for col in feature_cols if col in val_df.columns]  # Ensure feature overlap

traits = ["Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Emotional stability"]

# Standardise once for all traits; the scaler is fitted on training rows only
# and then applied unchanged to the validation rows.
scaler = StandardScaler()
X_train = train_df[feature_cols].values
X_test = val_df[feature_cols].values
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Text of all per-trait reports, accumulated and written out at the end.
reports = ""

for trait in traits:
    print(f"\n========== Trait: {trait} ==========")

    y_train = train_df[trait].values
    y_test = val_df[trait].values

    # A fresh model per trait, so no state carries over between targets.
    clf = GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        random_state=42
    )
    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)

    acc = accuracy_score(y_test, y_pred)
    # zero_division=0 matches classification_report below: a class that is
    # never predicted still scores 0, but without an UndefinedMetricWarning.
    f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)
    report = classification_report(y_test, y_pred, zero_division=0)

    print(f"Accuracy: {acc:.4f}")
    print(f"F1-score (macro): {f1:.4f}")

    reports += f"\n=== Trait: {trait} ===\n"
    reports += f"Accuracy: {acc:.4f}\n"
    reports += f"F1-score (macro): {f1:.4f}\n"
    reports += report + "\n"

# Write one header line followed by every per-trait report (file is overwritten).
output_path = "/Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/gradient_boosting.txt"
with open(output_path, "w") as f:
    f.write("Classification Reports - Gradient Boosting (Train/Val Split)\n")
    f.write(reports)


Accuracy: 0.6250
F1-score (macro): 0.2564

Accuracy: 0.4062
F1-score (macro): 0.2924

Accuracy: 0.3438
F1-score (macro): 0.2688

Accuracy: 0.1250
F1-score (macro): 0.0784

Accuracy: 0.2500
F1-score (macro): 0.2516
Saved full trait classification report to /Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/gradient_boosting.txt


### MLP Classifier

In [None]:
"""
MLP Classifier Script for predicting the traits separately 
based on the combined CLS-BERT-embeddings + LIWC-features
Evaluates on validation set.
"""

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Load the pre-computed train / validation feature tables.
train_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/train/comb_train_liwc_embed.csv")
val_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/validation/comb_val_liwc_embed.csv")

# Use the "embed_" / "liwc_" columns, restricted to those present in both splits.
feature_cols = [col for col in train_df.columns if col.startswith("embed_") or col.startswith("liwc_")]
feature_cols = [col for col in feature_cols if col in val_df.columns]

traits = ["Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Emotional stability"]

# Standardise once for all traits; the scaler is fitted on training rows only
# and then applied unchanged to the validation rows.
scaler = StandardScaler()
X_train = train_df[feature_cols].values
X_test = val_df[feature_cols].values
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Text of all per-trait reports, accumulated and written out at the end.
reports = ""

for trait in traits:
    print(f"\n========== Trait: {trait} ==========")

    y_train = train_df[trait].values
    y_test = val_df[trait].values

    # Single hidden layer of 100 units; a fresh network is trained per trait.
    clf = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)
    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)

    acc = accuracy_score(y_test, y_pred)
    # zero_division=0 matches classification_report below: a class that is
    # never predicted still scores 0, but without an UndefinedMetricWarning.
    f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)
    report = classification_report(y_test, y_pred, zero_division=0)

    print(f"Accuracy: {acc:.4f}")
    print(f"F1-score (macro): {f1:.4f}")

    reports += f"\n=== Trait: {trait} ===\n"
    reports += f"Accuracy: {acc:.4f}\n"
    reports += f"F1-score (macro): {f1:.4f}\n"
    reports += report + "\n"

# Write one header line followed by every per-trait report (file is overwritten).
output_path = "/Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/mlp.txt"
with open(output_path, "w") as f:
    f.write("Classification Reports - MLP Classifier (Train/Val Split)\n")
    f.write(reports)


Accuracy: 0.3750
F1-score (macro): 0.2265

Accuracy: 0.2812
F1-score (macro): 0.2829

Accuracy: 0.3750
F1-score (macro): 0.1818

Accuracy: 0.1875
F1-score (macro): 0.1193

Accuracy: 0.4688
F1-score (macro): 0.2128
Saved full trait classification report to /Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/mlp.txt


### Voting Ensemble (RF, GB, MLP)

In [None]:
"""
Voting Ensemble Classifier (Random Forest + Gradient Boosting + MLP)
Train on train set and evaluate on validation set
"""

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Load the pre-computed train / validation feature tables.
train_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/train/comb_train_liwc_embed.csv")
val_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/validation/comb_val_liwc_embed.csv")

# Use the "embed_" / "liwc_" columns, restricted to those present in both splits.
feature_cols = [col for col in train_df.columns if col.startswith("embed_") or col.startswith("liwc_")]
feature_cols = [col for col in feature_cols if col in val_df.columns]

traits = ["Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Emotional stability"]

# Standardise once for all traits; the scaler is fitted on training rows only
# and then applied unchanged to the validation rows.
scaler = StandardScaler()
X_train = train_df[feature_cols].values
X_test = val_df[feature_cols].values
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Base learners. Only the random forest is given class_weight="balanced".
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, class_weight="balanced")
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)

# Soft voting combines the members' predicted class probabilities rather than
# their hard labels, so every member must implement predict_proba.
ensemble = VotingClassifier(
    estimators=[('rf', rf), ('gb', gb), ('mlp', mlp)],
    voting='soft',
    n_jobs=-1
)

# Text of all per-trait reports, accumulated and written out at the end.
all_reports = ""

for trait in traits:
    print(f"\n========== Predicting {trait.capitalize()} ==========")

    y_train = train_df[trait].values
    y_test = val_df[trait].values

    # The same ensemble object is refitted for each trait in turn.
    ensemble.fit(X_train_scaled, y_train)
    y_pred = ensemble.predict(X_test_scaled)

    acc = accuracy_score(y_test, y_pred)
    # zero_division=0 matches classification_report below: a class that is
    # never predicted still scores 0, but without an UndefinedMetricWarning.
    f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)
    report = classification_report(y_test, y_pred, zero_division=0)

    print(f"Accuracy: {acc:.4f}")
    print(f"F1-score (macro): {f1:.4f}")

    all_reports += f"\n\n=== {trait.upper()} ===\n"
    all_reports += f"Accuracy: {acc:.4f}\nF1-score (macro): {f1:.4f}\n"
    all_reports += report

# Write one header line followed by every per-trait report (file is overwritten).
report_path = "/Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/rf_gb_mlp.txt"
with open(report_path, "w") as f:
    f.write("Classification Reports - Voting Ensemble (Train/Val Split)\n")
    f.write(all_reports)


Accuracy: 0.4375
F1-score (macro): 0.2029

Accuracy: 0.3125
F1-score (macro): 0.3062

Accuracy: 0.3750
F1-score (macro): 0.1818

Accuracy: 0.1875
F1-score (macro): 0.1193

Accuracy: 0.4688
F1-score (macro): 0.2128
Saved ensemble report to /Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/rf_gb_mlp.txt


In [None]:
"""
Voting Ensemble Classifier (Random Forest + Gradient Boosting + MLP + SVM)
Train on train set and evaluate on validation set
"""

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Load the pre-computed train / validation feature tables.
train_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/train/comb_train_liwc_embed.csv")
val_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/validation/comb_val_liwc_embed.csv")

# Use the "embed_" / "liwc_" columns, restricted to those present in both splits.
feature_cols = [col for col in train_df.columns if col.startswith("embed_") or col.startswith("liwc_")]
feature_cols = [col for col in feature_cols if col in val_df.columns]

traits = ["Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Emotional stability"]

# Standardise once for all traits; the scaler is fitted on training rows only
# and then applied unchanged to the validation rows.
scaler = StandardScaler()
X_train = train_df[feature_cols].values
X_test = val_df[feature_cols].values
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Base learners. The random forest and the SVM use class_weight="balanced";
# the SVM needs probability=True so it can take part in soft voting.
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, class_weight="balanced")
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)
svm = SVC(kernel="linear", C=1.0, probability=True, random_state=42, class_weight="balanced")

# Soft voting combines the members' predicted class probabilities rather than
# their hard labels.
ensemble = VotingClassifier(
    estimators=[('rf', rf), ('gb', gb), ('mlp', mlp), ('svm', svm)],
    voting='soft',
    n_jobs=-1
)

# Text of all per-trait reports, accumulated and written out at the end.
all_reports = ""

for trait in traits:
    print(f"\n========== Predicting {trait.capitalize()} ==========")

    y_train = train_df[trait].values
    y_test = val_df[trait].values

    # The same ensemble object is refitted for each trait in turn.
    ensemble.fit(X_train_scaled, y_train)
    y_pred = ensemble.predict(X_test_scaled)

    acc = accuracy_score(y_test, y_pred)
    # zero_division=0 matches classification_report below: a class that is
    # never predicted still scores 0, but without an UndefinedMetricWarning.
    f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)
    report = classification_report(y_test, y_pred, zero_division=0)

    print(f"Accuracy: {acc:.4f}")
    print(f"F1-score (macro): {f1:.4f}")

    all_reports += f"\n\n=== {trait.upper()} ===\n"
    all_reports += f"Accuracy: {acc:.4f}\nF1-score (macro): {f1:.4f}\n"
    all_reports += report

# Write one header line followed by every per-trait report (file is overwritten).
report_path = "/Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/rf_gb_mlp_svm.txt"
with open(report_path, "w") as f:
    f.write("Classification Reports - Voting Ensemble (Train/Val Split)\n")
    f.write(all_reports)


Accuracy: 0.5625
F1-score (macro): 0.2400

Accuracy: 0.3750
F1-score (macro): 0.3496

Accuracy: 0.3750
F1-score (macro): 0.1818

Accuracy: 0.1875
F1-score (macro): 0.1193

Accuracy: 0.4688
F1-score (macro): 0.2128
Saved ensemble report to /Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/rf_gb_mlp_svm.txt


### Ensembles with SMOTE

In [None]:
"""
Voting Ensemble Classifier (Random Forest + Gradient Boosting + MLP + SVM)
with SMOTE applied separately for each personality trait.
"""

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE

# Load the pre-computed train / validation feature tables.
train_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/train/comb_train_liwc_embed.csv")
val_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/validation/comb_val_liwc_embed.csv")

# Use the "embed_" / "liwc_" columns, restricted to those present in both splits.
feature_cols = [col for col in train_df.columns if col.startswith("embed_") or col.startswith("liwc_")]
feature_cols = [col for col in feature_cols if col in val_df.columns]

traits = ["Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Emotional stability"]

# Standardise once for all traits; the scaler is fitted on training rows only
# and then applied unchanged to the validation rows.
scaler = StandardScaler()
X_train = train_df[feature_cols].values
X_test = val_df[feature_cols].values
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Base learners. The SVM needs probability=True so it can take part in soft voting.
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, class_weight="balanced")
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)
svm = SVC(kernel="linear", C=1.0, probability=True, random_state=42, class_weight="balanced")

ensemble = VotingClassifier(
    estimators=[('rf', rf), ('gb', gb), ('mlp', mlp), ('svm', svm)],
    voting='soft',
    n_jobs=-1
)

# Text of all per-trait reports, accumulated and written out at the end.
all_reports = ""

for trait in traits:
    print(f"\n========== Predicting {trait.capitalize()} ==========")

    y_train = train_df[trait].values
    y_test = val_df[trait].values

    # SMOTE
    # Oversampling is done per trait because each trait has its own label
    # distribution. SMOTE synthesises points between a sample and its
    # k nearest same-class neighbours, so k must stay below the size of the
    # rarest class; the default of 5 is capped here so that small classes do
    # not raise a ValueError. With six or more rows per class this is the default.
    smallest_class = int(pd.Series(y_train).value_counts().min())
    smote = SMOTE(random_state=42, k_neighbors=max(1, min(5, smallest_class - 1)))
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

    # Only the training data is resampled; validation rows are left untouched.
    ensemble.fit(X_train_balanced, y_train_balanced)
    y_pred = ensemble.predict(X_test_scaled)

    acc = accuracy_score(y_test, y_pred)
    # zero_division=0 matches classification_report below: a class that is
    # never predicted still scores 0, but without an UndefinedMetricWarning.
    f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)
    report = classification_report(y_test, y_pred, zero_division=0)

    print(f"Accuracy: {acc:.4f}")
    print(f"F1-score (macro): {f1:.4f}")

    all_reports += f"\n\n=== {trait.upper()} ===\n"
    all_reports += f"Accuracy: {acc:.4f}\nF1-score (macro): {f1:.4f}\n"
    all_reports += report

# Write one header line followed by every per-trait report (file is overwritten).
report_path = "/Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/voting_ensemble_all_traits_val_report.txt"
with open(report_path, "w") as f:
    f.write("Classification Reports - Voting Ensemble with SMOTE (Train/Val Split)\n")
    f.write(all_reports)






Accuracy: 0.6250
F1-score (macro): 0.2564





Accuracy: 0.2500
F1-score (macro): 0.2494





Accuracy: 0.3438
F1-score (macro): 0.1705





Accuracy: 0.1562
F1-score (macro): 0.0901





Accuracy: 0.4375
F1-score (macro): 0.2968
✅ Saved ensemble report with SMOTE to /Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/voting_ensemble_all_traits_val_report.txt


In [None]:
from collections import Counter

# Compare how often each label was predicted with how often it really occurs.
# NOTE(review): y_pred and y_test are not defined in this cell; they are
# whatever an earlier cell left in the kernel -- presumably the last trait of
# the evaluation loop in the cell directly above. Confirm before relying on it.
predicted_counts = Counter(y_pred)
true_counts = Counter(y_test)
print("Predicted class distribution:", predicted_counts)
print("True class distribution:", true_counts)

Predicted class distribution: Counter({'medium': 28, 'high': 3, 'low': 1})
True class distribution: Counter({'medium': 15, 'low': 13, 'high': 4})


In [None]:
"""
Logistic Regression Script for predicting personality traits
based on CLS-BERT embeddings + LIWC features.
Evaluates on validation set and saves full report.
"""

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Load the pre-computed train / validation feature tables.
train_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/train/comb_train_liwc_embed.csv")
val_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/validation/comb_val_liwc_embed.csv")

# Use the "embed_" / "liwc_" columns, restricted to those present in both splits.
feature_cols = [col for col in train_df.columns if col.startswith("embed_") or col.startswith("liwc_")]
feature_cols = [col for col in feature_cols if col in val_df.columns]

traits = ["Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Emotional stability"]

# Standardise once for all traits; the scaler is fitted on training rows only
# and then applied unchanged to the validation rows.
scaler = StandardScaler()
X_train = train_df[feature_cols].values
X_val = val_df[feature_cols].values
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Text of all per-trait reports, accumulated and written out at the end.
reports = ""

for trait in traits:
    print(f"\n========== Trait: {trait} ==========")

    y_train = train_df[trait].values
    y_val = val_df[trait].values

    # max_iter is raised to 1000 to give the solver more iterations to converge.
    clf = LogisticRegression(max_iter=1000, random_state=42)
    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_val_scaled)

    acc = accuracy_score(y_val, y_pred)
    # zero_division=0 matches classification_report below: a class that is
    # never predicted still scores 0, but without an UndefinedMetricWarning.
    f1 = f1_score(y_val, y_pred, average="macro", zero_division=0)
    report = classification_report(y_val, y_pred, zero_division=0)

    print(f"Accuracy: {acc:.4f}")
    print(f"F1-score (macro): {f1:.4f}")

    reports += f"\n=== Trait: {trait} ===\n"
    reports += f"Accuracy: {acc:.4f}\n"
    reports += f"F1-score (macro): {f1:.4f}\n"
    reports += report + "\n"

# Write one header line followed by every per-trait report (file is overwritten).
output_path = "/Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/logistic_regression.txt"
with open(output_path, "w") as f:
    f.write("Classification Reports - Logistic Regression (Train/Val Split)\n")
    f.write(reports)


Accuracy: 0.3750
F1-score (macro): 0.3886

Accuracy: 0.2500
F1-score (macro): 0.2619

Accuracy: 0.3750
F1-score (macro): 0.2426

Accuracy: 0.1875
F1-score (macro): 0.1356

Accuracy: 0.4375
F1-score (macro): 0.3889
Saved full trait classification report to /Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/logistic_regression.txt
