In [58]:
'''
SVM Script for predicting the traits separately based on the combined CLS-BERT-embeddings + LIWC-features


'''

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report

df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/train/comb_train_liwc_embed.csv")

X = df[[col for col in df.columns if col.startswith("embed_") or col.startswith("liwc_")]].values

traits = ["Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Emotional stability"]

reports = []

for trait in traits:
    print(f"\n==== Trait: {trait.capitalize()} ====")

    y = df[trait].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    svm = SVC(kernel="linear", C=1.0, random_state=42)
    svm.fit(X_train_scaled, y_train)

    y_pred = svm.predict(X_test_scaled)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    report = classification_report(y_test, y_pred, zero_division=0)

    print("Accuracy:", accuracy)
    print("F1-score (macro):", f1)
    #print("Classification Report:\n", report)

    reports.append((trait, accuracy, f1, report))

with open("/Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/svm_classification_report_all_traits.txt", "w") as f:
    for trait, acc, f1, rep in reports:
        f.write(f"=== Trait: {trait.capitalize()} ===\n")
        f.write(f"Accuracy: {acc:.4f}\n")
        f.write(f"F1-score (macro): {f1:.4f}\n")
        f.write(rep + "\n\n")

print("All SVM classification reports saved to svm_classification_report_all_traits.txt")


==== Trait: Openness ====
Accuracy: 0.4426751592356688
F1-score (macro): 0.33051364916501214

==== Trait: Conscientiousness ====
Accuracy: 0.4012738853503185
F1-score (macro): 0.38495091978108675

==== Trait: Extraversion ====
Accuracy: 0.4745222929936306
F1-score (macro): 0.42963209387027784

==== Trait: Agreeableness ====
Accuracy: 0.3980891719745223
F1-score (macro): 0.36544933762978876

==== Trait: Emotional stability ====
Accuracy: 0.4267515923566879
F1-score (macro): 0.40599718597696355
All SVM classification reports saved to svm_classification_report_all_traits.txt


### Test on real VAL SET

In [61]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report

train_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/train/comb_train_liwc_embed.csv")
val_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/validation/comb_val_liwc_embed.csv")

feature_cols = [col for col in train_df.columns if col.startswith("embed_") or col.startswith("liwc_")]
feature_cols = [col for col in feature_cols if col in val_df.columns]  # Make sure they exist in val_df too

traits = ["Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Emotional stability"]

reports = []

for trait in traits:
    print(f"\n==== Trait: {trait.capitalize()} ====")

    X_train = train_df[feature_cols].values
    y_train = train_df[trait].values

    X_test = val_df[feature_cols].values
    y_test = val_df[trait].values

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    svm = SVC(kernel="linear", C=1.0, random_state=42)
    svm.fit(X_train_scaled, y_train)

    y_pred = svm.predict(X_test_scaled)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    report = classification_report(y_test, y_pred, zero_division=0)

    print("Accuracy:", acc)
    print("F1-score (macro):", f1)

    reports.append((trait, acc, f1, report))

report_path = "/Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/svm_classification_report_all_traits.txt"
with open(report_path, "w") as f:
    for trait, acc, f1, rep in reports:
        f.write(f"=== Trait: {trait.capitalize()} ===\n")
        f.write(f"Accuracy: {acc:.4f}\n")
        f.write(f"F1-score (macro): {f1:.4f}\n")
        f.write(rep + "\n\n")

print(f"Saved all SVM reports to {report_path}")


==== Trait: Openness ====
Accuracy: 0.25
F1-score (macro): 0.22873900293255134

==== Trait: Conscientiousness ====
Accuracy: 0.25
F1-score (macro): 0.25250544662309365

==== Trait: Extraversion ====
Accuracy: 0.40625
F1-score (macro): 0.2905982905982906

==== Trait: Agreeableness ====
Accuracy: 0.1875
F1-score (macro): 0.135632183908046

==== Trait: Emotional stability ====
Accuracy: 0.375
F1-score (macro): 0.33745654458727464
Saved all SVM reports to /Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/svm_classification_report_all_traits.txt


In [63]:
'''
Random Forest Script for predicting the traits separately based on the combined CLS-BERT-embeddings + LIWC-features


'''


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/train/comb_train_liwc_embed.csv")

X = df[[col for col in df.columns if col.startswith("embed_") or col.startswith("liwc_")]].values

traits = ["Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Emotional stability"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

all_reports = ""

for trait in traits:
    print(f"\n========== Predicting {trait.capitalize()} ==========")
    
    y = df[trait].values
    
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
    
    clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    report = classification_report(y_test, y_pred, zero_division=0)
    
    print(f"Accuracy: {acc:.4f}")
    print(f"F1-score (macro): {f1:.4f}")
    #print(report)
    
    all_reports += f"\n\n=== {trait.upper()} ===\n"
    all_reports += f"Accuracy: {acc:.4f}\nF1-score (macro): {f1:.4f}\n"
    all_reports += report

with open("/Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/random_forest_all_traits_report.txt", "w") as f:
    f.write("Classification Reports - Random Forest (All Traits)\n")
    f.write(all_reports)

print("Saved full trait classification report to random_forest_all_traits_report.txt")


Accuracy: 0.5605
F1-score (macro): 0.3536

Accuracy: 0.4522
F1-score (macro): 0.3119

Accuracy: 0.5350
F1-score (macro): 0.3375

Accuracy: 0.4299
F1-score (macro): 0.3353

Accuracy: 0.4363
F1-score (macro): 0.3786
Saved full trait classification report to random_forest_all_traits_report.txt


### Test on the real VAL set

In [64]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

train_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/train/comb_train_liwc_embed.csv")
val_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/validation/comb_val_liwc_embed.csv")

feature_cols = [col for col in train_df.columns if col.startswith("embed_") or col.startswith("liwc_")]
feature_cols = [col for col in feature_cols if col in val_df.columns]

traits = ["Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Emotional stability"]

reports = []

for trait in traits:
    print(f"\n==== Trait: {trait.capitalize()} ====")

    X_train = train_df[feature_cols].values
    y_train = train_df[trait].values

    X_test = val_df[feature_cols].values
    y_test = val_df[trait].values

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    clf.fit(X_train_scaled, y_train)

    y_pred = clf.predict(X_test_scaled)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    report = classification_report(y_test, y_pred, zero_division=0)

    print("Accuracy:", acc)
    print("F1-score (macro):", f1)

    reports.append((trait, acc, f1, report))

report_path = "/Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/random_forest_classification_report_all_traits.txt"
with open(report_path, "w") as f:
    for trait, acc, f1, rep in reports:
        f.write(f"=== Trait: {trait.capitalize()} ===\n")
        f.write(f"Accuracy: {acc:.4f}\n")
        f.write(f"F1-score (macro): {f1:.4f}\n")
        f.write(rep + "\n\n")

print(f"Saved all Random Forest reports to {report_path}")


==== Trait: Openness ====
Accuracy: 0.625
F1-score (macro): 0.25641025641025644

==== Trait: Conscientiousness ====
Accuracy: 0.65625
F1-score (macro): 0.44805194805194803

==== Trait: Extraversion ====
Accuracy: 0.40625
F1-score (macro): 0.35555555555555557

==== Trait: Agreeableness ====
Accuracy: 0.1875
F1-score (macro): 0.20915032679738563

==== Trait: Emotional stability ====
Accuracy: 0.3125
F1-score (macro): 0.2924867724867725
Saved all Random Forest reports to /Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/random_forest_classification_report_all_traits.txt


In [65]:
'''
Naive Bayes Script for predicting the traits separately based on the combined CLS-BERT-embeddings + LIWC-features
'''

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, classification_report

df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/train/comb_train_liwc_embed.csv")

X = df[[col for col in df.columns if col.startswith("embed_") or col.startswith("liwc_")]].values
traits = ["Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Emotional stability"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

all_reports = ""

for trait in traits:
    print(f"\n========== Predicting {trait.capitalize()} ==========")
    
    y = df[trait].values
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
    
    clf = GaussianNB()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    report = classification_report(y_test, y_pred, zero_division=0)
    
    print(f"Accuracy: {acc:.4f}")
    print(f"F1-score (macro): {f1:.4f}")
    #print(report)
    
    all_reports += f"\n\n=== {trait.upper()} ===\n"
    all_reports += f"Accuracy: {acc:.4f}\nF1-score (macro): {f1:.4f}\n"
    all_reports += report

with open("/Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/naive_bayes_all_traits_report.txt", "w") as f:
    f.write("Classification Reports - Naive Bayes (All Traits)\n")
    f.write(all_reports)

print("Saved full trait classification report to naive_bayes_all_traits_report.txt")


Accuracy: 0.3057
F1-score (macro): 0.2277

Accuracy: 0.3408
F1-score (macro): 0.3076

Accuracy: 0.2516
F1-score (macro): 0.2209

Accuracy: 0.3949
F1-score (macro): 0.3033

Accuracy: 0.3312
F1-score (macro): 0.3011
Saved full trait classification report to naive_bayes_all_traits_report.txt


In [66]:
'''
Gradient Boosting Script for predicting the traits separately based on the combined CLS-BERT-embeddings + LIWC-features
'''

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/train/comb_train_liwc_embed.csv")

X = df[[col for col in df.columns if col.startswith("embed_") or col.startswith("liwc_")]].values

traits = ["Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Emotional stability"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

all_reports = ""

for trait in traits:
    print(f"\n========== Predicting {trait.capitalize()} ==========")
    
    y = df[trait].values
    
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
    
    clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    report = classification_report(y_test, y_pred, zero_division=0)
    
    print(f"Accuracy: {acc:.4f}")
    print(f"F1-score (macro): {f1:.4f}")
    #print(report)
    
    all_reports += f"\n\n=== {trait.upper()} ===\n"
    all_reports += f"Accuracy: {acc:.4f}\nF1-score (macro): {f1:.4f}\n"
    all_reports += report

with open("/Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/gradient_boosting_all_traits_report.txt", "w") as f:
    f.write("Classification Reports - Gradient Boosting (All Traits)\n")
    f.write(all_reports)

print("Saved full trait classification report to gradient_boosting_all_traits_report.txt")




Accuracy: 0.4809
F1-score (macro): 0.3044

Accuracy: 0.4427
F1-score (macro): 0.3818

Accuracy: 0.5446
F1-score (macro): 0.3962

Accuracy: 0.4013
F1-score (macro): 0.3387

Accuracy: 0.4331
F1-score (macro): 0.3891
Saved full trait classification report to gradient_boosting_all_traits_report.txt


In [67]:
'''
MLP Classifier Script for predicting the traits separately based on the combined CLS-BERT-embeddings + LIWC-features
'''

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/train/comb_train_liwc_embed.csv")

X = df[[col for col in df.columns if col.startswith("embed_") or col.startswith("liwc_")]].values

traits = ["Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Emotional stability"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

all_reports = ""

for trait in traits:
    print(f"\n========== Predicting {trait.capitalize()} ==========")
    
    y = df[trait].values
    
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
    
    clf = MLPClassifier(hidden_layer_sizes=(128,), max_iter=300, random_state=42)
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    report = classification_report(y_test, y_pred, zero_division=0)
    
    print(f"Accuracy: {acc:.4f}")
    print(f"F1-score (macro): {f1:.4f}")
    #print(report)
    
    all_reports += f"\n\n=== {trait.upper()} ===\n"
    all_reports += f"Accuracy: {acc:.4f}\nF1-score (macro): {f1:.4f}\n"
    all_reports += report

with open("/Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/mlp_all_traits_report.txt", "w") as f:
    f.write("Classification Reports - MLP Classifier (All Traits)\n")
    f.write(all_reports)

print("Saved full trait classification report to mlp_all_traits_report.txt")


Accuracy: 0.4713
F1-score (macro): 0.3739

Accuracy: 0.3885
F1-score (macro): 0.3524

Accuracy: 0.5064
F1-score (macro): 0.4423

Accuracy: 0.3981
F1-score (macro): 0.3711

Accuracy: 0.4268
F1-score (macro): 0.4113
Saved full trait classification report to mlp_all_traits_report.txt


In [None]:
'''
Voting Ensemble Classifier (Random Forest + Gradient Boosting + MLP)
for predicting all Big Five traits
'''
############################################################GOOD RESULTS TILL NOW
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/train/comb_train_liwc_embed.csv")

X = df[[col for col in df.columns if col.startswith("embed_") or col.startswith("liwc_")]].values
traits = ["Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Emotional stability"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)

ensemble = VotingClassifier(
    estimators=[
        ('rf', rf),
        ('gb', gb),
        ('mlp', mlp)
    ],
    voting='soft',  # hard was also tried; the results with softs were better except for agreableness
    n_jobs=-1
)

all_reports = ""

for trait in traits:
    print(f"\n========== Predicting {trait.capitalize()} ==========")

    y = df[trait].values

    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    ensemble.fit(X_train, y_train)
    y_pred = ensemble.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    report = classification_report(y_test, y_pred, zero_division=0)

    print(f"Accuracy: {acc:.4f}")
    print(f"F1-score (macro): {f1:.4f}")
    #print(report)

    all_reports += f"\n\n=== {trait.upper()} ===\n"
    all_reports += f"Accuracy: {acc:.4f}\nF1-score (macro): {f1:.4f}\n"
    all_reports += report

with open("/Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/voting_ensemble_all_traits_report.txt", "w") as f:
    f.write("Classification Reports - Voting Ensemble (All Traits)\n")
    f.write(all_reports)

print("Saved report to voting_ensemble_all_traits_report.txt")


Accuracy: 0.4968
F1-score (macro): 0.3575

Accuracy: 0.4331
F1-score (macro): 0.3785

Accuracy: 0.5701
F1-score (macro): 0.4751

Accuracy: 0.3822
F1-score (macro): 0.3269

Accuracy: 0.4268
F1-score (macro): 0.4054
Saved report to voting_ensemble_all_traits_report.txt


In [79]:
### LIKE ABOVE BUT WITH SMOTE

'''
Voting Ensemble Classifier (Random Forest + Gradient Boosting + MLP) with SMOTE
for predicting all Big Five traits
'''

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE
from collections import Counter

df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/train/comb_train_liwc_embed.csv")

X = df[[col for col in df.columns if col.startswith("embed_") or col.startswith("liwc_")]].values
traits = ["Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Emotional stability"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)

ensemble = VotingClassifier(
    estimators=[('rf', rf), ('gb', gb), ('mlp', mlp)],
    voting='soft',
    n_jobs=-1
)

all_reports = ""

for trait in traits:
    print(f"\n========== Predicting {trait.capitalize()} ==========")

    y = df[trait].values
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

    # Apply SMOTE to training split
    smote = SMOTE(random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

    print("SMOTE class distribution:", Counter(y_train_res))

    ensemble.fit(X_train_res, y_train_res)
    y_pred = ensemble.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    report = classification_report(y_test, y_pred, zero_division=0)

    print(f"Accuracy: {acc:.4f}")
    print(f"F1-score (macro): {f1:.4f}")

    all_reports += f"\n\n=== {trait.upper()} ===\n"
    all_reports += f"Accuracy: {acc:.4f}\nF1-score (macro): {f1:.4f}\n"
    all_reports += report

with open("/Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/voting_ensemble_all_traits_report_with_smote.txt", "w") as f:
    f.write("Classification Reports - Voting Ensemble with SMOTE (All Traits)\n")
    f.write(all_reports)

print("✅ Saved report to voting_ensemble_all_traits_report_with_smote.txt")



SMOTE class distribution: Counter({'low': 669, 'high': 669, 'medium': 669})




Accuracy: 0.4586
F1-score (macro): 0.3756

SMOTE class distribution: Counter({'low': 599, 'medium': 599, 'high': 599})




Accuracy: 0.4268
F1-score (macro): 0.3800

SMOTE class distribution: Counter({'low': 658, 'medium': 658, 'high': 658})




Accuracy: 0.4586
F1-score (macro): 0.3805

SMOTE class distribution: Counter({'low': 573, 'medium': 573, 'high': 573})




Accuracy: 0.3981
F1-score (macro): 0.3572

SMOTE class distribution: Counter({'medium': 478, 'low': 478, 'high': 478})




Accuracy: 0.4076
F1-score (macro): 0.3877
✅ Saved report to voting_ensemble_all_traits_report_with_smote.txt


In [70]:
"""
Voting Ensemble Classifier (Random Forest + Gradient Boosting + MLP)
Train on train set and evaluate on validation set
"""

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# --- Load Train & Validation Sets ---
train_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/combined_author_embeddings_with_liwc_labeled.csv")
val_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/val_embeddings_with_liwc_labeled.csv")

feature_cols = [col for col in train_df.columns if col.startswith("embed_") or col.startswith("liwc_")]
feature_cols = [col for col in feature_cols if col in val_df.columns]  # Ensure compatibility

traits = ["openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism"]

scaler = StandardScaler()
X_train = train_df[feature_cols].values
X_test = val_df[feature_cols].values
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Base Classifiers ---
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)

ensemble = VotingClassifier(
    estimators=[('rf', rf), ('gb', gb), ('mlp', mlp)],
    voting='soft',
    n_jobs=-1
)

all_reports = ""

for trait in traits:
    print(f"\n========== Predicting {trait.capitalize()} ==========")

    y_train = train_df[trait].values
    y_test = val_df[trait].values

    ensemble.fit(X_train_scaled, y_train)
    y_pred = ensemble.predict(X_test_scaled)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    report = classification_report(y_test, y_pred, zero_division=0)

    print(f"Accuracy: {acc:.4f}")
    print(f"F1-score (macro): {f1:.4f}")

    all_reports += f"\n\n=== {trait.upper()} ===\n"
    all_reports += f"Accuracy: {acc:.4f}\nF1-score (macro): {f1:.4f}\n"
    all_reports += report

report_path = "/Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/voting_ensemble_all_traits_val_report.txt"
with open(report_path, "w") as f:
    f.write("Classification Reports - Voting Ensemble (Train/Val Split)\n")
    f.write(all_reports)

print(f"Saved ensemble report to {report_path}")

FileNotFoundError: [Errno 2] No such file or directory: '/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/combined_author_embeddings_with_liwc_labeled.csv'

In [71]:
"""
Voting Ensemble Classifier (Random Forest + Gradient Boosting + MLP + SVM)
Train on train set and evaluate on validation set
"""

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report

train_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/train/comb_train_liwc_embed.csv")
val_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/validation/comb_val_liwc_embed.csv")

feature_cols = [col for col in train_df.columns if col.startswith("embed_") or col.startswith("liwc_")]
feature_cols = [col for col in feature_cols if col in val_df.columns]  # Ensure compatibility

traits = ["Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Emotional stability"]

scaler = StandardScaler()
X_train = train_df[feature_cols].values
X_test = val_df[feature_cols].values
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, class_weight="balanced")
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)  # No built-in class_weight
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)  # Doesn't support class_weight directly
svm = SVC(kernel="linear", C=1.0, probability=True, random_state=42, class_weight="balanced")

ensemble = VotingClassifier(
    estimators=[('rf', rf), ('gb', gb), ('mlp', mlp), ('svm', svm)],
    voting='soft',
    n_jobs=-1
)

all_reports = ""

for trait in traits:
    print(f"\n========== Predicting {trait.capitalize()} ==========")

    y_train = train_df[trait].values
    y_test = val_df[trait].values

    ensemble.fit(X_train_scaled, y_train)
    y_pred = ensemble.predict(X_test_scaled)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    report = classification_report(y_test, y_pred, zero_division=0)

    print(f"Accuracy: {acc:.4f}")
    print(f"F1-score (macro): {f1:.4f}")

    all_reports += f"\n\n=== {trait.upper()} ===\n"
    all_reports += f"Accuracy: {acc:.4f}\nF1-score (macro): {f1:.4f}\n"
    all_reports += report

report_path = "/Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/voting_ensemble_all_traits_val_report.txt"
with open(report_path, "w") as f:
    f.write("Classification Reports - Voting Ensemble (Train/Val Split)\n")
    f.write(all_reports)

print(f"Saved ensemble report to {report_path}")


Accuracy: 0.5625
F1-score (macro): 0.2400

Accuracy: 0.3750
F1-score (macro): 0.3496

Accuracy: 0.3750
F1-score (macro): 0.1818

Accuracy: 0.1875
F1-score (macro): 0.1193

Accuracy: 0.4688
F1-score (macro): 0.2128
Saved ensemble report to /Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/voting_ensemble_all_traits_val_report.txt


### EMsembles with SMOTE

In [73]:
"""
Voting Ensemble Classifier (Random Forest + Gradient Boosting + MLP + SVM)
with SMOTE applied separately for each personality trait.
"""

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE

train_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/train/comb_train_liwc_embed.csv")
val_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/validation/comb_val_liwc_embed.csv")

feature_cols = [col for col in train_df.columns if col.startswith("embed_") or col.startswith("liwc_")]
feature_cols = [col for col in feature_cols if col in val_df.columns]  # Ensure compatibility

traits = ["Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Emotional stability"]

# Scale features
scaler = StandardScaler()
X_train = train_df[feature_cols].values
X_test = val_df[feature_cols].values
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define base classifiers
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, class_weight="balanced")
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)
svm = SVC(kernel="linear", C=1.0, probability=True, random_state=42, class_weight="balanced")

ensemble = VotingClassifier(
    estimators=[('rf', rf), ('gb', gb), ('mlp', mlp), ('svm', svm)],
    voting='soft',
    n_jobs=-1
)

all_reports = ""

for trait in traits:
    print(f"\n========== Predicting {trait.capitalize()} ==========")

    y_train = train_df[trait].values
    y_test = val_df[trait].values

    # Apply SMOTE to the scaled data
    smote = SMOTE(random_state=42)
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

    # Fit ensemble
    ensemble.fit(X_train_balanced, y_train_balanced)
    y_pred = ensemble.predict(X_test_scaled)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    report = classification_report(y_test, y_pred, zero_division=0)

    print(f"Accuracy: {acc:.4f}")
    print(f"F1-score (macro): {f1:.4f}")

    all_reports += f"\n\n=== {trait.upper()} ===\n"
    all_reports += f"Accuracy: {acc:.4f}\nF1-score (macro): {f1:.4f}\n"
    all_reports += report

# Save report
report_path = "/Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/voting_ensemble_all_traits_val_report.txt"
with open(report_path, "w") as f:
    f.write("Classification Reports - Voting Ensemble with SMOTE (Train/Val Split)\n")
    f.write(all_reports)

print(f"✅ Saved ensemble report with SMOTE to {report_path}")






Accuracy: 0.6250
F1-score (macro): 0.2564





Accuracy: 0.2500
F1-score (macro): 0.2494





Accuracy: 0.3438
F1-score (macro): 0.1705





Accuracy: 0.1562
F1-score (macro): 0.0901





Accuracy: 0.4375
F1-score (macro): 0.2968
✅ Saved ensemble report with SMOTE to /Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/voting_ensemble_all_traits_val_report.txt


In [74]:
print("Before SMOTE:", pd.Series(y_train).value_counts().to_dict())
print("After SMOTE: ", pd.Series(y_train_balanced).value_counts().to_dict())

Before SMOTE: {'low': 598, 'high': 569, 'medium': 401}
After SMOTE:  {'high': 598, 'medium': 598, 'low': 598}


In [77]:
print(set(y_train)) == print(set(y_test))  # From training data
   # From validation data

{'low', 'medium', 'high'}
{'medium', 'high', 'low'}


True

In [76]:
from collections import Counter

# After predictions:
print("Predicted class distribution:", Counter(y_pred))
print("True class distribution:", Counter(y_test))

Predicted class distribution: Counter({'medium': 28, 'high': 3, 'low': 1})
True class distribution: Counter({'medium': 15, 'low': 13, 'high': 4})


In [78]:
from collections import Counter
print("SMOTE class distribution:", Counter(y_res))

NameError: name 'y_res' is not defined