In [14]:
'''
SVM Script for predicting the traits separately based on the combined CLS-BERT-embeddings + LIWC-features


'''

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report

df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/combined_author_embeddings_with_liwc_labeled.csv")

X = df[[col for col in df.columns if col.startswith("embed_") or col.startswith("liwc_")]].values

traits = ["openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism"]

reports = []

for trait in traits:
    print(f"\n==== Trait: {trait.capitalize()} ====")

    y = df[trait].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    svm = SVC(kernel="linear", C=1.0, random_state=42)
    svm.fit(X_train_scaled, y_train)

    y_pred = svm.predict(X_test_scaled)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    report = classification_report(y_test, y_pred, zero_division=0)

    print("Accuracy:", accuracy)
    print("F1-score (macro):", f1)
    #print("Classification Report:\n", report)

    reports.append((trait, accuracy, f1, report))

with open("/Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/svm_classification_report_all_traits.txt", "w") as f:
    for trait, acc, f1, rep in reports:
        f.write(f"=== Trait: {trait.capitalize()} ===\n")
        f.write(f"Accuracy: {acc:.4f}\n")
        f.write(f"F1-score (macro): {f1:.4f}\n")
        f.write(rep + "\n\n")

print("All SVM classification reports saved to svm_classification_report_all_traits.txt")


==== Trait: Openness ====
Accuracy: 0.4426751592356688
F1-score (macro): 0.3575830567902518

==== Trait: Conscientiousness ====
Accuracy: 0.4267515923566879
F1-score (macro): 0.4098462519212225

==== Trait: Extraversion ====
Accuracy: 0.4840764331210191
F1-score (macro): 0.4415439912486326

==== Trait: Agreeableness ====
Accuracy: 0.3853503184713376
F1-score (macro): 0.36094941808577136

==== Trait: Neuroticism ====
Accuracy: 0.4140127388535032
F1-score (macro): 0.396973945485271
All SVM classification reports saved to svm_classification_report_all_traits.txt


### Test on real VAL SET

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report

train_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/combined_author_embeddings_with_liwc_labeled.csv")
val_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/val_embeddings_with_liwc_labeled.csv")

feature_cols = [col for col in train_df.columns if col.startswith("embed_") or col.startswith("liwc_")]
feature_cols = [col for col in feature_cols if col in val_df.columns]  # Make sure they exist in val_df too

traits = ["openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism"]

reports = []

for trait in traits:
    print(f"\n==== Trait: {trait.capitalize()} ====")

    X_train = train_df[feature_cols].values
    y_train = train_df[trait].values

    X_test = val_df[feature_cols].values
    y_test = val_df[trait].values

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    svm = SVC(kernel="linear", C=1.0, random_state=42)
    svm.fit(X_train_scaled, y_train)

    y_pred = svm.predict(X_test_scaled)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    report = classification_report(y_test, y_pred, zero_division=0)

    print("Accuracy:", acc)
    print("F1-score (macro):", f1)

    reports.append((trait, acc, f1, report))

report_path = "/Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/svm_classification_report_all_traits.txt"
with open(report_path, "w") as f:
    for trait, acc, f1, rep in reports:
        f.write(f"=== Trait: {trait.capitalize()} ===\n")
        f.write(f"Accuracy: {acc:.4f}\n")
        f.write(f"F1-score (macro): {f1:.4f}\n")
        f.write(rep + "\n\n")

print(f"✅ Saved all SVM reports to {report_path}")


==== Trait: Openness ====
Accuracy: 0.25
F1-score (macro): 0.20000000000000004

==== Trait: Conscientiousness ====
Accuracy: 0.17857142857142858
F1-score (macro): 0.16323417238749047

==== Trait: Extraversion ====
Accuracy: 0.32142857142857145
F1-score (macro): 0.28777777777777774

==== Trait: Agreeableness ====
Accuracy: 0.25
F1-score (macro): 0.18219461697722564

==== Trait: Neuroticism ====
Accuracy: 0.4642857142857143
F1-score (macro): 0.45131652661064425
✅ Saved all SVM reports to /Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/svm_classification_report_all_traits.txt


In [15]:
'''
Random Forest Script for predicting the traits separately based on the combined CLS-BERT-embeddings + LIWC-features


'''


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/combined_author_embeddings_with_liwc_labeled.csv")

X = df[[col for col in df.columns if col.startswith("embed_") or col.startswith("liwc_")]].values

traits = ["openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

all_reports = ""

for trait in traits:
    print(f"\n========== Predicting {trait.capitalize()} ==========")
    
    y = df[trait].values
    
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
    
    clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    report = classification_report(y_test, y_pred, zero_division=0)
    
    print(f"Accuracy: {acc:.4f}")
    print(f"F1-score (macro): {f1:.4f}")
    #print(report)
    
    all_reports += f"\n\n=== {trait.upper()} ===\n"
    all_reports += f"Accuracy: {acc:.4f}\nF1-score (macro): {f1:.4f}\n"
    all_reports += report

with open("/Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/random_forest_all_traits_report.txt", "w") as f:
    f.write("Classification Reports - Random Forest (All Traits)\n")
    f.write(all_reports)

print("Saved full trait classification report to random_forest_all_traits_report.txt")


Accuracy: 0.5127
F1-score (macro): 0.2938

Accuracy: 0.4427
F1-score (macro): 0.2882

Accuracy: 0.5318
F1-score (macro): 0.3124

Accuracy: 0.4395
F1-score (macro): 0.3338

Accuracy: 0.3885
F1-score (macro): 0.3294
Saved full trait classification report to random_forest_all_traits_report.txt


### Test on the real VAL set

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

train_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/combined_author_embeddings_with_liwc_labeled.csv")
val_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/val_embeddings_with_liwc_labeled.csv")

feature_cols = [col for col in train_df.columns if col.startswith("embed_") or col.startswith("liwc_")]
feature_cols = [col for col in feature_cols if col in val_df.columns]

traits = ["openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism"]

reports = []

for trait in traits:
    print(f"\n==== Trait: {trait.capitalize()} ====")

    X_train = train_df[feature_cols].values
    y_train = train_df[trait].values

    X_test = val_df[feature_cols].values
    y_test = val_df[trait].values

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    clf.fit(X_train_scaled, y_train)

    y_pred = clf.predict(X_test_scaled)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    report = classification_report(y_test, y_pred, zero_division=0)

    print("Accuracy:", acc)
    print("F1-score (macro):", f1)

    reports.append((trait, acc, f1, report))

report_path = "/Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/random_forest_classification_report_all_traits.txt"
with open(report_path, "w") as f:
    for trait, acc, f1, rep in reports:
        f.write(f"=== Trait: {trait.capitalize()} ===\n")
        f.write(f"Accuracy: {acc:.4f}\n")
        f.write(f"F1-score (macro): {f1:.4f}\n")
        f.write(rep + "\n\n")

print(f"Saved all Random Forest reports to {report_path}")


==== Trait: Openness ====
Accuracy: 0.6428571428571429
F1-score (macro): 0.2608695652173913

==== Trait: Conscientiousness ====
Accuracy: 0.6071428571428571
F1-score (macro): 0.2518518518518518

==== Trait: Extraversion ====
Accuracy: 0.42857142857142855
F1-score (macro): 0.3125

==== Trait: Agreeableness ====
Accuracy: 0.17857142857142858
F1-score (macro): 0.25268817204301075

==== Trait: Neuroticism ====
Accuracy: 0.4642857142857143
F1-score (macro): 0.3712962962962963
Saved all Random Forest reports to /Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/random_forest_classification_report_all_traits.txt


In [16]:
'''
Naive Bayes Script for predicting the traits separately based on the combined CLS-BERT-embeddings + LIWC-features
'''

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, classification_report

df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/combined_author_embeddings_with_liwc_labeled.csv")

X = df[[col for col in df.columns if col.startswith("embed_") or col.startswith("liwc_")]].values
traits = ["openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

all_reports = ""

for trait in traits:
    print(f"\n========== Predicting {trait.capitalize()} ==========")
    
    y = df[trait].values
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
    
    clf = GaussianNB()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    report = classification_report(y_test, y_pred, zero_division=0)
    
    print(f"Accuracy: {acc:.4f}")
    print(f"F1-score (macro): {f1:.4f}")
    #print(report)
    
    all_reports += f"\n\n=== {trait.upper()} ===\n"
    all_reports += f"Accuracy: {acc:.4f}\nF1-score (macro): {f1:.4f}\n"
    all_reports += report

with open("/Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/naive_bayes_all_traits_report.txt", "w") as f:
    f.write("Classification Reports - Naive Bayes (All Traits)\n")
    f.write(all_reports)

print("Saved full trait classification report to naive_bayes_all_traits_report.txt")


Accuracy: 0.3025
F1-score (macro): 0.2226

Accuracy: 0.3153
F1-score (macro): 0.2732

Accuracy: 0.2707
F1-score (macro): 0.2332

Accuracy: 0.4076
F1-score (macro): 0.3026

Accuracy: 0.3153
F1-score (macro): 0.2757
Saved full trait classification report to naive_bayes_all_traits_report.txt


In [17]:
'''
Gradient Boosting Script for predicting the traits separately based on the combined CLS-BERT-embeddings + LIWC-features
'''

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/combined_author_embeddings_with_liwc_labeled.csv")

X = df[[col for col in df.columns if col.startswith("embed_") or col.startswith("liwc_")]].values

traits = ["openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

all_reports = ""

for trait in traits:
    print(f"\n========== Predicting {trait.capitalize()} ==========")
    
    y = df[trait].values
    
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
    
    clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    report = classification_report(y_test, y_pred, zero_division=0)
    
    print(f"Accuracy: {acc:.4f}")
    print(f"F1-score (macro): {f1:.4f}")
    #print(report)
    
    all_reports += f"\n\n=== {trait.upper()} ===\n"
    all_reports += f"Accuracy: {acc:.4f}\nF1-score (macro): {f1:.4f}\n"
    all_reports += report

with open("/Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/gradient_boosting_all_traits_report.txt", "w") as f:
    f.write("Classification Reports - Gradient Boosting (All Traits)\n")
    f.write(all_reports)

print("Saved full trait classification report to gradient_boosting_all_traits_report.txt")




Accuracy: 0.5000
F1-score (macro): 0.3328

Accuracy: 0.4013
F1-score (macro): 0.3079

Accuracy: 0.5382
F1-score (macro): 0.4103

Accuracy: 0.4268
F1-score (macro): 0.3527

Accuracy: 0.4076
F1-score (macro): 0.3615
Saved full trait classification report to gradient_boosting_all_traits_report.txt


In [18]:
'''
MLP Classifier Script for predicting the traits separately based on the combined CLS-BERT-embeddings + LIWC-features
'''

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/combined_author_embeddings_with_liwc_labeled.csv")

X = df[[col for col in df.columns if col.startswith("embed_") or col.startswith("liwc_")]].values

traits = ["openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

all_reports = ""

for trait in traits:
    print(f"\n========== Predicting {trait.capitalize()} ==========")
    
    y = df[trait].values
    
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
    
    clf = MLPClassifier(hidden_layer_sizes=(128,), max_iter=300, random_state=42)
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    report = classification_report(y_test, y_pred, zero_division=0)
    
    print(f"Accuracy: {acc:.4f}")
    print(f"F1-score (macro): {f1:.4f}")
    #print(report)
    
    all_reports += f"\n\n=== {trait.upper()} ===\n"
    all_reports += f"Accuracy: {acc:.4f}\nF1-score (macro): {f1:.4f}\n"
    all_reports += report

with open("/Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/mlp_all_traits_report.txt", "w") as f:
    f.write("Classification Reports - MLP Classifier (All Traits)\n")
    f.write(all_reports)

print("Saved full trait classification report to mlp_all_traits_report.txt")


Accuracy: 0.4331
F1-score (macro): 0.3362

Accuracy: 0.4076
F1-score (macro): 0.3713

Accuracy: 0.5000
F1-score (macro): 0.4335

Accuracy: 0.4013
F1-score (macro): 0.3719

Accuracy: 0.4045
F1-score (macro): 0.3815
Saved full trait classification report to mlp_all_traits_report.txt


In [None]:
'''
Voting Ensemble Classifier (Random Forest + Gradient Boosting + MLP)
for predicting all Big Five traits
'''

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/combined_author_embeddings_with_liwc_labeled.csv")

X = df[[col for col in df.columns if col.startswith("embed_") or col.startswith("liwc_")]].values
traits = ["openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)

ensemble = VotingClassifier(
    estimators=[
        ('rf', rf),
        ('gb', gb),
        ('mlp', mlp)
    ],
    voting='soft',  # hard was also tried; the results with softs were better except for agreableness
    n_jobs=-1
)

all_reports = ""

for trait in traits:
    print(f"\n========== Predicting {trait.capitalize()} ==========")

    y = df[trait].values

    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    ensemble.fit(X_train, y_train)
    y_pred = ensemble.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    report = classification_report(y_test, y_pred, zero_division=0)

    print(f"Accuracy: {acc:.4f}")
    print(f"F1-score (macro): {f1:.4f}")
    #print(report)

    all_reports += f"\n\n=== {trait.upper()} ===\n"
    all_reports += f"Accuracy: {acc:.4f}\nF1-score (macro): {f1:.4f}\n"
    all_reports += report

with open("/Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/voting_ensemble_all_traits_report.txt", "w") as f:
    f.write("Classification Reports - Voting Ensemble (All Traits)\n")
    f.write(all_reports)

print("Saved report to voting_ensemble_all_traits_report.txt")


Accuracy: 0.5159
F1-score (macro): 0.3163

Accuracy: 0.4140
F1-score (macro): 0.3143

Accuracy: 0.5287
F1-score (macro): 0.3854

Accuracy: 0.4395
F1-score (macro): 0.3583

Accuracy: 0.4013
F1-score (macro): 0.3470
Saved report to voting_ensemble_all_traits_report.txt


In [None]:
"""
Voting Ensemble Classifier (Random Forest + Gradient Boosting + MLP)
Train on train set and evaluate on validation set
"""

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

train_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/combined_author_embeddings_with_liwc_labeled.csv")
val_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/val_embeddings_with_liwc_labeled.csv")

feature_cols = [col for col in train_df.columns if col.startswith("embed_") or col.startswith("liwc_")]
feature_cols = [col for col in feature_cols if col in val_df.columns]  # Ensure compatibility

traits = ["openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism"]

scaler = StandardScaler()
X_train = train_df[feature_cols].values
X_test = val_df[feature_cols].values
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Base Classifiers ---
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)

ensemble = VotingClassifier(
    estimators=[('rf', rf), ('gb', gb), ('mlp', mlp)],
    voting='soft',
    n_jobs=-1
)

all_reports = ""

for trait in traits:
    print(f"\n========== Predicting {trait.capitalize()} ==========")

    y_train = train_df[trait].values
    y_test = val_df[trait].values

    ensemble.fit(X_train_scaled, y_train)
    y_pred = ensemble.predict(X_test_scaled)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    report = classification_report(y_test, y_pred, zero_division=0)

    print(f"Accuracy: {acc:.4f}")
    print(f"F1-score (macro): {f1:.4f}")

    all_reports += f"\n\n=== {trait.upper()} ===\n"
    all_reports += f"Accuracy: {acc:.4f}\nF1-score (macro): {f1:.4f}\n"
    all_reports += report

report_path = "/Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/voting_ensemble_all_traits_val_report.txt"
with open(report_path, "w") as f:
    f.write("Classification Reports - Voting Ensemble (Train/Val Split)\n")
    f.write(all_reports)

print(f"Saved ensemble report to {report_path}")


Accuracy: 0.5357
F1-score (macro): 0.2381

Accuracy: 0.3214
F1-score (macro): 0.3354

Accuracy: 0.4643
F1-score (macro): 0.3727

Accuracy: 0.2143
F1-score (macro): 0.1622

Accuracy: 0.4643
F1-score (macro): 0.4657
Saved ensemble report to /Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/voting_ensemble_all_traits_val_report.txt


In [None]:
"""
Voting Ensemble Classifier (Random Forest + Gradient Boosting + MLP + SVM)
Train on train set and evaluate on validation set
"""

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report

train_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/combined_author_embeddings_with_liwc_labeled.csv")
val_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/val_embeddings_with_liwc_labeled.csv")

feature_cols = [col for col in train_df.columns if col.startswith("embed_") or col.startswith("liwc_")]
feature_cols = [col for col in feature_cols if col in val_df.columns]  # Ensure compatibility

traits = ["openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism"]

scaler = StandardScaler()
X_train = train_df[feature_cols].values
X_test = val_df[feature_cols].values
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, class_weight="balanced")
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)  # No built-in class_weight
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)  # Doesn't support class_weight directly
svm = SVC(kernel="linear", C=1.0, probability=True, random_state=42, class_weight="balanced")

ensemble = VotingClassifier(
    estimators=[('rf', rf), ('gb', gb), ('mlp', mlp), ('svm', svm)],
    voting='soft',
    n_jobs=-1
)

all_reports = ""

for trait in traits:
    print(f"\n========== Predicting {trait.capitalize()} ==========")

    y_train = train_df[trait].values
    y_test = val_df[trait].values

    ensemble.fit(X_train_scaled, y_train)
    y_pred = ensemble.predict(X_test_scaled)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    report = classification_report(y_test, y_pred, zero_division=0)

    print(f"Accuracy: {acc:.4f}")
    print(f"F1-score (macro): {f1:.4f}")

    all_reports += f"\n\n=== {trait.upper()} ===\n"
    all_reports += f"Accuracy: {acc:.4f}\nF1-score (macro): {f1:.4f}\n"
    all_reports += report

report_path = "/Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/voting_ensemble_all_traits_val_report.txt"
with open(report_path, "w") as f:
    f.write("Classification Reports - Voting Ensemble (Train/Val Split)\n")
    f.write(all_reports)

print(f"Saved ensemble report to {report_path}")


Accuracy: 0.6071
F1-score (macro): 0.2576

Accuracy: 0.3929
F1-score (macro): 0.3934

Accuracy: 0.4643
F1-score (macro): 0.3727

Accuracy: 0.1786
F1-score (macro): 0.1255

Accuracy: 0.4643
F1-score (macro): 0.4657
✅ Saved ensemble report to /Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/voting_ensemble_all_traits_val_report.txt


### K-FOLD CV