In [None]:
'''
>>> Checking the label distribution in the train set to find bias in our Random Forest Classifier

'''


import pandas as pd
# Load only the training split: this cell inspects the training labels, and the
# cells that need the validation split read it themselves.
train_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/train/comb_train_liwc_embed.csv")

# Print one value_counts() table per trait column. The blank-line separator
# keeps each table's trailing "Name: count, dtype: int64" footer from running
# into the next trait's header, which is what print()'s default single-space
# separator produced.
trait_columns = ["Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Emotional stability"]
print(*(train_df[trait].value_counts() for trait in trait_columns), sep="\n\n")

Openness
high      837
medium    445
low       286
Name: count, dtype: int64 Conscientiousness
low       749
medium    443
high      376
Name: count, dtype: int64 Extraversion
low       823
medium    408
high      337
Name: count, dtype: int64 Agreeableness
low       716
high      433
medium    419
Name: count, dtype: int64 Emotional stability
low       598
high      569
medium    401
Name: count, dtype: int64


In [None]:
'''
>>> Checking the label distribution in the validation set to find bias in our Random Forest Classifier

'''

import pandas as pd
# Load only the validation split: this cell inspects the validation labels and
# never reads the training frame.
val_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/validation/comb_val_liwc_embed.csv")

# Print one value_counts() table per trait column. The blank-line separator
# keeps each table's trailing "Name: count, dtype: int64" footer from running
# into the next trait's header, which is what print()'s default single-space
# separator produced.
trait_columns = ["Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Emotional stability"]
print(*(val_df[trait].value_counts() for trait in trait_columns), sep="\n\n")

Openness
high      20
low        8
medium     4
Name: count, dtype: int64 Conscientiousness
low       20
high       6
medium     6
Name: count, dtype: int64 Extraversion
low       13
high      10
medium     9
Name: count, dtype: int64 Agreeableness
high      24
low        5
medium     3
Name: count, dtype: int64 Emotional stability
medium    15
low       13
high       4
Name: count, dtype: int64


In [None]:
'''
>>> Script for manual experimenting with the hyperparameters for each trait classifier.
>>> Because the label distribution for each trait is skewed, and the dominant labels in some cases
differ between train and test, GridSearchCV does not help the training generalize to the job interview data.
After experimenting with GridSearchCV, we found that the best way to build a "fair" classifier
that generalizes to unseen data is to experiment with the hyperparameters manually
and compare the results in the classification report.
>>> There are 5 rf_classifier being trained separately for each trait. Depending on label distribution
in train and test, there are different effects of n_estimators and max_depth for each trait classifier.
>>> The hyperparameters in the script are the ones with highest accuracy and f1-score.
>>> To accomplish this task, we also looked at bias in predictions. 
>>> Therefore, some hyperparameters that achieved very high accuracy and f1-score were not used,
because they were biased towards the dominant labels; since these labels make up a very high percentage of
all labels in the val set, the bias was evident. Hence, the best parameters are not necessarily the ones
with the highest scores, but the ones with the highest scores within a "fair" and "scientific" research framework.

'''

from pathlib import Path

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler

# Read the validation and training splits from their CSV files.
val_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/validation/comb_val_liwc_embed.csv")
train_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/train/comb_train_liwc_embed.csv")

# Feature columns: every training column whose name starts with "embed_" or
# "liwc_" and that also exists in the validation frame, in training-column order.
feature_cols = [
    col
    for col in train_df.columns
    if col.startswith(("embed_", "liwc_")) and col in val_df.columns
]

# Label columns; one classifier is trained per entry.
traits = [
    "Openness",
    "Conscientiousness",
    "Extraversion",
    "Agreeableness",
    "Emotional stability",
]

# Per-trait Random Forest settings, keyed by the trait's label column name.
# Each row below is (trait, n_estimators, max_depth); max_depth=None leaves the
# tree depth unbounded.
trait_params = {
    name: {"n_estimators": n_trees, "max_depth": depth}
    for name, n_trees, depth in (
        ("Openness", 14, 30),
        ("Conscientiousness", 20, None),
        ("Extraversion", 80, 30),
        ("Agreeableness", 10, 12),
        ("Emotional stability", 50, 7),
    )
}

# Train one Random Forest per trait and score it on the validation split.
# `reports` collects one (trait, accuracy, macro-F1, classification-report text)
# tuple per trait.
reports = []

# The feature matrices and their scaling do not depend on the trait, so they
# are built once here instead of being recomputed on every loop iteration.
X_train = train_df[feature_cols].values
X_test = val_df[feature_cols].values

# Fit the scaler on the training features only, then apply that same transform
# to the validation features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

for trait in traits:
    print(f"\n==== Trait: {trait.capitalize()} ====")

    # getting parameters for current trait (or get defaults)
    params = trait_params.get(trait, {"n_estimators": 100, "max_depth": None})

    # Only the label vectors change from trait to trait.
    y_train = train_df[trait].values
    y_test = val_df[trait].values

    # the classifier (fixed random_state so repeated runs build the same forest)
    clf = RandomForestClassifier(
        n_estimators=params["n_estimators"],
        max_depth=params["max_depth"],
        random_state=42,
        n_jobs=-1,
    )
    clf.fit(X_train_scaled, y_train)

    y_pred = clf.predict(X_test_scaled)

    acc = accuracy_score(y_test, y_pred)
    # zero_division=0 makes the handling of undefined per-class scores explicit
    # and consistent with classification_report below. The default ("warn")
    # also reports them as 0, so the value is unchanged; this only avoids the
    # UndefinedMetricWarning.
    f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)
    report = classification_report(y_test, y_pred, zero_division=0)

    print("Accuracy:", acc)
    print("F1-score (macro):", f1)

    reports.append((trait, acc, f1, report))

# Write every trait's accuracy, macro-F1 and classification report to one text
# file, overwriting any previous run.
report_path = "/Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/rf_reports/rf_per_trait_custom_params.txt"

# Create the target directory first so open() does not raise FileNotFoundError
# when the reports folder has not been created yet.
Path(report_path).parent.mkdir(parents=True, exist_ok=True)

# Explicit encoding so the written file does not depend on the platform's
# default text encoding.
with open(report_path, "w", encoding="utf-8") as f:
    for trait, acc, f1, rep in reports:
        f.write(f"=== Trait: {trait.capitalize()} ===\n")
        f.write(f"Accuracy: {acc:.4f}\n")
        f.write(f"F1-score (macro): {f1:.4f}\n")
        f.write(rep + "\n\n")

print(f"Saved all trait-specific RF results to {report_path}")


==== Trait: Openness ====
Accuracy: 0.625
F1-score (macro): 0.4666666666666666

==== Trait: Conscientiousness ====
Accuracy: 0.625
F1-score (macro): 0.4801587301587302

==== Trait: Extraversion ====
Accuracy: 0.46875
F1-score (macro): 0.43994669332000663

==== Trait: Agreeableness ====
Accuracy: 0.375
F1-score (macro): 0.36262626262626263

==== Trait: Emotional stability ====
Accuracy: 0.53125
F1-score (macro): 0.46881091617933723
Saved all trait-specific RF results to /Users/arashalborz/Desktop/amiv_nlp_2025/classification/reports/rf_reports/rf_per_trait_custom_params.txt
