In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# === Load Training Data ===
# NOTE(review): absolute, machine-specific path; this cell only runs where the file exists.
df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/train/comb_train_liwc_embed.csv")

# === Features and Traits ===
# Model inputs are all columns whose name starts with "embed_" or "liwc_".
feature_cols = [col for col in df.columns if col.startswith(("embed_", "liwc_"))]
traits = ["Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Emotional stability"]

# === Shared hold-out split and scaling ===
# With a fixed random_state and no stratification, the shuffled row selection depends only
# on the number of rows, not on the labels. Splitting the features together with all trait
# columns once therefore selects the same rows as a separate split per trait, and the
# split and the scaler fit are not repeated for every trait.
# NOTE(review): the split is not stratified, so per-class support in the held-out rows may
# differ from the training rows; stratifying would require a separate split per trait.
X = df[feature_cols].values
X_train, X_test, labels_train, labels_test = train_test_split(
    X, df[traits], test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply the same transform to the held-out rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# === Loop over traits ===
# One independent classifier per trait, each evaluated on the same 20% hold-out rows.
for trait in traits:
    print(f"\n==== Trait: {trait.upper()} ====")

    y_train = labels_train[trait].values
    y_test = labels_test[trait].values

    clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    clf.fit(X_train_scaled, y_train)

    y_pred = clf.predict(X_test_scaled)

    # zero_division=0 reports 0.0 (without a warning) for classes that are never predicted.
    print(classification_report(y_test, y_pred, zero_division=0))


==== Trait: OPENNESS ====
              precision    recall  f1-score   support

        high       0.58      0.91      0.71       174
         low       0.40      0.08      0.14        48
      medium       0.42      0.14      0.21        92

    accuracy                           0.56       314
   macro avg       0.47      0.38      0.35       314
weighted avg       0.51      0.56      0.48       314


==== Trait: CONSCIENTIOUSNESS ====
              precision    recall  f1-score   support

        high       0.29      0.06      0.10        85
         low       0.48      0.84      0.61       146
      medium       0.34      0.17      0.23        83

    accuracy                           0.45       314
   macro avg       0.37      0.36      0.31       314
weighted avg       0.39      0.45      0.37       314


==== Trait: EXTRAVERSION ====
              precision    recall  f1-score   support

        high       0.43      0.09      0.15        65
         low       0.56      0.91  

In [67]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# === Load Data ===
# NOTE(review): absolute, machine-specific paths; this cell only runs where the files exist.
train_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/train/comb_train_liwc_embed.csv")
val_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/validation/comb_val_liwc_embed.csv")

# === Features and Traits ===
# Use the "embed_"/"liwc_" columns of the training frame that also exist in the validation
# frame, so both feature matrices have the same columns in the same order.
feature_cols = [
    col for col in train_df.columns
    if col.startswith(("embed_", "liwc_")) and col in val_df.columns
]
traits = ["Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Emotional stability"]

# === Feature matrices and scaling (identical for every trait, so computed once) ===
X_train = train_df[feature_cols].values
X_test = val_df[feature_cols].values

# Fit the scaler on the training rows only, then apply the same transform to the validation rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# === Loop over traits ===
# One independent classifier per trait, trained on the training file and scored on the validation file.
for trait in traits:
    print(f"\n==== Trait: {trait.upper()} ====")

    y_train = train_df[trait].values
    y_test = val_df[trait].values

    # Options left commented out in the original cell: the unrestricted
    # n_estimators=100 forest, min_samples_leaf=3 (prevent tiny leaves) and
    # class_weight='balanced' (address imbalance).
    clf = RandomForestClassifier(
        n_estimators=5,   # fewer trees
        max_depth=6,      # limit complexity
        random_state=42,
        n_jobs=-1,
    )
    clf.fit(X_train_scaled, y_train)

    y_pred = clf.predict(X_test_scaled)

    # zero_division=0 reports 0.0 (without a warning) for classes that are never predicted.
    print(classification_report(y_test, y_pred, zero_division=0))


==== Trait: OPENNESS ====
              precision    recall  f1-score   support

        high       0.58      0.75      0.65        20
         low       0.00      0.00      0.00         8
      medium       0.00      0.00      0.00         4

    accuracy                           0.47        32
   macro avg       0.19      0.25      0.22        32
weighted avg       0.36      0.47      0.41        32


==== Trait: CONSCIENTIOUSNESS ====
              precision    recall  f1-score   support

        high       0.00      0.00      0.00         6
         low       0.64      0.70      0.67        20
      medium       0.00      0.00      0.00         6

    accuracy                           0.44        32
   macro avg       0.21      0.23      0.22        32
weighted avg       0.40      0.44      0.42        32


==== Trait: EXTRAVERSION ====
              precision    recall  f1-score   support

        high       0.20      0.10      0.13        10
         low       0.25      0.46  

In [59]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report
from imblearn.over_sampling import RandomOverSampler

# Load data
# NOTE(review): absolute, machine-specific paths; this cell only runs where the files exist.
train_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/train/comb_train_liwc_embed.csv")
val_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/validation/comb_val_liwc_embed.csv")

# Select features
# Keep only "embed_"/"liwc_" columns that exist in BOTH frames, matching the per-trait
# train/validation cell. Without this, a column missing from the validation file raises
# a KeyError when X_val is built.
feature_cols = [
    col for col in train_df.columns
    if col.startswith(("embed_", "liwc_")) and col in val_df.columns
]
X_train = train_df[feature_cols].values
X_val = val_df[feature_cols].values

# Labels
y_train = train_df["Agreeableness"].values
y_val = val_df["Agreeableness"].values

# Scale features: fit on the training rows only, then apply the same transform to validation.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# === Rebalance training set with random oversampling ===
# Only the training rows are resampled; the validation rows are scored as-is.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train_scaled, y_train)

# === Train classifier ===
clf = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)
clf.fit(X_resampled, y_resampled)

# === Evaluate ===
y_pred = clf.predict(X_val_scaled)

print("\n✅ Classification report for Agreeableness (Random Oversampling):")
print(f"Accuracy: {accuracy_score(y_val, y_pred):.4f}")
# zero_division=0 keeps the macro F1 consistent with the report below: undefined
# per-class scores count as 0.0 and no warning is emitted.
print(f"F1 (macro): {f1_score(y_val, y_pred, average='macro', zero_division=0):.4f}")
print(classification_report(y_val, y_pred, zero_division=0))




✅ Classification report for Agreeableness (Random Oversampling):
Accuracy: 0.3125
F1 (macro): 0.2191
              precision    recall  f1-score   support

        high       1.00      0.21      0.34        24
         low       0.19      1.00      0.31         5
      medium       0.00      0.00      0.00         3

    accuracy                           0.31        32
   macro avg       0.40      0.40      0.22        32
weighted avg       0.78      0.31      0.31        32



In [36]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Load data
# NOTE(review): absolute, machine-specific paths; this cell only runs where the files exist.
train_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/train/comb_train_liwc_embed.csv")
val_df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/validation/comb_val_liwc_embed.csv")

# Select features
# Keep only "embed_"/"liwc_" columns that exist in BOTH frames, matching the per-trait
# train/validation cell. Without this, a column missing from the validation file raises
# a KeyError when X_val is built.
feature_cols = [
    col for col in train_df.columns
    if col.startswith(("embed_", "liwc_")) and col in val_df.columns
]
X_train = train_df[feature_cols].values
X_val = val_df[feature_cols].values

# Labels for Agreeableness
y_train = train_df["Agreeableness"].values
y_val = val_df["Agreeableness"].values

# Scale features: fit on the training rows only, then apply the same transform to validation.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Train classifier (no resampling)
# NOTE(review): only 2 trees are used here, versus 50 in the oversampling cell, so the
# two Agreeableness reports differ in more than the resampling step.
clf = RandomForestClassifier(n_estimators=2, random_state=42, n_jobs=-1)
clf.fit(X_train_scaled, y_train)

# Predict and evaluate
y_pred = clf.predict(X_val_scaled)

print("\n✅ Classification report for Agreeableness (No Resampling):")
print(f"Accuracy: {accuracy_score(y_val, y_pred):.4f}")
# zero_division=0 keeps the macro F1 consistent with the report below: undefined
# per-class scores count as 0.0 and no warning is emitted.
print(f"F1 (macro): {f1_score(y_val, y_pred, average='macro', zero_division=0):.4f}")
print(classification_report(y_val, y_pred, zero_division=0))


✅ Classification report for Agreeableness (No Resampling):
Accuracy: 0.5000
F1 (macro): 0.2924
              precision    recall  f1-score   support

        high       0.78      0.58      0.67        24
         low       0.14      0.40      0.21         5
      medium       0.00      0.00      0.00         3

    accuracy                           0.50        32
   macro avg       0.31      0.33      0.29        32
weighted avg       0.61      0.50      0.53        32



In [17]:
# Show the label distribution of every trait column in the training frame.
# All five value_counts() Series go to a single print call, so they are written
# one after another separated by a single space.
# Relies on `train_df` having been loaded by an earlier cell.
print(*(
    train_df[trait_name].value_counts()
    for trait_name in (
        "Openness",
        "Conscientiousness",
        "Extraversion",
        "Agreeableness",
        "Emotional stability",
    )
))

Openness
high      837
medium    445
low       286
Name: count, dtype: int64 Conscientiousness
low       749
medium    443
high      376
Name: count, dtype: int64 Extraversion
low       823
medium    408
high      337
Name: count, dtype: int64 Agreeableness
low       716
high      433
medium    419
Name: count, dtype: int64 Emotional stability
low       598
high      569
medium    401
Name: count, dtype: int64


In [18]:
# Show the label distribution of every trait column in the validation frame.
# All five value_counts() Series go to a single print call, so they are written
# one after another separated by a single space.
# Relies on `val_df` having been loaded by an earlier cell.
print(*(
    val_df[trait_name].value_counts()
    for trait_name in (
        "Openness",
        "Conscientiousness",
        "Extraversion",
        "Agreeableness",
        "Emotional stability",
    )
))

Openness
high      20
low        8
medium     4
Name: count, dtype: int64 Conscientiousness
low       20
high       6
medium     6
Name: count, dtype: int64 Extraversion
low       13
high      10
medium     9
Name: count, dtype: int64 Agreeableness
high      24
low        5
medium     3
Name: count, dtype: int64 Emotional stability
medium    15
low       13
high       4
Name: count, dtype: int64
