In [None]:
'''
Script that converts each numeric trait score into a Low / Medium / High label.


'''


import pandas as pd

# CSV that the trait columns are read from.
input_path = "/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/combined_author_embeddings_with_liwc.csv"

# Read the whole table into a DataFrame.
df = pd.read_csv(filepath_or_buffer=input_path)

def score_to_label(score):
    """Map a numeric score to a categorical label.

    Thresholds (inclusive upper bounds):
        score <= 32        -> "Low"
        32 < score <= 66   -> "Medium"
        score > 66         -> "High"

    Parameters
    ----------
    score : number
        The score to bin. Missing values (NaN / None / pd.NA) are accepted.

    Returns
    -------
    str or the original missing value
        One of "Low", "Medium", "High"; a missing score is returned unchanged.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # score would fall through both branches and be mislabeled as "High".
    if pd.isna(score):
        return score
    if score <= 32:
        return "Low"
    if score <= 66:
        return "Medium"
    return "High"

# Columns whose numeric scores are replaced by labels.
traits = ["openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism"]

# Multiply each column by 100 (scale back to 0–100) and overwrite it with the
# label that score_to_label assigns to every value.
for trait in traits:
    df[trait] = df[trait].mul(100).apply(score_to_label)

# Write the relabeled table next to the input, without the index column.
output_path = "/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/combined_author_embeddings_with_liwc_labeled.csv"
df.to_csv(output_path, index=False)
print(f"Saved labeled trait data to {output_path}")

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Train and evaluate a linear SVM that predicts the openness label from the
# embedding and LIWC feature columns, then save the classification report.
df = pd.read_csv("/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/combined_author_embeddings_with_liwc_labeled.csv")

# Features: every column whose name starts with "embed_" or "liwc_".
X = df[[col for col in df.columns if col.startswith(("embed_", "liwc_"))]].values
y = df["openness"].values  # label: Low, Medium, High

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply the same transform to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

svm = SVC(kernel="linear", C=1.0, random_state=42)
svm.fit(X_train_scaled, y_train)

y_pred = svm.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
# zero_division=0 matches the classification_report call below: a class that is
# never predicted scores 0 (the same value as the default) without a warning.
f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)
report = classification_report(y_test, y_pred, zero_division=0)

print("SVM Openness - Accuracy:", accuracy)
print("SVM Openness - F1-score (macro):", f1)
print("Classification Report:\n", report)

# Persist the textual report in the current working directory.
with open("svm_classification_report_openness.txt", "w") as f:
    f.write("SVM Classification Report for Openness:\n")
    f.write(report)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Train and evaluate one linear SVM per trait on the embedding + LIWC features,
# then write every trait's metrics and report to a single text file.
# NOTE(review): this path is relative, while the labeling cell writes the file
# to an absolute directory — confirm the notebook runs from the folder that
# holds the CSV.
df = pd.read_csv("combined_author_embeddings_with_liwc_labeled.csv")

# Features: every column whose name starts with "embed_" or "liwc_".
X = df[[col for col in df.columns if col.startswith(("embed_", "liwc_"))]].values

traits = ["openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism"]

# One (trait, accuracy, macro-F1, report text) tuple per trait.
reports = []

for trait in traits:
    print(f"\n==== Trait: {trait.capitalize()} ====")

    y = df[trait].values

    # Hold out 20% of the rows; the fixed seed keeps the split repeatable.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the scaler on the training rows only, then reuse it for the test rows.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    svm = SVC(kernel="linear", C=1.0, random_state=42)
    svm.fit(X_train_scaled, y_train)

    y_pred = svm.predict(X_test_scaled)

    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 matches the classification_report call below: a class that
    # is never predicted scores 0 (the same value as the default) without a warning.
    f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)
    report = classification_report(y_test, y_pred, zero_division=0)

    print("Accuracy:", accuracy)
    print("F1-score (macro):", f1)
    print("Classification Report:\n", report)

    # Save each report
    reports.append((trait, accuracy, f1, report))

# Save all reports to file
with open("svm_classification_report_all_traits.txt", "w") as f:
    for trait, acc, f1, rep in reports:
        f.write(f"=== Trait: {trait.capitalize()} ===\n")
        f.write(f"Accuracy: {acc:.4f}\n")
        f.write(f"F1-score (macro): {f1:.4f}\n")
        f.write(rep + "\n\n")

In [None]:
'''

RANDOM FOREST

'''


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Train and evaluate a random forest that predicts the openness label from the
# embedding and LIWC feature columns, then save the classification report.
# NOTE(review): this path is relative, while the labeling cell writes the file
# to an absolute directory — confirm the notebook runs from the folder that
# holds the CSV.
df = pd.read_csv("combined_author_embeddings_with_liwc_labeled.csv")

# Features: every column whose name starts with "embed_" or "liwc_".
X = df[[col for col in df.columns if col.startswith(("embed_", "liwc_"))]].values

y = df["openness"].values

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply the same transform to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 100 trees, seeded for repeatability; n_jobs=-1 uses all available cores.
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train_scaled, y_train)

y_pred = rf.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
# zero_division=0 matches the classification_report call below: a class that is
# never predicted scores 0 (the same value as the default) without a warning.
f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)
report = classification_report(y_test, y_pred, zero_division=0)

print("Openness - Accuracy:", accuracy)
print("Openness - F1-score (macro):", f1)
print("Classification Report:\n", report)

# Persist the textual report in the current working directory.
with open("random_forest_openness_report.txt", "w") as f:
    f.write("Classification Report - Random Forest (Openness):\n")
    f.write(report)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Train and evaluate one random forest per trait on the embedding + LIWC
# features, then write every trait's metrics and report to a single text file.
# NOTE(review): this path is relative, while the labeling cell writes the file
# to an absolute directory — confirm the notebook runs from the folder that
# holds the CSV.
df = pd.read_csv("combined_author_embeddings_with_liwc_labeled.csv")

# Features: every column whose name starts with "embed_" or "liwc_".
X = df[[col for col in df.columns if col.startswith(("embed_", "liwc_"))]].values

traits = ["openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism"]

# Text fragments for the combined report; joined once after the loop.
report_sections = []

for trait in traits:
    print(f"\n========== Predicting {trait.capitalize()} ==========")

    y = df[trait].values

    # Split the raw features first; the fixed seed keeps the split repeatable.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the scaler on the training rows only. Fitting it on every row before
    # the split would let test-set statistics leak into the training features.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    # zero_division=0 matches the classification_report call below: a class that
    # is never predicted scores 0 (the same value as the default) without a warning.
    f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)
    report = classification_report(y_test, y_pred, zero_division=0)

    print(f"Accuracy: {acc:.4f}")
    print(f"F1-score (macro): {f1:.4f}")
    print(report)

    report_sections.append(f"\n\n=== {trait.upper()} ===\n")
    report_sections.append(f"Accuracy: {acc:.4f}\nF1-score (macro): {f1:.4f}\n")
    report_sections.append(report)

all_reports = "".join(report_sections)

with open("random_forest_all_traits_report.txt", "w") as f:
    f.write("Classification Reports - Random Forest (All Traits)\n")
    f.write(all_reports)