In [20]:
import json
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# ========== LOAD DATA ==========
with open("/Users/arashalborz/Desktop/amiv_nlp_2025/Data/filtered_pandora.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# ========== PREPARE TEXTS AND LABELS ==========

texts = []     # concatenated comments
labels = []    # personality scores
author_ids = [] # optional, in case you want to keep IDs

for author in data["authors"]:
    author_id = author["id"]
    author_ids.append(author_id)
    
    comments = author.get("comments", [])
    # Concatenate all comments into one string
    full_text = " ".join(comments)
    texts.append(full_text)
    
    # Get Big Five trait scores
    trait_scores = [
        author["labels"]["openness"],
        author["labels"]["conscientiousness"],
        author["labels"]["extraversion"],
        author["labels"]["agreeableness"],
        author["labels"]["neuroticism"]
    ]
    labels.append(trait_scores)

labels = np.array(labels)  # shape: (num_authors, 5)

print("Number of authors:", len(texts))
print("Labels shape:", labels.shape)

# ========== TF-IDF VECTORIZE ==========

# Initialize TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # You can change max_features as needed

# Fit and transform the concatenated comments
X = vectorizer.fit_transform(texts).toarray()  # shape: (num_authors, max_features)

print("TF-IDF features shape:", X.shape)

# ========== SAVE DATA FOR TRAINING (optional) ==========

df = pd.DataFrame(X)
df["openness"] = labels[:, 0]
df["conscientiousness"] = labels[:, 1]
df["extraversion"] = labels[:, 2]
df["agreeableness"] = labels[:, 3]
df["neuroticism"] = labels[:, 4]
df["author_id"] = author_ids

df.to_csv("tfidf_author_data.csv", index=False)
print("Saved TF-IDF data to tfidf_author_data.csv ✅")

Number of authors: 1568
Labels shape: (1568, 5)
TF-IDF features shape: (1568, 5000)
Saved TF-IDF data to tfidf_author_data.csv ✅


In [5]:
import pandas as pd

df = pd.read_csv('tfidf_author_data.csv')
df.columns


Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '996', '997', '998', '999', 'openness', 'conscientiousness',
       'extraversion', 'agreeableness', 'neuroticism', 'author_id'],
      dtype='object', length=1006)

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score, classification_report

# Load CSV
df = pd.read_csv("tfidf_author_data.csv")

# Prepare X and Y
X = df[[str(i) for i in range(1000)]].values
Y = df[["openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism"]].values

# Split into train/test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Train Ridge Regression
model = MultiOutputRegressor(Ridge())
model.fit(X_train, Y_train)

# Predict
Y_pred = model.predict(X_test)

# Evaluate MSE and R²
mse = mean_squared_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)

print("Ridge MSE:", mse)
print("Ridge R²:", r2)

# ========== Binning Function ==========
def score_to_label(score):
    if score <= 32:
        return "Low"
    elif score <= 66:
        return "Medium"
    else:
        return "High"

def convert_scores_to_labels(array):
    labels = []
    for row in array:
        row_labels = []
        for score in row:
            scaled_score = score * 100  # Scale 0-1 to 0-100
            row_labels.append(score_to_label(scaled_score))
        labels.append(row_labels)
    return labels

# Convert predictions and ground truth to labels
pred_labels = convert_scores_to_labels(Y_pred)
true_labels = convert_scores_to_labels(Y_test)

# Flatten for F1-score and Accuracy calculation
pred_labels_flat = [label for row in pred_labels for label in row]
true_labels_flat = [label for row in true_labels for label in row]

# Accuracy
accuracy = accuracy_score(true_labels_flat, pred_labels_flat)
print("Ridge Accuracy:", accuracy)

# F1-score
f1 = f1_score(true_labels_flat, pred_labels_flat, average="macro")
print("Ridge F1-score (macro):", f1)

# Classification report
report = classification_report(true_labels_flat, pred_labels_flat)
print("Classification Report:")
print(report)

# Save report
with open("ridge_classification_report.txt", "w") as f:
    f.write("Classification Report:\n")
    f.write(report)

print("Classification report saved to ridge_classification_report.txt")

Ridge MSE: 0.091549820669119
Ridge R²: 0.01261319666344458
Ridge Accuracy: 0.2878980891719745
Ridge F1-score (macro): 0.18802014867648845
Classification Report:
              precision    recall  f1-score   support

        High       0.68      0.04      0.07       531
         Low       0.70      0.03      0.07       615
      Medium       0.27      0.97      0.42       424

    accuracy                           0.29      1570
   macro avg       0.55      0.35      0.19      1570
weighted avg       0.58      0.29      0.17      1570

Classification report saved to ridge_classification_report.txt


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.multioutput import MultiOutputRegressor

# Load CSV
df = pd.read_csv("tfidf_author_data.csv")

# Prepare X and Y
X = df[[str(i) for i in range(1000)]].values
Y = df[["openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism"]].values

# Split into train/test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Train RandomForest Regression for each trait
model = MultiOutputRegressor(RandomForestRegressor(
    n_estimators=100,  # number of trees
    max_depth=None,    # you can limit this later (e.g., max_depth=10) to avoid overfitting
    random_state=42,
    n_jobs=-1          # use all CPU cores for speed
))

model.fit(X_train, Y_train)

# Predict
Y_pred = model.predict(X_test)

# Evaluate
mse = mean_squared_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)

print("MSE:", mse)
print("R²:", r2)

MSE: 0.09047947056570406
R²: 0.023857952405019645


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor

# Load CSV
df = pd.read_csv("tfidf_author_data.csv")

# Prepare X and Y
X = df[[str(i) for i in range(1000)]].values
Y = df[["openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism"]].values

# Split into train/test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Train XGBoost Regression for each trait
model = MultiOutputRegressor(XGBRegressor(
    n_estimators=100,   # number of trees
    learning_rate=0.1,  # step size shrinkage
    max_depth=6,        # depth of each tree
    subsample=0.8,      # fraction of samples used per tree
    colsample_bytree=0.8, # fraction of features per tree
    random_state=42,
    n_jobs=-1
))

model.fit(X_train, Y_train)

# Predict
Y_pred = model.predict(X_test)

# Evaluate
mse = mean_squared_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)

print("MSE:", mse)
print("R²:", r2)

MSE: 0.09168917339944015
R²: 0.01074274487711806


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score, classification_report

# Load CSV
df = pd.read_csv("tfidf_author_data.csv")

# Prepare X and Y
X = df[[str(i) for i in range(1000)]].values
Y = df[["openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism"]].values

# Split into train/test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Train RandomForest
model = MultiOutputRegressor(RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
))

model.fit(X_train, Y_train)

# Predict
Y_pred = model.predict(X_test)

# Evaluate MSE and R²
mse = mean_squared_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)

print("RandomForest MSE:", mse)
print("RandomForest R²:", r2)

# ========== Binning Function ==========
def score_to_label(score):
    if score <= 32:
        return "Low"
    elif score <= 66:
        return "Medium"
    else:
        return "High"

def convert_scores_to_labels(array):
    labels = []
    for row in array:
        row_labels = []
        for score in row:
            scaled_score = score * 100  # Scale 0-1 to 0-100
            row_labels.append(score_to_label(scaled_score))
        labels.append(row_labels)
    return labels

# Convert predictions and ground truth to labels
pred_labels = convert_scores_to_labels(Y_pred)
true_labels = convert_scores_to_labels(Y_test)

# Flatten for F1-score and Accuracy calculation
pred_labels_flat = [label for row in pred_labels for label in row]
true_labels_flat = [label for row in true_labels for label in row]

# Accuracy
accuracy = accuracy_score(true_labels_flat, pred_labels_flat)
print("RandomForest Accuracy:", accuracy)

# F1-score
f1 = f1_score(true_labels_flat, pred_labels_flat, average="macro")
print("RandomForest F1-score (macro):", f1)

# Classification report
report = classification_report(true_labels_flat, pred_labels_flat)
print("Classification Report:")
print(report)

# Save report
with open("randomforest_classification_report.txt", "w") as f:
    f.write("Classification Report:\n")
    f.write(report)

print("Classification report saved to randomforest_classification_report.txt ✅")

RandomForest MSE: 0.09118591644108279
RandomForest R²: 0.014977398150012066
RandomForest Accuracy: 0.31337579617834393
RandomForest F1-score (macro): 0.2560789869468249
Classification Report:
              precision    recall  f1-score   support

        High       0.67      0.11      0.19       531
         Low       0.52      0.09      0.16       615
      Medium       0.27      0.88      0.42       424

    accuracy                           0.31      1570
   macro avg       0.49      0.36      0.26      1570
weighted avg       0.51      0.31      0.24      1570

Classification report saved to randomforest_classification_report.txt ✅


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import classification_report

# Load CSV
df = pd.read_csv("tfidf_author_data.csv")

# Prepare X and Y
X = df[[str(i) for i in range(1000)]].values
Y = df[["openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism"]].values

# Split into train/test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# --- Train Ridge ---
ridge = MultiOutputRegressor(Ridge())
ridge.fit(X_train, Y_train)
ridge_pred = ridge.predict(X_test)

# --- Train RandomForest ---
rf = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))
rf.fit(X_train, Y_train)
rf_pred = rf.predict(X_test)

# --- Train XGBoost ---
xgb = MultiOutputRegressor(XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, subsample=0.8, colsample_bytree=0.8, random_state=42, n_jobs=-1))
xgb.fit(X_train, Y_train)
xgb_pred = xgb.predict(X_test)

# --- Combine predictions (Simple Average) ---
final_pred = (ridge_pred + rf_pred + xgb_pred) / 3

# --- Evaluate ---
mse = mean_squared_error(Y_test, final_pred)
r2 = r2_score(Y_test, final_pred)

print("Ensemble MSE:", mse)
print("Ensemble R²:", r2)

Ensemble MSE: 0.08838570953230483
Ensemble R²: 0.04638519737783533


In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score

# Load CSV
df = pd.read_csv("tfidf_author_data.csv")

# Prepare X and Y
X = df[[str(i) for i in range(1000)]].values
Y = df[["openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism"]].values

# Split into train/test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# --- Train Ridge ---
ridge = MultiOutputRegressor(Ridge())
ridge.fit(X_train, Y_train)
ridge_pred = ridge.predict(X_test)

# --- Train RandomForest ---
rf = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))
rf.fit(X_train, Y_train)
rf_pred = rf.predict(X_test)

# --- Train XGBoost ---
xgb = MultiOutputRegressor(XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, subsample=0.8, colsample_bytree=0.8, random_state=42, n_jobs=-1))
xgb.fit(X_train, Y_train)
xgb_pred = xgb.predict(X_test)

# --- Combine predictions (Simple Average) ---
final_pred = (ridge_pred + rf_pred + xgb_pred) / 3

# --- Evaluate MSE and R² ---
mse = mean_squared_error(Y_test, final_pred)
r2 = r2_score(Y_test, final_pred)

print("Ensemble MSE:", mse)
print("Ensemble R²:", r2)

# ========== Binning Function ==========
def score_to_label(score):
    if score <= 32:
        return "Low"
    elif score <= 66:
        return "Medium"
    else:
        return "High"

# Convert continuous predictions (0-1) to 0-100, then to labels
def convert_scores_to_labels(array):
    labels = []
    for row in array:
        row_labels = []
        for score in row:
            scaled_score = score * 100  # Scale 0-1 to 0-100
            row_labels.append(score_to_label(scaled_score))
        labels.append(row_labels)
    return labels

# Convert predictions and ground truth to labels
pred_labels = convert_scores_to_labels(final_pred)
true_labels = convert_scores_to_labels(Y_test)

# Flatten for F1-score and Accuracy calculation (optional: micro, macro, weighted)
from sklearn.metrics import classification_report

pred_labels_flat = [label for row in pred_labels for label in row]
true_labels_flat = [label for row in true_labels for label in row]

# Accuracy
accuracy = accuracy_score(true_labels_flat, pred_labels_flat)
print("Ensemble Accuracy:", accuracy)

# F1-score
f1 = f1_score(true_labels_flat, pred_labels_flat, average="macro")
print("Ensemble F1-score (macro):", f1)

report = classification_report(true_labels_flat, pred_labels_flat)
print("Classification Report:")
print(report)

# Save classification report to file
with open("classification_report.txt", "w") as f:
    f.write("Classification Report:\n")
    f.write(report)

print("Classification report saved to classification_report.txt ✅")

Ensemble MSE: 0.09030300406094408
Ensemble R²: 0.024889587856140417
Ensemble Accuracy: 0.30636942675159234
Ensemble F1-score (macro): 0.2380267360650983
Classification Report:
              precision    recall  f1-score   support

        High       0.68      0.09      0.16       531
         Low       0.56      0.07      0.13       615
      Medium       0.27      0.91      0.42       424

    accuracy                           0.31      1570
   macro avg       0.50      0.36      0.24      1570
weighted avg       0.52      0.31      0.22      1570

Classification report saved to classification_report.txt ✅
