In [None]:
import json

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Read the JSON export; the code below expects a top-level "authors" list.
with open("data/filtered_pandora_neuro.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# One record per author: the id, the comment list, and every key of
# "labels" (the personality trait scores) flattened into its own column.
rows = [
    {
        "author_id": author["id"],
        "comments": author["comments"],
        **author["labels"],
    }
    for author in data["authors"]
]

df = pd.DataFrame(rows)
# Collapse each author's list of comments into a single space-separated string.
df["comments"] = df["comments"].apply(" ".join)
df

In [None]:
trait_cols = ["Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Emotional stability"]
text_col = "comments"

# Per-row text length: whitespace-delimited word count and raw character count.
# str() guards against non-string cells (e.g. NaN), which are measured as their repr.
df["comment_length_words"] = df[text_col].apply(lambda text: len(str(text).split()))
df["comment_length_char"] = df[text_col].apply(lambda text: len(str(text)))

print("\n🔹 Word Length Stats:")
print(df["comment_length_words"].describe())
print("\n🔹 Character Length Stats:")
print(df["comment_length_char"].describe())

# Explicit figure/axes instead of the implicit pyplot "current axes", so each
# histogram is bound to its panel regardless of what earlier cells plotted.
# `plt` and `sns` must be imported before this cell runs on a fresh kernel.
fig, (ax_words, ax_chars) = plt.subplots(1, 2, figsize=(12, 5))

sns.histplot(df["comment_length_words"], bins=50, kde=True, ax=ax_words)
ax_words.set_title("Comment Length Distribution (words)")

sns.histplot(df["comment_length_char"], bins=50, kde=True, color="orange", ax=ax_chars)
ax_chars.set_title("Comment Length Distribution (characters)")

fig.tight_layout()
plt.show()

# Class distribution per trait

# Apply conversion
def convert(score, low_max=0.32, medium_max=0.66):
    """Bucket a numeric trait score into "low", "medium" or "high".

    Args:
        score: Numeric score to categorise. None / NaN is treated as missing.
        low_max: Inclusive upper bound of the "low" bucket.
        medium_max: Inclusive upper bound of the "medium" bucket.

    Returns:
        "low" if score <= low_max, "medium" if score <= medium_max,
        "high" otherwise, or None when the score is missing.

    NOTE(review): the default thresholds presume scores on a 0-1 scale;
    confirm against the label data.
    """
    # NaN fails both `<=` comparisons below, so without this guard a missing
    # score would silently be counted as "high" (and None would raise TypeError).
    if pd.isna(score):
        return None
    if score <= low_max:
        return "low"
    if score <= medium_max:
        return "medium"
    return "high"

# Derive a "<trait>_cat" column for every trait column that is present and
# numeric. is_numeric_dtype also accepts float32/int32 and nullable numeric
# dtypes, which a literal comparison against 'float64'/'int64' would skip.
for trait in trait_cols:
    if trait in df.columns and pd.api.types.is_numeric_dtype(df[trait]):
        df[trait + "_cat"] = df[trait].apply(convert)

# Fixed bar order so every trait's plot reads low -> medium -> high instead of
# whatever order the categories first appear in the data.
category_order = ["low", "medium", "high"]

print("\n🔹 Class Distributions:")
for trait in trait_cols:
    cat_trait = trait + "_cat"
    if cat_trait not in df.columns:
        # Trait was missing or non-numeric above; nothing to report.
        continue
    print(f"\n{cat_trait} class counts:")
    print(df[cat_trait].value_counts())
    fig, ax = plt.subplots(figsize=(4, 3))
    sns.countplot(x=cat_trait, data=df, palette="pastel", order=category_order, ax=ax)
    ax.set_title(f"Class Distribution: {trait}")
    ax.set_xlabel("Category")
    ax.set_ylabel("Count")
    plt.show()


In [None]:
import matplotlib.pyplot as plt

# Order authors from most to fewest words so the bars descend left to right.
df_sorted = df.sort_values("comment_length_words", ascending=False)

# Label bars with the author id when the column exists; otherwise fall back to
# the row index as a stand-in identifier.
if "author_id" in df_sorted.columns:
    author_labels = df_sorted["author_id"].astype(str)
else:
    author_labels = df_sorted.index.astype(str)

# Draw bars at sequential positions 0..n-1. Using the DataFrame index as x
# (with an integer index) puts each bar back at its original row label, which
# discards the sort order and shows row numbers under the "Author ID" axis.
positions = list(range(len(df_sorted)))

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(positions, df_sorted["comment_length_words"], color="skyblue")
ax.set_xticks(positions)
ax.set_xticklabels(author_labels, rotation=90)
ax.set_xlabel("Author ID")
ax.set_ylabel("Total Word Count in Comments")
ax.set_title("Total Number of Words per Author")
fig.tight_layout()
plt.show()


In [None]:
import json
import pandas as pd


val = pd.read_csv("data/val_data.csv")


def _join_answers(answers):
    """Concatenate one row's answers into a single space-separated string."""
    return " ".join(answers.astype(str))


# Merge the three free-text answer columns into one "comments" column, row by row.
val["comments"] = val[["Q1", "Q2", "Q3"]].apply(_join_answers, axis=1)
val

In [None]:
# NOTE: `df` is an alias of `val`, not a copy, so the columns added below also
# appear on `val`.
df = val
trait_cols = ["Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Emotional stability"]
text_col = "comments"

# Per-row text length: whitespace-delimited word count and raw character count.
df["comment_length_words"] = df[text_col].apply(lambda text: len(str(text).split()))
df["comment_length_char"] = df[text_col].apply(lambda text: len(str(text)))

print("\n🔹 Word Length Stats:")
print(df["comment_length_words"].describe())
print("\n🔹 Character Length Stats:")
print(df["comment_length_char"].describe())

# Explicit figure/axes instead of the implicit pyplot "current axes", so each
# histogram is bound to its panel regardless of what earlier cells plotted.
# `plt` and `sns` must be imported before this cell runs on a fresh kernel.
fig, (ax_words, ax_chars) = plt.subplots(1, 2, figsize=(12, 5))

sns.histplot(df["comment_length_words"], bins=50, kde=True, ax=ax_words)
ax_words.set_title("Comment Length Distribution (words)")

sns.histplot(df["comment_length_char"], bins=50, kde=True, color="orange", ax=ax_chars)
ax_chars.set_title("Comment Length Distribution (characters)")

fig.tight_layout()
plt.show()

# Class distribution per trait

print("\n🔹 Class Distributions:")
for trait in trait_cols:
    # Skip traits that this dataset does not provide.
    if trait not in df.columns:
        continue

    # Text summary first, then a small count plot of the same column.
    print(f"\n{trait} class counts:")
    print(df[trait].value_counts())

    fig, ax = plt.subplots(figsize=(4, 3))
    sns.countplot(x=trait, data=df, palette="pastel", ax=ax)
    ax.set_title(f"Class Distribution: {trait}")
    ax.set_xlabel("Category")
    ax.set_ylabel("Count")
    plt.show()


In [None]:
import matplotlib.pyplot as plt

def _word_count(comments):
    """Count whitespace-separated words in a string or an iterable of strings."""
    # A plain string must be split directly: iterating it yields single
    # characters, which would count non-whitespace characters instead of words.
    if isinstance(comments, str):
        return len(comments.split())
    return sum(len(str(comment).split()) for comment in comments)


# Total word count per row of `df`.
df["total_word_count"] = df["comments"].apply(_word_count)

# Order rows from most to fewest words so the bars descend left to right.
df_sorted = df.sort_values("total_word_count", ascending=False)

# Label bars with the author id when the column exists; otherwise fall back to
# the row index as a stand-in identifier.
if "author_id" in df_sorted.columns:
    author_labels = df_sorted["author_id"].astype(str)
else:
    author_labels = df_sorted.index.astype(str)

# Draw bars at sequential positions 0..n-1. Using the DataFrame index as x
# (with an integer index) puts each bar back at its original row label, which
# discards the sort order.
positions = list(range(len(df_sorted)))

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(positions, df_sorted["total_word_count"], color="skyblue")
ax.set_xticks(positions)
ax.set_xticklabels(author_labels, rotation=90)
ax.set_xlabel("Author ID")
ax.set_ylabel("Total Word Count in Comments")
ax.set_title("Total Number of Words per Author")
fig.tight_layout()
plt.show()
