In [None]:
"""
Extract LIWC features from filtered_pandora_all_labeled.json
Each author → concatenate all comments → get normalized LIWC category vector
Also saves Big Five traits and author ID
"""

import os
import json
import numpy as np
import pandas as pd
import re
from collections import defaultdict, Counter

def load_liwc_dic(dic_path):
    category_map = defaultdict(list)
    with open(dic_path, 'r', encoding='utf-8') as f:
        for line in f:
            if ':' not in line:
                continue
            parts = line.strip().split()
            category = parts[0].rstrip(':')
            words = parts[1:]
            category_map[category] = words
    return category_map


def liwc_embedding(text, category_map):
    tokens = re.findall(r"\b\w+\b", text.lower()) # tokens = text.lower().split()
    counts = Counter()
    for category, words in category_map.items():
        for token in tokens:
            if token in words:
                counts[category] += 1
    sorted_categories = sorted(category_map.keys())
    vec = np.array([counts.get(cat, 0) for cat in sorted_categories])
    return vec

liwc_dic_path = "output.dic" 
input_filename = "filtered_pandora_all_labeled.json"
input_folder = os.path.dirname("/Users/arashalborz/Desktop/Data/filtered_pandora_all_labeled.json") 
input_path = os.path.join(input_folder, input_filename)
save_path = os.path.join(input_folder, "liwc_author_data.csv")

category_map = load_liwc_dic(liwc_dic_path)
sorted_categories = sorted(category_map.keys())

with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

rows = []

for author in data["authors"]:
    author_id = author["id"]
    comments = author.get("comments", [])
    full_text = " ".join(comments)

    vec = liwc_embedding(full_text, category_map)
    if np.sum(vec) > 0:
        vec = vec / np.sum(vec)  # normalize vector to sum to 1

    traits = author["labels"]

    row = {
        "id": author_id,
        "Openness": traits["Openness"],
        "Conscientiousness": traits["Conscientiousness"],
        "Extraversion": traits["Extraversion"],
        "Agreeableness": traits["Agreeableness"],
        "Emotional stability": traits["Emotional stability"]
    }

    for i, cat in enumerate(sorted_categories):
        row[f"liwc_{cat}"] = vec[i]

    rows.append(row)

df = pd.DataFrame(rows)
df.to_csv(save_path, index=False)
print(f"LIWC features saved to {save_path}")

LIWC features saved to /Users/arashalborz/Desktop/Data/liwc_author_data.csv


In [45]:
"""
Extract LIWC features from val_data.csv
Each row → concatenate Q1, Q2, Q3 → get normalized LIWC vector + labels
"""

import os
import numpy as np
import pandas as pd
from collections import defaultdict, Counter

def load_liwc_dic(dic_path):
    category_map = defaultdict(list)
    with open(dic_path, 'r', encoding='utf-8') as f:
        for line in f:
            if ':' not in line:
                continue
            parts = line.strip().split()
            category = parts[0].rstrip(':')
            words = parts[1:]
            category_map[category] = words
    return category_map

def liwc_embedding(text, category_map):
    tokens = re.findall(r"\b\w+\b", text.lower()) # tokens = text.lower().split()
    counts = Counter()
    for category, words in category_map.items():
        for token in tokens:
            if token in words:
                counts[category] += 1
    sorted_categories = sorted(category_map.keys())
    vec = np.array([counts.get(cat, 0) for cat in sorted_categories])
    return vec

# === Paths ===
liwc_dic_path = "output.dic"
val_csv_path = "/Users/arashalborz/Desktop/Data/val_data.csv"
save_path = "/Users/arashalborz/Desktop/Data/liwc_val_data.csv"

# === Load dictionary and CSV ===
category_map = load_liwc_dic(liwc_dic_path)
sorted_categories = sorted(category_map.keys())
val_df = pd.read_csv(val_csv_path)

# === Personality trait columns ===
trait_columns = ["Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Emotional stability"]

rows = []

for idx, row in val_df.iterrows():
    author_id = str(row["id"])
    text = " ".join(str(row[q]) for q in ['Q1', 'Q2', 'Q3'] if pd.notna(row[q]))

    vec = liwc_embedding(text, category_map)
    if np.sum(vec) > 0:
        vec = vec / np.sum(vec)

    row_data = {
        "id": author_id,
        **{trait: row[trait] for trait in trait_columns}
    }

    for i, cat in enumerate(sorted_categories):
        row_data[f"liwc_{cat}"] = vec[i]

    rows.append(row_data)

# === Save to CSV ===
df = pd.DataFrame(rows)
df.to_csv(save_path, index=False)
print(f"✅ LIWC features + labels saved to {save_path}")

✅ LIWC features + labels saved to /Users/arashalborz/Desktop/Data/liwc_val_data.csv


In [None]:
import pandas as pd

# Path to your file
val_path = "/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/val_embeddings_with_liwc_labeled.csv"

# Step 1: Load with correct separator
df = pd.read_csv(val_path, sep=";")

# Step 2: Replace comma with dot in all string cells
df = df.applymap(lambda x: str(x).replace(",", ".") if isinstance(x, str) else x)

# Step 3: Convert numeric columns to float where possible
for col in df.columns:
    try:
        df[col] = pd.to_numeric(df[col])
    except ValueError:
        pass  # skip non-numeric columns

df.to_csv(val_path, index=False)

✅ Cleaned and saved file: /Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/val_embeddings_with_liwc_labeled.csv


  df = df.applymap(lambda x: str(x).replace(",", ".") if isinstance(x, str) else x)


In [30]:
import pandas as pd

train_path = "/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/combined_author_embeddings_with_liwc_labeled.csv"
val_path = "/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/val_embeddings_with_liwc_labeled.csv"

train_df = pd.read_csv(train_path)
val_df = pd.read_csv(val_path)

In [31]:
train_cols = set(train_df.columns)
val_cols = set(val_df.columns)

print("In train but not in val:", train_cols - val_cols)
print("In val but not in train:", val_cols - train_cols)

In train but not in val: set()
In val but not in train: {'humility', 'full_text', 'q3', 'q2', 'q1'}


In [32]:
import pandas as pd

# Load validation file
val_path = "/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/val_embeddings_with_liwc_labeled.csv"
val_df = pd.read_csv(val_path)

# Drop unnecessary columns
columns_to_drop = ['q1', 'q2', 'q3', 'full_text', 'humility']
val_df = val_df.drop(columns=columns_to_drop, errors='ignore')

# Save the cleaned file (overwrite original)
val_df.to_csv(val_path, index=False)
print("✅ Extra columns dropped and validation file updated.")

✅ Extra columns dropped and validation file updated.


In [33]:
print("Train dtypes:\n", train_df.dtypes)
print("\nVal dtypes:\n", val_df.dtypes)

# Optional: check sample row
print("\nSample row from train:\n", train_df.iloc[0])
print("\nSample row from val:\n", val_df.iloc[0])

Train dtypes:
 id                    object
embed_0              float64
embed_1              float64
embed_2              float64
embed_3              float64
                      ...   
openness              object
conscientiousness     object
extraversion          object
agreeableness         object
neuroticism           object
Length: 838, dtype: object

Val dtypes:
 id                     int64
openness              object
conscientiousness     object
extraversion          object
agreeableness         object
                      ...   
liwc_Time            float64
liwc_Verbs           float64
liwc_We              float64
liwc_Work            float64
liwc_You             float64
Length: 838, dtype: object

Sample row from train:
 id                   -Areopagan-
embed_0                   0.9595
embed_1                  -0.4644
embed_2                    1.148
embed_3                  -0.4001
                        ...     
openness                    High
conscientiousness      

In [34]:
import pandas as pd

# Load the original validation CSV
val_path = "/Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/val_embeddings_with_liwc_labeled.csv"
val_df = pd.read_csv(val_path)

# Convert 'id' to string for consistency
val_df["id"] = val_df["id"].astype(str)

# Drop unwanted columns if they exist
columns_to_drop = ["q1", "q2", "q3", "humility", "full_text"]
val_df = val_df.drop(columns=[col for col in columns_to_drop if col in val_df.columns])

# Save cleaned DataFrame (overwrite)
val_df.to_csv(val_path, index=False)

print("✅ Validation set cleaned and saved:", val_path)

✅ Validation set cleaned and saved: /Users/arashalborz/Desktop/amiv_nlp_2025/processed_data/val_embeddings_with_liwc_labeled.csv


In [None]:
'''
>>> Script for merginf liwc features with embeddings

'''

import pandas as pd

# for validation set
embedding_csv = "../processed_data/validation/val_embeddings.csv"
liwc_csv = "../processed_data/validation/liwc_val_with_regex.csv"
output_csv = "../processed_data/validation/comb_val_liwc_embed.csv"

# for train set
#embedding_csv = "../processed_data/train/author_embeddings.csv"
#liwc_csv = "../processed_data/train/liwc_train_with_regex.csv"
#output_csv = "../processed_data/train/comb_train_liwc_embed.csv"

emb_df = pd.read_csv(embedding_csv)
liwc_df = pd.read_csv(liwc_csv)

trait_columns = ["Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Emotional stability"]
emb_df = emb_df.drop(columns=[col for col in trait_columns if col in emb_df.columns], errors="ignore")

merged_df = pd.merge(liwc_df, emb_df, on="id", how="inner")

merged_df.to_csv(output_csv, index=False)
print(f"Merged file saved to: {output_csv}")

Merged file saved to: ../processed_data/validation/comb_val_liwc_embed.csv
