In [None]:
import numpy as np
import pandas as pd
from collections import defaultdict, Counter

# === Load LIWC .dic file ===
def load_liwc_dic(dic_path):
    category_map = defaultdict(list)
    with open(dic_path, 'r', encoding='utf-8') as f:
        for line in f:
            if ':' not in line:
                continue
            parts = line.strip().split()
            category = parts[0].rstrip(':')
            words = parts[1:]
            category_map[category] = words
    return category_map

# === Extract LIWC features from a single text ===
def liwc_embedding(text, category_map):
    tokens = text.lower().split()
    counts = Counter()
    for category, words in category_map.items():
        for token in tokens:
            if token in words:
                counts[category] += 1
    sorted_categories = sorted(category_map.keys())
    vec = np.array([counts.get(cat, 0) for cat in sorted_categories])
    return vec

# === Full processing ===
def extract_liwc_features(texts, dic_path):
    category_map = load_liwc_dic(dic_path)
    sorted_categories = sorted(category_map.keys())

    liwc_features = []
    for text in texts:
        vec = liwc_embedding(text, category_map)
        # Optional: normalize to make it a distribution
        if np.sum(vec) > 0:
            vec = vec / np.sum(vec)
        liwc_features.append(vec)

    df = pd.DataFrame(liwc_features, columns=[f"liwc_{cat}" for cat in sorted_categories])
    return df

# === Example usage ===
if __name__ == "__main__":
    # Example input: list of concatenated author comments
    texts = [
        "I always help my friends and feel bad when others are sad.",
        "I love challenges, taking risks and trying new things.",
        "People are mostly stupid and deserve what they get."
    ]

    dic_path = "output.dic"  # Path to your LIWC .dic file
    liwc_df = extract_liwc_features(texts, dic_path)

    # Save or inspect
    liwc_df.to_csv("liwc_only_features.csv", index=False)
    print("✅ LIWC features saved to liwc_only_features.csv")
    print(liwc_df.head())

✅ LIWC features saved to liwc_only_features.csv
   liwc_Achiev  liwc_Adverbs  liwc_Affect  liwc_Anger  liwc_Anx  liwc_Article  \
0         0.00      0.027027     0.027027         0.0       0.0           0.0   
1         0.05      0.000000     0.050000         0.0       0.0           0.0   
2         0.00      0.050000     0.000000         0.0       0.0           0.0   

   liwc_Assent  liwc_AuxVb  liwc_Bio  liwc_Body  ...  liwc_Social  liwc_Space  \
0          0.0    0.027027      0.00        0.0  ...     0.027027         0.0   
1          0.0    0.000000      0.05        0.0  ...     0.050000         0.0   
2          0.0    0.050000      0.00        0.0  ...     0.050000         0.0   

   liwc_Swear  liwc_Tentat  liwc_They  liwc_Time  liwc_Verbs  liwc_We  \
0         0.0         0.00       0.00   0.054054    0.054054      0.0   
1         0.0         0.00       0.00   0.050000    0.100000      0.0   
2         0.0         0.05       0.05   0.000000    0.050000      0.0   

   liwc_W

In [4]:
"""
Extract LIWC features from filtered_pandora.json
Each author → concatenate all comments → get normalized LIWC category vector
Also saves Big Five traits and author ID
"""

import os
import json
import numpy as np
import pandas as pd
from collections import defaultdict, Counter

# ========== LIWC DICTIONARY LOADER ==========

def load_liwc_dic(dic_path):
    category_map = defaultdict(list)
    with open(dic_path, 'r', encoding='utf-8') as f:
        for line in f:
            if ':' not in line:
                continue
            parts = line.strip().split()
            category = parts[0].rstrip(':')
            words = parts[1:]
            category_map[category] = words
    return category_map

# ========== LIWC FEATURE EXTRACTOR ==========

def liwc_embedding(text, category_map):
    tokens = text.lower().split()
    counts = Counter()
    for category, words in category_map.items():
        for token in tokens:
            if token in words:
                counts[category] += 1
    sorted_categories = sorted(category_map.keys())
    vec = np.array([counts.get(cat, 0) for cat in sorted_categories])
    return vec

# ========== MAIN PROCESS ==========

# --- Paths ---
liwc_dic_path = "output.dic"  # Path to your LIWC .dic file
input_filename = "filtered_pandora.json"
input_folder = os.path.dirname("/Users/arashalborz/Desktop/Data/filtered_pandora.json")  # ⬅️ Set your actual path here
input_path = os.path.join(input_folder, input_filename)
save_path = os.path.join(input_folder, "liwc_author_data.csv")

# --- Load LIWC dict ---
category_map = load_liwc_dic(liwc_dic_path)
sorted_categories = sorted(category_map.keys())

# --- Load JSON data ---
with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# --- Process authors ---
rows = []

for author in data["authors"]:
    author_id = author["id"]
    comments = author.get("comments", [])
    full_text = " ".join(comments)

    # LIWC embedding
    vec = liwc_embedding(full_text, category_map)
    if np.sum(vec) > 0:
        vec = vec / np.sum(vec)  # normalize vector to sum to 1

    # Big Five traits
    traits = author["labels"]

    # Build row
    row = {
        "author_id": author_id,
        "openness": traits["openness"],
        "conscientiousness": traits["conscientiousness"],
        "extraversion": traits["extraversion"],
        "agreeableness": traits["agreeableness"],
        "neuroticism": traits["neuroticism"]
    }

    # Add LIWC features
    for i, cat in enumerate(sorted_categories):
        row[f"liwc_{cat}"] = vec[i]

    rows.append(row)

# --- Save to CSV ---
df = pd.DataFrame(rows)
df.to_csv(save_path, index=False)
print(f"✅ LIWC features saved to {save_path}")

✅ LIWC features saved to /Users/arashalborz/Desktop/Data/liwc_author_data.csv
