In [None]:
# Concatenate two CSV files column-wise (side by side)

import pandas as pd

# Read the two inputs in one pass (per the original notes, file1.csv has
# 5 columns and file2.csv has 3 columns).
df1, df2 = (pd.read_csv(csv_path) for csv_path in ('file1.csv', 'file2.csv'))

# Put the frames next to each other: the columns of df2 follow those of df1.
combined_df = pd.concat((df1, df2), axis='columns')

# Write the widened table to disk, leaving the index out of the file.
combined_df.to_csv('combined_file.csv', index=False)

In [29]:
import json
import random

# === CONFIG ===
SAMPLE_SIZE = 700   # maximum number of authors to keep
SAMPLE_SEED = 42    # fixed seed so re-running the cell selects the same authors

# Load your JSON data
with open('data/filtered_pandora_all.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Randomly sample up to SAMPLE_SIZE authors.
# random.sample raises ValueError when asked for more items than the
# population holds, so the request is capped at the number of authors.
# A private Random instance is used so seeding here does not alter the
# state of the module-level generator.
authors = data['authors']
sampled_authors = random.Random(SAMPLE_SEED).sample(
    authors, min(SAMPLE_SIZE, len(authors))
)

# Save to new JSON (same top-level {'authors': [...]} layout as the input)
filtered_pandora_small = {'authors': sampled_authors}

with open('filtered_pandora_small.json', 'w', encoding='utf-8') as f:
    json.dump(filtered_pandora_small, f, ensure_ascii=False, indent=2)

# Report the number actually written, which may be below SAMPLE_SIZE.
print(f"Sampled {len(sampled_authors)} authors and saved to 'filtered_pandora_small.json'")


Sampled 700 authors and saved to 'filtered_pandora_small.json'


In [26]:
# Add the rows of the synthetic CSV to filtered_pandora_small.json

import pandas as pd
import json
import random

# === CONFIG ===
csv_file = 'synthetic_val_data.csv'       # Replace with your CSV filename
output_file = 'filtered_pandora_small.json'      # Output JSON filename

# === LOAD CSV ===
df = pd.read_csv(csv_file)

# === DEFINE COLUMNS ===
id_column = 'id'
comment_columns = ['Q1', 'Q2', 'Q3']  # Change if yours are named differently
trait_columns = ['Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Emotional stability']

# === CONVERT EACH ROW TO JSON OBJECT ===
# One entry per CSV row: string id, one label per trait column, and the
# non-missing answers from the comment columns.
json_list = []

for _, row in df.iterrows():
    entry = {
        "id": str(row[id_column]),
        "labels": {trait: row[trait] for trait in trait_columns},
        "comments": [str(row[col]) for col in comment_columns if pd.notna(row[col])]
    }
    json_list.append(entry)

random.shuffle(json_list)

# === MERGE INTO EXISTING JSON FILE ===
# Opening the file in append mode and dumping a second JSON document after
# the existing one leaves a file that json.load can no longer parse.
# Instead, read what is there, extend its "authors" list, and rewrite it.
try:
    with open(output_file, 'r', encoding='utf-8') as f:
        existing = json.load(f)
except FileNotFoundError:
    existing = {"authors": []}

# Tolerate a file that holds a bare list of entries instead of {"authors": [...]}.
if isinstance(existing, list):
    existing = {"authors": existing}

# NOTE(review): re-running this cell adds the CSV rows again; another cell in
# this notebook performs the same conversion into the same output file.
existing["authors"] = existing.get("authors", []) + json_list

with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(existing, f, indent=2, ensure_ascii=False)

print(f"✅ JSON file created: {output_file}")

✅ JSON file created: filtered_pandora_small.json


In [30]:

# === CONFIG ===
csv_file = 'synthetic_val_data.csv'       # Replace with your CSV filename
output_file = 'filtered_pandora_small.json'      # Output JSON filename

# === LOAD CSV ===
# Relies on `pd` and `json` imported by earlier cells of this notebook.
df = pd.read_csv(csv_file)

# === DEFINE COLUMNS ===
id_column = 'id'
comment_columns = ['Q1', 'Q2', 'Q3']  # Change if yours are named differently
trait_columns = ['Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Emotional stability']

# === CONVERT EACH ROW TO JSON OBJECT ===
# One entry per CSV row, kept in CSV order: string id, one label per trait
# column, and the non-missing answers from the comment columns.
json_list = []

for _, row in df.iterrows():
    entry = {
        "id": str(row[id_column]),
        "labels": {trait: row[trait] for trait in trait_columns},
        "comments": [str(row[col]) for col in comment_columns if pd.notna(row[col])]
    }
    json_list.append(entry)


# === MERGE INTO EXISTING JSON FILE ===
# Opening the file in append mode and dumping a second JSON document after
# the existing one leaves a file that json.load can no longer parse.
# Instead, read what is there, extend its "authors" list, and rewrite it.
try:
    with open(output_file, 'r', encoding='utf-8') as f:
        existing = json.load(f)
except FileNotFoundError:
    existing = {"authors": []}

# Tolerate a file that holds a bare list of entries instead of {"authors": [...]}.
if isinstance(existing, list):
    existing = {"authors": existing}

# NOTE(review): re-running this cell adds the CSV rows again; an earlier cell
# in this notebook performs the same conversion into the same output file.
existing["authors"] = existing.get("authors", []) + json_list

with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(existing, f, indent=2, ensure_ascii=False)

print(f"✅ JSON file created: {output_file}")

✅ JSON file created: filtered_pandora_small.json


In [None]:
import pandas as pd
import json

# === CONFIG ===
csv_file = 'synthetic_val_data_labeled.csv'       # Your labeled synthetic data
original_json_file = 'filtered_pandora.json'      # Original Reddit training data
output_file = 'filtered_pandora_merged.json'      # Final merged output

# === LOAD DATA ===
df = pd.read_csv(csv_file)
with open(original_json_file, 'r', encoding='utf-8') as f:
    original_data = json.load(f)

# === DEFINE COLUMNS ===
comment_columns = ['Q1', 'Q2', 'Q3']
trait_columns = ['Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Emotional Stability', 'Humility']

# === CONVERT SYNTHETIC ROWS ===
# Each CSV row becomes one author entry. The label keys are derived from
# trait_columns (lower-cased, spaces replaced by underscores), so the list
# above is the single place to edit when traits change.
synthetic_authors = []

for _, row in df.iterrows():
    labels = {}
    for trait in trait_columns:
        value = row[trait]
        # A missing cell is read as NaN (a float), which has no .lower();
        # store it as None (JSON null) instead of failing on that row.
        labels[trait.lower().replace(' ', '_')] = (
            str(value).lower() if pd.notna(value) else None
        )

    author_entry = {
        "id": str(row["id"]),
        "labels": labels,
        # Missing answers are dropped rather than stored as "nan".
        "comments": [str(row[c]) for c in comment_columns if pd.notna(row[c])]
    }
    synthetic_authors.append(author_entry)

# === MERGE & SAVE ===
# Original authors first, synthetic authors appended after them.
merged_data = {
    "authors": original_data["authors"] + synthetic_authors
}

with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(merged_data, f, indent=2, ensure_ascii=False)

print(f"✅ Merged JSON saved to: {output_file}")