In [12]:
import json
import csv
import re

posts_file = "/Users/mogen/Desktop/Research_Case/results/Run_o3_categories/generated_posts.json"  # Change to your actual filename
personas_file = "/Users/mogen/Desktop/Research_Case/results/Run_o3_categories/personas.json"  # Change to your actual filename
output_file = "/Users/mogen/Desktop/Research_Case/results/Final_finetune/03_categories.csv"
# Read the JSON data from files
with open(posts_file, 'r', encoding='utf-8') as f:
    posts_data = json.load(f)

with open(personas_file, 'r', encoding='utf-8') as f:
    personas_data = json.load(f)

# Extract the generated posts
generated_posts = posts_data.get("generated_posts", [])

# Prepare the output data
output_rows = []

# Process each post
for post in generated_posts:
    user_id = post.get("user_id")
    
    # Get persona data for this user
    persona_data = personas_data.get(user_id, {})
    
    # Get the original tweet text directly (no need to parse it as JSON)
    tweet_text = post.get("original_text", "")
    
    # Get the stimulus
    stimulus = post.get("stimulus", "")
    
    # Count persona fields
    persona_field_count = 0
    for field, value in persona_data.items():
        if value and isinstance(value, str) and value.strip():
            persona_field_count += 1
    
    # Create a summary of the persona (include all fields with their keys)
    persona_summary = ""
    for field, value in persona_data.items():
        if value and isinstance(value, str) and value.strip():
            # Add the field name and its full value
            persona_summary += f"{field}: {value}; "
    
    # Add row to output
    output_rows.append({
        "Tweet": tweet_text,
        "Persona": persona_summary,
        "Stimulus": stimulus,
        "Number of Persona Fields": persona_field_count
    })

# Write to CSV
with open(output_file, 'w', encoding='utf-8', newline='') as f:
    fieldnames = ["Tweet", "Persona", "Stimulus", "Number of Persona Fields"]
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(output_rows)

print(f"Combined data saved to {output_file}")
print(f"Total records processed: {len(output_rows)}")

Combined data saved to /Users/mogen/Desktop/Research_Case/results/Final_finetune/03_categories.csv
Total records processed: 50


In [15]:
# Assuming you have 4 DataFrames: df1, df2, df3, df4
import pandas as pd
# If you need to load them from files, uncomment and modify these lines:
df1 = pd.read_csv('/Users/mogen/Desktop/Research_Case/results/Final_finetune/03_categories.csv')
df2 = pd.read_csv('/Users/mogen/Desktop/Research_Case/results/Final_finetune/03_open_des.csv')
df3 = pd.read_csv('/Users/mogen/Desktop/Research_Case/results/Final_finetune/fine_tune.csv')
df4 = pd.read_csv('/Users/mogen/Desktop/Research_Case/results/Final_finetune/fine_tune03.csv')

# List of all the DataFrames
dfs = [df1, df2, df3, df4]

# Combine all DataFrames into one
combined_df = pd.concat(dfs, ignore_index=True)

# Reset the index to have a clean sequential index
combined_df.reset_index(drop=True, inplace=True)

# Check the shape of the combined DataFrame
print(f"Combined DataFrame shape: {combined_df.shape}")

# Check for duplicate rows (optional)
duplicate_count = combined_df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")

# Remove duplicates if needed
combined_df.drop_duplicates(inplace=True)

# Save the combined DataFrame to a CSV file
combined_df.to_csv('/Users/mogen/Desktop/Research_Case/results/Final_finetune/combined_data.csv', index=False)

print("Combined DataFrame saved to '/Users/mogen/Desktop/Research_Case/results/Final_finetune/combined_data.csv'")

Combined DataFrame shape: (14851, 4)
Number of duplicate rows: 20
Combined DataFrame saved to '/Users/mogen/Desktop/Research_Case/results/Final_finetune/combined_data.csv'


In [16]:
import pandas as pd
import numpy as np
import random
import re

# Step 1: Load the combined DataFrame
combined_df = pd.read_csv('/Users/mogen/Desktop/Research_Case/results/Final_finetune/combined_data.csv')

# Step 2: Check the current distribution of persona fields
print("Current distribution of number of persona fields:")
print(combined_df['Number of Persona Fields'].value_counts().sort_index())

# Step 3: Define our target distribution between 3 and 8 fields
# We want roughly equal numbers for each category
total_rows = len(combined_df)
target_count_per_category = total_rows // 6  # 6 categories (3,4,5,6,7,8)
print(f"Target count per category: ~{target_count_per_category}")

# Step 4: Create a function to adjust the number of fields in a persona string
def adjust_persona_fields(persona_str, target_num_fields):
    # Split the persona string into individual fields
    fields = re.split(r';\s*', persona_str.strip())
    fields = [f for f in fields if f]  # Remove empty fields
    
    current_num_fields = len(fields)
    
    # If we need to reduce fields
    if current_num_fields > target_num_fields:
        # Randomly select fields to keep
        selected_fields = random.sample(fields, target_num_fields)
        return '; '.join(selected_fields) + ';'
    
    # If we need to add fields
    elif current_num_fields < target_num_fields:
        # Calculate how many fields to add
        fields_to_add = target_num_fields - current_num_fields
        
        # Duplicate some existing fields with slightly modified text
        extra_fields = []
        for _ in range(fields_to_add):
            if fields:  # Make sure we have fields to duplicate
                field_to_duplicate = random.choice(fields)
                field_parts = field_to_duplicate.split(': ', 1)
                
                if len(field_parts) == 2:
                    key, value = field_parts
                    # Create a new field with a slightly modified key
                    new_key = key + "_extended"
                    extra_fields.append(f"{new_key}: {value}")
            
        # Add the new fields
        all_fields = fields + extra_fields
        return '; '.join(all_fields) + ';'
    
    # If already at target, return as is
    else:
        return persona_str

# Step 5: Create a new column for the target number of fields
# Distribute target field counts evenly between 3 and 8
target_distribution = []
for i in range(3, 9):  # 3,4,5,6,7,8
    target_distribution.extend([i] * target_count_per_category)

# If we have more rows than our distribution covers, repeat the pattern
while len(target_distribution) < total_rows:
    target_distribution.append(random.randint(3, 8))

# Shuffle the distribution and trim if needed
random.shuffle(target_distribution)
target_distribution = target_distribution[:total_rows]

combined_df['Target Number of Fields'] = target_distribution

# Step 6: Apply the adjustment function to each row
combined_df['Adjusted Persona'] = combined_df.apply(
    lambda row: adjust_persona_fields(row['Persona'], row['Target Number of Fields']), 
    axis=1
)

# Step 7: Verify the new distribution
# Count fields in the adjusted persona strings
def count_fields(persona_str):
    fields = re.split(r';\s*', persona_str.strip())
    return len([f for f in fields if f])

combined_df['Adjusted Number of Fields'] = combined_df['Adjusted Persona'].apply(count_fields)

print("\nNew distribution of number of persona fields:")
print(combined_df['Adjusted Number of Fields'].value_counts().sort_index())

# Step 8: Save the adjusted DataFrame
combined_df.to_csv('/Users/mogen/Desktop/Research_Case/results/Final_finetune/adjusted_persona_data.csv', index=False)

print("\nAdjusted DataFrame saved!")

Current distribution of number of persona fields:
Number of Persona Fields
5     14738
13       50
21       43
Name: count, dtype: int64
Target count per category: ~2471

New distribution of number of persona fields:
Adjusted Number of Fields
3    2473
4    2471
5    2472
6    2481
7    2485
8    2449
Name: count, dtype: int64

Adjusted DataFrame saved!


In [20]:
path = '/Users/mogen/Desktop/Research_Case/results/Final_finetune/adjusted_persona_data.csv'
df= pd.read_csv(path)

df['Adjusted Persona'][0]

"length_of_post: Posts are typically concise, often consisting of brief narratives or summaries of historical events, accompanied by open-ended questions or calls for reflection. There is a balance between informative content and engaging prompts, with a preference for medium-length posts that are easily digestible.; interests_hobbies: The user has a strong interest in history, cultural artifacts, and notable historical figures across various timelines and regions. There is a consistent pattern of sharing stories about historical events, curiosities, and achievements, indicating a deep engagement with historical exploration.; humor_style: The user exhibits a diverse sense of humor, often leaning towards historical irony and light-hearted commentary on unusual historical events, such as the 'Great Emu War' and 'Wanksy' graffiti. There is a playful engagement with historical anecdotes and occasional use of rhetorical questions to provoke thought.; demographic_indicators: The user's conte

In [23]:
df=df.drop(columns=['Target Number of Fields','Number of Persona Fields', 'Persona' ])

In [24]:
df.head()

Unnamed: 0,Tweet,Stimulus,Adjusted Persona,Adjusted Number of Fields
0,"A drum solo by 18-year-old Karen Carpenter, 19...",A social media post highlighting a performance...,"length_of_post: Posts are typically concise, o...",4
1,You're doing a disservice by diluting what con...,A post discussing the interpretation of certai...,interests_hobbies: The user is deeply interest...,7
2,Decent odds you end up with a 100% non white b...,A post discussing diversity and representation...,cultural_context: The user is influenced by co...,3
3,Twitter hacker and crypto scammer sentenced to...,A post about a legal sentencing related to cyb...,length_of_post: Posts are typically short and ...,7
4,$760 billion in PPP loans were forgiven.\n\nTh...,A post discussing financial relief programs an...,social_dynamics: The user is an active partici...,8


In [25]:
df=df.rename(columns={"Adjusted Persona": "persona", "Adjusted Number of Fields": "N_fields"})

df.head()

Unnamed: 0,Tweet,Stimulus,persona,N_fields
0,"A drum solo by 18-year-old Karen Carpenter, 19...",A social media post highlighting a performance...,"length_of_post: Posts are typically concise, o...",4
1,You're doing a disservice by diluting what con...,A post discussing the interpretation of certai...,interests_hobbies: The user is deeply interest...,7
2,Decent odds you end up with a 100% non white b...,A post discussing diversity and representation...,cultural_context: The user is influenced by co...,3
3,Twitter hacker and crypto scammer sentenced to...,A post about a legal sentencing related to cyb...,length_of_post: Posts are typically short and ...,7
4,$760 billion in PPP loans were forgiven.\n\nTh...,A post discussing financial relief programs an...,social_dynamics: The user is an active partici...,8


In [26]:
len(df)

14831

In [27]:
df.to_csv('/Users/mogen/Desktop/Research_Case/results/Final_finetune/14831r_3bis8P_randomsample.csv')