In [2]:
import pandas as pd

In [3]:

def inspect_columns(file_path: str, chunk_size: int = 50000):
    """
    Inspect `reply_to_id` and `tweet_id` columns in a CSV file for issues.
    
    :param file_path: Path to the CSV file.
    :param chunk_size: Number of rows to process per chunk.
    """
    invalid_reply_to_ids = []
    invalid_tweet_ids = []
    nan_reply_to_count = 0
    nan_tweet_id_count = 0

    print(f"Inspecting file: {file_path}")
    
    for chunk in pd.read_csv(file_path, chunksize=chunk_size):
        # Check `reply_to_id` column
        reply_to_id_invalid = chunk.loc[
            ~chunk['reply_to_id'].apply(lambda x: pd.isna(x) or str(x).isdigit())
        ]
        nan_reply_to_count += chunk['reply_to_id'].isna().sum()
        invalid_reply_to_ids.extend(reply_to_id_invalid['reply_to_id'].tolist())
        
        # Check `tweet_id` column
        tweet_id_invalid = chunk.loc[
            ~chunk['tweet_id'].apply(lambda x: pd.isna(x) or str(x).isdigit())
        ]
        nan_tweet_id_count += chunk['tweet_id'].isna().sum()
        invalid_tweet_ids.extend(tweet_id_invalid['tweet_id'].tolist())

    print("\nInspection Results:")
    print(f"Total NaN values in reply_to_id: {nan_reply_to_count}")
    print(f"Total NaN values in tweet_id: {nan_tweet_id_count}")
    print(f"Sample invalid reply_to_id values: {invalid_reply_to_ids[:10]}")
    print(f"Sample invalid tweet_id values: {invalid_tweet_ids[:10]}")

# Provide the path to your CSV file
replies_file = "/Users/mogen/Desktop/Research_Case/data/Kopie von FolloweeIDs2_tweets_df_AugustPull.csv"
inspect_columns(replies_file)


Inspecting file: /Users/mogen/Desktop/Research_Case/data/Kopie von FolloweeIDs2_tweets_df_AugustPull.csv

Inspection Results:
Total NaN values in reply_to_id: 5751827
Total NaN values in tweet_id: 792399
Sample invalid reply_to_id values: [1.373893031074566e+18, 1.3674096597919048e+18, 2.305994474390856e+17, 2.3032551416070144e+17, 2.2531366808938496e+17, 2.2079796338190746e+17, 2.0998299704112333e+17, 2.09731566816084e+17, 2.0723305911065805e+17, 2.071411255244349e+17]
Sample invalid tweet_id values: [1.676173805482553e+18, 1.6674037971808584e+18, 1.6644783072436756e+18, 1.661787784536019e+18, 1.6500370260692828e+18, 1.6496250164416676e+18, 1.6494065720225956e+18, 1.6464940466749071e+18, 1.6428837161239716e+18, 1.6374641399154934e+18]


In [7]:
df_full=pd.read_csv('/Users/mogen/Desktop/Research_Case/data/Kopie von FolloweeIDs2_tweets_df_AugustPull.csv')
df_full.head()

  df_full=pd.read_csv('/Users/mogen/Desktop/Research_Case/data/Kopie von FolloweeIDs2_tweets_df_AugustPull.csv')


Unnamed: 0,full_text,tweet_id,created_at,screen_name,original_user_id,retweeted_user_ID,collected_at,reply_to_id,reply_to_user,expandedURL
0,"Have a great day Tweeps.. \nPeace &amp; Love, ...",1.676174e+18,2023-07-04 10:19:11+00:00,BellenyFufu,254185636.0,,2023-08-13 13:32:00.832978,,,[]
1,Touching &amp; Inspiring. \n\nThis isn't just ...,1.667404e+18,2023-06-10 05:30:18+00:00,BellenyFufu,254185636.0,,2023-08-13 13:32:00.832997,,,"[{'url': 'https://t.co/xPkuLPXZUu', 'expanded_..."
2,"Follow the value, not the person.",1.664478e+18,2023-06-02 03:45:27+00:00,BellenyFufu,254185636.0,,2023-08-13 13:32:00.833006,,,[]
3,"Now, war is like a hobby, people enjoy killing...",1.661788e+18,2023-05-25 17:34:17+00:00,BellenyFufu,254185636.0,,2023-08-13 13:32:00.833013,,,[]
4,"Have a great day Tweeps.. \nStay safe, stay he...",1.650037e+18,2023-04-23 07:20:57+00:00,BellenyFufu,254185636.0,,2023-08-13 13:32:00.833020,,,[]


In [2]:
# Analysis script for CSV data quality
import pandas as pd
import numpy as np
from collections import defaultdict
import logging
import sys

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Read CSV in chunks to handle large files
chunk_size = 10000
file_path = '/Users/mogen/Desktop/Research_Case/data/Kopie von FolloweeIDs2_tweets_df_AugustPull.csv'

# Storage for analysis
tweet_id_counts = defaultdict(int)
scientific_notation_examples = []
data_types_seen = set()
null_count = 0
total_rows = 0

# Process file in chunks
for chunk in pd.read_csv(file_path, chunksize=chunk_size):
    # Track total rows
    chunk_size = len(chunk)
    total_rows += chunk_size
    
    # Check data types
    data_types_seen.add(str(chunk['tweet_id'].dtype))
    
    # Count tweet_ids
    tweet_id_counts.update(chunk['tweet_id'].value_counts().to_dict())
    
    # Check for scientific notation
    for tweet_id in chunk['tweet_id']:
        if isinstance(tweet_id, str) and ('e' in tweet_id.lower()):
            scientific_notation_examples.append(tweet_id)
    
    # Count nulls
    null_count += chunk['tweet_id'].isnull().sum()

# Find duplicates
duplicates = {k: v for k, v in tweet_id_counts.items() if v > 1}

# Print analysis results
print("\n=== Dataset Analysis Results ===")
print(f"Total rows processed: {total_rows:,}")
print(f"\nData types found for tweet_id: {data_types_seen}")
print(f"Null/missing tweet_ids: {null_count:,}")
print(f"\nDuplicate tweet_ids found: {len(duplicates):,}")

if duplicates:
    print("\nSample of duplicate tweet_ids:")
    for tweet_id, count in list(duplicates.items())[:5]:
        print(f"tweet_id: {tweet_id}, occurrences: {count}")

if scientific_notation_examples:
    print("\nExamples of scientific notation found:")
    for example in scientific_notation_examples[:5]:
        print(example)

# Optional: Detailed analysis of a specific chunk with duplicates
if duplicates:
    duplicate_id = list(duplicates.keys())[0]
    print(f"\nDetailed look at records for duplicate tweet_id: {duplicate_id}")
    for chunk in pd.read_csv(file_path, chunksize=chunk_size):
        duplicate_records = chunk[chunk['tweet_id'] == duplicate_id]
        if not duplicate_records.empty:
            print("\nDuplicate records found:")
            print(duplicate_records.to_string())
            break

# Memory usage analysis
print("\nMemory usage of tweet_id counts:", f"{sys.getsizeof(tweet_id_counts):,} bytes")


=== Dataset Analysis Results ===
Total rows processed: 7,790,741

Data types found for tweet_id: {'float64'}
Null/missing tweet_ids: 792,399

Duplicate tweet_ids found: 33

Sample of duplicate tweet_ids:
tweet_id: 6.57708430782763e+17, occurrences: 2
tweet_id: 1.6842382791750124e+18, occurrences: 2
tweet_id: 1.6911854661569372e+18, occurrences: 2
tweet_id: 1.6914990347776243e+18, occurrences: 2
tweet_id: 1.671594924821676e+18, occurrences: 2

Detailed look at records for duplicate tweet_id: 6.57708430782763e+17

Duplicate records found:
                                                                                                                  full_text      tweet_id                 created_at   screen_name  original_user_id  retweeted_user_ID                collected_at  reply_to_id  reply_to_user                                                                                                                     expandedURL
1098545  Make your voice heard now! Tell Congress that t

In [3]:
# Process first chunk with null tweet_ids
for chunk in pd.read_csv(file_path, chunksize=chunk_size):
    null_rows = chunk[chunk['tweet_id'].isnull()]
    if not null_rows.empty:
        print("\nFirst 5 rows with null tweet_ids:")
        print(null_rows.head().to_string())
        
        print("\nColumn values distribution in null rows:")
        for column in null_rows.columns:
            non_null_count = null_rows[column].count()
            print(f"\n{column}:")
            print(f"Non-null values: {non_null_count}")
            if non_null_count > 0:
                print("Sample values:")
                print(null_rows[column].head().to_string())
        break


First 5 rows with null tweet_ids:
                                                                    full_text  tweet_id created_at screen_name  original_user_id  retweeted_user_ID collected_at  reply_to_id  reply_to_user expandedURL
94775  Biden's regime just won't quit until his primary rival is behind bars.       NaN        NaN         NaN               NaN                NaN          NaN          NaN            NaN         NaN
94776                                  Trump delivers troubling legal news...       NaN        NaN         NaN               NaN                NaN          NaN          NaN            NaN         NaN

Column values distribution in null rows:

full_text:
Non-null values: 2
Sample values:
94775    Biden's regime just won't quit until his prima...
94776               Trump delivers troubling legal news...

tweet_id:
Non-null values: 0

created_at:
Non-null values: 0

screen_name:
Non-null values: 0

original_user_id:
Non-null values: 0

retweeted_user_ID:
Non

In [6]:
df_test=pd.read_csv('/Users/mogen/Desktop/Research_Case/data/df_test_10k.csv')
df_test.head()

Unnamed: 0,full_text,tweet_id,created_at,screen_name,original_user_id,retweeted_user_ID,collected_at,reply_to_id,reply_to_user,expandedURL
0,"Have a great day Tweeps.. \nPeace &amp; Love, ...",1.676174e+18,2023-07-04 10:19:11+00:00,BellenyFufu,254185636.0,,2023-08-13 13:32:00.832978,,,[]
1,Touching &amp; Inspiring. \n\nThis isn't just ...,1.667404e+18,2023-06-10 05:30:18+00:00,BellenyFufu,254185636.0,,2023-08-13 13:32:00.832997,,,"[{'url': 'https://t.co/xPkuLPXZUu', 'expanded_..."
2,"Follow the value, not the person.",1.664478e+18,2023-06-02 03:45:27+00:00,BellenyFufu,254185636.0,,2023-08-13 13:32:00.833006,,,[]
3,"Now, war is like a hobby, people enjoy killing...",1.661788e+18,2023-05-25 17:34:17+00:00,BellenyFufu,254185636.0,,2023-08-13 13:32:00.833013,,,[]
4,"Have a great day Tweeps.. \nStay safe, stay he...",1.650037e+18,2023-04-23 07:20:57+00:00,BellenyFufu,254185636.0,,2023-08-13 13:32:00.833020,,,[]


In [4]:
path = '/Users/mogen/Desktop/Research_Case/results/processed_data_20250105_123512/processed_conversations.json'

with open(path, 'r', encoding='utf8') as JSONFile:
    # Read a small part of the file to inspect its structure
    partial_data = JSONFile.read(1024)  # Read the first 1024 bytes (1 KB)
    print(partial_data)


{"1691288262365663232":[{"tweet_id":"1691288441974128640","reply_to_id":"1691288262365663232","created_at":"2023-08-15 03:19:21+00:00","full_text":"@SpeakerMcCarthy @MAGAIncWarRoom https:\/\/t.co\/FKQnERPcgN","original_user_id":"4086750921.0"},{"tweet_id":"1691288576225402880","reply_to_id":"1691288262365663232","created_at":"2023-08-15 03:19:53+00:00","full_text":"@SpeakerMcCarthy Start an impeachment inquiry of Joe Biden now or just stop taking.","original_user_id":"49034507.0"},{"tweet_id":"1691288609192603648","reply_to_id":"1691288262365663232","created_at":"2023-08-15 03:20:01+00:00","full_text":"@SpeakerMcCarthy READ THE INDICTMENT KEVIN.\nhttps:\/\/t.co\/5aJK1MwzqE","original_user_id":"568345906.0"},{"tweet_id":"1691288795818168320","reply_to_id":"1691288262365663232","created_at":"2023-08-15 03:20:46+00:00","full_text":"@SpeakerMcCarthy @TrumpWarRoom They indicted everyone under the sun!\n\nIt\u2019s nuts. https:\/\/t.co\/WqlkXlccxe","original_user_id":"9.29912597519925e+17"},

In [5]:
import ijson

path = '/Users/mogen/Desktop/Research_Case/results/processed_data_20250105_123512/processed_conversations.json'

# Parse the dictionary incrementally and extract the first 3 key-value pairs
with open(path, 'r', encoding='utf8') as JSONFile:
    parser = ijson.kvitems(JSONFile, '')
    first_three_entries = {}
    for i, (key, value) in enumerate(parser):
        if i >= 3:  # Stop after the first 3 key-value pairs
            break
        first_three_entries[key] = value

print(first_three_entries)


{'1691288262365663232': [{'tweet_id': '1691288441974128640', 'reply_to_id': '1691288262365663232', 'created_at': '2023-08-15 03:19:21+00:00', 'full_text': '@SpeakerMcCarthy @MAGAIncWarRoom https://t.co/FKQnERPcgN', 'original_user_id': '4086750921.0'}, {'tweet_id': '1691288576225402880', 'reply_to_id': '1691288262365663232', 'created_at': '2023-08-15 03:19:53+00:00', 'full_text': '@SpeakerMcCarthy Start an impeachment inquiry of Joe Biden now or just stop taking.', 'original_user_id': '49034507.0'}, {'tweet_id': '1691288609192603648', 'reply_to_id': '1691288262365663232', 'created_at': '2023-08-15 03:20:01+00:00', 'full_text': '@SpeakerMcCarthy READ THE INDICTMENT KEVIN.\nhttps://t.co/5aJK1MwzqE', 'original_user_id': '568345906.0'}, {'tweet_id': '1691288795818168320', 'reply_to_id': '1691288262365663232', 'created_at': '2023-08-15 03:20:46+00:00', 'full_text': '@SpeakerMcCarthy @TrumpWarRoom They indicted everyone under the sun!\n\nIt’s nuts. https://t.co/WqlkXlccxe', 'original_user_id'

In [8]:
path = '/Users/mogen/Desktop/Research_Case/results/processed_data_20250105_130510/generated_posts.json'
import pandas as pd
import json
import csv
import json

def convert_json_to_csv(input_file, output_file):
    # Read the JSON file
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    # Get the posts from the JSON
    posts = data['generated_posts']
    
    # Extract fields for CSV
    csv_data = []
    for post in posts:
        # Parse the nested generated_text JSON
        try:
            generated_text = json.loads(post['generated_text'])['post_text']
        except:
            generated_text = post['generated_text']
            
        # Create a row for each post
        row = {
            'user_id': post['user_id'],
            'generation_id': post['generation_id'],
            'original_post_id': post['original_post_id'],
            'original_text': post['original_text'].replace('\n', ' ').replace('\r', ' '),
            'original_timestamp': post['original_timestamp'],
            'stimulus': post['stimulus'].replace('\n', ' ').replace('\r', ' '),
            'generated_text': generated_text.replace('\n', ' ').replace('\r', ' '),
            'generation_timestamp': post['generation_timestamp'],
            'persona_writing_style': post['persona_writing_style'].replace('\n', ' ').replace('\r', ' '),
            'persona_tone': post['persona_tone'].replace('\n', ' ').replace('\r', ' '),
            'persona_topics': post['persona_topics'].replace('\n', ' ').replace('\r', ' ')
        }
        csv_data.append(row)
    
    # Write to CSV
    if csv_data:
        fieldnames = csv_data[0].keys()
        with open(output_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(csv_data)
        print(f"Successfully converted {len(csv_data)} posts to CSV")
    else:
        print("No data to convert")

# Example usage
if __name__ == "__main__":
    input_file = "paste.txt"  # Your JSON file
    output_file = "generated_posts.csv"  # Output CSV file
    convert_json_to_csv(path, output_file)

Successfully converted 50 posts to CSV


In [9]:
df= pd.read_csv('/Users/mogen/Desktop/Research_Case/results/processed_data_20250105_130510/generated_posts.csv')
df.head()

Unnamed: 0,user_id,generation_id,original_post_id,original_text,original_timestamp,stimulus,generated_text,generation_timestamp,persona_writing_style,persona_tone,persona_topics
0,2960098000.0,2960098274.0_gen_0,1.690851e+18,We made this naughty fan boys dreams cum true ...,2023-08-13 22:21:05+00:00,A post about fulfilling a fan's fantasy or des...,"Oooh, you know what I love? 😏 Making dreams co...",2025-01-05T12:43:44.412911,Informal and provocative writing style with sh...,"Playful and suggestive tone, often flirtatious...","Adult entertainment, self-promotion, online in..."
1,2374201000.0,2374200834.0_gen_0,1.6915e+18,"Happy birthday, @kendylscott! 🎂🎉 https://t.co/...",2023-08-15 17:21:12+00:00,A post celebrating someone's birthday with a f...,🎉 Happy Birthday to the amazing @teammate_extr...,2025-01-05T12:43:46.726120,"The writing style is informal and dynamic, fea...",The overall tone is enthusiastic and supportiv...,"Softball, Athletes Unlimited, Great Britain wo..."
2,182047500.0,182047549.0_gen_0,1.69128e+18,This is what I was talking about... @IvanCarte...,2023-08-15 02:47:35+00:00,A post referencing a previous discussion and t...,"Hey @sportsfanatic, remember when Julio was te...",2025-01-05T12:43:52.666507,Informal and conversational writing style with...,"Casual and assertive tone, with a hint of auth...","Sports commentary, Alabama football, Julio Jon..."
3,1.36931e+18,1.36931024370074e+18_gen_0,1.691535e+18,Het rode kruis weigert de mensen te helpen op ...,2023-08-15 19:40:57+00:00,A post expressing dissatisfaction with an orga...,So @TheOrgOfficial is failing us AGAIN?! 🤔 How...,2025-01-05T12:43:55.431774,"The writing style is informal and provocative,...",The overall tone is conspiratorial and alarmis...,"Conspiracy theories, government distrust, medi..."
4,711630600.0,711630608.0_gen_0,1.658534e+18,Gracias por aparecer en nuestras vidas y tener...,2023-05-16 18:03:08+00:00,A post expressing gratitude and well wishes fo...,"{""post_text"": ""Happy Birthday to the GOAT, @Li...",2025-01-05T12:43:57.172473,The writing style is a mix of casual and refle...,The tone varies from passionate and emotional ...,"Sports, particularly football and admiration f..."


In [4]:
import json
import pandas as pd
from typing import Dict, List, Set

PERSONA_FIELDS = [
    "general_decription",
    "brevity_style",
    "language_formality", 
    "narrative_voice",
    "vocabulary_range",
    "punctuation_style",
    "controversy_handling",
    "community_role",
    "content_triggers",
    "reaction_patterns",
    "message_effectiveness",
    "opinion_expression",
    "emotional_expression",
    "cognitive_patterns",
    "social_orientation",
    "conflict_approach",
    "value_signals",
    "identity_projection",
    "belief_expression",
    "stress_indicators",
    "adaptability_signs",
    "authenticity_markers"
]

def detect_persona_fields(post: Dict) -> Dict[str, bool]:
    """
    Detect which persona fields are present in a post's metadata
    """
    present_fields = {}
    
    for field in PERSONA_FIELDS:
        # Format the field name as it appears in the data
        field_key = f"persona_{field}"
        # Check if the field exists and has content
        field_value = post.get(field_key) or ""
        if isinstance(field_value, str):
            field_value = field_value.strip()
        present_fields[field_key] = bool(field_value)
    
    return present_fields

def process_eval_results(eval_data: Dict, generated_posts: List[Dict]) -> pd.DataFrame:
    # Create a list to store the processed data
    processed_data = []
    
    # Get individual evaluations from the new data structure
    individual_evaluations = eval_data.get('individual_evaluations', [])
    
    # Create a mapping of evaluation data by original_id
    eval_mapping = {
        str(eval_item['metadata']['original_id']): eval_item 
        for eval_item in individual_evaluations
    }
    
    # Store aggregate metrics for later use if needed
    aggregate_metrics = eval_data.get('aggregate_metrics', {})
    
    # Process each post
    for post in generated_posts:
        base_user_id = post.get('user_id')
        base_original_post_id = post.get('original_post_id')
        base_original_text = post.get('original_text')
        base_stimulus = post.get('stimulus')
        
        # Get the generated text directly
        generated_text = post.get('generated_text')

        # Get corresponding evaluation data using original_id
        eval_item = eval_mapping.get(str(base_original_post_id), {})
        rouge_scores = eval_item.get('rouge_scores', {})
        llm_evaluation = eval_item.get('llm_evaluation', {})
        
        # Create row
        row = {
            'user_id': base_user_id,
            'original_post_id': base_original_post_id,
            'original_text': base_original_text,
            'stimulus': base_stimulus,
            'generated_text': generated_text,
            'similarity_scores': eval_item.get('similarity_scores'),
            'rouge1_fmeasure': rouge_scores.get('rouge1', {}).get('fmeasure'),
            'rouge1_recall': rouge_scores.get('rouge1', {}).get('recall'),
            'rouge1_precision': rouge_scores.get('rouge1', {}).get('precision'),
            'rouge2_fmeasure': rouge_scores.get('rouge2', {}).get('fmeasure'),
            'rouge2_recall': rouge_scores.get('rouge2', {}).get('recall'),
            'rouge2_precision': rouge_scores.get('rouge2', {}).get('precision'),
            'rougel_fmeasure': rouge_scores.get('rougeL', {}).get('fmeasure'),
            'rougel_recall': rouge_scores.get('rougeL', {}).get('recall'),
            'rougel_precision': rouge_scores.get('rougeL', {}).get('precision'),
            # Add LLM evaluation metrics correctly
            'llm_evaluation_authenticity': llm_evaluation.get('authenticity', {}).get('score'),
            'llm_evaluation_style_consistency': llm_evaluation.get('style_consistency', {}).get('score'),
            'llm_evaluation_matching_intent': llm_evaluation.get('matching_intent')
        }
        
        # Add persona field detection results
        persona_fields = detect_persona_fields(post)
        row.update(persona_fields)
        
        processed_data.append(row)
    
    # Convert to DataFrame
    df = pd.DataFrame(processed_data)
    
    # Add aggregate metrics as additional information
    df.attrs['aggregate_metrics'] = aggregate_metrics
    df.attrs['total_evaluated'] = eval_data.get('metadata', {}).get('total_evaluated')
    df.attrs['evaluation_timestamp'] = eval_data.get('metadata', {}).get('evaluation_timestamp')
    
    return df

if __name__ == "__main__":
    # Read the files
    with open('/Users/mogen/Desktop/Research_Case/evaluation_results/eval_results_20250203_082545.json', 'r') as f:
        eval_data = json.load(f)
    
    with open('/Users/mogen/Desktop/Research_Case/fine_tune03/generated_posts.json', 'r') as f:
        posts_data = json.load(f)
        generated_posts = posts_data['generated_posts']
        
    # Process the data
    df = process_eval_results(eval_data, generated_posts)
    
    # Print basic statistics
    print(f"\nTotal number of generations processed: {len(df)}")
    print(f"Number of unique original posts: {df['original_post_id'].nunique()}")
    print(f"Average generations per post: {len(df) / df['original_post_id'].nunique():.2f}")
    
    # Print persona field distribution
    persona_columns = [col for col in df.columns if col.startswith('persona_')]
    print("\nPersona Fields Distribution:")
    print(df[persona_columns].sum())
    
    # Print aggregate metrics
    print("\nAggregate Metrics:")
    print(f"Total Evaluated: {df.attrs['total_evaluated']}")
    print(f"Evaluation Timestamp: {df.attrs['evaluation_timestamp']}")
    print("\nROUGE Score Means:")
    for rouge_type, scores in df.attrs['aggregate_metrics']['rouge'].items():
        print(f"{rouge_type}:")
        for metric, values in scores.items():
            print(f"  {metric}: {values['mean']:.4f}")
            
    # Print LLM evaluation metrics means
    llm_eval_columns = ['llm_evaluation_authenticity', 'llm_evaluation_style_consistency', 'llm_evaluation_matching_intent']
    print("\nLLM Evaluation Metrics Means:")
    for col in llm_eval_columns:
        if col in df.columns:
            print(f"{col}: {df[col].mean():.4f}")
    
    # Save to CSV
    output_path = '/Users/mogen/Desktop/Research_Case/fine_tune03/8k_gpt4o_rnd_shuffled_personas_2.csv'
    df.to_csv(output_path, index=False)
    print(f"\nResults saved to {output_path}")


Total number of generations processed: 8000
Number of unique original posts: 7991
Average generations per post: 1.00

Persona Fields Distribution:
persona_general_decription       1838
persona_brevity_style            1762
persona_language_formality       1820
persona_narrative_voice          1822
persona_vocabulary_range         1772
persona_punctuation_style        1840
persona_controversy_handling     1842
persona_community_role           1750
persona_content_triggers         1738
persona_reaction_patterns        1866
persona_message_effectiveness    1836
persona_opinion_expression       1864
persona_emotional_expression     1816
persona_cognitive_patterns       1860
persona_social_orientation       1862
persona_conflict_approach        1820
persona_value_signals            1910
persona_identity_projection      1804
persona_belief_expression        1844
persona_stress_indicators        1736
persona_adaptability_signs       1788
persona_authenticity_markers     1810
dtype: int64

Ag

KeyError: 'rouge'