In [1]:
import json
import pandas as pd
import os

def process_json_to_csv(input_file, output_file):
    """
    Convert a JSON file of generated posts into a three-column CSV.

    The input must decode to an object holding a 'generated_posts' list. Every
    post becomes one CSV row:

      * 'tweet'    - the post's 'original_text'
      * 'stimulus' - the post's 'stimulus'
      * 'persona'  - each 'persona_*' key of the post rendered as
                     "key: value", one per line, in the post's own key order

    Progress messages and a preview of the first rows are printed to stdout.

    Args:
        input_file (str): Path to input JSON file
        output_file (str): Path to output CSV file. Its parent directory is
            created when it does not exist yet.

    Returns:
        None

    Raises:
        Exception: Any error other than FileNotFoundError,
            json.JSONDecodeError or KeyError is reported and re-raised.
            Those three are only reported on stdout and then swallowed.
    """
    try:
        # Check if input file exists
        if not os.path.exists(input_file):
            raise FileNotFoundError(f"Input file '{input_file}' not found")

        # Create output directory if it doesn't exist
        output_dir = os.path.dirname(output_file)
        if output_dir and not os.path.exists(output_dir):
            # exist_ok covers the directory appearing between the check and the call
            os.makedirs(output_dir, exist_ok=True)
            print(f"Created output directory: {output_dir}")

        # Read the JSON file
        with open(input_file, 'r', encoding='utf-8') as f:
            content = f.read()

        # json.loads rejects a leading BOM, so drop it before parsing.
        content = content.lstrip('\ufeff')
        # Add the missing opening brace if needed. Leading whitespace is
        # ignored for the check so an already valid document is left alone.
        if not content.lstrip().startswith('{'):
            content = '{' + content
        data = json.loads(content)

        # Process each post
        processed_data = []
        for post in data['generated_posts']:
            persona_lines = [
                f"{key}: {value}"
                for key, value in post.items()
                if key.startswith('persona_')
            ]
            processed_data.append({
                'tweet': post['original_text'],
                'stimulus': post['stimulus'],
                'persona': '\n'.join(persona_lines),
            })

        # Convert to DataFrame and save as CSV
        df = pd.DataFrame(processed_data)
        print(f'rows:{len(df)}')
        df.to_csv(output_file, index=False)
        print(f"Successfully processed {len(processed_data)} records")
        print(f"Output saved to: {output_file}")

        # Display first few rows as sample
        print("\nFirst few rows of the processed data:")
        print(df.head())

    except FileNotFoundError as e:
        print(f"Error: {str(e)}")
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON format - {str(e)}")
    except KeyError as e:
        print(f"Error: Missing required field - {str(e)}")
    except Exception as e:
        print(f"Error: An unexpected error occurred - {str(e)}")
        raise  # Re-raise the exception to see the full traceback

# Script entry point: convert the generated posts into fine_tune04.csv
if __name__ == "__main__":
    # Absolute locations of the source JSON and of the CSV to write. They do
    # not depend on the working directory, which is printed for reference only.
    input_file = "/Users/mogen/Desktop/Research_Case/fine_tune03/generated_posts.json"
    output_file = "/Users/mogen/Desktop/Research_Case/fine_tune03/fine_tune04.csv"

    current_dir = os.getcwd()
    print(f"Current working directory: {current_dir}")
    print(f"Processing file: {input_file}")
    print(f"Output will be saved to: {output_file}")

    process_json_to_csv(input_file, output_file)

Current working directory: /Users/mogen/Desktop/Research_Case/notebooks
Processing file: /Users/mogen/Desktop/Research_Case/fine_tune03/generated_posts.json
Output will be saved to: /Users/mogen/Desktop/Research_Case/fine_tune03/fine_tune04.csv
rows:8000
Successfully processed 8000 records
Output saved to: /Users/mogen/Desktop/Research_Case/fine_tune03/fine_tune04.csv

First few rows of the processed data:
                                               tweet  \
0  Usually, big tech companies are more than will...   
1  Attention bolshevik dirtbags,\n\nSwiss researc...   
2  Another Arraignment Day, yet Sean Hannity says...   
3  Old school hip-hip makes celebrity net worth s...   
4  Day 931!\nJudge Tanya Chutkan issued a protect...   

                                            stimulus  \
0  Comment: Discussion on the willingness of big ...   
1  Statement: Swiss research indicates that the f...   
2  Comment: Discussion on Sean Hannity's public s...   
3  Comment: Comparison of net

In [2]:
import json
import pandas as pd
import os

# Define persona fields in order. The position of a name in this list is the
# position of its flag in every persona vector built below.
# NOTE(review): "general_decription" looks like a misspelling of "description",
# but it has to equal the suffix of the matching 'persona_*' key in the input
# JSON - confirm against the data before changing it.
PERSONA_FIELDS = [
    "general_decription",
    "brevity_style",
    "language_formality",
    "narrative_voice",
    "vocabulary_range",
    "punctuation_style",
    "controversy_handling",
    "community_role",
    "content_triggers",
    "reaction_patterns",
    "message_effectiveness",
    "opinion_expression",
    "emotional_expression",
    "cognitive_patterns",
    "social_orientation",
    "conflict_approach",
    "value_signals",
    "identity_projection",
    "belief_expression",
    "stress_indicators",
    "adaptability_signs",
    "authenticity_markers"
]

def process_json_to_csv(input_file, output_file):
    """
    Transform JSON data into CSV with persona fields represented as a single vector column.

    The input must decode to an object holding a 'generated_posts' list. Every
    post becomes one CSV row with these columns, in this order:

      * 'tweet'          - the post's 'original_text'
      * 'stimulus'       - the post's 'stimulus'
      * 'persona_text'   - each 'persona_*' key rendered as "key: value",
                           one per line, in the post's own key order
      * 'persona_vector' - str() of a 0/1 list aligned with PERSONA_FIELDS;
                           a 1 means the post carries 'persona_<field>'

    Progress messages, a preview of the first rows and a per-field presence
    summary are printed to stdout.

    Args:
        input_file (str): Path to input JSON file
        output_file (str): Path to output CSV file. Its parent directory is
            created when it does not exist yet.

    Returns:
        None

    Raises:
        Exception: Any error other than FileNotFoundError,
            json.JSONDecodeError or KeyError is reported and re-raised.
            Those three are only reported on stdout and then swallowed.
    """
    persona_prefix = 'persona_'
    # Field name -> vector position; setdefault keeps the first position of a
    # name, which is what list.index() would return.
    field_positions = {}
    for position, name in enumerate(PERSONA_FIELDS):
        field_positions.setdefault(name, position)

    try:
        # Check if input file exists
        if not os.path.exists(input_file):
            raise FileNotFoundError(f"Input file '{input_file}' not found")

        # Create output directory if it doesn't exist
        output_dir = os.path.dirname(output_file)
        if output_dir and not os.path.exists(output_dir):
            # exist_ok covers the directory appearing between the check and the call
            os.makedirs(output_dir, exist_ok=True)
            print(f"Created output directory: {output_dir}")

        # Read the JSON file
        with open(input_file, 'r', encoding='utf-8') as f:
            content = f.read()

        # json.loads rejects a leading BOM, so drop it before parsing.
        content = content.lstrip('\ufeff')
        # Add the missing opening brace if needed. Leading whitespace is
        # ignored for the check so an already valid document is left alone.
        if not content.lstrip().startswith('{'):
            content = '{' + content

        data = json.loads(content)

        # Process each post
        processed_data = []
        # Per-field presence counts, tallied from the in-memory vectors so the
        # summary never has to eval() the stringified CSV column.
        vector_sums = [0] * len(PERSONA_FIELDS)
        for post in data['generated_posts']:
            tweet = post['original_text']
            stimulus = post['stimulus']

            persona_vector = [0] * len(PERSONA_FIELDS)
            persona_texts = []
            for key, value in post.items():
                if not key.startswith(persona_prefix):
                    continue
                position = field_positions.get(key[len(persona_prefix):])
                if position is not None:
                    persona_vector[position] = 1
                # Unknown persona_* keys still appear in the text column.
                persona_texts.append(f"{key}: {value}")

            processed_data.append({
                'tweet': tweet,
                'stimulus': stimulus,
                'persona_text': '\n'.join(persona_texts),
                'persona_vector': str(persona_vector),  # Store as string representation
            })
            vector_sums = [
                running + flag
                for running, flag in zip(vector_sums, persona_vector)
            ]

        # Convert to DataFrame
        df = pd.DataFrame(processed_data)

        print(f'Total rows: {len(df)}')

        # Save to CSV
        df.to_csv(output_file, index=False)
        print(f"Successfully processed {len(processed_data)} records")
        print(f"Output saved to: {output_file}")

        # Display first few rows as sample
        print("\nFirst few rows of the processed data:")
        print(df.head())

        # Display summary of persona field presence
        print("\nPersona field presence summary:")
        total_rows = len(df)
        for field, count in zip(PERSONA_FIELDS, vector_sums):
            # An empty input reports 0% instead of dividing by zero.
            percentage = (count / total_rows) * 100 if total_rows else 0.0
            print(f"{field}: {count} records ({percentage:.2f}%)")

    except FileNotFoundError as e:
        print(f"Error: {str(e)}")
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON format - {str(e)}")
    except KeyError as e:
        print(f"Error: Missing required field - {str(e)}")
    except Exception as e:
        print(f"Error: An unexpected error occurred - {str(e)}")
        raise  # Re-raise the exception to see the full traceback

# Script entry point: convert the generated posts into fine_tune_regress.csv
if __name__ == "__main__":
    # Absolute locations of the source JSON and of the CSV to write. They do
    # not depend on the working directory, which is printed for reference only.
    input_file = "/Users/mogen/Desktop/Research_Case/fine_tune03/generated_posts.json"
    output_file = "/Users/mogen/Desktop/Research_Case/fine_tune03/fine_tune_regress.csv"

    current_dir = os.getcwd()
    print(f"Current working directory: {current_dir}")
    print(f"Processing file: {input_file}")
    print(f"Output will be saved to: {output_file}")

    process_json_to_csv(input_file, output_file)

Current working directory: /Users/mogen/Desktop/Research_Case/notebooks
Processing file: /Users/mogen/Desktop/Research_Case/fine_tune03/generated_posts.json
Output will be saved to: /Users/mogen/Desktop/Research_Case/fine_tune03/fine_tune_regress.csv
Total rows: 8000
Successfully processed 8000 records
Output saved to: /Users/mogen/Desktop/Research_Case/fine_tune03/fine_tune_regress.csv

First few rows of the processed data:
                                               tweet  \
0  Usually, big tech companies are more than will...   
1  Attention bolshevik dirtbags,\n\nSwiss researc...   
2  Another Arraignment Day, yet Sean Hannity says...   
3  Old school hip-hip makes celebrity net worth s...   
4  Day 931!\nJudge Tanya Chutkan issued a protect...   

                                            stimulus  \
0  Comment: Discussion on the willingness of big ...   
1  Statement: Swiss research indicates that the f...   
2  Comment: Discussion on Sean Hannity's public s...   
3  Comment

In [4]:
# Inline JSON -> CSV conversion that leaves `df` in the notebook namespace.
# `input_file` and `output_file` are not assigned in this cell; they must
# already be defined by an earlier cell.

# Check if input file exists
if not os.path.exists(input_file):
    raise FileNotFoundError(f"Input file '{input_file}' not found")

# Create output directory if it doesn't exist
output_dir = os.path.dirname(output_file)
if output_dir and not os.path.exists(output_dir):
    # exist_ok covers the directory appearing between the check and the call
    os.makedirs(output_dir, exist_ok=True)
    print(f"Created output directory: {output_dir}")

# Read the JSON file
with open(input_file, 'r', encoding='utf-8') as f:
    content = f.read()

# json.loads rejects a leading BOM, so drop it before parsing.
content = content.lstrip('\ufeff')
# Add the missing opening brace if needed. Leading whitespace is ignored for
# the check so an already valid document is left alone.
if not content.lstrip().startswith('{'):
    content = '{' + content
data = json.loads(content)

# Process each post
processed_data = []
for post in data['generated_posts']:
    # Collect all persona fields as "key: value" lines, in the post's key order
    persona_fields = [
        f"{key}: {value}"
        for key, value in post.items()
        if key.startswith('persona_')
    ]

    # Create entry with required fields
    entry = {
        'tweet': post['original_text'],
        'stimulus': post['stimulus'],
        'persona': '\n'.join(persona_fields)
    }
    processed_data.append(entry)

# Convert to DataFrame and save as CSV
df = pd.DataFrame(processed_data)
print(f'rows:{len(df)}')
df.to_csv(output_file, index=False)
print(f"Successfully processed {len(processed_data)} records")
print(f"Output saved to: {output_file}")

# Display first few rows as sample
print("\nFirst few rows of the processed data:")
print(df.head())

rows:6758
Successfully processed 6758 records
Output saved to: /Users/mogen/Desktop/Research_Case/fine_tune/fine_tune.csv

First few rows of the processed data:
                                               tweet  \
0  Support my local conservative talk show in the...   
1  It is so sad that twitter had to self implode,...   
2  new message from square\nAlso, anything you pr...   
3  She should have never resigned to calm the com...   
4  Check out my website, I called out BLM for who...   

                                            stimulus  \
0  Statement: Promotion of a local conservative t...   
1  Comment: The decline of Twitter and its impact...   
2  Information: Square's policy on content modera...   
3  Comment: Discussion about the resignation of a...   
4  Comment: Criticism of the Black Lives Matter m...   

                                             persona  
0  persona_narrative_voice: The user's narrative ...  
1  persona_narrative_voice: The user's narrative ...  


In [9]:
# Print the persona text of four rows near the end of `df`.
# NOTE(review): the [-5:-1] window stops before the final row, so the last
# record is never shown - confirm that this is intended.
liste = list(df['persona'].iloc[-5:-1])
for persona_text in liste:
    print(persona_text)
    

persona_opinion_expression: The user shares viewpoints assertively, often with a hint of sarcasm or humor. They seem to engage with topics critically, questioning norms and highlighting issues, such as media transparency and prison call charges. Their opinions are often presented with an expectation of accountability or transparency, as seen in their call for media to investigate governmental actions.
persona_authenticity_markers: The user's communication appears genuine and driven by personal experiences or observations. They display a consistent tone of skepticism and advocacy, particularly when addressing social justice issues. Their posts suggest a willingness to challenge systems and highlight inefficiencies, indicating a commitment to authenticity and truthfulness.
persona_punctuation_style: The user employs a straightforward punctuation style, using commas and periods effectively to convey clear thoughts. They do not rely on excessive punctuation, such as exclamation marks, to m