## transform 

In [13]:
import json
import pandas as pd
import os
from datetime import datetime
import ast

# Define file paths
eval_results_path = "/Users/christophhau/Desktop/Research_case/results/fine_tune03/eval_results.json"
llm_eval_results_path = "/Users/christophhau/Desktop/Research_case/results/fine_tune03/judge/evaluation_results_20250325_121256.json"
generated_posts_path = "/Users/christophhau/Desktop/Research_case/results/fine_tune03/8k_gpt4o_rnd_shuffled_personas.csv"
output_path = "/Users/christophhau/Desktop/Research_case/results/fine_tune03/combined_results.csv"

# Load the evaluation results
with open(eval_results_path, 'r') as f:
    eval_data = json.load(f)

# Load the LLM evaluation results
with open(llm_eval_results_path, 'r') as f:
    llm_eval_data = json.load(f)

# Load the generated posts data
# Note: This is described as CSV but has JSON content in the example
# We'll handle both possibilities
try:
    # Try loading as CSV first
    generated_posts_df = pd.read_csv(generated_posts_path)
    is_csv = True
    print(f"Successfully loaded {len(generated_posts_df)} records from CSV file")
except Exception as e:
    print(f"Could not load as CSV, trying JSON format: {str(e)}")
    try:
        # If that fails, try loading as JSON
        with open(generated_posts_path, 'r') as f:
            generated_posts_data = json.load(f)
        is_csv = False
        print(f"Successfully loaded JSON data with {len(generated_posts_data.get('generated_posts', []))} posts")
    except Exception as e:
        raise Exception(f"Failed to load generated posts data in either CSV or JSON format: {str(e)}")

# Process the ROUGE and similarity scores data
rouge_data = {}
for item in eval_data['individual_evaluations']:
    gen_id = item['metadata']['generated_id']
    
    # Extract ROUGE scores
    rouge_scores = item['rouge_scores']
    similarity = item['similarity_scores']
    
    rouge_data[gen_id] = {
        'similarity_scores': similarity,
        'rouge1_fmeasure': rouge_scores['rouge1']['fmeasure'],
        'rouge1_recall': rouge_scores['rouge1']['recall'],
        'rouge1_precision': rouge_scores['rouge1']['precision'],
        'rouge2_fmeasure': rouge_scores['rouge2']['fmeasure'],
        'rouge2_recall': rouge_scores['rouge2']['recall'],
        'rouge2_precision': rouge_scores['rouge2']['precision'],
        'rougel_fmeasure': rouge_scores['rougeL']['fmeasure'],
        'rougel_recall': rouge_scores['rougeL']['recall'],
        'rougel_precision': rouge_scores['rougeL']['precision']
    }

# Process the LLM evaluation data
llm_eval_dict = {}
for item in llm_eval_data['results']:
    post_id = item['post_id']
    eval_data = item['evaluation']
    
    llm_eval_dict[post_id] = {
        'llm_evaluation_authenticity': eval_data['authenticity']['score'],
        'llm_evaluation_style_consistency': eval_data['style_consistency']['score'],
        'llm_evaluation_matching_intent': eval_data['matching_intent']
    }

# Prepare persona columns for Boolean conversion
persona_columns = [
    'persona_general_decription', 'persona_brevity_style', 'persona_language_formality',
    'persona_narrative_voice', 'persona_vocabulary_range', 'persona_punctuation_style',
    'persona_controversy_handling', 'persona_community_role', 'persona_content_triggers',
    'persona_reaction_patterns', 'persona_message_effectiveness', 'persona_opinion_expression',
    'persona_emotional_expression', 'persona_cognitive_patterns', 'persona_social_orientation',
    'persona_conflict_approach', 'persona_value_signals', 'persona_identity_projection',
    'persona_belief_expression', 'persona_stress_indicators', 'persona_adaptability_signs',
    'persona_authenticity_markers'
]

# Process the generated posts data and create the final DataFrame
if is_csv:
    # If the file was successfully loaded as CSV
    final_df = generated_posts_df.copy()
    
    # Clean up the generated_text field if it's in JSON format
    # Use a safer approach to handle potentially malformed JSON
    def extract_post_text(text):
        if not isinstance(text, str) or not text.startswith('{'):
            return text
            
        # Try to extract post_text using regex instead of direct JSON parsing
        import re
        match = re.search(r'"post_text"\s*:\s*"([^"]*)', text)
        if match:
            # Return the captured content, handling potential escaping
            return match.group(1).replace('\\"', '"').replace('\\\\', '\\')
        
        # If regex approach fails, return the original text
        return text
        
    final_df['generated_text'] = final_df['generated_text'].apply(extract_post_text)
else:
    # If the file was loaded as JSON
    # Extract the generated posts into a DataFrame
    posts_list = generated_posts_data['generated_posts']
    final_df = pd.DataFrame(posts_list)
    
    # Clean up the generated_text field if it's in JSON format
    # Use same safe approach as above
    def extract_post_text(text):
        if not isinstance(text, str) or not text.startswith('{'):
            return text
            
        # Try to extract post_text using regex instead of direct JSON parsing
        import re
        match = re.search(r'"post_text"\s*:\s*"([^"]*)', text)
        if match:
            # Return the captured content, handling potential escaping
            return match.group(1).replace('\\"', '"').replace('\\\\', '\\')
        
        # If regex approach fails, return the original text
        return text
        
    final_df['generated_text'] = final_df['generated_text'].apply(extract_post_text)

# Add generation_index column if it doesn't exist
if 'generation_index' not in final_df.columns:
    final_df['generation_index'] = final_df['generation_id'].apply(
        lambda x: int(x.split('_gen_')[1]) if '_gen_' in str(x) else 0
    )

# Add ROUGE and similarity scores to the DataFrame
rouge_match_count = 0
for gen_id, scores in rouge_data.items():
    mask = final_df['generation_id'] == gen_id
    if any(mask):
        rouge_match_count += 1
        for key, value in scores.items():
            final_df.loc[mask, key] = value
print(f"Matched ROUGE scores for {rouge_match_count} of {len(rouge_data)} records")

# Add LLM evaluation scores to the DataFrame
llm_match_count = 0
for post_id, eval_scores in llm_eval_dict.items():
    mask = final_df['generation_id'] == post_id
    if any(mask):
        llm_match_count += 1
        for key, value in eval_scores.items():
            final_df.loc[mask, key] = value
print(f"Matched LLM evaluations for {llm_match_count} of {len(llm_eval_dict)} records")

# Convert persona columns to boolean if they contain text descriptions
for col in persona_columns:
    if col in final_df.columns:
        # Check if this column contains strings (descriptions) instead of booleans
        if final_df[col].dtype == 'object' and isinstance(final_df[col].iloc[0], str):
            # If it contains descriptions, set to True (present) for non-empty strings
            final_df[col] = final_df[col].notna() & (final_df[col] != '')

# Ensure all expected columns are present, even if empty
expected_columns = [
    'user_id', 'generation_id', 'original_post_id', 'original_text', 'stimulus',
    'generation_index', 'generated_text', 'similarity_scores',
    'rouge1_fmeasure', 'rouge1_recall', 'rouge1_precision',
    'rouge2_fmeasure', 'rouge2_recall', 'rouge2_precision',
    'rougel_fmeasure', 'rougel_recall', 'rougel_precision',
    'llm_evaluation_authenticity', 'llm_evaluation_style_consistency',
    'llm_evaluation_matching_intent'
] + persona_columns

for col in expected_columns:
    if col not in final_df.columns:
        final_df[col] = None

# Reorder columns to match the reference file
final_df = final_df[expected_columns]

# Print a summary of the data before saving
print("\nData Summary:")
print(f"Total records: {len(final_df)}")
print("Column completion rates:")
for col in expected_columns:
    non_null_count = final_df[col].notna().sum()
    print(f"  {col}: {non_null_count}/{len(final_df)} records ({non_null_count/len(final_df)*100:.1f}%)")

# Validate that all required fields have good coverage
critical_columns = ['user_id', 'generation_id', 'original_post_id', 'generated_text', 
                   'similarity_scores', 'rouge1_fmeasure', 'llm_evaluation_authenticity']
missing_critical = False
for col in critical_columns:
    if final_df[col].isna().sum() > len(final_df) * 0.1:  # More than 10% missing
        print(f"WARNING: Critical column '{col}' is missing in more than 10% of records!")
        missing_critical = True

# Save the final DataFrame to CSV
final_df.to_csv(output_path, index=False)

print(f"\nSuccessfully created combined CSV file at: {output_path}")
print(f"Total records processed: {len(final_df)}")

if missing_critical:
    print("\nWARNING: Some critical columns have significant missing data. Please review the output!")

Successfully loaded 8000 records from CSV file
Matched ROUGE scores for 8000 of 8000 records
Matched LLM evaluations for 8000 of 8000 records

Data Summary:
Total records: 8000
Column completion rates:
  user_id: 8000/8000 records (100.0%)
  generation_id: 8000/8000 records (100.0%)
  original_post_id: 8000/8000 records (100.0%)
  original_text: 8000/8000 records (100.0%)
  stimulus: 8000/8000 records (100.0%)
  generation_index: 8000/8000 records (100.0%)
  generated_text: 8000/8000 records (100.0%)
  similarity_scores: 8000/8000 records (100.0%)
  rouge1_fmeasure: 8000/8000 records (100.0%)
  rouge1_recall: 8000/8000 records (100.0%)
  rouge1_precision: 8000/8000 records (100.0%)
  rouge2_fmeasure: 8000/8000 records (100.0%)
  rouge2_recall: 8000/8000 records (100.0%)
  rouge2_precision: 8000/8000 records (100.0%)
  rougel_fmeasure: 8000/8000 records (100.0%)
  rougel_recall: 8000/8000 records (100.0%)
  rougel_precision: 8000/8000 records (100.0%)
  llm_evaluation_authenticity: 8000

## Averges per Model

### Gpt4o

In [11]:
import json
import pandas as pd
import numpy as np
from collections import defaultdict

def analyze_json_files(eval_file_path, cleaned_file_path):
    """
    Analyze JSON files and create a table showing metrics by dimensions
    
    Args:
        eval_file_path (str): Path to evaluation results JSON file
        cleaned_file_path (str): Path to cleaned output JSON file
        
    Returns:
        pandas.DataFrame: Table with metrics by dimensions
    """
    # Load the JSON files
    with open(eval_file_path, 'r') as f:
        eval_data = json.load(f)
    
    with open(cleaned_file_path, 'r') as f:
        cleaned_data = json.load(f)
    
    # Extract the individual evaluations and generated posts
    evaluations = eval_data['individual_evaluations']
    posts = cleaned_data['generated_posts']
    
    # Create a mapping from generation_id to evaluation data
    eval_map = {}
    for eval_item in evaluations:
        gen_id = eval_item['metadata']['generated_id']
        eval_map[gen_id] = eval_item
    
    # Identify dimensions (persona attributes) - collecting from all posts to ensure we get all dimensions
    dimensions = set()
    for post in posts:
        for key in post.keys():
            if key.startswith('persona_'):
                dimensions.add(key)
    dimensions = list(dimensions)
    
    # Create a DataFrame with combined data
    combined_data = []
    
    for post in posts:
        gen_id = post['generation_id']
        if gen_id in eval_map:
            eval_item = eval_map[gen_id]
            
            # Create a row with all necessary data
            row = {}
            
            # Add dimension flags (1 if the dimension exists and is non-empty, 0 otherwise)
            for dim in dimensions:
                if dim in post and isinstance(post[dim], str):
                    row[dim] = 1 if post[dim].strip() != "" else 0
                else:
                    row[dim] = 0
            
            # Add metrics from evaluation data - with error handling
            
            # Rouge scores
            if 'rouge_scores' in eval_item:
                rouge_scores = eval_item['rouge_scores']
                for rouge_type, metrics in rouge_scores.items():
                    if 'fmeasure' in metrics:
                        row[f"{rouge_type}_fmeasure"] = metrics['fmeasure']
            
            # Similarity score
            if 'similarity_scores' in eval_item:
                row['similarity_scores'] = eval_item['similarity_scores']
            
            # LLM evaluation scores
            if 'llm_evaluation' in eval_item:
                llm_eval = eval_item['llm_evaluation']
                for key, value in llm_eval.items():
                    if isinstance(value, dict) and 'score' in value:
                        row[f"llm_{key}_score"] = value['score']
                    elif key == 'matching_intent':
                        row['matching_intent'] = 1 if value else 0
            
            combined_data.append(row)
    
    # Create DataFrame from combined data
    df = pd.DataFrame(combined_data)
    
    # Calculate metrics by dimension
    metrics = [col for col in df.columns if col not in dimensions]
    
    # Check if we have data to analyze
    if df.empty:
        print("No matching data found between evaluation and generated posts.")
        return pd.DataFrame()
    
    results = []
    
    for dim in dimensions:
        dim_result = {'Dimension': dim.replace('persona_', '')}
        
        # Filter rows where dimension is present (1) or absent (0)
        present = df[df[dim] == 1]
        absent = df[df[dim] == 0]
        
        # Skip if all rows are in one category
        if len(present) == 0 or len(absent) == 0:
            print(f"Warning: Dimension {dim} has all values present or all values absent. Skipping.")
            continue
        
        for metric in metrics:
            present_avg = present[metric].mean()
            absent_avg = absent[metric].mean()
            diff = present_avg - absent_avg
            
            dim_result[f"{metric}_present"] = present_avg
            dim_result[f"{metric}_absent"] = absent_avg
            dim_result[f"{metric}_diff"] = diff
        
        results.append(dim_result)
    
    # Create final results DataFrame
    results_df = pd.DataFrame(results)
    
    # Format the table for better readability
    formatted_table = create_formatted_table(results_df, dimensions, metrics)
    
    return formatted_table

def create_formatted_table(results_df, dimensions, metrics):
    """
    Create a formatted table with metrics as columns and dimensions as rows
    
    Args:
        results_df (pandas.DataFrame): Results DataFrame
        dimensions (list): List of dimension names
        metrics (list): List of metric names
        
    Returns:
        pandas.DataFrame: Formatted table
    """
    # Group metrics by type
    metric_groups = defaultdict(list)
    
    for metric in metrics:
        if 'rouge' in metric:
            base_name = metric.split('_')[0]
            metric_groups[base_name].append(metric)
        elif 'llm_' in metric:
            base_name = metric.replace('llm_', '').replace('_score', '')
            metric_groups[base_name].append(metric)
        else:
            metric_groups[metric].append(metric)
    
    # Create the table with clean column names
    table_data = []
    
    for index, row in results_df.iterrows():
        table_row = {'Dimension': row['Dimension']}
        
        for group_name, group_metrics in metric_groups.items():
            for metric in group_metrics:
                if f"{metric}_present" in row:
                    present_val = row[f"{metric}_present"]
                    absent_val = row[f"{metric}_absent"]
                    diff_val = row[f"{metric}_diff"]
                    
                    metric_display = group_name.replace('_', ' ').title()
                    
                    # Format values based on metric type
                    if metric == 'matching_intent':
                        present_str = f"{present_val:.1%}"
                        absent_str = f"{absent_val:.1%}"
                        diff_str = f"{diff_val:.1%}"
                    elif 'rouge' in metric or 'similarity' in metric:
                        present_str = f"{present_val:.3f}"
                        absent_str = f"{absent_val:.3f}"
                        diff_str = f"{diff_val:.3f}"
                    else:  # llm scores
                        present_str = f"{present_val:.2f}"
                        absent_str = f"{absent_val:.2f}"
                        diff_str = f"{diff_val:.2f}"
                    
                    table_row[f"{metric_display} (Present)"] = present_str
                    table_row[f"{metric_display} (Absent)"] = absent_str
                    table_row[f"{metric_display} (Diff)"] = diff_str
        
        table_data.append(table_row)
    
    return pd.DataFrame(table_data)

def main():
    """Main function to run the analysis"""
    eval_file = "/Users/christophhau/Desktop/Research_case/results/fine_tune03/merged_evaluations.json"
    cleaned_file = "/Users/christophhau/Desktop/Research_case/results/fine_tune03/generated_posts.json"
    
    result_table = analyze_json_files(eval_file, cleaned_file)
    
    # Save the result to CSV
    output_path = "/Users/christophhau/Desktop/Research_case/results/dimension_metrics_table_GPT4o.csv"
    result_table.to_csv(output_path, index=False)
    
    print(f"Analysis complete. Results saved to {output_path}")
    print("\nTable Preview:")
    print(result_table.head())
    
    # Also save a more compact version with just the differences
    diff_cols = ['Dimension'] + [col for col in result_table.columns if '(Diff)' in col]
    diff_table = result_table[diff_cols]
    diff_table.to_csv(output_path.replace('.csv', '_diff_only.csv'), index=False)
    
    return result_table

if __name__ == "__main__":
    main()

Analysis complete. Results saved to /Users/christophhau/Desktop/Research_case/results/dimension_metrics_table_GPT4o.csv

Table Preview:
            Dimension Rouge1 (Present) Rouge1 (Absent) Rouge1 (Diff)  \
0   stress_indicators            0.209           0.211        -0.002   
1   conflict_approach            0.209           0.211        -0.002   
2       brevity_style            0.218           0.209         0.009   
3  language_formality            0.214           0.210         0.004   
4    vocabulary_range            0.208           0.212        -0.003   

  Rouge2 (Present) Rouge2 (Absent) Rouge2 (Diff) Rougel (Present)  \
0            0.055           0.056        -0.002            0.149   
1            0.056           0.056        -0.001            0.150   
2            0.059           0.055         0.004            0.162   
3            0.057           0.056         0.001            0.153   
4            0.056           0.056        -0.000            0.148   

  Rougel (Absent

### LLama 70b 

In [4]:
import json
import pandas as pd
import numpy as np
from collections import defaultdict

def analyze_json_files(eval_file_path, cleaned_file_path):
    """
    Analyze JSON files and create a table showing metrics by dimensions
    
    Args:
        eval_file_path (str): Path to evaluation results JSON file
        cleaned_file_path (str): Path to cleaned output JSON file
        
    Returns:
        pandas.DataFrame: Table with metrics by dimensions
    """
    # Load the JSON files
    with open(eval_file_path, 'r') as f:
        eval_data = json.load(f)
    
    with open(cleaned_file_path, 'r') as f:
        cleaned_data = json.load(f)
    
    # Extract the individual evaluations and generated posts
    evaluations = eval_data['individual_evaluations']
    posts = cleaned_data['generated_posts']
    
    # Create a mapping from generation_id to evaluation data
    eval_map = {}
    for eval_item in evaluations:
        gen_id = eval_item['metadata']['generated_id']
        eval_map[gen_id] = eval_item
    
    # Identify dimensions (persona attributes) - collecting from all posts to ensure we get all dimensions
    dimensions = set()
    for post in posts:
        for key in post.keys():
            if key.startswith('persona_'):
                dimensions.add(key)
    dimensions = list(dimensions)
    
    # Create a DataFrame with combined data
    combined_data = []
    
    for post in posts:
        gen_id = post['generation_id']
        if gen_id in eval_map:
            eval_item = eval_map[gen_id]
            
            # Create a row with all necessary data
            row = {}
            
            # Add dimension flags (1 if the dimension exists and is non-empty, 0 otherwise)
            for dim in dimensions:
                if dim in post and isinstance(post[dim], str):
                    row[dim] = 1 if post[dim].strip() != "" else 0
                else:
                    row[dim] = 0
            
            # Add metrics from evaluation data - with error handling
            
            # Rouge scores
            if 'rouge_scores' in eval_item:
                rouge_scores = eval_item['rouge_scores']
                for rouge_type, metrics in rouge_scores.items():
                    if 'fmeasure' in metrics:
                        row[f"{rouge_type}_fmeasure"] = metrics['fmeasure']
            
            # Similarity score
            if 'similarity_scores' in eval_item:
                row['similarity_scores'] = eval_item['similarity_scores']
            
            # LLM evaluation scores
            if 'llm_evaluation' in eval_item:
                llm_eval = eval_item['llm_evaluation']
                for key, value in llm_eval.items():
                    if isinstance(value, dict) and 'score' in value:
                        row[f"llm_{key}_score"] = value['score']
                    elif key == 'matching_intent':
                        row['matching_intent'] = 1 if value else 0
            
            combined_data.append(row)
    
    # Create DataFrame from combined data
    df = pd.DataFrame(combined_data)
    
    # Calculate metrics by dimension
    metrics = [col for col in df.columns if col not in dimensions]
    
    # Check if we have data to analyze
    if df.empty:
        print("No matching data found between evaluation and generated posts.")
        return pd.DataFrame()
    
    results = []
    
    for dim in dimensions:
        dim_result = {'Dimension': dim.replace('persona_', '')}
        
        # Filter rows where dimension is present (1) or absent (0)
        present = df[df[dim] == 1]
        absent = df[df[dim] == 0]
        
        # Skip if all rows are in one category
        if len(present) == 0 or len(absent) == 0:
            print(f"Warning: Dimension {dim} has all values present or all values absent. Skipping.")
            continue
        
        for metric in metrics:
            present_avg = present[metric].mean()
            absent_avg = absent[metric].mean()
            diff = present_avg - absent_avg
            
            dim_result[f"{metric}_present"] = present_avg
            dim_result[f"{metric}_absent"] = absent_avg
            dim_result[f"{metric}_diff"] = diff
        
        results.append(dim_result)
    
    # Create final results DataFrame
    results_df = pd.DataFrame(results)
    
    # Format the table for better readability
    formatted_table = create_formatted_table(results_df, dimensions, metrics)
    
    return formatted_table

def create_formatted_table(results_df, dimensions, metrics):
    """
    Create a formatted table with metrics as columns and dimensions as rows
    
    Args:
        results_df (pandas.DataFrame): Results DataFrame
        dimensions (list): List of dimension names
        metrics (list): List of metric names
        
    Returns:
        pandas.DataFrame: Formatted table
    """
    # Group metrics by type
    metric_groups = defaultdict(list)
    
    for metric in metrics:
        if 'rouge' in metric:
            base_name = metric.split('_')[0]
            metric_groups[base_name].append(metric)
        elif 'llm_' in metric:
            base_name = metric.replace('llm_', '').replace('_score', '')
            metric_groups[base_name].append(metric)
        else:
            metric_groups[metric].append(metric)
    
    # Create the table with clean column names
    table_data = []
    
    for index, row in results_df.iterrows():
        table_row = {'Dimension': row['Dimension']}
        
        for group_name, group_metrics in metric_groups.items():
            for metric in group_metrics:
                if f"{metric}_present" in row:
                    present_val = row[f"{metric}_present"]
                    absent_val = row[f"{metric}_absent"]
                    diff_val = row[f"{metric}_diff"]
                    
                    metric_display = group_name.replace('_', ' ').title()
                    
                    # Format values based on metric type
                    if metric == 'matching_intent':
                        present_str = f"{present_val:.1%}"
                        absent_str = f"{absent_val:.1%}"
                        diff_str = f"{diff_val:.1%}"
                    elif 'rouge' in metric or 'similarity' in metric:
                        present_str = f"{present_val:.3f}"
                        absent_str = f"{absent_val:.3f}"
                        diff_str = f"{diff_val:.3f}"
                    else:  # llm scores
                        present_str = f"{present_val:.2f}"
                        absent_str = f"{absent_val:.2f}"
                        diff_str = f"{diff_val:.2f}"
                    
                    table_row[f"{metric_display} (Present)"] = present_str
                    table_row[f"{metric_display} (Absent)"] = absent_str
                    table_row[f"{metric_display} (Diff)"] = diff_str
        
        table_data.append(table_row)
    
    return pd.DataFrame(table_data)

def main():
    """Main function to run the analysis"""
    eval_file = "/Users/christophhau/Desktop/Research_case/results/fine_tuned_llama/eval_results.json"
    cleaned_file = "/Users/christophhau/Desktop/Research_case/results/fine_tuned_llama/cleaned_output.json"
    
    result_table = analyze_json_files(eval_file, cleaned_file)
    
    # Save the result to CSV
    output_path = "/Users/christophhau/Desktop/Research_case/results/dimension_metrics_table_Llama70bft.csv"
    result_table.to_csv(output_path, index=False)
    
    print(f"Analysis complete. Results saved to {output_path}")
    print("\nTable Preview:")
    print(result_table.head())
    
    # Also save a more compact version with just the differences
    diff_cols = ['Dimension'] + [col for col in result_table.columns if '(Diff)' in col]
    diff_table = result_table[diff_cols]
    diff_table.to_csv(output_path.replace('.csv', '_diff_only.csv'), index=False)
    
    return result_table

if __name__ == "__main__":
    main()

Analysis complete. Results saved to /Users/christophhau/Desktop/Research_case/results/dimension_metrics_table_Llama70bft.csv

Table Preview:
            Dimension Rouge1 (Present) Rouge1 (Absent) Rouge1 (Diff)  \
0   stress_indicators            0.315           0.314         0.001   
1   conflict_approach            0.298           0.319        -0.021   
2       brevity_style            0.316           0.314         0.002   
3  language_formality            0.315           0.314         0.001   
4    vocabulary_range            0.344           0.307         0.038   

  Rouge2 (Present) Rouge2 (Absent) Rouge2 (Diff) Rougel (Present)  \
0            0.119           0.121        -0.002            0.244   
1            0.110           0.124        -0.014            0.231   
2            0.111           0.124        -0.012            0.239   
3            0.120           0.121        -0.001            0.245   
4            0.147           0.114         0.033            0.267   

  Rougel (A

## To Latex

In [14]:
import pandas as pd
import numpy as np
import re

def convert_csv_to_apa_latex(csv_path, output_path, table_number=1, table_title="Comparison of Metrics Across Different Dimensions", table_note="Note. All metrics are presented with mean values. 'Present' indicates the dimension was included in the text, 'Absent' indicates it was not."):
    """
    Convert a CSV file with dimension metrics to an APA-formatted LaTeX table
    
    Args:
        csv_path (str): Path to the CSV file
        output_path (str): Path where to save the LaTeX file
        table_number (int): Table number for the APA table
        table_title (str): Title for the APA table
        table_note (str): Note to appear at the bottom of the table
    """
    # Read the CSV file
    df = pd.read_csv(csv_path)
    
    # Clean up column names
    clean_cols = []
    for col in df.columns:
        # Extract metric name and type (Present/Absent/Diff)
        if "(" in col and ")" in col:
            metric, measure_type = col.split("(")
            measure_type = measure_type.replace(")", "").strip()
            clean_cols.append((metric.strip(), measure_type))
        else:
            clean_cols.append((col, ""))
    
    # Group metrics
    metrics = {}
    for col_name, measure_type in clean_cols:
        if col_name == "Dimension":
            continue
        if col_name not in metrics:
            metrics[col_name] = []
        metrics[col_name].append(measure_type)
    
    # Create LaTeX header
    latex = []
    latex.append("\\begin{table}")
    latex.append("\\caption{" + table_title + "}")
    latex.append("\\label{table" + str(table_number) + "}")
    latex.append("\\begin{tabular}{l" + "ccc" * len(metrics) + "}")
    latex.append("\\hline")
    
    # Create column headers
    header1 = ["\\multirow{2}{*}{Dimension}"]
    header2 = [""]
    
    for metric in metrics:
        header1.append("\\multicolumn{3}{c}{" + metric + "}")
        header2.extend(["Present", "Absent", "Diff"])
    
    latex.append(" & ".join(header1) + " \\\\")
    latex.append(" & ".join(header2) + " \\\\")
    latex.append("\\hline")
    
    # Create table rows
    for _, row in df.iterrows():
        dimension = row["Dimension"].replace("_", " ").title()
        row_data = [dimension]
        
        for metric in metrics:
            for measure in ["Present", "Absent", "Diff"]:
                col_name = f"{metric} ({measure})"
                if col_name in df.columns:
                    # Get the value and format it
                    value = row[col_name]
                    
                    # Format based on the value type (percentage, float, etc.)
                    if isinstance(value, str):
                        if "%" in value:
                            # It's a percentage
                            value_float = float(value.replace("%", ""))
                            formatted_value = f"{value_float:.1f}\\%"
                        else:
                            # Try to convert to float
                            try:
                                value_float = float(value)
                                if value_float >= 0.01:
                                    formatted_value = f"{value_float:.2f}"
                                else:
                                    formatted_value = f"{value_float:.3f}"
                            except ValueError:
                                formatted_value = value
                    else:
                        # Numeric value
                        if value >= 0.01:
                            formatted_value = f"{value:.2f}"
                        else:
                            formatted_value = f"{value:.3f}"
                            
                    row_data.append(formatted_value)
                else:
                    row_data.append("")
        
        latex.append(" & ".join(row_data) + " \\\\")
    
    # Close the table
    latex.append("\\hline")
    latex.append("\\end{tabular}")
    
    # Add table note
    if table_note:
        latex.append("\\begin{tablenotes}")
        latex.append("\\small")
        latex.append("\\item " + table_note)
        latex.append("\\end{tablenotes}")
    
    latex.append("\\end{table}")
    
    # Save to file
    with open(output_path, "w") as f:
        f.write("\n".join(latex))
    
    print(f"APA-formatted LaTeX table saved to {output_path}")
    
    # Also return the LaTeX code as a string
    return "\n".join(latex)

def convert_csv_to_apa_compact(csv_path, output_path, table_number=1, table_title="Impact of Textual Dimensions on Generation Quality", show_only_diff=True):
    """
    Convert a CSV file with dimension metrics to a more compact APA-formatted LaTeX table
    that only shows differences or a more focused set of metrics
    
    Args:
        csv_path (str): Path to the CSV file
        output_path (str): Path where to save the LaTeX file
        table_number (int): Table number for the APA table
        table_title (str): Title for the APA table
        show_only_diff (bool): If True, only show the difference columns
    """
    # Read the CSV file
    df = pd.read_csv(csv_path)
    
    # Filter to only include Dimension and Diff columns if requested
    if show_only_diff:
        diff_cols = ['Dimension'] + [col for col in df.columns if '(Diff)' in col]
        df = df[diff_cols]
    
    # Clean up column names for the diff-only columns
    if show_only_diff:
        clean_cols = {}
        for col in df.columns:
            if col == 'Dimension':
                clean_cols[col] = col
            else:
                # Extract just the metric name without "(Diff)"
                metric = col.replace(" (Diff)", "")
                clean_cols[col] = metric
        
        df = df.rename(columns=clean_cols)
    
    # Format the values for better readability
    for col in df.columns:
        if col == 'Dimension':
            continue
            
        # Format numeric values
        if df[col].dtype in [np.float64, np.float32, np.int64, np.int32]:
            df[col] = df[col].map(lambda x: f"{x:.2f}" if abs(x) >= 0.01 else f"{x:.3f}")
        elif df[col].dtype == object:  # String values
            # Try to convert strings to floats for formatting
            df[col] = df[col].map(lambda x: 
                f"{float(x.replace('%', '')):.1f}\\%" if isinstance(x, str) and '%' in x
                else (f"{float(x):.2f}" if isinstance(x, str) and abs(float(x)) >= 0.01 
                      else f"{float(x):.3f}" if isinstance(x, str)
                      else x))
    
    # Create LaTeX header
    latex = []
    latex.append("\\begin{table}")
    latex.append("\\caption{" + table_title + "}")
    latex.append("\\label{table" + str(table_number) + "}")
    
    # Calculate column formatting based on number of metrics
    num_metrics = len(df.columns) - 1  # Subtract 1 for the Dimension column
    column_format = "l" + "c" * num_metrics
    
    latex.append("\\begin{tabular}{" + column_format + "}")
    latex.append("\\hline")
    
    # Create column headers - simple version for diff-only table
    header = [col.replace("_", " ").title() for col in df.columns]
    latex.append(" & ".join(header) + " \\\\")
    latex.append("\\hline")
    
    # Create table rows
    for _, row in df.iterrows():
        # Format the dimension name nicely
        dimension = row["Dimension"].replace("_", " ").title()
        row_data = [dimension]
        
        # Add other columns
        for col in df.columns[1:]:  # Skip the Dimension column
            row_data.append(str(row[col]))
        
        latex.append(" & ".join(row_data) + " \\\\")
    
    # Close the table
    latex.append("\\hline")
    latex.append("\\end{tabular}")
    
    # Add a note about what the values represent
    if show_only_diff:
        note = "Note. Values represent the difference in metrics when the dimension is present versus absent. Positive values indicate higher performance when the dimension is present."
        latex.append("\\begin{tablenotes}")
        latex.append("\\small")
        latex.append("\\item " + note)
        latex.append("\\end{tablenotes}")
    
    latex.append("\\end{table}")
    
    # Save to file
    with open(output_path, "w") as f:
        f.write("\n".join(latex))
    
    print(f"Compact APA-formatted LaTeX table saved to {output_path}")
    
    # Also return the LaTeX code as a string
    return "\n".join(latex)

def create_multiple_tables_by_metric_group(csv_path, output_dir, metric_groups=None):
    """
    Create multiple APA tables based on grouped metrics
    
    Args:
        csv_path (str): Path to the CSV file
        output_dir (str): Directory to save the LaTeX files
        metric_groups (dict): Dictionary mapping group names to lists of metric name patterns
    """
    import os
    
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Read the CSV file
    df = pd.read_csv(csv_path)
    
    # Default metric groups if none provided
    if metric_groups is None:
        metric_groups = {
            "rouge": ["Rouge"],
            "similarity": ["Similarity"],
            "llm_scores": ["Authenticity", "Style", "Consistency"],
            "intent": ["Matching", "Intent"]
        }
    
    # Process each metric group
    for group_idx, (group_name, patterns) in enumerate(metric_groups.items(), 1):
        # Filter columns that match the current group patterns
        group_cols = ['Dimension']
        for col in df.columns:
            if col == 'Dimension':
                continue
            
            # Check if any pattern matches this column
            if any(pattern.lower() in col.lower() for pattern in patterns):
                group_cols.append(col)
        
        # Skip if no columns match (other than Dimension)
        if len(group_cols) <= 1:
            print(f"No matching columns for group {group_name}, skipping.")
            continue
        
        # Create a filtered DataFrame with only the relevant columns
        group_df = df[group_cols]
        
        # Save to CSV temporarily
        temp_csv = os.path.join(output_dir, f"temp_{group_name}.csv")
        group_df.to_csv(temp_csv, index=False)
        
        # Create and save the LaTeX table
        output_file = os.path.join(output_dir, f"table_{group_name}.tex")
        title = f"Impact of Dimensions on {group_name.replace('_', ' ').title()} Metrics"
        
        # Call the appropriate function based on the group
        if "intent" in group_name.lower():
            # For binary metrics like matching_intent
            convert_csv_to_apa_compact(temp_csv, output_file, table_number=group_idx, table_title=title)
        else:
            # For continuous metrics
            convert_csv_to_apa_latex(temp_csv, output_file, table_number=group_idx, table_title=title)
        
        # Remove temporary CSV
        os.remove(temp_csv)
    
    print(f"Created {len(metric_groups)} APA-formatted tables in {output_dir}")

def main():
    """Main function to convert CSV to APA tables"""
    input_csv = "/Users/christophhau/Desktop/Research_case/results/dimension_metrics_table_GPT4o.csv"
    diff_only_csv = "/Users/christophhau/Desktop/Research_case/results/dimension_metrics_table_GPT4o_diff_only.csv"
    output_tex = "/Users/christophhau/Desktop/Research_case/results/GPT4o_apa_table.tex"
    output_compact_tex = "/Users/christophhau/Desktop/Research_case/results/GPT4o_apa_table_compact.tex"
    output_dir = "/Users/christophhau/Desktop/Research_case/results/apa_tables"
    
    # Create a full APA table
    convert_csv_to_apa_latex(input_csv, output_tex)
    
    # Create a compact version with only differences
    convert_csv_to_apa_compact(diff_only_csv, output_compact_tex)
    
    # Create multiple tables based on metric groups
    metric_groups = {
        "rouge_scores": ["Rouge1", "Rouge2", "RougeL"],
        "similarity": ["Similarity"],
        "authenticity": ["Authenticity"],
        "style": ["Style"],
        "intent": ["Matching Intent"]
    }
    create_multiple_tables_by_metric_group(input_csv, output_dir, metric_groups)
    
    print("Conversion complete. APA tables created successfully.")

if __name__ == "__main__":
    main()

APA-formatted LaTeX table saved to /Users/christophhau/Desktop/Research_case/results/GPT4o_apa_table.tex
Compact APA-formatted LaTeX table saved to /Users/christophhau/Desktop/Research_case/results/GPT4o_apa_table_compact.tex
APA-formatted LaTeX table saved to /Users/christophhau/Desktop/Research_case/results/apa_tables/table_rouge_scores.tex
APA-formatted LaTeX table saved to /Users/christophhau/Desktop/Research_case/results/apa_tables/table_similarity.tex
APA-formatted LaTeX table saved to /Users/christophhau/Desktop/Research_case/results/apa_tables/table_authenticity.tex
APA-formatted LaTeX table saved to /Users/christophhau/Desktop/Research_case/results/apa_tables/table_style.tex
Compact APA-formatted LaTeX table saved to /Users/christophhau/Desktop/Research_case/results/apa_tables/table_intent.tex
Created 5 APA-formatted tables in /Users/christophhau/Desktop/Research_case/results/apa_tables
Conversion complete. APA tables created successfully.


### Test

In [10]:
# %% [markdown]
# # JSON Merger für Evaluationsdaten
# 
# Dieses Notebook führt zwei JSON-Dateien zusammen:
# 1. Eine Datei mit technischen Metriken (ROUGE-Scores, Similarity Scores)
# 2. Eine Datei mit qualitativen LLM-Evaluationen
#
# Das Ergebnis ist eine zusammengeführte JSON-Datei, die beide Arten von Evaluationen enthält.

# %%
import json
import os
from datetime import datetime
import pandas as pd
from IPython.display import display

# %% [markdown]
# ## Funktionen zum Laden und Zusammenführen der Dateien

# %%
def load_json_file(file_path):
    """
    Lädt eine JSON-Datei und gibt deren Inhalt zurück.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except Exception as e:
        print(f"Fehler beim Laden der Datei {file_path}: {e}")
        return None

# %%
def merge_json_files(technical_file, qualitative_file, output_file=None):
    """
    Führt die technische und qualitative JSON-Dateien zusammen.
    
    Parameters:
    -----------
    technical_file : str
        Pfad zur technischen JSON-Datei (individual_evaluations)
    qualitative_file : str
        Pfad zur qualitativen JSON-Datei (results)
    output_file : str, optional
        Ausgabedatei für die zusammengeführten Daten.
        Wenn None, wird keine Datei gespeichert, sondern nur das Ergebnis zurückgegeben.
        
    Returns:
    --------
    dict
        Die zusammengeführten Daten
    """
    # Dateien laden
    print(f"Lade technische Daten aus: {technical_file}")
    technical_data = load_json_file(technical_file)
    
    print(f"Lade qualitative Daten aus: {qualitative_file}")
    qualitative_data = load_json_file(qualitative_file)
    
    if not technical_data or not qualitative_data:
        print("Fehler beim Laden der Dateien!")
        return None
    
    # Erstellen einer Zuordnung von post_id zu qualitativen Bewertungen
    print("Erstelle Mapping für qualitative Bewertungen...")
    qualitative_by_id = {}
    for result in qualitative_data.get("results", []):
        post_id = result.get("post_id")
        if post_id:
            qualitative_by_id[post_id] = result.get("evaluation", {})
    
    # Ergebnis-Struktur erstellen
    merged_data = {
        "individual_evaluations": []
    }
    
    # Zusammenführen der Daten
    print("Führe Daten zusammen...")
    matched_count = 0
    unmatched_count = 0
    
    for evaluation in technical_data.get("individual_evaluations", []):
        generated_id = evaluation.get("metadata", {}).get("generated_id")
        qualitative_eval = qualitative_by_id.get(generated_id, {})
        
        # Prüfen, ob eine qualitative Bewertung gefunden wurde
        if qualitative_eval:
            matched_count += 1
        else:
            unmatched_count += 1
        
        # Originale Struktur kopieren
        merged_item = evaluation.copy()
        
        # LLM-Evaluation hinzufügen
        merged_item["llm_evaluation"] = qualitative_eval
        
        merged_data["individual_evaluations"].append(merged_item)
    
    print(f"Zusammenführung abgeschlossen:")
    print(f"- Gefundene technische Bewertungen: {len(technical_data.get('individual_evaluations', []))}")
    print(f"- Gefundene qualitative Bewertungen: {len(qualitative_data.get('results', []))}")
    print(f"- Erfolgreiche Matches: {matched_count}")
    print(f"- Einträge ohne Match: {unmatched_count}")
    
    # Speichern der zusammengeführten Daten, falls gewünscht
    if output_file:
        try:
            with open(output_file, 'w', encoding='utf-8') as file:
                json.dump(merged_data, file, indent=2, ensure_ascii=False)
            print(f"Zusammengeführte Daten wurden in {output_file} gespeichert.")
        except Exception as e:
            print(f"Fehler beim Speichern der zusammengeführten Daten: {e}")
    
    return merged_data

# %% [markdown]
# ## Zusammenführen der Dateien
# 
# Gib hier die Pfade zu deinen JSON-Dateien an:

# %%
# Pfade zu den JSON-Dateien
qualitative_file = "/Users/christophhau/Desktop/Research_case/results/fine_tune03/judge/evaluation_results_20250325_121256.json"  # Hier den Pfad anpassen
technical_file   = "/Users/christophhau/Desktop/Research_case/results/fine_tune03/eval_results.json"  # Hier den Pfad anpassen
output_file = "/Users/christophhau/Desktop/Research_case/results/fine_tune03/merged_evaluations.json"  # Name der Ausgabedatei

# %%
# Führe die Dateien zusammen
merged_data = merge_json_files(technical_file, qualitative_file, output_file)

# %% [markdown]
# ## Analyse der zusammengeführten Daten
# 
# Wir können nun einige Statistiken über die zusammengeführten Daten anzeigen:

# %%
if merged_data:
    # Erstelle ein DataFrame mit einigen wichtigen Metriken
    metrics_data = []
    
    for item in merged_data["individual_evaluations"]:
        metrics_row = {
            "generated_id": item.get("metadata", {}).get("generated_id", ""),
            "rouge1_f1": item.get("rouge_scores", {}).get("rouge1", {}).get("fmeasure", 0),
            "rouge2_f1": item.get("rouge_scores", {}).get("rouge2", {}).get("fmeasure", 0),
            "rougeL_f1": item.get("rouge_scores", {}).get("rougeL", {}).get("fmeasure", 0),
            "similarity_score": item.get("similarity_scores", 0),
            "authenticity_score": item.get("llm_evaluation", {}).get("authenticity", {}).get("score", 0),
            "style_consistency_score": item.get("llm_evaluation", {}).get("style_consistency", {}).get("score", 0),
            "matching_intent": item.get("llm_evaluation", {}).get("matching_intent", False)
        }
        metrics_data.append(metrics_row)
    
    metrics_df = pd.DataFrame(metrics_data)
    
    # Zeige die ersten Einträge
    print("Beispieleinträge aus den zusammengeführten Daten:")
    display(metrics_df.head())
    
    # Zeige einige Statistiken
    print("\nStatistiken zu den Metriken:")
    display(metrics_df.describe())
    
    # Zähle, wie oft intent übereinstimmt
    intent_counts = metrics_df['matching_intent'].value_counts()
    print("\nIntent-Übereinstimmung:")
    display(intent_counts)
else:
    print("Keine Daten zum Analysieren verfügbar.")

# %% [markdown]
# ## Beispiel für einen einzelnen Eintrag
# 
# Schauen wir uns einen vollständigen Eintrag aus den zusammengeführten Daten an:

# %%
if merged_data and merged_data["individual_evaluations"]:
    sample_entry = merged_data["individual_evaluations"][0]
    print(json.dumps(sample_entry, indent=2))

Lade technische Daten aus: /Users/christophhau/Desktop/Research_case/results/fine_tune03/eval_results.json
Lade qualitative Daten aus: /Users/christophhau/Desktop/Research_case/results/fine_tune03/judge/evaluation_results_20250325_121256.json
Erstelle Mapping für qualitative Bewertungen...
Führe Daten zusammen...
Zusammenführung abgeschlossen:
- Gefundene technische Bewertungen: 8000
- Gefundene qualitative Bewertungen: 8000
- Erfolgreiche Matches: 8000
- Einträge ohne Match: 0
Zusammengeführte Daten wurden in /Users/christophhau/Desktop/Research_case/results/fine_tune03/merged_evaluations.json gespeichert.
Beispieleinträge aus den zusammengeführten Daten:


Unnamed: 0,generated_id,rouge1_f1,rouge2_f1,rougeL_f1,similarity_score,authenticity_score,style_consistency_score,matching_intent
0,1.016827313022034e+18_gen_0,0.336842,0.107527,0.252632,0.809191,7,6,False
1,1.016827313022034e+18_gen_1,0.293333,0.082192,0.266667,0.826995,6,5,False
2,809529043.0_gen_0,0.12987,0.026667,0.12987,0.75175,8,9,True
3,809529043.0_gen_1,0.1,0.0,0.075,0.73396,7,8,False
4,1530958506.0_gen_0,0.530612,0.291667,0.510204,0.893119,7,6,False



Statistiken zu den Metriken:


Unnamed: 0,rouge1_f1,rouge2_f1,rougeL_f1,similarity_score,authenticity_score,style_consistency_score
count,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0
mean,0.210983,0.056085,0.150832,0.766283,6.54,6.469
std,0.106966,0.07236,0.086273,0.068518,1.387224,1.756182
min,0.0,0.0,0.0,0.458403,0.0,0.0
25%,0.135593,0.0,0.095238,0.724762,6.0,5.0
50%,0.2,0.035088,0.135758,0.771792,7.0,7.0
75%,0.275,0.079208,0.188754,0.814586,8.0,8.0
max,0.76,0.75,0.76,0.954009,9.0,10.0



Intent-Übereinstimmung:


matching_intent
False    4812
True     3188
Name: count, dtype: int64

{
  "rouge_scores": {
    "rouge1": {
      "precision": 0.3076923076923077,
      "recall": 0.37209302325581395,
      "fmeasure": 0.3368421052631579
    },
    "rouge2": {
      "precision": 0.09803921568627451,
      "recall": 0.11904761904761904,
      "fmeasure": 0.1075268817204301
    },
    "rougeL": {
      "precision": 0.23076923076923078,
      "recall": 0.27906976744186046,
      "fmeasure": 0.25263157894736843
    }
  },
  "similarity_scores": 0.8091909885406494,
  "metadata": {
    "original_id": 1.69015609e+18,
    "generated_id": "1.016827313022034e+18_gen_0",
    "timestamp": "2025-01-29T14:49:41.221139+00:00"
  },
  "llm_evaluation": {
    "authenticity": {
      "score": 7,
      "explanation": "The generated post captures the user's skepticism towards government actions and highlights the issue of data privacy, aligning with their libertarian viewpoint. However, it introduces a more conversational and rhetorical style with phrases like 'Uncle Sam' and 'Why am I not s