In [3]:
import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns
import csv
import re

In [5]:
def parse_text_file(file_path):
    """Parse a model-results log into a list of per-statement records.

    Each line is stripped and then classified by its prefix:

    * ``Run <n> of <total> for model <name>`` starts a new run.
    * ``<number>. <text>`` starts a new statement.
    * ``Response: <text>`` / ``Stance: <text>`` describe the current statement.
    * ``Scores: <label>: <value>, ...`` emits one record for the current
      statement.

    Only the first line of a response is captured; continuation lines of a
    multi-line response are ignored by this parser.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        A list of dicts with the keys ``'Run'``, ``'Model'``,
        ``'Statement Number'``, ``'Statement'``, ``'Response'``, ``'Stance'``
        and ``'Scores'`` (a dict mapping label -> float). Fields that were
        not seen for a record are ``None``. Records whose response is exactly
        ``'No response.'`` get the stance ``'skipped'`` and all-zero scores.

    Raises:
        OSError: If the file cannot be opened.
    """
    data = []

    # Every field is pre-initialised so that a 'Scores:' line arriving before
    # the corresponding header lines yields None instead of raising
    # UnboundLocalError.
    current_run = None
    model_name = None
    current_statement = None
    statement_text = None
    response_text = None
    stance = None

    # Regular expressions to match the lines. The run total is matched as any
    # integer so logs with a different number of runs parse as well.
    run_pattern = re.compile(r'Run (\d+) of \d+ for model (.+)')
    statement_pattern = re.compile(r'^(\d+)\. (.+)$')
    response_pattern = re.compile(r'^Response: (.+)$')
    stance_pattern = re.compile(r'^Stance: (.+)$')
    score_pattern = re.compile(r'^Scores: (.+)$')
    # A label may consist of several space-separated words ('no opinion');
    # a value may use scientific notation ('9.5e-05').
    score_item_pattern = re.compile(
        r'([^\W\d]\w*(?: \w+)*)\s*:\s*'
        r'((?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)'
    )

    # Explicit encoding: the CSV writer in this notebook uses UTF-8 too, and
    # the platform default may not be able to decode model output.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()

            run_match = run_pattern.match(line)
            if run_match:
                current_run = run_match.group(1)
                model_name = run_match.group(2)
                # A new run invalidates everything known about the statement.
                current_statement = None
                statement_text = None
                response_text = None
                stance = None
                continue

            statement_match = statement_pattern.match(line)
            if statement_match:
                current_statement = statement_match.group(1)
                statement_text = statement_match.group(2)
                # Do not let the previous statement's answer leak into this one.
                response_text = None
                stance = None
                continue

            response_match = response_pattern.match(line)
            if response_match:
                response_text = response_match.group(1)
                continue

            stance_match = stance_pattern.match(line)
            if stance_match:
                stance = stance_match.group(1)
                continue

            score_match = score_pattern.match(line)
            if not score_match:
                continue

            # Parse the scores
            score_dict = {
                label: float(value)
                for label, value in score_item_pattern.findall(score_match.group(1))
            }

            # A missing answer is recorded as skipped with neutral scores.
            if response_text == 'No response.':
                stance = 'skipped'
                score_dict = {'agree': 0, 'disagree': 0, 'no opinion': 0}

            data.append({
                'Run': current_run,
                'Model': model_name,
                'Statement Number': current_statement,
                'Statement': statement_text,
                'Response': response_text,
                'Stance': stance,
                'Scores': score_dict
            })

    return data

def write_to_csv(data, output_file_path):
    """Write parsed result records to a CSV file, one row per record.

    Args:
        data: Iterable of dicts carrying the keys ``'Run'``, ``'Model'``,
            ``'Statement Number'``, ``'Statement'``, ``'Response'``,
            ``'Stance'`` and ``'Scores'`` (a mapping of label -> score).
        output_file_path: Destination path; the file is created or
            overwritten and encoded as UTF-8.

    The ``'Scores'`` mapping is flattened into the three columns
    ``'Agree Score'``, ``'Disagree Score'`` and ``'No Opinion Score'``;
    a label that is absent from the mapping is written as 0.
    """
    # Columns copied verbatim from each record.
    copied_columns = (
        'Run', 'Model', 'Statement Number', 'Statement', 'Response', 'Stance',
    )
    # CSV column -> key inside the record's 'Scores' mapping.
    score_columns = (
        ('Agree Score', 'agree'),
        ('Disagree Score', 'disagree'),
        ('No Opinion Score', 'no opinion'),
    )
    header = list(copied_columns) + [column for column, _ in score_columns]

    def build_row(record):
        # Flatten one record into the dict shape expected by DictWriter.
        row = {column: record[column] for column in copied_columns}
        scores = record['Scores']
        for column, label in score_columns:
            row[column] = scores.get(label, 0)
        return row

    with open(output_file_path, 'w', newline='', encoding='utf-8') as sink:
        out = csv.DictWriter(sink, fieldnames=header)
        out.writeheader()
        out.writerows(build_row(record) for record in data)

# Example usage: convert the raw results log into a flat CSV.
# Both paths are relative, so they resolve against the kernel's working
# directory; the output file is opened in 'w' mode and therefore overwritten.
input_file_path = 'llama_results_of_10.txt'
output_file_path = 'llama_results_of_10.csv'

# Parse the log into a list of record dicts, then write one CSV row per record.
data = parse_text_file(input_file_path)
write_to_csv(data, output_file_path)

In [None]:
def parse_txt_to_df(file_path):
    """Parse a model-results log into a DataFrame with one row per statement.

    Each line is stripped and then classified:

    * a line starting with ``Run <n>`` is a run header; the whole line is
      stored in the ``Run`` column of the statements that follow it;
    * a line starting with ``<number>.`` opens a new statement; the whole
      line is stored in the ``Statement`` column;
    * ``Response:`` opens the response of the current statement; following
      lines that carry no known prefix are treated as continuation lines and
      joined to it with single spaces;
    * ``Stance:`` sets the stance (``'skipped'`` when no stance line exists);
    * ``Scores:`` ends the response and is otherwise ignored here.

    Blank lines and lines that appear before the first statement are ignored.

    Known limitation: a response continuation line that itself starts with
    ``<number>.`` or ``Run <n>`` is indistinguishable from a header and is
    treated as one.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        A ``pandas.DataFrame`` with the string columns ``Run``,
        ``Statement``, ``Response`` and ``Stance``.

    Raises:
        OSError: If the file cannot be opened.
    """
    data = {'Run': [], 'Statement': [], 'Response': [], 'Stance': []}

    run_header = re.compile(r'Run \d+')
    # Any number of digits, so statement 10 and above are recognised as well.
    statement_header = re.compile(r'\d+\.')

    def store(record):
        # Append a finished statement record to the column lists.
        if record is None:
            return
        data['Run'].append(record['run'])
        data['Statement'].append(record['statement'])
        data['Response'].append(' '.join(record['response']).strip())
        data['Stance'].append(record['stance'])

    current_run = ''
    record = None        # statement currently being assembled, if any
    in_response = False  # True while continuation lines belong to the response

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue

            if run_header.match(line):
                # Flush before switching runs so the last statement of a run
                # is not attributed to the next one.
                store(record)
                record = None
                in_response = False
                current_run = line
            elif statement_header.match(line):
                store(record)
                record = {
                    'run': current_run,
                    'statement': line,
                    'response': [],
                    'stance': 'skipped',  # Default stance if none found
                }
                in_response = False
            elif record is None:
                # Preamble before the first statement: nothing to attach to.
                continue
            elif line.startswith('Response:'):
                record['response'].append(line[len('Response:'):].strip())
                in_response = True
            elif line.startswith('Stance:'):
                record['stance'] = line[len('Stance:'):].strip()
                in_response = False
            elif line.startswith('Scores:'):
                in_response = False
            elif in_response:
                # Continuation of a multi-line response.
                record['response'].append(line)

    # Add the last statement after the loop ends
    store(record)

    return pd.DataFrame(data)

# Parse the text file into a DataFrame with the columns Run, Statement,
# Response and Stance (relative path, resolved against the working directory).
llama_df = parse_txt_to_df('llama_results_of_10.txt')