In [None]:
import pandas as pd
import json

import os

# Show the working directory: the relative 'batches' and 'batch_results' paths
# used by later cells are resolved against it.
print(os.getcwd())



In [None]:
# Colab only: uncomment the two lines below to mount Google Drive.
# from google.colab import drive
# drive.mount('/content/drive')


In [None]:
# Absolute Google Drive location of a batch results file (Colab mount layout).
# NOTE(review): no later cell shown in this notebook reads `path`; the loaders
# below use the relative 'batches' and 'batch_results' directories instead.
path = '/content/drive/MyDrive/topics-final-project/batch_requests/batch_results.jsonl'

# Form Combined Prompt + Response Dataset

In [None]:
# Load every .jsonl request file in the 'batches' directory into one DataFrame.
# os.listdir returns names in arbitrary order, so sort them: the row order of
# `prompts` (and of everything merged from it) is then reproducible across runs.
batch_files = sorted(f for f in os.listdir('batches') if f.endswith('.jsonl'))
batch_dataframes = [pd.read_json(os.path.join('batches', f), lines=True) for f in batch_files]
prompts = pd.concat(batch_dataframes, ignore_index=True)

# Key each request by its custom_id (the index the responses are later merged
# on) and drop the request metadata columns.
prompts = prompts.set_index('custom_id')
prompts = prompts.drop(columns=['method', 'url'])

# Define a function to extract the question from the user's message content
def extract_question(body):
    """Return the question text embedded in a batch request body.

    The prompt is read from ``body['messages'][1]['content']``. When it
    contains both a "Question:" and an "Answer:" marker, the text following
    the first "Question:" is taken, cut at the next "Question:" or "Answer:"
    marker (whichever comes first), and returned stripped. Otherwise the whole
    stripped prompt is returned. If the body does not have the expected
    structure, the error is printed and ``None`` is returned.
    """
    try:
        prompt_text = body['messages'][1]['content']

        has_markers = "Question:" in prompt_text and "Answer:" in prompt_text
        if not has_markers:
            # Unexpected layout: fall back to the full prompt.
            return prompt_text.strip()

        after_marker = prompt_text.split("Question:")[1]
        return after_marker.split("Answer:")[0].strip()
    except (KeyError, IndexError, TypeError) as e:
        print(f"Error extracting question: {e}")
        return None

# Derive the question text for every request, then discard the raw body.
prompts = (
    prompts
    .assign(question=prompts['body'].apply(extract_question))
    .drop(columns=['body'])
)
prompts.head()



In [None]:
# Load every .jsonl file in the batch_results directory into one DataFrame.

batch_results_path = 'batch_results'
# os.listdir returns names in arbitrary order; sorting keeps the concatenated
# row order reproducible across runs and machines.
batch_results_files = sorted(f for f in os.listdir(batch_results_path) if f.endswith('.jsonl'))

# Read each file (one JSON object per line) into its own DataFrame
batch_results_dataframes = [pd.read_json(os.path.join(batch_results_path, f), lines=True) for f in batch_results_files]

# Stack the per-file frames into a single frame
responses = pd.concat(batch_results_dataframes, ignore_index=True)

# Index by custom_id (the key later used to merge with the prompts) and drop
# the 'id' and 'error' columns.
# NOTE(review): 'error' is dropped without being inspected; presumably failed
# requests then surface only as a missing answer. Confirm that is intended.
responses = responses.set_index('custom_id')
responses = responses.drop(columns=['id', 'error'])

# Transform the response column to get the answer
def extract_answer(response):
    """Return the first choice's message content from a batch response.

    Looks up ``response['body']['choices'][0]['message']['content']`` and
    returns ``None`` when any level of that path is missing or has the wrong
    type.
    """
    try:
        first_choice = response['body']['choices'][0]
        return first_choice['message']['content']
    except (KeyError, TypeError, IndexError):
        return None

# Replace the raw response payload with just the extracted answer text.
responses = (
    responses
    .assign(answer=responses['response'].apply(extract_answer))
    .drop(columns=['response'])
)

responses


In [None]:
# Inner-join prompts and responses on their shared custom_id index
combined_df = prompts.merge(responses, how='inner', left_index=True, right_index=True)

# Spot-check the row at position 10: its question, then its answer
print(combined_df['question'].iloc[10])
print(combined_df['answer'].iloc[10])

combined_df



# Process Data into Fragments

In [None]:
import re

def extract_answer_fragments(answer_text):
    """Split a step-by-step answer into ``(heading, detail)`` fragments.

    The answer is cut at every "Step N:" / "Conclusion:" marker. For a normal
    step, the text up to the first semicolon is appended to the marker to form
    the heading and the remainder becomes the detail; a step without a
    semicolon gets an empty detail. A conclusion is returned as
    ``("Conclusion:", text)``.

    Text before the first marker is ignored and markers with no text after
    them are skipped, so each marker is always paired with its own content.

    Args:
        answer_text: The answer to split. Anything that is not a ``str``
            yields an empty list.

    Returns:
        A list of ``(heading, detail)`` string tuples in order of appearance.
    """
    if not isinstance(answer_text, str):
        return []

    # With a capturing group, re.split returns
    # [preamble, marker_1, content_1, marker_2, content_2, ...].
    # Pair markers with contents by position instead of filtering out empty
    # pieces first: filtering shifts the pairing whenever the answer starts
    # with a preamble or a marker has no content.
    pieces = re.split(r'(Step \d+:|Conclusion:)', answer_text)
    markers = pieces[1::2]
    contents = pieces[2::2]

    fragments = []
    for step_title, raw_content in zip(markers, contents):
        step_content = raw_content.strip()
        if not step_content:
            # Nothing follows this marker; there is no fragment to record.
            continue

        # The conclusion is kept whole rather than split on a semicolon.
        if step_title == "Conclusion:":
            fragments.append(("Conclusion:", step_content))
            continue

        # Only the first semicolon separates the heading from the detail.
        high_level, separator, low_level = step_content.partition(';')
        if separator:
            fragments.append((f"{step_title} {high_level.strip()}", low_level.strip()))
        else:
            fragments.append((f"{step_title} {step_content}", ""))

    return fragments

# Break every answer into its reasoning fragments and keep the full breakdown,
# including the lower-level reasoning (LLR), as a new column.
combined_df = combined_df.assign(
    LLR=combined_df['answer'].apply(extract_answer_fragments)
)


In [None]:
# Display the combined frame, which now carries the LLR column.
combined_df

In [None]:
# Validate data: inspect the fragments parsed for the first row.
# Select the column by name rather than by position so the check keeps
# pointing at LLR even if the merged frame carries extra columns.
combined_df['LLR'].iloc[0]

In [None]:
# Create High-Level Reasoning (HLR) chunks

# Create a function to extract the first element from each tuple in the LLR column
def extract_step_titles(llr_tuples):
    """Return the first element of every tuple in an LLR fragment list.

    Anything that is not a ``list`` yields an empty list.
    """
    if isinstance(llr_tuples, list):
        return [fragment[0] for fragment in llr_tuples]
    return []

# Apply the function to create the HLR column
combined_df['HLR'] = combined_df['LLR'].apply(extract_step_titles)

# Inspect the first row's headings. Select the column by name so the check
# does not depend on HLR's position among the columns.
combined_df['HLR'].iloc[0]




In [None]:
# Persist the combined frame to CSV in the working directory.
# NOTE(review): LLR and HLR hold Python lists, which a CSV can only store as
# text; presumably they must be parsed back (e.g. ast.literal_eval) on reload.
combined_df.to_csv('combined_df.csv')