## Description
This notebook cleans the dataset of the models' responses to the proofs question pool:
- Merges the response datasets generated by different runs of the script
- Pivots the dataset to wide format (one row per `uuid`, one column per model)
- Separates the `<think>` part from the actual response in deepseek-r1's answers
- Splits each response into its `## Reasoning` and `## Solution` sections, extracting them with tailored regexes
- Assigns a score 0-1 based on whether the structure response instructions were followed or not
- Adds columns from the original datasets for additional information, for proper evaluation
- Assigns a score 0-1 based on whether the models answered correctly or not

In [None]:
import os
import sys
import pandas as pd
import sqlite3
from tqdm import tqdm
import numpy as np
import re
import random
import torch 
import torch.nn.functional as F
import math
# Work from the project folder under the user's home directory so that the
# relative file paths used by the following cells resolve against it.
project_parts = ("llm-justification-evaluation", "Data_cleaning_cosine_calculation_semantic_and_analysis")
desktop_path = os.path.join(os.path.expanduser("~"), *project_parts)
os.chdir(desktop_path)

In [2]:
# Load the raw long-format answers and keep a single row per (uuid, model)
# pair; when a pair occurs more than once, the last occurrence in the file wins.
df_proofs = (
    pd.read_csv('Models_answers/proofs_answers.csv')
    .drop_duplicates(subset=['uuid', 'model'], keep='last')
)


In [3]:
# Reshape long -> wide: one row per uuid, one column per model holding that
# model's response; 'uuid' is turned back into a regular column.
df_proofs_response = (
    df_proofs
    .pivot(index='uuid', columns='model', values='response')
    .reset_index()
)

def split_think(text):
    """Split a response into its ``<think>`` content and the remaining text.

    Parameters
    ----------
    text : object
        A single response. Only ``str`` values are inspected; anything else
        (e.g. a missing value) is passed through untouched.

    Returns
    -------
    pd.Series
        Two elements: the stripped content of the first ``<think>...</think>``
        block, and the input with that block removed and stripped. When the
        input is not a string or contains no complete block, the result is
        ``["", text]``.
    """
    found = None
    if isinstance(text, str):
        # DOTALL lets the think block span several lines.
        found = re.search(r"<think>(.*?)</think>", text, re.DOTALL)

    if not found:
        return pd.Series(["", text])

    thoughts = found.group(1).strip()
    # str.replace removes every verbatim copy of the matched block.
    remainder = text.replace(found.group(0), "").strip()
    return pd.Series([thoughts, remainder])

# For each deepseek-r1 variant, store the <think> part in a new
# '<model>_think' column and overwrite the model column with the remaining text.
for reasoning_model in ('deepseek-r1:1.5b', 'deepseek-r1:14b'):
    df_proofs_response[[f'{reasoning_model}_think', reasoning_model]] = (
        df_proofs_response[reasoning_model].apply(split_think)
    )


In [4]:
# Wide table of generation times: one row per uuid, one column per model.
df_proofs_time = df_proofs.pivot(
    index='uuid',
    columns='model',
    values='time_taken_seconds',
).reset_index()
# Every column (including 'uuid') gets a '_time' suffix so the names do not
# clash with the response columns.
df_proofs_time.columns = [f"{col}_time" for col in df_proofs_time.columns]

# Put responses and timings side by side, aligned on the row index (both
# frames are pivots of df_proofs on 'uuid' with a reset index). The duplicated
# uuid column and the helper 'index' column created by reset_index are dropped.
df_proofs_analysis = (
    pd.concat([df_proofs_response, df_proofs_time], axis=1)
    .reset_index()
    .drop(columns=['uuid_time', 'index'])
)

In [5]:
# Columns taken from the original dataset: answer, solution, problem_type and
# problem, keyed by uuid.
data = pd.read_csv('Additional_information_datasets/OpenR1-Math-220k_for_answers.csv')
data_to_merge = data[['uuid', 'answer', 'solution', 'problem_type', 'problem']]

# Left join keeps every row of the answers table; afterwards every column
# whose name contains '_think' is removed.
df_proofs_analysis = (
    df_proofs_analysis
    .merge(data_to_merge, on='uuid', how='left')
    .pipe(lambda merged: merged.drop(columns=[name for name in merged.columns if '_think' in name]))
)

In [6]:
def _parse_structured_response(text):
    """Extract ``(reasoning, solution, structure_score)`` from one response.

    The value is converted with ``str()`` first, so missing values are parsed
    as their string form (e.g. ``"nan"``).

    Reasoning: the text after a ``## Reasoning`` heading up to the first
    ``## Solution`` / ``**Answer:`` / ``\\boxed{A-D}`` marker (or the end of the
    text); without that heading, everything before the first such marker.

    Solution, in order of precedence: the letter A-D following ``## Solution``
    at the end of a line, the letter following ``**Answer:**``, the letter
    inside ``\\boxed{...}``, and finally whatever text follows one of the
    solution markers.

    Score: starts at 1 and becomes 0 only when the response has a
    ``## Reasoning`` heading and a ``## Solution <letter>`` line. If either
    extracted part is empty, both parts fall back to the full text and the
    score is 1.
    """
    text = str(text)
    reasoning = ""
    solution = ""
    # NOTE(review): fully structured responses end up with 0 and everything
    # else with 1 - confirm this polarity is what downstream analysis expects.
    score = 1

    # Case-sensitive literal check, unlike the case-insensitive regex below.
    has_reasoning_heading = "## Reasoning" in text
    boxed_answer_match = re.search(r"\\boxed\{([A-D])\}", text)

    if has_reasoning_heading:
        reasoning_match = re.search(
            r"##\s*Reasoning\s*(.*?)(?=##\s*Solution|\*\*Answer:|\\boxed\{[A-D]\}|$)",
            text,
            re.DOTALL | re.IGNORECASE
        )
        if reasoning_match:
            reasoning = reasoning_match.group(1).strip()
    else:
        # No heading: treat everything before the first answer marker as reasoning.
        fallback_reasoning_match = re.search(
            r"^(.*?)(?=##\s*Solution|\*\*Answer:|\\boxed\{[A-D]\})",
            text,
            re.DOTALL | re.IGNORECASE
        )
        if fallback_reasoning_match:
            reasoning = fallback_reasoning_match.group(1).strip()

    # MULTILINE so that `$` accepts the letter at the end of any line.
    solution_match = re.search(r"##\s*Solution\s*([A-D])\s*$", text, re.MULTILINE)
    if solution_match:
        solution = solution_match.group(1)
        if has_reasoning_heading:
            score = 0
    else:
        answer_match = re.search(r"\*\*Answer:\*\*\s*([A-D])", text, re.IGNORECASE)
        if answer_match:
            solution = answer_match.group(1)
        elif boxed_answer_match:
            solution = boxed_answer_match.group(1)
        else:
            # Last resort: keep the raw text that follows a solution marker.
            raw_match = re.search(r"(##\s*Solution|\*\*Answer:\*\*|\[\s*\\boxed\{[A-D]\}\s*\])\s*(.*)", text, re.DOTALL | re.IGNORECASE)
            if raw_match:
                solution = raw_match.group(2).strip()

    if reasoning.strip() == '' or solution.strip() == '':
        # Nothing usable was extracted: keep the whole response in both fields.
        reasoning = text
        solution = text
        score = 1

    return reasoning, solution, score


def parse_and_score_model_column(df, model_name):
    """Parse one model's responses into reasoning, solution and structure score.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a column named ``model_name`` holding the raw responses.
        It is modified in place.
    model_name : str
        Name of the response column; also the prefix of the new columns.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object with three added (or overwritten) columns:
        ``{model_name}_reasoning``, ``{model_name}_solution`` and
        ``{model_name}_structure_score``.
    """
    # Read from the frame that was passed in, not from a notebook-level global,
    # so the function works on any DataFrame.
    parsed = [_parse_structured_response(text) for text in df[model_name]]

    df[f"{model_name}_reasoning"] = [item[0] for item in parsed]
    df[f"{model_name}_solution"] = [item[1] for item in parsed]
    df[f"{model_name}_structure_score"] = [item[2] for item in parsed]

    return df


# Response columns to parse; each one gains '<model>_reasoning',
# '<model>_solution' and '<model>_structure_score' columns.
models = ["deepseek-r1:1.5b", "deepseek-r1:14b", "qwen2.5:1.5b", "qwen2.5:14b"]

for model_name in models:
    df_proofs_analysis = parse_and_score_model_column(df_proofs_analysis, model_name)


In [7]:
# Write the parsed table to disk without the row index.
df_proofs_analysis.to_csv(
    'NLP_analysis/proofs_analysis.csv',
    index=False,
)