## Description
This notebook cleans the dataset of the models' responses to the critical reasoning question pool:
- Merges the response datasets generated by different runs of the script
- Pivots the dataset horizontally (one row per question, one column per model)
- Separates the `<think>` part from the actual response in deepseek-r1's answers
- Splits each response into its `## Reasoning` and `## Solution` sections, extracting them with tailored regexes
- Assigns a 0/1 score based on whether the response-structure instructions were followed or not
- Adds columns from the original datasets as additional information for proper evaluation
- Assigns a 0/1 score based on whether the models answered correctly or not

In [None]:
import os
import sys
import pandas as pd
import sqlite3
from tqdm import tqdm
import numpy as np
import re
import random
import torch 
import torch.nn.functional as F
import math
# Project data folder under the current user's home directory. All relative
# paths used by the cells below (e.g. 'Models_answers/...') are resolved
# against it because the process working directory is switched here.
# NOTE(review): despite its name, `desktop_path` does not point at the Desktop.
desktop_path = os.path.join(os.path.expanduser("~"), "llm-justification-evaluation", "Data_cleaning_cosine_calculation_semantic_and_analysis")
os.chdir(desktop_path)

In [2]:
# Load the two CSV files of model answers (paths are relative to the working
# directory set above). Later cells read the columns 'uuid', 'model',
# 'response' and 'time_taken_seconds' from the combined frame.
# NOTE(review): presumably these come from separate runs of the generation
# script, as the notebook description states - confirm.
df_luca=pd.read_csv('Models_answers/math_answers_luca.csv')
df_final_to_merge=pd.read_csv('Models_answers/math_answers.csv')

In [3]:
# Stack both answer files into one frame with a fresh 0..n-1 index.
df_final_combined = pd.concat([df_luca, df_final_to_merge]).reset_index(drop=True)

# Discard rows whose response is the Ollama connection-failure message rather
# than a real model answer. The index is not reset after this filter.
ollama_connection_error = 'Error: Failed to connect to Ollama. Please check that Ollama is downloaded, running and accessible. https://ollama.com/download'
df_final_combined = df_final_combined.loc[
    df_final_combined['response'].ne(ollama_connection_error)
]

In [4]:
# The two deepseek-r1 models may appear more than once per question; keep only
# the first row per uuid for each of them. Rows of all other models are left
# untouched (no de-duplication is applied to them here).
is_deepseek_14b = df_final_combined['model'].eq('deepseek-r1:14b')
is_deepseek_1_5b = df_final_combined['model'].eq('deepseek-r1:1.5b')

df_final_merged_1 = df_final_combined.loc[is_deepseek_14b].drop_duplicates(subset=['uuid'], keep='first')
df_final_merged_2 = df_final_combined.loc[is_deepseek_1_5b].drop_duplicates(subset=['uuid'], keep='first')

# Re-assemble: non-deepseek rows first, then the de-duplicated 1.5b rows,
# then the de-duplicated 14b rows, with a fresh 0..n-1 index.
df_final_combined = pd.concat(
    [
        df_final_combined.loc[~(is_deepseek_14b | is_deepseek_1_5b)],
        df_final_merged_2,
        df_final_merged_1,
    ]
).reset_index(drop=True)

In [5]:
df_math_analysis = df_final_combined.copy()

# Keep only the questions (uuids) that have exactly one row for each of four
# different models. Counting rows alone is not enough: a uuid with four rows
# where one model is duplicated and another is missing would pass a plain
# "count == 4" check and then make the later pivot on (uuid, model) fail with
# a duplicate-entries error. Requiring four rows AND four distinct models
# gives the same result as the row count whenever the data is clean.
expected_models_per_uuid = 4
rows_per_uuid = df_math_analysis.groupby('uuid')['model'].agg(['size', 'nunique'])
valid_uuids = rows_per_uuid[
    (rows_per_uuid['size'] == expected_models_per_uuid)
    & (rows_per_uuid['nunique'] == expected_models_per_uuid)
].index
df_math_analysis = df_math_analysis[df_math_analysis['uuid'].isin(valid_uuids)]



In [6]:
# Reshape from long to wide: one row per uuid, one column per model, each cell
# holding that model's response text. Note that DataFrame.pivot raises if any
# (uuid, model) pair occurs more than once in df_math_analysis.
response_df = df_math_analysis.pivot(index='uuid', columns='model', values='response')

def split_think(text):
    """Separate the first ``<think>...</think>`` block from a model response.

    Parameters
    ----------
    text : object
        Raw response. Anything that is not a ``str`` is passed through as-is.

    Returns
    -------
    pd.Series
        Two elements: ``[think_part, response_part]``.
        When a complete ``<think>...</think>`` block is found, ``think_part``
        is its stripped inner text and ``response_part`` is the stripped input
        with that block removed. Otherwise (non-string input, or no complete
        block) ``think_part`` is ``""`` and ``response_part`` is the input
        unchanged.
    """
    untouched = pd.Series(["", text])
    if not isinstance(text, str):
        return untouched

    # DOTALL lets the block span several lines; the lazy group stops at the
    # first closing tag.
    found = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
    if found is None:
        return untouched

    whole_block = found.group(0)
    inner_text = found.group(1).strip()
    # str.replace removes every occurrence of this exact block text.
    remainder = text.replace(whole_block, "").strip()
    return pd.Series([inner_text, remainder])

# For both deepseek-r1 models, move the <think> block into its own
# '<model>_think' column and keep only the remaining text in the model column.
for think_col, answer_col in (
    ('deepseek-r1:1.5b_think', 'deepseek-r1:1.5b'),
    ('deepseek-r1:14b_think', 'deepseek-r1:14b'),
):
    response_df[[think_col, answer_col]] = response_df[answer_col].apply(split_think)

In [7]:
# Wide table of generation times: one row per uuid, one column per model.
time_df = df_math_analysis.pivot(
    index='uuid',
    columns='model',
    values='time_taken_seconds',
)
# Suffix the model columns so they do not clash with the response columns.
time_df.columns = [f"{model_col}_time" for model_col in time_df.columns]

# Put responses and timings side by side (aligned on uuid) and turn the uuid
# index back into a regular column. This overwrites the long-format frame.
wide_frames = [response_df, time_df]
df_math_analysis = pd.concat(wide_frames, axis=1).reset_index()

In [8]:
# Attach the question text and its metadata to every row. A left join keeps
# all rows of df_math_analysis even when a uuid is missing from the pool.
question_columns = ['uuid', 'problem', 'problem_type', 'source']
df_questions = pd.read_csv('Additional_information_datasets/math_questions_pool.csv')[question_columns]
df_math_analysis = pd.merge(df_math_analysis, df_questions, on='uuid', how='left')

In [9]:
# Attach the reference solution and the expected answer to every question.
df1 = pd.read_csv('Additional_information_datasets/OpenR1-Math-220k_for_answers.csv')
df_solutions = df1[['uuid', 'solution', 'answer']]
df_math_analysistrial = df_math_analysis.merge(df_solutions, on='uuid', how='left')

# Keep only rows whose reference solution text occurs exactly once in the
# merged frame. Rows with a missing solution are dropped as well, because
# their mapped count is NaN and NaN == 1 is False.
solution_counts = df_math_analysistrial['solution'].value_counts()
df_math_analysistrial = df_math_analysistrial[
    df_math_analysistrial['solution'].map(solution_counts) == 1
]

# Fixed-seed sample of 500 questions so the selection is reproducible.
# NOTE(review): sampling without replacement is expected to fail if fewer
# than 500 rows remain after the filter above - confirm the pool is large enough.
df_math_analysistrial_sample = (
    df_math_analysistrial
    .sample(n=500, random_state=40)
    .reset_index(drop=True)
)

# Unwrap LaTeX-wrapped option letters in the expected answer: \text{B} -> B.
df_math_analysistrial_sample['answer'] = df_math_analysistrial_sample['answer'].str.replace(r'\\text\s*{\s*([A-D])\s*}', r'\1', regex=True)


In [10]:
def parse_and_score_model_column(df, model_name):
    """Split one model's responses into reasoning / solution and score the structure.

    Reads the column ``df[model_name]`` and adds three columns to ``df``
    (the frame is modified in place and also returned):

    * ``{model_name}_reasoning``       - extracted reasoning text
    * ``{model_name}_solution``        - extracted solution (an option letter
      when one can be found, otherwise free text)
    * ``{model_name}_structure_score`` - ``0`` only when the response contains
      the literal ``"## Reasoning"`` heading AND a ``## Solution`` heading
      followed by a single letter A-D at the end of a line; ``1`` in every
      other case.
      NOTE(review): 0 therefore marks a response that followed the requested
      structure and 1 one that did not - confirm downstream code reads the
      score with that polarity.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a column named ``model_name``; each value is converted
        with ``str()`` before parsing.
    model_name : str
        Name of the response column, also used as prefix of the new columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the three columns added.
    """
    multiline_ci = re.DOTALL | re.IGNORECASE

    def _parse_one(raw_value):
        """Return (reasoning, solution, score) for a single response."""
        text = str(raw_value)
        # Heading detection is an exact, case-sensitive substring test.
        headed = "## Reasoning" in text
        boxed_letter = re.search(r"\\boxed\{([A-D])\}", text)

        # Reasoning: text after the heading when present, otherwise everything
        # from the start of the response, up to the first solution marker.
        if headed:
            found = re.search(
                r"##\s*Reasoning\s*(.*?)(?=##\s*Solution|\*\*Answer:|\\boxed\{[A-D]\}|$)",
                text,
                multiline_ci,
            )
        else:
            found = re.search(
                r"^(.*?)(?=##\s*Solution|\*\*Answer:|\\boxed\{[A-D]\})",
                text,
                multiline_ci,
            )
        reasoning = found.group(1).strip() if found else ""

        # Solution: try the strict format first, then progressively looser ones.
        solution = ""
        score = 1
        strict = re.search(r"##\s*Solution\s*([A-D])\s*$", text, re.MULTILINE)
        if strict:
            solution = strict.group(1)
            # Only a strict solution together with the reasoning heading
            # counts as a fully structured response.
            score = 0 if headed else 1
        else:
            labelled = re.search(r"\*\*Answer:\*\*\s*([A-D])", text, re.IGNORECASE)
            if labelled:
                solution = labelled.group(1)
            elif boxed_letter:
                solution = boxed_letter.group(1)
            else:
                # Last resort: whatever text follows a solution marker.
                loose = re.search(
                    r"(##\s*Solution|\*\*Answer:\*\*|\[\s*\\boxed\{[A-D]\}\s*\])\s*(.*)",
                    text,
                    multiline_ci,
                )
                if loose:
                    solution = loose.group(2).strip()

        # If either part could not be extracted, fall back to the whole
        # response for both and flag the structure as not followed.
        if not reasoning.strip() or not solution.strip():
            return text, text, 1
        return reasoning, solution, score

    parsed_rows = [_parse_one(value) for value in df[model_name]]

    df[f"{model_name}_reasoning"] = [row[0] for row in parsed_rows]
    df[f"{model_name}_solution"] = [row[1] for row in parsed_rows]
    df[f"{model_name}_structure_score"] = [row[2] for row in parsed_rows]

    return df


# Response columns to parse; these names match the 'model' values used as
# column headers after the pivot.
models = ["deepseek-r1:1.5b", "deepseek-r1:14b", "qwen2.5:1.5b", "qwen2.5:14b"]

# Adds '<model>_reasoning', '<model>_solution' and '<model>_structure_score'
# for every model.
for model in models:
    df_math_analysistrial_sample = parse_and_score_model_column(
        df_math_analysistrial_sample,
        model,
    )


In [11]:
def clean_solution(text):
    """Reduce a raw solution / answer string to one option letter ``A``-``D``.

    The patterns below are tried in order, from the most explicit marker to
    the least explicit one, and the first pattern that matches wins:

    1. ``\\boxed{X}`` or ``\\boxed{\\text{X}}``
    2. ``\\text{X}``
    3. a leading label, i.e. the text starts with ``X:``
    4. the whole text is just the letter, optionally written as ``(X)``,
       ``X)`` or ``X.``
    5. an upper-case letter standing alone, i.e. not adjacent to another
       letter, a digit or a backslash (``"The answer is C."``)

    Patterns 1-4 accept lower-case letters as well; pattern 5 is restricted
    to upper case because lone lower-case ``a``-``d`` are ordinary words or
    variable names in running text.

    A bare letter is deliberately NOT matched inside words or LaTeX commands:
    previously the ``b`` of ``\\boxed{C}`` or the ``a`` of ``\\frac{1}{2}``
    was returned as the answer.

    Known limitation: with pattern 5 the first standalone capital wins, so
    free text such as ``"A quick check ..."`` still yields ``"A"``.

    Parameters
    ----------
    text : object
        Raw value; anything that is not a ``str`` yields NaN.

    Returns
    -------
    str or float
        The upper-case letter, or ``float('nan')`` when no letter is found.
    """
    if not isinstance(text, str):
        return float('nan')

    text = text.strip()

    patterns = (
        r'\\boxed\{\s*(?:\\text\{\s*)?([A-Da-d])\s*\}',
        r'\\text\{\s*([A-Da-d])\s*\}',
        r'^([A-Da-d]):',
        r'^\(?([A-Da-d])[).]?$',
        r'(?<![A-Za-z0-9\\])([A-D])(?![A-Za-z0-9])',
    )
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(1).upper()

    return float('nan')

# Normalise every model's extracted solution to a single option letter
# (or NaN when clean_solution finds none) ...
for model_name in models:
    col = f"{model_name}_solution"
    normalised = df_math_analysistrial_sample[col].map(clean_solution)
    df_math_analysistrial_sample[col] = normalised

# ... and apply the same normalisation to the expected answer so both sides
# can be compared directly.
df_math_analysistrial_sample['answer'] = df_math_analysistrial_sample['answer'].map(clean_solution)

In [12]:
# The extracted <think> text of the deepseek-r1 models is not kept in the
# exported table.
think_columns = ['deepseek-r1:1.5b_think', 'deepseek-r1:14b_think']
df_math_analysistrial_sample = df_math_analysistrial_sample.drop(think_columns, axis=1)


In [13]:
# Add a '<model>_correct' column per model: 1 when the normalised solution
# equals the normalised expected answer, 0 otherwise. A NaN on either side
# never compares equal, so it always yields 0.
model_names = ["deepseek-r1:1.5b", "deepseek-r1:14b", "qwen2.5:1.5b", "qwen2.5:14b"]
ans_col = 'answer'
for model_name in model_names:
    sol_col = f"{model_name}_solution"
    correct_col = f"{model_name}_correct"

    is_match = df_math_analysistrial_sample[sol_col].eq(df_math_analysistrial_sample[ans_col])
    df_math_analysistrial_sample[correct_col] = np.where(is_match, 1, 0)

In [14]:
# Write the cleaned, scored sample to CSV (path relative to the working
# directory set in the first code cell); the row index is not written.
df_math_analysistrial_sample.to_csv('NLP_analysis/math_analysis.csv', index=False)