## Description
This notebook cleans the dataset of the models' responses to the critical reasoning question pool:
- Merges the response datasets generated by different runs of the script
- Pivots the dataset into wide format (one row per question, one column per model)
- Divides the <think> part from the actual response in deepseek-r1's answers
- Splits each response into its `# Argument Construction` (reasoning) and `# Answer` (solution) sections, then extracts the answer letter from the solution with a tailored regex
- Assigns a structure score (0 = the response followed the required structure, 1 = it did not)
- Adds columns from the original dataset (correct answer, reference solution, question text) needed for proper evaluation
- Assigns a correctness score (1 = the model answered correctly, 0 = it did not)

In [2]:
import os
import sys
import pandas as pd
import sqlite3
from tqdm import tqdm
import numpy as np
import re
import random
import torch 
import torch.nn.functional as F
import math
# Every relative path used below is resolved against this project folder,
# which is expected to live on the current user's Desktop.
desktop_path = os.path.join(
    os.path.expanduser("~"),
    "Desktop",
    "Data_cleaning_cosine_calculation_semantic_and_analysis",
)
os.chdir(desktop_path)

In [3]:
# The model answers are stored in two CSV files; load each into its own frame.
crit_reas_pippo = pd.read_csv('Models_answers/critical_reasoning_answers.csv')
crit_reas_luca = pd.read_csv('Models_answers/critical_reasoning_answers_luca.csv')

In [4]:
# Discard the rows of the two 1.5b models from the first answers file.
# NOTE(review): presumably these models are supplied by the second file instead - confirm.
rows_to_keep = ~crit_reas_pippo['model'].isin(['deepseek-r1:1.5b', 'qwen2.5:1.5b'])
crit_reas_pippo = crit_reas_pippo[rows_to_keep]


In [5]:
# Keep only the last occurrence of every (question, model) pair in the first file,
# then stack both files into a single long table with a fresh index.
# NOTE(review): only the first file is deduplicated; a duplicated pair in the second
# file, or one shared by both files, would make the later pivot fail - confirm.
crit_reas_pippo = crit_reas_pippo.drop_duplicates(
    subset=['QuestionID', 'model'],
    keep='last',
)
crit_reas = pd.concat(
    [crit_reas_pippo, crit_reas_luca],
    ignore_index=True,
)

In [6]:
# Wide table of responses: one row per question, one column per model.
crit_reas_response = (
    crit_reas
    .pivot(index='QuestionID', columns='model', values='response')
    .reset_index()
)

import re
def split_think(text):
    """Separate the first ``<think>...</think>`` block from the rest of a response.

    Parameters
    ----------
    text : object
        Raw model response. Anything that is not a ``str`` is passed through untouched.

    Returns
    -------
    pd.Series
        Two elements: the stripped content of the first think block and the
        stripped response with every copy of that block removed. When ``text``
        is not a string, or has no complete think block, the result is
        ``["", text]`` with ``text`` unchanged.
    """
    found = None
    if isinstance(text, str):
        # DOTALL lets the think block span several lines; the match is non-greedy.
        found = re.search(r"<think>(.*?)</think>", text, re.DOTALL)

    if found is None:
        return pd.Series(["", text])

    reasoning = found.group(1).strip()
    answer_only = text.replace(found.group(0), "").strip()
    return pd.Series([reasoning, answer_only])

# For each deepseek-r1 column, move the <think> content into a companion
# "<model>_think" column and keep only the remaining response in the model column.
for r1_model in ('deepseek-r1:1.5b', 'deepseek-r1:14b'):
    think_and_response = crit_reas_response[r1_model].apply(split_think)
    crit_reas_response[[f'{r1_model}_think', r1_model]] = think_and_response


In [7]:
# Wide table of generation times, shaped like the response table.
crit_reas_time = (
    crit_reas
    .pivot(index='QuestionID', columns='model', values='time_taken_seconds')
    .reset_index()
)
# Suffix every column (QuestionID included) so none clashes with a response column.
time_labels = [f"{label}_time" for label in crit_reas_time.columns]
crit_reas_time.columns = time_labels

In [8]:
# Put responses and timings side by side (rows are aligned on the shared index),
# then drop the duplicated question id and the helper column added by reset_index().
responses_with_times = pd.concat([crit_reas_response, crit_reas_time], axis=1)
df_crit_analysis = (
    responses_with_times
    .reset_index()
    .drop(columns=['QuestionID_time', 'index'])
)

In [9]:
# Original question pool; only the columns listed here are carried into the analysis.
data = pd.read_csv('Additional_information_datasets/critical_reasoning_questions.csv')
question_info_columns = ['QuestionID', 'Answer', 'Solution', 'QuestionText']
data_to_merge = data[question_info_columns]

In [10]:
# Left join: every question that has model responses is kept,
# whether or not it is found in the question pool.
df_crit_analysis = pd.merge(
    df_crit_analysis,
    data_to_merge,
    on='QuestionID',
    how='left',
)

In [11]:
def split_argument_answer(text):
    """Split a response into its argument section and its answer section.

    The response counts as well structured when it contains
    ``# Argument Construction`` followed, later in the text, by ``# Answer``
    (only the first occurrence of each marker is considered).

    Parameters
    ----------
    text : object
        Model response; non-string values are treated as badly structured.

    Returns
    -------
    tuple
        ``(evaluation, solution, structure_score)``.
        Well structured: the stripped text between the two markers, the
        stripped text after the answer marker, and ``0``.
        Badly structured string: the stripped full text twice, and ``1``.
        Non-string input: ``("", "", 1)``.
    """
    if not isinstance(text, str):
        return "", "", 1

    arg_marker = "# Argument Construction"
    ans_marker = "# Answer"

    body = text.strip()
    arg_start = body.find(arg_marker)
    ans_start = body.find(ans_marker)

    both_present = -1 not in (arg_start, ans_start)
    if not (both_present and arg_start < ans_start):
        # Structure not respected: hand back the whole text for both parts.
        return body, body, 1

    evaluation = body[arg_start + len(arg_marker):ans_start].strip()
    solution = body[ans_start + len(ans_marker):].strip()
    return evaluation, solution, 0


model_names = ["deepseek-r1:1.5b", "deepseek-r1:14b", "qwen2.5:1.5b", "qwen2.5:14b"]

# For every model, split its response column into three new columns:
# "<model>_evaluation", "<model>_solution" and "<model>_structure_score".
for model_name in model_names:
    parsed = df_crit_analysis[model_name].apply(split_argument_answer)
    target_cols = [
        f"{model_name}_evaluation",
        f"{model_name}_solution",
        f"{model_name}_structure_score",
    ]
    # Rebuild the tuples as a frame on the same index so rows stay aligned.
    df_crit_analysis[target_cols] = pd.DataFrame(parsed.tolist(), index=df_crit_analysis.index)


In [12]:

# Drop every column whose name contains "_think".
think_columns = [col for col in df_crit_analysis.columns if '_think' in col]
df_crit_analysis = df_crit_analysis.drop(columns=think_columns)

In [13]:
# Exclude questions whose reference solution is the placeholder text.
has_reference_solution = df_crit_analysis['Solution'] != 'Solution not found.'
df_crit_analysis = df_crit_analysis[has_reference_solution]

In [14]:
def clean_solution(text):
    """Extract the chosen option letter (A-E) from a model's answer text.

    The text is scanned left to right for the first of these forms
    (case-insensitively): ``\\boxed{\\text{X}}``, ``\\text{X}``, ``**X**``,
    ``(X)``, ``# X``, ``answer[:] X``, ``final answer[:] X``,
    ``correct answer is[:] X``, or a text consisting of the single letter only.

    For the forms that have no closing delimiter, the letter must not be
    followed by another word character. Without that guard the first letter of
    an ordinary word is taken as the answer: ``"# Answer\\nB"`` gave ``"A"``
    and ``"Answer: Because ..."`` gave ``"B"``.

    Parameters
    ----------
    text : object
        Answer section of a model response.

    Returns
    -------
    str or float
        The upper-cased letter, or ``np.nan`` when ``text`` is not a string or
        no letter can be found.
    """
    if not isinstance(text, str):
        return np.nan

    text = text.strip()
    pattern = re.compile(
        r'(?i)(?:\\boxed\{\\text\{([A-E])\}\}'
        r'|\\text\{([A-E])\}'
        r'|\*\*([A-E])\*\*'
        r'|\(([A-E])\)'
        # (?!\w): the letter must stand alone, not start a word such as "Answer".
        r'|#\s*([A-E])(?!\w)'
        r'|answer\s*[:\s]*([A-E])(?!\w)'
        r'|final\s+answer\s*[:\s]*([A-E])(?!\w)'
        r'|correct\s+answer\s+is\s*[:\s]*([A-E])(?!\w)'
        r'|^([A-E])$'
        r')'
    )

    match = pattern.search(text)
    if match is None:
        return np.nan

    # Exactly one alternative matched, so exactly one capture group is filled.
    letter = next((group for group in match.groups() if group), None)
    return letter.upper() if letter else np.nan


model_names = ["deepseek-r1:1.5b", "deepseek-r1:14b", "qwen2.5:1.5b", "qwen2.5:14b"]

# Replace each model's free-text solution with the extracted option letter
# (or NaN when none is found).
solution_columns = [f"{name}_solution" for name in model_names]
for sol_col in solution_columns:
    df_crit_analysis[sol_col] = df_crit_analysis[sol_col].apply(clean_solution)

In [15]:
# Per model, the number of responses from which an option letter was extracted
# (value_counts() skips NaN, so its total is the count of non-missing entries).
model_solution_sums = {
    model_name: df_crit_analysis[f"{model_name}_solution"].value_counts().sum()
    for model_name in model_names
}



In [16]:
# Rename every "<model>_evaluation" column to "<model>_reasoning".
reasoning_labels = {}
for model_name in model_names:
    reasoning_labels[f"{model_name}_evaluation"] = f"{model_name}_reasoning"
df_crit_analysis = df_crit_analysis.rename(columns=reasoning_labels)

In [17]:
# "<model>_correct" is 1 when the extracted letter equals the reference answer
# and 0 otherwise (a missing letter never compares equal, so it scores 0).
for model_name in model_names:
    is_correct = df_crit_analysis[f"{model_name}_solution"] == df_crit_analysis['Answer']
    df_crit_analysis[f"{model_name}_correct"] = np.where(is_correct, 1, 0)

In [18]:
# Persist the cleaned table (without the row index) for the downstream analysis.
output_csv = 'NLP_analysis/critical_reasoning_analysis.csv'
df_crit_analysis.to_csv(output_csv, index=False)