## Description
This notebook cleans the dataset of models' responses to the reading_comprehension question pool:
- Merges the response datasets generated by different runs of the script
- Pivots the dataset to wide format (one column per model)
- Separates the `<think>` part from the actual response in deepseek-r1's answers
- Splits each response into its `## Reasoning` and `## Solution` sections, then extracts the answer letter from the solution with tailored regexes
- Assigns a 0/1 structure score based on whether the response-structure instructions were followed (0 = followed, 1 = not followed)
- Adds columns from the original datasets for additional information, for proper evaluation
- Assigns a 0/1 score based on whether the models answered correctly or not

In [None]:
import os
import sys
import pandas as pd
import sqlite3
from tqdm import tqdm
import numpy as np
import re
import random
import torch 
import torch.nn.functional as F
import math
# Move the kernel's working directory to the project folder under the user's home
# directory, so the relative paths used by the following cells resolve from there.
project_parts = (
    os.path.expanduser("~"),
    "llm-justification-evaluation",
    "Data_cleaning_cosine_calculation_semantic_and_analysis",
)
desktop_path = os.path.join(*project_parts)
os.chdir(desktop_path)

In [2]:
# Load the two answer files (NOTE(review): presumably produced by separate runs of the
# generation script -- confirm).
read_pippo = pd.read_csv('Models_answers/reading_comprehension_answers.csv')
read_luca = pd.read_csv('Models_answers/reading_comprehension_answers_luca.csv')

# Discard the deepseek-r1:1.5b rows of the first file (NOTE(review): presumably the second
# file supplies them -- confirm) and keep only the most recent row per (question, model).
read_pippo = read_pippo[~read_pippo['model'].isin(['deepseek-r1:1.5b'])]
read_pippo = read_pippo.drop_duplicates(subset=['QuestionID', 'model'], keep='last')

read_ = pd.concat([read_pippo, read_luca]).reset_index(drop=True)

# Remove rows whose response contains the literal error marker.
# `na=False` matters: without it `str.contains` yields NaN for missing responses and the
# `~` negation raises TypeError on the resulting object-dtype mask. Missing responses are
# kept here. `regex=False` makes the literal-substring intent explicit.
failed_generation = read_['response'].str.contains('Error: 1 ', regex=False, na=False)
read_ = read_[~failed_generation]

In [3]:
# Wide table of raw responses: one row per QuestionID, one column per model.
read_response = (
    read_
    .pivot(index='QuestionID', columns='model', values='response')
    .reset_index()
)

import re
def split_think(text):
    """Separate the first ``<think>...</think>`` section from the rest of a response.

    Parameters
    ----------
    text : object
        A model response. Anything that is not a ``str`` is passed through untouched.

    Returns
    -------
    pandas.Series
        Two elements: ``[think_part, response_part]``. When a think section is found,
        both parts are whitespace-stripped and every occurrence of the matched section
        is removed from the response. Otherwise the think part is ``""`` and the second
        element is ``text`` exactly as received (not stripped).
    """
    if not isinstance(text, str):
        return pd.Series(["", text])

    # DOTALL lets the think section span several lines; the lazy quantifier stops at the
    # first closing tag.
    found = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
    if found is None:
        return pd.Series(["", text])

    reasoning = found.group(1).strip()
    remainder = text.replace(found.group(0), "").strip()
    return pd.Series([reasoning, remainder])

# For each deepseek-r1 column, move the <think> section into a sibling "<model>_think"
# column and keep only the remaining response text in the model's own column.
for think_model in ('deepseek-r1:1.5b', 'deepseek-r1:14b'):
    read_response[[f'{think_model}_think', think_model]] = read_response[think_model].apply(split_think)


In [4]:
# Wide table of generation times, shaped like the response table, with every column
# suffixed by "_time" so the names do not collide once both tables sit side by side.
read_time = (
    read_
    .pivot(index='QuestionID', columns='model', values='time_taken_seconds')
    .reset_index()
)
read_time.columns = [f"{col}_time" for col in read_time.columns]

# Place responses and times next to each other (aligned on the row index), then discard
# the duplicated question id and the helper column created by reset_index().
read_analysis = (
    pd.concat([read_response, read_time], axis=1)
    .reset_index()
    .drop(columns=['QuestionID_time', 'index'])
)

In [5]:
# Attach the reference answer, solution, passage and question text of each question.
data = pd.read_csv('Additional_information_datasets/reading_comprehension_pool.csv')
pool_columns = ['QuestionID', 'Answer', 'Solution', 'PassageText', 'QuestionText']
data_to_merge = data[pool_columns]
# Left join: every row of read_analysis is kept even without a match in the pool.
read_analysis = pd.merge(read_analysis, data_to_merge, on='QuestionID', how='left')

In [6]:
def split_argument_answer(text):
    """Split a response into its ``## Reasoning`` and ``## Solution`` sections.

    Parameters
    ----------
    text : object
        A model response. Anything that is not a ``str`` counts as a structure failure.

    Returns
    -------
    tuple
        ``(reasoning, solution, structure_score)``.

        * Both markers present, with the first ``## Reasoning`` before the first
          ``## Solution``: the stripped text between the markers, the stripped text after
          the solution marker, and ``0``.
        * Otherwise: the whole stripped text in both positions and ``1``.
        * Non-string input: ``("", "", 1)``.
    """
    if not isinstance(text, str):
        return "", "", 1

    cleaned = text.strip()
    reasoning_tag, solution_tag = "## Reasoning", "## Solution"

    # Positions of the first occurrence of each marker (-1 when absent).
    reasoning_at = cleaned.find(reasoning_tag)
    solution_at = cleaned.find(solution_tag)

    well_formed = -1 not in (reasoning_at, solution_at) and reasoning_at < solution_at
    if not well_formed:
        # Structure not respected: hand back the full text for both sections.
        return cleaned, cleaned, 1

    reasoning_body = cleaned[reasoning_at + len(reasoning_tag):solution_at]
    solution_body = cleaned[solution_at + len(solution_tag):]
    return reasoning_body.strip(), solution_body.strip(), 0


model_names = ["deepseek-r1:1.5b", "deepseek-r1:14b", "qwen2.5:14b", "qwen2.5:1.5b"]

# For every model, split its response column into three new columns:
# "<model>_evaluation" (reasoning text), "<model>_solution" (solution text) and
# "<model>_structure_score" (0 when both markers were found in order, 1 otherwise).
for model_name in model_names:
    target_cols = [
        f"{model_name}_evaluation",
        f"{model_name}_solution",
        f"{model_name}_structure_score",
    ]
    parsed_rows = [split_argument_answer(response) for response in read_analysis[model_name]]
    read_analysis[target_cols] = pd.DataFrame(parsed_rows, index=read_analysis.index)


In [7]:
# The think columns are no longer needed from here on.
think_columns = ['deepseek-r1:1.5b_think', 'deepseek-r1:14b_think']
read_analysis = read_analysis.drop(columns=think_columns)

# Keep only the questions whose pool entry has an actual reference solution.
has_reference_solution = read_analysis['Solution'].ne('Solution not found.')
read_analysis = read_analysis.loc[has_reference_solution]

In [8]:
import re
import numpy as np

# Compiled once at definition time. Exactly one capturing group takes part in any match.
# Matching is case-insensitive, so a lowercase "b" is accepted and upper-cased by the caller.
_SOLUTION_LETTER_RE = re.compile(
    r"""
    \\boxed\{\s*\\text\{\s*([A-E])\s*\}\s*\}    # LaTeX: \boxed{\text{B}}
    |\\boxed\{\s*([A-E])\s*\}                    # LaTeX: \boxed{B}
    |\\text\{\s*([A-E])\s*\}                     # LaTeX: \text{B}
    |\*\*\s*([A-E])\s*\*\*                       # bold: **B**
    |\(\s*([A-E])\s*\)                           # parenthesised: (B)
    # Heading / quote / bullet marker followed by a letter. The trailing \b requires the
    # letter to stand alone, so "**Answer" or "- Because" are not read as "A" / "B".
    |[#>*\-]+\s*([A-E])\b
    # Keyword followed by a letter. The \b guards stop "answered" from being read as "E".
    |\b(?:answer|solution|final\s+answer|correct\s+answer\s+is)[\s:\n\*]*([A-E])\b
    |^([A-E])$                                   # the whole text is a single letter
    """,
    re.IGNORECASE | re.VERBOSE,
)


def clean_solution(text):
    """Extract the multiple-choice letter (A-E) from a model's solution text.

    The text is stripped and scanned left to right; the first span matching one of the
    supported notations wins (LaTeX ``\\boxed{}`` / ``\\text{}``, ``**B**``, ``(B)``, a
    letter after a ``#``/``>``/``*``/``-`` marker, a letter after an answer keyword, or a
    text consisting of a single letter).

    In the marker and keyword notations the letter must be a standalone token. Without
    that guard the first letter of ordinary words was captured, e.g. ``**Answer:** B``
    produced ``"A"`` and ``**Explanation**`` produced ``"E"``.

    Parameters
    ----------
    text : object
        Solution text; non-string values (e.g. NaN) are treated as missing.

    Returns
    -------
    str or float
        The upper-cased letter, or ``numpy.nan`` when ``text`` is not a string or no
        letter can be identified.
    """
    if not isinstance(text, str):
        return np.nan

    match = _SOLUTION_LETTER_RE.search(text.strip())
    if match is None:
        return np.nan

    # Only the group belonging to the alternative that matched is not None.
    letter = next((group for group in match.groups() if group), None)
    return letter.upper() if letter else np.nan


model_names = ["deepseek-r1:1.5b", "deepseek-r1:14b", "qwen2.5:14b", "qwen2.5:1.5b"]

# Reduce each model's solution text to the value returned by clean_solution, in place.
for model_name in model_names:
    sol_col = f"{model_name}_solution"
    read_analysis[sol_col] = read_analysis[sol_col].map(clean_solution)

# Rename the "<model>_evaluation" columns to "<model>_reasoning"; other names are unchanged.
read_analysis.columns = [
    col.replace('_evaluation', '_reasoning') for col in read_analysis.columns
]

In [9]:
model_names = ["deepseek-r1:1.5b", "deepseek-r1:14b", "qwen2.5:1.5b", "qwen2.5:14b"]
ans_col = 'Answer'

# "<model>_correct" is 1 where the extracted letter equals the reference answer and 0
# otherwise (a missing extracted letter never equals the answer, so it scores 0).
for model_name in model_names:
    sol_col = f"{model_name}_solution"
    correct_col = f"{model_name}_correct"
    is_match = read_analysis[sol_col].eq(read_analysis[ans_col])
    read_analysis[correct_col] = np.where(is_match, 1, 0)

In [10]:

# Write the cleaned and scored table to disk, without the row index.
# The output folder is created first because to_csv raises OSError when the parent
# directory does not exist.
output_path = 'NLP_analysis/reading_comprehension_analysis.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
read_analysis.to_csv(output_path, index=False)