## Description
This notebook cleans the dataset of models' responses to the math question pool:
- Merges the response datasets generated by different runs of the script
- Pivots the dataset to wide format (one column per model)
- Divides the <think> part from the actual response in deepseek-r1's answers
- Divides the different sections of the response in ##Reasoning and ##Solution, extracting them with tailored regexes
- Assigns a score 0-1 based on whether the structure response instructions were followed or not
- Adds columns from the original datasets for additional information, for proper evaluation
- Assigns a score 0-1 based on whether the models answered correctly or not

In [None]:
import os
import sys
import pandas as pd
import sqlite3
from tqdm import tqdm
import numpy as np
import re
import random
import torch 
import torch.nn.functional as F
import math
# Work from the project's data-cleaning folder under the user's home directory,
# so that the relative CSV paths used by the later cells resolve.
desktop_path = os.path.join(
    os.path.expanduser("~"),
    "llm-justification-evaluation",
    "Data_cleaning_cosine_calculation_semantic_and_analysis",
)
os.chdir(desktop_path)

In [17]:
# Load the two batches of model responses; later cells filter them and stack
# them into a single frame.
eval_answers = pd.read_csv('Models_answers/essay_evaluation_answers.csv')
# Second batch, kept in its own file; error rows are removed from it before merging.
essay_luca = pd.read_csv('Models_answers/essay_evaluation_answers_luca.csv')

In [18]:
# Drop the rows whose response is exactly the validation error reported for an
# empty `model` string (see the message text below).
essay_luca = essay_luca[essay_luca['response'] != "Error: 1 validation error for GenerateRequest\nmodel\n  String should have at least 1 character [type=string_too_short, input_value='', input_type=str]\n    For further information visit https://errors.pydantic.dev/2.11/v/string_too_short"]

# Flag every remaining response that still contains an error marker.
# `na=False` treats a missing response as "no error marker"; without it a NaN
# response yields NaN in the mask and the boolean indexing below raises.
has_error = essay_luca['response'].str.contains('Error:', na=False)
error_ids = essay_luca.loc[has_error, 'QuestionID'].unique()

# Remove every row (for all models) of any question that has at least one
# errored response.
essay_luca = essay_luca[~essay_luca['QuestionID'].isin(error_ids)]

In [19]:
# NOTE(review): 'deekseek-r1:1.5b' is spelled differently from the
# 'deepseek-r1:1.5b' column used later in this notebook. It may be a typo (making
# this filter a no-op) or a genuinely mislabelled run present in the data --
# confirm against the CSV before changing it.
keep_rows = eval_answers['model'].ne('deekseek-r1:1.5b')
eval_answers = eval_answers[keep_rows]

# When a (question, model) pair appears more than once, keep only the last row.
eval_answers = eval_answers.drop_duplicates(
    subset=['QuestionID', 'model'],
    keep='last',
)

In [20]:

# Split the rows by model: the two 14b models keep all of their questions, while
# every other model loses QuestionIDs 1-5 before the two parts are stacked again.
is_14b_model = eval_answers['model'].isin(['qwen2.5:14b', 'deepseek-r1:14b'])

eval_answ_temp = eval_answers[~is_14b_model].copy()
first_five_questions = eval_answ_temp['QuestionID'].isin([1, 2, 3, 4, 5])
eval_answ_temp = eval_answ_temp[~first_five_questions]

eval_answers = eval_answers[is_14b_model]
eval_answers = pd.concat([eval_answ_temp, eval_answers], ignore_index=True)

# Append the second batch of responses underneath.
eval_answers = pd.concat([eval_answers, essay_luca], ignore_index=True)

In [21]:
# Wide table of responses: one row per QuestionID, one column per model.
eval_answers_x = eval_answers.pivot(
    index='QuestionID',
    columns='model',
    values='response',
)
eval_answers_x = eval_answers_x.reset_index()
# Clear the 'model' label that pivot leaves on the column index.
eval_answers_x.columns.name = None

In [22]:
# Same wide layout, but holding the generation time of each response.
eval_answers_time = eval_answers.pivot(
    index='QuestionID',
    columns='model',
    values='time_taken_seconds',
)
eval_answers_time = eval_answers_time.reset_index()

# Suffix every column (QuestionID included) so the names cannot collide with
# the response columns when the two tables are put side by side.
eval_answers_time.columns = [
    f"{col}_time" for col in eval_answers_time.columns
]

In [23]:
# Put responses and timings side by side (rows are matched by position) and
# discard the duplicated key column that came from the timing table.
eval_answers = (
    pd.concat([eval_answers_x, eval_answers_time], axis=1)
    .reset_index(drop=True)
    .drop(columns=['QuestionID_time'])
)

In [24]:
def split_think(text):
    """Separate a ``<think>...</think>`` section from the rest of a response.

    Parameters
    ----------
    text : object
        A model response. Anything that is not a ``str`` is passed through
        unchanged.

    Returns
    -------
    pd.Series
        Two elements: the stripped content of the first ``<think>`` block
        (``""`` when there is none or ``text`` is not a string), and the
        response with that block removed and surrounding whitespace stripped
        (the original ``text`` when nothing was removed).
    """
    if not isinstance(text, str):
        return pd.Series(["", text])

    # DOTALL lets the non-greedy group span multiple lines.
    found = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
    if found is None:
        return pd.Series(["", text])

    reasoning = found.group(1).strip()
    # str.replace removes every occurrence of the exact matched block.
    answer = text.replace(found.group(0), "").strip()
    return pd.Series([reasoning, answer])

# For both deepseek-r1 variants, overwrite the response column with the text
# outside the <think> block; the extracted reasoning goes to a helper column.
for r1_model in ('deepseek-r1:1.5b', 'deepseek-r1:14b'):
    eval_answers[[f'{r1_model}_think', r1_model]] = eval_answers[r1_model].apply(split_think)

# The <think> text itself is not kept in the output.
eval_answers = eval_answers.drop(
    columns=['deepseek-r1:1.5b_think', 'deepseek-r1:14b_think']
)

In [25]:
# Question-level reference data; the next cell reads its QuestionID,
# evaluation, prompt and essay columns.
get_eval = pd.read_csv('Additional_information_datasets/ielts_essays_questions.csv')


In [26]:
# Attach the reference evaluation, prompt and essay text to each question.
# A left join keeps every question in eval_answers even without a match.
# NOTE(review): this assumes QuestionID is unique in get_eval; duplicated keys
# there would duplicate rows here -- confirm.
essay_info = get_eval[['QuestionID', 'evaluation', 'prompt', 'essay']]
eval_answers = eval_answers.merge(essay_info, on='QuestionID', how='left')



In [27]:
# Mark every model-response column with a '_reasoning' suffix. Key/reference
# columns and anything already ending in '_time' or '_score' keep their name.
untouched_columns = ['QuestionID', 'evaluation', 'prompt', 'essay']
renamed_columns = []
for col in eval_answers.columns:
    if col in untouched_columns or col.endswith(('_time', '_score')):
        renamed_columns.append(col)
    else:
        renamed_columns.append(col + '_reasoning')
eval_answers.columns = renamed_columns

# The reference evaluation is called 'Solution' from here on.
eval_answers = eval_answers.rename(columns={'evaluation': 'Solution'})

In [28]:
def check_band_score_structure(text):
    """Check a response's section headings and extract its band score.

    Parameters
    ----------
    text : object
        A model response. Non-string values are treated as a structure
        violation with no score.

    Returns
    -------
    tuple
        ``(structure_score, band_score)``. ``structure_score`` is ``0`` when
        all six required headings occur in ``text`` (exact, case-sensitive
        substring check) and ``1`` otherwise. ``band_score`` is the number
        captured by the first pattern (in the order listed below) that matches
        anywhere in ``text``, matched case-insensitively, or ``NaN`` when no
        pattern matches.
    """
    if not isinstance(text, str):
        return 1, float('nan')

    required_sections = (
        "## Task Achievement",
        "## Coherence and Cohesion",
        "## Lexical Resource",
        "## Grammatical Range and Accuracy",
        "## Overall Band Score",
        "## Feedback and Additional Comments",
    )
    missing_sections = [
        heading for heading in required_sections if heading not in text
    ]
    # 0 means the requested structure was followed, 1 means it was not.
    score = 1 if missing_sections else 0

    # Tried in order; the first pattern that matches decides the result.
    patterns = [
        # "Suggested (Overall) Band Score: 6.5", optionally wrapped in '*';
        # the lookahead rejects a value that starts with '('.
        r"Suggested\s+(?:Overall\s+)?Band\s+Score\s*:\s*(?!\s*\()"
        r"\**\s*([0-9](?:\.\d)?)\s*\**",

        # "**Overall Band Score:** <6.5>"
        r"\*\*Overall\s+Band\s+Score:\*\*\s*<\s*(\d+(?:\.\d+)?)\s*>",

        # "Overall Band Score: <6.5>" with optional leading '*'
        r"\**\s*Overall\s+Band\s+Score\s*:\s*<\s*(\d+(?:\.\d+)?)\s*>", 

        # Free text such as "overall band score is 6.5" / "overall score: 6"
        r"\boverall\s+(?:band\s+)?score\s*(?:is|was|=|of)?\s*[:\-]?\s*(\d+(?:\.\d+)?)",

        # "**Overall Band Score**: <some text> (6.5)"
        r"\*\*Overall\s+Band\s+Score\*\*:\s*(?:.+?)\((\d+(?:\.\d+)?)\)",

        # "... received a 6.5 band score"; every prefix word is optional
        r"(?:has|have|was|were|is|are)?\s*"
        r"(?:awarded|received|achieved|got|obtained)?\s*"
        r"(?:an?\s+)?([0-9](?:\.\d)?)\s+band\s+score"
    ]

    for pattern in patterns:
        found = re.search(pattern, text, re.IGNORECASE)
        if found is None:
            continue
        # Only the first matching pattern is consulted, even if conversion fails.
        try:
            return score, float(found.group(1))
        except (ValueError, IndexError):
            return score, float('nan')

    return score, float('nan')

# Score every model-response column and store the two results next to it.
for model in eval_answers.columns:
    if not model.endswith('_reasoning'):
        continue
    # Stripping 'reasoning' leaves the trailing underscore in place, so the new
    # columns are named '<model>__structure_score' / '<model>__band_score'
    # (double underscore). Kept as is, since the names end up in the saved CSV.
    prefix = model.replace('reasoning', '')
    scored = eval_answers[model].apply(check_band_score_structure).apply(pd.Series)
    eval_answers[[f"{prefix}_structure_score", f"{prefix}_band_score"]] = scored


In [29]:
def extract_band_score(text):
    """Extract the number following "Suggested Overall Band Score:" from a text.

    Parameters
    ----------
    text : object
        The reference evaluation text. Non-string values give ``NaN``.

    Returns
    -------
    float
        The captured score, or ``NaN`` when the label is not found. Matching is
        case-insensitive and tolerates ``*`` markers around the number.
    """
    if isinstance(text, str):
        # Normalise non-breaking spaces and collapse whitespace runs so the
        # label matches regardless of line breaks or spacing.
        normalized = re.sub(r'\s+', ' ', text.replace('\xa0', ' ')).strip()
        found = re.search(
            r"Suggested\s*Overall\s*Band\s*Score\s*:\s*\**\s*(\d+(?:\.\d+)?)\s*\**",
            normalized,
            re.IGNORECASE,
        )
        if found:
            return float(found.group(1))

    return float('nan')


# Band score stated in the reference evaluation (the 'Solution' column).
solution_texts = eval_answers['Solution']
eval_answers['band_score_solution'] = solution_texts.apply(extract_band_score)


In [30]:
# Save the cleaned table. to_csv does not create missing folders, so make sure
# the output directory exists before writing.
os.makedirs('NLP_analysis', exist_ok=True)
eval_answers.to_csv('NLP_analysis/essay_evaluation_analysis.csv', index=False)