In [None]:
### Install and import libraries
!pip install pandas openpyxl
!pip install dataset
!pip install unsloth
!pip install gradio
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install sentence-transformers bert-score nltk rouge-score
import pandas as pd
import json
from google.colab import files
import numpy as np
from datasets import Dataset
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

In [None]:
### Download the 4-bit base model and wrap it with LoRA adapters

# Loading options shared with the trainers further down.
max_seq_length = 2048  # Any value works; RoPE scaling is handled internally by Unsloth.
dtype = None           # None = auto-detect. Float16 for Tesla T4/V100, Bfloat16 for Ampere+.
load_in_4bit = True    # 4-bit quantization lowers memory usage; may be set to False.

# Catalogue of pre-quantized 4-bit checkpoints (faster download, no OOMs).
# Kept for reference only: the model actually loaded is named explicitly below.
# More models at https://huggingface.co/unsloth
fourbit_models = [
    # Llama 3.1
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",
    # Mistral
    "unsloth/Mistral-Small-Instruct-2409",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    # Phi
    "unsloth/Phi-3.5-mini-instruct",
    "unsloth/Phi-3-medium-4k-instruct",
    # Gemma 2
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",
    # Llama 3.2
    "unsloth/Llama-3.2-1B-bnb-4bit",
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
]

# Base model + tokenizer. Gated checkpoints (e.g. meta-llama/Llama-2-7b-hf)
# additionally need a `token = "hf_..."` argument.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Attach LoRA adapters to the attention and MLP projection layers.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,                # LoRA rank; any value > 0 (8, 16, 32, 64, 128 suggested)
    lora_alpha=16,
    lora_dropout=0,      # any value is supported, 0 is the optimized path
    bias="none",         # any value is supported, "none" is the optimized path
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,    # rank-stabilized LoRA is available but not used
    loftq_config=None,   # LoftQ is available but not used
)

Preprocessing of Multiple-Choice Questions based on Modality 1

In [None]:
"""
Script for processing multiple-choice questions from an Excel file, preparing data for model training.
The script performs the following:
1. Reads multiple-choice questions and metadata from an Excel file.
2. Formats the data with options, ranks, and justifications for each question.
3. Converts the data into a JSON format and saves it to a file.
4. Prepares a Hugging Face dataset for training, including tokenization and formatting.

Modules:
    pandas: For reading and processing Excel data.
    numpy: For handling numerical operations.
    json: For saving and loading JSON files.
    datasets: For creating and managing Hugging Face datasets.
    transformers: For tokenization and formatting.

Functions:
    preprocess_function_mc: Prepares the dataset by tokenizing questions and answers and aligning labels.

Usage:
    - Ensure the Excel file "Teliko.xls" exists in the working directory with the correct sheet and columns.
    - Replace `tokenizer` with the appropriate tokenizer for your language model.
    - Customize parameters like max_length, padding, and truncation based on the specific model being used.
"""

import pandas as pd
import numpy as np
import json
from datasets import Dataset

# Step 1: Load the Excel file.
# The 'multiple choice questions' sheet is read into a pandas DataFrame.
df = pd.read_excel("Teliko.xls", sheet_name='multiple choice questions')

# Column layout of the flattened table: ID, question, then
# (choice text, rank, justification) for each of the options A-D.
columns = ['ID', 'question'] + [
    name
    for letter in 'ABCD'
    for name in (f'Choice {letter}', f'metadata {letter}.rank', f'metadata {letter}.justification')
]

# Step 2: Initialize processing structures.
# Every question owns three 4-slot lists pre-filled with the "None" sentinel.
# The first question text is taken from the header of the second column.
multiple_choices = []
options = [["None"] * 4]
justifications = [["None"] * 4]
ranks = [["None"] * 4]
questions = [df.columns[1].replace('\n', '')]
id = 0
current_option_id = 0

# Walk the sheet row by row. The label in the second column decides what the
# row contributes; a 'rank' row closes the current option, and a 'question'
# marker in the first column opens a new question.
for i in range(len(df)):
    row_label = df.iloc[i, 1]
    if row_label in ("text", "text "):
        # Option text
        options[id][current_option_id] = str(df.iloc[i, 2]).replace('\n', '')
    elif row_label == 'justification':
        # Justification of the current option
        justifications[id][current_option_id] = str(df.iloc[i, 2]).replace('\n', '')
    elif row_label in ('rank', 'rank '):
        # Rank of the current option; the next rows belong to the next option
        ranks[id][current_option_id] = df.iloc[i, 2]
        current_option_id += 1
    elif df.iloc[i, 0] == 'question':
        # Start of a new question: fresh sentinel slots, option counter reset
        current_option_id = 0
        id += 1
        options.append(["None"] * 4)
        justifications.append(["None"] * 4)
        ranks.append(["None"] * 4)
        questions.append(row_label.replace('\n', ''))

# Flatten each question into one record that follows `columns`.
for i in range(len(questions)):
    multiple_choices.append(
        [i, questions[i]]
        + [
            cell
            for slot in range(4)
            for cell in (options[i][slot], ranks[i][slot], justifications[i][slot])
        ]
    )

# Step 3: Create a DataFrame and convert it to JSON
df = pd.DataFrame(multiple_choices, columns=columns)

# The table is written to Excel and read back before being turned into records.
df.to_excel('phishing_multiple_choice_questions_new_vfinal.xlsx')
df = pd.read_excel('./phishing_multiple_choice_questions_new_vfinal.xlsx')


def _clean_cell(value):
    """Return a cell as single-line text; missing cells become the "None" sentinel.

    Cells read back from Excel may be NaN or numeric, so `.replace` cannot be
    called on them directly.
    """
    if pd.isna(value):
        return "None"
    return str(value).replace('\n', "")


dict_data = df.to_dict(orient='records')
choices = ['A', 'B', 'C', 'D']
questions_without_answer = []
for row in dict_data:
    options_num = 0
    row['question'] = _clean_cell(row['question'])
    for choice in choices:
        rank_value = row[f'metadata {choice}.rank']
        if pd.notna(rank_value) and rank_value != "None":
            # A ranked option: build the "X) text - Justification: ... - Rank: n" string
            answer_choice = _clean_cell(row[f'Choice {choice}'])
            answer_rank = int(rank_value)
            answer_justification = _clean_cell(row[f'metadata {choice}.justification'])
            answer = f"{choice}) {answer_choice} - Justification: {answer_justification} - Rank: {answer_rank}"
            row[f'choice_{choice}'] = answer
            row[f'Choice {choice}'] = answer_choice
            options_num += 1
            if answer_rank == 1:
                # The rank-1 option is the ground-truth answer
                row['answer'] = answer
        else:
            # Unused option slot
            row[f'choice_{choice}'] = 'None'
            row[f'Choice {choice}'] = 'None'
    row['options_num'] = options_num
    if 'answer' not in row:
        questions_without_answer.append(row['ID'])

# Every record must carry an 'answer'; fail here with the offending IDs instead
# of with a bare KeyError when the records are consumed later.
if questions_without_answer:
    raise ValueError(
        f"No rank-1 option found for question ID(s): {questions_without_answer}"
    )

# Step 4: Save the processed data to JSON
json_data = json.dumps(dict_data, indent=4)
with open('output_with_nested_metadata.json', 'w') as json_file:
    json_file.write(json_data)

# Load the JSON back as a list of records for the Hugging Face Dataset
with open('output_with_nested_metadata.json', 'r') as json_file:
    json_data = json.load(json_file)

def preprocess_function_mc(examples):
    """
    Build per-option prompts for a batch and tokenize them together with their labels.

    The batch is walked in groups: at position `surplus`, one prompt is emitted
    for every option of that row that is not the "None" sentinel, then
    `surplus` advances by the number of prompts emitted. The label of a prompt
    is the answer string when the option equals the answer, otherwise "".

    Args:
        examples (dict): A batch with the columns 'question', 'answer' and
            'choice_A' .. 'choice_D' (lists of strings).

    Returns:
        The tokenizer output for the prompts (padded/truncated to 512 tokens)
        with an added 'labels' entry in which padding token IDs are replaced
        by -100.

    Raises:
        ValueError: If a row has no usable option. Previously this case made
            the loop spin forever because the cursor never advanced.

    Notes:
        - Uses the module-level `tokenizer`.
        - NOTE(review): the grouping presumably relies on each question being
          repeated once per option in the incoming dataset and on no group
          being split across two batches -- confirm against the caller.
        - NOTE(review): `as_target_tokenizer` is reported as deprecated by
          recent transformers releases in favour of `text_target=` -- confirm
          before upgrading.
    """
    questions = list(examples['question'])
    answers = list(examples['answer'])
    choices = {
        letter: list(examples[f'choice_{letter}'])
        for letter in ('A', 'B', 'C', 'D')
    }

    inputs = []
    labels = []

    surplus = 0
    while surplus < len(questions):
        num = 0
        for option_texts in choices.values():
            option_text = option_texts[surplus]
            if option_text != "None":
                num += 1
                inputs.append(f"Question: {questions[surplus]}\nOption: {option_text}\nAnswer:")
                labels.append(answers[surplus] if option_text == answers[surplus] else "")
        if num == 0:
            # Without this guard `surplus` would never move forward.
            raise ValueError(
                f"Row {surplus} of the batch has no usable option (all choices are 'None')."
            )
        surplus += num

    tokenized_inputs = tokenizer(
        inputs,
        padding='max_length',
        truncation=True,
        max_length=512,  # Adjust this based on model input limits
        return_tensors="pt",
    )

    with tokenizer.as_target_tokenizer():
        tokenized_labels = tokenizer(
            labels,
            truncation=True,
            padding='max_length',
            max_length=512,
            return_tensors="pt"
        )

    labels_ids = tokenized_labels['input_ids']
    # Replace padding token IDs in labels with -100 so they are ignored by the loss
    labels_ids[labels_ids == tokenizer.pad_token_id] = -100
    tokenized_inputs['labels'] = labels_ids
    return tokenized_inputs

# Build the Hugging Face Dataset: one row per question, one column per field.
# Each column is read from the record key of the same name. The original code
# filled "Choice C" from "Choice A", so option C shown at benchmark time was a
# copy of option A.
dataset = Dataset.from_dict({
    column: [item[column] for item in json_data]
    for column in (
        "question",
        "choice_A", "choice_B", "choice_C", "choice_D",
        "answer",
        "options_num",
        "Choice A", "Choice B", "Choice C", "Choice D",
    )
})

# Hold out 30% of the questions as a test set (fixed seed).
split_dataset = dataset.train_test_split(test_size=0.3, seed=0)
train_dataset, test_dataset = split_dataset['train'], split_dataset['test']

# Columns carried into the expanded datasets below.
_EXPANDED_COLUMNS = ('question', 'choice_A', 'choice_B', 'choice_C', 'choice_D', 'answer')

# Training split: repeat every question once per available option
# ('options_num' times), then tokenize.
train_dataset = Dataset.from_dict({
    column: [item[column] for item in train_dataset for _ in range(item['options_num'])]
    for column in _EXPANDED_COLUMNS
})

tokenized_train_dataset = (
    train_dataset
    .map(preprocess_function_mc, batched=True)
    .remove_columns(['question', 'choice_A', 'choice_B', 'choice_C', 'choice_D'])
)
tokenized_train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

#### Whole dataset: same expansion and tokenization, applied to every question
dataset = Dataset.from_dict({
    column: [item[column] for item in dataset for _ in range(item['options_num'])]
    for column in _EXPANDED_COLUMNS
})

tokenized_dataset = (
    dataset
    .map(preprocess_function_mc, batched=True)
    .remove_columns(['question', 'choice_A', 'choice_B', 'choice_C', 'choice_D'])
)
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])



Training of model for Benchmark

In [None]:
# Fine-tune on the training split only; the held-out split is used by the benchmark below.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_train_dataset,
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # packing can speed up training on short sequences; disabled here
    args=TrainingArguments(
        output_dir="outputs",
        seed=3407,
        # Schedule: 50 passes over the data (use `max_steps` instead to cap by step count)
        num_train_epochs=50,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        # Optimizer
        optim="adamw_8bit",
        learning_rate=4e-4,
        weight_decay=0.01,
        lr_scheduler_type="linear",
        warmup_steps=5,
        # Mixed precision: bf16 when supported, otherwise fp16
        bf16=is_bfloat16_supported(),
        fp16=not is_bfloat16_supported(),
        logging_steps=5,
    ),
)

trainer_stats = trainer.train()


Defining Metrics for Benchmark

In [None]:
"""
Script for evaluating model-generated answers against ground truth using multiple metrics.
Metrics include:
1. Cosine Similarity (using SentenceTransformer).
2. BERTScore (using the `bert_score` library).
3. BLEU Score (using NLTK).
4. ROUGE Scores (using `rouge_scorer`).

The script evaluates:
- The similarity between generated justifications and ground-truth justifications.
- The correctness of the chosen answer.
- Weighted scores based on the rank of options.

Modules:
    json: For loading and processing JSON data.
    re: For regular expression matching.
    sentence_transformers: For embedding-based similarity scoring.
    bert_score: For semantic similarity evaluation.
    nltk: For BLEU score computation.
    rouge_score: For ROUGE score computation.

Functions:
    compute_bleu_score: Computes BLEU score for a single justification.
    compute_rouge_scores: Computes ROUGE scores (ROUGE-1, ROUGE-2, ROUGE-L) for a single justification.
    compute_bert_score: Computes BERTScore for a single justification.
    evaluate_model: Evaluates the model's performance on ground-truth and generated answers.

Usage:
    - Replace `ground_truths` and `generated_answers` with appropriate data for evaluation.
"""

import json
import re
from sentence_transformers import SentenceTransformer, util
from bert_score import score
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

def compute_bleu_score(reference, generated_answer):
    """
    Compute the sentence-level BLEU score of a generated justification.

    `sentence_bleu` expects a list of tokenized references and a tokenized
    hypothesis. The previous call passed the raw strings, so the reference was
    read as a sequence of characters and the hypothesis as a single token.
    Both texts are now split on whitespace before scoring.

    Args:
        reference (str): Ground-truth justification.
        generated_answer (str): Model-generated justification.

    Returns:
        float: BLEU score.
    """
    reference_tokens = reference.split()
    hypothesis_tokens = generated_answer.split()
    return sentence_bleu([reference_tokens], hypothesis_tokens)

def compute_rouge_scores(reference, generated_answer):
    """
    Score a generated justification against its reference with ROUGE.

    Args:
        reference (str): Ground-truth justification.
        generated_answer (str): Model-generated justification.

    Returns:
        dict: F-measures keyed by "ROUGE-1", "ROUGE-2" and "ROUGE-L".
    """
    # Report label -> metric name understood by rouge_scorer
    metric_by_label = {"ROUGE-1": 'rouge1', "ROUGE-2": 'rouge2', "ROUGE-L": 'rougeL'}
    scorer = rouge_scorer.RougeScorer(list(metric_by_label.values()), use_stemmer=True)
    results = scorer.score(reference, generated_answer)
    return {label: results[metric].fmeasure for label, metric in metric_by_label.items()}

def compute_bert_score(reference, generated_answer):
    """
    Score a generated justification against its reference with BERTScore.

    Args:
        reference (str): Ground-truth justification.
        generated_answer (str): Model-generated justification.

    Returns:
        float: Mean of the F1 component returned by `bert_score.score`.
    """
    candidates = [generated_answer]
    references = [reference]
    # `score` returns (precision, recall, F1); only F1 is reported.
    _, _, f1 = score(candidates, references, lang="en", verbose=False)
    return f1.mean().item()

def evaluate_model(ground_truths, generated_answers):
    """
    Evaluate generated answers with cosine similarity, BERTScore and rank-weighted scores.

    For every question the ground-truth answer string
    ("X) text - Justification: ... - Rank: n") is parsed to obtain the correct
    letter and its justification, and the rank of each of the four options is
    read from the option strings (options that do not parse count as rank 4).

    - Correct choice: the justification similarity and BERTScore are recorded
      in all four score lists.
    - Wrong choice: 0 is recorded in `sim_scores`/`bert_scores`; the weighted
      lists receive the justification scores multiplied by the weight of the
      chosen option's rank (1 -> 1, 2 -> 0.5, 3 -> 0.25, 4 -> 0).
    - Unusable choice (e.g. "None", NaN, or anything that does not start with a
      letter A-D): counted as wrong with weight 0 instead of raising KeyError.

    Args:
        ground_truths (list): One entry per question:
            [answer, choice_A, choice_B, choice_C, choice_D] as formatted strings.
        generated_answers (list): One entry per question: [choice, justification].

    Returns:
        tuple: (sim_scores, bert_scores, num_of_correct_choices,
                new_sim_scores, new_bert_scores)

    Raises:
        ValueError: If there are fewer generated answers than ground truths, or
            if a ground-truth answer string cannot be parsed.
    """
    dict_choice_index = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
    ranks_val = {'1': 1, '2': 0.50, '3': 0.25, '4': 0}

    if len(generated_answers) < len(ground_truths):
        raise ValueError(
            f"Expected {len(ground_truths)} generated answers, got {len(generated_answers)}."
        )

    similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

    pattern = re.compile(
        r'([A-Z])\)\s*(.*?)\s*-\s*Justification:\s*(.*?)\s*-\s*Rank:\s*(\d+)',
        re.DOTALL
    )
    # Accepts "A", "a", "A)", "**B**", ... but not words such as "Answer" or "None".
    choice_pattern = re.compile(r'^\W*([A-Da-d])\b')

    def _as_text(value):
        """Coerce a cell to str; None and NaN (e.g. from a CSV round trip) become ""."""
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    def _justification_scores(reference, candidate):
        """Return (cosine similarity, BERTScore F1) of candidate vs. reference."""
        similarity = util.pytorch_cos_sim(
            similarity_model.encode(reference),
            similarity_model.encode(candidate)
        ).item()
        return similarity, compute_bert_score(reference, candidate)

    sim_scores = []
    bert_scores = []
    new_sim_scores = []
    new_bert_scores = []
    num_of_correct_choices = 0

    for i in range(len(ground_truths)):
        truth = ground_truths[i]

        # Correct letter and reference justification (last match, as before).
        answer_matches = pattern.findall(_as_text(truth[0]))
        if not answer_matches:
            raise ValueError(f"Ground truth #{i} has no parsable answer: {truth[0]!r}")
        correct_choice = answer_matches[-1][0]
        justification = answer_matches[-1][2].strip()

        # Exactly one rank per option slot so letter -> index lookups stay aligned.
        choices_ranks = []
        for j in range(4):
            option_matches = pattern.findall(_as_text(truth[1 + j]))
            choices_ranks.append(option_matches[-1][3] if option_matches else '4')

        choice_match = choice_pattern.match(_as_text(generated_answers[i][0]))
        generated_choice = choice_match.group(1).upper() if choice_match else None
        generated_justification = _as_text(generated_answers[i][1])

        if generated_choice == correct_choice:
            answer_similarity, bert_f1 = _justification_scores(justification, generated_justification)
            sim_scores.append(answer_similarity)
            bert_scores.append(bert_f1)
            new_sim_scores.append(answer_similarity)
            new_bert_scores.append(bert_f1)
            num_of_correct_choices += 1
            continue

        sim_scores.append(0)
        bert_scores.append(0)

        if generated_choice is None:
            weight = 0
        else:
            rank_choice = choices_ranks[dict_choice_index[generated_choice]]
            weight = ranks_val.get(rank_choice, 0)

        if weight:
            answer_similarity, bert_f1 = _justification_scores(justification, generated_justification)
            new_sim_scores.append(answer_similarity * weight)
            new_bert_scores.append(bert_f1 * weight)
        else:
            # Weight 0 zeroes the score anyway; skip the model calls.
            new_sim_scores.append(0.0)
            new_bert_scores.append(0.0)

    return sim_scores, bert_scores, num_of_correct_choices, new_sim_scores, new_bert_scores



In [None]:
"""
Script for extracting structured data from text using XML parsing and regular expressions.
The script includes:
1. Extraction of data from XML-like text blocks using `xml.etree.ElementTree`.
2. Extraction of data from Markdown-like text blocks using regular expressions.
3. Aggregation of extracted data into a structured list of dictionaries.

Functions:
    extract_from_xml_et: Extracts data from XML-like text blocks.
    extract_from_markdown_regex: Extracts data from Markdown-like text blocks.
    extract_fields: Combines data extraction methods and aggregates results.

Modules:
    xml.etree.ElementTree: For parsing XML-like structures.
    re: For regex-based extraction from Markdown-like text.

Usage:
    - Provide input text containing XML-like or Markdown-like data blocks.
    - Use `extract_fields` to process the text and extract structured data.
"""

import xml.etree.ElementTree as ET
import re

def extract_from_xml_et(text: str) -> dict:
    """
    Parse an XML-like fragment into a tag -> text mapping.

    The fragment is wrapped in a synthetic <root> element so that several
    sibling tags form a single well-formed document.

    Args:
        text (str): The XML-like text block to process.

    Returns:
        dict: Lower-cased tag names of the top-level elements mapped to their
              text, stripped of surrounding whitespace and double quotes.
              Elements without text are skipped. `None` if the fragment is
              not well-formed.
    """
    try:
        root = ET.fromstring(f"<root>{text}</root>")
    except ET.ParseError:
        return None
    return {
        element.tag.lower(): element.text.strip().strip('"')
        for element in root
        if element.text
    }

def extract_from_markdown_regex(text: str) -> dict:
    """
    Pull the first `**choice**: ... **justification**: ...` pair out of a text block.

    Args:
        text (str): The Markdown-like text block to process.

    Returns:
        dict: {'choice': ..., 'justification': ...} for the first pair found,
              or `None` if the block contains no such pair.
    """
    # The justification runs up to the next **choice** marker or the end of the text.
    pattern = r'\*\*choice\*\*:\s*(.+?)\s*\*\*justification\*\*:\s*([\s\S]+?)(?=\*\*choice\*\*|$)'
    first_pair = re.search(pattern, text)
    if first_pair is None:
        return None
    return {'choice': first_pair.group(1), 'justification': first_pair.group(2)}

def extract_fields(text: str) -> list:
    """
    Split text into blank-line-separated blocks and parse each one.

    Each block is first parsed as XML-like content; when that yields nothing,
    the Markdown-style regex is tried instead.

    Args:
        text (str): The input text containing one or more data blocks
            (XML-like or Markdown-like).

    Returns:
        list: One dictionary per block that produced any data, in block order.
              Blocks that yield nothing are omitted.
    """
    parsed_blocks = []
    for chunk in re.split(r'\n\s*\n', text.strip()):
        # XML result wins when non-empty; otherwise fall back to the regex.
        fields = extract_from_xml_et(chunk) or extract_from_markdown_regex(chunk)
        if fields:
            parsed_blocks.append(dict(fields))
    return parsed_blocks


Benchmark

In [None]:
"""
Script for evaluating a fine-tuned language model's responses to multiple-choice questions.
The script performs the following:
1. Generates responses for multiple-choice questions using the fine-tuned model.
2. Extracts and processes the model's outputs for choice and justification.
3. Evaluates the generated responses against ground-truth answers using metrics:
   - Cosine Similarity
   - BERT Score
4. Computes metrics for overall performance and correct predictions.
5. Saves the model's responses and metrics into CSV files for further analysis.

Modules:
    numpy: For computing averages of similarity scores.
    pandas: For saving and loading data to/from CSV.
    json: For parsing JSON-like responses.
    re: For additional text processing.
    torch: For model inference.
    sentence_transformers: For embedding-based similarity scoring.

Functions:
    generate_response: Generates a choice and justification using the fine-tuned model.
    evaluate_model: Computes performance metrics for model-generated responses.

Usage:
    - Ensure `test_dataset` contains questions, choices, and ground-truth answers.
    - Replace `model` and `tokenizer` with your fine-tuned language model and tokenizer.
"""

import json
import re
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util

# Containers for the metric scores and for the raw model responses
sim_scores, bert_scores, responses = [], [], []

def generate_response(instruction):
    """
    Ask the fine-tuned model to pick a choice and justify it.

    The generated text after the '### Answer:' marker is parsed with
    `extract_fields`. Fields are merged across the parsed blocks (first
    occurrence wins), so a choice and a justification that end up in separate
    blocks are still recovered. If either field is missing, the default
    "could not parse" response is returned. Previously a missing key raised an
    uncaught KeyError and aborted the benchmark loop.

    Args:
        instruction (str): The question and choices formatted as an instruction for the model.

    Returns:
        dict: A dictionary containing:
            - "choice": The selected choice as emitted by the model, or "None" on parse failure.
            - "justification": The justification for the selected choice.
    """
    # Enable optimized inference mode for the model
    FastLanguageModel.for_inference(model)

    # Define the prompt with instructions and example format
    prompt = f"""### Instruction:
    In the following question, you are provided with 4 choices. Select the best choice based on the knowledge provided and provide a justification for that choice.

    **You must return only your response with the following keys:**
      - "choice": The best choice letter
      - "justification": The justification for your choice

    **Example Response:**
      **choice**: A
      **justification**: Explanation for why Option A is correct

    ### Question:
    {instruction}

    ### Answer:
    """

    # Tokenize the input and send it to the GPU
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    # Generate the model's response (sampling, so outputs vary between runs)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            early_stopping=True,
            min_length=50,
            length_penalty=2,
            do_sample=True,
            max_new_tokens=300,
            top_p=0.95,
            top_k=50,
            temperature=0.7,
            num_return_sequences=1
        )

    # Decode the generated output and extract the response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    try:
        # The decoded text starts with the prompt, so index 1 is the generated answer.
        response = response.split('### Answer:')[1]
        data = extract_fields(response)
        merged = {}
        for entry in data:
            for key, value in entry.items():
                merged.setdefault(key, value)
        response = {"choice": merged['choice'], "justification": merged['justification']}
    except (json.JSONDecodeError, IndexError, KeyError):
        # Default response if parsing fails
        response = {"choice": "None", "justification": "Could not parse JSON"}

    return response

# Step 1: Query the model once per held-out question
for item in test_dataset:
    instruction = f'''{item['question']}
    Choices:
    A) {item['Choice A'].replace('  ',' ')},
    B) {item['Choice B'].replace('  ',' ')},
    C) {item['Choice C'].replace('  ',' ')},
    D) {item['Choice D'].replace('  ',' ')}
    '''

    response = generate_response(instruction)
    responses.append(response)
    print(f"Question: {item['question']}")
    print("RESPONSE:", response)
    print("GROUND_TRUTH:", item['answer'])

# Persist the raw responses as a two-column CSV
cols = ['choice', 'justification']
data = [[response['choice'], response['justification']] for response in responses]
df = pd.DataFrame(data, columns=cols)
df.to_csv('LLM_answers_multiple_choice.csv', index=False)

# Step 2: Ground truth per question: the answer followed by the four formatted options
ground_truth = [
    [item['answer'], item['choice_A'], item['choice_B'], item['choice_C'], item['choice_D']]
    for item in test_dataset
]

# Generated answers are read back from the CSV written above
df = pd.read_csv('LLM_answers_multiple_choice.csv')
generated_answers = [
    [df['choice'][i], df['justification'][i]]
    for i in range(len(df))
]

# Step 3: Score the responses
(
    sim_scores,
    bert_scores,
    num_of_correct_choices,
    new_sim_scores,
    new_bert_scores,
) = evaluate_model(ground_truth, generated_answers)

# Averages over all questions
print(f"Average similarity score is: {np.mean(sim_scores):.4f}")
print(f"Average BERT Score is: {np.mean(bert_scores):.4f}")
print(f"Number of correct choices are: {num_of_correct_choices}/{len(ground_truth)}")

# Averages over the non-zero entries only
nonzero_sim = [value for value in sim_scores if value != 0]
nonzero_bert = [value for value in bert_scores if value != 0]
correct_sim_scores = np.mean(nonzero_sim)
correct_bert_scores = np.mean(nonzero_bert)

print(f"Average similarity score of correct choices is: {correct_sim_scores:.4f}")
print(f"Average BERT Score of correct choices is: {correct_bert_scores:.4f}")

# Rank-weighted averages
print(f"New Average similarity score is: {np.mean(new_sim_scores):.4f}")
print(f"New Average BERT Score is: {np.mean(new_bert_scores):.4f}")


Training of the model on the entire dataset

In [None]:
# Train on the whole (expanded) dataset and save the result for the API cell.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = tokenized_dataset,
    #dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 10,
        num_train_epochs = 100, # Number of passes over the dataset; use max_steps to cap by steps instead.
        # max_steps = 1000,
        learning_rate = 5e-5,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

trainer_stats = trainer.train()

# Save the model together with its tokenizer: the API cell loads both from this directory.
model.save_pretrained('./multiple_choice_questions_model')
tokenizer.save_pretrained('./multiple_choice_questions_model')

Creation of the API

In [None]:
"""
Script for generating responses to multiple-choice questions using a fine-tuned language model.
The script includes:
1. Parsing structured data from XML-like or Markdown-like text using `extract_from_xml_et` and `extract_from_markdown_regex`.
2. Generating responses to multiple-choice questions with justifications.
3. Integrating a Gradio interface for interactive use.

Modules:
    gradio: For creating a web-based interface for interacting with the model.
    unsloth.FastLanguageModel: For fine-tuned model inference.
    xml.etree.ElementTree: For XML parsing.
    re: For regex-based Markdown parsing.
    torch: For tokenization and GPU-accelerated inference.

Functions:
    extract_from_xml_et: Parses an XML-like string and extracts key-value pairs.
    extract_from_markdown_regex: Extracts `choice` and `justification` from Markdown-like text.
    extract_fields: Aggregates extracted data from multiple blocks of text.
    generate_response: Generates a response to a multiple-choice question using the fine-tuned model.
    greet: Handles user input and provides a formatted response for the Gradio interface.

Usage:
    - Place the fine-tuned model and tokenizer in the `multiple_choice_questions_model` directory.
    - Use the Gradio interface to input questions and choices.
"""

import gradio as gr
from unsloth import FastLanguageModel
import torch
import re
import xml.etree.ElementTree as ET

def extract_from_xml_et(text: str) -> dict:
    """
    Turn an XML-like fragment (e.g. <tag>value</tag>) into a tag -> text mapping.

    The fragment is wrapped in a synthetic <root> element before parsing so
    that several sibling tags are accepted.

    Args:
        text (str): A string containing XML-like content.

    Returns:
        dict: Lower-cased top-level tag names mapped to their text, stripped of
            surrounding whitespace and double quotes. Elements without text
            are skipped.
        None: If the fragment is not well-formed.

    Example:
        >>> extract_from_xml_et('<key>"value"</key>')
        {'key': 'value'}
    """
    try:
        root = ET.fromstring(f"<root>{text}</root>")
    except ET.ParseError:
        return None
    return {
        element.tag.lower(): element.text.strip().strip('"')
        for element in root
        if element.text
    }

def extract_from_markdown_regex(text: str) -> dict:
    """
    Pull the first `**choice**` / `**justification**` pair out of Markdown-like text.

    Args:
        text (str): A string containing Markdown-like structured data.

    Returns:
        dict: A dictionary with two keys:
            - 'choice': The text following `**choice**:` up to the
              `**justification**` marker.
            - 'justification': The text following `**justification**:` up to
              the next `**choice**` marker or the end of the string.
        None: If the text contains no such pair.

    Example:
        >>> extract_from_markdown_regex("**choice**: Option A **justification**: This is the reason.")
        {'choice': 'Option A', 'justification': 'This is the reason.'}
    """
    answer_pattern = r'\*\*choice\*\*:\s*(.+?)\s*\*\*justification\*\*:\s*([\s\S]+?)(?=\*\*choice\*\*|$)'
    first_hit = re.search(answer_pattern, text)
    if first_hit is None:
        return None

    # Only the first pair is reported, even if the text holds several.
    choice_text, justification_text = first_hit.groups()
    return {'choice': choice_text, 'justification': justification_text}

def extract_fields(text: str) -> list:
    """
    Extracts structured data from text using XML parsing and regex.

    The text is split into blocks on blank lines. Each block is first parsed
    as XML-like content; if that yields nothing, the Markdown
    `**choice**` / `**justification**` pattern is tried instead. Fields found
    in the blocks are merged into one dictionary (a later block overrides an
    earlier one for the same key). That same dictionary object is appended
    once for every block processed from the first successful extraction
    onwards, so every element of the result holds the fully merged fields and
    callers can read them from index 0.

    If no individual block yields any data, the whole text is tried once more.
    This recovers answers that the blank-line split would otherwise break
    apart, e.g. a `**choice**` line separated from its `**justification**` by
    an empty line, or an XML element whose text contains a blank line.

    Args:
        text (str): A string containing one or more text blocks in either XML-like or Markdown-like formats.

    Returns:
        list: A list of dictionaries containing extracted data; empty if
            nothing could be extracted.
    """
    entries = []
    merged = {}
    stripped_text = text.strip()

    for block in re.split(r'\n\s*\n', stripped_text):
        # XML first; an empty dict or None both mean "nothing found", in
        # which case the Markdown pattern gets its turn.
        block_data = extract_from_xml_et(block) or extract_from_markdown_regex(block)
        if block_data:
            merged.update(block_data)
        # Kept as in the original contract: the shared, merged dict is
        # appended per block so that entries[0] always has every field.
        if merged:
            entries.append(merged)

    if not entries:
        # Per-block parsing found nothing: the fields may straddle a blank
        # line, so give both extractors one attempt at the unsplit text.
        whole_text_data = (
            extract_from_xml_et(stripped_text)
            or extract_from_markdown_regex(stripped_text)
        )
        if whole_text_data:
            entries.append(whole_text_data)

    return entries

# Restore the fine-tuned model together with its tokenizer from the local
# `multiple_choice_questions_model` directory. Both names are module-level
# and are read by generate_response below.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name='./multiple_choice_questions_model',
)

def generate_response(instruction, choice_A, choice_B, choice_C, choice_D):
    """
    Generates a response to a multiple-choice question.

    Builds the instruction prompt, samples a completion from the module-level
    `model`, and parses the completion with `extract_fields`.

    Args:
        instruction (str): The question text.
        choice_A (str): Text of choice A.
        choice_B (str): Text of choice B.
        choice_C (str): Text of choice C.
        choice_D (str): Text of choice D.

    Returns:
        dict: A dictionary containing:
            - "choice": The selected choice letter.
            - "justification": The justification for the selected choice.
        If the model output cannot be parsed into both fields, "choice" is
        "Unknown" and "justification" carries the raw generated text, so the
        caller always receives both keys instead of an IndexError/KeyError.
    """
    FastLanguageModel.for_inference(model)

    # Define the prompt
    prompt = f"""### Instruction:
    In the following question, you are provided with 4 choices. Select the best choice based on the knowledge provided and provide a justification for that choice.

    **You must return only your response with the following keys:**
      - "choice": The best choice letter
      - "justification": The justification for your choice

    **Example Response:**
      **choice**: A
      **justification**: Explanation for why Option A is correct

    ### Question:
    {instruction}

    ### Choices:
    A) {choice_A}
    B) {choice_B}
    C) {choice_C}
    D) {choice_D}

    ### Answer:
    """

    # Tokenize on whatever device the model lives on rather than assuming "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            # `early_stopping` and `length_penalty` were dropped: they only
            # apply to beam search and have no effect when sampling with a
            # single beam.
            # NOTE(review): `min_length` counts the prompt tokens as well, so
            # with a prompt this long it is effectively a no-op; switch to
            # `min_new_tokens` if a minimum answer length is really wanted.
            min_length=50,
            do_sample=True,
            max_new_tokens=300,
            top_p=0.95,
            top_k=50,
            temperature=0.7,
            num_return_sequences=1,
        )

    # Decode only the newly generated tokens. Splitting the full decoded text
    # on "### Answer:" picks the wrong segment whenever the user's question or
    # choices contain that marker themselves.
    generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    # If the model goes on to invent a further "### Answer:" section, keep
    # only the text before it (same cut-off the marker split used to apply).
    answer_text = generated_text.split("### Answer:")[0]

    parsed = next(
        (
            entry
            for entry in extract_fields(answer_text)
            if "choice" in entry and "justification" in entry
        ),
        None,
    )
    if parsed is None:
        # Unparseable output: surface the raw text instead of crashing.
        return {"choice": "Unknown", "justification": answer_text.strip()}

    return {"choice": parsed["choice"], "justification": parsed["justification"]}

def greet(question, choice_A, choice_B, choice_C, choice_D):
    """
    Gradio callback: validate the form fields and answer the question.

    Args:
        question (str): The question text.
        choice_A (str): Text of choice A.
        choice_B (str): Text of choice B.
        choice_C (str): Text of choice C.
        choice_D (str): Text of choice D.

    Returns:
        str: A notice when the question is empty or when all four choices are
            empty; otherwise the model's choice and justification formatted
            on two lines.
    """
    if question == "":
        return "No question was given to answer"

    options = (choice_A, choice_B, choice_C, choice_D)
    # Rejected only when every option is blank; a partially filled form is
    # still passed on to the model.
    if all(option == "" for option in options):
        return "No choice was given"

    answer = generate_response(question, *options)
    return f"Choice: {answer['choice']}\nJustification: {answer['justification']}"

# Build the web form (one text box for the question and one per choice),
# route submissions through `greet`, and start serving the app.
gr.Interface(
    fn=greet,
    inputs=[
        gr.Textbox(label=field_label)
        for field_label in ('Question', 'Choice A', 'Choice B', 'Choice C', 'Choice D')
    ],
    outputs="textbox",
).launch()
