In [None]:
### Install and import libraries
!pip install pandas openpyxl
!pip install dataset
!pip install unsloth
!pip install gradio
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install sentence-transformers bert-score nltk rouge-score
import pandas as pd
import json
from google.colab import files
import numpy as np
from datasets import Dataset
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

API for Multiple Choice questions (given an already trained model)

In [None]:
"""
Script for generating responses to multiple-choice questions using a fine-tuned language model.
The script includes:
1. Parsing structured data from XML-like or Markdown-like text using `extract_from_xml_et` and `extract_from_markdown_regex`.
2. Generating responses to multiple-choice questions with justifications.
3. Integrating a Gradio interface for interactive use.

Modules:
    gradio: For creating a web-based interface for interacting with the model.
    unsloth.FastLanguageModel: For fine-tuned model inference.
    xml.etree.ElementTree: For XML parsing.
    re: For regex-based Markdown parsing.
    torch: For gradient-free (`torch.no_grad`), GPU-accelerated inference.

Functions:
    extract_from_xml_et: Parses an XML-like string and extracts key-value pairs.
    extract_from_markdown_regex: Extracts `choice` and `justification` from Markdown-like text.
    extract_fields: Aggregates extracted data from multiple blocks of text.
    generate_response: Generates a response to a multiple-choice question using the fine-tuned model.
    greet: Handles user input and provides a formatted response for the Gradio interface.

Usage:
    - Place the fine-tuned model and tokenizer in the `multiple_choice_questions_model` directory.
    - Use the Gradio interface to input questions and choices.
"""

import gradio as gr
from unsloth import FastLanguageModel
import torch
import re
import xml.etree.ElementTree as ET

def extract_from_xml_et(text: str) -> dict:
    """
    Collect the text of top-level XML-like elements into a dictionary.

    The input is wrapped in a synthetic ``<root>`` element so that a fragment
    with several sibling tags can be parsed as a single document.

    Args:
        text (str): A string containing XML-like content (e.g., <tag>value</tag>).

    Returns:
        dict: Maps each lowercased top-level tag to its text, with surrounding
            whitespace and double quotes removed. Elements without text are
            skipped, so the result may be empty.
        None: If the wrapped text is not well-formed XML.

    Example:
        >>> extract_from_xml_et('<key>"value"</key>')
        {'key': 'value'}
    """
    try:
        root = ET.fromstring("<root>{}</root>".format(text))
    except ET.ParseError:
        return None

    # Only direct children are inspected; a repeated tag keeps its last value.
    return {
        node.tag.lower(): node.text.strip().strip('"')
        for node in root
        if node.text
    }

def extract_from_markdown_regex(text: str) -> dict:
    """
    Pull the first ``**choice**`` / ``**justification**`` pair out of a text block.

    Args:
        text (str): A string containing Markdown-like structured data.

    Returns:
        dict: A dictionary with two keys:
            - 'choice': The text following **choice**: (single line).
            - 'justification': The text following **justification**:, running
              up to the next **choice** marker or the end of the string.
        None: If the text contains no such pair.

    Example:
        >>> text = "**choice**: Option A **justification**: This is the reason."
        >>> extract_from_markdown_regex(text)
        {'choice': 'Option A', 'justification': 'This is the reason.'}
    """
    first_entry = re.search(
        r'\*\*choice\*\*:\s*(.+?)\s*\*\*justification\*\*:\s*([\s\S]+?)(?=\*\*choice\*\*|$)',
        text,
    )
    if first_entry is None:
        return None

    choice, justification = first_entry.groups()
    return {'choice': choice, 'justification': justification}

def extract_fields(text: str) -> list:
    """
    Extract structured fields from free-form text using XML parsing and regex.

    The text is split into blocks on blank lines. Each block is first parsed
    as XML-like content; when that yields no fields, the Markdown-style
    ``**choice**`` / ``**justification**`` pattern is tried instead. Fields
    found in any block are merged into a single dictionary, with later blocks
    overriding earlier ones when a key repeats.

    Args:
        text (str): A string containing one or more text blocks in either
            XML-like or Markdown-like formats.

    Returns:
        list: ``[merged_fields]`` when at least one field was extracted,
            otherwise an empty list.
    """
    merged = {}
    for block in re.split(r'\n\s*\n', text.strip()):
        # An empty dict or None from the XML parser both mean "nothing found",
        # so fall through to the Markdown pattern in either case.
        fields = extract_from_xml_et(block) or extract_from_markdown_regex(block)
        if fields:
            merged.update(fields)

    # Return the merged dict once. Appending the same dict object for every
    # block would only produce a list of aliases of one dictionary.
    return [merged] if merged else []

# Load the fine-tuned model and tokenizer from the local
# `multiple_choice_questions_model` directory. Both are module-level globals
# that `generate_response` reads on every call.
model, tokenizer = FastLanguageModel.from_pretrained('./multiple_choice_questions_model')

def generate_response(instruction, choice_A, choice_B, choice_C, choice_D):
    """
    Generates a response to a multiple-choice question.

    Builds the instruction prompt, samples a completion from the module-level
    ``model``, and parses the completion with ``extract_fields``.

    Args:
        instruction (str): The question text.
        choice_A (str): Text of choice A.
        choice_B (str): Text of choice B.
        choice_C (str): Text of choice C.
        choice_D (str): Text of choice D.

    Returns:
        dict: A dictionary containing:
            - "choice": The selected choice letter, or "Unknown" when no
              choice could be parsed from the model output.
            - "justification": The justification for the selected choice, or
              the raw generated text when no justification could be parsed.
    """
    FastLanguageModel.for_inference(model)

    # Define the prompt
    prompt = f"""### Instruction:
    In the following question, you are provided with 4 choices. Select the best choice based on the knowledge provided and provide a justification for that choice.

    **You must return only your response with the following keys:**
      - "choice": The best choice letter
      - "justification": The justification for your choice

    **Example Response:**
      **choice**: A
      **justification**: Explanation for why Option A is correct

    ### Question:
    {instruction}

    ### Choices:
    A) {choice_A}
    B) {choice_B}
    C) {choice_C}
    D) {choice_D}

    ### Answer:
    """

    # Tokenize onto whatever device the model lives on instead of assuming "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        # `early_stopping` and `length_penalty` only apply to beam search and
        # are ignored when sampling, so they are not passed here.
        # NOTE(review): per the transformers docs `min_length` counts the
        # prompt tokens too, so it likely has no effect with this prompt;
        # `min_new_tokens` may have been intended - confirm.
        outputs = model.generate(
            **inputs,
            min_length=50,
            do_sample=True,
            max_new_tokens=300,
            top_p=0.95,
            top_k=50,
            temperature=0.7,
            num_return_sequences=1,
        )

    # Decode only the newly generated tokens. Splitting the full decoded text
    # on "### Answer:" picks the wrong segment when the question or a choice
    # happens to contain that marker.
    generated = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    # If the model keeps going and starts another answer section, keep only
    # the first one.
    answer_text = generated.split("### Answer:", 1)[0]

    entries = extract_fields(answer_text)
    fields = entries[0] if entries else {}

    # Fall back gracefully instead of raising IndexError/KeyError when the
    # model output does not follow the requested format.
    return {
        "choice": fields.get("choice", "Unknown"),
        "justification": fields.get("justification", answer_text.strip()),
    }

def greet(question, choice_A, choice_B, choice_C, choice_D):
    """
    Handles user input and generates a response.

    Inputs that are ``None`` or contain only whitespace are treated as empty.

    Args:
        question (str): The question text.
        choice_A (str): Text of choice A.
        choice_B (str): Text of choice B.
        choice_C (str): Text of choice C.
        choice_D (str): Text of choice D.

    Returns:
        str: A formatted response containing the model's choice and
            justification, or a short message when the question is empty or
            none of the four choices was filled in.
    """
    def _is_blank(value):
        # Covers None, "" and whitespace-only strings.
        return value is None or not str(value).strip()

    if _is_blank(question):
        return "No question was given to answer"

    choices = (choice_A, choice_B, choice_C, choice_D)
    if all(_is_blank(choice) for choice in choices):
        return "No choice was given"

    # Replace None with "" so a missing choice is not rendered as "None"
    # inside the prompt.
    choices = ["" if choice is None else choice for choice in choices]
    response = generate_response(question, *choices)
    return f"Choice: {response['choice']}\nJustification: {response['justification']}"

# Build and launch the Gradio UI: five text inputs (the question plus four
# choices) are passed to `greet`, and its string result is shown in a textbox.
gr.Interface(
    fn=greet,
    inputs=[
        gr.Textbox(label=field_label)
        for field_label in ('Question', 'Choice A', 'Choice B', 'Choice C', 'Choice D')
    ],
    outputs="textbox",
).launch()
