In [None]:
### Install and import libraries
!pip install pandas openpyxl
!pip install dataset
!pip install unsloth
!pip install gradio
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

import pandas as pd
import json
from google.colab import files
import numpy as np
from datasets import Dataset
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from datasets import Dataset,concatenate_datasets

Found existing installation: unsloth 2025.1.5
Uninstalling unsloth-2025.1.5:
  Successfully uninstalled unsloth-2025.1.5
Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-mr8ibrqy/unsloth_63f0d8e140db408c8c5cf14a97115c94
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-mr8ibrqy/unsloth_63f0d8e140db408c8c5cf14a97115c94
  Resolved https://github.com/unslothai/unsloth.git to commit b4c48d9c5e78203909495bf9beaa29a5c9aeaeeb
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: unsloth
  Building wheel for unsloth (pyproject.toml) ... [?25l[?25hdone
  Created wheel for unsloth: filename=unsloth-2025.1.5-py3-none-any.w

In [None]:
### Downloading of model

# Notebook-level loading options; they are passed unchanged to
# FastLanguageModel.from_pretrained below.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# Reference list only: `fourbit_models` is not read anywhere else in the visible
# notebook; the model actually loaded is named explicitly in from_pretrained below.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 2x faster
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # 4bit for 405b!
    "unsloth/Mistral-Small-Instruct-2409",     # Mistral 22b 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!

    "unsloth/Llama-3.2-1B-bnb-4bit",           # NEW! Llama 3.2 models
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
] # More models at https://huggingface.co/unsloth

# Load the base model and its tokenizer. Both names are notebook globals that the
# preprocessing cells (tokenizer) and the training cell (model, tokenizer) rely on.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit", # or choose "unsloth/Llama-3.2-1B"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

# Rebind `model` to the LoRA-adapted version returned by get_peft_model; the
# adapters target the attention and MLP projection modules listed below.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407, # fixed seed passed to get_peft_model
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

==((====))==  Unsloth 2025.1.5: Fast Llama patching. Transformers: 4.47.1.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.9. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.1.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Preprocessing of True-False Questions

In [None]:
"""
Script for processing True/False questions from an Excel file, saving as JSON, and converting to a Hugging Face dataset.
The script performs the following:
1. Reads data from an Excel file containing True/False questions and metadata.
2. Processes and formats the data into structured columns.
3. Saves the data as a JSON file and reloads it for further processing.
4. Converts the data into a Hugging Face Dataset format.
5. Formats and tokenizes the dataset for model training.

Modules:
    pandas: For reading and processing Excel data.
    json: For saving and loading JSON files.
    datasets: For creating and managing Hugging Face datasets.

Functions:
    format_examples: Formats dataset examples for input text creation.
    tokenize_function: Tokenizes input text using the tokenizer.

Usage:
    - Ensure the Excel file "Teliko.xls" exists in the working directory with the correct sheet and columns.
    - Replace `tokenizer` with the appropriate tokenizer for your model.
"""

# Step 1: Load the 'True false questions' sheet of the Excel workbook.
df = pd.read_excel("Teliko.xls", sheet_name='True false questions')

# Column names of the processed DataFrame
columns = ['id', 'question', 'answer', 'metadata.justification']

# The question list is seeded with the header of column 1, which this sheet layout
# uses to hold the first question. Newlines are replaced by single spaces.
questions = [df.columns[1].replace('\n', ' ')]
true_false = []      # True/False answers, lower-cased strings
justification = []   # Justification texts
indexs = []          # Final list of processed records

# Step 2: Classify every row by the marker found in column 1.
for i in range(len(df)):
    cell = df.iloc[i, 1]
    if pd.isna(cell):
        # Empty marker cell: nothing to extract from this row.
        continue
    # Markers are compared after stripping so that variants with stray whitespace
    # (e.g. "text ") are not mistaken for question text.
    marker = str(cell).strip()
    if marker == 'text':
        true_false.append(str(df.iloc[i, 2]).lower())
    elif marker == 'justification':
        justification.append(df.iloc[i, 2])
    elif marker == 'rank':
        continue  # rank rows carry no information for True/False questions
    else:
        # Any other value in column 1 is the text of a new question.
        questions.append(str(cell).replace('\n', ' '))

# Records are paired by position, so the three lists must line up exactly;
# a mismatch means the sheet was parsed incorrectly and would silently pair
# questions with the wrong answers.
if not (len(questions) == len(true_false) == len(justification)):
    raise ValueError(
        "True/False sheet is inconsistent: "
        f"{len(questions)} questions, {len(true_false)} answers, "
        f"{len(justification)} justifications"
    )

indexs = [
    [i, question, answer, reason]
    for i, (question, answer, reason) in enumerate(zip(questions, true_false, justification))
]

# Convert the records into a pandas DataFrame
df = pd.DataFrame(indexs, columns=columns)

# Step 3: Save the processed data as JSON ('output_with_nested_metadata.json').
json_text = df.to_json(orient='records', indent=4)
with open('output_with_nested_metadata.json', 'w') as json_file:
    json_file.write(json_text)

# Step 4: Reload the JSON file as a list of record dicts.
with open('output_with_nested_metadata.json', 'r') as json_file:
    json_data = json.load(json_file)

# Step 5: Build a Hugging Face Dataset with two columns:
#   statement - the question text
#   label     - the corresponding True/False answer
dataset = Dataset.from_dict({
    "statement": [item["question"] for item in json_data],
    "label": [item["answer"] for item in json_data]
})

# Define a function to format dataset examples
def format_examples(examples):
    """
    Build the training text for a batch of True/False examples.

    Args:
        examples (dict): Batch with parallel 'statement' and 'label' lists.

    Returns:
        dict: {'text': [...]} holding one "Question: ... Answer: ..." string
        per example, in the same order as the input.
    """
    formatted = []
    for statement, label in zip(examples['statement'], examples['label']):
        formatted.append(f'Question: {statement} Answer: {label}')
    return {'text': formatted}

# Define a function to tokenize the formatted examples
def tokenize_function(examples, max_length=512):
    """
    Tokenizes the formatted text and derives causal-LM labels from it.

    The previous implementation tokenized the answer on its own and stored that as
    `labels`. Those label rows were not positionally aligned with `input_ids`
    (the answer tokens sat at the start of the label row, while `input_ids` held
    the full "Question: ... Answer: ..." text). It also relied on
    `tokenizer.as_target_tokenizer()`, which is deprecated. Labels are now a copy
    of `input_ids` with padded positions set to -100 so the loss ignores them.

    Args:
        examples (dict): A batch containing a 'text' list of formatted strings.
        max_length (int): Length every sequence is padded/truncated to.
            Defaults to 512, the value used previously.

    Returns:
        The tokenizer output with 'input_ids', 'attention_mask' and an added
        'labels' entry of the same shape as 'input_ids'.
    """
    tokenized_inputs = tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

    # Mask by attention_mask rather than by pad_token_id: if the pad token shares
    # its id with a real token, genuine occurrences must still contribute to the loss.
    labels_ids = tokenized_inputs['input_ids'].clone()
    labels_ids[tokenized_inputs['attention_mask'] == 0] = -100
    tokenized_inputs['labels'] = labels_ids
    return tokenized_inputs

# Step 6: Format and tokenize the full dataset.
# First map adds the 'text' column (format_examples); the second map adds
# 'input_ids', 'attention_mask' and 'labels' (tokenize_function). The tokenized
# result is kept under its own name because the merge cell reads it later.
dataset = dataset.map(format_examples, batched=True)
true_false_tokenized_dataset = dataset.map(tokenize_function, batched=True)
"""
Formats and tokenizes the entire dataset for future use.
"""
# Last expression of the cell: displays the dataset summary.
true_false_tokenized_dataset

Map:   0%|          | 0/61 [00:00<?, ? examples/s]

Map:   0%|          | 0/61 [00:00<?, ? examples/s]

Dataset({
    features: ['statement', 'label', 'text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 61
})

Preprocessing of Open-Ended Questions

In [None]:
"""
Script for processing open-ended questions from an Excel file, converting to JSON, and preparing for model training.
The script performs the following:
1. Reads data from an Excel file containing open-ended questions and answers.
2. Processes and formats the data into structured columns.
3. Saves the data as a JSON file and reloads it for further processing.
4. Converts the data into a Hugging Face Dataset format.
5. Tokenizes and prepares the dataset for language model training.

Modules:
    pandas: For reading and processing Excel data.
    json: For saving and loading JSON files.
    datasets: For creating and managing Hugging Face datasets.
    DataCollatorForLanguageModeling: For collating and batching data for language model training.

Functions:
    format_examples: Formats dataset examples into text input-output pairs.
    tokenize_function: Tokenizes the text inputs for the model.

Usage:
    - Ensure the Excel file "Teliko.xls" exists in the working directory with the correct sheet and columns.
    - Replace `tokenizer` with the appropriate tokenizer for your language model.
"""

import pandas as pd
import json
from datasets import Dataset
from transformers import DataCollatorForLanguageModeling

# Step 1: Load the 'open ended questions' sheet of the Excel workbook.
df = pd.read_excel("Teliko.xls", sheet_name='open ended questions')

# Column names of the processed DataFrame
columns = ['id', 'question', 'answer']

# The question list is seeded with the header of column 1, which this sheet layout
# uses to hold the first question. Newlines are replaced by single spaces.
questions = [df.columns[1].replace('\n', ' ')]
answers = []   # Answer texts
indexs = []    # Final list of processed records

# Step 2: Classify every row by the marker found in column 0.
# Markers are compared after stripping: the sheet is known to contain
# "question " with a trailing space, and matching only that exact spelling would
# silently skip rows written without it.
for i in range(len(df)):
    row_kind = str(df.iloc[i, 0]).strip()
    if row_kind == 'answer':
        answers.append(df.iloc[i, 1].replace('\n', ''))
    elif row_kind == 'question':
        questions.append(df.iloc[i, 1].replace('\n', ''))

# Records are paired by position; a length mismatch means questions would be
# paired with the wrong answers (or trailing answers dropped), so fail loudly.
if len(questions) != len(answers):
    raise ValueError(
        "Open-ended sheet is inconsistent: "
        f"{len(questions)} questions but {len(answers)} answers"
    )

indexs = [[i, question, answer] for i, (question, answer) in enumerate(zip(questions, answers))]

# Convert the records into a pandas DataFrame, then to a list of record dicts.
# Newlines were already removed while collecting the rows, so no second
# clean-up pass is needed here.
df = pd.DataFrame(indexs, columns=columns)
dict_data = df.to_dict(orient='records')

# Steps 3-5: Serialise the records and save them to
# 'output_with_nested_metadata.json'.
json_text = json.dumps(dict_data, indent=4)
with open('output_with_nested_metadata.json', 'w') as json_file:
    json_file.write(json_text)

# Step 6: Reload the JSON file as a list of record dicts.
with open('output_with_nested_metadata.json', 'r') as json_file:
    json_data = json.load(json_file)

# Define a function to format dataset examples
def format_examples(examples):
    """
    Build the training text for a batch of open-ended examples.

    Args:
        examples (dict): Batch with parallel "question" and "answer" lists.

    Returns:
        dict: {'text': [...]} holding one "Question: ... Answer: ..." string
        per example, in the same order as the input.
    """
    formatted = []
    for question, answer in zip(examples["question"], examples["answer"]):
        formatted.append(f"Question: {question} Answer: {answer}")
    return {'text': formatted}

# Define a function to tokenize the formatted examples
def tokenize_function(examples, max_length=512):
    """
    Tokenizes the formatted text and derives causal-LM labels from it.

    The previous implementation tokenized the answer on its own and stored that as
    `labels`. Those label rows were not positionally aligned with `input_ids`
    (the answer tokens sat at the start of the label row, while `input_ids` held
    the full "Question: ... Answer: ..." text). It also relied on
    `tokenizer.as_target_tokenizer()`, which is deprecated. Labels are now a copy
    of `input_ids` with padded positions set to -100 so the loss ignores them.

    Args:
        examples (dict): A batch containing a 'text' list of formatted strings.
        max_length (int): Length every sequence is padded/truncated to.
            Defaults to 512, the value used previously.

    Returns:
        The tokenizer output with 'input_ids', 'attention_mask' and an added
        'labels' entry of the same shape as 'input_ids'.
    """
    tokenized_inputs = tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

    # Mask by attention_mask rather than by pad_token_id: if the pad token shares
    # its id with a real token, genuine occurrences must still contribute to the loss.
    labels_ids = tokenized_inputs['input_ids'].clone()
    labels_ids[tokenized_inputs['attention_mask'] == 0] = -100
    tokenized_inputs['labels'] = labels_ids

    return tokenized_inputs

# Step 7: Convert the JSON data into a Hugging Face Dataset with 'question' and
# 'answer' columns (one row per record loaded from the JSON file).
dataset = Dataset.from_dict({
    "question": [item["question"] for item in json_data],
    "answer": [item["answer"] for item in json_data]
})
"""
Creates a Hugging Face Dataset from the processed JSON data.
"""

# Step 8: Format and tokenize the full dataset.
# First map adds the 'text' column (format_examples); the second map adds
# 'input_ids', 'attention_mask' and 'labels' (tokenize_function). The tokenized
# result is kept under its own name because the merge cell reads it later.
dataset = dataset.map(format_examples, batched=True)
open_ended_questions_tokenized = dataset.map(tokenize_function, batched=True)
"""
Formats and tokenizes the entire dataset for future use.
"""

# Last expression of the cell: displays the dataset summary.
open_ended_questions_tokenized

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'answer', 'text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 13
})

Preprocessing of Multiple-Choice Questions

In [None]:
"""
Script for processing multiple-choice questions from an Excel file, preparing data for model training.
The script performs the following:
1. Reads multiple-choice questions and metadata from an Excel file.
2. Formats the data with options, ranks, and justifications for each question.
3. Converts the data into a JSON format and saves it to a file.
4. Prepares a Hugging Face dataset for training, including tokenization and formatting.

Modules:
    pandas: For reading and processing Excel data.
    json: For saving and loading JSON files.
    numpy: For handling numerical operations.
    datasets: For creating and managing Hugging Face datasets.
    transformers: For tokenization and formatting.

Functions:
    preprocess_function_mc: Prepares the dataset by tokenizing questions and answers and aligning labels.

Usage:
    - Ensure the Excel file "Teliko.xls" exists in the working directory with the correct sheet and columns.
    - Replace `tokenizer` with the appropriate tokenizer for your language model.
"""

import pandas as pd
import numpy as np
import json
from datasets import Dataset

# Step 1: Load the 'multiple choice questions' sheet of the Excel workbook.
df = pd.read_excel("Teliko.xls", sheet_name='multiple choice questions')

# Column names of the flattened, one-row-per-question DataFrame
columns = [
    'ID', 'question', 'Choice A', 'metadata A.rank', 'metadata A.justification',
    'Choice B', 'metadata B.rank', 'metadata B.justification',
    'Choice C', 'metadata C.rank', 'metadata C.justification',
    'Choice D', 'metadata D.rank', 'metadata D.justification'
]


def _clean_mc_text(text):
    """Apply the sheet-specific clean-up to one option or justification string.

    Removes newlines and a long run of spaces, turns double quotes into single
    quotes, and replaces a fixed set of mis-decoded characters with the
    apostrophe/colon sequences they stand for. The replacements are applied in
    this exact order (' \\ufffdor' must be handled before the bare '\\ufffd').
    """
    return (
        text.replace('\n', "")
        .replace('\"', "'")
        .replace('                 ', "")
        .replace('\u62b0', "'t")
        .replace('\u62af', "'s")
        .replace(' \u62b3', "'v")
        .replace(' \ufffdor', " or")
        .replace('\ufffd', ': ')
        .replace('\u63e7', "'Y")
        .replace('\u63cc', "'H")
    )


# Per-question accumulators. Slot 0 belongs to the question held in the header of
# column 1; each list entry has four positions, one per possible option.
multiple_choices = []
options = [["None", "None", "None", "None"]]
justifications = [["None", "None", "None", "None"]]
ranks = [["None", "None", "None", "None"]]
questions = [df.columns[1].replace('\n', '')]
question_idx = 0        # index of the question currently being filled (was `id`,
                        # which shadowed the builtin for the rest of the notebook)
current_option_id = 0   # option slot (0-3) currently being filled

# Step 2: Classify every row. Markers are compared after stripping so that the
# trailing-space variants present in the sheet ("text ", "rank ") are handled
# uniformly for every marker.
for i in range(len(df)):
    marker = str(df.iloc[i, 1]).strip()
    if marker == 'text':  # option text
        options[question_idx][current_option_id] = str(df.iloc[i, 2]).replace('\n', '')
    elif marker == 'justification':  # justification of the current option
        justifications[question_idx][current_option_id] = str(df.iloc[i, 2]).replace('\n', '')
    elif marker == 'rank':  # rank closes the current option
        ranks[question_idx][current_option_id] = df.iloc[i, 2]
        current_option_id += 1
    elif str(df.iloc[i, 0]).strip() == 'question':  # start of a new question
        current_option_id = 0
        question_idx += 1
        options.append(["None", "None", "None", "None"])
        justifications.append(["None", "None", "None", "None"])
        ranks.append(["None", "None", "None", "None"])
        questions.append(df.iloc[i, 1].replace('\n', ''))

# Combine processed data into one flat record per question.
for i in range(len(questions)):
    record = [i, questions[i]]
    for slot in range(4):
        record.extend([options[i][slot], ranks[i][slot], justifications[i][slot]])
    multiple_choices.append(record)

# Convert the records into a pandas DataFrame
df = pd.DataFrame(multiple_choices, columns=columns)

dict_data = df.to_dict(orient='records')

# Step 3: Build, for every question, the formatted option strings
# ('choice_X'), the cleaned raw options ('Choice X'), the number of real options
# and the 'answer' (the formatted option whose rank is 1).
choices = ['A','B','C','D']
for row in dict_data:
    options_num = 0
    row['question'] = row['question'].replace('\n', "")
    for choice in choices:
        rank = row[f'metadata {choice}.rank']
        if pd.notna(rank) and rank != "None":
            answer_choice = _clean_mc_text(row[f'Choice {choice}'])
            answer_rank = int(rank)
            answer_justification = _clean_mc_text(row[f'metadata {choice}.justification'])
            answer = f"{choice}) {answer_choice} - Justification: {answer_justification} - Rank: {answer_rank}".replace('..','.').replace('. . ','.')
            row[f'choice_{choice}'] = answer
            row[f'Choice {choice}'] = answer_choice
            options_num += 1
            if answer_rank == 1:
                row['answer'] = answer
        else:
            # Unused option slot
            row[f'choice_{choice}'] = 'None'
            row[f'Choice {choice}'] = 'None'
    row['options_num'] = options_num
    if 'answer' not in row:
        # Without this check the problem only surfaces later as a bare KeyError
        # when the dataset is built from item["answer"].
        raise ValueError(f"Question {row['ID']} has no option with rank 1")


# Step 4: Convert the records to JSON
json_text = json.dumps(dict_data, indent=4)

# Step 5: Save the JSON to a file, then reload it as a list of record dicts
with open('output_with_nested_metadata.json','w') as json_file:
    json_file.write(json_text)

with open('output_with_nested_metadata.json', 'r') as json_file:
    json_data = json.load(json_file) # load the file as a json object

def preprocess_function_mc(examples, max_length=512):
    """
    Builds and tokenizes one (question, option) prompt per real option.

    The batch is expected to contain each question repeated once per real option
    (this is how the dataset is expanded before mapping). For the row at the
    current position all non-"None" options are emitted, then the position skips
    ahead by that many rows, so the number of outputs equals the number of input
    rows as long as a question's rows are not split across batches.

    The label of an option is the answer text when the option equals the answer,
    otherwise the empty string.

    NOTE(review): labels are tokenized separately from the prompts, so label
    tokens are not positionally aligned with `input_ids`; confirm this is what
    the training setup expects.

    Args:
        examples (dict): Batch with 'question', 'answer' and 'choice_A'..'choice_D'.
        max_length (int): Length every sequence is padded/truncated to.
            Defaults to 512, the value used previously.

    Returns:
        The tokenizer output for the prompts with an added 'labels' entry whose
        padding positions are replaced by -100.

    Raises:
        ValueError: If a row has no real option (previously an infinite loop) or
            if the number of generated prompts differs from the batch size.
    """
    questions = list(examples['question'])
    answers = list(examples['answer'])
    choices = {letter: list(examples[f'choice_{letter}']) for letter in ('A', 'B', 'C', 'D')}

    inputs = []
    labels = []

    row = 0
    while row < len(questions):
        emitted = 0
        for column in choices.values():
            option_text = column[row]
            if option_text == "None":
                continue
            emitted += 1
            inputs.append(f"Question: {questions[row]}\nOption: {option_text}\nAnswer:")
            labels.append(answers[row] if option_text == answers[row] else "")
        if emitted == 0:
            # Advancing by zero would loop forever on this row.
            raise ValueError(f"Row {row} has no valid option (all choices are 'None')")
        row += emitted

    if len(inputs) != len(questions):
        # Happens when a batch boundary cuts through the rows of one question.
        raise ValueError(
            f"Generated {len(inputs)} prompts for {len(questions)} rows; "
            "the batch must contain complete question groups"
        )

    tokenized_inputs = tokenizer(
        inputs,
        padding='max_length',
        truncation=True,
        max_length=max_length,  # Set max length based on model input limits
        return_tensors="pt",
    )

    # The deprecated `as_target_tokenizer()` context is dropped; the labels are
    # tokenized with the same call as before.
    tokenized_labels = tokenizer(
        labels,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors="pt",
    )

    labels_ids = tokenized_labels['input_ids']
    # Replace padding token id's in labels by -100 so they are ignored by the loss
    labels_ids[labels_ids == tokenizer.pad_token_id] = -100
    tokenized_inputs['labels'] = labels_ids
    return tokenized_inputs

# One row per question, built from the records reloaded from JSON.
# (Fix: "Choice C" was previously filled from item["Choice A"].)
dataset = Dataset.from_dict({
    key: [item[key] for item in json_data]
    for key in (
        "question",
        "choice_A", "choice_B", "choice_C", "choice_D",
        "answer",
        "options_num",
        "Choice A", "Choice B", "Choice C", "Choice D",
    )
})

#####

# Expand the dataset so every question appears once per real option
# ('options_num' times); preprocess_function_mc relies on this layout to emit
# exactly one prompt per row.
dataset = Dataset.from_dict({
    key: [item[key] for item in dataset for i in range(item['options_num'])]
    for key in ("question", "choice_A", "choice_B", "choice_C", "choice_D", "answer")
})

# batch_size=None hands the whole dataset to the function as a single batch, so a
# question's repeated rows can never be split across two batches (with the default
# batch size this would only hold while the dataset stays below that size).
tokenized_dataset = dataset.map(preprocess_function_mc, batched=True, batch_size=None)
multiple_choice_tokenized_dataset = tokenized_dataset.remove_columns(['question', 'choice_A', 'choice_B', 'choice_C', 'choice_D'])
multiple_choice_tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
# Last expression of the cell: displays the dataset summary.
multiple_choice_tokenized_dataset

Map:   0%|          | 0/445 [00:00<?, ? examples/s]

Dataset({
    features: ['answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 445
})

Creation of Merged Dataset

In [None]:
def map_to_unified(example):
    """
    Project one example onto the column set shared by all three datasets.

    'text' is taken from the example's 'text' entry, falling back to its
    'question' entry when 'text' is missing or empty. 'labels' is None when the
    example has none; 'input_ids' and 'attention_mask' are required.
    """
    unified_text = example.get('text')
    if not unified_text:
        unified_text = example.get('question')

    return {
        'text': unified_text,
        'labels': example.get('labels'),
        'input_ids': example['input_ids'],
        'attention_mask': example['attention_mask'],
    }

# Bring the three tokenized datasets to the same columns
# ('text', 'input_ids', 'attention_mask', 'labels') and drop the columns that are
# specific to each question type.
dataset1 = true_false_tokenized_dataset.map(map_to_unified).remove_columns(['statement', 'label'])
dataset2 = open_ended_questions_tokenized.map(map_to_unified).remove_columns(['question', 'answer'])
# NOTE(review): the multiple-choice dataset had its 'question' column removed
# before this point and has no 'text' column, so map_to_unified appears to yield
# text=None for these rows — confirm this is intended.
dataset3 = multiple_choice_tokenized_dataset.map(map_to_unified).remove_columns(['answer'])

# Single training set containing the rows of all three question types.
merged_dataset = concatenate_datasets([dataset1, dataset2,dataset3])
# Last expression of the cell: displays the dataset summary.
merged_dataset


Map:   0%|          | 0/61 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

Map:   0%|          | 0/445 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 519
})

Training of Model

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Fine-tune the LoRA-adapted model on the merged dataset.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = merged_dataset,
    # Reuse the sequence length the model was loaded with instead of repeating
    # the literal, so the two settings cannot drift apart.
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 4, # effective batch size = 4 * 4 = 16
        warmup_steps = 5,
        num_train_epochs = 50, # 50 full passes over the merged dataset
        # max_steps = 1000,
        learning_rate = 4e-4,
        # Use bf16 where the GPU supports it, otherwise fall back to fp16.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 5,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)
trainer_stats = trainer.train()

# Save the fine-tuned model and, next to it, the tokenizer, so that
# './unified_model' is self-contained when it is reloaded in the API cell.
model.save_pretrained('./unified_model')
tokenizer.save_pretrained('./unified_model')


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 519 | Num Epochs = 50
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 1,600
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
5,3.0009
10,1.9485
15,1.4904
20,1.42
25,1.3554
30,1.2104
35,1.0315
40,0.9247
45,0.9839
50,0.8562


KeyboardInterrupt: 

Creation of API (Unified Approach)

In [None]:
"""
This module processes multiple types of questions (True/False, Open-Ended, and Multiple-Choice) and provides utilities for structured data extraction and interaction via a Gradio interface.

Features:
- Structured data parsing from XML-like, Markdown-like, and raw text formats.
- True/False evaluation using a fine-tuned model.
- Open-Ended question answering with detailed responses.
- Multiple-Choice question answering with selected choice and justification.

Modules:
    - unsloth: Provides FastLanguageModel for efficient model inference.
    - torch: For PyTorch-based tensor operations.
    - gradio: For creating the user-friendly interface.
    - xml.etree.ElementTree: For XML parsing.
    - re: For regular expressions to process structured text.

Functions:
    extract_from_xml_et(text): Parses XML-like strings.
    extract_choice(text): Extracts the choice label (e.g., A), B)).
    extract_justification(text): Extracts the justification text.
    extract_from_markdown_regex(text): Extracts choice and justification from Markdown-like text.
    extract_fields(text): Combines various parsing methods for structured data extraction.
    generate_response_true_false(instruction): Evaluates True/False questions.
    generate_response_open_ended(instruction): Answers Open-Ended questions.
    generate_response_multiple_choice(question, choice_A, choice_B, choice_C, choice_D): Handles Multiple-Choice questions.
    true_false_greet(question): Interface function for True/False questions.
    open_ended_greet(question): Interface function for Open-Ended questions.
    multiple_choice_greet(question, choice_A, choice_B, choice_C, choice_D): Interface function for Multiple-Choice questions.
"""

from unsloth import FastLanguageModel
import torch
import gradio as gr
import xml.etree.ElementTree as ET
import re

def extract_from_xml_et(text: str) -> dict:
    """
    Read tag/value pairs out of an XML-like fragment.

    The fragment is wrapped in a synthetic <root> element so that several
    sibling elements can be parsed at once. Only direct children with
    non-empty text are kept; their text is stripped of surrounding whitespace
    and double quotes.

    Args:
        text (str): Fragment such as '<tag>value</tag>'.

    Returns:
        dict: Lower-cased tag names mapped to their cleaned text (empty when
        the fragment contains no such elements).
        None: When the wrapped fragment is not well-formed XML.

    Example:
        >>> extract_from_xml_et('<key>"value"</key>')
        {'key': 'value'}
    """
    try:
        root = ET.fromstring(f"<root>{text}</root>")
    except ET.ParseError:
        return None

    return {
        child.tag.lower(): child.text.strip().strip('"')
        for child in root
        if child.text
    }

def extract_choice(text: str) -> str:
    """
    Find the first choice label of the form "A)" .. "D)" in a text block.

    Args:
        text (str): Text to search.

    Returns:
        str: The first matching label including the closing parenthesis,
        or None when the text contains no such label.

    Example:
        >>> extract_choice("A) This is a sample choice.")
        'A)'
    """
    found = re.search(r'([A-D]\))', text)
    return found.group(1).strip() if found else None

def extract_justification(text: str) -> str:
    """
    Find the text following the first "Justification:" marker in a text block.

    The marker may be preceded by "- ". The captured text runs to the end of
    the marker's line and is returned without surrounding whitespace.

    Args:
        text (str): Text to search.

    Returns:
        str: The justification text, or None when no marker followed by text
        is found.

    Example:
        >>> extract_justification("- Justification: This is the reason.")
        'This is the reason.'
    """
    found = re.search(r'(?:- )?Justification:\s*(.+)', text)
    return found.group(1).strip() if found else None

def extract_from_markdown_regex(text: str) -> dict:
    """
    Extracts structured data from Markdown-like text blocks.

    The choice is the text after "**choice**:" up to the next
    "**justification**" marker or the end of the line. (The previous pattern
    ended in a bare lazy group, which always captured exactly one character,
    so a choice such as "A)" was truncated to "A".) The justification is the text
    after "**justification**:" up to the next "**choice**" marker or the end
    of the text.

    Args:
        text (str): Input text containing **choice** and **justification** fields.

    Returns:
        dict: A dictionary containing "choice" and "justification" (both stripped),
        or None if either field is missing.

    Example:
        >>> text = "**choice**: A **justification**: This is the reason."
        >>> extract_from_markdown_regex(text)
        {'choice': 'A', 'justification': 'This is the reason.'}
    """
    # Lazy capture bounded by a lookahead so it grows until the next field,
    # a line break, or the end of the text.
    choice_pattern = r'\*\*choice\*\*:\s*(.+?)(?=\s*(?:\*\*justification\*\*|\n|$))'
    justification_pattern = r'\*\*justification\*\*:\s*([\s\S]+?)(?=\*\*choice\*\*|$)'
    choice_match = re.search(choice_pattern, text)
    justification_match = re.search(justification_pattern, text)

    if choice_match and justification_match:
        return {
            "choice": choice_match.group(1).strip(),
            "justification": justification_match.group(1).strip()
        }
    return None

def extract_fields(text: str) -> list:
    """
    Extract structured entries from text made of blank-line-separated blocks.

    Each block is tried against three extractors in order, and the first one
    that yields data wins:
        1. XML-like parsing (extract_from_xml_et).
        2. "A)"-style choice and/or "Justification:" text
           (extract_choice / extract_justification); the entry then holds both
           keys, with None for whichever was not found.
        3. Markdown-like **choice** / **justification** fields
           (extract_from_markdown_regex).
    Blocks that match none of them contribute nothing.

    Args:
        text (str): Input text containing one or more blocks.

    Returns:
        list: One dictionary per block that produced data, in block order.
    """
    results = []

    for chunk in re.split(r'\n\s*\n', text.strip()):
        # 1) XML-like content
        parsed_xml = extract_from_xml_et(chunk)
        if parsed_xml:
            results.append(parsed_xml)
            continue

        # 2) Plain "A) ... Justification: ..." content
        found_choice = extract_choice(chunk)
        found_justification = extract_justification(chunk)
        if found_choice or found_justification:
            results.append({"choice": found_choice, "justification": found_justification})
            continue

        # 3) Markdown-like content
        parsed_markdown = extract_from_markdown_regex(chunk)
        if parsed_markdown:
            results.append(parsed_markdown)

    return results

### Model Initialization ###
# Reload model and tokenizer from './unified_model', the directory the training
# cell writes with model.save_pretrained. This rebinds the notebook-level `model`
# and `tokenizer` names that the generate_response_* functions below read.
model, tokenizer = FastLanguageModel.from_pretrained('./unified_model')

def generate_response_true_false(instruction: str) -> str:
    """
    Ask the model whether a statement is true or false.

    Relies on the module-level ``model`` and ``tokenizer`` and runs on CUDA.

    Args:
        instruction (str): The statement to evaluate.

    Returns:
        str: "True" or "False" when the generated text starts with one of
        those words (case-insensitive, ignoring leading whitespace, quotes or
        other punctuation); otherwise "Unable to determine.".
    """
    FastLanguageModel.for_inference(model)
    prompt = f"""### Instruction:
    Determine if the following statement is true or false. Respond only with "True" or "False".

    ### Statement:
    {instruction}

    ### Answer:"""

    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=50)

    # generate() returns prompt + continuation. Decode only the continuation
    # so that neither the statement itself nor a later "### Answer:" section
    # invented by the model can be mistaken for the verdict.
    prompt_length = inputs["input_ids"].shape[1]
    answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Up to 50 new tokens are generated, so the verdict is usually followed by
    # punctuation or extra text; an exact comparison against "true"/"false"
    # would reject answers such as "True." or "False\n...". Accept the verdict
    # when it is the first word of the answer.
    verdict = re.match(r"\W*(true|false)\b", answer, flags=re.IGNORECASE)
    if verdict:
        return verdict.group(1).capitalize()
    return "Unable to determine."

# Function to generate responses for open-ended questions
def generate_response_open_ended(instruction, max_new_tokens=200):
    """
    Generate a free-text answer to a question with the fine-tuned model.

    Relies on the module-level ``model`` and ``tokenizer`` and runs on CUDA.

    Args:
        instruction (str): The question to answer.
        max_new_tokens (int): Upper bound on the number of tokens generated
            after the prompt. Defaults to 200.

    Returns:
        str: The decoded text that follows the first "### Answer:" marker.
        The text is returned as decoded; no parsing or fallback is applied.
    """
    FastLanguageModel.for_inference(model)  # Enable faster inference within the function

    # Create the prompt for the model
    prompt = f"""### Instruction:
    Answer the provided question with the knowledge provided to you
    ### Question:
    {instruction}

    ### Answer:
    """

    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            # early_stopping / length_penalty are documented by transformers
            # as beam-search controls; they are kept as originally configured.
            early_stopping=False,
            # min_length counts prompt tokens as well as generated ones.
            min_length=50,
            length_penalty=2,
            # Budget the answer independently of the prompt. The previous
            # max_length=200 capped prompt + answer together, so a long
            # question left little or no room for the answer.
            max_new_tokens=max_new_tokens,
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract the answer from the generated response
    response = response.split('### Answer:')[1]
    return response

# Function to generate responses for multiple-choice questions
def generate_response_multiple_choice(question, choice_A, choice_B, choice_C, choice_D):
    """
    Answer a four-option multiple-choice question with the fine-tuned model.

    Relies on the module-level ``model`` and ``tokenizer``, runs on CUDA, and
    parses the generated text with ``extract_fields``. When several blocks are
    parsed, the last one is used.

    Args:
        question (str): The question to be answered.
        choice_A (str): Option A.
        choice_B (str): Option B.
        choice_C (str): Option C.
        choice_D (str): Option D.

    Returns:
        dict: A dictionary with the selected choice and its justification.
              Example:
              {
                  "choice": "A",
                  "justification": "Explanation for why Option A is correct."
              }
              When nothing can be parsed, or a field is missing or empty in
              the parsed result, that field falls back to:
              {
                  "choice": "None",
                  "justification": "Could not parse JSON"
              }
    """
    instruction = f'''{question}
  Choices:
  A) {choice_A},
  B) {choice_B},
  C) {choice_C},
  D) {choice_D}
    '''

    FastLanguageModel.for_inference(model)  # Enable faster inference

    # Define the prompt
    prompt = f"""### Instruction:
    In the following question, you are provided with 4 choices. Select the best choice based on the knowledge provided and provide a justification for that choice.

    **You must return only your response with the following keys:**
      - "choice": The best choice letter
      - "justification": The justification for your choice

    **Example Response:**
      **choice**: A
      **justification**: Explanation for why Option A is correct

    ### Question:
    {instruction}

    ### Answer:
    """

    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    # Generate a response from the model
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            # early_stopping / length_penalty are documented by transformers
            # as beam-search controls; they are kept as originally configured.
            early_stopping=True,
            # min_length counts prompt tokens as well as generated ones.
            min_length=50,
            length_penalty=2,
            do_sample=True,
            max_new_tokens=300,
            top_p=0.95,
            top_k=50,
            temperature=0.65,
            num_return_sequences=1
        )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Keep only what follows the first "### Answer:" marker; this drops the
    # example response embedded in the prompt.
    answer_text = decoded.split('### Answer:')[1]

    fallback = {"choice": "None", "justification": "Could not parse JSON"}
    parsed_blocks = extract_fields(answer_text)
    if not parsed_blocks:
        return fallback

    # extract_fields may hand back an entry in which only one of the two
    # fields was recognised (the other is None), so fill gaps from the
    # fallback instead of indexing blindly or surfacing a bare None.
    last_block = parsed_blocks[-1]
    return {
        "choice": last_block.get("choice") or fallback["choice"],
        "justification": last_block.get("justification") or fallback["justification"],
    }

# Helper functions for different question types
def true_false_greet(question):
    """
    Gradio handler for the True/False panel.

    Args:
        question (str): The statement typed by the user.

    Returns:
        str: "No question was given to answer" when the input is missing,
        empty or whitespace-only; otherwise the result of
        ``generate_response_true_false`` followed by "!".
    """
    # Blank or whitespace-only input carries nothing to evaluate, so skip the
    # model call entirely.
    if question is None or not question.strip():
        return "No question was given to answer"

    response = generate_response_true_false(question)
    return f"{response}!"

def open_ended_greet(question):
    """
    Gradio handler for the open-ended panel.

    Args:
        question (str): The question typed by the user.

    Returns:
        str: "No question was given to answer" when the input is missing,
        empty or whitespace-only; otherwise the result of
        ``generate_response_open_ended`` followed by "!".
    """
    # Blank or whitespace-only input carries nothing to answer, so skip the
    # model call entirely.
    if question is None or not question.strip():
        return "No question was given to answer"

    response = generate_response_open_ended(question)
    return f"{response}!"

def multiple_choice_greet(question, choice_A, choice_B, choice_C, choice_D):
    """
    Gradio handler for the multiple-choice panel.

    Args:
        question (str): The input question.
        choice_A (str): Option A.
        choice_B (str): Option B.
        choice_C (str): Option C.
        choice_D (str): Option D.

    Returns:
        str: "No question was given to answer" when the question is missing,
        empty or whitespace-only; "No choice was given" when all four options
        are missing, empty or whitespace-only; otherwise the selected choice
        and its justification on two lines.
    """
    if question is None or not question.strip():
        return "No question was given to answer"

    # Only refuse when every option is blank; a partially filled set of
    # options is still forwarded to the model.
    options = (choice_A, choice_B, choice_C, choice_D)
    if all(option is None or not option.strip() for option in options):
        return "No choice was given"

    response = generate_response_multiple_choice(question, choice_A, choice_B, choice_C, choice_D)
    return f"Selected Choice: {response['choice']}\nJustification: {response['justification']}"

# Functions to toggle visibility of interfaces
def show_true_false_interface():
    """Return three visibility updates: first shown, second and third hidden.

    The navigation buttons wire these to the (true/false, open-ended,
    multiple-choice) panels in that order.
    """
    return tuple(gr.update(visible=shown) for shown in (True, False, False))

def show_open_ended_interface():
    """Return three visibility updates: second shown, first and third hidden.

    The navigation buttons wire these to the (true/false, open-ended,
    multiple-choice) panels in that order.
    """
    return tuple(gr.update(visible=shown) for shown in (False, True, False))

def show_multiple_choice_interface():
    """Return three visibility updates: third shown, first and second hidden.

    The navigation buttons wire these to the (true/false, open-ended,
    multiple-choice) panels in that order.
    """
    return tuple(gr.update(visible=shown) for shown in (False, False, True))

# Gradio UI setup
# Three question-type panels live on one page and the navigation buttons
# toggle which one is visible. Every widget has its own name so that no panel
# rebinds (and silently shadows) the components of another panel.
with gr.Blocks() as demo:

    # Navigation buttons
    with gr.Row():
        btn_t_f = gr.Button('True/False questions')
        btn_open_ended = gr.Button('Open-Ended questions')
        btn_m_c = gr.Button('Multiple-Choice questions')

    # True/False questions interface (the only panel visible on load)
    with gr.Column(visible=True) as true_false_interface:
        gr.Markdown("## True-False Template")
        question_tf = gr.Textbox(label="Enter your question")
        output_tf = gr.Textbox(label="Output", interactive=False)
        submit_tf = gr.Button("Submit")
        submit_tf.click(true_false_greet, inputs=question_tf, outputs=output_tf)

    # Open-ended questions interface
    with gr.Column(visible=False) as open_ended_interface:
        gr.Markdown("## Open Ended Template")
        question_open = gr.Textbox(label="Enter your question")
        output_open = gr.Textbox(label="Output", interactive=False)
        submit_open = gr.Button("Submit")
        submit_open.click(open_ended_greet, inputs=question_open, outputs=output_open)

    # Multiple-choice questions interface
    with gr.Column(visible=False) as mc_interface:
        gr.Markdown("## Multiple-Choice Template")
        question_mc = gr.Textbox(label="Enter your question")
        choice_A = gr.Textbox(label="Choice A")
        choice_B = gr.Textbox(label="Choice B")
        choice_C = gr.Textbox(label="Choice C")
        choice_D = gr.Textbox(label="Choice D")
        mc_output = gr.Textbox(label="Output", interactive=False)
        submit_mc = gr.Button("Submit")
        submit_mc.click(multiple_choice_greet, inputs=[question_mc, choice_A, choice_B, choice_C, choice_D], outputs=mc_output)

    # Navigation button functionality: each handler returns one visibility
    # update per panel, in the order listed here.
    panels = [true_false_interface, open_ended_interface, mc_interface]
    btn_t_f.click(show_true_false_interface, outputs=panels)
    btn_open_ended.click(show_open_ended_interface, outputs=panels)
    btn_m_c.click(show_multiple_choice_interface, outputs=panels)

# Launch the Gradio app
demo.launch()



Creation of API (Chat-Bot Approach)

In [None]:
# Import necessary libraries
from unsloth import FastLanguageModel
import torch
import gradio as gr

# Provide a reference to documentation for further information
# NOTE(review): the string below is a bare expression statement with no
# runtime effect. Nothing visible in this cell calls the huggingface_hub
# Inference API (the model is loaded locally below), so this looks like
# template residue - confirm before keeping it.
"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""

# Load the pre-trained language model and tokenizer
# Both are module-level globals read directly by generate_response() below.
# Loaded from the local './unified_model' path - presumably written by an
# earlier save step in this notebook; verify the path exists before running.
model, tokenizer = FastLanguageModel.from_pretrained('./unified_model')

# Function to generate a response
def generate_response(instruction, chat_history=None):
    """
    Generate a chat reply with the fine-tuned model.

    Relies on the module-level ``model`` and ``tokenizer`` and runs on CUDA.

    Args:
        instruction (str): The user's input question or instruction.
        chat_history (dict, optional): Conversation history. Accepted for
            interface compatibility but not used: the prompt is built from
            ``instruction`` alone.

    Returns:
        str: The decoded text that follows the last "### Answer:" marker.
    """
    # Switch the Unsloth model into inference mode before generating.
    FastLanguageModel.for_inference(model)

    # Create a prompt for the model
    prompt = f"""### Instruction:
    Answer the following question.

    ### Question:
    {instruction}

    Provide a unique, concise, and non-repetitive answer.
    ### Answer:"""

    # Tokenize the prompt and move it to GPU for inference
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            # early_stopping / length_penalty are documented by transformers
            # as beam-search controls; they are kept as originally configured.
            early_stopping=True,
            # min_length counts prompt tokens as well as generated ones.
            min_length=50,
            length_penalty=2,
            do_sample=True,
            max_new_tokens=300,
            top_p=0.95,
            top_k=50,
            temperature=0.7,
            repetition_penalty=1.2,
            num_return_sequences=1
        )

    # Decode the generated response and extract the answer
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    response = response.split("### Answer:")[-1]
    return response

# Function to update chat history
def update_chat_history(chat_history, user_message, bot_message, max_interactions=5):
    """
    Append one exchange to the chat history and trim it to a bounded size.

    The dictionary is modified in place and also returned.

    Args:
        chat_history (dict): Dictionary with 'user' and 'bot' lists.
        user_message (str): The latest message from the user.
        bot_message (str): The latest response from the bot.
        max_interactions (int): Maximum number of most recent entries kept in
            each list. Defaults to 5.

    Returns:
        dict: The same ``chat_history`` object, trimmed to the last
        ``max_interactions`` entries per list.

    Raises:
        ValueError: If ``max_interactions`` is negative.
    """
    if max_interactions < 0:
        raise ValueError("max_interactions must be non-negative")

    chat_history['user'].append(user_message)
    chat_history['bot'].append(bot_message)

    # Trimming is triggered by the length of the user list. Slicing from an
    # explicit start index (rather than [-n:]) keeps n == 0 meaning "keep
    # nothing" instead of "keep everything".
    if len(chat_history['user']) > max_interactions:
        for role in ('user', 'bot'):
            turns = chat_history[role]
            chat_history[role] = turns[max(len(turns) - max_interactions, 0):]

    return chat_history

# Main chatbot function
def chatbot(input_text, chat_history):
    """
    Gradio handler: answer one user message and extend the displayed history.

    Args:
        input_text (str): The user's input message.
        chat_history (list): Previous (user message, bot message) pairs. A
            value of None is treated as an empty history.

    Returns:
        str: An empty string (clears the input box).
        list: The chat history. For a non-blank message it is the list that
        was passed in, extended in place with the new
        ("User: " + message, reply) pair; for a blank message it is returned
        unchanged.
    """
    if chat_history is None:
        chat_history = []

    # A missing, empty or whitespace-only submission is not sent to the model;
    # the history is left untouched and the input box is still cleared.
    if input_text is None or not input_text.strip():
        return "", chat_history

    # Re-shape the (user, bot) pairs into the per-role lists that
    # generate_response takes as its second argument.
    messages = {
        "user": [user_msg for user_msg, _ in chat_history],
        "bot": [bot_msg for _, bot_msg in chat_history],
    }

    # Generate a response based on user input
    bot_response = generate_response(input_text, messages)

    # Only the displayed history is extended. The per-role dict above is a
    # throwaway local, so updating or trimming it after generation would have
    # no observable effect.
    chat_history.append(("User: " + input_text, bot_response))

    return "", chat_history

# Build the Gradio interface
# Components are declared in the order they should appear on the page:
# heading, message box, submit button, then the conversation view.
with gr.Blocks() as demo:
    gr.Markdown('## AILA INTERFACE DEMO')

    # Input section for user messages
    with gr.Row():
        user_input = gr.Textbox(
            placeholder="Type your message here...",
            label="Your Message",
            lines=1
        )

    # Button to submit user input
    submit_button = gr.Button('Submit')

    # Display chat history
    # This component is both an input and an output of chatbot(): its current
    # value is passed in and the list chatbot() returns is rendered back.
    chat_history = gr.Chatbot()

    # Link the button to the chatbot function
    # chatbot() returns ("", history): the empty string is written back to
    # user_input, which clears the message box after each submission.
    submit_button.click(
        chatbot,
        inputs=[user_input, chat_history],
        outputs=[user_input, chat_history]
    )

# Launch the interface
demo.launch()
