In [None]:
### Install and import libraries
!pip install pandas openpyxl
!pip install dataset
!pip install unsloth
!pip install gradio
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

import pandas as pd
import json
from google.colab import files
import numpy as np
from datasets import Dataset
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from datasets import Dataset,concatenate_datasets

In [None]:
### Model download and LoRA adapter setup

# Loader settings reused by the from_pretrained call below.
max_seq_length = 2048  # maximum sequence length handed to the loader
dtype = None           # None lets the loader pick the dtype automatically
load_in_4bit = True    # load 4-bit quantized weights; may be set to False

# Reference list of pre-quantized checkpoints (more at https://huggingface.co/unsloth).
# Only the model named in the from_pretrained call below is actually downloaded.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",
    "unsloth/Mistral-Small-Instruct-2409",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",
    "unsloth/Llama-3.2-1B-bnb-4bit",
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
]

# Download the base model together with its tokenizer.
# Gated checkpoints additionally need a `token="hf_..."` argument.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the base model with rank-16 LoRA adapters.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

Preprocessing of True-False Questions

In [None]:
"""
Script for processing True/False questions from an Excel file, saving as JSON, and converting to a Hugging Face dataset.
The script performs the following:
1. Reads data from an Excel file containing True/False questions and metadata.
2. Processes and formats the data into structured columns.
3. Saves the data as a JSON file and reloads it for further processing.
4. Converts the data into a Hugging Face Dataset format.
5. Formats and tokenizes the dataset for model training.

Modules:
    pandas: For reading and processing Excel data.
    json: For saving and loading JSON files.
    datasets: For creating and managing Hugging Face datasets.

Functions:
    format_examples: Formats dataset examples for input text creation.
    tokenize_function: Tokenizes input text using the tokenizer.

Usage:
    - Ensure the Excel file "Teliko.xls" exists in the working directory with the correct sheet and columns.
    - Replace `tokenizer` with the appropriate tokenizer for your model.
"""

# Step 1: read the 'True false questions' sheet of the workbook into a DataFrame.
df = pd.read_excel("Teliko.xls", sheet_name='True false questions')

# Column layout of the flattened output frame.
columns = ['id', 'question', 'answer', 'metadata.justification']

# The second header cell of the sheet is used as the first question.
questions = [df.columns[1].replace('\n', ' ')]
true_false = []     # lower-cased answers, one per question
justification = []  # justification values, one per question

# Step 2: column 1 tags each row; column 2 carries the value for tagged rows.
for row_idx in range(len(df)):
    tag = df.iloc[row_idx, 1]
    if tag == 'rank':
        continue  # rank rows carry nothing that is kept
    if tag == 'text':
        true_false.append(str(df.iloc[row_idx, 2]).lower())
    elif tag == 'justification':
        justification.append(df.iloc[row_idx, 2])
    else:
        # Any untagged row is a question; newlines become spaces.
        questions.append(tag.replace('\n', ' '))

# One record per question: [id, question, answer, justification].
indexs = [
    [n, questions[n], true_false[n], justification[n]]
    for n in range(len(questions))
]
df = pd.DataFrame(indexs, columns=columns)

# Step 3: write the records to 'output_with_nested_metadata.json'.
with open('output_with_nested_metadata.json', 'w') as json_file:
    json_file.write(df.to_json(orient='records', indent=4))

# Step 4: read the same file back as a list of dicts.
with open('output_with_nested_metadata.json', 'r') as json_file:
    json_data = json.load(json_file)

# Step 5: build a Hugging Face Dataset with the question text under
# `statement` and the true/false answer under `label`.
statements = []
answer_labels = []
for item in json_data:
    statements.append(item["question"])
    answer_labels.append(item["answer"])
dataset = Dataset.from_dict({"statement": statements, "label": answer_labels})

# Build the training text for a batch of true/false examples
def format_examples(examples):
    """
    Turn each (statement, label) pair of a batch into one text string.

    Args:
        examples (dict): A batch with parallel 'statement' and 'label' lists.

    Returns:
        dict: {'text': [...]} with one 'Question: ... Answer: ...' string per pair.
    """
    texts = []
    for statement, verdict in zip(examples['statement'], examples['label']):
        texts.append(f'Question: {statement} Answer: {verdict}')
    return {'text': texts}

# Define a function to tokenize the formatted examples
def tokenize_function(examples):
    """
    Tokenizes the formatted text and builds causal-LM labels aligned with it.

    The previous version tokenized the answer on its own and used that as
    `labels`. For a decoder-only model the labels must line up token-for-token
    with `input_ids` (the model shifts them internally), so a separately
    tokenized answer supervises the wrong positions. The labels are now a copy
    of `input_ids` with padding positions set to -100 so the loss ignores them.
    The deprecated `tokenizer.as_target_tokenizer()` context is no longer used.

    Args:
        examples (dict): A batch of examples containing a 'text' list, where
            each entry already holds the question followed by its answer.

    Returns:
        BatchEncoding: 'input_ids' and 'attention_mask' padded/truncated to
        512 tokens, plus 'labels' of the same shape.
    """
    tokenized_inputs = tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=512,  # Set max length based on model input limits
        return_tensors="pt",
    )

    # Mask by attention_mask rather than pad_token_id: this still works when
    # the pad token is the same id as a real token such as EOS.
    labels_ids = tokenized_inputs['input_ids'].clone()
    labels_ids[tokenized_inputs['attention_mask'] == 0] = -100
    tokenized_inputs['labels'] = labels_ids
    return tokenized_inputs

# Step 6: add the formatted 'text' column, then tokenize every example.
dataset = dataset.map(format_examples, batched=True)
true_false_tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Leave the tokenized dataset as the cell's last expression so its summary is displayed.
true_false_tokenized_dataset

Preprocessing of Open-Ended Questions

In [None]:
"""
Script for processing open-ended questions from an Excel file, converting to JSON, and preparing for model training.
The script performs the following:
1. Reads data from an Excel file containing open-ended questions and answers.
2. Processes and formats the data into structured columns.
3. Saves the data as a JSON file and reloads it for further processing.
4. Converts the data into a Hugging Face Dataset format.
5. Tokenizes and prepares the dataset for language model training.

Modules:
    pandas: For reading and processing Excel data.
    json: For saving and loading JSON files.
    datasets: For creating and managing Hugging Face datasets.
    DataCollatorForLanguageModeling: For collating and batching data for language model training.

Functions:
    format_examples: Formats dataset examples into text input-output pairs.
    tokenize_function: Tokenizes the text inputs for the model.

Usage:
    - Ensure the Excel file "Teliko.xls" exists in the working directory with the correct sheet and columns.
    - Replace `tokenizer` with the appropriate tokenizer for your language model.
"""

import pandas as pd
import json
from datasets import Dataset
from transformers import DataCollatorForLanguageModeling

# Step 1: read the 'open ended questions' sheet of the workbook into a DataFrame.
df = pd.read_excel("Teliko.xls", sheet_name='open ended questions')

# Column layout of the flattened output frame.
columns = ['id', 'question', 'answer']

# The second header cell of the sheet is used as the first question.
questions = [df.columns[1].replace('\n', ' ')]
answers = []

# Step 2: column 0 tags each row, column 1 holds the text.
# Note the trailing space in the 'question ' tag; it is matched literally.
for row_idx in range(len(df)):
    tag = df.iloc[row_idx, 0]
    if tag == 'answer':
        answers.append(df.iloc[row_idx, 1].replace('\n', ''))
    elif tag == 'question ':
        questions.append(df.iloc[row_idx, 1].replace('\n', ''))

# One record per question: [id, question, answer].
indexs = [[n, questions[n], answers[n]] for n in range(len(questions))]
df = pd.DataFrame(indexs, columns=columns)

# Step 3: convert to plain dicts and strip any newline left in the text fields.
dict_data = df.to_dict(orient='records')
for row in dict_data:
    for field in ('question', 'answer'):
        row[field] = row[field].replace('\n', "")

# Steps 4-5: serialise the records and write them to
# 'output_with_nested_metadata.json'.
json_data = json.dumps(dict_data, indent=4)
with open('output_with_nested_metadata.json', 'w') as json_file:
    json_file.write(json_data)

# Step 6: read the same file back as a list of dicts.
with open('output_with_nested_metadata.json', 'r') as json_file:
    json_data = json.load(json_file)

# Build the training text for a batch of open-ended examples
def format_examples(examples):
    """
    Turn each (question, answer) pair of a batch into one text string.

    Args:
        examples (dict): A batch with parallel 'question' and 'answer' lists.

    Returns:
        dict: {'text': [...]} with one 'Question: ... Answer: ...' string per pair.
    """
    texts = []
    for question, reply in zip(examples["question"], examples["answer"]):
        texts.append(f"Question: {question} Answer: {reply}")
    return {'text': texts}

# Define a function to tokenize the formatted examples
def tokenize_function(examples):
    """
    Tokenizes the formatted text and builds causal-LM labels aligned with it.

    The previous version tokenized the answer on its own and used that as
    `labels`. For a decoder-only model the labels must line up token-for-token
    with `input_ids` (the model shifts them internally), so a separately
    tokenized answer supervises the wrong positions. The labels are now a copy
    of `input_ids` with padding positions set to -100 so the loss ignores them.
    The deprecated `tokenizer.as_target_tokenizer()` context is no longer used.

    Args:
        examples (dict): A batch of examples containing a 'text' list, where
            each entry already holds the question followed by its answer.

    Returns:
        BatchEncoding: 'input_ids' and 'attention_mask' padded/truncated to
        512 tokens, plus 'labels' of the same shape.
    """
    tokenized_inputs = tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=512,  # Set max length based on model input limits
        return_tensors="pt",
    )

    # Mask by attention_mask rather than pad_token_id: this still works when
    # the pad token is the same id as a real token such as EOS.
    labels_ids = tokenized_inputs['input_ids'].clone()
    labels_ids[tokenized_inputs['attention_mask'] == 0] = -100
    tokenized_inputs['labels'] = labels_ids

    return tokenized_inputs

# Step 7: build a Hugging Face Dataset with parallel question/answer columns.
question_column = []
answer_column = []
for item in json_data:
    question_column.append(item["question"])
    answer_column.append(item["answer"])
dataset = Dataset.from_dict({"question": question_column, "answer": answer_column})

# Step 8: add the formatted 'text' column, then tokenize every example.
dataset = dataset.map(format_examples, batched=True)
open_ended_questions_tokenized = dataset.map(tokenize_function, batched=True)

# Leave the tokenized dataset as the cell's last expression so its summary is displayed.
open_ended_questions_tokenized

Preprocessing of Multiple-Choice Questions

In [None]:
"""
Script for processing multiple-choice questions from an Excel file, preparing data for model training.
The script performs the following:
1. Reads multiple-choice questions and metadata from an Excel file.
2. Formats the data with options, ranks, and justifications for each question.
3. Converts the data into a JSON format and saves it to a file.
4. Prepares a Hugging Face dataset for training, including tokenization and formatting.

Modules:
    pandas: For reading and processing Excel data.
    json: For saving and loading JSON files.
    numpy: For handling numerical operations.
    datasets: For creating and managing Hugging Face datasets.
    transformers: For tokenization and formatting.

Functions:
    preprocess_function_mc: Prepares the dataset by tokenizing questions and answers and aligning labels.

Usage:
    - Ensure the Excel file "Teliko.xls" exists in the working directory with the correct sheet and columns.
    - Replace `tokenizer` with the appropriate tokenizer for your language model.
"""

import pandas as pd
import numpy as np
import json
from datasets import Dataset

# Step 1: Load the Excel file
df = pd.read_excel("Teliko.xls", sheet_name='multiple choice questions')
"""
Reads the 'multiple choice questions' sheet from the Excel file into a pandas DataFrame.
"""

# Define column names for the final DataFrame
# (one choice / rank / justification triple per option A-D)
columns = [
    'ID', 'question', 'Choice A', 'metadata A.rank', 'metadata A.justification',
    'Choice B', 'metadata B.rank', 'metadata B.justification',
    'Choice C', 'metadata C.rank', 'metadata C.justification',
    'Choice D', 'metadata D.rank', 'metadata D.justification'
]

# Initialize lists for processing
# options / justifications / ranks are parallel lists with one 4-slot entry per
# question; the string "None" marks a slot that has not been filled.
# Entry 0 belongs to the question taken from the sheet header (df.columns[1]).
multiple_choices = []
options = [["None", "None", "None", "None"]]
justifications = [["None", "None", "None", "None"]]
ranks = [["None", "None", "None", "None"]]
questions = [df.columns[1].replace('\n', '')]
id = 0  # index of the question currently being filled (NOTE: shadows the builtin `id`)
current_option_id = 0  # slot (0-3) of the option currently being filled

# Step 2: Process the DataFrame row by row
# Column 1 tags option rows ('text', 'justification', 'rank') and column 2 holds
# their value; a row whose column 0 is 'question' starts a new question.
# NOTE(review): more than four options for one question would index past the
# 4-slot lists — confirm the sheet never contains that.
for i in range(len(df)):
    if df.iloc[i, 1] in ["text", "text "]:  # Extract options
        options[id][current_option_id] = str(df.iloc[i, 2]).replace('\n', '')
    elif df.iloc[i, 1] == 'justification':  # Extract justifications
        justifications[id][current_option_id] = str(df.iloc[i, 2]).replace('\n', '')
    elif df.iloc[i, 1] in ['rank', 'rank ']:  # Extract ranks
        ranks[id][current_option_id] = df.iloc[i, 2]
        # The rank row advances to the next option slot, so it acts as the
        # terminator of an option's rows.
        current_option_id += 1
    elif df.iloc[i, 0] == 'question':  # Process new question
        current_option_id = 0
        id += 1
        options.append(["None", "None", "None", "None"])
        justifications.append(["None", "None", "None", "None"])
        ranks.append(["None", "None", "None", "None"])
        questions.append(df.iloc[i, 1].replace('\n', ''))

# Combine processed data into records
# (field order matches `columns` above)
for i in range(len(questions)):
    multiple_choices.append([
        i, questions[i], options[i][0], ranks[i][0], justifications[i][0],
        options[i][1], ranks[i][1], justifications[i][1],
        options[i][2], ranks[i][2], justifications[i][2],
        options[i][3], ranks[i][3], justifications[i][3]
    ])

# Convert the records into a pandas DataFrame
df = pd.DataFrame(multiple_choices, columns=columns)

# Step 3: flatten the frame into per-question dicts and normalise every option.
dict_data = df.to_dict(orient='records')

# Ordered substitutions applied to option and justification text: newlines,
# double quotes, a long run of spaces and several mis-encoded characters.
_text_fixes = [
    ('\n', ""),
    ('\"', "'"),
    ('                 ', ""),
    ('\u62b0', "'t"),
    ('\u62af', "'s"),
    (' \u62b3', "'v"),
    (' \ufffdor', " or"),
    ('\ufffd', ': '),
    ('\u63e7', "'Y"),
    ('\u63cc', "'H"),
]


def _scrub(text):
    """Apply every substitution in `_text_fixes` to `text`, in order."""
    for old, new in _text_fixes:
        text = text.replace(old, new)
    return text


choices = ['A', 'B', 'C', 'D']
for row in dict_data:
    row['question'] = row['question'].replace('\n', "")
    options_num = 0  # number of options that carry a usable rank
    for choice in choices:
        rank_value = row[f'metadata {choice}.rank']
        if pd.isna(rank_value) or rank_value == "None":
            # Slot without a rank: mark both views of the option as absent.
            row[f'choice_{choice}'] = 'None'
            row[f'Choice {choice}'] = 'None'
            continue

        answer_choice = _scrub(row[f'Choice {choice}'])
        answer_rank = int(rank_value)
        answer_justification = _scrub(row[f'metadata {choice}.justification'])
        answer = (
            f"{choice}) {answer_choice} - Justification: {answer_justification} - Rank: {answer_rank}"
            .replace('..', '.')
            .replace('. . ', '.')
        )
        row[f'choice_{choice}'] = answer         # full formatted option
        row[f'Choice {choice}'] = answer_choice  # cleaned option text only
        options_num += 1
        if answer_rank == 1:
            # The option ranked 1 is stored as the question's answer.
            row['answer'] = answer
    row['options_num'] = options_num

# Step 4: serialise the records.
json_data = json.dumps(dict_data, indent=4)

# Step 5: write them to 'output_with_nested_metadata.json' and read them back
# as a list of dicts.
with open('output_with_nested_metadata.json', 'w') as json_file:
    json_file.write(json_data)

with open('output_with_nested_metadata.json', 'r') as json_file:
    json_data = json.load(json_file)

def preprocess_function_mc(examples):
  """
  Builds one tokenized training example per (question, option) pair.

  The incoming batch is expected to hold each question repeated once per
  available option (options whose formatted text is the string "None" are
  skipped). The function reads the first row of each such group, emits one
  example per option, then jumps over the remaining copies, so it returns
  exactly as many rows as it received.

  Fixes over the previous version:
    * A row with no usable option made the loop advance by zero and spin
      forever; it now raises ValueError.
    * A group cut off by the end of the batch produced more outputs than
      input rows; it now raises ValueError with an actionable message.
    * The target was tokenized as a separate sequence, so `labels` did not
      line up with `input_ids` and the answer text never appeared in the
      training sequence. The target is now appended to the prompt, and the
      labels are a copy of `input_ids` with padding set to -100.
    * The deprecated `tokenizer.as_target_tokenizer()` context is gone.

  Args:
      examples (dict): Batch with 'question', 'answer' and 'choice_A' ..
          'choice_D' lists of equal length.

  Returns:
      BatchEncoding: 'input_ids', 'attention_mask' and 'labels', each
      padded/truncated to 512 tokens.

  Raises:
      ValueError: If a row has no usable option, or a question group runs
          past the end of the batch.
  """
  option_letters = ('A', 'B', 'C', 'D')
  questions = examples['question']
  answers = examples['answer']
  total_rows = len(questions)

  inputs = []
  row = 0
  while row < total_rows:
    # Formatted option texts available for the question at this row.
    present = [
        examples[f'choice_{letter}'][row]
        for letter in option_letters
        if examples[f'choice_{letter}'][row] != "None"
    ]
    if not present:
      raise ValueError(
          f"Row {row} has no usable option; each question must be repeated "
          "once per option before calling preprocess_function_mc."
      )
    if row + len(present) > total_rows:
      raise ValueError(
          "A question group is split across map batches; call "
          "Dataset.map(..., batched=True, batch_size=None) so every group "
          "stays within one batch."
      )

    for option_text in present:
      prompt = f"Question: {questions[row]}\nOption: {option_text}\nAnswer:"
      # Only the option that matches the stored answer gets a target;
      # every other option keeps an empty completion.
      target = answers[row] if option_text == answers[row] else ""
      inputs.append(f"{prompt} {target}" if target else prompt)

    # Skip the remaining copies of this question.
    row += len(present)

  tokenized_inputs = tokenizer(
      inputs,
      padding='max_length',
      truncation=True,
      max_length=512,  # Set max length based on model input limits
      return_tensors="pt",
  )

  # Labels mirror input_ids; padding positions are set to -100 so the loss
  # ignores them. Masking by attention_mask also works when the pad token
  # shares its id with a real token such as EOS.
  labels_ids = tokenized_inputs['input_ids'].clone()
  labels_ids[tokenized_inputs['attention_mask'] == 0] = -100
  tokenized_inputs['labels'] = labels_ids
  return tokenized_inputs

# Base dataset: one row per question, holding both the formatted options
# (choice_X) and the cleaned option text (Choice X).
dataset = Dataset.from_dict({
    "question": [item["question"] for item in json_data],
    "choice_A": [item["choice_A"] for item in json_data],
    "choice_B": [item["choice_B"] for item in json_data],
    "choice_C": [item["choice_C"] for item in json_data],
    "choice_D": [item["choice_D"] for item in json_data],
    "answer":   [item["answer"] for item in json_data],
    "options_num": [item["options_num"] for item in json_data],
    "Choice A": [item["Choice A"] for item in json_data],
    "Choice B": [item["Choice B"] for item in json_data],
    # Fixed: this column was previously filled from "Choice A".
    "Choice C": [item["Choice C"] for item in json_data],
    "Choice D": [item["Choice D"] for item in json_data],
})

# Repeat every question once per available option so the batched map below
# can return exactly one tokenized row per input row.
expanded_rows = [item for item in dataset for _ in range(item['options_num'])]
dataset = Dataset.from_dict({
    key: [row[key] for row in expanded_rows]
    for key in ('question', 'choice_A', 'choice_B', 'choice_C', 'choice_D', 'answer')
})

# batch_size=None passes the whole dataset as a single batch. The preprocessing
# function consumes rows in per-question groups, and the default batch size of
# 1000 could split a group across two batches.
tokenized_dataset = dataset.map(preprocess_function_mc, batched=True, batch_size=None)
multiple_choice_tokenized_dataset = tokenized_dataset.remove_columns(['question', 'choice_A', 'choice_B', 'choice_C', 'choice_D'])
multiple_choice_tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
multiple_choice_tokenized_dataset

Creation of Merged Dataset

In [None]:
def map_to_unified(example):
    """
    Project one tokenized example onto the column set shared by all three
    question datasets.

    Args:
        example (dict): A row with 'input_ids' and 'attention_mask', and
            optionally 'text', 'question' and 'labels'.

    Returns:
        dict: 'text' (falling back to 'question' when 'text' is missing or
        empty), 'labels' (None when absent), 'input_ids', 'attention_mask'.
    """
    text_value = example.get('text')
    if not text_value:
        text_value = example.get('question')

    unified = {'text': text_value}
    unified['labels'] = example.get('labels')
    unified['input_ids'] = example['input_ids']
    unified['attention_mask'] = example['attention_mask']
    return unified

# Bring each tokenized dataset to the shared column set, then drop the
# columns that only that dataset has.
dataset1 = true_false_tokenized_dataset.map(map_to_unified)
dataset1 = dataset1.remove_columns(['statement', 'label'])

dataset2 = open_ended_questions_tokenized.map(map_to_unified)
dataset2 = dataset2.remove_columns(['question', 'answer'])

dataset3 = multiple_choice_tokenized_dataset.map(map_to_unified)
dataset3 = dataset3.remove_columns(['answer'])

# Stack the three datasets in order: true/false, open-ended, multiple-choice.
merged_dataset = concatenate_datasets([dataset1, dataset2, dataset3])
merged_dataset


Training of Model

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Mixed precision: bf16 when is_bfloat16_supported() says so, otherwise fp16.
use_bf16 = is_bfloat16_supported()

# Optimisation settings for the run: 50 epochs over the merged dataset.
# Set `max_steps` instead of `num_train_epochs` to cap the run by step count.
training_args = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    num_train_epochs=50,
    learning_rate=4e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=5,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
)

# Supervised fine-tuning on the merged dataset, without sequence packing.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=merged_dataset,
    max_seq_length=2048,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)

# Train, keep the returned stats, then save the model to './unified_model'.
trainer_stats = trainer.train()
model.save_pretrained('./unified_model')
