In [None]:
### Install and import libraries
!pip install pandas openpyxl
!pip install dataset
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install sentence-transformers bert-score nltk rouge-score
import pandas as pd
import json
from google.colab import files
import numpy as np
from datasets import Dataset
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

In [None]:
### Downloading of model

# Loading options, also reused later when the trainer is configured.
max_seq_length = 2048  # Maximum sequence length passed to the loader and the trainer.
dtype = None           # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True    # Use 4bit quantization to reduce memory usage. Can be False.

# Reference list of checkpoints published under the `unsloth` organisation.
# It is informational only: nothing below reads this list.
fourbit_models = [
    # Llama 3.1
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",
    # Mistral
    "unsloth/Mistral-Small-Instruct-2409",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    # Phi
    "unsloth/Phi-3.5-mini-instruct",
    "unsloth/Phi-3-medium-4k-instruct",
    # Gemma 2
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",
    # Llama 3.2
    "unsloth/Llama-3.2-1B-bnb-4bit",
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
]  # More models at https://huggingface.co/unsloth

# Keyword arguments for the loader, collected in one place so they are easy to tweak.
pretrained_kwargs = dict(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",  # or choose "unsloth/Llama-3.2-1B"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...",  # use one if using gated models like meta-llama/Llama-2-7b-hf
)

# Returns the model together with its tokenizer; both are used by later cells.
model, tokenizer = FastLanguageModel.from_pretrained(**pretrained_kwargs)

# Projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# LoRA hyper-parameters, gathered in one mapping and forwarded as keyword arguments.
lora_kwargs = dict(
    r = 16,                                  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = lora_target_modules,
    lora_alpha = 16,
    lora_dropout = 0,                        # Supports any, but = 0 is optimized
    bias = "none",                           # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth",  # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,                      # Rank stabilized LoRA is disabled
    loftq_config = None,                     # LoftQ is disabled
)

# Rebind `model` to the PEFT-wrapped version of the base model.
model = FastLanguageModel.get_peft_model(model, **lora_kwargs)

Preprocessing of Multiple-Choice Questions as Open-Ended questions

In [None]:
"""
Script for processing multiple-choice questions from an Excel file, converting them to JSON,
and preparing them for training a language model.

The script performs the following steps:
1. Reads data from an Excel file containing multiple-choice questions, options, ranks, and justifications.
2. Processes the data to extract and format structured content.
3. Converts the data to a JSON format and saves it to a file.
4. Creates a Hugging Face dataset and preprocesses it for language model training.
5. Tokenizes the dataset and sets up a data collator for training.

Modules:
    pandas: For reading and processing Excel data.
    numpy: For handling NaN values.
    json: For saving and loading JSON data.
    re: For extracting structured content using regular expressions.
    datasets: For creating and managing Hugging Face datasets.
    transformers: For tokenization and data collation.

Functions:
    preprocess_function: Prepares questions and answers as text for language model training.
    tokenize_function: Tokenizes the input text for the model.

Usage:
    - Ensure the Excel file "Teliko.xls" exists in the working directory with the correct sheet and columns.
    - Replace `tokenizer` with the appropriate tokenizer for your language model.
"""

import pandas as pd
import numpy as np
import json
import re
from datasets import Dataset
from transformers import DataCollatorForLanguageModeling

# Step 1: Load the Excel file
# The first sheet row is consumed as the header, which is why the first
# question's text is taken from the second column *name* further below.
df = pd.read_excel("Teliko.xls", sheet_name='multiple choice questions')

# Define column names for the DataFrame
columns = [
    'ID', 'question', 'Choice A', 'metadata A.rank', 'metadata A.justification',
    'Choice B', 'metadata B.rank', 'metadata B.justification',
    'Choice C', 'metadata C.rank', 'metadata C.justification',
    'Choice D', 'metadata D.rank', 'metadata D.justification'
]

MAX_OPTIONS = 4  # one slot per choice A-D
OPTION_LABELS = ("text", "justification", "rank")


def _empty_slots():
    """Return a fresh list of "None" placeholders, one per possible choice."""
    return ["None"] * MAX_OPTIONS


def _row_label(cell):
    """Strip surrounding whitespace from string cells; return other values unchanged."""
    return cell.strip() if isinstance(cell, str) else cell


# Initialize lists for processing data
multiple_choices = []
options = [_empty_slots()]
justifications = [_empty_slots()]
ranks = [_empty_slots()]
questions = [df.columns[1].replace('\n', '')]
question_idx = 0        # index of the question currently being filled (was `id`, which shadowed the builtin)
current_option_id = 0   # slot (0-3) of the option currently being filled

# Step 2: Process the Excel rows
for i in range(len(df)):
    # Labels are compared after stripping so that "text ", "rank " and
    # "justification " (trailing space) are all recognised consistently.
    label = _row_label(df.iloc[i, 1])
    if label in OPTION_LABELS:
        if current_option_id >= MAX_OPTIONS:
            raise ValueError(
                f"Row {i}: question {question_idx} has more than {MAX_OPTIONS} options"
            )
        value = df.iloc[i, 2]
        if label == "text":
            options[question_idx][current_option_id] = str(value).replace('\n', '')
        elif label == "justification":
            justifications[question_idx][current_option_id] = str(value).replace('\n', '')
        else:
            # A rank row closes the current option, so advance to the next slot.
            ranks[question_idx][current_option_id] = value
            current_option_id += 1
    elif _row_label(df.iloc[i, 0]) == 'question':
        # A new question starts: open fresh placeholder slots for it.
        current_option_id = 0
        question_idx += 1
        options.append(_empty_slots())
        justifications.append(_empty_slots())
        ranks.append(_empty_slots())
        questions.append(df.iloc[i, 1].replace('\n', ''))

# Combine data into rows (same field order as `columns`)
for i, question_text in enumerate(questions):
    combined_row = [i, question_text]
    for slot in range(MAX_OPTIONS):
        combined_row.extend([options[i][slot], ranks[i][slot], justifications[i][slot]])
    multiple_choices.append(combined_row)

# Convert processed data into a DataFrame
df = pd.DataFrame(multiple_choices, columns=columns)

# Export the table and immediately read it back.
# NOTE(review): the round-trip is kept as-is. `to_excel` writes the index too, so the
# re-read frame gains an extra unnamed index column; and on pandas versions whose default
# NA markers include the string "None", the placeholders presumably come back as NaN,
# which the following `pd.notna` check appears to anticipate - confirm.
df.to_excel('phishing_multiple_choice_questions_new_vfinal.xlsx')
df = pd.read_excel('./phishing_multiple_choice_questions_new_vfinal.xlsx')

# Step 3: Convert the DataFrame to a dictionary
dict_data = df.to_dict(orient='records')


def _clean_text(value):
    """Return `value` as single-line text with double quotes replaced by single quotes.

    Missing values (NaN/None) become an empty string instead of raising on `.replace`.
    """
    if pd.isna(value):
        return ""
    return str(value).replace('\n', "").replace('\"', "'")


# Process the data to remove newline characters and construct answers
choices = ['A', 'B', 'C', 'D']
for row in dict_data:
    options_num = 0
    row['question'] = row['question'].replace('\n', "")
    for choice in choices:
        rank_value = row[f'metadata {choice}.rank']
        # An option counts as present only if it has a real rank (not NaN, not the placeholder).
        if pd.notna(rank_value) and rank_value != "None":
            answer_choice = _clean_text(row[f'Choice {choice}'])
            answer_rank = int(rank_value)
            answer_justification = _clean_text(row[f'metadata {choice}.justification'])
            answer = f"{choice}) {answer_choice} - Justification: {answer_justification} - Rank: {answer_rank}"
            row[f'choice_{choice}'] = answer
            row[f'Choice {choice}'] = answer_choice
            options_num += 1
            if answer_rank == 1:
                # The rank-1 option is the reference answer; collapse doubled full stops.
                row['answer'] = f"{answer_choice}. {answer_justification}".replace('..', '.').replace('. .', '.')
        else:
            row[f'choice_{choice}'] = 'None'
            row[f'Choice {choice}'] = 'None'
    # Rows without a rank-1 option still get an 'answer' key, so code that reads
    # item['answer'] for every record does not fail with KeyError.
    row.setdefault('answer', "")
    row['options_num'] = options_num

# Step 4: Save the processed data to a JSON file
# NOTE(review): NaN values left in untouched columns are written as the non-standard
# token NaN; Python's json module reads it back, strict JSON parsers may not.
json_data = json.dumps(dict_data, indent=4)
with open('output_with_nested_metadata.json', 'w') as json_file:
    json_file.write(json_data)

# Step 5: Preprocess the data for training
def preprocess_function(examples):
    """
    Prepares questions and answers as text for language model training.

    For every question the option that ends with "- Rank: 1" is selected and
    rewritten from "X) <choice> - Justification: <why> - Rank: 1" into
    "<choice>. <why>". If several options carry rank 1, the last one
    (in A-D order) wins. Questions without a usable rank-1 option get an
    empty answer.

    Args:
        examples (dict): A batch of examples from the dataset with the keys
            'question', 'choice_A', 'choice_B', 'choice_C' and 'choice_D',
            each mapping to a list of equal length.

    Returns:
        dict: {'text': [...]} with one "Question: ... Answer: ..." string per example.
    """
    rank_one_suffix = "- Rank: 1"
    answer_pattern = re.compile(r"[A-Z]\)\s(.*?)-\sRank:", re.DOTALL)

    questions = list(examples['question'])
    choice_columns = [
        list(examples[name])
        for name in ('choice_A', 'choice_B', 'choice_C', 'choice_D')
    ]

    labels = []
    for i in range(len(questions)):
        text = ""
        for column in choice_columns:
            candidate = column[i]
            # Match the suffix rather than a substring, so "Rank: 10" or the phrase
            # "Rank: 1" inside a justification is not mistaken for the best option.
            if not isinstance(candidate, str) or not candidate.rstrip().endswith(rank_one_suffix):
                continue
            match = answer_pattern.search(candidate)
            if match is None:
                # Not in the expected "X) ... - Rank:" layout; skip instead of crashing.
                continue
            body = match.group(1).strip()
            # partition() tolerates a missing justification separator.
            choice_part, _, justification_part = body.partition(' - Justification:')
            text = choice_part + '.' + justification_part
        # Collapse the punctuation artefacts produced by joining with '.'.
        labels.append(text.replace('..', '.').replace('. .', '.').replace(' .', '.'))

    texts = [f"Question: {q} Answer: {a}" for q, a in zip(questions, labels)]
    return {'text': texts}

def tokenize_function(examples):
    """
    Run the notebook-level `tokenizer` over the "text" entries of a batch.

    Args:
        examples (dict): A batch whose "text" key holds the strings to encode.

    Returns:
        Whatever the tokenizer returns for the batch, with truncation enabled.
    """
    batch_texts = examples["text"]
    encoded_batch = tokenizer(batch_texts, truncation=True)
    return encoded_batch

# Read the exported records back and turn them into a Hugging Face dataset
with open('output_with_nested_metadata.json', 'r') as json_file:
    json_data = json.load(json_file)

# Only these record fields are carried over into the dataset, in this order.
dataset_columns = ("question", 'choice_A', 'choice_B', 'choice_C', 'choice_D', 'answer')
dataset = Dataset.from_dict({
    column: [record[column] for record in json_data]
    for column in dataset_columns
})

# Step 6: Split the dataset and prepare for training
# 70% train / 30% test, with a fixed seed so the split is repeatable.
split_dataset = dataset.train_test_split(test_size=0.3, seed=0)
train_dataset, test_dataset = split_dataset['train'], split_dataset['test']

# Only the training split is turned into text and tokenized here;
# `test_dataset` is left as produced by the split.
train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)

# Collator with mlm=False, i.e. masked-language-model masking is switched off.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


Training of the model on the training split (70% of the dataset)


In [None]:
# Pick the mixed-precision mode once: bf16 when supported, otherwise fp16.
use_bf16 = is_bfloat16_supported()

# Optimisation and logging settings for the run.
training_args = TrainingArguments(
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_steps = 10,
    num_train_epochs = 100,  # Number of passes over the training split.
    # max_steps = 1000,
    learning_rate = 5e-5,
    fp16 = not use_bf16,
    bf16 = use_bf16,
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
)

# The dataset is already tokenized, so no `dataset_text_field` is passed
# and the explicit `data_collator` is supplied.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = tokenized_train_dataset,
    #dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = data_collator,
    dataset_num_proc = 2,
    packing = False,  # Packing of short sequences is disabled.
    args = training_args,
)

# Run training and keep the returned statistics for later inspection.
trainer_stats = trainer.train()

# Persist the trained model; note that the tokenizer is not saved here.
model.save_pretrained('./multiple_choice_questions_model')