In [None]:
### Install and import libraries
!pip install pandas openpyxl
!pip install dataset
!pip install gradio
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

import pandas as pd
import json
from google.colab import files
import numpy as np
from datasets import Dataset
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

In [None]:
### Download the base model

# Loading options; these names are reused when the trainer is configured.
max_seq_length = 2048  # Maximum sequence length passed to the loader and the trainer.
dtype = None           # None lets the loader pick; Float16 for Tesla T4/V100, Bfloat16 for Ampere+.
load_in_4bit = True    # 4-bit quantization to lower memory use; may be set to False.

# Reference list of pre-quantized 4-bit checkpoints published by Unsloth.
# It is informational only: the model actually loaded is named explicitly below.
# More models at https://huggingface.co/unsloth
fourbit_models = [
    # Llama 3.1
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",
    # Mistral
    "unsloth/Mistral-Small-Instruct-2409",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    # Phi
    "unsloth/Phi-3.5-mini-instruct",
    "unsloth/Phi-3-medium-4k-instruct",
    # Gemma 2
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",
    # Llama 3.2
    "unsloth/Llama-3.2-1B-bnb-4bit",
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
]

# Fetch the model together with its tokenizer.
# For gated checkpoints (e.g. meta-llama/Llama-2-7b-hf) also pass token="hf_...".
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",  # or e.g. "unsloth/Llama-3.2-1B"
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Wrap the loaded model with LoRA adapters; `model` is rebound to the wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank; any value > 0 (suggested: 8, 16, 32, 64, 128)
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,  # any value is supported, 0 is the optimized setting
    bias="none",     # any value is supported, "none" is the optimized setting
    # True or "unsloth"; "unsloth" is the lower-VRAM option intended for very long context
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,   # rank-stabilized LoRA is available but not used here
    loftq_config=None,  # LoftQ is available but not used here
)

Preprocess True-False Questions

In [None]:
"""
Script for processing True/False questions from an Excel file, saving as JSON, and converting to a Hugging Face dataset.
The script performs the following:
1. Reads data from an Excel file containing True/False questions and metadata.
2. Processes and formats the data into structured columns.
3. Saves the data as a JSON file and reloads it for further processing.
4. Converts the data into a Hugging Face Dataset format.
5. Formats and tokenizes the dataset for model training.

Modules:
    pandas: For reading and processing Excel data.
    json: For saving and loading JSON files.
    datasets: For creating and managing Hugging Face datasets.

Functions:
    format_examples: Formats dataset examples for input text creation.
    tokenize_function: Tokenizes input text using the tokenizer.

Usage:
    - Ensure the Excel file "Teliko.xls" exists in the working directory with the correct sheet and columns.
    - Replace `tokenizer` with the appropriate tokenizer for your model.
"""

import pandas as pd
import json
from datasets import Dataset

# Step 1: Load the Excel file
# Reads the 'True false questions' sheet into a DataFrame. The code below seeds
# the question list from the header of column 1, which presumably holds the
# first question because pandas consumes the first sheet row as the header.
raw_df = pd.read_excel("Teliko.xls", sheet_name='True false questions')

# Column names for the final DataFrame
columns = ['id', 'question', 'answer', 'metadata.justification']


def _clean_question(cell):
    """Return the cell as text with every newline replaced by a single space."""
    # A former second pass replacing '\n\n' was dead code: no newline survives
    # the first replacement, so the output is unchanged without it.
    return str(cell).replace('\n', ' ')


# Accumulators filled while scanning the sheet
questions = [_clean_question(raw_df.columns[1])]  # first question comes from the header
true_false = []     # lower-cased True/False answers
justification = []  # justification texts

# Step 2: Process the sheet row by row.
# Column 1 either tags the row ('text', 'justification', 'rank') or holds a
# question; column 2 carries the value for tagged rows.
for kind, value in zip(raw_df.iloc[:, 1], raw_df.iloc[:, 2]):
    if pd.isna(kind):
        # Blank spacer rows would otherwise be treated as questions and crash.
        continue
    if kind == 'text':
        true_false.append(str(value).lower())
    elif kind == 'justification':
        justification.append(value)
    elif kind == 'rank':
        continue  # rank rows are not used
    else:
        questions.append(_clean_question(kind))

# Records are paired purely by position, so every question needs an answer and
# a justification; fail with a clear message instead of a bare IndexError.
if len(true_false) < len(questions) or len(justification) < len(questions):
    raise ValueError(
        f"Sheet is inconsistent: {len(questions)} questions, "
        f"{len(true_false)} answers, {len(justification)} justifications"
    )
if len(true_false) > len(questions) or len(justification) > len(questions):
    # Surplus entries are ignored (as before), but they usually signal misaligned rows.
    print(
        f"Warning: {len(questions)} questions but {len(true_false)} answers and "
        f"{len(justification)} justifications; extra entries are ignored."
    )

# Final list of processed records: [id, question, answer, justification]
indexs = [
    [record_id, question, answer, reason]
    for record_id, (question, answer, reason)
    in enumerate(zip(questions, true_false, justification))
]

# Convert the records into a DataFrame and persist them. index=False keeps the
# pandas row index out of the workbook, so re-reading it does not produce a
# stray 'Unnamed: 0' column (the 'id' column already identifies each record).
df = pd.DataFrame(indexs, columns=columns)
df.to_excel('phishing_true_false_questions_new_vfinal.xlsx', index=False)

# Reload from disk so the following steps work on exactly what was saved.
df = pd.read_excel('./phishing_true_false_questions_new_vfinal.xlsx')

# Step 3: Save the processed data as JSON
# The frame is serialized as a pretty-printed list of records and written to
# 'output_with_nested_metadata.json'.
json_data = df.to_json(orient='records', indent=4)
with open('output_with_nested_metadata.json', 'w') as out_handle:
    out_handle.write(json_data)

# Step 4: Load the JSON data from the file
# After this step `json_data` is the parsed Python object (a list of record
# dicts), no longer the serialized string.
with open('output_with_nested_metadata.json', 'r') as in_handle:
    json_data = json.loads(in_handle.read())

# Step 5: Convert the JSON data into a Hugging Face Dataset
# Each record contributes:
#   - `statement`: the question text
#   - `label`: the corresponding True/False answer
dataset = Dataset.from_dict(dict(
    statement=[record["question"] for record in json_data],
    label=[record["answer"] for record in json_data],
))

# Define a function to format dataset examples
def format_examples(examples):
    """
    Build one prompt string per example in a batch.

    Args:
        examples (dict): Batch with parallel sequences under the keys
            ``'statement'`` and ``'label'``.

    Returns:
        dict: ``{'text': [...]}`` where each entry has the form
        ``'Question: <statement> Answer: <label>'``. Pairing stops at the
        shorter of the two input sequences.
    """
    formatted = []
    for statement, label in zip(examples['statement'], examples['label']):
        formatted.append(f'Question: {statement} Answer: {label}')
    return {'text': formatted}

# Define a function to tokenize the formatted examples
def tokenize_function(examples):
    """
    Run the module-level ``tokenizer`` over a batch of formatted texts.

    Args:
        examples (dict): Batch holding the formatted strings under ``'text'``.

    Returns:
        Whatever the tokenizer returns for that batch when called with
        ``truncation=True``.
    """
    batch_texts = examples['text']
    encoded = tokenizer(batch_texts, truncation=True)
    return encoded

# Step 6: Split the dataset into training (70%) and testing (30%) subsets.
# A fixed seed keeps the split reproducible between runs.
split_dataset = dataset.train_test_split(test_size=0.3, seed=0)
train_dataset = split_dataset['train']
test_dataset = split_dataset['test']

# Step 7: Format and tokenize the training subset.
# `train_dataset` is rebound to the version that carries the 'text' column.
train_dataset = train_dataset.map(format_examples, batched=True)
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)

# Step 8: Format and tokenize the full dataset (training and test rows together).
# `dataset` is rebound to the version that carries the 'text' column.
# Explanations are written as comments rather than bare string literals: a
# string left as the last expression of a cell is echoed as the cell's output.
dataset = dataset.map(format_examples, batched=True)
tokenized_dataset = dataset.map(tokenize_function, batched=True)


Train the model on the entire dataset (training and test rows combined)

In [None]:
# Fine-tune the LoRA-wrapped model with SFTTrainer on `tokenized_dataset`,
# i.e. the full pre-tokenized dataset rather than only the training split.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = tokenized_dataset,
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 4,
        warmup_steps = 10,
        num_train_epochs = 100, # Number of full passes over the dataset.
        # max_steps = 1000,     # Alternative: cap training by optimizer steps instead.
        learning_rate = 5e-5,
        # Mixed precision: bf16 where the hardware supports it, fp16 otherwise.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)
trainer_stats = trainer.train()

# Persist the fine-tuned model and, next to it, the tokenizer, so the saved
# directory can be reloaded on its own without re-fetching the tokenizer.
model.save_pretrained('./true_false_questions_model')
tokenizer.save_pretrained('./true_false_questions_model')