In [None]:
### Install and import libraries
!pip install pandas openpyxl
!pip install dataset
!pip install unsloth
!pip install gradio
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install sentence-transformers bert-score nltk rouge-score
import pandas as pd
import json
from google.colab import files
import numpy as np
from datasets import Dataset
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

In [None]:
### Downloading of model

# Loader settings; max_seq_length is reused by the trainers further down.
max_seq_length = 2048   # maximum sequence length handed to the loader and the trainers
dtype = None            # None leaves the choice to the loader (Float16 for Tesla T4/V100, Bfloat16 for Ampere+)
load_in_4bit = True     # 4-bit quantization to reduce memory usage; may be set to False

# Catalogue of checkpoints published by Unsloth (more at https://huggingface.co/unsloth).
# Kept for reference only: nothing in this cell reads the list.
fourbit_models = [
    # Llama 3.1
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",
    # Mistral
    "unsloth/Mistral-Small-Instruct-2409",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    # Phi
    "unsloth/Phi-3.5-mini-instruct",
    "unsloth/Phi-3-medium-4k-instruct",
    # Gemma 2
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",
    # Llama 3.2
    "unsloth/Llama-3.2-1B-bnb-4bit",
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
]

# Base checkpoint: the instruct-tuned Llama 3.1 8B entry of the catalogue above.
base_model_options = dict(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...",  # only for gated models such as meta-llama/Llama-2-7b-hf
)
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_options)

# LoRA adapter configuration wrapped around the base model.
lora_options = dict(
    r = 16,                                  # LoRA rank; any value > 0 (8, 16, 32, 64, 128 suggested)
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha = 16,
    lora_dropout = 0,                        # any value is supported; 0 is the optimized setting
    bias = "none",                           # any value is supported; "none" is the optimized setting
    use_gradient_checkpointing = "unsloth",  # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,                      # rank-stabilized LoRA is available but not used
    loftq_config = None,                     # LoftQ is available but not used
)
model = FastLanguageModel.get_peft_model(model, **lora_options)

Preprocess the Open-Ended Questions

In [None]:
"""
Script for processing open-ended questions from an Excel file, converting to JSON, and preparing for model training.
The script performs the following:
1. Reads data from an Excel file containing open-ended questions and answers.
2. Processes and formats the data into structured columns.
3. Saves the data as a JSON file and reloads it for further processing.
4. Converts the data into a Hugging Face Dataset format.
5. Tokenizes and prepares the dataset for language model training.

Modules:
    pandas: For reading and processing Excel data.
    json: For saving and loading JSON files.
    datasets: For creating and managing Hugging Face datasets.
    DataCollatorForLanguageModeling: For collating and batching data for language model training.

Functions:
    format_examples: Formats dataset examples into text input-output pairs.
    tokenize_function: Tokenizes the text inputs for the model.

Usage:
    - Ensure the Excel file "Teliko.xls" exists in the working directory with the correct sheet and columns.
    - Replace `tokenizer` with the appropriate tokenizer for your language model.
"""

import pandas as pd
import json
from datasets import Dataset
from transformers import DataCollatorForLanguageModeling

# Step 1: Load the 'open ended questions' sheet of the workbook into a DataFrame.
df = pd.read_excel("Teliko.xls", sheet_name='open ended questions')

# Column names for the final DataFrame
columns = ['id', 'question', 'answer']

# The header cell of the second column is taken as the first question;
# its newlines are turned into spaces.
questions = [df.columns[1].replace('\n', ' ')]
answers = []

# Step 2: Walk the sheet. Column 0 labels each row, column 1 holds its text.
for label, text in zip(df.iloc[:, 0], df.iloc[:, 1]):
    if label == 'answer':
        answers.append(text.replace('\n', ''))
    elif label == 'question ':  # the label is matched including its trailing space
        questions.append(text.replace('\n', ''))

# Questions and answers are paired purely by position, so a count mismatch
# means the pairs would be misaligned. Fail early with a readable message
# instead of an IndexError (too few answers) or silently dropped rows
# (too many answers).
if len(questions) != len(answers):
    raise ValueError(
        f"Found {len(questions)} questions but {len(answers)} answers in "
        "'Teliko.xls'; every question needs exactly one answer."
    )

# Build one [id, question, answer] record per pair.
indexs = [
    [record_id, question, answer]
    for record_id, (question, answer) in enumerate(zip(questions, answers))
]

# Convert the records into a pandas DataFrame
df = pd.DataFrame(indexs, columns=columns)

# Persist the cleaned table and read it back; the reloaded frame feeds the next steps.
df.to_excel('phishing_open_ended_questions_new_vfinal.xlsx')
df = pd.read_excel('./phishing_open_ended_questions_new_vfinal.xlsx')

# Convert the DataFrame to a list of per-row dictionaries
dict_data = df.to_dict(orient='records')

# Step 3: Clean the data by removing newline characters
for row in dict_data:
    row['question'] = row['question'].replace('\n', "")
    row['answer'] = row['answer'].replace('\n', "")

# Steps 4-5: Serialize the records and save them as 'output_with_nested_metadata.json'.
with open('output_with_nested_metadata.json', 'w') as json_file:
    json.dump(dict_data, json_file, indent=4)

# Step 6: Load the JSON file back into a Python object for further processing.
with open('output_with_nested_metadata.json', 'r') as json_file:
    json_data = json.load(json_file)

# Turn each (question, answer) pair into a single training string
def format_examples(examples):
    """
    Build the text used for training from a batch of question/answer pairs.

    Args:
        examples (dict): A batch with parallel "question" and "answer" sequences.

    Returns:
        dict: {'text': [...]} with one "Question: ... Answer: ..." string per pair.
              Pairing follows zip(), so it stops at the shorter of the two sequences.
    """
    # NOTE(review): this template differs from the "### Instruction / ### Question /
    # ### Answer" prompt that generate_response() sends at inference time, and no
    # end-of-sequence token is appended here - confirm both are intentional.
    formatted = []
    for question, answer in zip(examples["question"], examples["answer"]):
        formatted.append(f"Question: {question} Answer: {answer}")
    return {'text': formatted}

# Tokenize the formatted training strings
def tokenize_function(examples):
    """
    Run the notebook-level `tokenizer` over a batch of formatted texts.

    Args:
        examples (dict): A batch containing a "text" entry (as produced by format_examples).

    Returns:
        The tokenizer's output for examples["text"], called with truncation enabled.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Step 7: Create a Hugging Face Dataset from the processed JSON records.
dataset = Dataset.from_dict({
    "question": [item["question"] for item in json_data],
    "answer": [item["answer"] for item in json_data]
})

# Step 8: Split into training (70%) and testing (30%) subsets; the fixed seed
# keeps the split reproducible.
split_dataset = dataset.train_test_split(test_size=0.3, seed=0)
train_dataset = split_dataset['train']
test_dataset = split_dataset['test']

# Step 9: Format and tokenize the training split (used for the benchmark run).
train_dataset = train_dataset.map(format_examples, batched=True)
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)

# Step 10: Format and tokenize the full dataset (used for the final training run).
dataset = dataset.map(format_examples, batched=True)
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# One collator serves both training runs; mlm=False turns masked-language-model
# masking off. It was previously constructed twice with identical arguments.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


Training of model for Benchmark

In [None]:
# Mixed precision: bf16 when is_bfloat16_supported() says so, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

benchmark_training_args = TrainingArguments(
    output_dir = "outputs",
    seed = 3407,
    # Batching: 2 samples per device, gradients accumulated over 4 steps.
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    # Schedule: 100 passes over the training split (swap for max_steps to cap by step count).
    num_train_epochs = 100,
    warmup_steps = 10,
    learning_rate = 5e-5,
    lr_scheduler_type = "linear",
    # Optimizer
    optim = "adamw_8bit",
    weight_decay = 0.01,
    # Precision
    bf16 = use_bf16,
    fp16 = not use_bf16,
    logging_steps = 1,
)

# Benchmark run: trains on the 70% training split only.
# The dataset is already tokenized, so dataset_text_field is not passed.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = tokenized_train_dataset,
    data_collator = data_collator,
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = benchmark_training_args,
)

trainer_stats = trainer.train()


Defining Accuracy metrics for Benchmark

In [11]:
#### Defining Accuracy Metrics

from sentence_transformers import SentenceTransformer, util
from bert_score import score
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

def compute_bleu_score(reference,generated_answer):
    """
    Compute BLEU score for a single justification.

    Both texts are split on whitespace before scoring: sentence_bleu expects a
    list of tokenized references and a tokenized hypothesis. Passing the raw
    strings made it compare the reference's characters against a single
    whole-sentence "token".

    Args:
        reference (str): Ground-truth justification.
        generated_answer (str): Model-generated justification.
    Returns:
        float: BLEU score; 0.0 when either text has no tokens.
    """
    reference_tokens = reference.split()
    candidate_tokens = generated_answer.split()

    # Nothing to compare: report no overlap without calling the scorer.
    if not reference_tokens or not candidate_tokens:
        return 0.0

    return sentence_bleu([reference_tokens], candidate_tokens)

def compute_rouge_scores(reference, generated_answer):
    """
    Score a generated justification against its reference with ROUGE.

    Args:
        reference (str): Ground-truth justification.
        generated_answer (str): Model-generated justification.
    Returns:
        dict: F-measures keyed "ROUGE-1", "ROUGE-2" and "ROUGE-L".
    """
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    labels = ["ROUGE-1", "ROUGE-2", "ROUGE-L"]

    # A new scorer (with stemming enabled) is created on every call.
    scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
    results = scorer.score(reference, generated_answer)

    return {
        label: results[rouge_type].fmeasure
        for label, rouge_type in zip(labels, rouge_types)
    }

def compute_bert_score(reference, generated_answer):
    """
    Score a generated justification against its reference with BERTScore.

    Args:
        reference (str): Ground-truth justification.
        generated_answer (str): Model-generated justification.
    Returns:
        float: Mean of the F1 tensor returned by bert_score's `score`.
    """
    candidates = [generated_answer]
    references = [reference]

    # Precision and recall are returned as well but only F1 is reported.
    _precision, _recall, f1 = score(candidates, references, lang="en", verbose=False)
    return f1.mean().item()

# Sentence-embedding models already loaded in this session, keyed by model name.
_SIMILARITY_MODELS = {}


def _get_similarity_model(name='paraphrase-MiniLM-L6-v2'):
    """Return the SentenceTransformer for `name`, loading it only on first use."""
    if name not in _SIMILARITY_MODELS:
        _SIMILARITY_MODELS[name] = SentenceTransformer(name)
    return _SIMILARITY_MODELS[name]


def evaluate_model(ground_truth,generated_answer):
    """
    Compare a generated answer with its ground truth.

    The embedding model is fetched from a cache, so evaluating many samples in
    a loop no longer reloads it on every call.

    Args:
        ground_truth (str): Reference answer.
        generated_answer (str): Model-generated answer.
    Returns:
        tuple: (cosine similarity of the two sentence embeddings,
                BERTScore F1 from compute_bert_score).
    """
    similarity_model = _get_similarity_model()

    answer_similarity = util.pytorch_cos_sim(
        similarity_model.encode(ground_truth),
        similarity_model.encode(generated_answer)
    ).item()

    bert_f1 = compute_bert_score(ground_truth, generated_answer)

    return answer_similarity, bert_f1

Benchmark

In [None]:
"""
Script for evaluating a fine-tuned model's responses on a test dataset using Cosine Similarity and BERT Scores.
The script performs the following:
1. Generates responses for a given set of questions using a fine-tuned model.
2. Evaluates the generated responses against ground truth answers using:
   - Cosine Similarity
   - BERT Scores
3. Computes average metrics across the test dataset.

Modules:
    numpy: For calculating averages of evaluation metrics.
    torch: For tensor operations and model inference.

Dependencies:
    - `FastLanguageModel.for_inference(model)`: Optimizes model inference (replace with your implementation).
    - `tokenizer`: A tokenizer compatible with the fine-tuned model.
    - `evaluate_model`: A function to compute Cosine Similarity and BERT Scores.

Usage:
    - Ensure `test_dataset` contains test samples with "question" and "answer".
    - Replace placeholders with your model, tokenizer, and `evaluate_model` implementations.
"""

import numpy as np
import torch

# Per-sample metric values collected while the test set is evaluated
cos_similarity_scores, bert_scores = [], []

def generate_response(instruction):
    """
    Generates a response to a given instruction using the fine-tuned model.

    Args:
        instruction (str): The question or instruction for the model.

    Returns:
        str: The text found after the first "### Answer:" marker of the decoded
             output (up to the next marker, if the model emits one), with
             surrounding whitespace removed.
    """
    # Enable faster inference mode for the fine-tuned model
    FastLanguageModel.for_inference(model)

    # Construct a prompt for the model
    # NOTE(review): training texts are built as "Question: ... Answer: ..." by
    # format_examples(); this template is different - confirm that is intended.
    prompt = f"""### Instruction:
Answer the provided question with the knowledge provided to you
### Question:
{instruction}

### Answer:
"""

    # Tokenize the prompt and place it on whichever device the model lives on,
    # instead of assuming the literal "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate a response using the model
    # NOTE(review): min_length / max_length are passed as-is; presumably they
    # count the prompt tokens too - verify against the generate() documentation.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            early_stopping=False,
            min_length=50,
            length_penalty=2,
            max_length=300
        )

    # Decode the generated output and keep only the answer portion. Stripping
    # keeps the prompt's trailing newline out of the evaluated text.
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    response = response.split("### Answer:")[1].strip()
    return response

# Score every held-out (question, answer) pair
for sample in test_dataset:
    # The question is passed to the model as plain text
    question_text = f"{sample['question']}"
    print("Question:", sample['question'])

    generated = generate_response(question_text)
    print("Generated Answer:", generated)

    # Reference answer, printed for side-by-side comparison
    print("GROUND TRUTH:", sample['answer'])

    # Cosine similarity and BERTScore of the generated answer vs. the reference
    similarity, bert_f1 = evaluate_model(sample['answer'], generated)
    cos_similarity_scores.append(similarity)
    bert_scores.append(bert_f1)
    print()

# Per-sample scores
print("Cosine Similarity Scores:", cos_similarity_scores)
print("BERT Scores:", bert_scores)

# Means over the whole test split
avg_cos_sim_score = np.mean(cos_similarity_scores)
avg_bert_score = np.mean(bert_scores)

print(f"Average Cosine Similarity Score: {avg_cos_sim_score:.4f}")
print(f"Average BERT Score: {avg_bert_score:.4f}")


Training of model on entire dataset

In [None]:
# Final run: train on the complete tokenized dataset.
# NOTE(review): `model` is the same object the benchmark cell already fine-tuned
# and switched with FastLanguageModel.for_inference(); confirm that continuing
# from those weights (and from that mode) is intended rather than reloading the
# base model first.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = tokenized_dataset,
    #dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = data_collator,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 10,
        num_train_epochs = 100, # 100 passes over the dataset (use max_steps to cap by step count instead)
        # max_steps = 1000,
        learning_rate = 5e-5,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

trainer_stats = trainer.train()

# Save the model AND the tokenizer into the same directory: the API cell loads
# both from './open_ended_questions_model'.
model.save_pretrained('./open_ended_questions_model')
tokenizer.save_pretrained('./open_ended_questions_model')

Creation of the API

In [None]:
"""
Script for generating responses to open-ended questions using a fine-tuned language model.
The script includes:
1. Initialization of a fine-tuned model and tokenizer from a checkpoint directory.
2. A function to generate responses using the fine-tuned model.
3. A Gradio interface for user interaction, allowing questions to be input and responses to be displayed.

Modules:
    gradio: For creating a web-based interface.
    unsloth.FastLanguageModel: For loading and interacting with the fine-tuned language model.
    torch: For tensor operations and model inference.

Usage:
    - Place your fine-tuned model and tokenizer checkpoint in the directory `open_ended_questions_model`.
    - Run the script to launch a Gradio interface for asking questions.
"""

import gradio as gr
from unsloth import FastLanguageModel
import torch

# Load the fine-tuned language model and its tokenizer from the
# `open_ended_questions_model` directory. The directory must already contain
# the files written by the training notebook.
loaded_checkpoint = FastLanguageModel.from_pretrained('./open_ended_questions_model')
model, tokenizer = loaded_checkpoint

def generate_response(instruction):
    """
    Generates a response to a given instruction using the fine-tuned model.

    Args:
        instruction (str): The question or instruction for the model.

    Returns:
        str: The text following the last "### Answer:" marker of the decoded
             output, with surrounding whitespace removed.
    """
    # Enable faster inference mode for the fine-tuned model
    FastLanguageModel.for_inference(model)

    # Construct a prompt for the model
    prompt = f"""### Instruction:
Answer the provided question with the knowledge provided to you
### Question:
{instruction}
### Answer:"""

    # Tokenize the prompt and place it on whichever device the model lives on,
    # instead of assuming the literal "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate a response using the model
    # NOTE(review): min_length / max_length are passed as-is; presumably they
    # count the prompt tokens too, so a very long question may leave little or
    # no room for the answer - verify against the generate() documentation.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            early_stopping=False,
            min_length=50,
            length_penalty=2,
            max_length=300
        )

    # Decode the generated output and extract the answer
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    response = response.split("### Answer:")[-1].strip()
    return response

def greet(question):
    """
    Processes the user's input and returns a model-generated response.

    Args:
        question (str): The user's question.

    Returns:
        str: The model's answer, returned unchanged, or a fixed message when the
             input is missing, empty, or only whitespace.
    """
    # Treat None, "" and whitespace-only input alike: there is nothing to answer.
    if question is None or not question.strip():
        return "No question was given to answer"

    # Return the answer as generated; no "!" is appended to the model's text.
    return generate_response(question)

# Build the web UI: one labelled text box in, one text box out, wired to greet()
question_box = gr.Textbox(label='question')
demo = gr.Interface(
    fn=greet,
    inputs=[question_box],
    outputs="textbox",
)
demo.launch()
"""
Launches a Gradio interface with:
- A text input for the user to ask questions.
- A text output for displaying the model's response.
"""
