In [52]:
import os
os.environ["ACCELERATE_MIN_VERSION"] = "0.26.0"
import time
import torch
import pandas as pd
from dotenv import load_dotenv
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments,
    DataCollatorForLanguageModeling
)
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Disable tokenizers parallelism to avoid fork-related warnings during Dataset.map.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Load environment variables (e.g. HF_TOKEN, QWEN_MODEL_NAME) from a local .env file.
load_dotenv()

# Report only whether a Hugging Face token is configured. The token itself is a
# credential and must never be echoed: notebook outputs are saved with the file
# and end up in version control / shared copies.
print("HF_TOKEN:", "<set>" if os.getenv("HF_TOKEN") else "<missing>")

HF_TOKEN: <redacted>


In [53]:
# Tiny hand-written question/answer corpus used as placeholder training data.
# Swap this out (or extend it) once real data is available.
data = dict(
    question=[
        "What is attention in neural networks?",
        "How do transformer models work?",
        "What are the advantages of self-attention?",
    ],
    answer=[
        "Attention is a mechanism that allows models to focus on relevant parts of the input.",
        "Transformer models use self-attention and feed-forward layers to process sequences in parallel.",
        "Self-attention helps capture long-range dependencies and improves parallelization.",
    ],
)
df = pd.DataFrame(data)

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Persist both splits under ./data (created on demand) so later cells can reload them.
os.makedirs("data", exist_ok=True)
for split_df, csv_path in (
    (train_df, "data/train_qa.csv"),
    (val_df, "data/val_qa.csv"),
):
    split_df.to_csv(csv_path, index=False)

print("Synthetic dataset created and saved.")

Synthetic dataset created and saved.


In [54]:
# NOTE: `Trainer` needs the `accelerate` package (>=0.26.0) installed in the kernel's
# environment; without it, constructing the Trainer raises ImportError.

# Load the train/validation splits written by the dataset-creation cell.
train_df = pd.read_csv("data/train_qa.csv")
val_df = pd.read_csv("data/val_qa.csv")

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Choose exactly ONE mixed-precision mode. TrainingArguments rejects fp16 and bf16
# being enabled together, and fp16 AMP cannot run on CPU.
use_cuda = torch.cuda.is_available()
use_bf16 = use_cuda and torch.cuda.is_bf16_supported()
use_fp16 = use_cuda and not use_bf16

# fp16 AMP needs fp32 master weights: the gradient scaler refuses to unscale fp16
# gradients, so the model must not be loaded in float16 when fp16=True.
# bf16 uses no gradient scaler, so bf16 weights are fine there.
model_dtype = torch.bfloat16 if use_bf16 else torch.float32

# Load model and tokenizer (model name overridable through the environment).
model_name = os.getenv("QWEN_MODEL_NAME", "Qwen/Qwen2.5-3B-Instruct")
print("Using model:", model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=model_dtype)

# The collator pads batches, which requires a pad token. Fall back to EOS for
# checkpoints whose tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize a batch of QA rows as "Question: ...\\nAnswer: ...\\n" prompts.

    Args:
        examples: batched mapping with parallel "question" and "answer" lists.

    Returns:
        The tokenizer output for the formatted texts, truncated to 1024 tokens.
    """
    texts = [
        f"Question: {q}\nAnswer: {a}\n"
        for q, a in zip(examples["question"], examples["answer"])
    ]
    return tokenizer(texts, truncation=True, max_length=1024)


train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# mlm=False -> causal language modelling (labels are derived from the input ids).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="model/qwen_finetuned",
    # NOTE(review): newer transformers releases spell this `eval_strategy`;
    # confirm against the installed version.
    evaluation_strategy="steps",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    save_steps=500,
    eval_steps=500,
    logging_steps=100,
    learning_rate=5e-5,
    report_to="none",
    gradient_accumulation_steps=4,  # Accumulate gradients to simulate a larger batch size
    optim="adamw_torch",  # Explicitly set optimizer
    fp16=use_fp16,  # fp16 AMP only on CUDA devices without bf16 support
    bf16=use_bf16,  # Prefer bf16 when the GPU supports it
    disable_tqdm=False,  # Keep progress bars visible
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

# Start fine-tuning (a GPU runtime is strongly recommended for a 3B model).
trainer.train()

# Save the fine-tuned model and tokenizer side by side so they can be reloaded together.
model.save_pretrained("model/qwen_finetuned")
tokenizer.save_pretrained("model/qwen_finetuned")
print("Fine-tuning complete and model saved.")

Using model: Qwen/Qwen2.5-3B-Instruct


Loading checkpoint shards: 100%|██████████| 2/2 [00:29<00:00, 14.62s/it]
Map: 100%|██████████| 2/2 [00:00<00:00, 38.61 examples/s]
Map: 100%|██████████| 1/1 [00:00<00:00, 625.83 examples/s]


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`

In [None]:
# Manual follow-up steps for 4-bit quantization / GGUF export, printed one per line.
# The command shown is only an example; the actual tool and flags depend on the
# converter being used.
gguf_instructions = (
    "To quantize the fine-tuned model to 4-bit and convert it to .gguf format, please follow these steps:",
    "1. Export your fine-tuned model (already saved in 'model/qwen_finetuned').",
    "2. Use a conversion script from llama.cpp or a similar tool. For example, run:",
    "   python convert_to_gguf.py --input_dir model/qwen_finetuned --output_file model/qwen_finetuned.gguf --quantization 4bit",
    "Refer to the tool’s documentation for exact command-line arguments.",
)
for instruction_line in gguf_instructions:
    print(instruction_line)

In [None]:
import subprocess
import sys

def run_inference(
    prompt: str,
    model_path: str = "model/qwen_finetuned.gguf",
    executable: str = "gguf_infer",
) -> str:
    """Run the quantized model through an external CLI and return its stdout.

    The command executed is ``<executable> --model <model_path> --prompt <prompt>``.
    Arguments are passed as a list (no shell), so the prompt is never interpreted
    by a shell.

    Args:
        prompt: Text sent to the model.
        model_path: Path to the .gguf model file.
        executable: Inference command to invoke; 'gguf_infer' is a placeholder,
            replace it with your actual inference binary.

    Returns:
        The captured standard output of the inference command.

    Raises:
        SystemExit: with code 1 when the command exits non-zero or cannot be
            started at all (e.g. the executable is not installed).
    """
    command = [executable, "--model", model_path, "--prompt", prompt]
    try:
        result = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        # The tool ran but reported failure: surface its stderr.
        print("Inference error:", e.stderr)
        sys.exit(1)
    except OSError as e:
        # The tool could not be launched (missing binary, not executable, ...).
        # Previously this escaped as an unhandled FileNotFoundError.
        print("Inference error:", e)
        sys.exit(1)
    return result.stdout

# Interactive demo: read one question from the user, run it through the
# quantized model, and show the reply.
if __name__ == "__main__":
    prompt = input("Enter your question: ")
    answer = run_inference(prompt=prompt)
    print("Answer:", answer)

In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu

# Example evaluation function comparing generated answer to a reference answer.
def evaluate_answer(generated: str, reference: str, smoothing_function=None):
    """Score a generated answer against one reference with sentence-level BLEU.

    Both strings are tokenized by whitespace splitting (punctuation stays
    attached to words).

    Args:
        generated: Model output to score.
        reference: Ground-truth answer to compare against.
        smoothing_function: Optional smoothing callable forwarded to
            ``sentence_bleu`` (e.g. a method of
            ``nltk.translate.bleu_score.SmoothingFunction()``). The default
            ``None`` keeps the unsmoothed score.

    Returns:
        The BLEU score; 0.0 when either text contains no tokens.
    """
    reference_tokens = reference.split()
    generated_tokens = generated.split()

    # Nothing to compare: return 0.0 directly instead of relying on NLTK's
    # handling of empty token lists.
    if not reference_tokens or not generated_tokens:
        return 0.0

    return sentence_bleu(
        [reference_tokens],
        generated_tokens,
        smoothing_function=smoothing_function,
    )

# Demo: score one paraphrased answer against its reference and print the result.
reference_answer = "Attention enables a model to focus on the most important parts of the input."
generated_answer = "Attention is a mechanism that allows models to focus on the relevant parts of the input."
bleu = evaluate_answer(generated=generated_answer, reference=reference_answer)
print("BLEU Score:", bleu)