In [2]:
!pip install torch pillow transformers peft datasets lightning wandb nltk bert-score -q

In [None]:
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback

from peft import LoraConfig, get_peft_model
from datasets import load_dataset

# Load the Mistral 7B base model and its tokenizer.
# Hub repository ids for Mistral carry a version suffix; the bare name
# "mistralai/Mistral-7B" does not resolve to a repository.
MODEL_ID = "mistralai/Mistral-7B-v0.1"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# The Mistral tokenizer ships without a padding token, so any call that pads
# (e.g. padding="max_length") would raise. Reuse EOS as the pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# device_map="auto" delegates weight placement across the available devices.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto")

In [None]:
# Fetch the raw WikiText-2 corpus (all available splits) from the Hub.
dataset = load_dataset(path="wikitext", name="wikitext-2-raw-v1")

# Peek at the first training record to confirm the schema.
first_train_record = dataset["train"][0]
print(first_train_record)

In [None]:
# Training sequence length. This is a memory/throughput choice for this
# notebook, not a limit imposed by the model.
MAX_SEQ_LENGTH = 512

# Label value ignored by the cross-entropy loss of Hugging Face causal LMs.
IGNORE_INDEX = -100

# Padding needs a pad token; guard here so this cell works regardless of how
# the tokenizer was configured when it was loaded.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def _mask_padding(input_ids, attention_mask):
    """Copy `input_ids`, replacing padded positions with IGNORE_INDEX."""
    return [
        token_id if is_real else IGNORE_INDEX
        for token_id, is_real in zip(input_ids, attention_mask)
    ]


def preprocess_data(example):
    """Tokenize text for causal-LM fine-tuning and attach training labels.

    Args:
        example: A mapping with a "text" entry. With `batched=True` this is a
            list of strings; a single string is handled as well.

    Returns:
        The tokenizer output (`input_ids`, `attention_mask`, ...) truncated and
        padded to MAX_SEQ_LENGTH, plus a `labels` entry equal to `input_ids`
        with padded positions set to IGNORE_INDEX so they do not contribute
        to the loss.
    """
    encoded = tokenizer(
        example["text"],
        truncation=True,          # Cut inputs longer than MAX_SEQ_LENGTH
        padding="max_length",     # Pad shorter inputs so every row has equal length
        max_length=MAX_SEQ_LENGTH,
    )

    # Without a `labels` column the model returns no loss and the Trainer
    # cannot train. Build labels from the inputs, masking padding through the
    # attention mask (pad and EOS share an id, so masking by id is ambiguous).
    if isinstance(example["text"], str):
        encoded["labels"] = _mask_padding(encoded["input_ids"], encoded["attention_mask"])
    else:
        encoded["labels"] = [
            _mask_padding(ids, mask)
            for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
        ]
    return encoded


# WikiText contains many blank lines; once padded they hold no trainable
# tokens, so drop them before tokenizing.
non_empty_dataset = dataset.filter(lambda row: row["text"].strip() != "")

tokenized_dataset = non_empty_dataset.map(preprocess_data, batched=True)
print(tokenized_dataset["train"][0])

In [None]:
# Names of the sub-modules that receive a LoRA adapter.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
    "lm_head",
]

# Rank-8 adapters with alpha 16, 5% adapter dropout, and no bias training.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

# Wrap the base model with the adapters described by `lora_config`.
model = get_peft_model(model, lora_config)

In [None]:
# The evaluation-schedule argument was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases. Pick whichever name the
# installed version defines so this cell works on both.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./mistral_finetuned",    # Directory to save the model
    **{eval_strategy_key: "steps"},      # Evaluate the model periodically
    eval_steps=100,                      # Explicit: previously inherited from logging_steps
    save_strategy="steps",               # Save model checkpoints periodically
    save_steps=500,                      # Must stay a multiple of eval_steps (see below)
    logging_dir="./logs",                # Directory for logs
    logging_steps=100,                   # Log progress every 100 steps
    learning_rate=2e-5,                  # Learning rate for fine-tuning
    per_device_train_batch_size=4,       # Batch size per device
    gradient_accumulation_steps=8,       # Accumulate gradients to simulate larger batches
    num_train_epochs=3,                  # Number of training epochs
    fp16=True,                           # Enable mixed precision for faster training
    # EarlyStoppingCallback needs a metric to monitor, and
    # load_best_model_at_end restores the best checkpoint once training stops.
    # With load_best_model_at_end, save_steps must be a multiple of eval_steps.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,             # Lower loss is better
    push_to_hub=False,                   # Disable auto-push to Hugging Face Hub
)

# Initialize the Trainer.
# The datasets must provide a `labels` column (or a data_collator must build
# one); otherwise the model returns no loss and training fails.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],        # Training dataset
    # Monitor the validation split; using the test split for early stopping
    # and checkpoint selection would leak it into model selection.
    eval_dataset=tokenized_dataset["validation"],
    # Registered at construction so re-running this cell never stacks
    # duplicate callbacks on an existing trainer.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# NOTE: nothing in this cell starts training; run `trainer.train()` in its own
# cell to fine-tune before generating.

In [None]:
input_text = "What is the capital of France?"

# Send the prompt to the device that holds the model's first parameters
# instead of hard-coding "cuda", so this also runs on CPU-only machines and
# with whatever placement device_map="auto" chose.
model_device = next(model.parameters()).device
inputs = tokenizer(input_text, return_tensors="pt").to(model_device)

# Switch to eval mode so dropout (including the LoRA adapter dropout) is
# disabled while generating.
model.eval()

outputs = model.generate(
    **inputs,
    max_length=50,                         # Prompt + generated tokens
    pad_token_id=tokenizer.eos_token_id,   # Explicit pad id; the tokenizer may define none
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))