In [None]:
# Mount Google Drive; later cells read the CSV from, and write checkpoints to,
# paths under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
!pip install -q transformers datasets peft accelerate bitsandbytes sentencepiece scikit-learn


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from peft import LoraConfig, get_peft_model, TaskType

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # Hugging Face Hub repo id or local path of the base checkpoint

tokenizer = AutoTokenizer.from_pretrained(model_name)

# The tokenization cell pads every example to a fixed length, which requires a pad token.
# Some LLaMA-family checkpoints ship without one; fall back to EOS so swapping
# `model_name` does not make padding fail.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the base model with 4-bit NF4 weights, double quantization and float16 compute.
from transformers import BitsAndBytesConfig

quantization_options = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)
bnb_config = BitsAndBytesConfig(**quantization_options)

# device_map="auto" lets the library decide where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)


In [None]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Q&A pairs stored on the mounted Drive; later cells use its "question" and "answer" columns.
csv_path = "/content/drive/MyDrive/NvidiaDocumentationQandApairs.csv"
data = pd.read_csv(csv_path)

# Optional: normalize the text (lowercase, then strip everything except letters, digits, and whitespace)

def clean_text(text):
    """Lowercase ``text`` and remove every character that is not a letter, digit or whitespace.

    Args:
        text: The value to normalize. Normally a ``str``; missing values (``None`` or a
            float NaN, as pandas produces for empty CSV cells) are treated as an empty
            string, and any other non-string value is converted with ``str()``.

    Returns:
        The normalized string.
    """
    import re

    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        is_missing = text is None or (isinstance(text, float) and text != text)
        text = "" if is_missing else str(text)

    text = text.lower()
    # Raw string: `\s` in a normal string literal is an invalid escape sequence.
    text = re.sub(r'[^A-Za-z0-9\s]+', '', text)
    return text

# Normalize both text columns with clean_text, overwriting the raw values.
for column in ('question', 'answer'):
    data[column] = data[column].apply(clean_text)

# Create a new column with prompt+answer concatenated
def build_prompt(q, a):
    """Format a question/answer pair as two lines: ``Question: ...`` then ``Answer: ...``."""
    question_line = f"Question: {q}"
    answer_line = f"Answer: {a}"
    return "\n".join((question_line, answer_line))

def _row_to_prompt(row):
    """Build the training text for one DataFrame row from its question and answer."""
    return build_prompt(row['question'], row['answer'])


data['input_text'] = data.apply(_row_to_prompt, axis=1)

# Hold out 30% of the rows, then halve the hold-out into validation and test.
train_df, holdout_df = train_test_split(data, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(holdout_df, test_size=0.5, random_state=42)

# Wrap each split as a Hugging Face Dataset.
train_ds, val_ds, test_ds = (
    Dataset.from_pandas(split_df) for split_df in (train_df, val_df, test_df)
)


In [None]:
def _mask_padding(token_ids, attention_mask):
    """Return a copy of ``token_ids`` with -100 wherever ``attention_mask`` is 0."""
    return [tok if keep else -100 for tok, keep in zip(token_ids, attention_mask)]


def tokenize_function(examples):
    """Tokenize the ``input_text`` field and attach causal-LM labels.

    Every sequence is truncated/padded to 256 tokens. Labels mirror ``input_ids``,
    except that padding positions (attention mask 0) are set to -100, the ignore
    index of the cross-entropy loss, so padding never contributes to training.

    Args:
        examples: Mapping with an ``"input_text"`` entry (a batch of strings when used
            with ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    # Tokenize full prompt+answer text
    tokenized = tokenizer(
        examples["input_text"],
        truncation=True,
        max_length=256,
        padding="max_length",
    )

    input_ids = tokenized["input_ids"]
    attention = tokenized.get("attention_mask")

    if attention is None:
        # No mask available: keep the plain copy of input_ids as labels.
        labels = input_ids.copy()
    elif input_ids and isinstance(input_ids[0], (list, tuple)):
        # Batched call: one list of token ids per example.
        labels = [_mask_padding(ids, mask) for ids, mask in zip(input_ids, attention)]
    else:
        # Single example: a flat list of token ids.
        labels = _mask_padding(input_ids, attention)

    tokenized["labels"] = labels
    return tokenized

def _tokenize_split(split):
    """Tokenize one dataset split in batches, dropping all of its original columns."""
    return split.map(tokenize_function, batched=True, remove_columns=split.column_names)


train_tokenized = _tokenize_split(train_ds)
val_tokenized = _tokenize_split(val_ds)
test_tokenized = _tokenize_split(test_ds)


In [None]:
from peft import prepare_model_for_kbit_training

# A model loaded in 4-bit/8-bit should be preprocessed for training before adapters are
# attached (see the PEFT quantization guide). Note that, by default, this helper also
# enables gradient checkpointing on the model.
if getattr(model, "is_loaded_in_4bit", False) or getattr(model, "is_loaded_in_8bit", False):
    model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=16,                 # adapter rank
    lora_alpha=32,        # LoRA scaling factor
    target_modules=["q_proj", "v_proj"],  # typical for LLaMA; confirm correct module names for TinyLlama
    lora_dropout=0.1,
    bias="none",          # leave bias terms untouched
    task_type=TaskType.CAUSAL_LM,
)

# Wrap the base model so that only the LoRA adapter weights are trained.
model = get_peft_model(model, lora_config)

def print_trainable_params(model):
    """Print how many of ``model``'s parameters are trainable, as counts and a percentage.

    Args:
        model: Any object whose ``parameters()`` yields items exposing ``numel()`` and
            ``requires_grad``.
    """
    trainable_params = 0
    all_params = 0
    # Single pass over the parameters instead of iterating the model twice.
    for param in model.parameters():
        count = param.numel()
        all_params += count
        if param.requires_grad:
            trainable_params += count

    # A model without parameters reports 0.00% instead of dividing by zero.
    percent = 100 * trainable_params / all_params if all_params else 0.0
    print(f"Trainable params: {trainable_params} / {all_params} ({percent:.2f}%)")

# Report the trainable / total parameter counts of the LoRA-wrapped model
print_trainable_params(model)


In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

use_fp16 = torch.cuda.is_available()  # fp16 mixed precision only when CUDA is available

training_args = TrainingArguments(
    # Checkpointing: save once per epoch, keep at most two, reload the best at the end
    output_dir="/content/drive/MyDrive/tinyllama-lora-sft",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # Evaluation: once per epoch
    eval_strategy="epoch",
    per_device_eval_batch_size=8,
    label_names=["labels"],
    # Optimization
    num_train_epochs=4,
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    fp16=use_fp16,
    # Logging
    logging_steps=50,
    report_to="none",
)

import inspect

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # causal LM
)

# Newer transformers releases accept the tokenizer as `processing_class` and deprecate
# the old `tokenizer` keyword. Pick whichever name the installed Trainer declares so
# the cell works on both sides of that rename.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    data_collator=data_collator,
    **{_tokenizer_kwarg: tokenizer},
)


In [None]:
# Run fine-tuning with the model, arguments, datasets and collator given to the Trainer
trainer.train()


In [None]:
# Write the trained PEFT model (includes LoRA weights) and then the tokenizer
# into the same Drive folder.
output_dir = "/content/drive/MyDrive/tinyllama-lora-sft-tuned-model"

for artifact in (trainer.model, tokenizer):
    artifact.save_pretrained(output_dir)
