# Isuku Chatbot - Llama Model Training

This notebook trains a Llama-based chatbot model for the Isuku waste management system using the provided Q&A dataset.

## Dataset Overview
- **Total Rows**: 900
- **Languages**: English, Kinyarwanda, French
- **Intents**: waste_sorting, pickup_schedule, payment, education
- **Split**: 70% Training, 20% Validation, 10% Testing

In [None]:
# Install required packages
!pip install -q transformers torch accelerate datasets pandas openpyxl peft bitsandbytes

## 1. Import Libraries and Setup

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset
import os
import warnings
# Suppress every warning category for the rest of the session
warnings.filterwarnings("ignore")

# Select the compute device: CUDA when PyTorch can see a GPU, CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Report the chosen device and the installed PyTorch build
print(f"Using device: {device}")
print(f"PyTorch version: {torch.__version__}")

## 2. Load and Explore Dataset

In [None]:
# Read the Q&A workbook into a DataFrame (path is relative to the working directory)
df = pd.read_excel('Dataset/isuku_chatbot_dataset_300_QA.xlsx')

# First look at the raw table: dimensions, column labels and leading rows
print(f"Dataset Shape: {df.shape}")
print(f"\nColumn Names: {df.columns.tolist()}")
print("\nFirst few rows:", df.head(), sep="\n")

# Section banner for the summary statistics
print()
print("=" * 60)
print("Dataset Statistics")
print("=" * 60)

# Row count plus the distinct values of the two categorical columns
print(f"Total rows: {len(df)}")
print(f"Languages: {df['Language'].unique()}")
print(f"Intents: {df['Intent'].unique()}")

# Per-category row counts
print(f"\nLanguage distribution:\n{df['Language'].value_counts()}")
print(f"\nIntent distribution:\n{df['Intent'].value_counts()}")

## 3. Data Preprocessing and Formatting

In [None]:
# Keep one row per distinct (Question, Answer, Language, Intent) combination.
# The explicit .copy() makes df_unique an independent DataFrame, so adding
# columns to it later is a plain assignment rather than a write to something
# pandas may treat as a slice of df (which raises SettingWithCopyWarning).
df_unique = df.drop_duplicates(subset=['Question', 'Answer', 'Language', 'Intent']).copy()
print(f"Unique Q&A pairs: {len(df_unique)} (from {len(df)} total rows)")

# Prompt template used for every training example:
# an "### Instruction:" section holding the question, a blank line,
# then a "### Response:" section holding the answer.
def format_prompt(row):
    """Render one Q&A record as instruction/response training text.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) exposing the
            'Question' and 'Answer' keys.

    Returns:
        The question under an "### Instruction:" header and the answer
        under a "### Response:" header, separated by one blank line.
    """
    sections = (
        f"### Instruction:\n{row['Question']}",
        f"### Response:\n{row['Answer']}",
    )
    return "\n\n".join(sections)

# Build the training text for every row by applying the prompt template row-wise
df_unique['formatted_text'] = df_unique.apply(format_prompt, axis=1)

# Preview up to three formatted examples, each tagged with its language and intent
print("\nSample formatted prompts:")
for i, (_, sample_row) in enumerate(df_unique.head(3).iterrows()):
    print(f"\n--- Sample {i+1} ({sample_row['Language']}, {sample_row['Intent']}) ---")
    print(sample_row['formatted_text'])

## 4. Train/Validation/Test Split (70/20/10)

In [None]:
# Stage 1: carve off 30% of the unique rows as a holdout pool, leaving 70% for
# training. Stratifying on the (Language, Intent) pair keeps the mix of both
# columns similar across the two parts.
train_df, temp_df = train_test_split(
    df_unique,
    test_size=0.3,
    random_state=42,
    stratify=df_unique[['Language', 'Intent']],
)

# Stage 2: divide the 30% holdout pool into validation (about two thirds of it,
# i.e. 20% overall) and test (about one third of it, i.e. 10% overall).
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.333,
    random_state=42,
    stratify=temp_df[['Language', 'Intent']],
)

# Report size and share of each split, then its language/intent breakdown
splits = (("Training", train_df), ("Validation", val_df), ("Test", test_df))

for split_name, split_df in splits:
    print(f"{split_name} set: {len(split_df)} samples ({len(split_df)/len(df_unique)*100:.1f}%)")

for split_name, split_df in splits:
    print(f"\n{split_name} set distribution:")
    print(f"  Languages: {split_df['Language'].value_counts().to_dict()}")
    print(f"  Intents: {split_df['Intent'].value_counts().to_dict()}")

## 5. Load Llama Model and Tokenizer

**Note**: This example uses a small Llama-architecture model. Depending on your setup, you may need to:
- Provide a Hugging Face authentication token to download gated Llama models
- Adjust the model name to match your available resources
- Use a quantized model (4-bit/8-bit) for memory efficiency

In [None]:
# Model selection.
# The default is a small Llama-architecture chat model suited to a demo run.
# Other checkpoints can be substituted, for example:
#   MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"  # requires a Hugging Face token
#   MODEL_NAME = "microsoft/phi-2"                # alternative small model
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

print(f"Loading model: {MODEL_NAME}")

# Tokenizer: reuse the end-of-sequence token for padding and pad on the right
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# 4-bit quantization toggle; it only takes effect when a CUDA device is present.
# Set to False if you have enough GPU memory for the unquantized weights.
use_quantization = True
has_gpu = torch.cuda.is_available()

# Choose the loading options for the three cases:
# quantized on GPU, half precision on GPU, full precision on CPU.
if has_gpu and use_quantization:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )
    load_kwargs = {"quantization_config": bnb_config, "device_map": "auto"}
elif has_gpu:
    load_kwargs = {"torch_dtype": torch.float16, "device_map": "auto"}
else:
    load_kwargs = {"torch_dtype": torch.float32, "device_map": None}

# NOTE(review): trust_remote_code=True lets the checkpoint repository run its
# own Python code at load time -- confirm this is needed for the chosen model.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    **load_kwargs,
)

# Without a GPU no device_map was used, so place the model on the device explicitly
if not has_gpu:
    model = model.to(device)

print("Model loaded successfully!")
param_count = sum(p.numel() for p in model.parameters())
print(f"Model parameters: {param_count/1e6:.2f}M")

In [None]:
# Tokenize function
def tokenize_function(examples, max_length=512):
    """Tokenize a batch of formatted prompts and attach causal-LM labels.

    Intended for ``Dataset.map(..., batched=True)``, where
    ``examples['formatted_text']`` is a list of strings.

    Args:
        examples: Batch mapping with a 'formatted_text' list of strings.
        max_length: Fixed sequence length; each text is truncated or padded
            to exactly this many tokens.

    Returns:
        The tokenizer output (plain Python lists) with an added 'labels'
        entry: a copy of 'input_ids' where padded positions are replaced
        by -100 so they are ignored by the loss.
    """
    # Plain lists are returned (no return_tensors): the dataset stores lists
    # anyway, so building tensors here would only add a conversion round trip.
    # Fixed-length padding keeps 'labels' the same length for every example.
    encoded = tokenizer(
        examples['formatted_text'],
        truncation=True,
        padding='max_length',
        max_length=max_length,
    )

    # For causal LM the targets are the inputs themselves. Padding is found
    # via the attention mask rather than by token id, because the pad token
    # may share its id with a real token (such as end-of-sequence).
    encoded['labels'] = [
        [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(encoded['input_ids'], encoded['attention_mask'])
    ]
    return encoded

# Convert each pandas split to a HuggingFace dataset and tokenize it
def _tokenize_split(frame):
    """Turn one split DataFrame into a tokenized HuggingFace Dataset.

    Args:
        frame: DataFrame containing a 'formatted_text' column.

    Returns:
        A Dataset holding only the columns produced by tokenize_function.
    """
    # preserve_index=False: after the shuffled split the pandas index is no
    # longer a simple range, and it would otherwise be carried into the
    # dataset as an extra column alongside the text.
    text_dataset = Dataset.from_pandas(frame[['formatted_text']], preserve_index=False)
    return text_dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=['formatted_text'],  # raw text is not needed once tokenized
    )


print("Tokenizing training set...")
train_dataset = _tokenize_split(train_df)

print("Tokenizing validation set...")
val_dataset = _tokenize_split(val_df)

print("Tokenizing test set...")
test_dataset = _tokenize_split(test_df)

print(f"\nTraining samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")
print(f"Test samples: {len(test_dataset)}")

## 7. Configure LoRA for Efficient Fine-tuning

This section uses LoRA (Low-Rank Adaptation) to reduce memory requirements and training time.

In [None]:
# Prepare model for LoRA training.
# The model is only loaded with a quantization config when quantization is
# requested AND CUDA is available, so the k-bit preparation step is guarded by
# the same condition; on a CPU-only run the model was loaded unquantized.
if use_quantization and torch.cuda.is_available():
    model = prepare_model_for_kbit_training(model)

# LoRA configuration
lora_config = LoraConfig(
    r=16,  # Rank of the low-rank update matrices
    lora_alpha=32,  # LoRA alpha (scaling parameter)
    # Attention projection layers to adapt; adjust based on model architecture
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap the model with the LoRA adapters and report how many parameters train
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

print("\nLoRA configuration applied successfully!")

## 8. Training Configuration

In [None]:
# Training arguments
import inspect  # local import: only used to inspect the TrainingArguments signature

output_dir = "./llama_chatbot_model"
os.makedirs(output_dir, exist_ok=True)

# The keyword for the evaluation schedule was renamed from
# `evaluation_strategy` to `eval_strategy` in newer transformers releases.
# The install cell does not pin a version, so use whichever name the
# installed TrainingArguments actually accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,  # Adjust based on your needs
    per_device_train_batch_size=2,  # Adjust based on GPU memory
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,  # Effective batch size = 2 * 4 = 8
    warmup_steps=50,
    learning_rate=2e-4,
    fp16=torch.cuda.is_available(),  # Use mixed precision if GPU available
    logging_steps=10,
    save_strategy="epoch",  # Same schedule as evaluation, needed to reload the best checkpoint
    load_best_model_at_end=True,
    save_total_limit=3,
    report_to="none",  # Set to "tensorboard" if you want to use TensorBoard
    push_to_hub=False,
    **{_eval_strategy_key: "epoch"},  # Evaluate once per epoch
)

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # We're doing causal LM, not masked LM
)

print("Training arguments configured!")
print(f"Output directory: {output_dir}")
print(f"Training epochs: {training_args.num_train_epochs}")
print(f"Batch size: {training_args.per_device_train_batch_size}")
print(f"Learning rate: {training_args.learning_rate}")

## 9. Initialize Trainer and Start Training

In [None]:
# Assemble the Trainer from the model, the training configuration, the
# train/validation datasets and the batch collator
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": val_dataset,
    "data_collator": data_collator,
}
trainer = Trainer(**trainer_kwargs)

print("Trainer initialized. Starting training...")
print("=" * 60)

# Run the fine-tuning loop
trainer.train()

# Closing banner
print()
print("=" * 60)
print("Training completed!")