# AgriFinConnect-Rwanda: Multilingual Loan Chatbot Training

This notebook trains a **Llama-based** AI chatbot for AgriFinConnect-Rwanda that:
- **Guides users through loan applications**
- **Explains loan terms and repayment schedules**
- **Provides basic financial literacy support**
- **Responds in Kinyarwanda, English, and French**

## Pipeline Overview
1. **Load** the Bitext mortgage/loans dataset (English)
2. **Translate** to Kinyarwanda and French using NLLB (No Language Left Behind)
3. **Combine** multilingual dataset (EN + RW + FR)
4. **Fine-tune** Llama with LoRA/PEFT on the combined data

## Requirements
- GPU recommended (Colab T4/V100 or local CUDA GPU)
- Hugging Face account (for Llama model access - accept Meta's license at https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct)

## 1. Install Dependencies

In [1]:
# Install required packages (run once, restart kernel if needed)
!pip install -q transformers datasets accelerate peft bitsandbytes pandas torch sentencepiece


[notice] A new release of pip is available: 25.2 -> 26.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Imports and configuration
import os
import pandas as pd
import torch
from pathlib import Path

# Configuration - adjust these as needed
DATA_PATH = "Bitext-mortgage-loans-llm-chatbot-training-dataset/bitext-mortgage-loans-llm-chatbot-training-dataset.csv"
OUTPUT_DIR = "agrifinconnect_multilingual_dataset"
TRANSLATED_DATA_PATH = "agrifinconnect_multilingual_dataset/multilingual_loan_dataset.csv"

# Translation: use SUBSET_SIZE for quick testing, None for full dataset
SUBSET_SIZE = 2000  # Set to None to translate full ~37K rows (takes several hours)
TRANSLATION_BATCH_SIZE = 8
NLLB_MODEL = "facebook/nllb-200-distilled-600M"  # Lighter; use nllb-200-distilled-1.3B for better quality

# Training
LLAMA_MODEL = "meta-llama/Llama-3.2-3B-Instruct"  # Requires HF login + Meta license
OUTPUT_MODEL_DIR = "agrifinconnect-loan-chatbot"
TRAINING_EPOCHS = 2
LEARNING_RATE = 2e-5
MAX_SEQ_LENGTH = 1024

# Create output directory
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
print(f"GPU available: {torch.cuda.is_available()}")

GPU available: False


## 2. Load Original Dataset

In [3]:
# Load the Bitext mortgage/loans dataset
df = pd.read_csv(DATA_PATH, on_bad_lines='skip')
print(f"Loaded {len(df)} rows")
print(df.columns.tolist())

# Use subset for faster iteration (optional)
if SUBSET_SIZE is not None:
    df = df.sample(n=min(SUBSET_SIZE, len(df)), random_state=42).reset_index(drop=True)
    print(f"Using subset of {len(df)} rows for translation")

# Keep only needed columns
df = df[['system_prompt', 'instruction', 'intent', 'category', 'tags', 'response']]
df.head(2)

Loaded 36778 rows
['system_prompt', 'instruction', 'intent', 'category', 'tags', 'response']
Using subset of 2000 rows for translation


Unnamed: 0,system_prompt,instruction,intent,category,tags,response
0,You are an expert in customer support for mort...,i got to speak with a fucking operator,human_agent,CONTACT,BLQW,I apologize for any frustration or confusion y...
1,You are an expert in customer support for mort...,I want to take out a home loan with my partner,apply_for_joint_loan,LOAN_APPLICATION,BL,Unquestionably! Applying for a home loan with ...


## 3. Translate Dataset to Kinyarwanda and French

Using **NLLB (No Language Left Behind)** from Meta - supports 200+ languages including Kinyarwanda (`kin_Latn`) and French (`fra_Latn`).

In [4]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load NLLB translation model
print("Loading NLLB translation model...")
translation_tokenizer = AutoTokenizer.from_pretrained(NLLB_MODEL)
translation_model = AutoModelForSeq2SeqLM.from_pretrained(NLLB_MODEL)
if torch.cuda.is_available():
    translation_model = translation_model.cuda()
translation_model.eval()
print("NLLB model loaded.")

  from .autonotebook import tqdm as notebook_tqdm


Loading NLLB translation model...


Loading weights: 100%|██████████| 512/512 [00:01<00:00, 287.98it/s, Materializing param=model.shared.weight]                                   


NLLB model loaded.


In [5]:
def translate_batch(texts, src_lang="eng_Latn", tgt_lang="fra_Latn", max_length=512):
    """Translate a batch of texts using NLLB."""
    if not texts or all(not t or (isinstance(t, float) and pd.isna(t)) for t in texts):
        return [""] * len(texts)
    
    # Clean texts
    texts = [str(t).strip() if pd.notna(t) and str(t).strip() else "" for t in texts]
    
    translation_tokenizer.src_lang = src_lang
    forced_bos_id = translation_tokenizer.convert_tokens_to_ids(tgt_lang)
    
    inputs = translation_tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
    )
    
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = translation_model.generate(
            **inputs,
            forced_bos_token_id=forced_bos_id,
            max_length=max_length,
            num_beams=2,
        )
    
    return translation_tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [6]:
def translate_column(df, column, tgt_lang, batch_size=TRANSLATION_BATCH_SIZE):
    """Translate a column in batches."""
    results = []
    n = len(df)
    for i in range(0, n, batch_size):
        batch = df[column].iloc[i:i + batch_size].tolist()
        translated = translate_batch(batch, src_lang="eng_Latn", tgt_lang=tgt_lang)
        results.extend(translated)
        if (i + batch_size) % 100 == 0 or i + batch_size >= n:
            print(f"  Translated {min(i + batch_size, n)}/{n} rows...")
    return results

In [7]:
# Translate to Kinyarwanda (kin_Latn)
print("Translating to Kinyarwanda...")
df["instruction_kin"] = translate_column(df, "instruction", "kin_Latn")
df["response_kin"] = translate_column(df, "response", "kin_Latn")
print("Kinyarwanda translation done.")

Translating to Kinyarwanda...
  Translated 200/2000 rows...
  Translated 400/2000 rows...
  Translated 600/2000 rows...
  Translated 800/2000 rows...
  Translated 1000/2000 rows...
  Translated 1200/2000 rows...
  Translated 1400/2000 rows...
  Translated 1600/2000 rows...
  Translated 1800/2000 rows...
  Translated 2000/2000 rows...
  Translated 200/2000 rows...
  Translated 400/2000 rows...
  Translated 600/2000 rows...
  Translated 800/2000 rows...
  Translated 1000/2000 rows...
  Translated 1200/2000 rows...


KeyboardInterrupt: 

In [None]:
# Translate to French (fra_Latn)
print("Translating to French...")
df["instruction_fra"] = translate_column(df, "instruction", "fra_Latn")
df["response_fra"] = translate_column(df, "response", "fra_Latn")
print("French translation done.")

**Optional:** To skip translation (if you already ran it once and saved), run the cell below instead of cells 10-11:

In [None]:
# SKIP TRANSLATION: Load pre-translated data (run this instead of translation + combine cells)
# df_multilingual = pd.read_csv(TRANSLATED_DATA_PATH)
# print(f"Loaded {len(df_multilingual)} rows. Skip to '5. Format Dataset for Llama Fine-Tuning'.")

## 4. Combine Multilingual Dataset

Create one dataset with English, Kinyarwanda, and French instruction-response pairs.

In [None]:
# Build multilingual dataset: EN + Kinyarwanda + French
SYSTEM_PROMPT = "You are an expert in customer support for mortgage and loans. Guide users through loan applications, explain loan terms and repayment schedules, and provide financial literacy support. Respond in the same language as the user (English, Kinyarwanda, or French)."

records = []

# English
for _, row in df.iterrows():
    records.append({
        "language": "en",
        "system_prompt": SYSTEM_PROMPT,
        "instruction": row["instruction"],
        "response": row["response"],
        "intent": row["intent"],
        "category": row["category"],
    })

# Kinyarwanda
for _, row in df.iterrows():
    records.append({
        "language": "kin",
        "system_prompt": SYSTEM_PROMPT,
        "instruction": row["instruction_kin"],
        "response": row["response_kin"],
        "intent": row["intent"],
        "category": row["category"],
    })

# French
for _, row in df.iterrows():
    records.append({
        "language": "fr",
        "system_prompt": SYSTEM_PROMPT,
        "instruction": row["instruction_fra"],
        "response": row["response_fra"],
        "intent": row["intent"],
        "category": row["category"],
    })

df_multilingual = pd.DataFrame(records)
df_multilingual.to_csv(TRANSLATED_DATA_PATH, index=False)
print(f"Multilingual dataset: {len(df_multilingual)} rows (EN: {len(df)}, KIN: {len(df)}, FR: {len(df)})")
df_multilingual.sample(3)

## 5. Format Dataset for Llama Fine-Tuning

Convert to chat format compatible with Llama 3.2.

In [None]:
from datasets import Dataset

def format_chat(example):
    """Format as Llama 3.2 chat: <|begin_of_text|><|start_header_id|>system<|end_header_id|>..."""
    return {
        "text": f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{example['system_prompt']}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{example['instruction']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{example['response']}<|eot_id|>"
    }

# Convert to HuggingFace Dataset
hf_dataset = Dataset.from_pandas(df_multilingual[["system_prompt", "instruction", "response"]])
hf_dataset = hf_dataset.map(format_chat, remove_columns=["system_prompt", "instruction", "response"])
print(f"Dataset size: {len(hf_dataset)}")
print("Sample formatted text (first 500 chars):")
print(hf_dataset[0]["text"][:500])

## 6. Load Llama Model and Tokenizer

**Note:** You must log in to Hugging Face and accept Meta's Llama license:
1. `huggingface-cli login`
2. Visit https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct and accept the license

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# 4-bit quantization for memory efficiency (fits on ~8GB GPU)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

print("Loading Llama model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(LLAMA_MODEL, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    LLAMA_MODEL,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
print("Model loaded.")

## 7. Configure LoRA and Prepare for Training

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# LoRA configuration
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
        padding="max_length",
        return_tensors=None,
    )

# Tokenize dataset
tokenized_dataset = hf_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
    desc="Tokenizing",
)

# Set labels for causal LM (labels = input_ids, with -100 for padding)
def set_labels(examples):
    examples["labels"] = examples["input_ids"].copy()
    return examples

tokenized_dataset = tokenized_dataset.map(set_labels, batched=True, desc="Setting labels")

## 8. Train the Model

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Set pad token
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_MODEL_DIR,
    num_train_epochs=TRAINING_EPOCHS,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=LEARNING_RATE,
    fp16=True,
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

print("Starting training...")
trainer.train()
print("Training complete.")

## 9. Save Model and Test Inference

In [None]:
# Save the fine-tuned model
trainer.save_model(OUTPUT_MODEL_DIR)
tokenizer.save_pretrained(OUTPUT_MODEL_DIR)
print(f"Model saved to {OUTPUT_MODEL_DIR}")

In [None]:
# Quick inference test
model.eval()
def chat(prompt, max_new_tokens=256):
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=True, temperature=0.7)
    return tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)

# Test in English
print("--- English ---")
print(chat("How do I apply for a loan?"))

# Test in Kinyarwanda (if you have Kinyarwanda samples)
print("\n--- Kinyarwanda ---")
print(chat("Nte nshobora gusaba icyangombwa?"))

# Test in French
print("\n--- French ---")
print(chat("Comment puis-je demander un prêt?"))