In [5]:
# Q-LoRA Fine-tuning BGE-M3 trên Kaggle với 2x T4 GPU
import os
import torch
import pandas as pd
import numpy as np
from datasets import Dataset


from transformers import (
    AutoTokenizer, 
    AutoModel,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig
)
from peft import (
    LoraConfig, 
    get_peft_model, 
    TaskType,
    prepare_model_for_kbit_training
)
from torch.utils.data import DataLoader
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity

# Cấu hình 4-bit quantization cho Q-LoRA
def get_bnb_config():
    """Build the bitsandbytes quantization settings used for Q-LoRA.

    Returns:
        A ``BitsAndBytesConfig`` requesting 4-bit NF4 weights with double
        quantization and float16 as the compute dtype.
    """
    quantization_options = {
        "load_in_4bit": True,                     # enable 4-bit loading
        "bnb_4bit_use_double_quant": True,        # double quantization
        "bnb_4bit_quant_type": "nf4",             # NormalFloat4 quantization
        "bnb_4bit_compute_dtype": torch.float16,  # compute dtype
    }
    return BitsAndBytesConfig(**quantization_options)

# Cấu hình LoRA
def get_lora_config():
    """Build the LoRA adapter configuration.

    Returns:
        A ``LoraConfig`` with rank 16, alpha 32 (twice the rank), dropout 0.1,
        no bias training, and the feature-extraction task type. Adapters are
        requested for modules named ``query``, ``key``, ``value`` and ``dense``.
    """
    rank = 16
    adapted_modules = ["query", "key", "value", "dense"]

    return LoraConfig(
        r=rank,
        lora_alpha=2 * rank,  # scaling factor, kept at 2 * r
        target_modules=adapted_modules,
        lora_dropout=0.1,
        bias="none",  # bias terms are not trained
        task_type=TaskType.FEATURE_EXTRACTION,
    )

# Load model với Q-LoRA
def load_model_with_qlora(model_name="BAAI/bge-m3"):
    """Load a 4-bit quantized model and wrap it with LoRA adapters.

    Args:
        model_name: Checkpoint identifier passed to both ``from_pretrained``
            calls (tokenizer and model).

    Returns:
        A ``(model, tokenizer)`` tuple; ``model`` is the object returned by
        ``get_peft_model``.
    """
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    print("Loading model with 4-bit quantization...")
    load_kwargs = dict(
        quantization_config=get_bnb_config(),
        torch_dtype=torch.float16,
        device_map="auto",  # let the loader decide device placement
        trust_remote_code=True,
    )
    quantized = AutoModel.from_pretrained(model_name, **load_kwargs)

    print("Preparing model for k-bit training...")
    kbit_ready = prepare_model_for_kbit_training(quantized)

    print("Adding LoRA adapters...")
    peft_model = get_peft_model(kbit_ready, get_lora_config())

    # Report how many parameters are trainable after wrapping.
    peft_model.print_trainable_parameters()

    return peft_model, tokenizer

# Custom Dataset class cho sentence similarity
class SentencePairDataset:
    """Map-style dataset of tokenized sentence pairs with a similarity label.

    Each item is a dict with the keys ``input_ids_1``, ``attention_mask_1``,
    ``input_ids_2``, ``attention_mask_2`` and ``labels``.

    Args:
        sentences1: Indexable collection holding the first sentence of each pair.
        sentences2: Indexable collection holding the second sentence of each pair.
        labels: Indexable collection of numeric similarity targets.
        tokenizer: Callable accepting ``(text, truncation=..., padding=...,
            max_length=..., return_tensors='pt')`` and returning a mapping with
            ``input_ids`` and ``attention_mask`` tensors.
        max_length: Length every sentence is padded/truncated to.
    """

    def __init__(self, sentences1, sentences2, labels, tokenizer, max_length=512):
        self.sentences1 = sentences1
        self.sentences2 = sentences2
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of pairs (the length of ``sentences1``)."""
        return len(self.sentences1)

    def _encode(self, text):
        """Tokenize one sentence with the dataset's fixed padding settings."""
        return self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

    def __getitem__(self, idx):
        """Return the tokenized pair at ``idx`` together with its label."""
        encoding1 = self._encode(self.sentences1[idx])
        encoding2 = self._encode(self.sentences2[idx])

        # squeeze(0) removes only the leading axis (expected to be the size-1
        # batch axis from return_tensors='pt'). A bare squeeze() would also
        # collapse the sequence axis whenever it has length 1, yielding
        # 0-dim tensors that cannot be collated into a (batch, seq) batch.
        return {
            'input_ids_1': encoding1['input_ids'].squeeze(0),
            'attention_mask_1': encoding1['attention_mask'].squeeze(0),
            'input_ids_2': encoding2['input_ids'].squeeze(0),
            'attention_mask_2': encoding2['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.float),
        }

# Custom Trainer cho sentence similarity
class SentenceSimilarityTrainer(Trainer):
    """Trainer whose loss is the MSE between pair cosine similarity and labels.

    Each batch must provide ``input_ids_1``, ``attention_mask_1``,
    ``input_ids_2``, ``attention_mask_2`` and ``labels``.
    """

    def __init__(self, model, **kwargs):
        # The base class stores the model itself; re-assigning self.model here
        # was redundant and would overwrite whatever the base class set up.
        super().__init__(model=model, **kwargs)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the similarity regression loss for one batch.

        Args:
            model: Model to run; called with ``input_ids`` and
                ``attention_mask`` and expected to expose ``last_hidden_state``.
            inputs: Mapping with the five keys listed on the class.
            return_outputs: When true, also return the predicted similarities.
            num_items_in_batch: Accepted for compatibility with Trainer
                versions that pass it to ``compute_loss``; unused here.

        Returns:
            The scalar loss, or ``(loss, {'cosine_sim': ...})`` when
            ``return_outputs`` is true.
        """
        embeddings1 = self._embed(model, inputs['input_ids_1'], inputs['attention_mask_1'])
        embeddings2 = self._embed(model, inputs['input_ids_2'], inputs['attention_mask_2'])

        cosine_sim = F.cosine_similarity(embeddings1, embeddings2, dim=1)

        # Align the label dtype with the predictions so the MSE does not mix
        # precisions when the forward pass runs in reduced precision.
        labels = inputs['labels'].to(cosine_sim.dtype)
        loss = F.mse_loss(cosine_sim, labels)

        return (loss, {'cosine_sim': cosine_sim}) if return_outputs else loss

    def _embed(self, model, input_ids, attention_mask):
        """Run the model on one side of the pair and return unit-norm pooled embeddings."""
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.mean_pooling(outputs.last_hidden_state, attention_mask)
        return F.normalize(pooled, p=2, dim=1)

    def mean_pooling(self, token_embeddings, attention_mask):
        """Average token embeddings over dim 1, weighted by the attention mask.

        The divisor is clamped to 1e-9 so an all-zero mask cannot divide by zero.
        """
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask

# Tạo dữ liệu mẫu cho training
def create_sample_data():
    """Return a toy sentence-similarity dataset.

    Five hand-written (sentence, paraphrase, score) triples are repeated
    100 times each, giving 500 rows. Real projects would load data from a file.

    Returns:
        ``(sentences1, sentences2, labels)`` as three parallel lists; labels
        are similarity scores in the 0-1 range.
    """
    repeats = 100  # replicate the seed rows to get enough data
    seed_rows = [
        ("The cat is sleeping on the sofa.", "A cat is resting on the couch.", 0.8),
        ("I love programming in Python.", "Python programming is my favorite.", 0.9),
        ("The weather is nice today.", "Today has beautiful weather.", 0.85),
        ("Machine learning is fascinating.", "AI and ML are very interesting.", 0.75),
        ("The book is on the table.", "There's a book on the desk.", 0.7),
    ]

    # Transpose the rows into three columns, then tile each column.
    sentences1, sentences2, labels = (
        list(column) * repeats for column in zip(*seed_rows)
    )
    return sentences1, sentences2, labels

# Hàm đánh giá model
def evaluate_model(model, tokenizer, test_sentences1, test_sentences2, test_labels):
    """Score sentence pairs with the model and compare against the labels.

    Each sentence is tokenized on its own, mean-pooled over the attention
    mask, L2-normalized, and the pair's cosine similarity is the prediction.

    Args:
        model: Callable model exposing ``eval()``, ``device`` and returning an
            object with ``last_hidden_state``.
        tokenizer: Callable returning a mapping of tensors that includes
            ``attention_mask``.
        test_sentences1: First sentence of every pair.
        test_sentences2: Second sentence of every pair.
        test_labels: Reference similarity per pair.

    Returns:
        ``(correlation, mse, predictions)`` where ``correlation`` is the
        ``np.corrcoef`` coefficient between predictions and labels, ``mse`` is
        the mean squared error, and ``predictions`` is a list of floats.
    """

    def unit_embedding(sentence):
        # Tokenize a single sentence and move its tensors to the model device.
        batch = tokenizer(sentence, return_tensors='pt',
                          truncation=True, padding=True, max_length=512)
        batch = {name: tensor.to(model.device) for name, tensor in batch.items()}

        hidden = model(**batch).last_hidden_state

        # Mask-weighted mean over dim 1; the clamp guards against a zero divisor.
        weights = batch['attention_mask'].unsqueeze(-1).expand(hidden.size()).float()
        totals = torch.sum(hidden * weights, 1)
        counts = torch.clamp(weights.sum(1), min=1e-9)
        return F.normalize(totals / counts, p=2, dim=1)

    model.eval()
    predictions = []

    with torch.no_grad():
        for first, second in zip(test_sentences1, test_sentences2):
            left = unit_embedding(first)
            right = unit_embedding(second)
            similarity = F.cosine_similarity(left, right, dim=1)
            predictions.append(similarity.cpu().item())

    # Agreement with the ground truth.
    correlation = np.corrcoef(predictions, test_labels)[0, 1]
    squared_errors = (np.array(predictions) - np.array(test_labels)) ** 2
    mse = np.mean(squared_errors)

    return correlation, mse, predictions

# Main training function
def main():
    """Fine-tune the model on the sample data, save it, and print an evaluation.

    Steps: load the quantized model with LoRA adapters, build the sample
    sentence pairs, split them 80/20, train with ``SentenceSimilarityTrainer``,
    save the adapter and tokenizer to ``./bge-m3-qlora-finetuned``, then report
    correlation/MSE on the held-out slice and print a few predictions.
    """
    # Imported locally because the file's top-level import block does not
    # bring this name into scope.
    from transformers import default_data_collator

    print("🚀 Bắt đầu Q-LoRA fine-tuning BGE-M3...")

    model, tokenizer = load_model_with_qlora()

    print("📊 Tạo training data...")
    sentences1, sentences2, labels = create_sample_data()

    # 80/20 split. NOTE: the sample data repeats the same five pairs, so the
    # held-out slice overlaps the training slice; the metrics printed below
    # are only a smoke test until real data is used.
    split_idx = int(0.8 * len(sentences1))
    train_s1, test_s1 = sentences1[:split_idx], sentences1[split_idx:]
    train_s2, test_s2 = sentences2[:split_idx], sentences2[split_idx:]
    train_labels, test_labels = labels[:split_idx], labels[split_idx:]

    train_dataset = SentencePairDataset(train_s1, train_s2, train_labels, tokenizer)

    num_train_epochs = 3
    per_device_train_batch_size = 4   # small because GPU memory is limited
    gradient_accumulation_steps = 4   # raises the effective batch size

    # The previous fixed warmup of 100 steps exceeded the whole run (about 75
    # optimizer steps on the sample data), so the learning rate never reached
    # its configured value. Use ~10% of the estimated steps, capped at 100.
    updates_per_epoch = max(
        1, len(train_dataset) // (per_device_train_batch_size * gradient_accumulation_steps)
    )
    total_updates = updates_per_epoch * num_train_epochs
    warmup_steps = min(100, max(1, total_updates // 10))

    # `evaluation_strategy` is intentionally not passed: evaluation is off by
    # default and the argument name is not accepted by every library version.
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_steps=warmup_steps,
        learning_rate=2e-4,                 # higher learning rate for LoRA
        fp16=True,                          # mixed precision training
        logging_steps=10,
        save_strategy="epoch",
        dataloader_num_workers=2,           # parallel data loading
        remove_unused_columns=False,        # keep the custom pair columns
        report_to="none",                   # the string "none" disables wandb/tensorboard
    )

    # The tokenizer is deliberately not handed to the trainer: when one is
    # given and no collator is set, the trainer pads batches through the
    # tokenizer, which expects an `input_ids` key that this dataset's
    # `input_ids_1` / `input_ids_2` layout does not have. Items are already
    # padded to a fixed length, so the plain default collator is enough.
    trainer = SentenceSimilarityTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=default_data_collator,
    )

    print("🔥 Bắt đầu training...")
    trainer.train()

    print("💾 Saving model...")
    output_path = "./bge-m3-qlora-finetuned"
    trainer.save_model(output_path)
    # Save the tokenizer explicitly since the trainer no longer holds it.
    tokenizer.save_pretrained(output_path)

    print("📈 Evaluating model...")
    correlation, mse, predictions = evaluate_model(model, tokenizer, test_s1, test_s2, test_labels)

    print("✅ Training hoàn thành!")
    print(f"📊 Correlation với ground truth: {correlation:.4f}")
    print(f"📊 MSE: {mse:.4f}")

    # Show up to five example predictions.
    print("\n🔍 Một vài ví dụ predictions:")
    examples = zip(test_s1[:5], test_s2[:5], test_labels[:5], predictions[:5])
    for first, second, truth, predicted in examples:
        print(f"Câu 1: {first}")
        print(f"Câu 2: {second}")
        print(f"Ground truth: {truth:.3f}")
        print(f"Prediction: {predicted:.3f}")
        print("-" * 50)

# Hàm để load model đã fine-tune và sử dụng
def load_finetuned_model(model_path="./bge-m3-qlora-finetuned", base_model_name="BAAI/bge-m3"):
    """Load the quantized base model and attach saved LoRA weights.

    Args:
        model_path: Directory holding the saved LoRA adapter.
        base_model_name: Checkpoint the adapter was trained on. Defaults to
            the checkpoint that ``load_model_with_qlora`` uses by default, so
            existing calls behave as before; pass the same value that was used
            for training when a different base model was fine-tuned.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    from peft import PeftModel

    print("Loading fine-tuned model...")

    # Base model, loaded with the same 4-bit quantization used for training.
    base_model = AutoModel.from_pretrained(
        base_model_name,
        quantization_config=get_bnb_config(),
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )

    # Attach the LoRA weights on top of the base model.
    model = PeftModel.from_pretrained(base_model, model_path)

    tokenizer = AutoTokenizer.from_pretrained(base_model_name)

    return model, tokenizer

# Hàm để encode text thành embeddings
def encode_text(model, tokenizer, texts, batch_size=8):
    """Encode texts into L2-normalized, mean-pooled embeddings.

    Args:
        model: Callable model exposing ``eval()``, ``device`` and returning an
            object with ``last_hidden_state``.
        tokenizer: Callable taking a list of strings and returning a mapping
            of tensors that includes ``attention_mask``.
        texts: A sequence of strings, or a single string (treated as a
            one-element batch rather than being sliced into characters).
        batch_size: Number of texts encoded per forward pass.

    Returns:
        A NumPy array with one row per input text. An empty input yields an
        empty 1-D array.
    """
    # A bare string would otherwise be sliced into character chunks below.
    if isinstance(texts, str):
        texts = [texts]

    model.eval()
    batches = []

    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            # list() so array-like slices (e.g. NumPy arrays) reach the
            # tokenizer as a plain list of strings.
            batch_texts = list(texts[start:start + batch_size])

            encoded = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors='pt',
            )
            encoded = {k: v.to(model.device) for k, v in encoded.items()}

            hidden = model(**encoded).last_hidden_state

            # Mask-weighted mean over dim 1; the clamp avoids a zero divisor.
            mask = encoded['attention_mask'].unsqueeze(-1).expand(hidden.size()).float()
            pooled = torch.sum(hidden * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)

            batches.append(F.normalize(pooled, p=2, dim=1).cpu().numpy())

    if not batches:
        return np.array([])
    return np.concatenate(batches, axis=0)

if __name__ == "__main__":
    # Fix the random seeds for reproducibility.
    seed = 42
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Run the full training pipeline.
    main()

    # Closing banner with usage hints.
    divider = "=" * 60
    print("\n" + divider)
    for line in (
        "🎉 Q-LoRA Fine-tuning hoàn thành!",
        "📁 Model đã được save tại: ./bge-m3-qlora-finetuned",
        "🔧 Để sử dụng model: model, tokenizer = load_finetuned_model()",
    ):
        print(line)
    print(divider)

🚀 Bắt đầu Q-LoRA fine-tuning BGE-M3...
Loading tokenizer...
Loading model with 4-bit quantization...


ImportError: Using `bitsandbytes` 4-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`