# Mamba 370M Training on Kaggle
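Enable a GPU before running: Settings > Accelerator > GPU T4 x2 or P100. The training loop below runs on a single GPU, so with "T4 x2" the second card stays idle.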

In [None]:
# Install the third-party packages this notebook depends on.
# This is a notebook shell magic (leading "!"); -q reduces pip's output.
!pip install -q mamba-ssm causal-conv1d transformers datasets tqdm einops

In [None]:
import torch
# Sanity check of the accelerator: report whether CUDA is usable and, if so,
# the name and total memory (in GB, base 1e9) of device 0.
# Without CUDA the last line prints an empty string.
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")
print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB" if torch.cuda.is_available() else "")

In [None]:
# Upload your training data to Kaggle as a dataset, then add it to this notebook.
# The path will be something like: /kaggle/input/your-dataset-name/D_ad_train.csv
# The CSV must provide `input` and `output` columns: format_data reads both
# fields from every row.
DATASET_PATH = "/kaggle/input/your-dataset-name/D_ad_train.csv"  # UPDATE THIS

In [None]:
# Config
# Single source of settings for the cells below: "model" names the checkpoint
# and the save directory, "training" holds the hyper-parameters read by
# train_model.
config = {
    "model": {
        "name": "state-spaces/mamba-370m",  # checkpoint id loaded by the model cell
        "new_model": "mamba-370m-finetuned"  # sub-directory of output_dir the result is saved to
    },
    "training": {
        "output_dir": "/kaggle/working/results",  # parent directory for the saved model
        "num_train_epochs": 5,  # full passes over the training split
        "per_device_train_batch_size": 4,  # DataLoader batch size during training
        "per_device_eval_batch_size": 4,  # not read by any cell visible in this notebook
        "learning_rate": 1e-5,  # initial AdamW learning rate
        "weight_decay": 0.1,  # AdamW weight decay
        "max_grad_norm": 1.0,  # gradient-norm clipping threshold
        "max_length": 1024  # tokenizer truncation length, in tokens
    }
}

In [None]:
from datasets import load_dataset

def load_data_with_split(dataset_path, test_size=0.1, val_size=0.1, seed=42):
    """Load a CSV file and carve it into train / validation / test splits.

    The file is split twice with the same seed: first the combined
    ``test_size + val_size`` fraction is held out from training, then that
    hold-out is divided so the test split keeps ``test_size`` of the whole.

    Args:
        dataset_path: Path to the CSV file.
        test_size: Fraction of all rows for the test split.
        val_size: Fraction of all rows for the validation split.
        seed: Seed passed to both split calls.

    Returns:
        Dict with the keys ``'train'``, ``'validation'`` and ``'test'``.
    """
    full = load_dataset('csv', data_files=dataset_path, split='train')

    # First cut: everything that is not training data.
    holdout_fraction = test_size + val_size
    outer = full.train_test_split(test_size=holdout_fraction, seed=seed)

    # Second cut: share of the hold-out that becomes the test split.
    test_share = test_size / holdout_fraction
    inner = outer['test'].train_test_split(test_size=test_share, seed=seed)

    splits = {}
    splits['train'] = outer['train']
    splits['validation'] = inner['train']
    splits['test'] = inner['test']
    return splits

def format_data(row):
    """Render one example as a single instruction-style training string.

    Args:
        row: Mapping with ``'input'`` (the prompt) and ``'output'`` (the answer).

    Returns:
        Dict holding the rendered string under ``'formatted_text'``.
    """
    formatted_text = f"<s> [INST] {row['input']} [/INST] {row['output']} </s>"
    return dict(formatted_text=formatted_text)

# Load and split the CSV, then add the prompt-formatted text column to each split.
dataset_dict = load_data_with_split(DATASET_PATH)
for split, subset in list(dataset_dict.items()):
    dataset_dict[split] = subset.map(format_data)

# Report how many examples landed in each split.
print(f"Train: {len(dataset_dict['train'])}, Val: {len(dataset_dict['validation'])}, Test: {len(dataset_dict['test'])}")

In [None]:
from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel
from transformers import AutoTokenizer

# The state-spaces/mamba-* checkpoints publish model weights and config only,
# so the tokenizer cannot be loaded from the model repo. The Mamba reference
# code pairs these checkpoints with the GPT-NeoX tokenizer; set
# config["model"]["tokenizer_name"] to use a different one.
model_name = config["model"]["name"]
tokenizer_name = config["model"].get("tokenizer_name", "EleutherAI/gpt-neox-20b")
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
# Reuse EOS as the padding token so variable-length batches can be padded.
tokenizer.pad_token = tokenizer.eos_token

model = MambaLMHeadModel.from_pretrained(model_name)
print(f"Model loaded: {sum(p.numel() for p in model.parameters()) / 1e6:.1f}M parameters")

In [None]:
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
from tqdm import tqdm
import os

def collate_fn(batch, tokenizer, max_length):
    """Tokenize a batch of examples and build language-modelling labels.

    Args:
        batch: Sequence of items, each with a ``"formatted_text"`` string.
        tokenizer: Callable tokenizer; called with padding and truncation
            enabled and ``return_tensors="pt"``.
        max_length: Truncation length in tokens.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry: a copy of
        ``input_ids`` in which padded positions are set to -100.
    """
    texts = [item["formatted_text"] for item in batch]
    encodings = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt")

    labels = encodings["input_ids"].clone()
    # Exclude padding from the loss (-100 is cross_entropy's default
    # ignore_index). The mask comes from attention_mask rather than the pad
    # token id because the pad token is the EOS token here, and the genuine
    # end-of-sequence token must stay a training target.
    if "attention_mask" in encodings:
        labels[encodings["attention_mask"] == 0] = -100
    encodings["labels"] = labels
    return encodings

def train_model(model, tokenizer, train_dataset, config):
    """Fine-tune ``model`` with a next-token loss, then save model and tokenizer.

    Args:
        model: Language model whose forward pass accepts ``input_ids`` and
            returns an object exposing ``.logits``.
        tokenizer: Tokenizer passed to ``collate_fn`` and saved next to the model.
        train_dataset: Dataset whose items carry a ``"formatted_text"`` field.
        config: Nested settings dict. Reads ``config["training"]``
            (``per_device_train_batch_size``, ``num_train_epochs``,
            ``max_length``, ``learning_rate``, ``weight_decay``,
            ``max_grad_norm``, ``output_dir``) and ``config["model"]["new_model"]``.

    Returns:
        The trained model, left on the device used for training.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    tc = config["training"]
    batch_size = tc["per_device_train_batch_size"]
    num_epochs = tc["num_train_epochs"]
    max_length = tc["max_length"]
    ignore_index = -100  # label value excluded from the loss

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=lambda b: collate_fn(b, tokenizer, max_length)
    )

    optimizer = AdamW(model.parameters(), lr=tc["learning_rate"], weight_decay=tc["weight_decay"])
    # The scheduler is stepped once per batch, so one cosine cycle spans the whole run.
    scheduler = CosineAnnealingLR(optimizer, T_max=len(train_loader) * num_epochs)

    model.train()
    for epoch in range(num_epochs):
        pbar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}")
        for batch in pbar:
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)

            # Padding must not contribute to the loss. Mask by attention_mask
            # rather than by token id: the pad token may be the EOS token, and
            # the real EOS should remain a target. This is a no-op when the
            # labels were already masked upstream.
            if "attention_mask" in batch:
                pad_positions = batch["attention_mask"].to(device) == 0
                labels = labels.masked_fill(pad_positions, ignore_index)

            outputs = model(input_ids)
            logits = outputs.logits

            # Shift by one: the logits at position t are scored against token t + 1.
            loss = torch.nn.functional.cross_entropy(
                logits[:, :-1, :].reshape(-1, logits.size(-1)),
                labels[:, 1:].reshape(-1),
                ignore_index=ignore_index
            )

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), tc.get("max_grad_norm", 1.0))
            optimizer.step()
            scheduler.step()

            pbar.set_postfix({"loss": f"{loss.item():.4f}"})

    # Save
    os.makedirs(tc["output_dir"], exist_ok=True)
    save_path = os.path.join(tc["output_dir"], config["model"]["new_model"])
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    print(f"Model saved to {save_path}")
    return model

In [None]:
# Train
# Runs the full fine-tuning loop on the training split only and rebinds `model`
# to the returned, trained model. The validation and test splits are not used here.
model = train_model(model, tokenizer, dataset_dict['train'], config)

In [None]:
# Zip and download
# Pack everything under the training output directory into one zip file in
# /kaggle/working, then print the archive's path.
import shutil
shutil.make_archive('/kaggle/working/mamba-370m-finetuned', 'zip', config['training']['output_dir'])
print("Download: /kaggle/working/mamba-370m-finetuned.zip")