### Step 1: Environment Setup

**Description:** Install the necessary libraries for fine-tuning the Ministral 3 model, including Hugging Face `transformers`, `trl` for supervised fine-tuning, and `bitsandbytes` for memory optimization.

In [None]:
%%capture
!pip install -U transformers trl datasets accelerate bitsandbytes flash-attn mistral_common

### Step 2: Hardware and CUDA Verification

**Description:** Check the environment to ensure the GPU is recognized correctly and verify the available VRAM to determine if the setup can handle the model.

In [None]:
import torch

# Report the PyTorch build and whether a CUDA device is visible to it.
cuda_ready = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")

# Device details are only queried (for device index 0) when CUDA is usable.
if cuda_ready:
    gpu = torch.cuda.get_device_properties(0)
    print(f"GPU: {gpu.name}")
    print(f"Total VRAM: {gpu.total_memory / 1e9:.2f} GB")


### Step 3: Library Imports

**Description:** Load core modules for model architecture, dataset management, and the SFT (Supervised Fine-Tuning) trainer.

In [None]:
import torch
import gc
from datasets import load_dataset
from transformers import (
    AutoTokenizer, 
    Mistral3ForConditionalGeneration, 
    MistralCommonBackend
)
from trl import SFTTrainer, SFTConfig

### Step 4: Model and Tokenizer Initialization

**Description:** Load the model with `bfloat16` precision and configure the chat template. This step also includes freezing the vision-related parameters (vision tower and projector) to focus training strictly on the language capabilities.

In [None]:
model_id = "0xA50C1A1/Ministral-3-8B-Instruct-2512-BF16-Heretic"

# Options for loading the checkpoint: bfloat16 weights, the FlashAttention-2
# attention implementation, and automatic device mapping.
model_load_kwargs = dict(
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
    trust_remote_code=True,
)
model = Mistral3ForConditionalGeneration.from_pretrained(model_id, **model_load_kwargs)

# Load the tokenizer published under the same repository id as the model.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# Replace the tokenizer's chat template with a custom Jinja template:
#   - a system message, if it is the first message, is wrapped in
#     [SYSTEM_PROMPT]...[/SYSTEM_PROMPT] right after bos_token;
#   - user turns are wrapped in [INST]...[/INST];
#   - assistant turns are emitted followed by eos_token, inside
#     generation/endgeneration markers;
#   - messages with any other role produce no output.
# NOTE(review): the generation markers are presumably what `assistant_only_loss=True`
# in the SFT config uses to restrict the loss to assistant tokens — confirm against TRL docs.
tokenizer.chat_template = """
{%- if messages[0]['role'] == 'system' -%}
    {{- bos_token + '[SYSTEM_PROMPT]' + messages[0]['content'] + '[/SYSTEM_PROMPT]' -}}
    {%- set loop_messages = messages[1:] -%}
{%- else -%}
    {{- bos_token -}}
    {%- set loop_messages = messages -%}
{%- endif -%}

{%- for message in loop_messages -%}
    {%- if message['role'] == 'user' -%}
        {{- '[INST]' + message['content'] + '[/INST]' -}}
    {%- elif message['role'] == 'assistant' -%}
        {%- generation -%}
            {{- message['content'] + eos_token -}}
        {%- endgeneration -%}
    {%- endif -%}
{%- endfor -%}

{%- if add_generation_prompt -%}
    {%- generation -%}{%- endgeneration -%}
{%- endif -%}
""".strip()

# Exclude the vision tower and the multimodal projector from training.
for frozen_module in (model.model.vision_tower, model.model.multi_modal_projector):
    for weight in frozen_module.parameters():
        weight.requires_grad = False

# Tally the parameters that still require gradients after freezing.
trainable_params = 0
for weight in model.parameters():
    if weight.requires_grad:
        trainable_params += weight.numel()
print(f"Model loaded. Trainable parameters: {trainable_params/1e6:.2f}M")

### Step 5: Dataset Loading

**Description:** Load the training dataset from the Hugging Face Hub. The samples are shuffled with a fixed seed so that the training order is randomized but reproducible across runs.

In [None]:
# Fetch the "train" split, then shuffle it with a fixed seed for a reproducible order.
train_split = load_dataset("0xA50C1A1/rp_dataset", split="train")
dataset = train_split.shuffle(seed=3407)
print(f"Dataset loaded: {len(dataset)} samples ready.")

### Step 6: SFT Trainer Configuration

**Description:** Set up the fine-tuning hyperparameters with `SFTConfig`. The configuration uses sequence packing and gradient checkpointing to reduce memory use, and NEFTune noise for better generalization.

In [None]:
sft_config = SFTConfig(
    # Run bookkeeping
    output_dir="outputs",
    seed=3407,
    logging_steps=5,
    report_to="none",

    # Data handling
    dataset_text_field="messages",
    max_length=8192,
    packing=True,  # See https://github.com/huggingface/trl/pull/3749
    padding_free=True,
    assistant_only_loss=True,

    # Batch size and schedule
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    lr_scheduler_type="cosine",
    # NOTE(review): a value below 1 is presumably meant as a fraction of the total
    # steps; confirm the installed transformers version interprets it that way.
    warmup_steps=0.1,

    # Optimizer
    optim="adamw_torch_fused",
    weight_decay=0.05,
    max_grad_norm=1.0,

    # Precision and memory
    bf16=True,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},

    # Regularization
    neftune_noise_alpha=5,
)

# Bind the model, tokenizer and dataset to the configuration above.
trainer = SFTTrainer(
    args=sft_config,
    model=model,
    processing_class=tokenizer,
    train_dataset=dataset,
)

### Step 7: Training Execution

**Description:** Start the training loop. Before training, we run a manual garbage collection and clear the CUDA cache to reduce the risk of Out-Of-Memory (OOM) errors at the start of the run.

In [None]:
# Free unreferenced Python objects, then release PyTorch's cached CUDA memory,
# so training starts with as much free VRAM as possible.
gc.collect()
torch.cuda.empty_cache()

# Run fine-tuning with the trainer configured in the previous step.
trainer.train()

### Step 8: Model Inference and Testing

**Description:** Run a qualitative test on the fine-tuned model. We switch the model to evaluation mode and generate a response using the updated chat template to verify the learning results.

In [None]:
# Clear cached CUDA memory and switch the model to evaluation mode.
torch.cuda.empty_cache()
model.eval()

# Sample prompt
messages = [
    {"role": "system", "content": "Roleplay as an tavern-keeper. Describe your actions using *asterisks* and wrap your speech in \"double quotes\"."},
    {"role": "user", "content": "*I walk into the tavern and ask for an ale.*"}
]

# Render the conversation with the chat template and move the tensors to the model's device.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True
).to(model.device)

# Fall back to EOS only when the tokenizer defines no pad token at all.
# An `or` fallback would also discard a legitimate pad token id of 0.
pad_token_id = tokenizer.pad_token_id
if pad_token_id is None:
    pad_token_id = tokenizer.eos_token_id

# Generate a sampled response without tracking gradients.
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=300,
        do_sample=True,
        temperature=1.0,
        min_p=0.05,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=pad_token_id
    )

# Decode only the newly generated tokens (everything after the prompt).
input_length = inputs["input_ids"].shape[1]
response = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)

print(f"Prompt: {messages[1]['content']}")
print(f"Response:\n{response}")

### Step 9: Model Export and Upload to Hugging Face Hub

**Description:** Authenticate with the Hugging Face Hub and upload the fine-tuned model, tokenizer, and training configuration. This allows for easy sharing, versioning, and deployment of your model.

##### 9.1: Hub Authentication

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never paste the token into the notebook: it would be saved with the file.
# Read the write token from the HF_TOKEN environment variable, and prompt for
# it (without echoing) when the variable is unset or empty.
HF_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")

login(token=HF_TOKEN)

##### 9.2: Push to Hub

In [None]:
from transformers import AutoTokenizer

# Fetch the official tokenizer to grab its template
official_repo = "mistralai/Ministral-3-8B-Instruct-2512-BF16"
official_tokenizer = AutoTokenizer.from_pretrained(official_repo, trust_remote_code=True)

# Replace the training-time template with the official one before uploading
tokenizer.chat_template = official_tokenizer.chat_template

print("Official chat template restored successfully.")

# Set your target repository name
hub_model_id = "YourUsername/Ministral-3-8B-Instruct-RP"

# Trainer.push_to_hub() has no `repo_id` parameter: the destination is taken
# from the training arguments. Point them at the target repo and (re)initialise
# the Hub repo so the upload does not fall back to the name of `output_dir`.
trainer.args.hub_model_id = hub_model_id
trainer.init_hf_repo()

# Push the model, tokenizer and model card to the Hub
trainer.push_to_hub()

# Explicitly push the tokenizer to ensure the restored template is saved
tokenizer.push_to_hub(hub_model_id)

print(f"Model and Tokenizer are now live at: https://huggingface.co/{hub_model_id}")