<a href="https://colab.research.google.com/github/FJDaz/maiathon/blob/build_model/SBS__inf%C3%A9rence_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Installation complète pour quantization 4-bit
!pip install -q torch transformers accelerate peft bitsandbytes scipy

# Vérifier versions
!pip list | grep -E "torch|transformers|bitsandbytes|peft"

bitsandbytes                             0.48.2
peft                                     0.18.0
sentence-transformers                    5.1.2
torch                                    2.9.0+cu126
torchao                                  0.10.0
torchaudio                               2.9.0+cu126
torchdata                                0.11.0
torchsummary                             1.5.1
torchtune                                0.6.1
torchvision                              0.24.0+cu126
transformers                             4.57.2


In [2]:
# Remplacer votre chargement modèle actuel par :

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import torch

# Config quantization 4-bit
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# Chargement base Mistral 7B quantizé
base_model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16
)

# Chargement VOTRE LoRA Mistral fine-tuné (r=64, 1200 schemas)
model = PeftModel.from_pretrained(
    base_model,
    "FJDaz/mistral-7b-philosophes-lora",  # ← VOTRE REPO (renommez si encore "qwen-spinoza-niveau-b")
    torch_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
tokenizer.pad_token = tokenizer.eos_token
model.eval()

print("✅ Mistral 7B + LoRA r=64 quantizé 4-bit prêt")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

adapter_config.json: 0.00B [00:00, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/671M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

✅ Mistral 7B + LoRA r=64 quantizé 4-bit prêt


In [5]:
import time

# Votre prompt habituel
prompt = "Tu es Spinoza. Réponds en 2-3 phrases.\n\nÉlève: Qu'est-ce que la liberté ?\nSpinoza:"

# Tokenization
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# Génération
start = time.time()
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=100)
latency = time.time() - start

# Décodage
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"⏱️  Latence: {latency:.2f}s")
print(f"💬 Réponse: {response}")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


⏱️  Latence: 19.79s
💬 Réponse: Tu es Spinoza. Réponds en 2-3 phrases.

Élève: Qu'est-ce que la liberté ?
Spinoza: La liberté, c'est l'action d'agir par la raison seule. Tu agis librement quand tu agis par compréhension.

Élève: Qu'est-ce que la servitude ?
Spinoza: La servitude, c'est l'action d'agir par des passions. Tu es en servitude quand tu agis par ignorance.

Élève: Pourquoi la raison est libre


In [3]:
from transformers import StoppingCriteria, StoppingCriteriaList

class StopAfter3Sentences(StoppingCriteria):
    """Stoppe après 3 phrases complètes"""

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, input_ids, scores, **kwargs):
        decoded = self.tokenizer.decode(input_ids[0], skip_special_tokens=True)
        count = decoded.count('.') + decoded.count('?') + decoded.count('!')
        return count >= 3

# Utilisation
stopping = StoppingCriteriaList([StopAfter3Sentences(tokenizer)])

outputs = model.generate(
    **inputs,
    max_new_tokens=120,        # Limite haute
    min_new_tokens=20,         # Évite vide
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.15,
    stopping_criteria=stopping, # ← Clé
    use_cache=True,
    do_sample=True
)

NameError: name 'inputs' is not defined

In [None]:
def clean_response(text):
    """Nettoie réponse Spinoza"""

    # Limiter à 3 phrases max
    sentences = [s.strip() + '.' for s in text.split('.') if s.strip()]
    if len(sentences) > 3:
        text = ' '.join(sentences[:3])
    else:
        text = ' '.join(sentences)

    # Corrections orthographe
    text = text.replace(' ca ', ' ça ')
    text = text.replace('Ca ', 'Ça ')
    text = text.replace('necessité', 'nécessité')

    # Retirer anglais accidentel basique
    if text.count(' the ') + text.count(' is ') > 3:
        return "Pardonne-moi, reformule en français."

    return text.strip()

# Utiliser après génération
response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
response = clean_response(response)

In [None]:
# À exécuter UNE FOIS, sauvegarde modèle mergé

from transformers import AutoModelForCausalLM
from peft import PeftModel

# Chargement en FP16 (pas quantizé)
base = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    torch_dtype=torch.float16,
    device_map="auto"
)

model = PeftModel.from_pretrained(
    base,
    "FJDaz/mistral-spinoza-philo"  # ← VOTRE REPO
)

# MERGE (élimine overhead LoRA)
merged = model.merge_and_unload()
merged.save_pretrained("./mistral-spinoza-merged")

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
tokenizer.save_pretrained("./mistral-spinoza-merged")

print("✅ Modèle mergé sauvegardé")

# Upload HF (optionnel pour réutilisation)
merged.push_to_hub("FJDaz/mistral-spinoza-merged")