## 1. Installation des d√©pendances

In [None]:
# Installation des biblioth√®ques n√©cessaires
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install -q transformers accelerate datasets sentencepiece
!pip install -q llmcompressor  # LLM Compressor (successeur d'AutoAWQ par vLLM)
!pip install -q gdown pandas

print("D√©pendances install√©es")

‚úì D√©pendances install√©es


In [4]:
import os
import gc
import json
import time
import torch
from datetime import datetime
from pathlib import Path

from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset

# Pour la reproductibilit√©
torch.manual_seed(42)

# Configuration du device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

Using device: cuda
GPU: Tesla T4
GPU Memory: 15.83 GB


## 2. T√©l√©chargement du Mod√®le Distill√© depuis Google Drive

Remplacez `YOUR_DRIVE_FILE_ID` par l'ID de votre fichier distilled_tinyllama.zip sur Google Drive.

In [None]:
# === CONFIGURATION ===
# ID du fichier Google Drive contenant le mod√®le distill√©
# Pour obtenir l'ID : clic droit sur le fichier Drive > "Obtenir le lien" > extraire l'ID de l'URL
# Exemple: https://drive.google.com/file/d/ABC123XYZ/view -> ID = ABC123XYZ

DISTILLED_ZIP_ID = "xxxx"  # <- REMPLACEZ ICI

# Chemins de travail (compatibles Kaggle/Colab)
if os.path.exists("/kaggle/working"):
    ROOT = "/kaggle/working/"
elif os.path.exists("/content"):
    ROOT = "/content/"
else:
    ROOT = "./"

DISTILLED_MODEL_PATH = ROOT + "distilled_tinyllama"
AWQ_OUTPUT_PATH = ROOT + "distilled_tinyllama_awq"
AWQ_CACHE_PATH = ROOT + "awq_cache"
QUANT_CACHE_PATH = ROOT + "quant_cache"

print(f"ROOT: {ROOT}")
print(f"Model path: {DISTILLED_MODEL_PATH}")
print(f"AWQ output: {AWQ_OUTPUT_PATH}")

ROOT: /kaggle/working/
Model path: /kaggle/working/distilled_tinyllama
AWQ output: /kaggle/working/distilled_tinyllama_awq


In [None]:
# T√©l√©chargement depuis Google Drive avec gdown
import subprocess

ZIP_FILENAME = ROOT + "distilled_tinyllama.zip"

if DISTILLED_ZIP_ID != "YOUR_DRIVE_FILE_ID":
    print("T√©l√©chargement du mod√®le distill√© depuis Google Drive...")
    !gdown --id {DISTILLED_ZIP_ID} -O {ZIP_FILENAME}
    
    if os.path.exists(ZIP_FILENAME):
        print(f"T√©l√©chargement r√©ussi : {ZIP_FILENAME} ({os.path.getsize(ZIP_FILENAME) / 1e6:.2f} MB)")
        
        # Extraction
        print("\nExtraction du mod√®le...")
        !unzip -q {ZIP_FILENAME} -d {DISTILLED_MODEL_PATH}
        print(f"Mod√®le extrait dans {DISTILLED_MODEL_PATH}")
        
        # V√©rification du contenu
        print("\nContenu du dossier:")
        for item in os.listdir(DISTILLED_MODEL_PATH):
            print(f"  - {item}")
    else:
        print("√âchec du t√©l√©chargement. V√©rifiez l'ID et les permissions de partage.")
else:
    print("Veuillez renseigner DISTILLED_ZIP_ID avec l'ID de votre fichier Google Drive")
    print("  Ou placez manuellement le mod√®le dans:", DISTILLED_MODEL_PATH)

In [None]:
# === ALTERNATIVE: T√©l√©chargement depuis Google Colab avec Drive mount ===
# D√©commentez ce bloc si vous utilisez Google Colab

# from google.colab import drive
# import shutil
# import zipfile

# # Monter Google Drive
# drive.mount('/content/drive')

# # Chemin vers votre fichier ZIP sur Google Drive
# DRIVE_ZIP_PATH = "/content/drive/MyDrive/distilled_tinyllama.zip"  # <- Modifiez ici

# if os.path.exists(DRIVE_ZIP_PATH):
#     print(f"Copie depuis Google Drive...")
#     shutil.copy(DRIVE_ZIP_PATH, ZIP_FILENAME)
#     print(f"‚úì Fichier copi√© : {os.path.getsize(ZIP_FILENAME) / 1e6:.2f} MB")
    
#     # Extraction
#     with zipfile.ZipFile(ZIP_FILENAME, 'r') as zip_ref:
#         zip_ref.extractall(DISTILLED_MODEL_PATH)
#     print(f"‚úì Mod√®le extrait dans {DISTILLED_MODEL_PATH}")
# else:
#     print(f"Fichier non trouv√© : {DRIVE_ZIP_PATH}")

## 3. Chargement du Mod√®le Distill√©

In [None]:
print("Chargement du mod√®le distill√©...")

# V√©rification de l'existence du mod√®le
if not os.path.exists(DISTILLED_MODEL_PATH):
    raise FileNotFoundError(f"Mod√®le non trouv√© dans {DISTILLED_MODEL_PATH}. T√©l√©chargez-le d'abord.")

# Chargement du tokenizer
tokenizer = AutoTokenizer.from_pretrained(DISTILLED_MODEL_PATH, trust_remote_code=True)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Chargement du mod√®le en FP16 pour √©valuation avant quantization
model_fp16 = AutoModelForCausalLM.from_pretrained(
    DISTILLED_MODEL_PATH,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
model_fp16.eval()

print(f"‚úì Mod√®le charg√©: {DISTILLED_MODEL_PATH}")
print(f"  Param√®tres: {sum(p.numel() for p in model_fp16.parameters()):,}")
print(f"  M√©moire GPU: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

üì¶ Chargement du mod√®le distill√©...


`torch_dtype` is deprecated! Use `dtype` instead!
2025-12-23 22:38:06.439822: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766529486.850009      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766529486.980197      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766529487.977916      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766529487.977942      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766529487.977946      55

‚úì Mod√®le charg√©: /kaggle/working/distilled_tinyllama
  Param√®tres: 1,100,048,384
  M√©moire GPU: 1.01 GB


## 4. √âvaluation Baseline (Avant Quantization)

√âvaluation du mod√®le FP16 pour √©tablir une r√©f√©rence de performance.

In [None]:
# Fonction de g√©n√©ration pour les tests
def generate_response(model, tokenizer, prompt, max_new_tokens=150):
    """G√©n√®re une r√©ponse √† partir d'un prompt."""
    model.eval()
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    start_time = time.time()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.pad_token_id
        )
    generation_time = time.time() - start_time
    
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    tokens_generated = outputs.shape[1] - inputs['input_ids'].shape[1]
    tokens_per_sec = tokens_generated / generation_time
    
    return response, tokens_per_sec

# Alpaca-style prompts
alpaca_prompts = [
    {"id": "alp1", "prompt": "### Instruction:\nExplain the difference between supervised and unsupervised learning in simple terms.\n\n### Response:"},
    {"id": "alp2", "prompt": "### Instruction:\nWrite a short email to your boss explaining that you will be late to work because of a doctor's appointment.\n\n### Response:"},
    {"id": "alp3", "prompt": "### Instruction:\nGive me 5 creative ideas for a science fair project for a 10-year-old child.\n\n### Response:"},
    {"id": "alp4", "prompt": "### Instruction:\nClassify the following animals as mammal, bird, reptile, or fish: dolphin, penguin, crocodile, salmon, bat.\n\n### Response:"},
    {"id": "alp5", "prompt": "### Instruction:\nTranslate the following sentence into French: \"The quick brown fox jumps over the lazy dog.\"\n\n### Response:"},
    {"id": "alp6", "prompt": "### Instruction:\nWhy is it important to recycle plastic? Give at least 3 reasons.\n\n### Response:"},
]

# GSM8K samples
gsm8k_samples = [
    {"id": "gsm1", "question": "Janet has 8 apples. She gives 3 to her friend and then buys 5 more. How many apples does she have now?", "answer": "10"},
    {"id": "gsm2", "question": "A store has 20 boxes of pencils. Each box contains 12 pencils. If they sell 15 boxes, how many pencils are left in the store?", "answer": "60"},
    {"id": "gsm3", "question": "John has 5 bags of marbles. Each bag has 8 marbles. He gives away 18 marbles to his friends. How many marbles does he have left?", "answer": "22"},
    {"id": "gsm4", "question": "A class has 30 students. 40% of them are girls. How many boys are in the class?", "answer": "18"},
]

# Prompts de test (mix Alpaca + GSM8K style)
test_prompts = [item["prompt"] for item in alpaca_prompts] + ["### Instruction:\n"+item["question"]+"\n\n### Response:" for item in gsm8k_samples]

print("="*80)
print("√âVALUATION BASELINE - MOD√àLE FP16 (avant quantization)")
print("="*80)

√âVALUATION BASELINE - MOD√àLE FP16 (avant quantization)


In [None]:
# G√©n√©ration des r√©ponses baseline
baseline_results = []

for i, prompt in enumerate(test_prompts, 1):
    print(f"\n--- Test {i}/{len(test_prompts)} ---")
    print(f"Prompt: {prompt[:80]}...")
    
    response, tps = generate_response(model_fp16, tokenizer, prompt)
    response_only = response[len(prompt):].strip()
    
    print(f"R√©ponse: {response_only[:200]}..." if len(response_only) > 200 else f"R√©ponse: {response_only}")
    print(f"Vitesse: {tps:.2f} tokens/sec")
    
    baseline_results.append({
        "prompt": prompt,
        "response": response_only,
        "tokens_per_sec": tps
    })

avg_tps_fp16 = sum(r["tokens_per_sec"] for r in baseline_results) / len(baseline_results)
print(f"\nVitesse moyenne FP16: {avg_tps_fp16:.2f} tokens/sec")


--- Test 1/5 ---
üìù Prompt: ### Instruction:
Explain the difference between supervised and unsupervised lear...
üéØ R√©ponse: Supervised learning refers to learning tasks where the data is labeled, while unsupervised learning is the process of discovering patterns and relationships in unlabeled data without any labels. In su...
‚ö° Vitesse: 20.49 tokens/sec

--- Test 2/5 ---
üìù Prompt: ### Instruction:
Janet has 8 apples. She gives 3 to her friend and then buys 5 m...
üéØ R√©ponse: The given input is: Janet has 8 apples. She gives 3 to her friend and then buys 5 more. How many apples does she have now?

The output is: Janet has 13 apples.
‚ö° Vitesse: 27.05 tokens/sec

--- Test 3/5 ---
üìù Prompt: ### Instruction:
Write a short email to your boss explaining that you will be la...
üéØ R√©ponse: Dear [Boss‚Äôs Name],

I hope this email finds you well. As you know, I have to attend a doctor's appointment on [Date]. Unfortunately, the doctor's office is located at [Location], whic

In [None]:
# Calcul de la perplexit√© sur WikiText-2 (m√©trique quantitative)
def calculate_perplexity(model, tokenizer, dataset_name="wikitext", split="test", max_samples=100):
    """Calcule la perplexit√© sur un dataset de test."""
    print(f"\nCalcul de la perplexit√© sur {dataset_name}...")
    
    # Charger le dataset
    if dataset_name == "wikitext":
        dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split=split)
        texts = [t for t in dataset["text"] if len(t.strip()) > 50][:max_samples]
    else:
        texts = [dataset_name]  # texte custom
    
    model.eval()
    total_loss = 0
    total_tokens = 0
    
    for text in texts:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        
        with torch.no_grad():
            outputs = model(**inputs, labels=inputs["input_ids"])
            loss = outputs.loss
            
        total_loss += loss.item() * inputs["input_ids"].numel()
        total_tokens += inputs["input_ids"].numel()
    
    avg_loss = total_loss / total_tokens
    perplexity = torch.exp(torch.tensor(avg_loss)).item()
    
    return perplexity

# Perplexit√© baseline
ppl_fp16 = calculate_perplexity(model_fp16, tokenizer)
print(f"Perplexit√© FP16: {ppl_fp16:.2f}")


üìä Calcul de la perplexit√© sur wikitext...


README.md: 0.00B [00:00, ?B/s]

wikitext-2-raw-v1/test-00000-of-00001.pa(‚Ä¶):   0%|          | 0.00/733k [00:00<?, ?B/s]

wikitext-2-raw-v1/train-00000-of-00001.p(‚Ä¶):   0%|          | 0.00/6.36M [00:00<?, ?B/s]

wikitext-2-raw-v1/validation-00000-of-00(‚Ä¶):   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

‚úì Perplexit√© FP16: 15.61


In [None]:
# Lib√©rer la m√©moire du mod√®le FP16
del model_fp16
gc.collect()
torch.cuda.empty_cache()
print("M√©moire lib√©r√©e")

NameError: name 'model_fp16' is not defined

## 5. Quantization AWQ avec LLM Compressor

AWQ (Activation-aware Weight Quantization) est une technique de quantization post-training qui :
- Identifie les poids saillants en analysant les activations
- Applique une mise √† l'√©chelle pour prot√©ger ces poids critiques
- Quantifie en INT4 avec un impact minimal sur la qualit√©

### LLM Compressor
LLM Compressor est le successeur officiel d'AutoAWQ, adopt√© par le projet vLLM.

### Configuration W4A16 (Sym√©trique)
Nous utilisons le sch√©ma W4A16 sym√©trique pour :
- Meilleure compatibilit√© avec vLLM et transformers standard
- Pas de probl√®me de zero-points lors du chargement
- Performance similaire pour la plupart des mod√®les

Avantages :
- R√©duction m√©moire de 4x (FP16 vers INT4)
- Acc√©l√©ration de l'inf√©rence de 2-3x
- Faible d√©gradation de la qualit√©
- Compatible vLLM pour le d√©ploiement

In [None]:
# =============================================================================
# QUANTIZATION AWQ AVEC LLM COMPRESSOR
# Successeur officiel d'AutoAWQ par le projet vLLM
# https://github.com/vllm-project/llm-compressor
# =============================================================================

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.utils import dispatch_for_generation

print("="*80)
print("QUANTIZATION AWQ (INT4) - LLM Compressor")
print("="*80)

# Configuration de la quantization
quant_config = {
    "scheme": "W4A16",
    "targets": ["Linear"],
    "ignore": ["lm_head"],
    "symmetric": True,
}

print(f"\nConfiguration AWQ:")
for k, v in quant_config.items():
    print(f"  - {k}: {v}")

print("\nLLM Compressor import√© avec succ√®s")



QUANTIZATION AWQ (INT4) - LLM Compressor

üìã Configuration AWQ:
  - scheme: W4A16
  - targets: ['Linear']
  - ignore: ['lm_head']
  - symmetric: True

‚úì LLM Compressor import√© avec succ√®s


In [None]:
# Chargement du mod√®le pour quantization avec LLM Compressor
print("\nChargement du mod√®le pour quantization...")

# LLM Compressor utilise les mod√®les transformers standard
model_awq = AutoModelForCausalLM.from_pretrained(
    DISTILLED_MODEL_PATH,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

tokenizer_awq = AutoTokenizer.from_pretrained(
    DISTILLED_MODEL_PATH,
    trust_remote_code=True,
)

if tokenizer_awq.pad_token_id is None:
    tokenizer_awq.pad_token_id = tokenizer_awq.eos_token_id

print(f"Mod√®le charg√©")
print(f"Param√®tres: {sum(p.numel() for p in model_awq.parameters()):,}")


üì¶ Chargement du mod√®le pour quantization...
‚úì Mod√®le charg√©
  Param√®tres: 1,100,048,384


In [None]:
# =============================================================================
# PR√âPARATION DU DATASET DE CALIBRATION
# =============================================================================

from datasets import load_dataset

print("\nPr√©paration du dataset de calibration...")

# Configuration de calibration
NUM_CALIBRATION_SAMPLES = 128      # Nombre d'√©chantillons pour la calibration
MAX_SEQUENCE_LENGTH = 512          # Longueur max des s√©quences de calibration

print("Chargement de WikiText-2 (dataset universel pour calibration)...")

ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")

# Filtrer les lignes vides et trop courtes
ds = ds.filter(lambda x: len(x["text"].strip()) > 100)

# Limiter au nombre d'√©chantillons souhait√©
ds = ds.select(range(min(NUM_CALIBRATION_SAMPLES * 2, len(ds))))
ds = ds.shuffle(seed=42)

def tokenize_wikitext(sample):
    return tokenizer_awq(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=True,
    )

ds = ds.map(tokenize_wikitext, remove_columns=ds.column_names)

# Filtrer les s√©quences trop courtes apr√®s tokenization
ds = ds.filter(lambda x: len(x["input_ids"]) >= 32)

# Limiter au nombre final
if len(ds) > NUM_CALIBRATION_SAMPLES:
    ds = ds.select(range(NUM_CALIBRATION_SAMPLES))

print(f"Dataset de calibration WikiText-2 charg√©: {len(ds)} √©chantillons")
print(f"Longueur moyenne: {sum(len(x['input_ids']) for x in ds) / len(ds):.0f} tokens")


üìä Pr√©paration du dataset de calibration...
   Chargement de WikiText-2 (dataset universel pour calibration)...


Filter:   0%|          | 0/36718 [00:00<?, ? examples/s]

Map:   0%|          | 0/256 [00:00<?, ? examples/s]

Filter:   0%|          | 0/256 [00:00<?, ? examples/s]

‚úì Dataset de calibration WikiText-2 charg√©: 128 √©chantillons
  Longueur moyenne: 198 tokens


In [None]:
# =============================================================================
# APPLICATION DE LA QUANTIZATION AWQ
# =============================================================================

print("\nQuantification AWQ en cours...")
print("Cette √©tape analyse les activations et quantifie les poids en INT4\n")

start_time = time.time()

# Configuration de AWQ
recipe = [
    AWQModifier(
        ignore=["lm_head"],
        scheme="W4A16",
        targets=["Linear"],
        duo_scaling=False,
    ),
]

# Appliquer la quantization avec oneshot
oneshot(
    model=model_awq,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

quant_time = time.time() - start_time
print(f"\nQuantification termin√©e en {quant_time:.1f} secondes ({quant_time/60:.1f} minutes)")


üîß Quantification AWQ en cours...
   Cette √©tape analyse les activations et quantifie les poids en INT4

2025-12-23T22:41:26.002507+0000 | reset | INFO - Compression lifecycle reset
2025-12-23T22:41:26.005100+0000 | from_modifiers | INFO - Creating recipe from modifiers
2025-12-23T22:41:26.037650+0000 | on_initialize | INFO - No AWQModifier.mappings provided, inferring from model...
2025-12-23T22:41:26.069099+0000 | initialize | INFO - Compression lifecycle initialized for 1 modifiers
2025-12-23T22:41:26.069815+0000 | IndependentPipeline | INFO - Inferred `SequentialPipeline` for `AWQModifier`


Preparing cache: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:00<00:00, 2286.08it/s]
(1/23): Calibrating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:00<00:00, 149.39it/s]
Smoothing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:06<00:00,  2.24s/it]
(1/23): Propagating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:00<00:00, 281.42it/s]
(2/23): Calibrating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:00<00:00, 179.40it/s]
Smoothing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:06<00:00,  2.20s/it]
(2/23): Propagating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:00<00:00, 346.23it/s]
(3/23): Calibrating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:00<00:00, 178.31it/s]
Smoothing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:06<00:00,  2.18s/it]
(3/23): Propagating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:00<00:00, 344.13it/s]
(4/23): Calibrating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:00<00:00, 178.14it/s]
Smoothing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñ

2025-12-23T22:44:26.402248+0000 | finalize | INFO - Compression lifecycle finalized for 1 modifiers

‚úì Quantification termin√©e en 180.6 secondes (3.0 minutes)





In [None]:
# Sauvegarde du mod√®le quantifi√©
print(f"\nSauvegarde du mod√®le quantifi√©...")

os.makedirs(AWQ_OUTPUT_PATH, exist_ok=True)

# Sauvegarder le mod√®le au format compressed-tensors (compatible vLLM)
model_awq.save_pretrained(AWQ_OUTPUT_PATH, save_compressed=True)
tokenizer_awq.save_pretrained(AWQ_OUTPUT_PATH)

# Sauvegarder la configuration de quantization
with open(os.path.join(AWQ_OUTPUT_PATH, "quant_config.json"), "w") as f:
    json.dump(quant_config, f, indent=2)

print(f"\nMod√®le quantifi√© sauvegard√©!")

# Afficher la taille des fichiers
print(f"\nContenu du dossier {AWQ_OUTPUT_PATH}:")
total_size = 0
for item in sorted(os.listdir(AWQ_OUTPUT_PATH)):
    fp = os.path.join(AWQ_OUTPUT_PATH, item)
    if os.path.isfile(fp):
        size = os.path.getsize(fp) / 1e6
        total_size += size
        print(f"  - {item} ({size:.2f} MB)")

print(f"\nTaille totale: {total_size:.2f} MB ({total_size/1000:.2f} GB)")


üíæ Sauvegarde du mod√®le quantifi√©...
2025-12-23T22:46:54.315659+0000 | get_model_compressor | INFO - skip_sparsity_compression_stats set to True. Skipping sparsity compression statistic calculations. No sparsity compressor will be applied.


Compressing model: 154it [00:04, 31.81it/s]



‚úì Mod√®le quantifi√© sauvegard√©!

üìÅ Contenu du dossier /kaggle/working/distilled_tinyllama_awq:
  - chat_template.jinja (0.00 MB)
  - config.json (0.00 MB)
  - generation_config.json (0.00 MB)
  - model.safetensors (761.97 MB)
  - quant_config.json (0.00 MB)
  - recipe.yaml (0.00 MB)
  - special_tokens_map.json (0.00 MB)
  - tokenizer.json (3.62 MB)
  - tokenizer.model (0.50 MB)
  - tokenizer_config.json (0.00 MB)

üìä Taille totale: 766.09 MB (0.77 GB)


## 6. √âvaluation du Mod√®le Quantifi√©

Comparaison des performances entre le mod√®le FP16 original et le mod√®le AWQ INT4.

In [None]:
# Chargement du mod√®le quantifi√© pour √©valuation
print("\nChargement du mod√®le quantifi√© pour √©valuation...")

# Lib√©rer la m√©moire
del model_awq
gc.collect()
torch.cuda.empty_cache()

# Charger le mod√®le quantifi√©
model_quant = AutoModelForCausalLM.from_pretrained(
    AWQ_OUTPUT_PATH,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
model_quant.eval()

# Dispatcher pour la g√©n√©ration optimis√©e
dispatch_for_generation(model_quant)

tokenizer_quant = AutoTokenizer.from_pretrained(AWQ_OUTPUT_PATH)
if tokenizer_quant.pad_token_id is None:
    tokenizer_quant.pad_token_id = tokenizer_quant.eos_token_id

print(f"Mod√®le quantifi√© charg√©")
print(f"M√©moire GPU: {torch.cuda.memory_allocated() / 1e9:.2f} GB")


üì¶ Chargement du mod√®le quantifi√© pour √©valuation...


Compressing model: 154it [00:00, 1273.60it/s]


‚úì Mod√®le quantifi√© charg√©
  M√©moire GPU: 0.54 GB


In [None]:
# √âvaluation qualitative - G√©n√©ration de r√©ponses
print("\n" + "="*80)
print("√âVALUATION DU MOD√àLE QUANTIFI√â AWQ (INT4)")
print("="*80)

quant_results = []

for i, prompt in enumerate(test_prompts, 1):
    print(f"\n--- Test {i}/{len(test_prompts)} ---")
    print(f"Prompt: {prompt[:80]}...")
    
    response, tps = generate_response(model_quant, tokenizer_quant, prompt)
    response_only = response[len(prompt):].strip()
    
    print(f"R√©ponse: {response_only[:200]}..." if len(response_only) > 200 else f"R√©ponse: {response_only}")
    print(f"Vitesse: {tps:.2f} tokens/sec")
    
    quant_results.append({
        "prompt": prompt,
        "response": response_only,
        "tokens_per_sec": tps
    })

avg_tps_quant = sum(r["tokens_per_sec"] for r in quant_results) / len(quant_results)
print(f"\nVitesse moyenne AWQ INT4: {avg_tps_quant:.2f} tokens/sec")


√âVALUATION DU MOD√àLE QUANTIFI√â AWQ (INT4)

--- Test 1/10 ---
üìù Prompt: ### Instruction:
Explain the difference between supervised and unsupervised lear...
üéØ R√©ponse: Supervised learning is a form of machine learning where we have labeled data to train the machine learning model. Unsupervised learning is a form of machine learning where we have unlabeled data to tr...
‚ö° Vitesse: 5.32 tokens/sec

--- Test 2/10 ---
üìù Prompt: ### Instruction:
Write a short email to your boss explaining that you will be la...
üéØ R√©ponse: I am sorry to inform you that I will be late to work because of a doctor's appointment. I have been experiencing some health issues and have had to attend to my ailing mother, resulting in me being un...
‚ö° Vitesse: 5.81 tokens/sec

--- Test 3/10 ---
üìù Prompt: ### Instruction:
Give me 5 creative ideas for a science fair project for a 10-ye...
üéØ R√©ponse: 1. How about a solar oven that cooks food using the sun's energy?
2. How about a robot that can

In [None]:
# Perplexit√© du mod√®le quantifi√©
ppl_quant = calculate_perplexity(model_quant, tokenizer_quant)
print(f"Perplexit√© AWQ INT4: {ppl_quant:.2f}")


üìä Calcul de la perplexit√© sur wikitext...
‚úì Perplexit√© AWQ INT4: 15.22


## 7. Comparaison des R√©sultats

In [None]:
# R√©sum√© des comparaisons
print("\n" + "="*80)
print("R√âSUM√â DE LA QUANTIZATION AWQ")
print("="*80)

# Calcul des tailles de mod√®le
def get_folder_size(path):
    total = 0
    for f in os.listdir(path):
        fp = os.path.join(path, f)
        if os.path.isfile(fp):
            total += os.path.getsize(fp)
    return total / 1e9  # En GB

size_fp16 = get_folder_size(DISTILLED_MODEL_PATH) if os.path.exists(DISTILLED_MODEL_PATH) else 2.2  # ~2.2 GB pour TinyLlama FP16
size_quant = get_folder_size(AWQ_OUTPUT_PATH) if os.path.exists(AWQ_OUTPUT_PATH) else 0.55  # ~0.55 GB pour INT4

print(f"\nM√âTRIQUES DE PERFORMANCE:")
print(f"{'='*50}")
print(f"{'M√©trique':<25} {'FP16':>12} {'AWQ INT4':>12} {'Œî':>10}")
print(f"{'-'*50}")
print(f"{'Perplexit√©':<25} {ppl_fp16:>12.2f} {ppl_quant:>12.2f} {(ppl_quant-ppl_fp16)/ppl_fp16*100:>+9.1f}%")
print(f"{'Vitesse (tokens/sec)':<25} {avg_tps_fp16:>12.1f} {avg_tps_quant:>12.1f} {(avg_tps_quant-avg_tps_fp16)/avg_tps_fp16*100:>+9.1f}%")
print(f"{'Taille mod√®le (GB)':<25} {size_fp16:>12.2f} {size_quant:>12.2f} {(size_quant-size_fp16)/size_fp16*100:>+9.1f}%")
print(f"{'='*50}")

print(f"\nGAINS:")
print(f"   ‚Ä¢ R√©duction de taille: {size_fp16/size_quant:.1f}x plus petit")
print(f"   ‚Ä¢ Acc√©l√©ration: {avg_tps_quant/avg_tps_fp16:.1f}x plus rapide")
print(f"   ‚Ä¢ D√©gradation perplexit√©: {(ppl_quant-ppl_fp16)/ppl_fp16*100:.2f}%")


R√âSUM√â DE LA QUANTIZATION AWQ

üìä M√âTRIQUES DE PERFORMANCE:
M√©trique                          FP16     AWQ INT4          Œî
--------------------------------------------------
Perplexit√©                       15.61        15.22      -2.5%
Vitesse (tokens/sec)              25.9          5.7     -78.1%
Taille mod√®le (GB)                2.20         0.77     -65.2%

‚úÖ GAINS:
   ‚Ä¢ R√©duction de taille: 2.9x plus petit
   ‚Ä¢ Acc√©l√©ration: 0.2x plus rapide
   ‚Ä¢ D√©gradation perplexit√©: -2.49%


In [None]:
# Comparaison c√¥te √† c√¥te des r√©ponses
print("\n" + "="*80)
print("COMPARAISON QUALITATIVE DES R√âPONSES")
print("="*80)

for i, (baseline, quant) in enumerate(zip(baseline_results, quant_results), 1):
    print(f"\n{'='*80}")
    print(f"Test {i}: {baseline['prompt'][:60]}...")
    print(f"{'='*80}")
    
    print(f"\nFP16 ({baseline['tokens_per_sec']:.1f} t/s):")
    print(f"   {baseline['response'][:300]}..." if len(baseline['response']) > 300 else f"   {baseline['response']}")
    
    print(f"\nAWQ INT4 ({quant['tokens_per_sec']:.1f} t/s):")
    print(f"   {quant['response'][:300]}..." if len(quant['response']) > 300 else f"   {quant['response']}")

### Sauvegarde des R√©sultats en JSON

Les r√©sultats des tests sont sauvegard√©s dans un fichier JSON pour:
- Tra√ßabilit√© des exp√©riences
- Comparaison future avec d'autres configurations
- Int√©gration dans des pipelines CI/CD

In [None]:
# Sauvegarde des r√©sultats de test dans un fichier JSON
RESULTS_JSON_PATH = os.path.join(AWQ_OUTPUT_PATH, "evaluation_results.json")

# Compilation de tous les r√©sultats
evaluation_results = {
    "metadata": {
        "date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "model_source": DISTILLED_MODEL_PATH,
        "model_quantized": AWQ_OUTPUT_PATH,
        "quant_config": quant_config,
    },
    "performance_metrics": {
        "fp16": {
            "perplexity": ppl_fp16,
            "avg_tokens_per_sec": avg_tps_fp16,
            "model_size_gb": size_fp16,
        },
        "awq_int4": {
            "perplexity": ppl_quant,
            "avg_tokens_per_sec": avg_tps_quant,
            "model_size_gb": size_quant,
        },
        "comparison": {
            "perplexity_change_percent": (ppl_quant - ppl_fp16) / ppl_fp16 * 100,
            "speed_improvement_percent": (avg_tps_quant - avg_tps_fp16) / avg_tps_fp16 * 100,
            "size_reduction_factor": size_fp16 / size_quant,
        }
    },
    "qualitative_tests": {
        "fp16_responses": baseline_results,
        "awq_int4_responses": quant_results,
    }
}

# Sauvegarde en JSON
with open(RESULTS_JSON_PATH, 'w', encoding='utf-8') as f:
    json.dump(evaluation_results, f, indent=2, ensure_ascii=False)

print(f"R√©sultats sauvegard√©s dans: {RESULTS_JSON_PATH}")

# Affichage d'un aper√ßu du JSON
print("\nAper√ßu des r√©sultats sauvegard√©s:")
print(json.dumps(evaluation_results["performance_metrics"], indent=2))

## 8. Export et Documentation

In [None]:
# Cr√©er une archive ZIP du mod√®le quantifi√© pour upload sur Google Drive
import shutil

zip_path = ROOT + "distilled_tinyllama_awq.zip"
print(f"\nCr√©ation de l'archive {zip_path}...")

shutil.make_archive(
    ROOT + "distilled_tinyllama_awq",
    'zip',
    root_dir=ROOT,
    base_dir="distilled_tinyllama_awq"
)

if os.path.exists(zip_path):
    print(f"Archive cr√©√©e: {zip_path} ({os.path.getsize(zip_path) / 1e6:.2f} MB)")
    print("\nVous pouvez maintenant t√©l√©charger cette archive ou l'uploader sur Google Drive")

In [None]:
print("\n" + "="*80)
print("PIPELINE DE COMPRESSION TERMIN√â")
print("="*80)
print(f"\nMod√®le quantifi√© disponible dans: {AWQ_OUTPUT_PATH}")
print(f"Archive ZIP disponible: {zip_path}")