In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextStreamer
from datasets import load_dataset, load_from_disk
import torch
import random
import matplotlib.pyplot as plt
import pandas as pd
import json
import gc
from peft import PeftModel
# Show whether the current CUDA device reports bfloat16 support.
print(torch.cuda.is_bf16_supported()) 

# Check Gallica content

In [None]:
# Load the Gallica dataset saved on disk and report how many texts it holds.
gallica = load_from_disk("gallica")
ntexts = len(gallica)
print(ntexts)

# Character count of every "complete_text" entry, read one row at a time.
text_lengths = []
for row_idx in range(ntexts):
    row_text = gallica[row_idx]["complete_text"]
    text_lengths.append(len(row_text))

# Summary statistics over the per-text character counts.
min_len, max_len = min(text_lengths), max(text_lengths)
avg_len = sum(text_lengths) / len(text_lengths)
print(f"Gallica corpus: {ntexts} texts with an average of {avg_len} character per text (min: {min_len}, max: {max_len})")

# Print ten random excerpts of at most `sample_size` characters so the corpus
# content can be checked by eye.
sample_size = 1000
for i in range(10):
    item = gallica[random.randint(0, ntexts - 1)]
    text = item["complete_text"]

    # A text shorter than sample_size would give a negative upper bound and make
    # random.randint raise ValueError; clamp to 0 so short texts are shown whole.
    max_start = max(0, len(text) - sample_size)
    char_start = random.randint(0, max_start)
    substring = text[char_start:char_start + sample_size]
    print(f"== content {i+1} ==")
    print(substring)
    print("\n")

# Check the training progress

In [None]:
def plot_train(mod_step, plot_name):
    """Plot training loss and learning rate against epoch for one checkpoint.

    Reads ``trainer_state.json`` from the checkpoint directory, takes its
    ``log_history`` list and saves a two-axis figure: loss (blue, left axis)
    and learning rate (red, right axis), both against the logged epoch.

    Args:
        mod_step: Checkpoint directory (``str`` or path-like) that contains
            ``trainer_state.json``.
        plot_name: Output image path passed to ``Figure.savefig``.

    Raises:
        FileNotFoundError: if ``trainer_state.json`` does not exist.
        KeyError: if the file has no ``log_history``, or the history has no
            ``epoch``, ``loss`` or ``learning_rate`` field at all.
    """
    with open(f"{mod_step}/trainer_state.json", "r", encoding="utf-8") as f:
        history = pd.DataFrame(json.load(f)["log_history"])

    # log_history may mix record types (for example evaluation rows that carry
    # no "loss"). Missing fields become NaN in the DataFrame, and a NaN splits a
    # matplotlib line into disconnected pieces, so each curve keeps only the
    # rows that actually hold its value.
    loss_log = history.dropna(subset=["epoch", "loss"])
    lr_log = history.dropna(subset=["epoch", "learning_rate"])

    fig, ax1 = plt.subplots(figsize=(6, 3))
    try:
        # Left axis: training loss.
        ax1.set_xlabel('Epoch', fontsize=12)
        ax1.set_ylabel('Loss', color='blue', fontsize=10)
        ax1.plot(loss_log["epoch"], loss_log["loss"], color='blue', linewidth=2, label='Loss')
        ax1.tick_params(axis='y', labelcolor='blue')
        ax1.grid(True, alpha=0.3)

        # Right axis (shared x): learning-rate schedule.
        ax2 = ax1.twinx()
        ax2.set_ylabel('Learning Rate', color='red', fontsize=10)
        ax2.plot(lr_log["epoch"], lr_log["learning_rate"], color='red', linewidth=2, label='Learning Rate')
        ax2.tick_params(axis='y', labelcolor='red')

        fig.savefig(plot_name, dpi=150, bbox_inches="tight")
    finally:
        # Close this specific figure even when plotting or saving fails, so
        # repeated calls do not accumulate open figures.
        plt.close(fig)

In [None]:
# (checkpoint directory, output image) for each training run, plotted in order.
training_runs = [
    ("train_gallica_fullweight/checkpoint-31250", "plot_train_fullweight.jpg"),
    ("train_gallica_qlora/checkpoint-3125", "plot_train_qlora.jpg"),
    ("train_kafka_qlora/checkpoint-12", "plot_train_kafka_qlora.jpg"),
    ("train_kafka_fullweight/checkpoint-520", "plot_train_kafka_fullweight.jpg"),
    ("train_mistral_qlora/checkpoint-3125", "plot_train_mistral_gallica.jpg"),
    ("train_mistral_kafka/checkpoint-520", "plot_train_mistral_kafka.jpg"),
]
for checkpoint_dir, image_name in training_runs:
    plot_train(checkpoint_dir, image_name)

# Test a model

## Merge and save the intermediate QLoRA model (TinyLlama)

In [None]:
# Merge the LoRA adapter into its base model and save the merged weights.
lora = "./qlora_gallica_100K_2048t"                            # adapter directory
base_model_path = "./tinyllama_bf16_gallica_fullweight_1M_512t"  # base checkpoint
final_path = "tinyllama_qlora_gallica_100K_2048t"              # merged output directory

# NOTE(review): the adapter is merged into an 8-bit quantized base, so the merge
# happens on quantized weights. PEFT reportedly warns that this can alter
# generations through rounding; consider loading the base unquantized for the
# merge if memory allows. Confirm before changing.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
)
# The path and the loaded model get separate names so the cell never rebinds a
# string to a model object.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    local_files_only=True
)
model = PeftModel.from_pretrained(base_model, lora)
model = model.merge_and_unload()
model.save_pretrained(final_path)

# The merged model is only needed for saving. Drop every reference, collect the
# garbage first, then hand the cached GPU blocks back so the next model load
# does not start with this one still resident.
del model, base_model
gc.collect()
torch.cuda.empty_cache()

## Merge and save the QLoRA Mistral model

In [None]:
# Merge the LoRA adapter into its base model and save the merged weights.
lora = "./mistral_qlora_gallica_100K_512t"   # adapter directory
base_model_path = "./mistral_bf16"           # base checkpoint
final_path = "mistral_gallica_100K_512t"     # merged output directory

# NOTE(review): the adapter is merged into an 8-bit quantized base, so the merge
# happens on quantized weights. PEFT reportedly warns that this can alter
# generations through rounding; consider loading the base unquantized for the
# merge if memory allows. Confirm before changing.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
)
# The path and the loaded model get separate names so the cell never rebinds a
# string to a model object.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    local_files_only=True
)
model = PeftModel.from_pretrained(base_model, lora)
model = model.merge_and_unload()
model.save_pretrained(final_path)

# The merged model is only needed for saving. Drop every reference, collect the
# garbage first, then hand the cached GPU blocks back so the next model load
# does not start with this one still resident.
del model, base_model
gc.collect()
torch.cuda.empty_cache()

## Generation

In [None]:
# Checkpoint to sample from: keep exactly one final_path assignment active.
# final_path = "./tinyllama_bf16"
# final_path = "./tinyllama_bf16_gallica_fullweight_1M_512t"
# final_path = "./tinyllama_qlora_gallica_100K_2048t"
# final_path = "./tinykafka"
# final_path = "./tinyllama_bf16_kafka_fullweight_512t"
# final_path = "./mistral_bf16"
# final_path = "./mistral_gallica_100K_512t"
final_path = "./kafstral"

# Tokenizer directory. Presumably it has to match the model family of
# final_path; TODO confirm when switching checkpoints.
# tokenizer_path = "./tinyllama_bf16"
tokenizer_path = "./mistral_bf16"

tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
prompt = """
K. ouvrit la porte.
"""

for i in range(1):
    print(f"=== Generation {i+1} ===")

    # Load the model inside the loop (workaround for the 8-bit memory leak).
    model = AutoModelForCausalLM.from_pretrained(
        final_path,
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
        device_map="cuda",
        local_files_only=True
    )

    # Pre-bind the names released in `finally` so cleanup cannot raise a
    # NameError and hide the real failure.
    inputs = streamer = None
    try:
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        # Streamer for real-time display of the generated text.
        streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

        print(prompt)
        with torch.no_grad():
            # The text is printed by the streamer, so the returned token ids
            # are not kept.
            model.generate(
                **inputs,
                max_new_tokens=1000,
                temperature=0.8,
                top_p=0.9,
                do_sample=True,
                repetition_penalty=1.1,
                streamer=streamer  # enables streaming
            )
    finally:
        # Cleanup runs even when generation fails. Collect garbage BEFORE
        # emptying the CUDA cache: tensors still held by reference cycles are
        # only freed by gc.collect(), and empty_cache() can only return blocks
        # that are already free.
        del inputs, streamer, model
        gc.collect()
        torch.cuda.empty_cache()

    print("\n")  # blank space after each generation