In [1]:
%pip install torch transformers datasets

Collecting transformers
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading pyyaml-6.0.3-cp311-cp311-win_amd64.whl.metadata (2.4 kB)
Collecting requests (from transformers)
  Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.7.0-cp38-abi3-win_amd64.whl.metadata (4.2 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp311-cp311-win_amd64.whl.metadata (3.3 kB)
Collecting dill<0.4.1,>=0.3.0 (from datasets)
  Downloading dill-0.4.0-py3-none-any.whl.metadata (10 

In [3]:
%pip install accelerate

Collecting accelerate
  Downloading accelerate-1.12.0-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.12.0-py3-none-any.whl (380 kB)
Installing collected packages: accelerate
Successfully installed accelerate-1.12.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# ============ DATASET PERSONNALISÉ ============
class TextDataset(Dataset):
    def __init__(self, tokenizer, text_file, block_size=128):
        self.tokenizer = tokenizer
        self.block_size = block_size
        
        with open(text_file, 'r', encoding='utf-8') as f:
            self.examples = f.readlines()
    
    def __len__(self):
        return len(self.examples)
    
    def __getitem__(self, idx):
        text = self.examples[idx].strip()
        encodings = self.tokenizer(text, truncation=True, max_length=self.block_size, 
                                   padding='max_length', return_tensors='pt')
        
        return {
            'input_ids': encodings['input_ids'].squeeze(),
            'attention_mask': encodings['attention_mask'].squeeze()
        }

# ============ CRÉER LE DATASET ============
def create_custom_dataset():
    texts = [
        "L'intelligence artificielle est une technologie révolutionnaire qui change notre monde.",
        "Les transformers sont des modèles puissants pour le traitement du langage naturel.",
        "GPT-2 est un modèle de génération de texte très efficace et polyvalent.",
        "L'apprentissage profond nous permet de créer des systèmes intelligents et autonomes.",
        "Les réseaux de neurones artificiels sont inspirés par le cerveau humain.",
        "La science des données ouvre de nouvelles possibilités dans tous les domaines.",
        "Les algorithmes d'apprentissage automatique peuvent prédire des tendances futures.",
        "La programmation en Python est populaire pour l'intelligence artificielle.",
        "Les modèles pré-entraînés comme GPT-2 facilitent le développement d'applications.",
        "L'entraînement sur GPU accélère considérablement le processus d'apprentissage.",
    ]
    
    with open("custom_dataset.txt", "w", encoding="utf-8") as f:
        for text in texts:
            f.write(text + "\n")
    
    print("✓ Dataset créé : custom_dataset.txt")

# ============ ENTRAÎNEMENT MANUEL ============
def finetune_gpt2():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Utilisation du device : {device}\n")
    
    # Créer dataset
    create_custom_dataset()
    
    # Charger tokenizer et modèle
    print("Chargement du modèle GPT-2...")
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token
    model.to(device)
    print("✓ Modèle chargé\n")
    
    # Préparer le dataset
    dataset = TextDataset(tokenizer, "custom_dataset.txt", block_size=128)
    dataloader = DataLoader(dataset, batch_size=2, shuffle=True)
    
    # Configuration d'entraînement
    epochs = 3
    learning_rate = 5e-5
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    
    print(f"Démarrage du fine-tuning ({epochs} epochs)...\n")
    
    # Boucle d'entraînement
    for epoch in range(epochs):
        total_loss = 0
        model.train()
        
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}")
        
        for batch_idx, batch in enumerate(progress_bar):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            
            # Forward pass
            outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
            loss = outputs.loss
            
            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            progress_bar.set_postfix({"loss": loss.item()})
        
        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch+1} - Loss moyen : {avg_loss:.4f}\n")
    
    # Sauvegarder le modèle
    print("Sauvegarde du modèle fine-tuné...")
    model.save_pretrained("./gpt2_finetuned")
    tokenizer.save_pretrained("./gpt2_finetuned")
    print("✓ Modèle sauvegardé dans ./gpt2_finetuned\n")
    
    return model, tokenizer, device

# ============ GÉNÉRATION DE TEXTE ============
def generate_text(model, tokenizer, prompt, device, max_length=80):
    model.eval()
    
    print(f"Prompt : '{prompt}'")
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=max_length,
            num_beams=5,
            early_stopping=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
            no_repeat_ngram_size=2,
            do_sample=True
        )
    
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    print(f"Texte généré : {generated_text}\n")
    return generated_text

# ============ FONCTION PRINCIPALE ============
if __name__ == "__main__":
    # Fine-tuner le modèle
    model, tokenizer, device = finetune_gpt2()
    
    # Générer du texte
    print("="*60)
    print("GÉNÉRATION DE TEXTE AVEC LE MODÈLE FINE-TUNÉ")
    print("="*60 + "\n")
    
    prompts = [
        "L'intelligence artificielle",
        "Les transformers sont",
        "La programmation en Python",
        "L'apprentissage profond"
    ]
    
    for prompt in prompts:
        generate_text(model, tokenizer, prompt, device, max_length=80)
        print("-"*60 + "\n")

Utilisation du device : cpu

✓ Dataset créé : custom_dataset.txt
Chargement du modèle GPT-2...
✓ Modèle chargé

Démarrage du fine-tuning (3 epochs)...



Epoch 1/3:   0%|          | 0/5 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.
Epoch 1/3: 100%|██████████| 5/5 [00:07<00:00,  1.41s/it, loss=0.866]


Epoch 1 - Loss moyen : 3.1746



Epoch 2/3: 100%|██████████| 5/5 [00:05<00:00,  1.17s/it, loss=0.791]


Epoch 2 - Loss moyen : 0.9148



Epoch 3/3: 100%|██████████| 5/5 [00:05<00:00,  1.10s/it, loss=0.769]


Epoch 3 - Loss moyen : 0.8072

Sauvegarde du modèle fine-tuné...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


✓ Modèle sauvegardé dans ./gpt2_finetuned

GÉNÉRATION DE TEXTE AVEC LE MODÈLE FINE-TUNÉ

Prompt : 'L'intelligence artificielle'


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Texte généré : L'intelligence artificielle d'apprentissage.

------------------------------------------------------------

Prompt : 'Les transformers sont'


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Texte généré : Les transformers sont révolutiones de l'apprentissage d'économie des développementes.

------------------------------------------------------------

Prompt : 'La programmation en Python'


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Texte généré : La programmation en Python.

------------------------------------------------------------

Prompt : 'L'apprentissage profond'
Texte généré : L'apprentissage profondées de l'éducation des développement des révolutiones.

------------------------------------------------------------

