In [1]:
import torch
import torch.nn as nn

class CamembertEmbeddings(nn.Module):
    def __init__(self, vocab_size, hidden_size, max_position_embeddings, pad_token_id, layer_norm_eps=1e-12, dropout_prob=0.1):
        super().__init__()
        self.token_embeddings = nn.Embedding(vocab_size, hidden_size, padding_idx=pad_token_id)
        self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
        self.token_type_embeddings = nn.Embedding(1, hidden_size)  # CamemBERT only uses one segment

        self.layer_norm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, input_ids):
        seq_length = input_ids.size(1)

        # Token Embeddings
        token_embeds = self.token_embeddings(input_ids)  # [batch_size, seq_length, hidden_size]

        # Position Embeddings
        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device).unsqueeze(0)
        position_embeds = self.position_embeddings(position_ids)  # [1, seq_length, hidden_size]

        # Add token type embeddings (all zeros in this case)
        token_type_ids = torch.zeros_like(input_ids, dtype=torch.long, device=input_ids.device)
        token_type_embeds = self.token_type_embeddings(token_type_ids)  # [batch_size, seq_length, hidden_size]

        # Combine all embeddings
        embeddings = token_embeds + position_embeds + token_type_embeds
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)

        return embeddings


In [14]:
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, hidden_size, num_heads, dropout_prob, debug=False):
        super().__init__()
        assert hidden_size % num_heads == 0, "hidden_size must be divisible by num_heads"
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads
        self.debug = debug

        self.q_proj = nn.Linear(hidden_size, hidden_size)
        self.k_proj = nn.Linear(hidden_size, hidden_size)
        self.v_proj = nn.Linear(hidden_size, hidden_size)
        self.out_proj = nn.Linear(hidden_size, hidden_size)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, hidden_states, attention_mask=None):
        if self.debug: print("Entrée Self-Attention:", hidden_states.shape)  # DEBUG
        batch_size, seq_length, hidden_size = hidden_states.size()

        # Projections Q, K, V
        q = self.q_proj(hidden_states)
        k = self.k_proj(hidden_states)
        v = self.v_proj(hidden_states)

        if self.debug: print("Forme Q/K/V avant reshape:", q.shape)  # DEBUG

        # Reshape pour multi-têtes
        q = q.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        k = k.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        v = v.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        if self.debug: print("Forme Q/K/V après reshape:", q.shape)  # DEBUG

        # Calcul des scores d'attention
        scores = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5)
        if self.debug: print("Scores d'attention:", scores.shape)  # DEBUG

        if attention_mask is not None:
            # Étendre le masque d'attention
            attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)  # [batch_size, 1, 1, seq_length]
            attention_mask = attention_mask.expand(batch_size, self.num_heads, seq_length, seq_length)
            if self.debug: print("Forme du masque après expansion:", attention_mask.shape)  # DEBUG
        
            # Appliquer le masque aux scores
            scores = scores.masked_fill(attention_mask == 0, -1e9)



        attn_probs = torch.softmax(scores, dim=-1)
        if self.debug: print("Probabilités d'attention:", attn_probs.shape)  # DEBUG

        # Calcul du contexte
        context = torch.matmul(attn_probs, v)
        if self.debug: print("Contexte après attention:", context.shape)  # DEBUG

        # Combine des têtes
        context = context.transpose(1, 2).contiguous().view(batch_size, seq_length, hidden_size)
        if self.debug: print("Contexte combiné:", context.shape)  # DEBUG

        # Projection finale
        output = self.out_proj(context)
        if self.debug: print("Sortie Self-Attention:", output.shape)  # DEBUG

        return output


In [3]:
class FeedForwardNetwork(nn.Module):
    def __init__(self, hidden_size, intermediate_size, dropout_prob):
        super().__init__()
        self.fc1 = nn.Linear(hidden_size, intermediate_size)
        self.gelu = nn.GELU()
        self.fc2 = nn.Linear(intermediate_size, hidden_size)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, hidden_states):
        x = self.fc1(hidden_states)
        x = self.gelu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return self.dropout(x)


In [4]:
class TransformerLayer(nn.Module):
    def __init__(self, hidden_size, num_heads, intermediate_size, dropout_prob, layer_norm_eps, debug=False):
        super().__init__()
        self.debug = debug
        self.self_attn = MultiHeadSelfAttention(hidden_size, num_heads, dropout_prob)
        self.ffn = FeedForwardNetwork(hidden_size, intermediate_size, dropout_prob)
        self.ln1 = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
        self.ln2 = nn.LayerNorm(hidden_size, eps=layer_norm_eps)

    def forward(self, hidden_states, attention_mask=None):
        if self.debug: print("Entrée à TransformerLayer:", hidden_states.shape)  # DEBUG
        attn_output = self.self_attn(hidden_states, attention_mask)
        if self.debug: print("Sortie Attention:", attn_output.shape)  # DEBUG
        hidden_states = self.ln1(hidden_states + attn_output)

        ffn_output = self.ffn(hidden_states)
        if self.debug: print("Sortie FFN:", ffn_output.shape)  # DEBUG
        hidden_states = self.ln2(hidden_states + ffn_output)

        if self.debug: print("Sortie TransformerLayer:", hidden_states.shape)  # DEBUG
        return hidden_states


In [29]:
class CamembertModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.can_generate = lambda : False # a verifier ; sans imposible de faire l'inference fin de notebook
        self.embeddings = CamembertEmbeddings(
            vocab_size=config.vocab_size,
            hidden_size=config.hidden_size,
            max_position_embeddings=config.max_position_embeddings,
            pad_token_id=config.pad_token_id,
        )
        self.encoder = nn.ModuleList(
            [TransformerLayer(
                hidden_size=config.hidden_size,
                num_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                dropout_prob=config.hidden_dropout_prob,
                layer_norm_eps=config.layer_norm_eps,
            ) for _ in range(config.num_hidden_layers)]
        )

    def forward(self, input_ids, attention_mask=None):
        embeddings = self.embeddings(input_ids)

        # Pass through transformer layers
        hidden_states = embeddings
        for layer in self.encoder:
            hidden_states = layer(hidden_states, attention_mask)

        return hidden_states


In [30]:
import torch
from transformers import CamembertConfig

# Configuration du modèle
config = CamembertConfig(
    vocab_size=32000,  # Exemple de vocabulaire
    hidden_size=768,  # Taille des embeddings
    num_hidden_layers=6,  # Nombre de couches dans l'encodeur
    num_attention_heads=12,  # Nombre de têtes d'attention
    intermediate_size=3072,  # Taille de la couche intermédiaire (FFN)
    max_position_embeddings=512,  # Longueur maximale des séquences
    pad_token_id=0,  # ID du token de padding
)

# Chargement du modèle
model = CamembertModel(config)
model.eval()  # Désactiver le dropout pour les tests


CamembertModel(
  (embeddings): CamembertEmbeddings(
    (token_embeddings): Embedding(32000, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(1, 768)
    (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): ModuleList(
    (0-5): 6 x TransformerLayer(
      (self_attn): MultiHeadSelfAttention(
        (q_proj): Linear(in_features=768, out_features=768, bias=True)
        (k_proj): Linear(in_features=768, out_features=768, bias=True)
        (v_proj): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ffn): FeedForwardNetwork(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (gelu): GELU(approximate='none')
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
        (dropou

In [31]:
# Données aléatoires
batch_size = 2
seq_length = 10

input_ids = torch.randint(0, config.vocab_size, (batch_size, seq_length))
attention_mask = torch.ones_like(input_ids)  # Pas de masquage

# Test
with torch.no_grad():
    output = model(input_ids, attention_mask)
    print("Sortie finale:", output.shape)


# Vérification des formes
print("Forme des embeddings d'entrée :", input_ids.shape)
print("Forme de la sortie du modèle :", output.shape)


Sortie finale: torch.Size([2, 10, 768])
Forme des embeddings d'entrée : torch.Size([2, 10])
Forme de la sortie du modèle : torch.Size([2, 10, 768])


In [32]:
from transformers import CamembertTokenizer, CamembertTokenizerFast

# Chargement du tokenizer
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

# Phrase d'exemple
text = "Bonjour, comment allez-vous aujourd'hui ?"

# Prétraitement
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

# Passage dans le modèle
with torch.no_grad():
    output = model(input_ids, attention_mask)

# Vérification des formes
print("Phrase d'entrée :", text)
print("Forme des input_ids :", input_ids.shape)
print("Forme de la sortie du modèle :", output.shape)


Phrase d'entrée : Bonjour, comment allez-vous aujourd'hui ?
Forme des input_ids : torch.Size([1, 12])
Forme de la sortie du modèle : torch.Size([1, 12, 768])


In [33]:
# Vérifiez les poids d'une couche de projection linéaire
for name, param in model.named_parameters():
    if "q_proj" in name:  # Exemple : vérifier les poids de la projection Q
        print(f"Poids de {name} : Moyenne = {param.data.mean().item()}, Écart-type = {param.data.std().item()}")


Poids de encoder.0.self_attn.q_proj.weight : Moyenne = 4.24699392169714e-05, Écart-type = 0.020828956738114357
Poids de encoder.0.self_attn.q_proj.bias : Moyenne = 0.0004086366679985076, Écart-type = 0.02064184844493866
Poids de encoder.1.self_attn.q_proj.weight : Moyenne = -4.2640117499104235e-06, Écart-type = 0.020850634202361107
Poids de encoder.1.self_attn.q_proj.bias : Moyenne = 0.002234747866168618, Écart-type = 0.020853571593761444
Poids de encoder.2.self_attn.q_proj.weight : Moyenne = -1.0603037480905186e-05, Écart-type = 0.020849868655204773
Poids de encoder.2.self_attn.q_proj.bias : Moyenne = -0.0008610218646936119, Écart-type = 0.020934447646141052
Poids de encoder.3.self_attn.q_proj.weight : Moyenne = -5.0458533223718405e-05, Écart-type = 0.02082166075706482
Poids de encoder.3.self_attn.q_proj.bias : Moyenne = 0.0005568447522819042, Écart-type = 0.021001063287258148
Poids de encoder.4.self_attn.q_proj.weight : Moyenne = -2.449043677188456e-06, Écart-type = 0.020838420838117

In [34]:
# Test de rétropropagation
model.train()
output = model(input_ids, attention_mask)
loss = output.sum()  # Exemple d'une perte arbitraire
loss.backward()  # Vérifiez qu'il n'y a pas d'erreurs
print("Rétropropagation réussie.")


Rétropropagation réussie.


In [35]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

# Test sentence
sentence = "Paris est la capitale de la <mask>."
results = fill_mask(sentence)

for result in results:
    print(f"Prediction: {result['sequence']}, Score: {result['score']:.4f}") 

IndexError: index out of range in self