In [2]:
!pip install conllu

Collecting conllu
  Downloading conllu-6.0.0-py3-none-any.whl.metadata (21 kB)
Downloading conllu-6.0.0-py3-none-any.whl (16 kB)
Installing collected packages: conllu
Successfully installed conllu-6.0.0


In [3]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from conllu import parse_incr
from collections import Counter
from torch.utils.data import DataLoader, Dataset

#############################
# CONSTANTES ET VOCABULAIRES
#############################

# Pour les caractères (input)
PAD_CHAR = "<pad>"   # indice 0
UNK_CHAR = "<unk>"   # indice 1
ESP_CHAR = "<esp>"   # symbole servant à marquer la frontière entre les mots (par exemple, espace)
# On fixera manuellement les indices pour ces tokens spéciaux :
PAD_ID = 0
UNK_ID = 1
ESP_ID = 2

def build_char_vocab(data_file, min_freq=1):
    """
    Construit un vocabulaire des caractères à partir des formes (form) des tokens dans le fichier.
    On compte chaque caractère apparaissant dans les mots.
    Les tokens spéciaux PAD, UNK et ESP sont ajoutés par défaut.
    """
    counter = Counter()
    with open(data_file, "r", encoding="utf-8") as f:
        for sentence in parse_incr(f):
            for token in sentence:
                word = token["form"]
                for ch in word:
                    counter[ch] += 1
    # Vocabulaire initial avec tokens spéciaux (indices fixés)
    vocab = {PAD_CHAR: PAD_ID, UNK_CHAR: UNK_ID, ESP_CHAR: ESP_ID}
    idx = 3
    for ch, freq in counter.items():
        if freq >= min_freq and ch not in vocab:
            vocab[ch] = idx
            idx += 1
    return vocab

def build_number_vocab(data_file):
    """
    Construit le vocabulaire pour le trait morphologique Number.
    Les tokens de sortie seront :
      - "<PAD>" pour le padding (indice 0)
      - "<N/A>" pour les mots qui n'ont pas de trait Number (indice 1)
      - puis les différentes valeurs observées (p.ex. "Sing", "Plur", etc.)
    """
    vocab = {"<PAD>": 0, "<N/A>": 1}
    idx = 2
    with open(data_file, "r", encoding="utf-8") as f:
        for sentence in parse_incr(f):
            for token in sentence:
                feats = token.get("feats")
                if feats and "Number" in feats:
                    number = feats["Number"]
                    if number not in vocab:
                        vocab[number] = idx
                        idx += 1
    return vocab

########################################
# ENCODAGE D'UNE PHRASE (CARACTÈRES)
########################################

def encode_sentence(sentence, char_vocab, num_vocab, max_c, max_w):
    """
    Pour une phrase (liste de tokens du fichier CoNLL-U) :
      - Crée in_enc : séquence d'indices de caractères commençant par <pad>
      - Crée ends : liste des positions (indices dans in_enc) correspondant à la fin de chaque mot
      - Crée out_enc : pour chaque mot, l'indice associé à la valeur du trait Number,
        ou "<N/A>" si le trait est absent.
    On ajoute après chaque mot le token <esp> pour marquer la frontière (mais on enregistre la fin du mot avant <esp>).
    Enfin, on tronque si la séquence dépasse max_c (pour in_enc) ou si le nombre de mots dépasse max_w.
    """
    in_enc = [char_vocab[PAD_CHAR]]  # on démarre avec <pad> (optionnel, selon la doc)
    ends = []
    out_enc = []
    for token in sentence:
        word = token["form"]
        # Encoder chaque caractère du mot
        for ch in word:
            in_enc.append(char_vocab.get(ch, UNK_ID))
        # La position de fin du mot est l'index du dernier caractère ajouté
        ends.append(len(in_enc) - 1)
        # Ajouter le token de séparation <esp>
        in_enc.append(char_vocab[ESP_CHAR])
        # Pour la sortie, récupérer le trait Number (sinon, "<N/A>")
        feats = token.get("feats")
        if feats and "Number" in feats:
            out_val = num_vocab.get(feats["Number"], num_vocab["<N/A>"])
        else:
            out_val = num_vocab["<N/A>"]
        out_enc.append(out_val)
    # Si la séquence de caractères est trop longue, on la tronque et on retire les mots dont la fin dépasse max_c
    if len(in_enc) > max_c:
        in_enc = in_enc[:max_c]
        ends = [e for e in ends if e < max_c]
        out_enc = out_enc[:len(ends)]
    # Tronquer le nombre de mots à max_w
    if len(ends) > max_w:
        ends = ends[:max_w]
        out_enc = out_enc[:max_w]
    return in_enc, ends, out_enc

In [9]:
########################################
# DATASET ET DATA LOADER
########################################

class MorphDataset(Dataset):
    def __init__(self, data_file, char_vocab, num_vocab, max_c, max_w):
        self.samples = []
        with open(data_file, "r", encoding="utf-8") as f:
            for sentence in parse_incr(f):
                in_enc, ends, out_enc = encode_sentence(sentence, char_vocab, num_vocab, max_c, max_w)
                self.samples.append((in_enc, ends, out_enc))
    def __len__(self):
        return len(self.samples)
    def __getitem__(self, idx):
        return self.samples[idx]

def collate_fn(samples):
    """
    Rassemble une liste d'exemples et applique le padding pour obtenir des tenseurs de même taille.
    Chaque exemple est un tuple (in_enc, ends, out_enc).
    """
    import torch
    max_c = max(len(s[0]) for s in samples)
    max_w = max(len(s[1]) for s in samples)
    batch_in, batch_ends, batch_out = [], [], []
    for in_enc, ends, out_enc in samples:
        in_enc_tensor = torch.tensor(in_enc + [PAD_ID]*(max_c - len(in_enc)), dtype=torch.long)
        ends_tensor = torch.tensor(ends + [0]*(max_w - len(ends)), dtype=torch.long)
        out_enc_tensor = torch.tensor(out_enc + [0]*(max_w - len(out_enc)), dtype=torch.long)
        batch_in.append(in_enc_tensor)
        batch_ends.append(ends_tensor)
        batch_out.append(out_enc_tensor)
    return torch.stack(batch_in), torch.stack(batch_ends), torch.stack(batch_out)

########################################
# DÉFINITION DU MODÈLE
########################################

class CharMorphTagger(nn.Module):
    def __init__(self, char_vocab_size, num_classes, embedding_dim, hidden_dim, padding_idx=PAD_ID, dropout=0.5):
        super(CharMorphTagger, self).__init__()
        self.embedding = nn.Embedding(char_vocab_size, embedding_dim, padding_idx=padding_idx)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True, bias=False)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.hidden_dim = hidden_dim
    def forward(self, in_enc, ends):
        """
        in_enc : tenseur Long de taille (B, max_c)
        ends   : tenseur Long de taille (B, max_w) indiquant pour chaque mot l'index dans in_enc de son dernier caractère.
        """
        embeds = self.embedding(in_enc)         # (B, max_c, embedding_dim)
        gru_out, _ = self.gru(embeds)             # (B, max_c, hidden_dim)
        # Préparer ends pour gather : on ajoute une dimension pour hidden_dim
        ends_exp = ends.unsqueeze(2).expand(-1, -1, self.hidden_dim)  # (B, max_w, hidden_dim)
        # Récupérer les états cachés correspondant aux fins de mots
        word_reps = torch.gather(gru_out, 1, ends_exp)  # (B, max_w, hidden_dim)
        word_reps = self.dropout(word_reps)
        logits = self.fc(word_reps)             # (B, max_w, num_classes)
        return logits

########################################
# FONCTIONS D'ENTRAÎNEMENT & D'ÉVALUATION
########################################

def train_model(model, dataloader, criterion, optimizer, device):
    model.train()
    total_loss = 0.0
    for batch_in, batch_ends, batch_out in dataloader:
        batch_in = batch_in.to(device)
        batch_ends = batch_ends.to(device)
        batch_out = batch_out.to(device)
        optimizer.zero_grad()
        outputs = model(batch_in, batch_ends)  # (B, max_w, num_classes)
        loss = criterion(outputs.view(-1, outputs.shape[-1]), batch_out.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate_model(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0.0
    correct = 0
    total_tokens = 0
    with torch.no_grad():
        for batch_in, batch_ends, batch_out in dataloader:
            batch_in = batch_in.to(device)
            batch_ends = batch_ends.to(device)
            batch_out = batch_out.to(device)
            outputs = model(batch_in, batch_ends)  # (B, max_w, num_classes)
            loss = criterion(outputs.view(-1, outputs.shape[-1]), batch_out.view(-1))
            total_loss += loss.item()
            preds = outputs.argmax(dim=-1)
            mask = batch_out != 0  # on ignore le padding (<PAD> a l'indice 0)
            correct += (preds[mask] == batch_out[mask]).sum().item()
            total_tokens += mask.sum().item()
    accuracy = correct / total_tokens if total_tokens > 0 else 0
    return total_loss / len(dataloader), accuracy


In [10]:
########################################
# PARAMÈTRES & CHARGEMENT DES DONNÉES
########################################

# Chemins vers vos fichiers de données
train_file = "/content/fr_partut-ud-train.conllu"
dev_file   = "/content/fr_partut-ud-dev.conllu"  # Par exemple, pour l'évaluation

# Paramètres d'encodage
max_c = 200   # nombre maximum de caractères par phrase
max_w = 20    # nombre maximum de mots par phrase

# Construction des vocabulaires
char_vocab = build_char_vocab(train_file, min_freq=1)
num_vocab = build_number_vocab(train_file)
print("Taille du vocabulaire des caractères :", len(char_vocab))
print("Vocabulaire Number :", num_vocab)

# Création des datasets et DataLoaders
train_dataset = MorphDataset(train_file, char_vocab, num_vocab, max_c, max_w)
dev_dataset = MorphDataset(dev_file, char_vocab, num_vocab, max_c, max_w)

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
dev_loader   = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

########################################
# INITIALISATION DU MODÈLE, OPTIMISEUR, ETC.
########################################

embedding_dim = 50
hidden_dim = 100
dropout = 0.5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = CharMorphTagger(char_vocab_size=len(char_vocab), num_classes=len(num_vocab),
                        embedding_dim=embedding_dim, hidden_dim=hidden_dim,
                        padding_idx=PAD_ID, dropout=dropout)
model.to(device)

criterion = nn.CrossEntropyLoss(ignore_index=0)  # on ignore les pads (indice 0)
optimizer = optim.Adam(model.parameters(), lr=0.001)

########################################
# BOUCLE D'ENTRAÎNEMENT
########################################

epochs = 5
for epoch in range(epochs):
    train_loss = train_model(model, train_loader, criterion, optimizer, device)
    dev_loss, dev_acc = evaluate_model(model, dev_loader, criterion, device)
    print(f"Epoch {epoch+1}/{epochs}: Train Loss = {train_loss:.4f} | Dev Loss = {dev_loss:.4f} | Dev Acc = {dev_acc:.4f}")

# Optionnel : sauvegarder le modèle et les vocabulaires
checkpoint = {
    'model_state_dict': model.state_dict(),
    'char_vocab': char_vocab,
    'num_vocab': num_vocab,
    'embedding_dim': embedding_dim,
    'hidden_dim': hidden_dim,
    'max_c': max_c,
    'max_w': max_w
}
torch.save(checkpoint, "/content/morph_tagger.pt")
print("Modèle sauvegardé dans morph_tagger.pt")

Taille du vocabulaire des caractères : 99
Vocabulaire Number : {'<PAD>': 0, '<N/A>': 1, 'Sing': 2, 'Plur': 3}
Epoch 1/5: Train Loss = 0.9880 | Dev Loss = 0.6651 | Dev Acc = 0.7590
Epoch 2/5: Train Loss = 0.5430 | Dev Loss = 0.5044 | Dev Acc = 0.7970
Epoch 3/5: Train Loss = 0.4135 | Dev Loss = 0.4063 | Dev Acc = 0.8441
Epoch 4/5: Train Loss = 0.3439 | Dev Loss = 0.3571 | Dev Acc = 0.8821
Epoch 5/5: Train Loss = 0.2957 | Dev Loss = 0.3233 | Dev Acc = 0.8939
Modèle sauvegardé dans morph_tagger.pt


In [14]:
def evaluate_metrics(model, dataloader, device, UNK_ID, PAD_ID=0):
    model.eval()
    all_preds, all_tags = [], []
    oov_preds, oov_tags = [], []
    known_preds, known_tags = [], []

    with torch.no_grad():
        for batch_in, batch_ends, batch_out in dataloader:
            batch_in = batch_in.to(device)
            batch_ends = batch_ends.to(device) # Move batch_ends to device
            batch_out = batch_out.to(device)
            outputs = model(batch_in, batch_ends)
            predictions = outputs.argmax(dim=-1)

            # Parcourir chaque batch et chaque token
            for i in range(batch_out.shape[0]):  # Use batch_out.shape[0] for batch size
                for j in range(batch_out.shape[1]):  # Use batch_out.shape[1] for word sequence length
                    # Ignorer le padding
                    if batch_out[i, j].item() == PAD_ID:
                        continue
                    pred = predictions[i, j].item()
                    true = batch_out[i, j].item()
                    all_preds.append(pred)
                    all_tags.append(true)

                    # Vérifier si le token est OOV en utilisant batch_in et ends
                    # Get the character index for the current word
                    if j < len(batch_ends[i]) and batch_ends[i][j] != 0:
                      word_end_index = batch_ends[i][j].item()
                      # Check if any character in the word is UNK
                      is_oov = any(batch_in[i, k].item() == UNK_ID for k in range(word_end_index))
                    else:
                      is_oov = False

                    #is_oov = batch_in[i, batch_ends[i][j] if j < len(batch_ends[i]) else 0].item() == UNK_ID
                    if is_oov:  # Si le token est OOV
                        oov_preds.append(pred)
                        oov_tags.append(true)
                    else:
                        known_preds.append(pred)
                        known_tags.append(true)

    overall_acc = accuracy_score(all_tags, all_preds)
    overall_f1 = f1_score(all_tags, all_preds, average="macro")

    known_acc = accuracy_score(known_tags, known_preds) if known_tags else 0.0
    known_f1 = f1_score(known_tags, known_preds, average="macro") if known_tags else 0.0

    oov_acc = accuracy_score(oov_tags, oov_preds) if oov_tags else 0.0
    oov_f1 = f1_score(oov_tags, oov_preds, average="macro") if oov_tags else 0.0

    return {
        "overall_accuracy": overall_acc,
        "overall_f1": overall_f1,
        "known_accuracy": known_acc,
        "known_f1": known_f1,
        "oov_accuracy": oov_acc,
        "oov_f1": oov_f1
    }

In [15]:
for epoch in range(1, epochs+1):
    train_loss = train_model(model, train_loader, criterion, optimizer, device) # Change 'train' to 'train_model'
    dev_loss, dev_accuracy = evaluate_model(model, dev_loader, criterion, device) # Change 'evaluate' to 'evaluate_model'

    # Calcul des métriques détaillées (accuracy et F1 global, sur tokens connus et OOV)
    metrics = evaluate_metrics(model, dev_loader, device, UNK_ID, PAD_ID)

    print(f"Epoch {epoch}: train loss = {train_loss:.4f}, dev loss = {dev_loss:.4f}, dev accuracy = {dev_accuracy:.4f}")
    print("Overall Accuracy: {:.4f}, Overall F1: {:.4f}".format(metrics["overall_accuracy"], metrics["overall_f1"]))
    print("Known Accuracy: {:.4f}, Known F1: {:.4f}".format(metrics["known_accuracy"], metrics["known_f1"]))
    print("OOV Accuracy: {:.4f}, OOV F1: {:.4f}".format(metrics["oov_accuracy"], metrics["oov_f1"]))

Epoch 1: train loss = 0.2166, dev loss = 0.2604, dev accuracy = 0.9234
Overall Accuracy: 0.9234, Overall F1: 0.9237
Known Accuracy: 0.9233, Known F1: 0.9238
OOV Accuracy: 0.9286, OOV F1: 0.6296
Epoch 2: train loss = 0.1933, dev loss = 0.2466, dev accuracy = 0.9253
Overall Accuracy: 0.9253, Overall F1: 0.9253
Known Accuracy: 0.9253, Known F1: 0.9254
OOV Accuracy: 0.9286, OOV F1: 0.6296
Epoch 3: train loss = 0.1705, dev loss = 0.2366, dev accuracy = 0.9273
Overall Accuracy: 0.9273, Overall F1: 0.9275
Known Accuracy: 0.9273, Known F1: 0.9276
OOV Accuracy: 0.9286, OOV F1: 0.6296
Epoch 4: train loss = 0.1597, dev loss = 0.2213, dev accuracy = 0.9306
Overall Accuracy: 0.9306, Overall F1: 0.9316
Known Accuracy: 0.9306, Known F1: 0.9317
OOV Accuracy: 0.9286, OOV F1: 0.6296
Epoch 5: train loss = 0.1440, dev loss = 0.2148, dev accuracy = 0.9352
Overall Accuracy: 0.9352, Overall F1: 0.9362
Known Accuracy: 0.9352, Known F1: 0.9365
OOV Accuracy: 0.9286, OOV F1: 0.6296


In [17]:
import torch
from conllu import parse_incr


def load_model(model_path, device):
    checkpoint = torch.load(model_path, map_location=device)
    char_vocab = checkpoint['char_vocab']
    num_vocab = checkpoint['num_vocab']
    embedding_dim = checkpoint['embedding_dim']
    hidden_dim = checkpoint['hidden_dim']
    max_c = checkpoint['max_c']
    max_w = checkpoint['max_w']
    # Instanciation du modèle (la classe CharMorphTagger doit être définie dans le notebook)
    model = CharMorphTagger(char_vocab_size=len(char_vocab), num_classes=len(num_vocab),
                            embedding_dim=embedding_dim, hidden_dim=hidden_dim,
                            padding_idx=0, dropout=0.5)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)
    model.eval()
    return model, char_vocab, num_vocab, max_c, max_w

def predict_sentence(model, sentence, char_vocab, num_vocab, device, max_c, max_w):
    # encode_sentence est la fonction qui transforme une phrase en (in_enc, ends, out_enc)
    in_enc, ends, _ = encode_sentence(sentence, char_vocab, num_vocab, max_c, max_w)
    in_tensor = torch.tensor(in_enc, dtype=torch.long).unsqueeze(0).to(device)
    ends_tensor = torch.tensor(ends, dtype=torch.long).unsqueeze(0).to(device)
    with torch.no_grad():
        outputs = model(in_tensor, ends_tensor)  # (1, max_w, num_classes)
        preds = outputs.argmax(dim=-1).squeeze(0).tolist()
    # Créer un dictionnaire inverse pour num_vocab
    rev_num_vocab = {v: k for k, v in num_vocab.items()}
    predicted_labels = [rev_num_vocab[p] for p in preds]
    return predicted_labels

# ---- Chargement du modèle et prédiction sur le fichier test ----

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model, char_vocab, num_vocab, max_c, max_w = load_model("/content/morph_tagger.pt", device)

with open("/content/pred.conllu", "w", encoding="utf-8") as outf:
    with open("/content/fr_partut-ud-test.conllu", "r", encoding="utf-8") as f:
        for sentence in parse_incr(f):
            # Prédire le trait Number pour chaque token de la phrase
            predicted_number = predict_sentence(model, sentence, char_vocab, num_vocab, device, max_c, max_w)
            # Mettre à jour le champ "feats" : si la prédiction est "<N/A>", on met None ; sinon, on remplace par {"Number": tag}
            for token, tag in zip(sentence, predicted_number):
                if tag == "<N/A>":
                    token["feats"] = None
                else:
                    token["feats"] = {"Number": tag}
            outf.write(sentence.serialize())
            outf.write("\n")

# ---- Lancement de l'évaluation avec accuracy.py ----
!python accuracy.py -p pred.conllu -g fr_partut-ud-test.conllu -t fr_partut-ud-train.conllu -c feats -f form


  checkpoint = torch.load(model_path, map_location=device)


Predictions file: pred.conllu
Accuracy on all feats: 49.78 ( 1340/ 2692)
Accuracy on OOV feats: 30.33 (   91/  300)

Precision, recall, and F-score for feats:
Definite   : P=100.00 (   89/   89) / R= 23.36 (   89/  381) / F= 37.87
Gender     : P=100.00 (  268/  268) / R= 24.28 (  268/ 1104) / F= 39.07
Mood       : P=100.00 (   34/   34) / R= 18.89 (   34/  180) / F= 31.78
NumType    : P=100.00 (   20/   20) / R= 35.71 (   20/   56) / F= 52.63
Number     : P= 93.49 ( 1493/ 1597) / R= 94.43 ( 1493/ 1581) / F= 93.96
Number[psor]: P=100.00 (    3/    3) / R= 21.43 (    3/   14) / F= 35.29
Person     : P=100.00 (   45/   45) / R= 18.22 (   45/  247) / F= 30.82
Person[psor]: P=100.00 (    3/    3) / R= 21.43 (    3/   14) / F= 35.29
Polarity   : P=100.00 (    8/    8) / R= 26.67 (    8/   30) / F= 42.11
Poss       : P=  0.00 (    0/    0) / R=  0.00 (    0/    1) / F=  0.00
PronType   : P=100.00 (  131/  131) / R= 23.02 (  131/  569) / F= 37.43
Tense      : P=100.00 (   62/   62) / R= 20.06 