In [1]:
!pip install conllu



In [4]:
!pip install torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

## **data preparation**

In [2]:
import os
from collections import defaultdict, Counter
import torch  # Corrected import
from conllu import parse_incr

# Constantes
PAD_ID = 0  # Indice pour le padding
UNK_ID = 1  # Indice pour les mots hors vocabulaire (OOV)

def build_vocab(data_file, min_freq=1):
    """
    Construit les vocabulaires des mots (Vw) et des étiquettes (Vi) à partir du fichier de données.
    :param data_file: Chemin vers le fichier de données au format CoNLL-U.
    :param min_freq: Fréquence minimale pour qu'un mot soit inclus dans le vocabulaire.
    :return: Deux dictionnaires : Vw (mots) et Vi (étiquettes POS).
    """
    word_counter = Counter()  # Compteur pour les mots
    pos_counter = Counter()   # Compteur pour les étiquettes POS

    with open(data_file, "r", encoding="utf-8") as f:
        for sentence in parse_incr(f):
            for token in sentence:
                word = token["form"].lower()  # On utilise des minuscules pour normaliser
                pos = token["upos"]
                word_counter[word] += 1
                pos_counter[pos] += 1

    # Construction du vocabulaire des mots (Vw)
    Vw = {"<PAD>": PAD_ID, "<UNK>": UNK_ID}  # On ajoute les tokens spéciaux
    for word, freq in word_counter.items():
        if freq >= min_freq:  # On ignore les mots trop rares
            Vw[word] = len(Vw)  # On assigne un nouvel indice

    # Construction du vocabulaire des étiquettes (Vi)
    Vi = {"<PAD>": PAD_ID}  # On ajoute le token spécial pour le padding
    for pos in pos_counter:
        Vi[pos] = len(Vi)  # On assigne un nouvel indice

    return Vw, Vi

def encode_data(data_file, Vw, Vi):
    """
    Encode les mots et les étiquettes POS en indices à l'aide des vocabulaires Vw et Vi.
    :param data_file: Chemin vers le fichier de données au format CoNLL-U.
    :param Vw: Vocabulaire des mots.
    :param Vi: Vocabulaire des étiquettes POS.
    :return: Une liste de tuples (mots encodés, étiquettes encodées).
    """
    encoded_data = []

    with open(data_file, "r", encoding="utf-8") as f:
        for sentence in parse_incr(f):
            words = []
            pos_tags = []
            for token in sentence:
                word = token["form"].lower()  # On utilise des minuscules pour normaliser
                pos = token["upos"]
                # Encodage des mots (remplacement des OOV par UNK_ID)
                word_idx = Vw.get(word, UNK_ID)
                pos_idx = Vi[pos]
                words.append(word_idx)
                pos_tags.append(pos_idx)
            encoded_data.append((words, pos_tags))

    return encoded_data

def save_vocab(Vw, Vi, output_dir):
    """
    Sauvegarde les vocabulaires Vw et Vi dans des fichiers.
    :param Vw: Vocabulaire des mots.
    :param Vi: Vocabulaire des étiquettes POS.
    :param output_dir: Répertoire de sortie pour sauvegarder les fichiers.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Sauvegarde du vocabulaire des mots
    with open(os.path.join(output_dir, "vocab_words.txt"), "w", encoding="utf-8") as f:
        for word, idx in Vw.items():
            f.write(f"{word}\t{idx}\n")

    # Sauvegarde du vocabulaire des étiquettes
    with open(os.path.join(output_dir, "vocab_pos.txt"), "w", encoding="utf-8") as f:
        for pos, idx in Vi.items():
            f.write(f"{pos}\t{idx}\n")

def load_vocab(vocab_file):
    """
    Charge un vocabulaire à partir d'un fichier.
    :param vocab_file: Chemin vers le fichier de vocabulaire.
    :return: Un dictionnaire représentant le vocabulaire.
    """
    vocab = {}
    with open(vocab_file, "r", encoding="utf-8") as f:
        for line in f:
            word, idx = line.strip().split("\t")
            vocab[word] = int(idx)
    return vocab

if __name__ == "__main__":
    # Chemin vers le fichier de données d'entraînement
    train_file = "/content/qaf_arabizi-ud-train.conllu"

    # Étape 1 : Construire les vocabulaires
    Vw, Vi = build_vocab(train_file, min_freq=2)  # On ignore les mots qui apparaissent moins de 2 fois
    print(f"Taille du vocabulaire des mots (Vw) : {len(Vw)}")
    print(f"Taille du vocabulaire des étiquettes (Vi) : {len(Vi)}")

    # Étape 2 : Encoder les données d'entraînement
    encoded_train_data = encode_data(train_file, Vw, Vi)
    print(f"Nombre de phrases encodées : {len(encoded_train_data)}")

    # Étape 3 : Sauvegarder les vocabulaires
    save_vocab(Vw, Vi, "vocab")
    print("Vocabulaires sauvegardés dans le dossier 'vocab'.")

Taille du vocabulaire des mots (Vw) : 1443
Taille du vocabulaire des étiquettes (Vi) : 18
Nombre de phrases encodées : 1003
Vocabulaires sauvegardés dans le dossier 'vocab'.


In [5]:
import os
from collections import Counter
import torch
from conllu import parse_incr

# Constantes
PAD_ID = 0  # Padding
UNK_ID = 1  # Out-of-vocabulary

def build_vocab(data_file, min_freq=1):
    """
    Construit les vocabulaires des mots (Vw) et des étiquettes (Vi).
    """
    word_counter = Counter()
    pos_counter = Counter()
    with open(data_file, "r", encoding="utf-8") as f:
        for sentence in parse_incr(f):
            for token in sentence:
                word = token["form"].lower()
                pos = token["upos"]
                word_counter[word] += 1
                pos_counter[pos] += 1

    Vw = {"<PAD>": PAD_ID, "<UNK>": UNK_ID}
    for word, freq in word_counter.items():
        if freq >= min_freq:
            Vw[word] = len(Vw)

    Vi = {"<PAD>": PAD_ID}
    for pos in pos_counter:
        Vi[pos] = len(Vi)

    return Vw, Vi

def encode_data(data_file, Vw, Vi):
    """
    Encode les phrases et leurs étiquettes en indices.
    """
    encoded_data = []
    with open(data_file, "r", encoding="utf-8") as f:
        for sentence in parse_incr(f):
            word_indices = []
            pos_indices = []
            for token in sentence:
                word = token["form"].lower()
                pos = token["upos"]
                word_idx = Vw.get(word, UNK_ID)
                pos_idx = Vi[pos]
                word_indices.append(word_idx)
                pos_indices.append(pos_idx)
            encoded_data.append((word_indices, pos_indices))
    return encoded_data

def pad_batch(batch, pad_value=PAD_ID):
    """
    Applique le padding à une liste de séquences de longueurs variables.
    """
    from torch.nn.utils.rnn import pad_sequence
    batch_tensors = [torch.tensor(seq, dtype=torch.long) for seq in batch]
    padded_batch = pad_sequence(batch_tensors, batch_first=True, padding_value=pad_value)
    return padded_batch

def save_vocab(Vw, Vi, output_dir):
    """
    Sauvegarde les vocabulaires dans des fichiers.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    with open(os.path.join(output_dir, "vocab_words.txt"), "w", encoding="utf-8") as f:
        for word, idx in Vw.items():
            f.write(f"{word}\t{idx}\n")
    with open(os.path.join(output_dir, "vocab_pos.txt"), "w", encoding="utf-8") as f:
        for pos, idx in Vi.items():
            f.write(f"{pos}\t{idx}\n")

def load_vocab(vocab_file):
    """
    Charge un vocabulaire depuis un fichier.
    """
    vocab = {}
    with open(vocab_file, "r", encoding="utf-8") as f:
        for line in f:
            token, idx = line.strip().split("\t")
            vocab[token] = int(idx)
    return vocab


if __name__ == "__main__":
    # Chemin vers le fichier de données d'entraînement
    train_file = "/content/qaf_arabizi-ud-train.conllu"

    # Étape 1 : Construire les vocabulaires
    Vw, Vi = build_vocab(train_file, min_freq=2)  # On ignore les mots qui apparaissent moins de 2 fois
    print(f"Taille du vocabulaire des mots (Vw) : {len(Vw)}")
    print(f"Taille du vocabulaire des étiquettes (Vi) : {len(Vi)}")

    # Étape 2 : Encoder les données d'entraînement
    encoded_train_data = encode_data(train_file, Vw, Vi)
    print(f"Nombre de phrases encodées : {len(encoded_train_data)}")

    # Étape 3 : Sauvegarder les vocabulaires
    save_vocab(Vw, Vi, "vocab")
    print("Vocabulaires sauvegardés dans le dossier 'vocab'.")

Taille du vocabulaire des mots (Vw) : 1443
Taille du vocabulaire des étiquettes (Vi) : 18
Nombre de phrases encodées : 1003
Vocabulaires sauvegardés dans le dossier 'vocab'.


In [6]:
import torch
import torch.nn as nn

class POSTagger(nn.Module):
    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim, padding_idx=0, dropout=0.5):
        super(POSTagger, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True, bias=False)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, tagset_size)

    def forward(self, x):
        # x de taille (batch_size, seq_len)
        embeds = self.embedding(x)  # (batch_size, seq_len, embedding_dim)
        gru_out, _ = self.gru(embeds)  # (batch_size, seq_len, hidden_dim)
        gru_out = self.dropout(gru_out)
        logits = self.fc(gru_out)  # (batch_size, seq_len, tagset_size)
        return logits


In [7]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset


# Création d'un Dataset personnalisé
class POSDataset(Dataset):
    def __init__(self, encoded_data):
        self.encoded_data = encoded_data

    def __len__(self):
        return len(self.encoded_data)

    def __getitem__(self, idx):
        return self.encoded_data[idx]

# Fonction de collate pour le DataLoader
def collate_fn(batch):
    words = [item[0] for item in batch]
    tags = [item[1] for item in batch]
    padded_words = pad_batch(words, pad_value=PAD_ID)
    padded_tags = pad_batch(tags, pad_value=PAD_ID)
    return padded_words, padded_tags

# Fonction d'entraînement sur une époque
def train(model, dataloader, criterion, optimizer, device):
    model.train()
    total_loss = 0.0
    for words, tags in dataloader:
        words = words.to(device)
        tags = tags.to(device)
        optimizer.zero_grad()
        outputs = model(words)  # (batch, seq, tagset_size)
        outputs = outputs.view(-1, outputs.shape[-1])
        tags = tags.view(-1)
        loss = criterion(outputs, tags)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

# Fonction d'évaluation sur le set dev
def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0.0
    correct = 0
    total_tokens = 0
    with torch.no_grad():
        for words, tags in dataloader:
            words = words.to(device)
            tags = tags.to(device)
            outputs = model(words)
            outputs = outputs.view(-1, outputs.shape[-1])
            tags = tags.view(-1)
            loss = criterion(outputs, tags)
            total_loss += loss.item()

            predictions = outputs.argmax(dim=1)
            mask = tags != PAD_ID
            correct += (predictions[mask] == tags[mask]).sum().item()
            total_tokens += mask.sum().item()
    accuracy = correct / total_tokens if total_tokens > 0 else 0
    return total_loss / len(dataloader), accuracy

if __name__ == "__main__":
    # Chemins vers les fichiers
    train_file = "/content/qaf_arabizi-ud-train.conllu"
    dev_file = "/content/qaf_arabizi-ud-dev.conllu"

    # Hyperparamètres
    embedding_dim = 100
    hidden_dim = 128
    batch_size = 32
    epochs = 10
    learning_rate = 0.001
    min_freq = 2
    dropout = 0.5
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Construction des vocabulaires à partir du fichier d'entraînement
    Vw, Vi = build_vocab(train_file, min_freq)
    print(f"Taille du vocabulaire des mots : {len(Vw)}")
    print(f"Taille du vocabulaire des étiquettes : {len(Vi)}")

    # Sauvegarde des vocabulaires
    save_vocab(Vw, Vi, "vocab")

    # Encodage des données d'entraînement et de développement
    train_data = encode_data(train_file, Vw, Vi)
    dev_data = encode_data(dev_file, Vw, Vi)

    train_dataset = POSDataset(train_data)
    dev_dataset = POSDataset(dev_data)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    # Initialisation du modèle, de la fonction de perte et de l'optimiseur
    model = POSTagger(vocab_size=len(Vw), tagset_size=len(Vi),
                      embedding_dim=embedding_dim, hidden_dim=hidden_dim,
                      padding_idx=PAD_ID, dropout=dropout)
    model.to(device)

    criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Boucle d'entraînement
    for epoch in range(1, epochs+1):
        train_loss = train(model, train_loader, criterion, optimizer, device)
        dev_loss, dev_accuracy = evaluate(model, dev_loader, criterion, device)
        print(f"Epoch {epoch}: train loss = {train_loss:.4f}, dev loss = {dev_loss:.4f}, dev accuracy = {dev_accuracy:.4f}")

    # Sauvegarde du modèle et des paramètres nécessaires
    model_state = {
        'model_state_dict': model.state_dict(),
        'Vw': Vw,
        'Vi': Vi,
        'embedding_dim': embedding_dim,
        'hidden_dim': hidden_dim,
        'PAD_ID': PAD_ID,
        'UNK_ID': UNK_ID
    }
    torch.save(model_state, "pos_tagger.pt")
    print("Modèle sauvegardé dans pos_tagger.pt")


Taille du vocabulaire des mots : 1443
Taille du vocabulaire des étiquettes : 18
Epoch 1: train loss = 2.4798, dev loss = 2.1086, dev accuracy = 0.3822
Epoch 2: train loss = 2.0280, dev loss = 1.8138, dev accuracy = 0.4623
Epoch 3: train loss = 1.7729, dev loss = 1.6495, dev accuracy = 0.5147
Epoch 4: train loss = 1.6054, dev loss = 1.5494, dev accuracy = 0.5403
Epoch 5: train loss = 1.4968, dev loss = 1.4562, dev accuracy = 0.5778
Epoch 6: train loss = 1.4049, dev loss = 1.3956, dev accuracy = 0.5901
Epoch 7: train loss = 1.3271, dev loss = 1.3492, dev accuracy = 0.6114
Epoch 8: train loss = 1.2723, dev loss = 1.3023, dev accuracy = 0.6216
Epoch 9: train loss = 1.2101, dev loss = 1.2796, dev accuracy = 0.6319
Epoch 10: train loss = 1.1620, dev loss = 1.2482, dev accuracy = 0.6412
Modèle sauvegardé dans pos_tagger.pt


In [8]:
import torch
from conllu import parse_incr

def load_model(model_path, device):
    checkpoint = torch.load(model_path, map_location=device)
    Vw = checkpoint['Vw']
    Vi = checkpoint['Vi']
    embedding_dim = checkpoint['embedding_dim']
    hidden_dim = checkpoint['hidden_dim']
    model = POSTagger(vocab_size=len(Vw), tagset_size=len(Vi),
                      embedding_dim=embedding_dim, hidden_dim=hidden_dim,
                      padding_idx=PAD_ID)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)
    model.eval()
    return model, Vw, Vi

def predict_sentence(model, sentence, Vw, Vi, device):
    # Extraction des mots et conversion en indices
    words = [token['form'].lower() for token in sentence]
    word_indices = [Vw.get(word, UNK_ID) for word in words]
    input_tensor = torch.tensor(word_indices, dtype=torch.long).unsqueeze(0).to(device)
    with torch.no_grad():
        outputs = model(input_tensor)  # (1, seq_len, tagset_size)
        predictions = outputs.argmax(dim=-1).squeeze(0).cpu().tolist()
    # Création d'un dictionnaire inverse pour les étiquettes
    rev_Vi = {idx: tag for tag, idx in Vi.items()}
    predicted_tags = [rev_Vi.get(idx, "UNK") for idx in predictions]
    return predicted_tags

if __name__ == "__main__":
    import sys
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_path = "/content/pos_tagger.pt"

    # Chargement du modèle et des vocabulaires
    model, Vw, Vi = load_model(model_path, device)

    # Chemin du fichier à traiter (par exemple dev ou test)
    file_path = "/content/qaf_arabizi-ud-test.conllu"

    with open(file_path, "r", encoding="utf-8") as f:
        for sentence in parse_incr(f):
            predicted_tags = predict_sentence(model, sentence, Vw, Vi, device)
            tokens = [token["form"] for token in sentence]
            print("Sentence :", " ".join(tokens))
            print("Predicted POS :", " ".join(predicted_tags))
            print("---")


  checkpoint = torch.load(model_path, map_location=device)


Sentence : amina2003alg@yahoofr
Predicted POS : VERB
---
Sentence : eh benh man jadda wadjad wa man zara3a hassad elli ta3bo nchalah enajah e najah w lokhrine l okhrine mel m el ahsane esanw
Predicted POS : VERB VERB PRON VERB VERB CCONJ PRON VERB VERB VERB VERB INTJ VERB DET NOUN CCONJ VERB DET NOUN _ ADP DET NOUN VERB
---
Sentence : ntouma ma3labalkomch ja bladna director general ta3 cannal+ ewa appres ki masha 3la alger chaf danya ka3 barabollat alors f conference m3a press 9alhoum algerie champion du monde f barabol ? chafto kifach tbahdila wallah hada li dar 9anoun ya3tih saha bazaf ? ness tabki parce wue khlas ya y a pas xxxx amcho tobo ila allah allah yahdina wa iyakom amin ya rabi
Predicted POS : PRON VERB VERB NOUN VERB VERB ADP NOUN VERB VERB ADV VERB ADP NOUN VERB VERB VERB VERB VERB ADP NOUN ADP NOUN _ PROPN ADP ADP NOUN ADP NOUN PUNCT VERB ADV NOUN INTJ PRON PRON VERB NOUN VERB NOUN DET PUNCT PRON VERB VERB VERB VERB INTJ PRON VERB ADV VERB VERB VERB ADP PROPN PROPN VERB C

In [9]:
from sklearn.metrics import accuracy_score, f1_score

def evaluate_metrics(model, dataloader, device, UNK_ID, PAD_ID=0):
    model.eval()
    all_preds, all_tags = [], []
    oov_preds, oov_tags = [], []
    known_preds, known_tags = [], []

    with torch.no_grad():
        for words, tags in dataloader:
            words = words.to(device)
            tags = tags.to(device)
            outputs = model(words)  # shape : (batch_size, seq_len, tagset_size)
            predictions = outputs.argmax(dim=-1)  # (batch_size, seq_len)

            # Parcourir chaque batch et chaque token
            for i in range(words.shape[0]):
                for j in range(words.shape[1]):
                    # Ignorer le padding
                    if tags[i, j].item() == PAD_ID:
                        continue
                    pred = predictions[i, j].item()
                    true = tags[i, j].item()
                    all_preds.append(pred)
                    all_tags.append(true)

                    # Si le token est OOV (celui-ci a été encodé par UNK_ID)
                    if words[i, j].item() == UNK_ID:
                        oov_preds.append(pred)
                        oov_tags.append(true)
                    else:
                        known_preds.append(pred)
                        known_tags.append(true)

    overall_acc = accuracy_score(all_tags, all_preds)
    overall_f1 = f1_score(all_tags, all_preds, average="macro")

    known_acc = accuracy_score(known_tags, known_preds) if known_tags else 0.0
    known_f1 = f1_score(known_tags, known_preds, average="macro") if known_tags else 0.0

    oov_acc = accuracy_score(oov_tags, oov_preds) if oov_tags else 0.0
    oov_f1 = f1_score(oov_tags, oov_preds, average="macro") if oov_tags else 0.0

    return {
        "overall_accuracy": overall_acc,
        "overall_f1": overall_f1,
        "known_accuracy": known_acc,
        "known_f1": known_f1,
        "oov_accuracy": oov_acc,
        "oov_f1": oov_f1
    }


In [10]:

for epoch in range(1, epochs+1):
    train_loss = train(model, train_loader, criterion, optimizer, device)
    dev_loss, dev_accuracy = evaluate(model, dev_loader, criterion, device)

    # Calcul des métriques détaillées (accuracy et F1 global, sur tokens connus et OOV)
    metrics = evaluate_metrics(model, dev_loader, device, UNK_ID, PAD_ID)

    print(f"Epoch {epoch}: train loss = {train_loss:.4f}, dev loss = {dev_loss:.4f}, dev accuracy = {dev_accuracy:.4f}")
    print("Overall Accuracy: {:.4f}, Overall F1: {:.4f}".format(metrics["overall_accuracy"], metrics["overall_f1"]))
    print("Known Accuracy: {:.4f}, Known F1: {:.4f}".format(metrics["known_accuracy"], metrics["known_f1"]))
    print("OOV Accuracy: {:.4f}, OOV F1: {:.4f}".format(metrics["oov_accuracy"], metrics["oov_f1"]))


Epoch 1: train loss = 1.1352, dev loss = 1.2482, dev accuracy = 0.6412
Overall Accuracy: 0.6412, Overall F1: 0.5195
Known Accuracy: 0.7978, Known F1: 0.6218
OOV Accuracy: 0.4348, OOV F1: 0.0813
Epoch 2: train loss = 1.1250, dev loss = 1.2482, dev accuracy = 0.6412
Overall Accuracy: 0.6412, Overall F1: 0.5195
Known Accuracy: 0.7978, Known F1: 0.6218
OOV Accuracy: 0.4348, OOV F1: 0.0813
Epoch 3: train loss = 1.1280, dev loss = 1.2482, dev accuracy = 0.6412
Overall Accuracy: 0.6412, Overall F1: 0.5195
Known Accuracy: 0.7978, Known F1: 0.6218
OOV Accuracy: 0.4348, OOV F1: 0.0813
Epoch 4: train loss = 1.1275, dev loss = 1.2482, dev accuracy = 0.6412
Overall Accuracy: 0.6412, Overall F1: 0.5195
Known Accuracy: 0.7978, Known F1: 0.6218
OOV Accuracy: 0.4348, OOV F1: 0.0813
Epoch 5: train loss = 1.1257, dev loss = 1.2482, dev accuracy = 0.6412
Overall Accuracy: 0.6412, Overall F1: 0.5195
Known Accuracy: 0.7978, Known F1: 0.6218
OOV Accuracy: 0.4348, OOV F1: 0.0813
Epoch 6: train loss = 1.1314, 

In [12]:
import torch
from conllu import parse_incr

# Définir l'appareil (GPU si disponible)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Charger le modèle et les vocabulaires (assurez-vous que load_model est défini dans le notebook)
model, Vw, Vi = load_model("/content/pos_tagger.pt", device)

# Ouvrir le fichier de prédictions en écriture
with open("/content/pred.conllu", "w", encoding="utf-8") as outf:
    # Lire le fichier de test
    with open("/content/qaf_arabizi-ud-test.conllu", "r", encoding="utf-8") as f:
        for sentence in parse_incr(f):
            # Prédire les POS pour la phrase
            predicted_tags = predict_sentence(model, sentence, Vw, Vi, device)
            # Mettre à jour le champ "upos" de chaque token avec la prédiction
            for token, tag in zip(sentence, predicted_tags):
                token["upos"] = tag
            # Écrire la phrase au format CoNLL-U dans le fichier de prédictions
            outf.write(sentence.serialize())
            outf.write("\n")

# Lancer l'évaluation avec accuracy.py
!python accuracy.py -p pred.conllu -g qaf_arabizi-ud-test.conllu -t qaf_arabizi-ud-train.conllu -c upos -f form


  checkpoint = torch.load(model_path, map_location=device)


Predictions file: pred.conllu
Accuracy on all upos: 64.85 ( 1489/ 2296)
Accuracy on OOV upos: 43.90 (  349/  795)
