In [None]:
!git pull "https://DareAdekunle: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@github.com/DareAdekunle/masakhane-pos.git" main

From https://github.com/DareAdekunle/masakhane-pos
 * branch            main       -> FETCH_HEAD
Already up to date.


In [1]:
# Mount Google Drive at /content/drive (requires the google.colab package,
# i.e. a Colab runtime). The next cell changes into a directory below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
# Make the repository checkout on the mounted Drive the working directory.
# NOTE(review): this absolute path is specific to one Drive layout -- adjust it to your own checkout.
%cd /content/drive/MyDrive/GitHub/repos/masakhane-pos

/content/drive/MyDrive/GitHub/repos/masakhane-pos


# African POS Notebook [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/masakhane-io/masakhane-pos/blob/main/train_pos.ipynb)


This notebook fine-tunes a pre-trained language model on an African part-of-speech (POS) tagging dataset.

##### Sections:

There are four sections in this notebook:

1. Installations: this is where we do installation for relevant dependencies
2. Imports: here, we perform imports for all the dependencies needed
3. Utility Classes and Functions: here, we define utility classes and functions that will help us train
4. Training: Here, the actual training process is done

### NB: Please run all the cells in this notebook as they are. The only section that can be modified is the training section. The parts of the code that can be modified are clearly explained in the training section.


### 1. Installations

In [3]:
!pip install transformers
!pip install seqeval
!pip install ptvsd
!pip install sentencepiece

Collecting transformers
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m36.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m60.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m67.3 MB/s[0m eta [36m0:00:0

### 2. Imports

In [2]:


import argparse
import glob
import logging
import os
import random
from collections import defaultdict, Counter

import torch
import numpy as np
import pandas as pd

from scipy.sparse import save_npz, load_npz
from seqeval.metrics import f1_score, precision_score, recall_score
from torch import LongTensor
from torch import nn, optim
from torch.nn import CrossEntropyLoss
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

from transformers import (
    WEIGHTS_NAME,
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
    AutoConfig,
    AutoTokenizer,
    AutoModel,
    AdamW,
    BertConfig,
    BertForTokenClassification,
    BertTokenizer,
    CamembertConfig,
    CamembertForTokenClassification,
    CamembertTokenizer,
    DistilBertConfig,
    DistilBertForTokenClassification,
    DistilBertTokenizer,
    RobertaConfig,
    RobertaForTokenClassification,
    RobertaTokenizer,
    XLMRobertaConfig,
    XLMRobertaForTokenClassification,
    XLMRobertaTokenizer,
    get_linear_schedule_with_warmup,
    get_constant_schedule_with_warmup,
)

try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter

# Root logging is configured once at DEBUG level; every helper in this notebook
# logs through the shared "Afri_NER_Log" logger.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("Afri_NER_Log")

# Explicit (config, model, tokenizer) triples per architecture name.
# start_training() currently resolves classes through the Auto* API instead
# (its MODEL_CLASSES lookup is commented out), so this table is informational.
MODEL_CLASSES = dict(
    bert=(BertConfig, BertForTokenClassification, BertTokenizer),
    roberta=(RobertaConfig, RobertaForTokenClassification, RobertaTokenizer),
    distilbert=(DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer),
    camembert=(CamembertConfig, CamembertForTokenClassification, CamembertTokenizer),
    xlmroberta=(XLMRobertaConfig, XLMRobertaForTokenClassification, XLMRobertaTokenizer),
)

### 3. Utility classes and functions

Here, we write utility classes and functions that we will use for training. You can just run all the cells below.

**PLEASE DO NOT MAKE ANY CHANGES IN THIS SECTION**

We begin by defining the example and feature containers used for our POS tagging task.

In [3]:
class InputExample:
    """One sentence of a token-classification dataset.

    Attributes:
        guid: Unique id for the example.
        words: list. The words of the sequence.
        labels: list. One label per word; placeholder labels may be used for
            unlabeled test data.
    """

    def __init__(self, guid, words, labels):
        """Store the identifier, the words and their labels unchanged."""
        self.guid, self.words, self.labels = guid, words, labels

class InputFeatures:
    """Model-ready encoding of a single example.

    Attributes:
        input_ids: token ids of the sequence.
        input_mask: attention mask matching ``input_ids``.
        segment_ids: segment (token type) ids matching ``input_ids``.
        label_ids: label ids matching ``input_ids``.
    """

    def __init__(self, input_ids, input_mask, segment_ids, label_ids):
        """Store the four parallel sequences unchanged."""
        self.input_ids, self.input_mask = input_ids, input_mask
        self.segment_ids, self.label_ids = segment_ids, label_ids


Next, we define the train and evaluation functions

In [4]:
def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
    """Fine-tune ``model`` on ``train_dataset``.

    Args:
        args: Run settings. Read here: ``local_rank``, ``n_gpu``, ``device``,
            ``model_type``, ``model_name_or_path``, ``output_dir``,
            ``per_gpu_train_batch_size``, ``gradient_accumulation_steps``,
            ``max_steps``, ``num_train_epochs``, ``weight_decay``,
            ``learning_rate``, ``adam_epsilon``, ``warmup_steps``,
            ``max_grad_norm``, ``logging_steps``, ``save_steps`` and
            ``evaluate_during_training``. ``train_batch_size`` is written, and
            ``num_train_epochs`` is overwritten when ``max_steps > 0``.
        train_dataset: Dataset whose batches are indexed as
            ``(input_ids, attention_mask, segment_ids, label_ids)``.
        model: Token-classification model; its first output is used as the loss.
        tokenizer: Saved next to every checkpoint and forwarded to ``evaluate``.
        labels: Label names, forwarded to ``evaluate``.
        pad_token_label_id: Label id to ignore, forwarded to ``evaluate``.

    Returns:
        ``(global_step, average_loss)`` where ``average_loss`` is the accumulated
        loss divided by the number of optimizer steps (``0.0`` if none ran).
    """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    # Optimizer updates per epoch. Clamped to 1 so that a dataset with fewer
    # batches than accumulation steps cannot cause a division by zero below.
    steps_per_epoch = max(1, len(train_dataloader) // args.gradient_accumulation_steps)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // steps_per_epoch + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay).
    # Biases and LayerNorm weights are excluded from weight decay.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
            os.path.join(args.model_name_or_path, "scheduler.pt")
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        # Recover global_step from a "...checkpoint-<step>" style path; any
        # other local path restarts the counter at 0.
        try:
            global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
        except ValueError:
            global_step = 0
        epochs_trained = global_step // steps_per_epoch
        steps_trained_in_current_epoch = global_step % steps_per_epoch

        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
        logger.info("  Continuing training from epoch %d", epochs_trained)
        logger.info("  Continuing training from global step %d", global_step)
        logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
            if args.model_type != "distilbert":
                inputs["token_type_ids"] = (
                    batch[2] if args.model_type in ["bert", "xlnet"] else None
                )  # XLM and RoBERTa don"t use segment_ids

            outputs = model(**inputs)
            loss = outputs[0]  # the loss is the first element of the model output

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                # The optimizer must step before the scheduler; the reverse
                # order skips the first value of the learning-rate schedule.
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                            args.local_rank == -1 and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev")
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    # get_last_lr() reports the rate actually applied by the last scheduler step.
                    tb_writer.add_scalar("lr", scheduler.get_last_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    # Guard against a run in which no optimizer step was taken.
    average_loss = tr_loss / global_step if global_step > 0 else 0.0
    return global_step, average_loss

In [5]:
def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""):
    """Evaluate ``model`` on the ``mode`` split and return metrics and predictions.

    Args:
        args: Run settings. Read here: ``per_gpu_eval_batch_size``, ``n_gpu``,
            ``local_rank``, ``device`` and ``model_type``; ``eval_batch_size``
            is written. Also forwarded to ``load_and_cache_examples``.
        model: Token-classification model whose first two outputs are
            ``(loss, logits)``.
        tokenizer: Forwarded to ``load_and_cache_examples``.
        labels: Label names; the position in the list is the label id.
        pad_token_label_id: Label id marking positions excluded from scoring.
        mode: Name of the split to load (e.g. ``"dev"`` or ``"test"``).
        prefix: Text appended to the log headers.

    Returns:
        ``(results, preds_list)``: ``results`` holds ``loss``, ``precision``,
        ``recall`` and ``f1``; ``preds_list`` holds, per example, the predicted
        label names at every position whose gold label is not
        ``pad_token_label_id``.

    Raises:
        ValueError: if the evaluation split produced no batches.
    """
    eval_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode=mode)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # multi-gpu evaluate; train() may hand over a model that is already
    # wrapped, and wrapping it a second time must be avoided.
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation %s *****", prefix)
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    # Per-batch arrays are collected and concatenated once after the loop
    # instead of re-copying the growing arrays on every batch.
    logit_batches = []
    label_batches = []
    model.eval()
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
            if args.model_type != "distilbert":
                inputs["token_type_ids"] = (
                    batch[2] if args.model_type in ["bert", "xlnet"] else None
                )  # XLM and RoBERTa don"t use segment_ids
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            if args.n_gpu > 1:
                tmp_eval_loss = tmp_eval_loss.mean()  # mean() to average on multi-gpu parallel evaluating

            eval_loss += tmp_eval_loss.item()
        nb_eval_steps += 1
        logit_batches.append(logits.detach().cpu().numpy())
        label_batches.append(inputs["labels"].detach().cpu().numpy())

    if nb_eval_steps == 0:
        raise ValueError("No evaluation examples were found for mode '{}'.".format(mode))

    eval_loss = eval_loss / nb_eval_steps
    preds = np.argmax(np.concatenate(logit_batches, axis=0), axis=2)
    out_label_ids = np.concatenate(label_batches, axis=0)

    label_map = {i: label for i, label in enumerate(labels)}

    out_label_list = [[] for _ in range(out_label_ids.shape[0])]
    preds_list = [[] for _ in range(out_label_ids.shape[0])]

    # Keep only positions that carry a real label (the first sub-token of each
    # word); everything marked with pad_token_label_id is ignored.
    for i in range(out_label_ids.shape[0]):
        for j in range(out_label_ids.shape[1]):
            if out_label_ids[i, j] != pad_token_label_id:
                out_label_list[i].append(label_map[out_label_ids[i][j]])
                preds_list[i].append(label_map[preds[i][j]])

    results = {
        "loss": eval_loss,
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
        "f1": f1_score(out_label_list, preds_list),
    }

    logger.info("***** Eval results %s *****", prefix)
    for key in sorted(results.keys()):
        logger.info("  %s = %s", key, str(results[key]))

    return results, preds_list


Next, we define functions that will help us load and preprocess the examples.

In [6]:
def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode):
    """Build the ``TensorDataset`` for the ``mode`` split, using a feature cache.

    Features are loaded from ``<data_dir>/cached_<mode>_<model>_<max_seq_length>``
    when that file exists and ``args.overwrite_cache`` is false; otherwise they
    are created from ``<data_dir>/<mode>.txt`` and (on rank -1/0) written to the
    cache.

    Args:
        args: Run settings. Read here: ``local_rank``, ``data_dir``,
            ``model_name_or_path``, ``max_seq_length``, ``overwrite_cache`` and
            ``model_type``.
        tokenizer: Tokenizer used to create the features.
        labels: Label names, forwarded to ``convert_examples_to_features``.
        pad_token_label_id: Label id used for positions without a label.
        mode: Split name; selects both the input file and the cache file.

    Returns:
        ``TensorDataset`` of ``(input_ids, input_mask, segment_ids, label_ids)``.
    """
    # The previous guard tested ``not evaluate``; ``evaluate`` is the
    # module-level function and therefore always truthy, so the barriers never
    # ran. Only the training split is loaded by every distributed process, so
    # that is the case in which the processes have to be synchronized.
    is_training_split = mode == "train"

    if args.local_rank not in [-1, 0] and is_training_split:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        "cached_{}_{}_{}".format(
            mode, list(filter(None, args.model_name_or_path.split("/"))).pop(), str(args.max_seq_length)
        ),
    )
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        # NOTE: torch.load unpickles the file -- only load caches you created yourself.
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        examples = read_examples_from_file(args.data_dir, mode)
        features = convert_examples_to_features(
            examples,
            labels,
            args.max_seq_length,
            tokenizer,
            cls_token_at_end=bool(args.model_type in ["xlnet"]),
            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=bool(args.model_type in ["roberta"]),
            # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=bool(args.model_type in ["xlnet"]),
            # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
            pad_token_label_id=pad_token_label_id,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and is_training_split:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset


In [7]:
def read_examples_from_file(data_dir, mode):
    """Read ``<data_dir>/<mode>.txt`` into a list of ``InputExample`` objects.

    The file holds one token per line. The word is the first space-separated
    field and the label is the last one. A line shorter than two characters
    (after stripping) ends the current sentence.

    Args:
        data_dir: Directory containing the data file.
        mode: Split name; used for the file name and the example ids.

    Returns:
        List of ``InputExample`` with guids ``"<mode>-<n>"``, numbered from 1.
    """
    file_path = os.path.join(data_dir, "{}.txt".format(mode))
    guid_index = 1
    examples = []
    with open(file_path, encoding="utf-8") as f:
        words = []
        labels = []
        for line in f:
            line = line.strip()
            if len(line) < 2:
                # Sentence boundary: flush the sentence collected so far.
                if words:
                    examples.append(InputExample(guid="{}-{}".format(mode, guid_index), words=words, labels=labels))
                    guid_index += 1
                    words = []
                    labels = []
            else:
                splits = line.split(" ")
                words.append(splits[0])
                if len(splits) > 1:
                    labels.append(splits[-1])
                else:
                    # Examples could have no label for mode = "test". Use "X" as
                    # the placeholder: get_labels() guarantees that "X" is in the
                    # label set, whereas "O" is not a POS tag and would fail the
                    # label lookup in convert_examples_to_features.
                    labels.append("X")
        if words:
            examples.append(InputExample(guid="{}-{}".format(mode, guid_index), words=words, labels=labels))
    return examples


def convert_examples_to_features(
    examples,
    label_list,
    max_seq_length,
    tokenizer,
    cls_token_at_end=False,
    cls_token="[CLS]",
    cls_token_segment_id=1,
    sep_token="[SEP]",
    sep_token_extra=False,
    pad_on_left=False,
    pad_token=0,
    pad_token_segment_id=0,
    pad_token_label_id=-100,
    sequence_a_segment_id=0,
    mask_padding_with_zero=True,
):
    """Convert examples into fixed-length ``InputFeatures``.

    Every word is split into sub-tokens. The first sub-token carries the word's
    label id and the remaining ones carry ``pad_token_label_id``. Sequences are
    truncated to leave room for the special tokens and then padded to
    ``max_seq_length``.

    `cls_token_at_end` define the location of the CLS token:
        - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
        - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
    `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)

    Args:
        examples: Objects exposing ``guid``, ``words`` and ``labels``.
        label_list: Label names; the position in the list is the label id.
        max_seq_length: Length of every produced sequence.
        tokenizer: Provides ``tokenize`` and ``convert_tokens_to_ids``.
        sep_token_extra: Append a second separator (used for RoBERTa).
        pad_on_left: Pad at the start instead of the end.
        pad_token, pad_token_segment_id, pad_token_label_id: Padding values for
            the input ids, segment ids and label ids.
        sequence_a_segment_id: Segment id of the sentence tokens.
        mask_padding_with_zero: If true the mask is 1 for real tokens and 0 for
            padding, otherwise the other way round.

    Returns:
        List with one ``InputFeatures`` per example.

    Raises:
        KeyError: if an example uses a label that is not in ``label_list``.
    """

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d", ex_index, len(examples))

        tokens = []
        label_ids = []
        for word, label in zip(example.words, example.labels):
            word_tokens = tokenizer.tokenize(word)
            # Some tokenizers return no tokens at all for certain inputs (for
            # example a lone whitespace-like character). Adding a label for
            # such a word would shift every following label by one position,
            # so the word is skipped.
            if len(word_tokens) > 0:
                tokens.extend(word_tokens)
                # Use the real label id for the first token of the word, and padding ids for the remaining tokens
                label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1))

        # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
        special_tokens_count = 3 if sep_token_extra else 2
        if len(tokens) > max_seq_length - special_tokens_count:
            tokens = tokens[: (max_seq_length - special_tokens_count)]
            label_ids = label_ids[: (max_seq_length - special_tokens_count)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids:   0   0   0   0  0     0   0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens += [sep_token]
        label_ids += [pad_token_label_id]
        if sep_token_extra:
            # roberta uses an extra separator b/w pairs of sentences
            tokens += [sep_token]
            label_ids += [pad_token_label_id]
        segment_ids = [sequence_a_segment_id] * len(tokens)

        if cls_token_at_end:
            tokens += [cls_token]
            label_ids += [pad_token_label_id]
            segment_ids += [cls_token_segment_id]
        else:
            tokens = [cls_token] + tokens
            label_ids = [pad_token_label_id] + label_ids
            segment_ids = [cls_token_segment_id] + segment_ids

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_seq_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
            label_ids = ([pad_token_label_id] * padding_length) + label_ids
        else:
            input_ids += [pad_token] * padding_length
            input_mask += [0 if mask_padding_with_zero else 1] * padding_length
            segment_ids += [pad_token_segment_id] * padding_length
            label_ids += [pad_token_label_id] * padding_length

        # Tokens and labels are built in lockstep above, so all four sequences
        # have the same length. An example is no longer dropped silently when
        # its labels are out of step with its tokens.
        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        assert len(label_ids) == max_seq_length

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s", example.guid)
            logger.info("tokens: %s", " ".join([str(x) for x in tokens]))
            logger.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
            logger.info("segment_ids: %s", " ".join([str(x) for x in segment_ids]))
            logger.info("label_ids: %s", " ".join([str(x) for x in label_ids]))

        features.append(
            InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_ids=label_ids)
        )
    return features

def get_labels(path):
    """Return the list of label names used for tagging.

    With a falsy ``path`` the built-in tag set is returned. Otherwise the file
    is read one label per line, and "X" is put in front when the file does not
    list it.
    """
    if not path:
        return ["X", "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN", "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB"]

    with open(path, "r") as label_file:
        tag_names = label_file.read().splitlines()
    return tag_names if "X" in tag_names else ["X"] + tag_names

Next, we define a function to set the seed and the function to start the actual training

In [8]:
def set_seed(args):
    """Seed the Python, NumPy and PyTorch random generators from ``args.seed``.

    The CUDA generators are seeded as well when ``args.n_gpu`` is positive.
    """
    seed_value = args.seed
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed_value)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(seed_value)

In [9]:
def start_training(args):
    """Run the stages selected in ``args``: train, fine-tune, evaluate, predict.

    Stages (each one is optional):
        * ``do_train``: train the model loaded from ``model_name_or_path``.
        * ``do_finetune``: load the model from ``input_dir``, evaluate it on the
          test split, then continue training on the train split.
        * after training or fine-tuning the model, tokenizer and ``args`` are
          saved to ``output_dir``.
        * ``do_eval``: evaluate the saved model (or every saved checkpoint when
          ``eval_all_checkpoints`` is set) on the dev split and write
          ``eval_results.txt``.
        * ``do_predict``: evaluate the saved model on the test split and write
          ``test_results.txt`` and ``test_predictions.txt``.

    Args:
        args: Run settings (see ``get_args``). ``n_gpu``, ``device`` and the
            lower-cased ``model_type`` are written back onto it.

    Raises:
        ValueError: if ``output_dir`` exists and is not empty, ``do_train`` is
            set and ``overwrite_output_dir`` is not.
    """
    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        # With --no_cuda no GPU may be counted; otherwise the CUDA seeding and
        # DataParallel wrapping would still be triggered for a CPU run.
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    # NOTE(review): logging.basicConfig() does nothing when the root logger
    # already has handlers, which is the case after the module-level
    # basicConfig(level=logging.DEBUG) call -- confirm whether this format and
    # level are expected to take effect.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )

    # Set seed
    set_seed(args)

    # Prepare the POS tagging label set
    labels = get_labels(args.labels)
    num_labels = len(labels)
    # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
    pad_token_label_id = CrossEntropyLoss().ignore_index

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    # The Auto* classes pick the architecture from the checkpoint itself.
    config_class, model_class, tokenizer_class = AutoConfig, AutoModelForTokenClassification, AutoTokenizer

    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        id2label={str(i): label for i, label in enumerate(labels)},
        label2id={label: i for i, label in enumerate(labels)},
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        cache_dir=args.cache_dir if args.cache_dir else None,
        use_fast=True,
    )
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="train")
        global_step, tr_loss = train(args, train_dataset, model, tokenizer, labels, pad_token_label_id)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Fine-tuning
    if args.do_finetune:
        tokenizer = tokenizer_class.from_pretrained(args.input_dir, do_lower_case=args.do_lower_case)
        model = model_class.from_pretrained(args.input_dir)
        model.to(args.device)
        # Score the loaded model on the test split before it is trained further.
        result, predictions = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="test")
        train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="train")

        global_step, tr_loss = train(args, train_dataset, model, tokenizer, labels, pad_token_label_id)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if (args.do_train or args.do_finetune) and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (
            model.module if hasattr(model, "module") else model
        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
            )
            # The library is imported as ``transformers``; the old
            # "pytorch_transformers" logger name no longer matches anything.
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev", prefix=global_step)
            if global_step:
                result = {"{}_{}".format(global_step, k): v for k, v in result.items()}
            results.update(result)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w", encoding="utf-8") as writer:
            for key in sorted(results.keys()):
                writer.write("{} = {}\n".format(key, str(results[key])))

    if args.do_predict and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        model = model_class.from_pretrained(args.output_dir)
        model.to(args.device)
        result, predictions = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="test")
        # Save results
        output_test_results_file = os.path.join(args.output_dir, "test_results.txt")
        with open(output_test_results_file, "w", encoding="utf-8") as writer:
            for key in sorted(result.keys()):
                writer.write("{} = {}\n".format(key, str(result[key])))
        # Save predictions
        output_test_predictions_file = os.path.join(args.output_dir, "test_predictions.txt")
        # The data file is read as UTF-8, exactly as read_examples_from_file does.
        with open(output_test_predictions_file, "w", encoding="utf-8") as writer:
            with open(os.path.join(args.data_dir, "test.txt"), "r", encoding="utf-8") as f:
                example_id = 0
                for line in f:
                    # Same sentence-boundary rule as read_examples_from_file
                    # (stripped line shorter than two characters), so that
                    # whitespace-only lines are not mistaken for tokens.
                    is_boundary = line.startswith("-DOCSTART-") or len(line.strip()) < 2
                    has_example = example_id < len(predictions)
                    if is_boundary:
                        writer.write(line)
                        # Move on once all predictions of the current sentence are written.
                        if has_example and not predictions[example_id]:
                            example_id += 1
                    elif has_example and predictions[example_id]:
                        output_line = line.split()[0] + " " + predictions[example_id].pop(0) + "\n"
                        writer.write(output_line)
                    else:
                        logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])

    logger.info(results)


### 4. Training

Here, we perform the actual training process after defining the training arguments.

In [10]:
import argparse

In [11]:
def get_args():
    """
    Build the argument parser for a training run and parse the current ``sys.argv``.

    Every option has a default and none is marked as required, so the function can be
    called without any command-line arguments; the notebook cells then overwrite
    individual attributes of the returned namespace.

    Returns:
        tuple: ``(namespace, unknown)`` exactly as produced by
        ``ArgumentParser.parse_known_args()``: the parsed options and the list of
        argv entries the parser did not recognise. Unrecognised entries are
        returned instead of causing an error.
    """
    parser = argparse.ArgumentParser()
    # Core parameters: they default to None and are expected to be set by the caller.
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.",
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        #help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        # The previous text ended in a dangling "selected in the list: , " with no list.
        help="Path to pre-trained model or shortcut name.",
    )
    parser.add_argument(
        "--input_dir",
        default=None,
        type=str,
        required=False,
        help="The input model directory.",
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        help="The output directory where the model predictions and checkpoints will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--labels",
        default="",
        type=str,
        help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.",
    )
    parser.add_argument(
        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )

    # Boolean switches: all of them are False unless passed (or set on the namespace).
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    # NOTE(review): this help text duplicates --do_train; confirm what --do_finetune controls.
    parser.add_argument("--do_finetune", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.")
    parser.add_argument(
        "--evaluate_during_training",
        action="store_true",
        help="Whether to run evaluation during training at each logging step.",
    )
    parser.add_argument(
        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
    )

    # Optimisation hyper-parameters
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
    parser.add_argument(
        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument(
        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
    )
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")

    # Logging, checkpointing and housekeeping
    parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
    parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
    )
    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
    parser.add_argument(
        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
    )
    parser.add_argument(
        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
    )
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")

    # parse_known_args (not parse_args) so that unrecognised argv entries do not abort the call.
    return parser.parse_known_args()

**CHANGES CAN BE MADE HERE:**

Note that the arguments marked with comments may need to be modified. The remaining arguments can be left as they are, since their values are good defaults.

Hence, you should start by only supplying the data directory and output directory.

In [12]:
# !git clone https://github.com/masakhane-io/masakhane-pos.git

In [15]:
data_path = 'masakhane-pos/data/yor'

BERT model training:

In [None]:
# Start from the parser defaults, then apply the overrides for this run.
args, _ = get_args()
run_overrides = {
    "data_dir": data_path,  # to-change: supply data directory
    "output_dir": "yor_model",  # to-change: supply output directory
    "model_type": "bert",
    "model_name_or_path": "Davlan/afro-xlmr-mini",  # alternative: "bert-base-multilingual-cased"
    "max_seq_length": 200,
    "num_train_epochs": 10,
    "per_gpu_train_batch_size": 32,
    "save_steps": 10000,
    "seed": 1,
    # Run all three phases: train, evaluate on dev, predict on test.
    "do_train": True,
    "do_eval": True,
    "do_predict": True,
}
for option, value in run_overrides.items():
    setattr(args, option, value)

In [None]:
# confirm your cuda devices before setting this command
#!export CUDA_VISIBLE_DEVICES=1,2,3
# start_training(args)

In [16]:
file_path = '/content/drive/MyDrive/GitHub/repos/masakhane-pos/data/sub/id_word_n1.txt'
ids = []
Word = []

# Every non-blank line is expected to be "<id> <word>"; blank lines are skipped.
# The encoding is given explicitly so non-ASCII tokens are decoded the same way
# regardless of the machine's locale.
with open(file_path, 'r', encoding='utf-8') as file:
    for line_no, line in enumerate(file, start=1):
        parts = line.split()  # split() with no argument also discards the newline
        if not parts:
            continue
        if len(parts) < 2:
            # Fail with the offending location instead of a bare IndexError.
            raise ValueError(f"{file_path}:{line_no}: expected '<id> <word>', got {line!r}")
        ids.append(parts[0])
        Word.append(parts[1])

XLM-Roberta model training:

In [None]:
# Start from the parser defaults, then override the options for this run.
args, _ = get_args()
args.data_dir = "data/swahili/" # to-change: supply data directory
args.output_dir = "swahili_xlmr" # to-change: supply output directory
args.model_type = "bert"
args.model_name_or_path = "xlm-roberta-base"
args.max_seq_length = 164
args.num_train_epochs = 10
args.per_gpu_train_batch_size = 32
args.save_steps = 10000
args.seed = 1
# The phase switches are store_true options and therefore default to False.
# They have to be assigned: a bare `args.do_train` only reads the attribute
# and leaves every phase disabled.
args.do_train = True
args.do_eval = True
args.do_predict = True

In [None]:
# confirm your cuda devices before setting this command
# NOTE(review): `!export ...` is executed in a separate shell process, so it does not
# change the environment of this kernel and start_training(args) below sees the same
# devices as before. To really restrict the visible devices, set the variable in the
# kernel itself (`%env CUDA_VISIBLE_DEVICES=...` or os.environ), presumably before CUDA
# is first initialised -- and only with device ids that exist on this machine.
!export CUDA_VISIBLE_DEVICES=1,2,3
start_training(args)

In [21]:
# import os

# folder_path = "/content/drive/MyDrive/GitHub/repos/masakhane-pos/data"

# # Use os.listdir() to get a list of all items (files and folders) in the folder
# all_items = os.listdir(folder_path)

# # Use a list comprehension to filter out only the directories (folders)
# folders = [item for item in all_items if os.path.isdir(os.path.join(folder_path, item))]

# # Now 'folders' contains a list of folder names in the specified directory
# # folders= folders[:-2]
# print(folders)

folders = ['mtxelsn']

In [22]:
folders

['mtxelsn']

In [25]:
# One full train / eval / predict run per data folder.
# The loop variable keeps its name `i` because later cells read it.
for i in folders:
  data_dir = f"/content/drive/MyDrive/GitHub/repos/masakhane-pos/data/{i}"
  args, _ = get_args()
  run_overrides = {
      "data_dir": data_dir,
      "output_dir": f"/content/drive/MyDrive/GitHub/repos/masakhane-pos/afro_xlmr_mini/{i}_xlmr",
      "model_type": "bert",
      "model_name_or_path": "Davlan/afro-xlmr-base",
      "max_seq_length": 200,
      # optimisation settings
      "weight_decay": 1e-5,
      "gradient_accumulation_steps": 5,
      "learning_rate": 1e-4,
      "num_train_epochs": 1,
      "per_gpu_train_batch_size": 32,
      "save_steps": 10000,
      "seed": 42,
      # phases to run
      "do_train": True,
      "do_eval": True,
      "do_predict": True,
      "overwrite_output_dir": True,
  }
  for option, value in run_overrides.items():
    setattr(args, option, value)
  start_training(args)

  # #test data loop
  # data_path = '/content/drive/MyDrive/GitHub/repos/masakhane-pos/data/sub'
  # args, _ = get_args()
  # args.data_dir = data_path  # to-change: supply data directory
  # args.output_dir = f"/content/drive/MyDrive/GitHub/repos/masakhane-pos/afro_xlmr_mini/{i}_xlmr" # to-change: supply output directory
  # args.model_type = "bert" #"bert"
  # args.model_name_or_path = f"/content/drive/MyDrive/GitHub/repos/masakhane-pos/afro_xlmr_mini/{i}_xlmr"
  # args.tokenizer_name = f"/content/drive/MyDrive/GitHub/repos/masakhane-pos/afro_xlmr_mini/{i}_xlmr"
  # args.max_seq_length = 200
  # args.seed = 42
  # args.do_train = False
  # args.do_eval = False
  # args.do_predict = True
  # start_training(args)


  # file_path = f"/content/drive/MyDrive/GitHub/repos/masakhane-pos/afro_xlmr_mini/{i}_xlmr/test_predictions.txt"
  # words= []
  # pos= []

  # # Open the file for reading
  # with open(file_path, 'r') as file:
  #     for line in file:
  #         line = line.strip()  # Remove leading/trailing whitespace and newline characters

  #         # Check if the line is not empty
  #         if line:
  #             # Split the line based on a delimiter (e.g., space)
  #             parts = line.split()
  #             words.append(parts[0])
  #             pos.append(parts[1])


  #         else:
  #             pass  # Skip empty lines


  # for j in range(len(words)):
  #   if Word[j] != words[j]:
  #     print(j, Word[j], words[j])

  # df_dict= {
  #     'Id': ids,
  #     'Word' : Word,
  #     'Pos' : pos,
  #     'Word_o' : words
  # }

  # merged_df= pd.DataFrame(df_dict)
  # merged_df[['Id', 'Pos']].to_csv(f'/content/drive/MyDrive/GitHub/repos/masakhane-pos/cleaned_sub_files/{i}_003.csv', index=False)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch:   0%|          | 0/1 [00:00<?, ?it/s]
Iteration:   0%|          | 0/164 [00:00<?, ?it/s][A
Iteration:   1%|          | 1/164 [00:00<02:35,  1.05it/s][A
Iteration:   1%|          | 2/164 [00:01<02:32,  1.06it/s][A
Iteration:   2%|▏         | 3/164 [00:02<02:29,  1.07it/s][A
Iteration:   2%|▏         | 4/164 [00:03<02:28,  1.08it/s][A
Iteration:   3%|▎         | 5/164 [00:04<02:33,  1.03it/s][A
Iteration:   4%|▎         | 6/164 [00:05<02:30,  1.05it/s][A
Iteration:   4%|▍         | 7/164 [00:06<02:28,  1.06it/s][A
Iteration:   5%|▍         | 8/164 [00:07<02:26,  1.06it/s][A
Iteration:   5%|▌         | 9/164 [00:08<02:25,  1.07it/s][A
Iteration:   6%|▌         | 10/164 [00:09<0

In [None]:
# Re-tag every row whose token is the literal '""""' as 'X'.
# The boolean mask limits the assignment to the matching rows; assigning to
# merged_df['Pos'] without a row selector replaces the whole column.
quote_mask = merged_df['Word'] == '""""'
merged_df.loc[quote_mask, 'Pos'] = 'X'

# Number of rows that were re-tagged (shown as the cell output).
n = int(quote_mask.sum())
n

In [None]:
# Prediction-only run on the submission data, using the model saved for folder `i`.
# `i` is whatever value an earlier cell left behind.
data_path = '/content/drive/MyDrive/GitHub/repos/masakhane-pos/data/sub'
model_dir = f"/content/drive/MyDrive/GitHub/repos/masakhane-pos/afro_xlmr_mini/{i}_xlmr"

args, _ = get_args()
predict_overrides = {
    "data_dir": data_path,  # to-change: supply data directory
    "output_dir": model_dir,  # to-change: supply output directory
    "model_type": "bert",
    # model and tokenizer are both loaded from the fine-tuned output directory
    "model_name_or_path": model_dir,
    "tokenizer_name": model_dir,
    "max_seq_length": 200,
    "seed": 42,
    "do_train": False,
    "do_eval": False,
    "do_predict": True,
}
for option, value in predict_overrides.items():
    setattr(args, option, value)
start_training(args)

239

In [None]:
merged_df[['Id', 'Pos']].to_csv(f'/content/drive/MyDrive/GitHub/repos/masakhane-pos/cleaned_sub_files/try_001.csv', index=False)

In [None]:
# Trial 2: afro-xlmr-small with a higher learning rate and stronger weight decay.
# The loop variable keeps its name `i` because later cells read it.
for i in folders:
  data_dir = f"/content/drive/MyDrive/GitHub/repos/masakhane-pos/data/{i}"
  args, _ = get_args()
  run_overrides = {
      "data_dir": data_dir,
      "output_dir": f"/content/drive/MyDrive/GitHub/repos/masakhane-pos/afro_xlmr_mini/{i}_xlmr",
      "model_type": "bert",
      "model_name_or_path": "Davlan/afro-xlmr-small",
      # optimisation settings
      "learning_rate": 5e-4,
      "weight_decay": 1e-2,
      "max_grad_norm": 1,
      "gradient_accumulation_steps": 1,
      "max_seq_length": 200,
      "num_train_epochs": 4,
      "per_gpu_train_batch_size": 32,
      "save_steps": 10000,
      "seed": 1,
      # phases to run
      "do_train": True,
      "do_eval": True,
      "do_predict": True,
      "overwrite_output_dir": True,
  }
  for option, value in run_overrides.items():
    setattr(args, option, value)
  start_training(args)

In [None]:
# For every trained folder: predict on the submission data, then pair the predicted
# tags with the ids read earlier (`ids`, `Word`) and export them as CSV.
for i in folders:
  data_path = '/content/drive/MyDrive/GitHub/repos/masakhane-pos/data/sub'
  model_dir = f"/content/drive/MyDrive/GitHub/repos/masakhane-pos/afro_xlmr_mini/{i}_xlmr"

  # --- prediction-only run with the fine-tuned model ---
  args, _ = get_args()
  predict_overrides = {
      "data_dir": data_path,  # to-change: supply data directory
      "output_dir": model_dir,  # to-change: supply output directory
      "model_type": "bert",
      "model_name_or_path": model_dir,
      "tokenizer_name": model_dir,
      "max_seq_length": 200,
      "seed": 1,
      "do_train": False,
      "do_eval": False,
      "do_predict": True,
  }
  for option, value in predict_overrides.items():
    setattr(args, option, value)
  start_training(args)

  # --- read back "<word> <tag>" lines, ignoring blank separator lines ---
  file_path = f"/content/drive/MyDrive/GitHub/repos/masakhane-pos/afro_xlmr_mini/{i}_xlmr/test_predictions.txt"
  with open(file_path, 'r') as file:
      rows = [row.split() for row in file if row.strip()]
  words = [row[0] for row in rows]
  pos = [row[1] for row in rows]

  # --- report positions where the predicted file's token differs from the reference ---
  for j, predicted_word in enumerate(words):
    if Word[j] != predicted_word:
      print(j, Word[j], predicted_word)

  # --- assemble the submission frame and write the Id/Pos columns ---
  df_dict = {'Id': ids, 'Word': Word, 'Pos': pos, 'Word_o': words}
  merged_df = pd.DataFrame(df_dict)
  submission = merged_df[['Id', 'Pos']]
  submission.to_csv(f'/content/drive/MyDrive/GitHub/repos/masakhane-pos/cleaned_sub_files/{i}_001.csv', index=False)

In [None]:
# Trial with afro-xlmr-mini: train each folder, predict on the submission data with the
# resulting model, then export the Id/Pos pairs.
# The loop variable keeps its name `i` because other cells read it.
for i in folders:
  model_dir = f"/content/drive/MyDrive/GitHub/repos/masakhane-pos/afro_xlmr_mini/{i}_xlmr"

  # --- training run ---
  data_dir = f"/content/drive/MyDrive/GitHub/repos/masakhane-pos/data/{i}"
  args, _ = get_args()
  train_overrides = {
      "data_dir": data_dir,
      "output_dir": model_dir,
      "model_type": "bert",
      "model_name_or_path": "Davlan/afro-xlmr-mini",
      "learning_rate": 5e-5,
      "weight_decay": 1e-3,
      "max_grad_norm": 0.5,
      "gradient_accumulation_steps": 2,
      "max_seq_length": 200,
      "num_train_epochs": 7,
      "per_gpu_train_batch_size": 32,
      "save_steps": 10000,
      "seed": 1,
      "do_train": True,
      "do_eval": True,
      "do_predict": True,
      "overwrite_output_dir": True,
  }
  for option, value in train_overrides.items():
    setattr(args, option, value)
  start_training(args)

  # --- prediction-only run on the submission data ---
  data_path = '/content/drive/MyDrive/GitHub/repos/masakhane-pos/data/sub'
  args, _ = get_args()
  predict_overrides = {
      "data_dir": data_path,  # to-change: supply data directory
      "output_dir": model_dir,  # to-change: supply output directory
      "model_type": "bert",
      "model_name_or_path": model_dir,
      "tokenizer_name": model_dir,
      "max_seq_length": 200,
      "seed": 1,
      "do_train": False,
      "do_eval": False,
      "do_predict": True,
  }
  for option, value in predict_overrides.items():
    setattr(args, option, value)
  start_training(args)

  # --- read back "<word> <tag>" lines, ignoring blank separator lines ---
  file_path = f"/content/drive/MyDrive/GitHub/repos/masakhane-pos/afro_xlmr_mini/{i}_xlmr/test_predictions.txt"
  with open(file_path, 'r') as file:
      rows = [row.split() for row in file if row.strip()]
  words = [row[0] for row in rows]
  pos = [row[1] for row in rows]

  # --- report positions where the predicted file's token differs from the reference ---
  for j, predicted_word in enumerate(words):
    if Word[j] != predicted_word:
      print(j, Word[j], predicted_word)

  # --- assemble the submission frame and write the Id/Pos columns ---
  df_dict = {'Id': ids, 'Word': Word, 'Pos': pos, 'Word_o': words}
  merged_df = pd.DataFrame(df_dict)
  submission = merged_df[['Id', 'Pos']]
  submission.to_csv(f'/content/drive/MyDrive/GitHub/repos/masakhane-pos/cleaned_sub_files/{i}_002.csv', index=False)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-mini and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch:   0%|          | 0/7 [00:00<?, ?it/s]
Iteration:   0%|          | 0/69 [00:00<?, ?it/s][A

Iteration:   3%|▎         | 2/69 [00:00<00:26,  2.51it/s][A
Iteration:   4%|▍         | 3/69 [00:01<00:24,  2.70it/s][A
Iteration:   6%|▌         | 4/69 [00:01<00:24,  2.64it/s][A
Iteration:   7%|▋         | 5/69 [00:01<00:23,  2.73it/s][A
Iteration:   9%|▊         | 6/69 [00:02<00:23,  2.67it/s][A
Iteration:  10%|█         | 7/69 [00:02<00:22,  2.77it/s][A
Iteration:  12%|█▏        | 8/69 [00:02<00:22,  2.70it/s][A
Iteration:  13%|█▎        | 9/69 [00:03<00:21,  2.78it/s][A
Iteration:  14%|█▍        | 10/69 [00:03<00:21,  2.70it/s][A
Iteration:  16%|█▌        | 11/69 [00:04<00:20,  2.

In [None]:
# # Create the submission file by recreating the 'Id' and 'Word' in the same manner as the test

# import pandas as pd

# # Sample DataFrame (replace this with your actual DataFrame)
# path = '/content/drive/MyDrive/GitHub/repos/masakhane-pos/Test.csv'
# df = pd.read_csv(path)
# df['Pos'] = 'NOUN'

# # Output file path
# output_file = '/content/drive/MyDrive/GitHub/repos/masakhane-pos/data/sub/id_word_n1.txt'

# # Open the file for writing
# with open(output_file, 'w', encoding='utf-8') as file:
#     current_group = None  # To keep track of the current group
#     for index, row in df.iterrows():
#         group = row['Id'].split('_')[0]  # Extract the group name
#         word = row['Word']
#         id = row['Id']

#         if current_group is not None and group != current_group:
#             # Add an empty line between different groups
#             file.write('\n')

#         # Write the current row's data to the file
#         file.write(f"{id} {word}\n")

#         # Update the current_group
#         current_group = group

# print(f"Data has been written to {output_file}")


file_path = '/content/drive/MyDrive/GitHub/repos/masakhane-pos/data/sub/id_word_n1.txt'
ids = []
Word = []

# Every non-blank line is expected to be "<id> <word>"; blank lines are skipped.
# The file is read as UTF-8 explicitly so decoding does not depend on the locale.
with open(file_path, 'r', encoding='utf-8') as file:
    for line_no, line in enumerate(file, start=1):
        parts = line.split()  # split() with no argument also discards the newline
        if not parts:
            continue
        if len(parts) < 2:
            # Fail with the offending location instead of a bare IndexError.
            raise ValueError(f"{file_path}:{line_no}: expected '<id> <word>', got {line!r}")
        ids.append(parts[0])
        Word.append(parts[1])

In [None]:
len(ids), len(Word)

(32045, 32045)

In [None]:
# for i in folders[6:]:
# --- training run on the sna data ---
data_dir= "/content/drive/MyDrive/GitHub/repos/masakhane-pos/data/sna"
args, _ = get_args()
args.data_dir = data_dir
args.output_dir = "/content/drive/MyDrive/masakhane-pos/sna" # to-change: supply output directory
args.model_type = "bert"
args.model_name_or_path = "Davlan/afro-xlmr-mini"
args.max_seq_length = 164
args.num_train_epochs = 10
args.per_gpu_train_batch_size = 32
args.save_steps = 10000
args.seed = 1
args.do_train = True
args.do_eval = True
args.do_predict = True
args.overwrite_output_dir = True
start_training(args)

# --- prediction-only run on the submission data with the model trained above ---
# NOTE(review): this data path is under MyDrive/masakhane-pos while the training data is
# under MyDrive/GitHub/repos/masakhane-pos -- confirm both locations exist.
data_path = '/content/drive/MyDrive/masakhane-pos/data/sub'
args, _ = get_args()
args.data_dir = data_path  # to-change: supply data directory
args.output_dir = "/content/drive/MyDrive/masakhane-pos/sna" # to-change: supply output directory
args.model_type = "bert" #"bert"
args.model_name_or_path = '/content/drive/MyDrive/masakhane-pos/sna'
args.tokenizer_name = '/content/drive/MyDrive/masakhane-pos/sna'
args.max_seq_length = 200
args.seed = 1
args.do_train = False
args.do_eval = False
args.do_predict = True
start_training(args)

# The prediction step writes test_predictions.txt into args.output_dir, so read it from
# there. The previous path was built from a loop variable `i` that this cell never sets
# and pointed at a different directory (xlm_r/<i>_xlmr).
file_path = f"{args.output_dir}/test_predictions.txt"
words= []
pos= []

# Each non-blank line is "<word> <tag>"; blank lines are skipped.
with open(file_path, 'r') as file:
    for line in file:
        parts = line.split()
        if parts:
            words.append(parts[0])
            pos.append(parts[1])


# for j in range(len(words)):
#   if Word[j] != words[j]:
#     print(j, Word[j], words[j])

# `ids` and `Word` come from the id/word file read in an earlier cell; all four lists
# must have the same length for the DataFrame to be built.
df_dict= {
    'Id': ids,
    'Word' : Word,
    'Pos' : pos,
    'Word_o' : words
}

merged_df= pd.DataFrame(df_dict)
merged_df[['Id', 'Pos']].to_csv('cleaned_sub_files/sna_001.csv', index=False)

In [None]:
# Training files to merge, appended in the order listed.
file_names = [
    "/content/drive/MyDrive/GitHub/repos/masakhane-pos/data/sna/train.txt",
    "/content/drive/MyDrive/GitHub/repos/masakhane-pos/data/nya/train.txt",
    "/content/drive/MyDrive/GitHub/repos/masakhane-pos/data/lug/train.txt"
]

# Destination of the merged data (relative to the current working directory).
output_file = "concatenated.txt"

with open(output_file, "w") as merged:
    for file_name in file_names:
        try:
            with open(file_name, "r") as part:
                # Copy the whole file, followed by two newlines as a separator.
                merged.write(part.read())
                merged.write("\n\n")
        except FileNotFoundError:
            # A missing input is reported and skipped; the merge continues.
            print(f"File not found: {file_name}")

print(f"Concatenated files into {output_file}")


In [33]:
import os

# Folder that will hold the merged data of the selected languages.
folder_name = "mtxelsn"

# Location of the new folder inside the repository's data directory.
folder_path = os.path.join("/content/drive/MyDrive/GitHub/repos/masakhane-pos/data/", folder_name)

# exist_ok=True makes the cell safe to re-run: without it, os.makedirs raises
# FileExistsError as soon as the folder is already there.
already_present = os.path.isdir(folder_path)
os.makedirs(folder_path, exist_ok=True)

# Report the outcome.
if already_present:
    print(f"Folder '{folder_name}' already exists at '{folder_path}'")
elif os.path.isdir(folder_path):
    print(f"Folder '{folder_name}' was created successfully at '{folder_path}'")
else:
    print(f"Failed to create folder '{folder_name}'")

Folder 'mtxelsn' was created successfully at '/content/drive/MyDrive/GitHub/repos/masakhane-pos/data/mtxelsn'


In [34]:
# Per-language training files, merged in this order into the combined folder.
data_root = "/content/drive/MyDrive/GitHub/repos/masakhane-pos/data"
file_names = [
    f"{data_root}/{lang}/train.txt"
    for lang in ("mos", "nya", "twi", "sna", "xho", "lug", "ewe")
]

# Destination: train.txt inside the folder named by `folder_name`.
output_file = f"{data_root}/{folder_name}/train.txt"

with open(output_file, "w") as merged:
    for file_name in file_names:
        try:
            with open(file_name, "r") as part:
                # Copy the whole file, followed by two newlines as a separator.
                merged.write(part.read())
                merged.write("\n\n")
        except FileNotFoundError:
            # A missing input is reported and skipped; the merge continues.
            print(f"File not found: {file_name}")

print(f"Concatenated files into {output_file}")


Concatenated files into /content/drive/MyDrive/GitHub/repos/masakhane-pos/data/mtxelsn/train.txt


In [35]:
# Per-language test files, merged in this order into the combined folder.
data_root = "/content/drive/MyDrive/GitHub/repos/masakhane-pos/data"
file_names = [
    f"{data_root}/{lang}/test.txt"
    for lang in ("mos", "nya", "twi", "sna", "xho", "lug", "ewe")
]

# Destination: test.txt inside the folder named by `folder_name`.
output_file = f"{data_root}/{folder_name}/test.txt"

with open(output_file, "w") as merged:
    for file_name in file_names:
        try:
            with open(file_name, "r") as part:
                # Copy the whole file, followed by two newlines as a separator.
                merged.write(part.read())
                merged.write("\n\n")
        except FileNotFoundError:
            # A missing input is reported and skipped; the merge continues.
            print(f"File not found: {file_name}")

print(f"Concatenated files into {output_file}")


Concatenated files into /content/drive/MyDrive/GitHub/repos/masakhane-pos/data/mtxelsn/test.txt


In [36]:
# Per-language dev files, merged in this order into the combined folder.
data_root = "/content/drive/MyDrive/GitHub/repos/masakhane-pos/data"
file_names = [
    f"{data_root}/{lang}/dev.txt"
    for lang in ("mos", "nya", "twi", "sna", "xho", "lug", "ewe")
]

# Destination: dev.txt inside the folder named by `folder_name`.
output_file = f"{data_root}/{folder_name}/dev.txt"

with open(output_file, "w") as merged:
    for file_name in file_names:
        try:
            with open(file_name, "r") as part:
                # Copy the whole file, followed by two newlines as a separator.
                merged.write(part.read())
                merged.write("\n\n")
        except FileNotFoundError:
            # A missing input is reported and skipped; the merge continues.
            print(f"File not found: {file_name}")

print(f"Concatenated files into {output_file}")


Concatenated files into /content/drive/MyDrive/GitHub/repos/masakhane-pos/data/mtxelsn/dev.txt


In [None]:
%cd /content/drive/MyDrive/GitHub/repos/masakhane-pos

/content/drive/MyDrive/GitHub/repos/masakhane-pos


In [None]:
!git add .

In [None]:
!git reset

Unstaged changes after reset:
M	preprocess.py
M	script/afroxlmr_ud_transfer.sh
M	script/all_afriberta.sh
M	script/all_afrolm.sh
M	script/all_afroxlmr.sh
M	script/all_afroxlmrbase.sh
M	script/all_afroxlmrbase_transfer.sh
M	script/all_mbert.sh
M	script/all_rembert.sh
M	script/all_xlmr_large.sh
M	split_sentence.sh
M	train_pos.ipynb


In [26]:
!cat .gitignore


afro_xlmr_mini

In [None]:
!touch .gitignore

In [None]:
!nano .gitignore

/bin/bash: line 1: nano: command not found


In [None]:
def merge_ignore_entries(content, entries):
    """Return `content` with every pattern in `entries` present on its own line.

    A pattern counts as present only if some line of `content` equals it after
    stripping whitespace, so a pattern that is merely a substring of another line
    is still appended. No blank line is inserted before an appended pattern, and a
    non-empty result always ends with a newline.
    """
    lines = content.splitlines()
    present = {line.strip() for line in lines}
    for entry in entries:
        if entry not in present:
            lines.append(entry)
            present.add(entry)
    return "".join(line + "\n" for line in lines)


# Define the files to be added to .gitignore
files_to_ignore = ["afro_xlmr_mini"]

# Read the current contents of .gitignore; a missing file is treated as empty.
try:
    with open('.gitignore', 'r') as f:
        content = f.read()
except FileNotFoundError:
    content = ""

# Write the updated contents back to .gitignore
content = merge_ignore_entries(content, files_to_ignore)
with open('.gitignore', 'w') as f:
    f.write(content)

In [27]:
!git add .

In [None]:
!git config --global user.email "oludare_adekunle@outlook.com"
!git config --global user.name "DareAdekunle"

In [None]:
! git commit -m "updated notebook 1"

[main 312cb8f] updated notebook 1
 1 file changed, 1 insertion(+), 1 deletion(-)
 rewrite train_pos.ipynb (95%)


In [None]:
!git push origin main

fatal: could not read Username for 'https://github.com': No such device or address


In [None]:
!git remote -v


origin	https://github.com/DareAdekunle/masakhane-pos.git (fetch)
origin	https://github.com/DareAdekunle/masakhane-pos.git (push)


In [None]:
!git push origin main

fatal: could not read Username for 'https://github.com': No such device or address


In [None]:
!git push

In [None]:
!git push "https://DareAdekunle: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@github.com/DareAdekunle/masakhane-pos.git" main

Enumerating objects: 5, done.
Counting objects:  20% (1/5)Counting objects:  40% (2/5)Counting objects:  60% (3/5)Counting objects:  80% (4/5)Counting objects: 100% (5/5)Counting objects: 100% (5/5), done.
Delta compression using up to 2 threads
Compressing objects:  33% (1/3)Compressing objects:  66% (2/3)Compressing objects: 100% (3/3)Compressing objects: 100% (3/3), done.
Writing objects:  33% (1/3)Writing objects:  66% (2/3)Writing objects: 100% (3/3)Writing objects: 100% (3/3), 2.09 KiB | 194.00 KiB/s, done.
Total 3 (delta 2), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
To https://github.com/DareAdekunle/masakhane-pos.git
   e37a296..312cb8f  main -> main
