In [None]:
!nvidia-smi

In [None]:
!pip uninstall -y torchtext --quiet
!pip install -qq torchtext==0.6.0
!pip install -qq torch==1.11.0 pytorch-lightning==1.2.2 --quiet
!pip install -U transformers[torch] --quiet
!pip install -qq datasets --quiet
!pip install -qq accelerate -U --quiet

In [None]:
# Standard library
import gc
import io
import math
import os
import pickle
import random
import re
import shutil
import string
import sys
import warnings
from dataclasses import dataclass, field
from datetime import datetime
from functools import partial
from pathlib import Path
from typing import List, Optional

# Third-party
import numpy as np
import pandas as pd
import termcolor
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from scipy.special import softmax
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, Sampler
from tqdm.auto import tqdm
from tqdm import tqdm  # kept last: rebinds `tqdm` to the plain progress bar, as before

import warnings
warnings.filterwarnings('ignore')

In [None]:
import datasets
from datasets import DatasetDict, Dataset

import transformers
from transformers import (
    AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup,
    AutoModelForMaskedLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling,
    EarlyStoppingCallback, AutoModelForTokenClassification,
)

In [None]:
!git clone https://github.com/NtemKenyor/masakhane-pos

# Config

In [None]:
# Global notebook settings.
DEBUG = True
SEED = 42
NOW = datetime.now().strftime("%m%d")  # month+day stamp of this run

# Output directory; parents=True so the cell does not fail when the parent
# directory (/content) does not exist yet.
SAVE_PATH = Path('/content/lightning-logs/')
SAVE_PATH.mkdir(parents=True, exist_ok=True)

NUM_WORKERS = os.cpu_count()
GPUS = 1 if torch.cuda.is_available() else None  # None when no CUDA device is visible

print("DEBUG:\t", DEBUG)
print("SAVE_PATH:\t", SAVE_PATH)
print("NUM_WORKERS:\t", NUM_WORKERS)
print("GPUS:\t", GPUS)

DEBUG:	 True
SAVE_PATH:	 None
NUM_WORKERS:	 56
GPUS:	 1


# **Possible language combinations for model training:**

In [None]:
from itertools import combinations


def _nonempty_subsets(items):
    """Return every non-empty combination of `items`, smallest sizes first."""
    return [
        subset
        for size in range(1, len(items) + 1)
        for subset in combinations(items, size)
    ]


tsn_langs = ['sna', 'xho', 'zul']
tsn_valid = _nonempty_subsets(tsn_langs)

luo_langs = ['lug', 'kin']
luo_valid = _nonempty_subsets(luo_langs)

# Every candidate validation set contains the two fixed languages, followed by
# a subset from the first group, a subset from the second group, or one of each.
extra_valid = ['wol', 'pcm']
tsn_luo_valid = (
    [[*extra_valid, *tsn_subset] for tsn_subset in tsn_valid]
    + [[*extra_valid, *luo_subset] for luo_subset in luo_valid]
    + [
        [*extra_valid, *tsn_subset, *luo_subset]
        for tsn_subset in tsn_valid
        for luo_subset in luo_valid
    ]
)
# sorted() is stable, so candidates of equal size keep their construction order.
tsn_luo_valid = sorted(tsn_luo_valid, key=len)
tsn_luo_valid

[['wol', 'pcm', 'sna'],
 ['wol', 'pcm', 'xho'],
 ['wol', 'pcm', 'bam'],
 ['wol', 'pcm', 'lug'],
 ['wol', 'pcm', 'swa'],
 ['wol', 'pcm', 'sna', 'xho'],
 ['wol', 'pcm', 'sna', 'bam'],
 ['wol', 'pcm', 'xho', 'bam'],
 ['wol', 'pcm', 'lug', 'swa'],
 ['wol', 'pcm', 'sna', 'lug'],
 ['wol', 'pcm', 'sna', 'swa'],
 ['wol', 'pcm', 'xho', 'lug'],
 ['wol', 'pcm', 'xho', 'swa'],
 ['wol', 'pcm', 'bam', 'lug'],
 ['wol', 'pcm', 'bam', 'swa'],
 ['wol', 'pcm', 'sna', 'xho', 'bam'],
 ['wol', 'pcm', 'sna', 'lug', 'swa'],
 ['wol', 'pcm', 'xho', 'lug', 'swa'],
 ['wol', 'pcm', 'bam', 'lug', 'swa'],
 ['wol', 'pcm', 'sna', 'xho', 'lug'],
 ['wol', 'pcm', 'sna', 'xho', 'swa'],
 ['wol', 'pcm', 'sna', 'bam', 'lug'],
 ['wol', 'pcm', 'sna', 'bam', 'swa'],
 ['wol', 'pcm', 'xho', 'bam', 'lug'],
 ['wol', 'pcm', 'xho', 'bam', 'swa'],
 ['wol', 'pcm', 'sna', 'xho', 'lug', 'swa'],
 ['wol', 'pcm', 'sna', 'bam', 'lug', 'swa'],
 ['wol', 'pcm', 'xho', 'bam', 'lug', 'swa'],
 ['wol', 'pcm', 'sna', 'xho', 'bam', 'lug'],
 ['wol', '

# **Model choice:**
- Davlan/afro-xlmr-large-75L
- Davlan/afro-xlmr-large-61L
- Davlan/afro-xlmr-large-29L
- Davlan/afro-xlmr-large
- Davlan/afro-xlmr-small
- Davlan/xlm-roberta-large-masakhaner

In [None]:
@dataclass
class Config:
    """Experiment settings for the POS-tagging fine-tuning run.

    Every attribute is an annotated dataclass field, so all of them appear in
    ``repr(cfg)`` / ``cfg.__dict__`` and can be overridden through the
    constructor. The three fields at the end were previously plain class
    attributes; they are appended last so positional construction with the
    original field order keeps working.
    """

    model_name: str = 'Davlan/afro-xlmr-large-75L'
    model_type: str = "xlmroberta"
    max_seq_len: int = 200
    test_path: str = "/content/masakhane-pos/data/Test.csv"
    train_path: str = "/content/masakhane-pos/data/africa_lan.csv"
    data_dir: str = "/content/masakhane-pos/data"
    lr: float = 1e-6
    batch_size: int = 16
    num_epoch: int = 30
    gradient_accumulation_steps: int = 1
    warmup_steps: int = 100
    # Filled in later in the notebook once the label list is known.
    num_classes: Optional[int] = None
    label_mappings: Optional[dict] = None
    # default_factory gives every instance its own list instead of one list
    # shared by all instances.
    valid_languages: List[str] = field(default_factory=lambda: ['wol', 'sna', 'pcm'])

cfg = Config()
cfg

Config(model_name='Davlan/afro-xlmr-large-75L', model_type='xlmroberta', max_seq_len=200, test_path='Test.csv', data_dir='data', lr=1e-06, batch_size=16, num_epoch=30, gradient_accumulation_steps=1, warmup_steps=100)

# Setup tokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)

# **Seeding:**

In [None]:
def set_seed(seed=42):
    """Seed every random number generator used by the notebook.

    Args:
        seed: Integer seed applied to `random`, NumPy, PyTorch (CPU and all
            CUDA devices) and `transformers`.
    """
    random.seed(seed)
    np.random.seed(seed)
    # The variable is PYTHONHASHSEED (the previous spelling was a typo and had
    # no effect). It is read at interpreter start-up, so setting it here only
    # influences subprocesses started afterwards, not the running kernel.
    os.environ["PYTHONHASHSEED"] = str(seed)

    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Deterministic cuDNN kernels; benchmark mode is disabled because its
    # autotuner may pick different algorithms from run to run.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    transformers.set_seed(seed)

set_seed(SEED)

# Load Data

In [None]:
df_test = pd.read_csv(Config.test_path)
df_test.head(2)

Unnamed: 0,Id,Word,Language,Pos
0,Id00qog2f11n_0,Ne,luo,
1,Id00qog2f11n_1,otim,luo,


In [None]:
# The test split has no tags, so every token gets the placeholder tag 'X'.
# The part of each Id before the first '_' is used as the sentence key.
df_test = df_test.assign(
    Pos='X',
    sentence_Id=df_test['Id'].apply(lambda token_id: token_id.split('_')[0]),
)
# Collapse to one row per sentence; every other column becomes a list.
df_test = df_test.groupby('sentence_Id').agg(list).reset_index()
df_test.head(2)

Unnamed: 0,sentence_Id,Id,Word,Language,Pos
0,Id00qog2f11n,"[Id00qog2f11n_0, Id00qog2f11n_1, Id00qog2f11n_...","[Ne, otim, penj, e, kind, Februar, tarik, 9, g...","[luo, luo, luo, luo, luo, luo, luo, luo, luo, ...","[X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, ..."
1,Id01lywjj7oz,"[Id01lywjj7oz_0, Id01lywjj7oz_1, Id01lywjj7oz_...","[Sifuna, ne, ojiwo, jonyuol, kod, joma, moko, ...","[luo, luo, luo, luo, luo, luo, luo, luo, luo, ...","[X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, ..."


In [None]:
def read_examples_from_file(file_path):
    """Read a whitespace-delimited token/tag file into parallel sentence lists.

    Each line with at least two characters (after stripping) is one token: the
    text before the first space is the word and the text after the last space
    is its tag. A token line without a space gets the tag "O". Any shorter
    line (blank or a single character) ends the current sentence.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        A tuple ``(example_words, example_labels)`` of two lists of equal
        length, holding one list of words and one list of tags per sentence.
    """
    all_words, all_labels = [], []
    sent_words, sent_labels = [], []

    with open(file_path, encoding="utf-8") as handle:
        for raw_line in handle:
            entry = raw_line.strip()

            if len(entry) >= 2:
                fields = entry.split(" ")
                sent_words.append(fields[0])
                # Examples could have no label for mode = "test"
                sent_labels.append(fields[-1] if len(fields) > 1 else "O")
                continue

            # Sentence boundary: store the sentence collected so far, if any.
            if sent_words:
                all_words.append(sent_words)
                all_labels.append(sent_labels)
                sent_words, sent_labels = [], []

    # The file may end without a trailing separator line.
    if sent_words:
        all_words.append(sent_words)
        all_labels.append(sent_labels)

    return all_words, all_labels

In [None]:
import glob

In [None]:
cfg.valid_languages

['wol', 'sna', 'pcm']

In [None]:
train_words = []
train_labels = []

valid_words = []
valid_labels = []

# glob returns paths in arbitrary, filesystem-dependent order; sorting keeps
# the example order (and therefore the seeded shuffling) reproducible.
# Without recursive=True, "**" behaves like "*", so this matches .txt files
# exactly one directory below cfg.data_dir.
for file_path in sorted(glob.glob(f"{cfg.data_dir}/**/*.txt")):
    words_list, labels_list = read_examples_from_file(file_path)

    # The name of the containing directory is treated as the language code.
    language = file_path.split(os.sep)[-2]
    if language in cfg.valid_languages:
        valid_words.extend(words_list)
        valid_labels.extend(labels_list)
    else:
        train_words.extend(words_list)
        train_labels.extend(labels_list)

In [None]:
len(train_labels), len(train_words), len(valid_labels), len(valid_words)

(22916, 22916, 4557, 4557)

In [None]:
df_train = pd.DataFrame()
df_train['Word'] = train_words
df_train['Pos'] = train_labels
df_train.head(2)

Unnamed: 0,Word,Pos
0,"[Ni, yo, ihangayikishije, kurusha, izindi, zos...","[VERB, PRON, VERB, VERB, PRON, PRON, VERB, CCO..."
1,"[Alpha, yo, imaze, kugera, mu, bihugu, bisaga,...","[PROPN, PRON, VERB, VERB, ADP, NOUN, VERB, NUM..."


In [None]:
df_valid = pd.DataFrame()
df_valid['Word'] = valid_words
df_valid['Pos'] = valid_labels
df_valid.head(2)

Unnamed: 0,Word,Pos
0,"[Nekuda, kwe, mabasa, avakabata, munguva, ye, ...","[SCONJ, ADP, NOUN, VERB, NOUN, ADP, NOUN, CCON..."
1,"[Nhoroondo, ye, kuimba, Chinx, akaimba, ne, va...","[NOUN, ADP, VERB, PROPN, VERB, ADP, NOUN, ADJ,..."


In [None]:
print(df_train.shape, df_valid.shape, df_test.shape)

labels = ["X", "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN", "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB"]
cfg.num_classes = len(labels)

(22916, 2) (4557, 2) (1208, 5)


# Preprocess Features

In [None]:
def convert_to_feature(
    row,
    tokenizer=tokenizer,
    label_list=labels,
    max_seq_length=cfg.max_seq_len,
    cls_token_at_end=False,
    cls_token=None,
    cls_token_segment_id=1,
    sep_token=None,
    sep_token_extra=False,
    pad_on_left=False,
    pad_token=None,
    pad_token_segment_id=0,
    pad_token_label_id=-100,
    sequence_a_segment_id=0,
    mask_padding_with_zero=True,
):
    """Convert one sentence (a row with 'Word' and 'Pos' lists) into fixed-length model features.

    Each word is split into sub-word tokens. Only the first sub-word token of
    a word carries the word's label id; remaining sub-word tokens, special
    tokens and padding carry `pad_token_label_id`.

    Args:
        row: Mapping with 'Word' (list of words) and 'Pos' (list of labels).
        tokenizer: Tokenizer providing `tokenize` and `convert_tokens_to_ids`.
        label_list: Labels; a label's index in this list is its id.
        max_seq_length: Length of every returned sequence.
        cls_token_at_end: Put the CLS token after the sequence instead of before it.
        cls_token: CLS token string. None (default) uses `tokenizer.cls_token`,
            falling back to "[CLS]" when the tokenizer defines none.
        cls_token_segment_id: Segment id assigned to the CLS token.
        sep_token: SEP token string. None (default) uses `tokenizer.sep_token`,
            falling back to "[SEP]".
        sep_token_extra: Append a second SEP token.
        pad_on_left: Pad before the sequence instead of after it.
        pad_token: Padding id for `input_ids`. None (default) uses
            `tokenizer.pad_token_id`, falling back to 0.
        pad_token_segment_id: Segment id used for padding positions.
        pad_token_label_id: Label id marking positions without a label.
        sequence_a_segment_id: Segment id of the sentence tokens.
        mask_padding_with_zero: If True `input_mask` is 1 for real tokens and
            0 for padding; if False the values are inverted.

    Returns:
        dict with `input_ids`, `input_mask`, `attention_mask`, `segment_ids`
        and `label_ids`, each a list of length `max_seq_length`.
        `attention_mask` is always 1 for real tokens and 0 for padding.

    Raises:
        KeyError: If a label in row['Pos'] is not in `label_list`.

    Note:
        Sentences longer than the token budget are truncated, so words past
        the cut-off get no label position. Callers that need one prediction
        per word must check for this.
    """
    # Take special tokens from the tokenizer rather than assuming BERT-style
    # strings, which other vocabularies map to the unknown-token id.
    if cls_token is None:
        cls_token = getattr(tokenizer, "cls_token", None) or "[CLS]"
    if sep_token is None:
        sep_token = getattr(tokenizer, "sep_token", None) or "[SEP]"
    if pad_token is None:
        tokenizer_pad_id = getattr(tokenizer, "pad_token_id", None)
        pad_token = 0 if tokenizer_pad_id is None else tokenizer_pad_id
    unk_token = getattr(tokenizer, "unk_token", None)

    label_map = {label: i for i, label in enumerate(label_list)}

    tokens = []
    label_ids = []
    for word, label in zip(row['Word'], row['Pos']):
        word_tokens = tokenizer.tokenize(word)
        if not word_tokens:
            # A word that tokenizes to nothing would add a label without a
            # token and shift every later label. Represent it by the unknown
            # token so it keeps one labelled position; with no unknown token
            # available the word is dropped, which also keeps alignment.
            if unk_token is None:
                continue
            word_tokens = [unk_token]
        tokens.extend(word_tokens)
        # Use the real label id for the first token of the word, and padding ids for the remaining tokens
        label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1))

    # Reserve room for CLS and SEP (one more when an extra SEP is appended).
    special_tokens_count = 3 if sep_token_extra else 2
    token_budget = max_seq_length - special_tokens_count
    tokens = tokens[:token_budget]
    label_ids = label_ids[:token_budget]

    tokens += [sep_token]
    label_ids += [pad_token_label_id]
    if sep_token_extra:
        tokens += [sep_token]
        label_ids += [pad_token_label_id]
    segment_ids = [sequence_a_segment_id] * len(tokens)

    if cls_token_at_end:
        tokens += [cls_token]
        label_ids += [pad_token_label_id]
        segment_ids += [cls_token_segment_id]
    else:
        tokens = [cls_token] + tokens
        label_ids = [pad_token_label_id] + label_ids
        segment_ids = [cls_token_segment_id] + segment_ids

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    real_flag = 1 if mask_padding_with_zero else 0
    pad_flag = 0 if mask_padding_with_zero else 1
    input_mask = [real_flag] * len(input_ids)
    attention_mask = [1] * len(input_ids)

    # Pad every sequence up to max_seq_length.
    padding_length = max_seq_length - len(input_ids)
    if pad_on_left:
        input_ids = ([pad_token] * padding_length) + input_ids
        input_mask = ([pad_flag] * padding_length) + input_mask
        attention_mask = ([0] * padding_length) + attention_mask
        segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
        label_ids = ([pad_token_label_id] * padding_length) + label_ids
    else:
        input_ids += [pad_token] * padding_length
        input_mask += [pad_flag] * padding_length
        attention_mask += [0] * padding_length
        segment_ids += [pad_token_segment_id] * padding_length
        label_ids += [pad_token_label_id] * padding_length

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(attention_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(label_ids) == max_seq_length

    # `attention_mask` is returned next to `input_mask` because it is the
    # argument name Hugging Face models accept; `input_mask` is kept for
    # existing callers.
    return dict(
        input_ids=input_ids,
        input_mask=input_mask,
        attention_mask=attention_mask,
        segment_ids=segment_ids,
        label_ids=label_ids,
    )

In [None]:
%%time
# train features
train_features = pd.DataFrame(df_train.apply(lambda row: convert_to_feature(row), axis=1).tolist())
train_features.head(2)

CPU times: user 16 s, sys: 83.3 ms, total: 16.1 s
Wall time: 16.1 s


Unnamed: 0,input_ids,input_mask,segment_ids,label_ids
0,"[3, 1520, 3005, 8469, 31989, 783, 12787, 236, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-100, 16, 11, 16, -100, -100, -100, -100, 16,..."
1,"[3, 76569, 3005, 4618, 731, 228, 28783, 842, 3...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-100, 12, 11, 16, -100, 16, -100, 2, 8, -100,..."


In [None]:
%%time
# valid features
valid_features = pd.DataFrame(df_valid.apply(lambda row: convert_to_feature(row), axis=1).tolist())
valid_features.head(2)

CPU times: user 3.59 s, sys: 15.7 ms, total: 3.61 s
Wall time: 3.61 s


Unnamed: 0,input_ids,input_mask,segment_ids,label_ids
0,"[3, 82195, 6874, 16646, 291, 49967, 10, 48730,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-100, 14, -100, 2, 8, -100, 16, -100, -100, 8..."
1,"[3, 33101, 12393, 39413, 2422, 228, 60576, 627...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-100, 8, -100, -100, 2, 16, -100, 12, -100, 1..."


In [None]:
%%time
# test features
test_features = pd.DataFrame(df_test.apply(lambda row: convert_to_feature(row), axis=1).tolist())
test_features.head(2)

CPU times: user 921 ms, sys: 4.13 ms, total: 925 ms
Wall time: 924 ms


Unnamed: 0,input_ids,input_mask,segment_ids,label_ids
0,"[3, 799, 36, 5083, 5551, 170, 28, 8562, 61783,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-100, 0, 0, -100, 0, -100, 0, 0, 0, 0, 0, 0, ..."
1,"[3, 602, 95635, 108, 36, 658, 3613, 741, 299, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-100, 0, -100, 0, 0, -100, -100, 0, -100, -10..."


In [None]:
masakhane = DatasetDict({
    "train": Dataset.from_pandas(train_features),
    "valid": Dataset.from_pandas(valid_features),
    "test": Dataset.from_pandas(test_features),
})

masakhane

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'input_mask', 'segment_ids', 'label_ids'],
        num_rows: 22916
    })
    valid: Dataset({
        features: ['input_ids', 'input_mask', 'segment_ids', 'label_ids'],
        num_rows: 4557
    })
    test: Dataset({
        features: ['input_ids', 'input_mask', 'segment_ids', 'label_ids'],
        num_rows: 1208
    })
})

In [None]:
label_map = {i: label for i, label in enumerate(labels)}

In [None]:
def compute_metrics(
    eval_pred,
    pad_token_label_id=-100,
):
    """Token-level accuracy over labelled positions only.

    Args:
        eval_pred: Pair ``(predictions, labels)``. The predicted class is the
            argmax over the last axis of `predictions`; `labels` is indexed
            as a 2-D array of label ids.
        pad_token_label_id: Label id marking positions to ignore.

    Returns:
        dict with the single key "accuracy".
    """
    scores, gold_ids = eval_pred
    pred_ids = np.argmax(scores, axis=-1)

    # Boolean mask of the positions that carry a real label; masked indexing
    # walks the array row by row, matching a nested row/column loop.
    labelled = gold_ids != pad_token_label_id
    out_label_list = [label_map[tag_id] for tag_id in gold_ids[labelled]]
    preds_list = [label_map[tag_id] for tag_id in pred_ids[labelled]]

    return {"accuracy": accuracy_score(out_label_list, preds_list)}

In [None]:
model = AutoModelForTokenClassification.from_pretrained(
    cfg.model_name,
    num_labels = cfg.num_classes,
    id2label={str(i): label for i, label in enumerate(labels)},
    label2id={label: i for i, label in enumerate(labels)},
)

Some weights of the model checkpoint at Davlan/afro-xlmr-large-75L were not used when initializing XLMRobertaForTokenClassification: ['lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-large-75L and are newly initialized: ['classifier.bias', 'classifier.weight']
You s

In [None]:
cfg.__dict__

{'model_name': 'Davlan/afro-xlmr-large-75L',
 'model_type': 'xlmroberta',
 'max_seq_len': 200,
 'test_path': 'Test.csv',
 'data_dir': 'data',
 'lr': 1e-06,
 'batch_size': 16,
 'num_epoch': 30,
 'gradient_accumulation_steps': 1,
 'warmup_steps': 100,
 'num_classes': 17}

In [None]:
epoch_steps = int(np.ceil(len(masakhane['train']) / cfg.batch_size))
display(epoch_steps)

1433

In [None]:
cfg

Config(model_name='Davlan/afro-xlmr-large-75L', model_type='xlmroberta', max_seq_len=200, test_path='Test.csv', data_dir='data', lr=1e-06, batch_size=16, num_epoch=30, gradient_accumulation_steps=1, warmup_steps=100)

In [None]:
# Evaluate, save and log twice per epoch; never less than every step so the
# interval stays valid for very small training sets.
half_epoch_steps = max(1, epoch_steps // 2)

# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='/content/masakhane-pos',
    evaluation_strategy = "steps",
    save_strategy='steps',
    eval_steps = half_epoch_steps,
    save_steps = half_epoch_steps,
    logging_strategy="steps",
    logging_steps= half_epoch_steps,
    learning_rate=cfg.lr,
    save_total_limit=1,
    num_train_epochs=cfg.num_epoch,
    # Mixed precision needs a CUDA device; fall back to fp32 without one.
    fp16=torch.cuda.is_available(),
    report_to='none',
    metric_for_best_model = "accuracy",
    greater_is_better=True,
    load_best_model_at_end = True,
    per_device_train_batch_size=cfg.batch_size,
    per_device_eval_batch_size=cfg.batch_size,
    # Pass the configured values, which were defined but never used here.
    gradient_accumulation_steps=cfg.gradient_accumulation_steps,
    warmup_steps = cfg.warmup_steps,
    seed=SEED,
)

In [None]:
trainer = Trainer(
    model = model,
    args = training_args,
    compute_metrics = compute_metrics,
    train_dataset = masakhane['train'],
    eval_dataset = masakhane['valid'],
    tokenizer = tokenizer,
    callbacks = [EarlyStoppingCallback(5)],
)

# **Model Training Starts !!!**

In [None]:
%%time
trainer.train()

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
716,1.9636,1.311872,0.629698
1432,1.0519,0.86654,0.753467
2148,0.7872,0.765431,0.781999
2864,0.656,0.726134,0.789755
3580,0.5772,0.707048,0.798001
4296,0.5339,0.704399,0.795974
5012,0.4999,0.687499,0.799668
5728,0.4738,0.695865,0.799873
6444,0.4548,0.679069,0.801311
7160,0.4362,0.690138,0.794765


CPU times: user 44min 56s, sys: 10min 44s, total: 55min 41s
Wall time: 55min 37s


TrainOutput(global_step=10024, training_loss=0.6470755221647733, metrics={'train_runtime': 3336.4647, 'train_samples_per_second': 206.05, 'train_steps_per_second': 12.885, 'total_flos': 5.81603309290656e+16, 'train_loss': 0.6470755221647733, 'epoch': 7.0})

In [None]:
trainer.evaluate()

{'eval_loss': 0.679069459438324,
 'eval_accuracy': 0.8013109180512762,
 'eval_runtime': 19.8082,
 'eval_samples_per_second': 230.057,
 'eval_steps_per_second': 14.388,
 'epoch': 7.0}

# **Saving and reloading the trained model:**

In [None]:
trainer.save_model("Lacuna_Masakhane_POS_model")

In [None]:
loaded_model = AutoModelForTokenClassification.from_pretrained(
    "/content/Lacuna_Masakhane_POS_model",
    num_labels = cfg.num_classes,

)

loaded_tokenizer = AutoTokenizer.from_pretrained(
    '/content/Lacuna_Masakhane_POS_model'
)

In [None]:
del trainer, model, df_train, df_valid
torch.cuda.empty_cache()
gc.collect()

11

# **Predicting on test data:**

In [None]:
pad_token_label_id=-100

test_args = TrainingArguments(
    output_dir= '/content/',
    do_train =False,
    do_predict = True,
    dataloader_drop_last = False
)

trainer = Trainer(
    model = loaded_model,
    args = test_args,
)

test_results = trainer.predict(masakhane['test'])

# **Prepare Submission:**

In [None]:
# Predicted class id per position = argmax over the last axis.
result = test_results.predictions.argmax(axis = -1)

# Keep only positions that carry a label (the first sub-word token of each word).
labelled = test_results.label_ids != pad_token_label_id
preds_list = [
    [label_map[tag_id] for tag_id in row_preds[row_mask]]
    for row_preds, row_mask in zip(result, labelled)
]

# A sentence truncated during feature conversion yields fewer predictions
# than words. Fail here with a clear message instead of later when the
# per-sentence lists are exploded.
mismatched = [
    sentence_id
    for sentence_id, words, preds in zip(df_test['sentence_Id'], df_test['Word'], preds_list)
    if len(words) != len(preds)
]
if mismatched:
    raise ValueError(
        f"{len(mismatched)} sentence(s) have a different number of predictions "
        f"than words (first: {mismatched[0]}); increase max_seq_len."
    )

df_test['Pos'] = preds_list
df_test.head(2)

Unnamed: 0,sentence_Id,Id,Word,Language,Pos
0,Id00qog2f11n,"[Id00qog2f11n_0, Id00qog2f11n_1, Id00qog2f11n_...","[Ne, otim, penj, e, kind, Februar, tarik, 9, g...","[luo, luo, luo, luo, luo, luo, luo, luo, luo, ...","[AUX, VERB, NOUN, ADP, NOUN, PROPN, NOUN, NUM,..."
1,Id01lywjj7oz,"[Id01lywjj7oz_0, Id01lywjj7oz_1, Id01lywjj7oz_...","[Sifuna, ne, ojiwo, jonyuol, kod, joma, moko, ...","[luo, luo, luo, luo, luo, luo, luo, luo, luo, ...","[PROPN, AUX, VERB, NOUN, CCONJ, NOUN, ADJ, VER..."


In [None]:
submission = df_test[['Id', 'Pos']].explode(column=['Id', 'Pos'], ignore_index=True)
submission.head()

Unnamed: 0,Id,Pos
0,Id00qog2f11n_0,AUX
1,Id00qog2f11n_1,VERB
2,Id00qog2f11n_2,NOUN
3,Id00qog2f11n_3,ADP
4,Id00qog2f11n_4,NOUN


In [None]:
submission.to_csv("afro-xlmr-large-75L_3vl.csv", index = False)

In [None]:
submission['Pos'].value_counts()

NOUN     6806
VERB     4643
ADP      3821
PUNCT    2992
PROPN    2672
AUX      2449
SCONJ    1902
PRON     1561
DET      1534
CCONJ     952
ADJ       820
ADV       754
NUM       574
PART      559
X           6
Name: Pos, dtype: int64

In [None]:
from google.colab import files
files.download("afro-xlmr-large-75L_3vl.csv")