<a href="https://colab.research.google.com/github/Brackly/lacuna-challenge/blob/main/silabi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Building the Custom Tokenizer

### Preparing data

In [None]:
# Clone only when the checkout is missing, so re-running this cell does not
# fail with "destination path already exists".
!test -d masakhane-pos || git clone https://github.com/masakhane-io/masakhane-pos.git

Cloning into 'masakhane-pos'...
remote: Enumerating objects: 385, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 385 (delta 0), reused 0 (delta 0), pack-reused 382[K
Receiving objects: 100% (385/385), 9.74 MiB | 11.76 MiB/s, done.
Resolving deltas: 100% (149/149), done.


In [None]:
# %pip installs into the environment of the running kernel, whereas `!pip`
# uses whichever pip is first on the shell PATH.
%pip install tokenizers transformers datasets



In [None]:
# %pip installs into the environment of the running kernel, whereas `!pip`
# uses whichever pip is first on the shell PATH.
%pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/258.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━[0m [32m143.4/258.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.23.0


In [None]:
# Clone only when the checkout is missing; an unconditional clone aborts with
# "destination path 'lacuna-challenge' already exists" on every re-run.
!test -d lacuna-challenge || git clone https://github.com/Brackly/lacuna-challenge.git

fatal: destination path 'lacuna-challenge' already exists and is not an empty directory.


### Create Train and Test txt Dataset


In [None]:
from datasets import load_dataset
# Load the dataset identified as "swahili_news" through the `datasets` library.
# The global name `dataset` is read by create_train_data_file in the next cell.
dataset = load_dataset("swahili_news")
# Bare last expression so the notebook displays the loaded object.
dataset

In [None]:
import os

def create_train_data_file(dir_path, texts=None, sample_divisor=64):
  """Write ``<dir_path>/train.txt`` from a news sample plus the local ``.txt`` files.

  The corpus is the first ``len(texts) // sample_divisor`` entries of ``texts``
  followed by the full content of every ``*.txt`` file found recursively under
  ``dir_path``. Each document is written on its own line(s): a newline is
  appended to any non-empty document that does not already end with one, so
  consecutive documents are never fused together.

  Args:
    dir_path: Directory that is scanned for ``.txt`` files and that receives
      ``train.txt``.
    texts: Sequence of news strings to sample from. When ``None`` the
      notebook-global ``dataset["train"]["text"]`` is used; note that a later
      cell rebinds ``dataset`` to a ``LineByLineTextDataset``, so pass ``texts``
      explicitly if this function is called after that cell has run.
    sample_divisor: Positive integer; ``1 / sample_divisor`` of ``texts`` is kept.

  Returns:
    The list of documents (sampled news strings, then file contents) in the
    order they were written.

  Raises:
    OSError: If a file cannot be read or ``train.txt`` cannot be written.
      Errors are no longer printed and swallowed, because a silent ``None``
      return left later cells to fail on a missing ``train.txt``.
  """
  from pathlib import Path

  if texts is None:
    texts = dataset["train"]["text"]
  # Materialise the column once and copy it, so the caller's sequence is never mutated.
  news = list(texts)
  text = news[:len(news) // sample_divisor]

  root = Path(dir_path)
  train_path = root / "train.txt"
  # Sorted for a reproducible corpus order across runs and file systems.
  for path in sorted(root.glob("**/*.txt")):
    # The output file lives inside dir_path; skip it so a re-run does not
    # feed the previous train.txt back into the corpus.
    if path == train_path:
      continue
    content = path.read_text(encoding="utf-8")
    print(str(path), len(content))
    text.append(content)

  # Explicit UTF-8 so the result does not depend on the platform default encoding.
  with open(train_path, "w", encoding="utf-8") as f:
    for doc in text:
      f.write(doc)
      if doc and not doc.endswith("\n"):
        f.write("\n")
  return text
# create_train_data_file("/content/lacuna-challenge/data")

### Create a tokenizer

In [None]:
# -p makes the cell idempotent: no error when ./Tokenizer already exists.
!mkdir -p Tokenizer

In [None]:
from tokenizers import ByteLevelBPETokenizer
# Start from a freshly constructed (untrained) byte-level BPE tokenizer.
tokenizer = ByteLevelBPETokenizer()
# Corpus file(s) handed to the trainer.
paths = ["/content/lacuna-challenge/data/train.txt"]
# Train on the corpus with a vocabulary size of 52,000, a minimum frequency of 2
# and the five special tokens listed below.
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

# Save the trained tokenizer files into ./Tokenizer. As the last expression of
# the cell, the returned list of written file paths is displayed.

tokenizer.save_model("./Tokenizer")

['./Tokenizer/vocab.json', './Tokenizer/merges.txt']

In [None]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing


# Rebuild the tokenizer from vocab.json / merges.txt.
# NOTE(review): the training cell saves to "./Tokenizer" while this cell reads
# "/content/Tokenizer"; the two match only if the working directory is
# /content — confirm.
tokenizer = ByteLevelBPETokenizer(
    "/content/Tokenizer/vocab.json",
    "/content/Tokenizer/merges.txt",
)
# Install a post-processor built from the "</s>" and "<s>" tokens and their ids
# (presumably so each encoding is wrapped in those tokens — verify).
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
# Enable truncation with a maximum length of 512.
tokenizer.enable_truncation(max_length=512)

In [None]:
import torch
from pathlib import Path
from torch.utils.data import Dataset

class SilabiDataset(Dataset):
    """Dataset of token-id sequences, one example per line of the source files.

    At construction time every ``*-train.txt`` file (or ``*-dev.txt`` when
    ``evaluate`` is true) directly inside ``data_dir`` is read as UTF-8, split
    into lines, and encoded with a byte-level BPE tokenizer loaded from
    ``tokenizer_dir``. All encoded lines are held in memory in ``self.examples``.

    Args:
        evaluate: Select the ``*-dev.txt`` files instead of ``*-train.txt``.
        data_dir: Directory containing the source text files.
        tokenizer_dir: Directory containing ``vocab.json`` and ``merges.txt``.
    """

    def __init__(
        self,
        evaluate: bool = False,
        data_dir: str = "/content/lacuna-challenge/data/",
        tokenizer_dir: str = "/content/Tokenizer",
    ):
        tokenizer_root = Path(tokenizer_dir)
        tokenizer = ByteLevelBPETokenizer(
            str(tokenizer_root / "vocab.json"),
            str(tokenizer_root / "merges.txt"),
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=512)

        self.examples = []

        pattern = "*-dev.txt" if evaluate else "*-train.txt"
        # Sorted so the example order is reproducible; glob order is unspecified.
        for src_file in sorted(Path(data_dir).glob(pattern)):
            lines = src_file.read_text(encoding="utf-8").splitlines()
            # Report only the line count; printing `lines` itself would dump
            # the entire corpus into the cell output.
            print("🔥", src_file, f"({len(lines)} lines)")
            self.examples += [x.ids for x in tokenizer.encode_batch(lines)]

    def __len__(self):
        """Return the number of encoded lines."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return the token ids of example ``i`` as a tensor (unpadded)."""
        # We'll pad at the batch level.
        return torch.tensor(self.examples[i])

### Create Model

In [None]:
# -p makes the cell idempotent: no error when ./Model already exists.
!mkdir -p Model

In [None]:
from transformers import RobertaConfig

# Model hyper-parameters for the RoBERTa model built in a later cell.
config = RobertaConfig(
    # Same value that was passed as vocab_size to tokenizer.train above.
    vocab_size=52_000,
    # NOTE(review): presumably the 512-token limit used for the tokenizer plus
    # an offset of 2 — confirm against the RoBERTa documentation.
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [None]:
from transformers import RobertaTokenizerFast

# Load the tokenizer files saved in ./Tokenizer. `model_max_length` is the
# documented keyword for the maximum input length; `max_len` is only a legacy
# alias for it.
tokenizer = RobertaTokenizerFast.from_pretrained("./Tokenizer", model_max_length=512)

In [None]:
from transformers import RobertaForMaskedLM

# Build the masked-language model directly from `config` (defined above)
# rather than loading a saved checkpoint.
model = RobertaForMaskedLM(config=config)

In [None]:
import json
import os
import pickle
import random
import time
import warnings
from typing import Dict, List, Optional

import torch
from filelock import FileLock
from torch.utils.data import Dataset


# Message template for the FutureWarning raised by LineByLineTextDataset;
# `{0}` is filled with the URL of the recommended replacement script.
DEPRECATION_WARNING = (
    "This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets "
    "library. You can have a look at this example script for pointers: {0}"
)

class LineByLineTextDataset(Dataset):
    """
    One example per non-blank line of a UTF-8 text file.

    The whole file is read and tokenised once in the constructor; each item is
    a dict ``{"input_ids": <long tensor>}`` truncated to ``block_size`` tokens
    by the tokenizer.
    """

    def __init__(self, tokenizer, file_path: str, block_size: int):
        example_script = (
            "https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py"
        )
        warnings.warn(DEPRECATION_WARNING.format(example_script), FutureWarning)

        if not os.path.isfile(file_path):
            raise ValueError(f"Input file path {file_path} not found")

        # Features are not cached: the file is re-tokenised on every construction.
        kept_lines = []
        with open(file_path, encoding="utf-8") as handle:
            for raw_line in handle.read().split("\n"):
                # Drop empty and whitespace-only lines.
                if raw_line and not raw_line.isspace():
                    kept_lines.append(raw_line)

        encoded = tokenizer(kept_lines, add_special_tokens=True, truncation=True, max_length=block_size)
        self.examples = [
            {"input_ids": torch.tensor(ids, dtype=torch.long)} for ids in encoded["input_ids"]
        ]

    def __len__(self):
        """Number of tokenised lines."""
        return len(self.examples)

    def __getitem__(self, i) -> Dict[str, torch.tensor]:
        """Return the ``{"input_ids": tensor}`` dict for line ``i``."""
        return self.examples[i]

In [None]:
# Tokenise train.txt line by line, truncating each line to 128 tokens.
# Note: this rebinds the global `dataset`, which previously held the result of
# load_dataset("swahili_news").
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="/content/lacuna-challenge/data/train.txt",
    block_size=128,
)



In [None]:
from transformers import DataCollatorForLanguageModeling

# Collator configured for masked-language modelling (mlm=True) with a masking
# probability of 0.15, using the tokenizer loaded above.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [None]:
from transformers import Trainer, TrainingArguments

In [None]:
# Training configuration. `per_device_train_batch_size` replaces the deprecated
# `per_gpu_train_batch_size` argument; the batch size per device is unchanged.
training_args = TrainingArguments(
    output_dir="/content/Model",
    overwrite_output_dir=True,
    num_train_epochs=200,
    per_device_train_batch_size=64,
    # Write a checkpoint every 10,000 steps and keep at most two of them.
    save_steps=10_000,
    # push_to_hub=True
    save_total_limit=2,
    prediction_loss_only=True,
)

# Wire the model, the collator and the line-by-line dataset into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

# Last expression of the cell: runs training and displays its return value.
trainer.train()

In [None]:
# Persist the tokenizer into ./Tokenizer (the same directory the tokenizer
# training cell saved into) and the trained model into ./Model.
tokenizer.save_pretrained("./Tokenizer")
trainer.save_model("./Model")

In [None]:
from transformers import pipeline

# Build a fill-mask pipeline from the model and tokenizer directories written
# by the previous cell.
fill_mask = pipeline(
    "fill-mask",
    model="./Model",
    tokenizer="./Tokenizer"
)
# Smoke test: the sentence contains one "<mask>" placeholder; as the last
# expression of the cell, the pipeline's result is displayed.
fill_mask("hik manyonge ni mondo <mask> godo chenorogo to mano pok otimore nyaka sani.")

## Finetuned Model

In [None]:
# class SilabiFinetuned(model):
#   def __init(self,config):
