In [None]:
# Mount Google Drive so the /content/drive/MyDrive/... paths used by later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers accelerate seqeval dataset

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dataset
  Downloading dataset-1.6.2-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting sqlalchemy<2.0.0,>=1.3.2 (from dataset)
  Downloading SQLAlchemy-1.4.54-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting alembic>=0.6.2 (from dataset)
  Downloading alembic-1.16.2-py3-none-any.whl.metadata (7.3 kB)
Collecting banal>=1.0.1 (from dataset)
  Downloading banal-1.0.6-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score

In [None]:
def load_conll_format(file_path):
    """Read a two-column CoNLL-style file into parallel token and tag lists.

    Every non-blank line must hold exactly one token and one tag separated by
    whitespace. A blank (or whitespace-only) line ends the current sentence;
    consecutive blank lines do not produce empty sentences.

    Args:
        file_path: Path to a UTF-8 encoded text file (a leading BOM is tolerated).

    Returns:
        A ``(sentences, labels)`` tuple. ``sentences[i]`` is the list of tokens of
        sentence ``i`` and ``labels[i]`` is the equally long list of its tags.

    Raises:
        ValueError: If a non-blank line does not split into exactly two fields.
            The message names the file and the 1-based line number.
    """
    sentences = []
    labels = []
    tokens = []
    ner_tags = []

    # "utf-8-sig" decodes plain UTF-8 identically but drops a leading BOM, which
    # would otherwise become part of the first token.
    with open(file_path, "r", encoding="utf-8-sig") as file:
        for line_number, line in enumerate(file, start=1):
            fields = line.split()

            if not fields:
                # Sentence boundary: flush only if something was collected.
                if tokens:
                    sentences.append(tokens)
                    labels.append(ner_tags)
                    tokens = []
                    ner_tags = []
                continue

            if len(fields) != 2:
                raise ValueError(
                    f"{file_path}: line {line_number}: expected 'token tag', "
                    f"got {line.strip()!r}"
                )

            token, tag = fields
            tokens.append(token)
            ner_tags.append(tag)

    # The file may end without a trailing blank line.
    if tokens:
        sentences.append(tokens)
        labels.append(ner_tags)
    return sentences, labels


# Parse the annotated corpus from Drive into parallel lists of tokens and tags.
file_path = "/content/drive/MyDrive/Afan-oromo-ner/dataset-Afan-Oromo-new-merge-one.txt"  # Replace with your file's path
sentences, ner_tags = load_conll_format(file_path)


# Sanity check: show the first sentence next to its tags.
print("First Sentence:", sentences[0])
print("First Tags:", ner_tags[0])


First Sentence: ['Af', "yaa'iin", 'Caffee', 'Oromiyaa', 'aadde', 'Loomii', 'Badhoo', 'raawwii', 'hojii', 'gurguddoo', 'Caffeen', 'hojjechaa', 'ture', 'ilaalchisee', 'miidiyaaleef', 'ibsa', 'laatan', '.']
First Tags: ['O', 'O', 'B-ORG', 'I-ORG', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [None]:
from datasets import Dataset


# One record per sentence, keeping tokens and tags side by side.
data = [{"tokens": tokens, "ner_tags": tags} for tokens, tags in zip(sentences, ner_tags)]


# Wrap the records in a Hugging Face Dataset.
hf_dataset = Dataset.from_list(data)


# Show the first record as a sanity check.
print(hf_dataset[0])

{'tokens': ['Af', "yaa'iin", 'Caffee', 'Oromiyaa', 'aadde', 'Loomii', 'Badhoo', 'raawwii', 'hojii', 'gurguddoo', 'Caffeen', 'hojjechaa', 'ture', 'ilaalchisee', 'miidiyaaleef', 'ibsa', 'laatan', '.'], 'ner_tags': ['O', 'O', 'B-ORG', 'I-ORG', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}


In [None]:
# Define label-to-ID mapping (update with actual tags in your dataset)
# NOTE(review): "B-TIME" has no matching "I-TIME" entry; a multi-token TIME
# entity in the data would raise KeyError in the lookups below — confirm the tag set.
label2id = {"O": 0, "B-LOC": 1, "I-LOC": 2, "B-PER": 3, "I-PER": 4, "B-ORG": 5, "I-ORG": 6, "B-DATE":7, "I-DATE":8, "B-NUM":9, "I-NUM":10, "B-MONEY":11, "I-MONEY":12, "B-TIME":13}
# Inverse mapping, used to turn predicted ids back into tag strings.
id2label = {v: k for k, v in label2id.items()}
num_labels = len(label2id)


# Add an integer-encoded copy of the tags as a new "ner_tags_ids" column.
# NOTE(review): the train/validation/test splits below are rebuilt from
# `sentences`/`ner_tags`, so this column appears to serve only as a check here.
hf_dataset = hf_dataset.map(lambda x: {"ner_tags_ids": [label2id[tag] for tag in x["ner_tags"]]})


print(hf_dataset[0])

Map:   0%|          | 0/1507 [00:00<?, ? examples/s]

{'tokens': ['Af', "yaa'iin", 'Caffee', 'Oromiyaa', 'aadde', 'Loomii', 'Badhoo', 'raawwii', 'hojii', 'gurguddoo', 'Caffeen', 'hojjechaa', 'ture', 'ilaalchisee', 'miidiyaaleef', 'ibsa', 'laatan', '.'], 'ner_tags': ['O', 'O', 'B-ORG', 'I-ORG', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], 'ner_tags_ids': [0, 0, 5, 6, 0, 3, 4, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0]}


In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

# 80/10/10 split: hold out 20% first, then halve the held-out part into
# validation and test. The fixed random_state keeps the split repeatable.
train_sentences, temp_sentences, train_tags, temp_tags = train_test_split(
    sentences, ner_tags, test_size=0.2, random_state=42
)
val_sentences, test_sentences, val_tags, test_tags = train_test_split(
    temp_sentences, temp_tags, test_size=0.5, random_state=42
)


# One record per sentence for each split.
train_data = [{"tokens": tokens, "ner_tags": tags} for tokens, tags in zip(train_sentences, train_tags)]
val_data = [{"tokens": tokens, "ner_tags": tags} for tokens, tags in zip(val_sentences, val_tags)]
test_data = [{"tokens": tokens, "ner_tags": tags} for tokens, tags in zip(test_sentences, test_tags)]

# Wrap each split in a Hugging Face Dataset.
train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)
test_dataset = Dataset.from_list(test_data)

In [None]:
# Group the three splits under the names used throughout the rest of the notebook.
dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset,
})

# Show the column names and row count of each split.
print(dataset_dict)


DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 1205
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 151
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 151
    })
})


In [None]:
# Persist the exact split to Drive so it can be reused without re-splitting.
dataset_dict.save_to_disk("/content/drive/MyDrive/Afan-oromo-ner/new-ao-dataset")


Saving the dataset (0/1 shards):   0%|          | 0/1205 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/151 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/151 [00:00<?, ? examples/s]

In [None]:
# Re-bind the split variables from the DatasetDict (the same splits built above).
train_dataset = dataset_dict["train"]
val_dataset = dataset_dict["validation"]
test_dataset = dataset_dict["test"]

In [None]:
# Show the first record of every split as a quick visual check.
print("First train example:", dataset_dict["train"][0])
print("First validation example:", dataset_dict["validation"][0])
print("First test example:", dataset_dict["test"][0])

First train example: {'tokens': ['Boordiin', 'Filannoo', 'Biyyaalessaa', 'Ityoophiyaa', 'filannoo', 'marsaa', '6ffaa', 'biyyaalessaan', 'walqabatee', 'jijjiirrama', 'teessoo', 'fi', 'naannoo', 'filannoo', 'naannolee', 'fi', 'bulchiinsa', 'magaalotaa', 'kan', 'hin', 'fudhanne', 'ta’uu', 'beeksise', '.'], 'ner_tags': ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'B-NUM', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}
First validation example: {'tokens': ['Masaraa', 'mootummaa', 'keessaa', 'Mallasiin', 'awwaalleen', 'baha', 'jedhee', 'yaadee', 'hin', 'beeku', '.'], 'ner_tags': ['O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}
First test example: {'tokens': ['bara', '2007', 'keessa', 'hiriirri', 'bahamee', 'ture', '.'], 'ner_tags': ['B-DATE', 'B-DATE', 'O', 'O', 'O', 'O', 'O']}


In [None]:
# Name the checkpoint once and reuse it for both the tokenizer and the model.
model_name = "castorini/afriberta_large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The AfriBERTa tokenizer files do not declare a maximum sequence length (the
# pipeline cells in this notebook log "no maximum length is provided"), so set
# it explicitly to the value recommended on the model card.
tokenizer.model_max_length = 512

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/257 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/731 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/1.55M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]



In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word tags to sub-word tokens.

    Uses the notebook-level ``tokenizer`` and ``label2id``.

    Args:
        examples: Batch mapping with "tokens" (one list of words per sentence)
            and "ner_tags" (one list of tag strings per sentence).

    Returns:
        The tokenizer output with an added "labels" entry holding one list of
        label ids per sentence. Only the first sub-word token of each word
        carries the word's label id; special tokens, padding and continuation
        sub-words are set to -100.

    Raises:
        KeyError: If a tag is not present in ``label2id``.
    """
    # Encode the tags into a local list instead of writing a new key into the
    # caller's batch.
    tag_id_batch = [[label2id[tag] for tag in tags] for tags in examples["ner_tags"]]

    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        padding="max_length",
        max_length=128,
        is_split_into_words=True,
    )

    labels = []
    for batch_index, word_label_ids in enumerate(tag_id_batch):
        word_ids = tokenized_inputs.word_ids(batch_index=batch_index)
        previous_word_id = None
        label_ids = []
        for word_id in word_ids:
            if word_id is None or word_id == previous_word_id:
                # No source word (None) or a continuation piece of the previous word.
                label_ids.append(-100)
            else:
                # First piece of a new word carries that word's label.
                label_ids.append(word_label_ids[word_id])
            previous_word_id = word_id
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
def compute_metrics(p):
    """Compute seqeval precision, recall, F1 and accuracy for one evaluation pass.

    Args:
        p: A ``(predictions, labels)`` pair. The predictions are reduced with
            ``argmax(axis=2)``; label positions equal to -100 are skipped.

    Returns:
        Dict with the keys "precision", "recall", "f1" and "accuracy".
    """
    scores, gold_ids = p
    predicted_ids = scores.argmax(axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_labels = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            if gold_id == -100:
                # Masked position: excluded from both sequences.
                continue
            kept_predictions.append(id2label[predicted_id])
            kept_labels.append(id2label[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "accuracy": accuracy_score(true_labels, true_predictions),
    }

In [None]:
# Tokenize every split in batches and drop the raw text columns, which are
# no longer needed once the aligned labels exist.
tokenized_datasets = dataset_dict.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=["tokens", "ner_tags"]
)

Map:   0%|          | 0/1205 [00:00<?, ? examples/s]

Map:   0%|          | 0/151 [00:00<?, ? examples/s]

Map:   0%|          | 0/151 [00:00<?, ? examples/s]

In [None]:
# Interactive Hugging Face login (renders a token prompt widget in the notebook).
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Evaluate and checkpoint once per epoch, then reload the checkpoint with the
# highest validation F1 when training ends.
# NOTE(review): this output path is spelled "afan-oromo-ner" while the dataset
# paths in this notebook use "Afan-oromo-ner", and the later inference cell loads
# its checkpoint from yet another folder — confirm the intended location.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/afan-oromo-ner/afri-berta-large-NER-afan-oromo",               # Directory to save the model
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    push_to_hub=False
)

In [None]:
# Load the pretrained checkpoint with a token-classification head sized to the
# label set; the label maps are passed along so predictions can be reported by tag name.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

pytorch_model.bin:   0%|          | 0.00/503M [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at castorini/afriberta_large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Train on the tokenized training split and score on the validation split
# with compute_metrics.
# NOTE(review): the recorded output shows a warning raised at this call; recent
# transformers releases deprecate `tokenizer=` in favour of `processing_class=`
# — confirm against the installed version before changing it.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mresearchmt12[0m ([33mresearchmt12-addis-ababa-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.365,0.33886,0.495146,0.496753,0.495948,0.891837
2,0.2196,0.260156,0.569061,0.668831,0.614925,0.916327
3,0.1817,0.239004,0.589532,0.694805,0.637854,0.926531


Non-default generation parameters: {'max_length': 512}
Non-default generation parameters: {'max_length': 512}
Non-default generation parameters: {'max_length': 512}


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.365,0.33886,0.495146,0.496753,0.495948,0.891837
2,0.2196,0.260156,0.569061,0.668831,0.614925,0.916327
3,0.1817,0.239004,0.589532,0.694805,0.637854,0.926531
4,0.1491,0.239272,0.614085,0.707792,0.657617,0.927755
5,0.1091,0.236368,0.597765,0.694805,0.642643,0.926122


Non-default generation parameters: {'max_length': 512}
Non-default generation parameters: {'max_length': 512}


TrainOutput(global_step=380, training_loss=0.2891856802137274, metrics={'train_runtime': 240.8298, 'train_samples_per_second': 25.018, 'train_steps_per_second': 1.578, 'total_flos': 328026936998400.0, 'train_loss': 0.2891856802137274, 'epoch': 5.0})

In [None]:
# Final check on the held-out test split, which was not used during training.
test_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
print(test_results)

{'eval_loss': 0.18775728344917297, 'eval_precision': 0.667458432304038, 'eval_recall': 0.7805555555555556, 'eval_f1': 0.7195902688860435, 'eval_accuracy': 0.9409420289855073, 'eval_runtime': 0.967, 'eval_samples_per_second': 156.151, 'eval_steps_per_second': 10.341, 'epoch': 5.0}


In [None]:
# Upload the trained model and tokenizer files to the Hugging Face Hub.
trainer.push_to_hub(commit_message="Training complete")

Non-default generation parameters: {'max_length': 512}


Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

sentencepiece.bpe.model:   0%|          | 0.00/1.55M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/500M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Elu-dan/afri-berta-large-NER-Afan-oromo/commit/faee5d6ec653220b71e95a7b8ffbd1e6b5010379', commit_message='Training complete', commit_description='', oid='faee5d6ec653220b71e95a7b8ffbd1e6b5010379', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Elu-dan/afri-berta-large-NER-Afan-oromo', endpoint='https://huggingface.co', repo_type='model', repo_id='Elu-dan/afri-berta-large-NER-Afan-oromo'), pr_revision=None, pr_num=None)

In [None]:
# Build the inference pipeline in this cell: no earlier cell defines
# `token_classifier`, so running the notebook top to bottom would otherwise
# stop here with a NameError.
from transformers import pipeline

token_classifier = pipeline(
    "token-classification",
    model="Elu-dan/afri-berta-large-NER-Afan-oromo",
    aggregation_strategy="simple",
)
# The tokenizer declares no maximum length; 512 is the value given on the
# AfriBERTa model card and lets the pipeline truncate over-long inputs.
token_classifier.tokenizer.model_max_length = 512

# Smoke test on a single sentence.
result = token_classifier("Ministirri Muummee Xaaliyaanii Joorijiyaa Melonii lammii Liibiyaa yakka waraanaatiin mana murtii yakkaa Addunyaa ICC’n barbaadamu hidhaa gadhiisuun qorataman.")

print(result)

[{'entity_group': 'LOC', 'score': np.float32(0.6469256), 'word': 'Xaaliyaanii', 'start': 18, 'end': 30}, {'entity_group': 'PER', 'score': np.float32(0.9499381), 'word': 'Joo', 'start': 30, 'end': 34}, {'entity_group': 'PER', 'score': np.float32(0.8997798), 'word': 'rijiyaa Melonii', 'start': 34, 'end': 49}, {'entity_group': 'LOC', 'score': np.float32(0.90984774), 'word': 'Liibiyaa', 'start': 56, 'end': 65}, {'entity_group': 'ORG', 'score': np.float32(0.862312), 'word': 'yakkaa Addunyaa ICC', 'start': 96, 'end': 116}]


# **HornMT Dataset**

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never store an access token in the notebook source. Read it from the HF_TOKEN
# environment variable, or prompt for it without echoing. The token that used to
# be written in this cell must be treated as leaked and revoked in the Hugging
# Face account settings.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)


In [None]:
from transformers import pipeline

# Published fine-tuned checkpoint; "simple" aggregation groups sub-word pieces
# into entity spans.
model_checkpoint = "Elu-dan/afri-berta-large-NER-Afan-oromo"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
# The tokenizer declares no maximum length, so the pipeline would pass over-long
# sentences through untruncated (see the "no maximum length is provided" warning).
# 512 is the value given on the AfriBERTa model card; longer inputs are cut there.
token_classifier.tokenizer.model_max_length = 512


file_path = "/content/drive/MyDrive/Afan-oromo-ner/orm.txt"

# One sentence per line.
with open(file_path, "r", encoding="utf-8") as file:
    texts = file.read().splitlines()


# Trim surrounding whitespace and drop empty lines.
texts = [text.strip() for text in texts if text.strip()]

Device set to use cuda:0


In [None]:
from tqdm import tqdm

# Run the NER pipeline sentence by sentence, pairing each input with its predictions.
results = [
    {"text": text, "ner_results": token_classifier(text)}
    for text in tqdm(texts, desc="Processing")
]

Processing:   0%|          | 0/2030 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Processing:   0%|          | 10/2030 [00:00<00:52, 38.18it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Processing: 100%|██████████| 2030/2030 [00:24<00:00, 84.14it/s]


In [None]:
import json
import numpy as np


def make_serializable(obj):
    """``json.dump`` fallback that converts NumPy values to built-in Python types.

    Args:
        obj: A value the JSON encoder could not serialize on its own.

    Returns:
        The equivalent Python scalar for any NumPy scalar, or a (nested) list
        for an ``ndarray``.

    Raises:
        TypeError: For every other type, as the ``default`` hook contract requires.
    """
    # np.generic is the common base of all NumPy scalar types (float32, int64,
    # bool_, ...), so this is not limited to float32.
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj)} is not JSON serializable")

# Save the results
# `default=make_serializable` converts values the JSON encoder rejects;
# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
output_path = "/content/drive/MyDrive/hornmt_ao.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=4, default=make_serializable)


In [None]:
import pandas as pd
# Two-column table: the sentence and its predictions.
# NOTE(review): presumably each `ner_results` cell is a list of entity dicts, which
# to_csv writes as its string form — confirm this CSV layout is what is wanted.
data = [{"text": entry["text"], "ner_results": entry["ner_results"]} for entry in results]
df = pd.DataFrame(data)

# Save as CSV
output_path = "/content/drive/MyDrive/hornmt_ao.csv"  # Update with your desired path
df.to_csv(output_path, index=False, encoding="utf-8")

In [None]:
# Human-readable dump: one block per sentence, separated by a dashed rule.
report_path = "/content/drive/MyDrive/hornmt_ao.txt"
with open(report_path, "w", encoding="utf-8") as f:
    for entry in results:
        f.write(f"Text: {entry['text']}\n")
        f.write(f"NER Results: {entry['ner_results']}\n")
        f.write("-" * 50 + "\n")

# Report the file that was actually written rather than a fixed, unrelated name.
print(f"Results saved to {report_path}")

Results saved to ner_results.txt


In [None]:
# Read back the predictions stored as JSON
with open("/content/drive/MyDrive/hornmt_ao.json", "r", encoding="utf-8") as f:
    ner_data = json.load(f)

# Keep the surface form of every entity tagged "PER", in document order
per_entities = [
    ent["word"]
    for entry in ner_data
    for ent in entry.get("ner_results", [])
    if ent.get("entity_group") == "PER"
]

# One entity per line
output_file = "/content/drive/MyDrive/hornmt_per_entities_afanoromo.txt"
with open(output_file, "w", encoding="utf-8") as f:
    f.writelines(word + "\n" for word in per_entities)

print(f"✅ Extracted {len(per_entities)} 'PER' entities. Saved to {output_file}.")

✅ Extracted 1750 'PER' entities. Saved to /content/drive/MyDrive/hornmt_per_entities_afanoromo.txt.


# **Flores 1 Dataset**

In [None]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "Elu-dan/afri-berta-large-NER-Afan-oromo"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
# The tokenizer declares no maximum length, so the pipeline would pass over-long
# sentences through untruncated (see the "no maximum length is provided" warning).
# 512 is the value given on the AfriBERTa model card; longer inputs are cut there.
token_classifier.tokenizer.model_max_length = 512


file_path = "/content/drive/MyDrive/Afan-oromo-ner/orm.devtest"

# One sentence per line.
with open(file_path, "r", encoding="utf-8") as file:
    texts = file.read().splitlines()


# Trim surrounding whitespace and drop empty lines.
texts = [text.strip() for text in texts if text.strip()]

Device set to use cuda:0


In [None]:
from tqdm import tqdm

# Run the NER pipeline sentence by sentence, pairing each input with its predictions.
results = [
    {"text": text, "ner_results": token_classifier(text)}
    for text in tqdm(texts, desc="Processing")
]

Processing:   0%|          | 0/1012 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Processing:   1%|          | 7/1012 [00:00<00:43, 23.18it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Processing: 100%|██████████| 1012/1012 [00:12<00:00, 83.03it/s]


In [None]:
import json
import numpy as np

def make_serializable(obj):
    """``json.dump`` fallback that converts NumPy values to built-in Python types.

    Args:
        obj: A value the JSON encoder could not serialize on its own.

    Returns:
        The equivalent Python scalar for any NumPy scalar, or a (nested) list
        for an ``ndarray``.

    Raises:
        TypeError: For every other type, as the ``default`` hook contract requires.
    """
    # np.generic is the common base of all NumPy scalar types (float32, int64,
    # bool_, ...), so this is not limited to float32.
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj)} is not JSON serializable")

# Save the results
# `default=make_serializable` converts values the JSON encoder rejects;
# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
output_path = "/content/drive/MyDrive/flores1_ao.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=4, default=make_serializable)


In [None]:
import pandas as pd
# Two-column table: the sentence and its predictions.
# NOTE(review): presumably each `ner_results` cell is a list of entity dicts, which
# to_csv writes as its string form — confirm this CSV layout is what is wanted.
data = [{"text": entry["text"], "ner_results": entry["ner_results"]} for entry in results]
df = pd.DataFrame(data)

# Save as CSV
output_path = "/content/drive/MyDrive/flores1_ao.csv"
df.to_csv(output_path, index=False, encoding="utf-8")

In [None]:
# Human-readable dump: one block per sentence, separated by a dashed rule.
report_path = "/content/drive/MyDrive/flores1_ao.txt"
with open(report_path, "w", encoding="utf-8") as f:
    for entry in results:
        f.write(f"Text: {entry['text']}\n")
        f.write(f"NER Results: {entry['ner_results']}\n")
        f.write("-" * 50 + "\n")

# Report the file that was actually written rather than a fixed, unrelated name.
print(f"Results saved to {report_path}")

Results saved to ner_results.txt


In [None]:
# Read back the predictions stored as JSON
with open("/content/drive/MyDrive/flores1_ao.json", "r", encoding="utf-8") as f:
    ner_data = json.load(f)

# Keep the surface form of every entity tagged "PER", in document order
per_entities = [
    ent["word"]
    for entry in ner_data
    for ent in entry.get("ner_results", [])
    if ent.get("entity_group") == "PER"
]

# One entity per line
output_file = "/content/drive/MyDrive/flores1_per_entities_afanoromo.txt"
with open(output_file, "w", encoding="utf-8") as f:
    f.writelines(word + "\n" for word in per_entities)

print(f"✅ Extracted {len(per_entities)} 'PER' entities. Saved to {output_file}.")

✅ Extracted 396 'PER' entities. Saved to /content/drive/MyDrive/flores1_per_entities_afanoromo.txt.


# **Flores 2 Dataset**

In [None]:
from transformers import pipeline


# Published fine-tuned checkpoint; "simple" aggregation groups sub-word pieces
# into entity spans.
model_checkpoint = "Elu-dan/afri-berta-large-NER-Afan-oromo"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
# The tokenizer declares no maximum length, so the pipeline would pass over-long
# sentences through untruncated (see the "no maximum length is provided" warning).
# 512 is the value given on the AfriBERTa model card; longer inputs are cut there.
token_classifier.tokenizer.model_max_length = 512


file_path = "/content/drive/MyDrive/Afan-oromo-ner/orm.dev"

# One sentence per line.
with open(file_path, "r", encoding="utf-8") as file:
    texts = file.read().splitlines()


# Trim surrounding whitespace and drop empty lines.
texts = [text.strip() for text in texts if text.strip()]

Device set to use cuda:0


In [None]:
from tqdm import tqdm

# Run the NER pipeline sentence by sentence, pairing each input with its predictions.
results = [
    {"text": text, "ner_results": token_classifier(text)}
    for text in tqdm(texts, desc="Processing")
]

Processing:   0%|          | 0/997 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Processing:   1%|          | 10/997 [00:00<00:24, 40.77it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Processing: 100%|██████████| 997/997 [00:12<00:00, 80.65it/s]


In [None]:
import json
import numpy as np

def make_serializable(obj):
    """``json.dump`` fallback that converts NumPy values to built-in Python types.

    Args:
        obj: A value the JSON encoder could not serialize on its own.

    Returns:
        The equivalent Python scalar for any NumPy scalar, or a (nested) list
        for an ``ndarray``.

    Raises:
        TypeError: For every other type, as the ``default`` hook contract requires.
    """
    # np.generic is the common base of all NumPy scalar types (float32, int64,
    # bool_, ...), so this is not limited to float32.
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj)} is not JSON serializable")


# Save the results as JSON. `default=make_serializable` converts values the JSON
# encoder rejects; ensure_ascii=False writes non-ASCII characters as-is.
output_path = "/content/drive/MyDrive/flores2_ao.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=4, default=make_serializable)


In [None]:
import pandas as pd
# Two-column table: the sentence and its predictions.
# NOTE(review): presumably each `ner_results` cell is a list of entity dicts, which
# to_csv writes as its string form — confirm this CSV layout is what is wanted.
data = [{"text": entry["text"], "ner_results": entry["ner_results"]} for entry in results]
df = pd.DataFrame(data)


# Save as CSV
output_path = "/content/drive/MyDrive/flores2_ao.csv"  # Update with your desired path
df.to_csv(output_path, index=False, encoding="utf-8")

In [None]:
# Human-readable dump: one block per sentence, separated by a dashed rule.
report_path = "/content/drive/MyDrive/flores2_ao.txt"
with open(report_path, "w", encoding="utf-8") as f:
    for entry in results:
        f.write(f"Text: {entry['text']}\n")
        f.write(f"NER Results: {entry['ner_results']}\n")
        f.write("-" * 50 + "\n")

# Report the file that was actually written rather than a fixed, unrelated name.
print(f"Results saved to {report_path}")

Results saved to ner_results.txt


In [None]:
# Read back the predictions stored as JSON
with open("/content/drive/MyDrive/flores2_ao.json", "r", encoding="utf-8") as f:
    ner_data = json.load(f)

# Keep the surface form of every entity tagged "PER", in document order
per_entities = [
    ent["word"]
    for entry in ner_data
    for ent in entry.get("ner_results", [])
    if ent.get("entity_group") == "PER"
]

# One entity per line
output_file = "/content/drive/MyDrive/flores2_per_entities_afanoromo.txt"
with open(output_file, "w", encoding="utf-8") as f:
    f.writelines(word + "\n" for word in per_entities)

print(f"✅ Extracted {len(per_entities)} 'PER' entities. Saved to {output_file}.")

✅ Extracted 411 'PER' entities. Saved to /content/drive/MyDrive/flores2_per_entities_afanoromo.txt.


# **NLLB**

In [None]:
import random

# Define file paths
input_file = '/content/drive/MyDrive/om.txt'
output_file = '/content/drive/MyDrive/om_final.txt'

# Define the target number of sentences
target_sentences = 1400000
# Fixed seed so every run draws the same subset.
sample_seed = 42

# Read the large dataset
# NOTE(review): errors='ignore' silently drops undecodable bytes — confirm that
# losing those characters is acceptable for this corpus.
with open(input_file, 'r', encoding='utf-8', errors='ignore') as f:
    all_sentences = f.readlines()

# A final line without a newline would be glued to the next sampled sentence
# when written out, so terminate it first.
if all_sentences and not all_sentences[-1].endswith('\n'):
    all_sentences[-1] += '\n'

# random.sample raises ValueError when asked for more items than exist, so never
# request more sentences than the file holds.
sample_size = min(target_sentences, len(all_sentences))
selected_sentences = random.Random(sample_seed).sample(all_sentences, sample_size)

# Write the selected sentences to a new file
with open(output_file, 'w', encoding='utf-8') as f:
    f.writelines(selected_sentences)

# Report the number actually written, which can be below the target.
print(f"Successfully saved {sample_size} sentences to {output_file}")

Successfully saved 1400000 sentences to /content/drive/MyDrive/om_final.txt


In [None]:
import re

# Source (sampled sentences) and destination (cleaned sentences) files.
input_file = "/content/drive/MyDrive/om_final.txt"
output_file = "/content/drive/MyDrive/om_final_cleaned_dataset.txt"

# Stream line by line so the whole file is never held in memory.
with open(input_file, 'r', encoding='utf-8') as infile, \
     open(output_file, 'w', encoding='utf-8') as outfile:

    for line in infile:
        # Remove all characters except letters and spaces
        # NOTE(review): this also strips apostrophes, digits and punctuation, all of
        # which occur in the NER training sentences printed earlier in this notebook
        # (e.g. "yaa'iin", "6ffaa") — confirm the mismatch is intended.
        clean_line = re.sub(r'[^A-Za-z\s]', '', line)
        # Optionally remove extra spaces and strip leading/trailing whitespace
        clean_line = re.sub(r'\s+', ' ', clean_line).strip()
        # A line left empty by the cleaning is still written, as a blank line.
        outfile.write(clean_line + '\n')


In [None]:
# Loads the whole file into memory, then writes consecutive slices of it.
def split_file_into_chunks(file_path, output_prefix, num_chunks):
    """Split a text file into ``num_chunks`` files of consecutive lines.

    Chunk ``i`` (1-based) is written to ``f"{output_prefix}_chunk_{i}.txt"``.
    Each chunk holds ``total_lines // num_chunks`` lines, except the last one,
    which also receives the remainder.

    Args:
        file_path: UTF-8 text file to split.
        output_prefix: Path prefix for the chunk files; its directory part is
            created when missing.
        num_chunks: Number of chunk files to write; must be at least 1.

    Raises:
        ValueError: If ``num_chunks`` is smaller than 1.
    """
    import os

    # Zero would fail with a bare ZeroDivisionError and a negative count would
    # silently write nothing, so reject both up front.
    if num_chunks < 1:
        raise ValueError(f"num_chunks must be at least 1, got {num_chunks}")

    with open(file_path, 'r', encoding='utf-8') as f:
        sentences = f.readlines()

    # Allow the prefix to point into a folder that does not exist yet.
    output_dir = os.path.dirname(output_prefix)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    total_sentences = len(sentences)
    chunk_size = total_sentences // num_chunks

    for i in range(num_chunks):
        start = i * chunk_size
        # The last chunk runs to the end so no line is lost to integer division.
        end = (i + 1) * chunk_size if i != num_chunks - 1 else total_sentences
        chunk = sentences[start:end]

        with open(f"{output_prefix}_chunk_{i+1}.txt", 'w', encoding='utf-8') as out_f:
            out_f.writelines(chunk)

# Example usage:
# Write the chunks inside the split_oromo_files/ folder: the inference cell reads
# ".../split_oromo_files/split_oromo_files_chunk_15.txt", which a prefix directly
# under MyDrive would never produce.
import os

chunk_dir = "/content/drive/MyDrive/split_oromo_files"
os.makedirs(chunk_dir, exist_ok=True)
split_file_into_chunks(
    "/content/drive/MyDrive/om_final_cleaned_dataset.txt",
    os.path.join(chunk_dir, "split_oromo_files"),
    20,
)


In [None]:
from transformers import pipeline

# File paths
input_path = "/content/drive/MyDrive/split_oromo_files/split_oromo_files_chunk_15.txt"
output_path = "/content/drive/MyDrive/output_15_res_orm.txt"
raw_output_path = "/content/drive/MyDrive/output_15_raw_orm.txt"  # Now saving readable text

# Model checkpoint
# NOTE(review): this folder differs from the `output_dir` given to TrainingArguments
# earlier in the notebook — confirm the checkpoint was copied here.
checkpoint = "/content/drive/MyDrive/afri-berta-ao-large/checkpoint-380"

# Initialize the pipeline
token_classifier = pipeline(
    "token-classification",
    model=checkpoint,
    tokenizer=checkpoint,
    aggregation_strategy="simple",
    device=0
)
# The tokenizer declares no maximum length, so the pipeline would pass over-long
# lines through untruncated (see the "no maximum length is provided" warning).
# 512 is the value given on the AfriBERTa model card; longer inputs are cut there.
token_classifier.tokenizer.model_max_length = 512

# Entity merge logic (no '##' assumption)
def merge_entities(ner_results):
    """Fuse directly adjacent spans of the same entity group into one entity.

    Two consecutive entries are fused when they share "entity_group" and the
    second one's "start" equals the first one's "end". The fused entry
    concatenates the "word" values, extends "end", and keeps the higher "score".

    Args:
        ner_results: Sequence of entity dicts with the keys "entity_group",
            "word", "start", "end" and "score".

    Returns:
        A new list of entity dicts; the input dicts are copied, not modified.
    """
    if not ner_results:
        return []

    first, *remaining = ner_results
    merged = []
    current = first.copy()

    for candidate in remaining:
        if (
            candidate["entity_group"] != current["entity_group"]
            or candidate["start"] != current["end"]
        ):
            # Different type or a gap in the text: close the running span.
            merged.append(current)
            current = candidate.copy()
            continue

        current["word"] += candidate["word"]
        current["end"] = candidate["end"]
        current["score"] = max(current["score"], candidate["score"])

    merged.append(current)
    return merged

# Read input lines
# Blank lines are dropped here, so the "[Line N]" numbers written below count
# non-blank lines only, not lines of the input file.
with open(input_path, "r", encoding="utf-8") as f:
    lines = [line.strip() for line in f if line.strip()]

# PER entity strings from all lines, duplicates included.
per_entities = []

# Open human-readable raw output file
with open(raw_output_path, "w", encoding="utf-8") as raw_out_f:
    for idx, line in enumerate(lines):
        # Best effort: a line that fails is reported and skipped so that one bad
        # input does not abort the whole chunk.
        try:
            raw_output = token_classifier(line)

            # Write plain readable results
            raw_out_f.write(f"[Line {idx+1}] {line}\n")
            for ent in raw_output:
                raw_out_f.write(
                    f"  - {ent['entity_group']}: {ent['word']} "
                    f"(score={ent['score']:.2f}, start={ent['start']}, end={ent['end']})\n"
                )
            raw_out_f.write("\n")

            # Merge and extract PER entities
            merged_output = merge_entities(raw_output)
            per_entities.extend([e["word"].strip() for e in merged_output if e["entity_group"] == "PER"])
        except Exception as e:
            print(f"[Warning] Line {idx+1} failed: {e}")

# Write extracted PER entities to output file
with open(output_path, "w", encoding="utf-8") as f:
    for word in per_entities:
        f.write(word + "\n")

print(f"✅ Saved human-readable NER output to: {raw_output_path}")
print(f"✅ Extracted {len(per_entities)} PER entities (including duplicates) to: {output_path}")


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


✅ Saved human-readable NER output to: /content/drive/MyDrive/output_15_raw_orm.txt
✅ Extracted 22737 PER entities (including duplicates) to: /content/drive/MyDrive/output_15_res_orm.txt


In [None]:
import os

# Set the folder where your .txt files are located
folder_path = '/content/drive/MyDrive/oromo'

# List all .txt files in the folder
txt_files = [f for f in os.listdir(folder_path) if f.endswith('.txt')]

# Sort files if needed (optional)
txt_files.sort()

# Output file path
# Given as a plain absolute path: os.path.join() discards `folder_path` when the
# following component is absolute, so joining the two only looked as if the file
# were written inside the input folder.
output_file = '/content/drive/MyDrive/afaoromo_merged_file1.txt'

# Merge all the files
with open(output_file, 'w', encoding='utf-8') as outfile:
    for fname in txt_files:
        file_path = os.path.join(folder_path, fname)
        with open(file_path, 'r', encoding='utf-8') as infile:
            outfile.write(infile.read())
            # Newline between files so the last line of one file does not run
            # into the first line of the next.
            outfile.write('\n')

print("Merge complete! Output file:", output_file)


Merge complete! Output file: /content/drive/MyDrive/afaoromo_merged_file1.txt


Merge File

In [None]:
import os

# Placeholder: point this at the folder that holds the .txt files to merge.
folder_path = 'path/to/your/folder'

output_name = 'afaoromo_merged_file.txt'
output_file = os.path.join(folder_path, output_name)

# The merged file is written into the same folder and also ends in ".txt".
# Leave it out of the inputs, otherwise a second run would read the previous
# output back into the file it is currently writing.
txt_files = sorted(
    f for f in os.listdir(folder_path)
    if f.endswith('.txt') and f != output_name
)

with open(output_file, 'w', encoding='utf-8') as outfile:
    for fname in txt_files:
        file_path = os.path.join(folder_path, fname)
        with open(file_path, 'r', encoding='utf-8') as infile:
            outfile.write(infile.read())
            outfile.write('\n')  # Optional: adds a newline between files

print("Merge complete! Output file:", output_file)
