In [None]:
# Mount Google Drive at /content/drive; every dataset and output path used
# later in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install datasets



In [None]:
!pip install transformers accelerate seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-non

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score

In [None]:
def load_conll_format(file_path):
    """Read a CoNLL-style annotation file into parallel token and tag lists.

    Every non-blank line describes one token: whitespace-separated columns
    whose first entry is the token and whose last entry is its NER tag (extra
    middle columns, e.g. POS tags, are ignored). Blank lines separate
    sentences; consecutive blank lines do not produce empty sentences.

    Args:
        file_path: Path of the UTF-8 encoded annotation file.

    Returns:
        A ``(sentences, labels)`` tuple. ``sentences[i]`` is the list of
        tokens of the i-th sentence and ``labels[i]`` the matching tag list.

    Raises:
        ValueError: If a non-blank line has fewer than two columns.
    """
    sentences = []
    labels = []
    tokens = []
    ner_tags = []

    with open(file_path, "r", encoding="utf-8") as file:
        for line_number, raw_line in enumerate(file, start=1):
            columns = raw_line.split()

            if not columns:  # Blank line: close the current sentence, if any
                if tokens:
                    sentences.append(tokens)
                    labels.append(ner_tags)
                    tokens = []
                    ner_tags = []
                continue

            if len(columns) < 2:
                # Point at the offending row instead of failing with a bare
                # tuple-unpacking error.
                raise ValueError(
                    f"{file_path}:{line_number}: expected '<token> <tag>', "
                    f"got {raw_line.strip()!r}"
                )

            tokens.append(columns[0])
            ner_tags.append(columns[-1])

    # The file may end without a trailing blank line.
    if tokens:
        sentences.append(tokens)
        labels.append(ner_tags)
    return sentences, labels

# Parse the annotated corpus from the mounted Drive into two parallel lists:
# `sentences` (token lists) and `ner_tags` (tag lists).
file_path = "/content/drive/MyDrive/Tigrigna-NER/Tigrinya-NER-Dataset.txt"  # Replace with your file's path
sentences, ner_tags = load_conll_format(file_path)

# Print the first sentence next to its tags as a quick parsing sanity check
print("First Sentence:", sentences[0])
print("First Tags:", ner_tags[0])


First Sentence: ['ገለ', 'መሰኻኽር', 'ሓያሎ', 'ጠያይት', 'ክትኮስ', 'ከምዝሰምዑ', 'ክገለጹ', 'እንከለዉ', 'ሓደ', 'ናይ', 'ዓይኒ', 'ምስክር', 'ድማ', 'ኦቶማቲክ', 'ብረት', 'ዝሓዘ', 'ሰብ', 'ክትኩስ', 'ከምዝራኣየ', 'ተዛሪቡ', '።']
First Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [None]:
# Inspect the second parsed sentence alongside its tags.
print("Second Sentence:", sentences[1])
print("Second Tags:", ner_tags[1])

Second Sentence: ['ነቲ', 'ኣብ', 'ከተማ', 'ኢንዲያናፖሰስ', 'ዝተገብረ', 'ጃምላዊ', 'ቅትለት', 'ከምዝፈጸመ', 'ዝንገረሉ', 'ዘሎ', 'ሰብ', 'በይኑ', 'ዝፈጸሞ', 'ከይኮነ', 'ከምዘይተርፍ', 'ጸብጻባት', 'ሓቢሮም', '።']
Second Tags: ['O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [None]:
from datasets import Dataset

# One record per sentence: its tokens plus the tag sequence of equal length.
data = [
    {"tokens": sentence_tokens, "ner_tags": sentence_tags}
    for sentence_tokens, sentence_tags in zip(sentences, ner_tags)
]

# Wrap the records in a Hugging Face Dataset.
hf_dataset = Dataset.from_list(data)

# Show the first record to confirm the column layout.
print(hf_dataset[0])

{'tokens': ['ገለ', 'መሰኻኽር', 'ሓያሎ', 'ጠያይት', 'ክትኮስ', 'ከምዝሰምዑ', 'ክገለጹ', 'እንከለዉ', 'ሓደ', 'ናይ', 'ዓይኒ', 'ምስክር', 'ድማ', 'ኦቶማቲክ', 'ብረት', 'ዝሓዘ', 'ሰብ', 'ክትኩስ', 'ከምዝራኣየ', 'ተዛሪቡ', '።'], 'ner_tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}


In [None]:
# Tag inventory (update with the actual tags in your dataset). A tag's
# position in this list is its integer class id.
label2id = {
    label: index
    for index, label in enumerate(
        [
            "O",
            "B-LOC", "I-LOC",
            "B-PER", "I-PER",
            "B-ORG", "I-ORG",
            "B-DATE", "I-DATE",
            "B-MISC", "I-MISC",
        ]
    )
}
# Inverse lookup: class id -> tag string.
id2label = dict(zip(label2id.values(), label2id.keys()))
num_labels = len(label2id)

# Attach an integer-encoded copy of the tags to every example.
hf_dataset = hf_dataset.map(
    lambda example: {"ner_tags_ids": [label2id[name] for name in example["ner_tags"]]}
)

# Show the first example with its new `ner_tags_ids` column.
print(hf_dataset[0])

Map:   0%|          | 0/5703 [00:00<?, ? examples/s]

{'tokens': ['ገለ', 'መሰኻኽር', 'ሓያሎ', 'ጠያይት', 'ክትኮስ', 'ከምዝሰምዑ', 'ክገለጹ', 'እንከለዉ', 'ሓደ', 'ናይ', 'ዓይኒ', 'ምስክር', 'ድማ', 'ኦቶማቲክ', 'ብረት', 'ዝሓዘ', 'ሰብ', 'ክትኩስ', 'ከምዝራኣየ', 'ተዛሪቡ', '።'], 'ner_tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], 'ner_tags_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

# 80/10/10 split: first hold out 20% of the sentences, then cut that held-out
# part in half to obtain the validation and test sets. Both calls use a fixed
# seed so the partition is reproducible.
train_sentences, temp_sentences, train_tags, temp_tags = train_test_split(
    sentences,
    ner_tags,
    test_size=0.2,
    random_state=42,
)
val_sentences, test_sentences, val_tags, test_tags = train_test_split(
    temp_sentences,
    temp_tags,
    test_size=0.5,
    random_state=42,
)

# Turn each split into a list of {"tokens", "ner_tags"} records ...
train_data = [dict(tokens=toks, ner_tags=labs) for toks, labs in zip(train_sentences, train_tags)]
val_data = [dict(tokens=toks, ner_tags=labs) for toks, labs in zip(val_sentences, val_tags)]
test_data = [dict(tokens=toks, ner_tags=labs) for toks, labs in zip(test_sentences, test_tags)]

# ... and each record list into a Hugging Face Dataset.
train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)
test_dataset = Dataset.from_list(test_data)


In [None]:
# Bundle the three splits under the keys "train", "validation" and "test" so
# they can be transformed together with a single DatasetDict.map call.
dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset,
})

# Print the split names, columns and row counts
print(dataset_dict)


DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 4562
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 570
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 571
    })
})


In [None]:
# Re-bind the split variables from dataset_dict.
# NOTE(review): these names were already assigned when the splits were built,
# so this cell looks redundant — confirm before removing.
train_dataset = dataset_dict["train"]
val_dataset = dataset_dict["validation"]
test_dataset = dataset_dict["test"]

In [None]:
# Show the first record of each split to confirm tokens and tags stayed paired.
print("First train example:", dataset_dict["train"][0])
print("First validation example:", dataset_dict["validation"][0])
print("First test example:", dataset_dict["test"][0])

First train example: {'tokens': ['ኤል', 'ሳዳዊ', 'ወለዳ', 'ጓል', '10', 'ዓመት', 'እንከላ', 'ከመርዑውዋ', 'ፈቲኖም', 'ሓንጊዳ', 'ክትኣብዮም', 'እንከላ', 'ኣዲኣ', 'ኣብ', 'ጎና', 'ኮይና', 'ደጊፋታ', '።'], 'ner_tags': ['B-ORG', 'I-ORG', 'O', 'O', 'B-DATE', 'I-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}
First validation example: {'tokens': ['ካብዚ', 'ሓሊፉ', 'ፕሮጀክት', 'ሪድ', 'ቱ', 'ምስ', 'ትካል', 'ገባሪ', 'ሰናይ', 'ክሪኤቲቭ', 'ኣሶሼት', 'ኢንተርናሽናል', 'ብምትሕብባር', 'ካብ', 'ወርሒ', 'መጋቢት', '2011', 'ዓ/ም', 'ጀሚሩ', 'ኣብ', '241', 'ቀዳማይ', 'ብርኪ', 'ኣብያተ', 'ትምህርቲ', 'ወረዳታት', 'ራያ', 'ዓዘቦ', 'ጋንታ', 'ኣፈሹም', 'ታሕታይ', 'ማይጨውን', 'ወልቃይትን', 'ክትግበር', 'ፀኒሑ', '።'], 'ner_tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'B-DATE', 'I-DATE', 'I-DATE', 'I-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'B-LOC', 'B-LOC', 'O', 'O', 'O']}
First test example: {'tokens': ['ተራ', 'ዜጋታት', 'ምበር', 'ላዕለዎት', 'መራሕቲ', 'ኣይኮኑን', 'ዝጉድኡ', '።'], 'ner_tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}


In [None]:
# Single source of truth for the pretrained checkpoint: the tokenizer here and
# the model later are both loaded from `model_name`.
model_name = "castorini/afriberta_base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Set the tokenizer's maximum sequence length explicitly.
tokenizer.model_max_length = 512

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/257 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/729 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/1.55M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]



In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word tags to sub-tokens.

    The batch's string tags are first converted to ids with the module-level
    ``label2id`` and stored on ``examples`` under ``"ner_tags_ids"``. The
    sentences are then encoded with the module-level ``tokenizer``, padded
    and truncated to 128 positions. Only the first sub-token of each word
    carries that word's label id; special tokens, padding and continuation
    sub-tokens receive -100 so they are ignored in the loss calculation.

    Args:
        examples: Batch mapping with ``"tokens"`` (list of word lists) and
            ``"ner_tags"`` (list of tag-string lists).

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry
        holding one aligned label-id list per sentence.
    """
    examples["ner_tags_ids"] = [
        [label2id[tag_name] for tag_name in sentence_tags]
        for sentence_tags in examples["ner_tags"]
    ]

    tokenized_inputs = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=128,
    )

    def align(word_ids, word_labels):
        # Map one sentence's word-level label ids onto its sub-token positions.
        aligned = []
        last_word_id = None
        for word_id in word_ids:
            starts_new_word = word_id is not None and word_id != last_word_id
            aligned.append(word_labels[word_id] if starts_new_word else -100)
            last_word_id = word_id
        return aligned

    tokenized_inputs["labels"] = [
        align(tokenized_inputs.word_ids(batch_index=row), word_labels)
        for row, word_labels in enumerate(examples["ner_tags_ids"])
    ]
    return tokenized_inputs


In [None]:
def compute_metrics(p):
    """Score token-classification predictions with seqeval.

    Args:
        p: A ``(predictions, labels)`` pair. The class with the highest score
            along axis 2 of ``predictions`` is taken as the predicted id;
            ``labels`` holds gold ids, with -100 marking positions to skip.

    Returns:
        Dict with ``"precision"``, ``"recall"``, ``"f1"`` and ``"accuracy"``
        computed over the positions whose gold label is not -100.
    """
    scores, gold_ids = p
    predicted_ids = scores.argmax(axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real gold label.
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([id2label[pred] for pred, _ in kept])
        true_labels.append([id2label[gold] for _, gold in kept])

    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "accuracy": accuracy_score(true_labels, true_predictions),
    }

In [None]:
# Tokenize and label-align every split in batches; the raw "tokens" and
# "ner_tags" string columns are dropped from the result.
tokenized_datasets = dataset_dict.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=["tokens", "ner_tags"]
)

Map:   0%|          | 0/4562 [00:00<?, ? examples/s]

Map:   0%|          | 0/570 [00:00<?, ? examples/s]

Map:   0%|          | 0/571 [00:00<?, ? examples/s]

In [None]:
# Interactive Hugging Face Hub login (renders a widget).
# NOTE(review): presumably needed for the trainer.push_to_hub call further down — confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
training_args = TrainingArguments(
    # --- Where checkpoints and logs go ---
    output_dir="/content/drive/MyDrive/Tigrigna-NER/afri-berta-tigrigna",
    logging_dir="./logs",
    logging_steps=10,
    push_to_hub=False,
    # --- Optimisation schedule ---
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # --- Evaluate and checkpoint once per epoch, then reload the checkpoint
    #     with the highest validation F1 when training finishes ---
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

In [None]:
# Load the pretrained encoder with a token-classification head sized to the
# tag inventory; the id<->label maps are passed to the model configuration.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at castorini/afriberta_base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Wire the model, the tokenized train/validation splits and the seqeval
# metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # `processing_class` supersedes the deprecated `tokenizer=` argument.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1746,0.183134,0.655819,0.661692,0.658742,0.940848
2,0.1317,0.167072,0.665133,0.719403,0.691205,0.944505
3,0.1011,0.160442,0.713307,0.725373,0.71929,0.950722
4,0.0833,0.167918,0.70463,0.757214,0.729976,0.951362


Non-default generation parameters: {'max_length': 512}
Non-default generation parameters: {'max_length': 512}
Non-default generation parameters: {'max_length': 512}
Non-default generation parameters: {'max_length': 512}


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1746,0.183134,0.655819,0.661692,0.658742,0.940848
2,0.1317,0.167072,0.665133,0.719403,0.691205,0.944505
3,0.1011,0.160442,0.713307,0.725373,0.71929,0.950722
4,0.0833,0.167918,0.70463,0.757214,0.729976,0.951362
5,0.059,0.16813,0.718456,0.759204,0.738268,0.952825


Non-default generation parameters: {'max_length': 512}


TrainOutput(global_step=1430, training_loss=0.14626189945461032, metrics={'train_runtime': 415.826, 'train_samples_per_second': 54.855, 'train_steps_per_second': 3.439, 'total_flos': 993502363015680.0, 'train_loss': 0.14626189945461032, 'epoch': 5.0})

In [None]:
# Score the trained model on the held-out test split, which was not passed to
# the Trainer for training or per-epoch evaluation.
test_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
print(test_results)

{'eval_loss': 0.16543681919574738, 'eval_precision': 0.725912067352666, 'eval_recall': 0.7585532746823069, 'eval_f1': 0.7418738049713193, 'eval_accuracy': 0.9538298248717495, 'eval_runtime': 3.1823, 'eval_samples_per_second': 179.429, 'eval_steps_per_second': 11.313, 'epoch': 5.0}


In [None]:
# Upload the trained model to the Hugging Face Hub with the given commit message.
trainer.push_to_hub(commit_message="Training complete")

Non-default generation parameters: {'max_length': 512}


Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

sentencepiece.bpe.model:   0%|          | 0.00/1.55M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/444M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Elu-dan/afri-berta-tigrigna/commit/1d977757ce3b2336e93cf6fcda334be87e1fa577', commit_message='Training complete', commit_description='', oid='1d977757ce3b2336e93cf6fcda334be87e1fa577', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Elu-dan/afri-berta-tigrigna', endpoint='https://huggingface.co', repo_type='model', repo_id='Elu-dan/afri-berta-tigrigna'), pr_revision=None, pr_num=None)

In [None]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "Elu-dan/afri-berta-tigrigna"
# Build a token-classification pipeline from the Hub checkpoint; with
# aggregation_strategy="simple" the results are grouped entities
# (see the `entity_group` keys in the output).
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
# Smoke test on a single sentence
token_classifier("ኣንድሬ ቡሉዋ፡ ኣፍሪቃ ዝረስዓታ ጅግና ተቓላሲት ናጽነት::")

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/444M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/1.55M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.98M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Device set to use cuda:0


[{'entity_group': 'PER',
  'score': np.float32(0.94669515),
  'word': 'ኣንድሬ ቡሉዋ',
  'start': 0,
  'end': 8}]

In [None]:
# Second smoke test; the printed result lists every detected entity group.
result1 = token_classifier("ቀዳማይ ሚኒስተር ኣብይ ኣሕመድ፡ ናብ ስልጣን ምስደየበ፡ ድሕሪ ሒደት መዓልታት ናብ ካይሮ’ዩ ኣምሪሑ።")

print(result1)

[{'entity_group': 'PER', 'score': np.float32(0.9633071), 'word': 'ኣብይ ኣሕመድ፡', 'start': 10, 'end': 20}, {'entity_group': 'DATE', 'score': np.float32(0.8109887), 'word': 'ድሕሪ ሒደት መዓልታት', 'start': 35, 'end': 49}, {'entity_group': 'LOC', 'score': np.float32(0.9808646), 'word': 'ካይሮ', 'start': 52, 'end': 56}]


# **HornMT Dataset**





In [None]:
file_path = "/content/drive/MyDrive/Tigrigna-NER/tir.txt"

# Keep one whitespace-trimmed sentence per non-blank line of the corpus file.
with open(file_path, "r", encoding="utf-8") as file:
    texts = [
        stripped
        for stripped in map(str.strip, file.read().splitlines())
        if stripped
    ]

In [None]:
from tqdm import tqdm

# Tag every sentence and keep the source text next to its predictions.
# NOTE(review): the pipeline is invoked once per sentence; the run log warns
# about sequential GPU use, so batching may be faster — confirm before changing.
results = []
for text in tqdm(texts, desc="Processing"):
    ner_result = token_classifier(text)
    results.append({"text": text, "ner_results": ner_result})

Processing:   0%|          | 0/2030 [00:00<?, ?it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Processing: 100%|██████████| 2030/2030 [00:25<00:00, 78.42it/s]


In [None]:
import json
import numpy as np

# Helper function to make objects JSON serializable
def make_serializable(obj):
    """``default`` hook for ``json.dump`` that converts NumPy values to plain Python.

    Args:
        obj: A value the JSON encoder could not serialize on its own.

    Returns:
        A Python ``float``, ``int`` or ``bool`` for the matching NumPy scalar
        of any width, or a (possibly nested) list for an ``ndarray``.

    Raises:
        TypeError: If ``obj`` is neither a supported NumPy scalar nor an array.
    """
    # Check the abstract scalar families so every width (float16/32/64,
    # int8..int64, ...) is covered, not just np.float32.
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj)} is not JSON serializable")

# Write all predictions to one JSON file. ensure_ascii=False keeps the text
# as readable characters instead of \u escapes, and make_serializable converts
# the NumPy values the encoder cannot handle.
output_path = "/content/drive/MyDrive/ner_results_hornmt_tigrigna.json"  # Update with your desired path
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=4, default=make_serializable)



In [None]:
import pandas as pd
# Store each sentence's entity list as a JSON string so the CSV cell can be
# read back with json.loads. Writing the raw list would emit its Python repr,
# including np.float32(...) wrappers, which is not machine-readable.
data = [
    {
        "text": entry["text"],
        "ner_results": json.dumps(
            entry["ner_results"], ensure_ascii=False, default=make_serializable
        ),
    }
    for entry in results
]
df = pd.DataFrame(data)

# Save as CSV
output_path = "/content/drive/MyDrive/ner_results_hornmt_tigrigna.csv"  # Update with your desired path
df.to_csv(output_path, index=False, encoding="utf-8")

In [None]:
# Human-readable dump: the sentence, its predictions, then a dashed separator.
txt_output_path = "/content/drive/MyDrive/hornmt_ner_results_tigrigna.txt"
with open(txt_output_path, "w", encoding="utf-8") as f:
    for entry in results:
        f.write(f"Text: {entry['text']}\n")
        f.write(f"NER Results: {entry['ner_results']}\n")
        f.write("-" * 50 + "\n")

# Report the file that was actually written (the previous message named
# "ner_results.txt", which is not the output path).
print(f"Results saved to {txt_output_path}")

Results saved to ner_results.txt


In [None]:
# Re-read the predictions that were serialised to JSON above.
with open("/content/drive/MyDrive/ner_results_hornmt_tigrigna.json", "r", encoding="utf-8") as f:
    ner_data = json.load(f)

# Collect the surface form of every span labelled "PER", in file order.
per_entities = []
for entry in ner_data:
    per_entities += [
        ent["word"]
        for ent in entry.get("ner_results", [])
        if ent.get("entity_group") == "PER"
    ]

# Write one entity per line.
output_file = "/content/drive/MyDrive/hornmt_per_entities_tigrigna.txt"
with open(output_file, "w", encoding="utf-8") as f:
    for word in per_entities:
        print(word, file=f)

print(f"✅ Extracted {len(per_entities)} 'PER' entities. Saved to {output_file}.")

✅ Extracted 1325 'PER' entities. Saved to /content/drive/MyDrive/hornmt_per_entities_tigrigna.txt.


# **Flores 1 Dataset**

https://github.com/facebookresearch/flores/tree/main/flores200

In [None]:
from transformers import pipeline
from tqdm import tqdm
import json
import numpy as np

from transformers import pipeline

# Re-create the NER pipeline from the Hub checkpoint so this section can run
# without the training cells above.
model_checkpoint = "Elu-dan/afri-berta-tigrigna"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cuda:0


In [None]:
file_path = "/content/drive/MyDrive/Tigrigna-NER/tir_Ethi.dev"  # Update with the actual path to the file

# Keep one whitespace-trimmed sentence per non-blank line of the corpus file.
with open(file_path, "r", encoding="utf-8") as file:
    texts = [
        stripped
        for stripped in map(str.strip, file.read().splitlines())
        if stripped
    ]

In [None]:
# Tag every sentence and keep the source text next to its predictions.
# NOTE(review): the pipeline is invoked once per sentence; the run log warns
# about sequential GPU use, so batching may be faster — confirm before changing.
results = []
for text in tqdm(texts, desc="Processing Sentences"):
    ner_result = token_classifier(text)
    results.append({"text": text, "ner_results": ner_result})

Processing Sentences:   0%|          | 1/997 [00:00<05:57,  2.78it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Processing Sentences: 100%|██████████| 997/997 [00:09<00:00, 102.05it/s]


In [None]:
def make_serializable(obj):
    """``default`` hook for ``json.dump`` that converts NumPy values to plain Python.

    Args:
        obj: A value the JSON encoder could not serialize on its own.

    Returns:
        A Python ``float``, ``int`` or ``bool`` for the matching NumPy scalar
        of any width, or a (possibly nested) list for an ``ndarray``.

    Raises:
        TypeError: If ``obj`` is neither a supported NumPy scalar nor an array.
    """
    # Check the abstract scalar families so every width (float16/32/64,
    # int8..int64, ...) is covered, not just np.float32.
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj)} is not JSON serializable")

# Write all predictions to one JSON file. ensure_ascii=False keeps the text
# as readable characters instead of \u escapes, and make_serializable converts
# the NumPy values the encoder cannot handle.
output_path = "/content/drive/MyDrive/ner_results_flores1.json"  # Update with your desired path
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=4, default=make_serializable)

In [None]:
import pandas as pd
# Store each sentence's entity list as a JSON string so the CSV cell can be
# read back with json.loads. Writing the raw list would emit its Python repr,
# including np.float32(...) wrappers, which is not machine-readable.
data = [
    {
        "text": entry["text"],
        "ner_results": json.dumps(
            entry["ner_results"], ensure_ascii=False, default=make_serializable
        ),
    }
    for entry in results
]
df = pd.DataFrame(data)

# Save as CSV
output_path = "/content/drive/MyDrive/ner_results_flores1.csv"  # Update with your desired path
df.to_csv(output_path, index=False, encoding="utf-8")

In [None]:
# Human-readable dump: the sentence, its predictions, then a dashed separator.
txt_output_path = "/content/drive/MyDrive/flores1_ner_results.txt"
with open(txt_output_path, "w", encoding="utf-8") as f:
    for entry in results:
        f.write(f"Text: {entry['text']}\n")
        f.write(f"NER Results: {entry['ner_results']}\n")
        f.write("-" * 50 + "\n")

# Report the file that was actually written (the previous message named
# "ner_results.txt", which is not the output path).
print(f"Results saved to {txt_output_path}")

Results saved to ner_results.txt


In [None]:
# Re-read the predictions that were serialised to JSON above.
with open("/content/drive/MyDrive/ner_results_flores1.json", "r", encoding="utf-8") as f:
    ner_data = json.load(f)

# Collect the surface form of every span labelled "PER", in file order.
per_entities = []
for entry in ner_data:
    per_entities += [
        ent["word"]
        for ent in entry.get("ner_results", [])
        if ent.get("entity_group") == "PER"
    ]

# Write one entity per line.
output_file = "/content/drive/MyDrive/flores1_per_entities_tigrigna.txt"
with open(output_file, "w", encoding="utf-8") as f:
    for word in per_entities:
        print(word, file=f)

print(f"✅ Extracted {len(per_entities)} 'PER' entities. Saved to {output_file}.")

✅ Extracted 297 'PER' entities. Saved to /content/drive/MyDrive/flores1_per_entities_tigrigna.txt.


# **Flores 2 Dataset**

In [None]:
from transformers import pipeline
from tqdm import tqdm
import json
import numpy as np

from transformers import pipeline

# Re-create the NER pipeline from the Hub checkpoint so this section can run
# without the training cells above.
model_checkpoint = "Elu-dan/afri-berta-tigrigna"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cuda:0


In [None]:
file_path = "/content/drive/MyDrive/Tigrigna-NER/tir_Ethi.devtest"  # Update with the actual path to the file

# Keep one whitespace-trimmed sentence per non-blank line of the corpus file.
with open(file_path, "r", encoding="utf-8") as file:
    texts = [
        stripped
        for stripped in map(str.strip, file.read().splitlines())
        if stripped
    ]

In [None]:
# Tag every sentence and keep the source text next to its predictions.
# NOTE(review): the pipeline is invoked once per sentence; the run log warns
# about sequential GPU use, so batching may be faster — confirm before changing.
results = []
for text in tqdm(texts, desc="Processing Sentences"):
    ner_result = token_classifier(text)
    results.append({"text": text, "ner_results": ner_result})

Processing Sentences:   0%|          | 1/1012 [00:00<03:50,  4.39it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Processing Sentences: 100%|██████████| 1012/1012 [00:13<00:00, 75.55it/s]


In [None]:
def make_serializable(obj):
    """``default`` hook for ``json.dump`` that converts NumPy values to plain Python.

    Args:
        obj: A value the JSON encoder could not serialize on its own.

    Returns:
        A Python ``float``, ``int`` or ``bool`` for the matching NumPy scalar
        of any width, or a (possibly nested) list for an ``ndarray``.

    Raises:
        TypeError: If ``obj`` is neither a supported NumPy scalar nor an array.
    """
    # Check the abstract scalar families so every width (float16/32/64,
    # int8..int64, ...) is covered, not just np.float32.
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj)} is not JSON serializable")

# Write all predictions to one JSON file. ensure_ascii=False keeps the text
# as readable characters instead of \u escapes, and make_serializable converts
# the NumPy values the encoder cannot handle.
output_path = "/content/drive/MyDrive/ner_results_flores2.json"  # Update with your desired path
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=4, default=make_serializable)

In [None]:
import pandas as pd
# Store each sentence's entity list as a JSON string so the CSV cell can be
# read back with json.loads. Writing the raw list would emit its Python repr,
# including np.float32(...) wrappers, which is not machine-readable.
data = [
    {
        "text": entry["text"],
        "ner_results": json.dumps(
            entry["ner_results"], ensure_ascii=False, default=make_serializable
        ),
    }
    for entry in results
]
df = pd.DataFrame(data)

# Save as CSV
output_path = "/content/drive/MyDrive/ner_results_flores2.csv"  # Update with your desired path
df.to_csv(output_path, index=False, encoding="utf-8")

In [None]:
# Human-readable dump: the sentence, its predictions, then a dashed separator.
txt_output_path = "/content/drive/MyDrive/flores2_ner_results.txt"
with open(txt_output_path, "w", encoding="utf-8") as f:
    for entry in results:
        f.write(f"Text: {entry['text']}\n")
        f.write(f"NER Results: {entry['ner_results']}\n")
        f.write("-" * 50 + "\n")

# Report the file that was actually written (the previous message named
# "ner_results.txt", which is not the output path).
print(f"Results saved to {txt_output_path}")

Results saved to ner_results.txt


In [None]:
# Re-read the predictions that were serialised to JSON above.
with open("/content/drive/MyDrive/ner_results_flores2.json", "r", encoding="utf-8") as f:
    ner_data = json.load(f)

# Collect the surface form of every span labelled "PER", in file order.
per_entities = []
for entry in ner_data:
    per_entities += [
        ent["word"]
        for ent in entry.get("ner_results", [])
        if ent.get("entity_group") == "PER"
    ]

# Write one entity per line.
output_file = "/content/drive/MyDrive/flores2_per_entities_tigrigna.txt"
with open(output_file, "w", encoding="utf-8") as f:
    for word in per_entities:
        print(word, file=f)

print(f"✅ Extracted {len(per_entities)} 'PER' entities. Saved to {output_file}.")

✅ Extracted 308 'PER' entities. Saved to /content/drive/MyDrive/flores2_per_entities_tigrigna.txt.


# **NLLB**

In [None]:
import os

def split_text_dataset(input_file, output_dir, num_files=20):
    """Split a text file into ``num_files`` consecutive chunks of near-equal size.

    The input is read fully into memory and cut by line count. When the line
    count is not divisible by ``num_files``, the first ``remainder`` chunks
    receive one extra line. Chunks are written as ``output_1.txt`` ...
    ``output_<num_files>.txt`` inside ``output_dir``; if there are fewer lines
    than files, the trailing files are empty.

    Args:
        input_file: Path of the UTF-8 text file to split.
        output_dir: Directory for the chunk files; created if missing.
        num_files: Number of chunk files to produce (at least 1).

    Raises:
        ValueError: If ``num_files`` is smaller than 1.
    """
    # Reject counts that would otherwise divide by zero (0) or silently
    # write nothing (negative values).
    if num_files < 1:
        raise ValueError(f"num_files must be at least 1, got {num_files}")

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    with open(input_file, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    lines_per_file, remainder = divmod(len(lines), num_files)

    start = 0
    for i in range(num_files):
        # The first `remainder` chunks absorb the leftover lines, one each.
        end = start + lines_per_file + (1 if i < remainder else 0)

        output_file = os.path.join(output_dir, f'output_{i+1}.txt')
        with open(output_file, 'w', encoding='utf-8') as out_file:
            out_file.writelines(lines[start:end])

        start = end

    print(f"Dataset split into {num_files} files in '{output_dir}'.")

# Example usage: cut the corpus file into 20 chunk files named
# output_1.txt ... output_20.txt inside `output_dir`.
input_file = '/content/drive/MyDrive/ti.txt'  # Path to your input text file
output_dir = '/content/drive/MyDrive/split_files'  # Directory to save the split files
split_text_dataset(input_file, output_dir, num_files=20)

Dataset split into 20 files in '/content/drive/MyDrive/split_files'.


In [None]:
from transformers import pipeline
from tqdm import tqdm
import json
import numpy as np

from transformers import pipeline

# Re-create the NER pipeline from the Hub checkpoint so this section can run
# without the training cells above.
model_checkpoint = "Elu-dan/afri-berta-tigrigna"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cuda:0


In [None]:
# NOTE(review): the split cell writes its chunks to
# /content/drive/MyDrive/split_files, while this path points into
# Tigrigna-NER — confirm the chunk was moved there.
file_path = "/content/drive/MyDrive/Tigrigna-NER/output_20.txt"  # Update with the actual path to the file

# Keep one whitespace-trimmed sentence per non-blank line of the chunk file.
with open(file_path, "r", encoding="utf-8") as file:
    texts = [
        stripped
        for stripped in map(str.strip, file.read().splitlines())
        if stripped
    ]

# Tag every sentence and keep the source text next to its predictions.
# NOTE(review): the pipeline is invoked once per sentence; the run log warns
# about sequential GPU use, so batching may be faster — confirm before changing.
results = []
for text in tqdm(texts, desc="Processing Sentences"):
    ner_result = token_classifier(text)
    results.append({"text": text, "ner_results": ner_result})

Processing Sentences:   0%|          | 8/70982 [00:00<40:30, 29.20it/s]  You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Processing Sentences: 100%|██████████| 70982/70982 [09:57<00:00, 118.85it/s]


In [None]:
def make_serializable(obj):
    """``default`` hook for ``json.dump`` that converts NumPy values to plain Python.

    Args:
        obj: A value the JSON encoder could not serialize on its own.

    Returns:
        A Python ``float``, ``int`` or ``bool`` for the matching NumPy scalar
        of any width, or a (possibly nested) list for an ``ndarray``.

    Raises:
        TypeError: If ``obj`` is neither a supported NumPy scalar nor an array.
    """
    # Check the abstract scalar families so every width (float16/32/64,
    # int8..int64, ...) is covered, not just np.float32.
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj)} is not JSON serializable")

# Write all predictions to one JSON file. ensure_ascii=False keeps the text
# as readable characters instead of \u escapes, and make_serializable converts
# the NumPy values the encoder cannot handle.
output_path = "/content/drive/MyDrive/output_20.json"  # Update with your desired path
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=4, default=make_serializable)

In [None]:
import pandas as pd
# Store each sentence's entity list as a JSON string so the CSV cell can be
# read back with json.loads. Writing the raw list would emit its Python repr,
# including np.float32(...) wrappers, which is not machine-readable.
data = [
    {
        "text": entry["text"],
        "ner_results": json.dumps(
            entry["ner_results"], ensure_ascii=False, default=make_serializable
        ),
    }
    for entry in results
]
df = pd.DataFrame(data)

# Save as CSV
output_path = "/content/drive/MyDrive/output_20.csv"  # Update with your desired path
df.to_csv(output_path, index=False, encoding="utf-8")

In [None]:
# Human-readable dump: the sentence, its predictions, then a dashed separator.
txt_output_path = "/content/drive/MyDrive/nllb_output20.txt"
with open(txt_output_path, "w", encoding="utf-8") as f:
    for entry in results:
        f.write(f"Text: {entry['text']}\n")
        f.write(f"NER Results: {entry['ner_results']}\n")
        f.write("-" * 50 + "\n")

# Report the file that was actually written (the previous message named
# "ner_results.txt", which is not the output path).
print(f"Results saved to {txt_output_path}")

Results saved to ner_results.txt


In [None]:
# Re-read the predictions that were serialised to JSON above.
with open("/content/drive/MyDrive/output_20.json", "r", encoding="utf-8") as f:
    ner_data = json.load(f)

# Collect the surface form of every span labelled "PER", in file order.
per_entities = []
for entry in ner_data:
    per_entities += [
        ent["word"]
        for ent in entry.get("ner_results", [])
        if ent.get("entity_group") == "PER"
    ]

# Write one entity per line.
output_file = "/content/drive/MyDrive/nllb_output_20_per.txt"
with open(output_file, "w", encoding="utf-8") as f:
    for word in per_entities:
        print(word, file=f)

print(f"✅ Extracted {len(per_entities)} 'PER' entities. Saved to {output_file}.")

Extracted 19128 'PER' entities. Saved to /content/drive/MyDrive/nllb_output20_per.txt.
