In [None]:
from datasets import load_dataset


In [None]:
import os

# Show where the kernel is running and what sits next to it, so the relative
# data paths used further down can be sanity-checked.
working_dir = os.getcwd()
dir_entries = os.listdir()
print("Current working directory:", working_dir)
print("Files:", dir_entries)


In [None]:
from pathlib import Path

# Location of the labeled data file. The path is relative, so it is resolved
# against the kernel's current working directory when the file is opened.
file_path = "../data/labeled_telegram_product_price_location.txt"

def parse_conll(filepath):
    """Read a CoNLL-style file into parallel lists of sentences.

    Each non-blank line is split on whitespace; the first field is taken as
    the token and the second as its tag. Blank lines separate sentences.
    Lines with fewer than two fields are skipped.

    Args:
        filepath: Path to a UTF-8 encoded text file.

    Returns:
        A ``(tokens, ner_tags)`` tuple. ``tokens`` is a list of sentences,
        each a list of token strings; ``ner_tags`` has the same shape and
        holds the tag string for every token.

    Raises:
        OSError: If the file cannot be opened.
    """
    tokens, ner_tags = [], []
    token_list, tag_list = [], []

    def flush():
        # Move the sentence collected so far (if any) into the results.
        nonlocal token_list, tag_list
        if token_list:
            tokens.append(token_list)
            ner_tags.append(tag_list)
            token_list, tag_list = [], []

    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading BOM, which
    # would otherwise be glued onto the first token of the file.
    with open(filepath, encoding="utf-8-sig") as f:
        for line in f:
            line = line.strip()
            if not line:
                flush()
                continue
            parts = line.split()
            if len(parts) >= 2:
                token_list.append(parts[0])
                tag_list.append(parts[1])

    # A file that does not end with a blank line still has a pending sentence;
    # without this final flush that sentence would be lost.
    flush()

    return tokens, ner_tags

# Parse the labeled file into two parallel lists: one list of token strings
# and one list of tag strings per sentence.
tokens, tags = parse_conll(file_path)


In [None]:
# Build the label vocabulary: every distinct tag string, in sorted order, is
# assigned its position in that order as an integer id.
unique_labels = sorted({tag for sentence_tags in tags for tag in sentence_tags})
label2id = dict(zip(unique_labels, range(len(unique_labels))))
id2label = dict(enumerate(unique_labels))

# Encode each sentence's tag strings as the matching integer ids.
tags_ids = [list(map(label2id.__getitem__, sentence_tags)) for sentence_tags in tags]


In [None]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Hold out 20% of the sentences for validation; the fixed random_state keeps
# the split identical between runs.
split_parts = train_test_split(tokens, tags_ids, test_size=0.2, random_state=42)
train_tokens, val_tokens, train_tags, val_tags = split_parts

# Pair each split name with its (tokens, tag ids) columns, then wrap every
# pair in a Dataset with the column names used by the rest of the notebook.
split_columns = {
    "train": (train_tokens, train_tags),
    "validation": (val_tokens, val_tags),
}
dataset = DatasetDict({
    split_name: Dataset.from_dict({"tokens": split_tokens, "ner_tags": split_tags})
    for split_name, (split_tokens, split_tags) in split_columns.items()
})

# Confirm working
print(dataset)
first_train_tag_ids = dataset["train"][0]["ner_tags"]
print("Example labels:", [id2label[tag_id] for tag_id in first_train_tag_ids])


In [None]:
# Label names in the insertion order of label2id, plus how many there are.
label_list = [*label2id]
num_labels = len(label2id)


In [None]:
!pip install sentencepiece


In [None]:
# Sanity check: reaching the print below only shows that the `sentencepiece`
# package could be imported in this kernel; nothing else is exercised.
import sentencepiece
print("✅ SentencePiece is working!")


In [None]:
!pip install protobuf


In [None]:
# Imported here because this cell runs before the notebook's main transformers
# import cell; without it a fresh kernel fails with NameError on AutoTokenizer.
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Load the pretrained multilingual NER checkpoint and re-head it for this
# notebook's label set.
model_name = "Davlan/bert-base-multilingual-cased-ner-hrl"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    # The checkpoint already carries a classification head sized for its own
    # label set. When num_labels differs, the head weights cannot be loaded
    # as-is; this flag lets the head be re-initialised instead of raising.
    ignore_mismatched_sizes=True,
)


In [None]:
# analysis_and_finetune.ipynb

# Step 1: Install necessary libraries
!pip install transformers datasets seqeval -q

# Step 2: Import required modules
import pandas as pd
from datasets import load_dataset, Dataset, ClassLabel, Sequence
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, Trainer, TrainingArguments
from transformers import pipeline
import numpy as np
from sklearn.metrics import classification_report

# Step 3: Load your CoNLL formatted dataset
# Assuming you have a .txt file: 'labeled_amharic.conll'
from datasets import load_dataset

# NOTE(review): `data_files` is not read by any code visible in this notebook;
# the data actually used comes from `file_path` via parse_conll. Kept in case
# another cell relies on it.
data_files = {"train": "labeled_amharic_train.conll", "validation": "labeled_amharic_val.conll"}
file_path = "../data/labeled_telegram_product_price_location.txt"


# Step 4: Define labels
# `dataset` was built with Dataset.from_dict from plain integer lists, so its
# "ner_tags" feature is not a ClassLabel and has no `.names` attribute.
# Derive the ordered label names from the id2label mapping instead, so that
# label_list[i] is the name of label id i.
label_list = [id2label[label_id] for label_id in sorted(id2label)]
num_labels = len(label_list)

# Step 5: Load pretrained tokenizer and model
model_name = "Davlan/xlm-roberta-base-ner-hrl"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    # Store the notebook's own label names in the model config so saved
    # checkpoints and pipelines report them instead of generic LABEL_n names.
    id2label=id2label,
    label2id=label2id,
    # The checkpoint's existing classification head is sized for its own
    # label set; allow it to be re-initialised when num_labels differs.
    ignore_mismatched_sizes=True,
)

# Step 6: Tokenize and align labels
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word labels to sub-tokens.

    Args:
        examples: A batch with a "tokens" entry (list of word lists) and a
            "ner_tags" entry (list of per-word label id lists).

    Returns:
        The tokenizer output for the batch with an added "labels" entry: one
        list per sentence, with one label per produced sub-token. Positions
        that map to no word (word id ``None``) get -100; every other position
        gets the label of the word it belongs to, so all sub-tokens of a word
        share that word's label.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    encoded["labels"] = [
        [
            -100 if word_id is None else word_labels[word_id]
            for word_id in encoded.word_ids(batch_index=row)
        ]
        for row, word_labels in enumerate(examples["ner_tags"])
    ]
    return encoded

import inspect

# Apply the tokenize-and-align step to every split, one batch at a time.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# Step 7: Set up training arguments
# transformers renamed `evaluation_strategy` to `eval_strategy`, and newer
# releases reject the old name. Pick whichever keyword the installed version
# accepts so the notebook runs on both.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)
args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    **{_eval_strategy_key: "epoch"},  # evaluate on the validation split after every epoch
)

# Step 8: Define Trainer
# The collator pads inputs and labels of each batch to a common length.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Same situation for Trainer: newer transformers releases take the tokenizer
# as `processing_class` and deprecate the `tokenizer` keyword.
_tokenizer_key = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    **{_tokenizer_key: tokenizer},
)

# Step 9: Train the model
trainer.train()

# Step 10: Evaluate
# trainer.predict returns (predictions, label_ids, metrics); the metrics part
# is not used here.
predictions, labels, _ = trainer.predict(tokenized_dataset["validation"])
# Highest-scoring label id per token position.
preds = np.argmax(predictions, axis=2)

# Drop every position whose gold label is -100 (the sentinel assigned during
# alignment) and map the remaining ids back to label names.
true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
true_preds = [[label_list[p] for (p, l) in zip(pred, label) if l != -100] for pred, label in zip(preds, labels)]

# The classification_report imported in this notebook is sklearn's, which
# expects flat 1-D label sequences and raises on nested per-sentence lists.
# Flatten to token level before reporting.
# NOTE(review): this yields token-level scores; entity-level scoring would need
# a sequence-labeling metric instead — confirm which one is wanted.
flat_true_labels = [tag for sentence in true_labels for tag in sentence]
flat_true_preds = [tag for sentence in true_preds for tag in sentence]

print(classification_report(flat_true_labels, flat_true_preds))

# Optional: write the fine-tuned model and its tokenizer into one directory so
# they can be reloaded together later.
save_dir = "saved_model_amharic_ner"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
