# Fine-Tuning


# 1. Data Preparation

In [None]:
import pandas as pd

# Read the exported Telegram spreadsheet into a DataFrame
df = pd.read_excel('telegram_data.xlsx')

# Collect every non-null entry of the 'Message' column as a plain Python list
# (rows whose 'Message' cell is empty are dropped; no language check is done)
amharic_messages = df['Message'].dropna().tolist()

# 2. Annotation Guidelines

### Define our entity types based on what appears in our data:

* Products 
* Prices 
* Locations/Addresses 
* Phone numbers
* Measurements 

# 3. Data Annotation
* Annotate the data in BIO format using the Doccano tool.

# 4. Model Selection
### For Amharic NER, use AfroXLMRoberta (better suited to African languages)

In [None]:
# Identifier of the pretrained checkpoint selected for Amharic NER
model_name = "Davlan/afro-xlmr-base"

# 5. Training Code Example

In [None]:
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
    pipeline,
)

# Load the tokenizer and a token-classification model from the checkpoint
# chosen in the "Model Selection" cell, so the model id lives in one place.
# NOTE(review): `num_labels` is not defined in the cells shown here; it must be
# set beforehand and should equal the number of BIO tags in the annotated data.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    save_steps=10_000,
    save_total_limit=2,
    learning_rate=2e-5,
)

# Pad input ids and label sequences together, batch by batch. Without a
# token-classification collator, examples of different lengths cannot be
# stacked into one batch.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Create trainer
# NOTE(review): `train_dataset` and `eval_dataset` are not built in the cells
# shown here; presumably they are tokenized datasets with aligned label ids.
# NOTE(review): no evaluation schedule is configured above, so `eval_dataset`
# is presumably only consumed by an explicit `trainer.evaluate()` call - confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

# Start training
trainer.train()

# 6. Preprocessing Recommendation

In [None]:
import re

# Ethiopic numerals are additive/multiplicative rather than positional:
# U+1369-U+1371 are the ones (1-9), U+1372-U+137A the tens (10-90), and
# U+137B / U+137C multiply whatever precedes them by 100 / 10,000.
_ETHIOPIC_HUNDRED = "\u137b"
_ETHIOPIC_MYRIAD = "\u137c"
_ETHIOPIC_VALUES = {chr(0x1369 + i): i + 1 for i in range(9)}
_ETHIOPIC_VALUES.update({chr(0x1372 + i): 10 * (i + 1) for i in range(9)})

# Grammar of one well-formed numeral. Matching well-formed numerals (instead
# of any run of numeral characters) keeps stray digits written side by side
# mapping one-to-one, exactly as a plain per-character replace would.
_ONES = "[\u1369-\u1371]"
_TENS = "[\u1372-\u137a]"
_PAIR = f"(?:{_TENS}{_ONES}?|{_ONES})"
_GROUP = f"(?:{_PAIR}?{_ETHIOPIC_HUNDRED}{_PAIR}?|{_PAIR})"
_ETHIOPIC_NUMERAL_RE = re.compile(
    # The lookahead guarantees every match consumes at least one character.
    f"(?=[\u1369-\u137c])(?:{_GROUP}?{_ETHIOPIC_MYRIAD})*{_GROUP}?"
)


def _ethiopic_numeral_to_int(numeral):
    """Return the integer value of one well-formed Ethiopic numeral."""
    total = 0      # value already scaled by one or more ten-thousand signs
    hundreds = 0   # value scaled by the hundred sign in the current group
    pair = 0       # tens + ones still waiting for a multiplier (or the end)
    for ch in numeral:
        if ch == _ETHIOPIC_HUNDRED:
            # A bare hundred sign means 1 x 100.
            hundreds += (pair or 1) * 100
            pair = 0
        elif ch == _ETHIOPIC_MYRIAD:
            # A bare ten-thousand sign means 1 x 10,000.
            total = ((total + hundreds + pair) or 1) * 10_000
            hundreds = pair = 0
        else:
            pair += _ETHIOPIC_VALUES[ch]
    return total + hundreds + pair


def preprocess_amharic_text(text: str) -> str:
    """Normalize numerals, price formatting and whitespace in Amharic text.

    Args:
        text: The raw message text.

    Returns:
        The text with Ethiopic numerals rewritten as Arabic digits, exactly
        one space between a number and a following 'ብር', and every run of
        whitespace (including line breaks) collapsed to a single space.
    """
    # Normalize Ethiopic numbers first so the price pattern below, which
    # matches \d, can see them.
    text = _ETHIOPIC_NUMERAL_RE.sub(
        lambda match: str(_ethiopic_numeral_to_int(match.group())), text
    )

    # Standardize price formats
    text = re.sub(r'(\d+)\s*ብር', r'\1 ብር', text)

    # Remove excessive whitespace and line breaks
    text = ' '.join(text.split())

    return text