<a href="https://colab.research.google.com/github/BelayAbAb/Centralized-Telegram-E-Commerce-Platform-for-EthioMart_I/blob/Task1%262/Fine-tuning%20NER%20Model_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# Step 1: Install Required Libraries
!pip install datasets transformers scikit-learn

# Step 2: Import Libraries
import os
from datasets import Dataset
from transformers import XLMRobertaTokenizerFast, XLMRobertaForTokenClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

# Step 3: Model Selection
model_name = "xlm-roberta-base"
tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name)

# Step 4: Dataset Loading
def load_conll_dataset(file_path):
    dataset = []
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        tokens, labels = [], []
        for line in lines:
            if line.strip():
                parts = line.split('\t')
                if len(parts) == 2:
                    tokens.append(parts[0])
                    labels.append(parts[1].strip())
            else:
                if tokens:  # End of a sentence
                    dataset.append((tokens, labels))
                    tokens, labels = [], []
    return dataset

# Load the dataset from Google Drive or upload directly
from google.colab import files
uploaded = files.upload()  # You can upload your .conll file directly here

# Assuming you upload the file named 'labeled_data_all_channels.conll'
data = load_conll_dataset('labeled_data_all_channels.conll')
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_list([{'tokens': tokens, 'labels': labels} for tokens, labels in train_data])
val_dataset = Dataset.from_list([{'tokens': tokens, 'labels': labels} for tokens, labels in val_data])

# Step 5: Tokenization and Label Alignment
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True, padding='max_length', max_length=128)
    labels = []

    # Create a mapping from labels to IDs
    label_to_id = {label: idx for idx, label in enumerate(set(label for sublist in examples['labels'] for label in sublist))}

    for i, label in enumerate(examples['labels']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = [-100] * len(tokenized_inputs['input_ids'][i])  # -100 is used to ignore certain tokens

        for j, label_id in enumerate(label):
            if j < len(word_ids) and word_ids[j] is not None:  # Adjusted index check
                label_ids[word_ids[j]] = label_to_id[label_id]  # Convert label to ID

        labels.append(label_ids)

    tokenized_inputs['labels'] = labels
    return tokenized_inputs

# Tokenize the datasets
train_tokenized = train_dataset.map(tokenize_and_align_labels, batched=True)
val_tokenized = val_dataset.map(tokenize_and_align_labels, batched=True)

# Step 6: Training Configuration
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Step 7: Model Training
num_labels = len(set(label for _, labels in train_data for label in labels))
model = XLMRobertaForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
)

trainer.train()

# Step 8: Performance Evaluation
trainer.evaluate()

# Step 9: Model Saving
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")




Saving labeled_data_all_channels.conll to labeled_data_all_channels (4).conll


Map:   0%|          | 0/801 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]



model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.005538
2,No log,0.002842


Epoch,Training Loss,Validation Loss
1,No log,0.005538
2,No log,0.002842
3,No log,0.001494


('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/sentencepiece.bpe.model',
 './fine_tuned_model/added_tokens.json',
 './fine_tuned_model/tokenizer.json')