<a href="https://colab.research.google.com/github/Elish-Ab/AI-Mastery-10x-Week5/blob/main/Task_3_week_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
!pip install transformers datasets



In [33]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from datasets import Dataset
import numpy as np
from sklearn.metrics import f1_score

In [35]:
# Load your CoNLL format data into a DataFrame
def load_conll_data(file_path):
    sentences = []
    labels = []
    current_sentence = []
    current_labels = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                # Split the line into token and label
                try:
                    token, label = line.split()
                    if label not in label2id:  # Check if label is recognized
                        print(f"Skipping unexpected label: {label}")
                        continue  # Skip this line if the label is not recognized
                except ValueError:
                    print(f"Skipping line due to unexpected format: {line}")
                    continue

                current_sentence.append(token)
                current_labels.append(label2id[label])  # Convert label to id

            else:
                if current_sentence:
                    sentences.append(current_sentence)
                    labels.append(current_labels)
                    current_sentence = []
                    current_labels = []

    return sentences, labels

# Load data
file_path = 'labeled_dataset.conll'  # Change this to your actual file path
sentences, labels = load_conll_data(file_path)

# Create a DataFrame
df = pd.DataFrame({'tokens': sentences, 'ner_tags': labels})

# Print DataFrame to debug
print(df.head())  # Check the first few entries to ensure data is loaded correctly

# Map labels to IDs
label2id = {
    "B-Product": 0,
    "I-Product": 1,
    "B-LOC": 2,
    "I-LOC": 3,
    "B-PRICE": 4,
    "I-PRICE": 5,
    "O": 6
}

# Using a pre-trained tokenizer and model
model_name = "xlm-roberta-base"  # You can replace this with another pre-trained model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label2id))

# Tokenize and align the labels with the tokens
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        padding="max_length",
        max_length=128,
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_id = None
        label_ids = []
        for word_id in word_ids:
            if word_id is None:
                label_ids.append(-100)  # Ignore tokens without labels
            elif word_id != previous_word_id:
                label_ids.append(label[word_id])  # Label for the first token in a word
            else:
                label_ids.append(-100)  # Ignore subsequent tokens in a word
            previous_word_id = word_id
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


# Convert DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Apply tokenization and label alignment
tokenized_data = dataset.map(tokenize_and_align_labels, batched=True)

# Split the dataset into training and validation sets using Hugging Face's method
train_test_data = tokenized_data.train_test_split(test_size=0.2)
train_dataset = train_test_data['train']
val_dataset = train_test_data['test']


# Load and preprocess data here...

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=2)
    true_labels = p.label_ids
    # Flatten the predictions and true labels
    true_labels_flat = true_labels.flatten()
    preds_flat = preds.flatten()

    # Remove ignored index (special tokens)
    mask = true_labels_flat != -100
    true_labels_flat = true_labels_flat[mask]
    preds_flat = preds_flat[mask]

    f1 = f1_score(true_labels_flat, preds_flat, average='weighted')
    return {'f1': f1}

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='steps',
    save_steps=500,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained('./fine_tuned_model')
tokenizer.save_pretrained('./fine_tuned_model')


Skipping unexpected label: B-Price
Skipping unexpected label: U-PRICE
Skipping line due to unexpected format: 0902660722
Skipping line due to unexpected format: ሴራሚክ
Skipping line due to unexpected format: 6ፍሬ
                                              tokens  \
0  [የህፃናት, መመገቢያ, ጡጦ, ዋጋ፦, 400, ብርውስን, አድራሻቁ1መገናኛ...   
1  [የሞተ, ቆዳን, እንዲሁም, ቆሻሻን, ለማፅዳት, ተመራጭዋጋ፦, 200, ብ...   
2  [ከብረት, የተሰራጫማ, እና, የተለያዩ, ዕቃወች, ማስቀመጫ, ለመገጣጠም,...   
3  [ዘመናዊ, የልብስ, ማስቀመጫ, ቁምሳጥን, በቀላሉ, የሚገጣጠም, የሚነቃቀ...   
4  [ከፍተኛ, ጥራትእስከ, 30, እንቁላል, መያዝ, የሚችል, ፍሪጅዎን, በስ...   

                                            ner_tags  
0  [0, 1, 1, 4, 5, 5, 2, 3, 3, 3, 3, 3, 3, 3, 3, ...  
1  [0, 1, 1, 1, 1, 4, 5, 5, 5, 2, 3, 3, 3, 3, 3, ...  
2  [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 5, 5, ...  
3  [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...  
4  [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...  


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/41 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss


('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/sentencepiece.bpe.model',
 './fine_tuned_model/added_tokens.json',
 './fine_tuned_model/tokenizer.json')