<a href="https://colab.research.google.com/github/BlenSeleshi/LLM/blob/task-1/test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install transformers datasets
!pip install datasets
!pip install pandas

In [None]:
from datasets import Dataset
import os

def read_conll_file(filepath):
    """Read a CoNLL-style file into parallel per-sentence token/label lists.

    Every non-blank line is split on whitespace; the first column is taken as
    the token and the last column as its label, so files that carry extra
    columns between the two are accepted. Blank (or whitespace-only) lines
    mark sentence boundaries.

    Args:
        filepath: Path to a UTF-8 encoded CoNLL file.

    Returns:
        A ``(tokens, labels)`` tuple. Both are lists of lists of strings, one
        inner list per sentence, with ``labels[i][j]`` being the label of
        ``tokens[i][j]``.

    Raises:
        ValueError: If a non-blank line has fewer than two columns. The
            message names the file and the 1-based line number.
    """
    tokens = []
    labels = []
    current_sentence_tokens = []
    current_sentence_labels = []

    with open(filepath, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            columns = line.split()

            if not columns:  # Blank line: end of sentence
                # Consecutive blank lines must not produce empty sentences.
                if current_sentence_tokens:
                    tokens.append(current_sentence_tokens)
                    labels.append(current_sentence_labels)
                    current_sentence_tokens = []
                    current_sentence_labels = []
                continue

            if len(columns) < 2:
                raise ValueError(
                    f"{filepath}:{line_number}: expected 'TOKEN ... LABEL', "
                    f"got {line.rstrip()!r}"
                )

            current_sentence_tokens.append(columns[0])
            current_sentence_labels.append(columns[-1])

    if current_sentence_tokens:  # Last sentence when the file has no trailing blank line
        tokens.append(current_sentence_tokens)
        labels.append(current_sentence_labels)

    return tokens, labels

# Parse the merged annotation file into per-sentence token and tag lists.
tokens, labels = read_conll_file('merged_output.conll')

# Wrap the two parallel lists as the columns of a Hugging Face Dataset.
dataset = Dataset.from_dict(dict(tokens=tokens, ner_tags=labels))


In [None]:

# Maps every raw tag spelling found in the annotations (mixed case, stray
# quotes/brackets, typos) to its canonical label.
label_normalization = {
    # Outside / non-entity
    'O': 'O',
    "'O'": 'O',
    "'O']": 'O',
    'IO': 'O',
    # NOTE(review): phone tags are collapsed into 'O' -- presumably phone
    # entities are out of scope for this model; confirm.
    'I-PHONE': 'O',
    # Price
    "B-PRICE": 'B-PRICE',
    "B-Price": 'B-PRICE',
    "'B-PRICE']": 'B-PRICE',
    "I-PRICE": 'I-PRICE',
    "I-Price": 'I-PRICE',
    "'I-PRICE']": 'I-PRICE',
    # Location
    "B-LOC": 'B-LOC',
    "I-LOC": 'I-LOC',
    "'I-LOC']": 'I-LOC',
    # Product
    "B-PRODUCT": 'B-PRODUCT',
    "B-Product": 'B-PRODUCT',
    "'B-PRODUCT']": 'B-PRODUCT',
    "I-PRODUCT": 'I-PRODUCT',
    "I-Product": 'I-PRODUCT',
    "'I-PRODUCT']": 'I-PRODUCT',
}

# Canonical labels; a label's position in this list is its integer class id.
# Sorted because iterating a set of strings yields a different order from one
# interpreter session to the next (string hash randomization), which would
# silently change the label <-> id mapping between runs.
label_list = sorted(set(label_normalization.values()))


In [None]:
from transformers import AutoTokenizer

# Tokenizer for the "bert-base-cased" checkpoint; tokenize_and_align_labels
# reads this module-level name.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align NER tags to sub-tokens.

    Args:
        examples: Batch mapping with ``'tokens'`` (one list of words per
            sentence) and ``'ner_tags'`` (one list of raw tag strings per
            sentence, parallel to the words).

    Returns:
        The tokenizer output for the batch (padded to the maximum length and
        truncated) with an extra ``'labels'`` entry holding one list of
        integer ids per sentence. The first sub-token of each word gets the
        id of the word's normalized tag in ``label_list`` (tags missing from
        ``label_normalization`` fall back to ``'O'``); special tokens and
        the remaining sub-tokens of a word get ``-100``.
    """
    encoded = tokenizer(
        examples['tokens'],
        padding='max_length',
        truncation=True,
        is_split_into_words=True,
    )

    aligned = []
    for batch_idx, sentence_tags in enumerate(examples['ner_tags']):
        last_word = None
        row = []
        for word in encoded.word_ids(batch_index=batch_idx):
            if word is None or word == last_word:
                # Special token, or a continuation piece of the current word.
                row.append(-100)
            else:
                # First piece of a new word: carries the word's label id.
                canonical_tag = label_normalization.get(sentence_tags[word], 'O')
                row.append(label_list.index(canonical_tag))
            last_word = word
        aligned.append(row)

    encoded['labels'] = aligned
    return encoded

# Tokenize the whole dataset and attach the aligned `labels` column.
# batched=True hands tokenize_and_align_labels a batch of sentences per call.
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)


In [None]:

# 80/20 train/eval split sizes.
dataset_size = len(tokenized_datasets)
train_size = int(0.8 * dataset_size)
eval_size = dataset_size - train_size

# Shuffle once with a fixed seed and slice the same permutation twice, instead
# of shuffling the full dataset separately for each split.
shuffled_datasets = tokenized_datasets.shuffle(seed=42)
small_train_dataset = shuffled_datasets.select(range(train_size))
small_eval_dataset = shuffled_datasets.select(range(train_size, dataset_size))

# The two splits must partition the dataset.
assert len(small_train_dataset) + len(small_eval_dataset) == dataset_size

In [None]:
from transformers import AutoModelForTokenClassification

# Token-classification head on top of bert-base-cased, one output per label.
# id2label / label2id are stored in the model config so a saved checkpoint
# reports the real tag names instead of generic LABEL_<n> placeholders.
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(label_list),
    id2label=dict(enumerate(label_list)),
    label2id={label: idx for idx, label in enumerate(label_list)},
)


In [None]:
! pip install evaluate

In [None]:
# Now proceed with the rest of the code
from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate

# Load the "accuracy" metric from the `evaluate` library; compute_metrics
# calls metric.compute(...) on this module-level object.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute token-level accuracy over the labelled positions only.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` carries the class
            scores on its last axis; ``labels`` holds the gold class ids with
            ``-100`` at positions that must not be scored.

    Returns:
        Whatever ``metric.compute`` returns for the flattened predictions and
        references.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    # Boolean mask of positions that carry a real label. Indexing with it
    # flattens all sentences into one 1-D sequence, which is the input shape
    # the metric expects (it rejects per-sentence nested lists).
    scored = labels != -100

    return metric.compute(
        predictions=predictions[scored],
        references=labels[scored],
    )



In [None]:
# Hyperparameters and run configuration for fine-tuning.
training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    # `eval_strategy` is used instead of `evaluation_strategy` to avoid the warning.
    eval_strategy="epoch",
)

# Bundle the model, the data splits and the metric callback into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=small_eval_dataset,
    train_dataset=small_train_dataset,
)

In [None]:
# Run fine-tuning with the Trainer configured above (3 epochs, with
# eval_strategy="epoch" set in the training arguments).
trainer.train()

Epoch,Training Loss,Validation Loss
