<a href="https://colab.research.google.com/github/BlenSeleshi/LLM/blob/task-1/test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
! pip install transformers datasets
!pip install datasets
!pip install pandas



In [13]:
from datasets import Dataset
import os

def read_conll_file(filepath):
    tokens = []
    labels = []
    current_sentence_tokens = []
    current_sentence_labels = []

    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip() == "":
                if current_sentence_tokens:
                    tokens.append(current_sentence_tokens)
                    labels.append(current_sentence_labels)
                    current_sentence_tokens = []
                    current_sentence_labels = []
            else:
                token, label = line.split()
                current_sentence_tokens.append(token)
                current_sentence_labels.append(label)

    if current_sentence_tokens:
        tokens.append(current_sentence_tokens)
        labels.append(current_sentence_labels)

    return tokens, labels

tokens, labels = read_conll_file('merged_output.conll')

dataset = Dataset.from_dict({'tokens': tokens, 'ner_tags': labels})


In [14]:
from datasets import Dataset

# Function to get basic information about the dataset
def get_dataset_info(tokens, labels):
    num_sentences = len(tokens)
    num_tokens = sum([len(sentence) for sentence in tokens])
    unique_labels = set([label for sentence_labels in labels for label in sentence_labels])

    print(f"Number of sentences: {num_sentences}")
    print(f"Total number of tokens: {num_tokens}")
    print(f"Unique labels: {unique_labels}")
    print(f"Number of unique labels: {len(unique_labels)}")


get_dataset_info(tokens, labels)


Number of sentences: 53671
Total number of tokens: 2511357
Unique labels: {'I-PHONE', 'B-Price', 'O', 'I-LOC', 'B-PRICE', 'I-PRODUCT', "'O']", 'B-Product', "'B-PRODUCT']", 'I-Product', 'OO', "'I-LOC']", 'I-PRICE', "'I-PRODUCT']", "'I-PRICE']", 'I-Price', 'B-PRODUCT', 'B-LOC', "'B-PRICE']", 'IO'}
Number of unique labels: 20


In [15]:
label_normalization = {
    'O': 'O',
    '0': 'O',
    "'O'": 'O',
    "'O']": 'O',
    'IO': 'O',
    'I-PHONE': 'O',
    'OO':'O',
    "B-PRICE": 'B-PRICE',
    "B-Price": 'B-PRICE',
    "I-PRICE": 'I-PRICE',
    "I-Price": 'I-PRICE',
    "B-LOC": 'B-LOC',
    "I-LOC": 'I-LOC',
    "B-PRODUCT": 'B-PRODUCT',
    "I-PRODUCT": 'I-PRODUCT',
    "B-Product": 'B-PRODUCT',
    "I-Product": 'I-PRODUCT',
    "'I-LOC']": 'I-LOC',
    "'I-PRODUCT']": 'I-PRODUCT',
    "'I-PRICE']": 'I-PRICE',
    "'B-PRODUCT']": 'B-PRODUCT',
    "'B-PRICE']": 'B-PRICE'
}

label_list = sorted(list(set(label_normalization.values())))


In [16]:
from transformers import AutoTokenizer, XLMRobertaForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

def normalize_label(label):
    return label_normalization.get(label, 'O')

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['tokens'], padding='max_length', truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Padding token label
            elif word_idx != previous_word_idx:
                try:
                    # Normalize and index the label
                    normalized_label = normalize_label(label[word_idx])
                    label_ids.append(label_list.index(normalized_label))
                except ValueError:
                    # Handle ValueError when the label is not in the label list
                    print(f"Label '{label[word_idx]}' not found in label list. Using 'O' (Outside) label.")
                    label_ids.append(label_list.index('O'))
            else:
                label_ids.append(-100)  # For sub-tokens
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs['labels'] = labels
    return tokenized_inputs

# Map the function to the dataset
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)




Map:   0%|          | 0/53671 [00:00<?, ? examples/s]

In [17]:

dataset_size = len(tokenized_datasets)


train_size = int(0.8 * dataset_size)
eval_size = dataset_size - train_size


small_train_dataset = tokenized_datasets.shuffle(seed=42).select(range(train_size))
small_eval_dataset = tokenized_datasets.shuffle(seed=42).select(range(train_size, train_size + eval_size))

In [18]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=len(label_list)
)


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
! pip install evaluate



In [20]:
import numpy as np
import evaluate

# Load the evaluation metric
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    true_predictions = [
        [p for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [l for l in label if l != -100]
        for label in labels
    ]

    # Safely convert predictions and labels to indices, handling errors
    pred_indices = []
    label_indices = []

    for pred, lab in zip(true_predictions, true_labels):
        pred_index = []
        label_index = []

        for p in pred:
            try:
                pred_index.append(label_list.index(p))
            except ValueError:
                pred_index.append(label_list.index('O'))  # Default to 'O' if the label is not found

        for l in lab:
            try:
                label_index.append(label_list.index(l))
            except ValueError:
                label_index.append(label_list.index('O'))  # Default to 'O' if the label is not found

        pred_indices.extend(pred_index) # Extend the list instead of appending
        label_indices.extend(label_index) # Extend the list instead of appending

    # Return the result in the correct format
    return metric.compute(predictions=pred_indices, references=label_indices)



In [21]:
from transformers import Trainer, TrainingArguments, XLMRobertaForTokenClassification

# Load XLM-Roberta model for token classification
model = XLMRobertaForTokenClassification.from_pretrained("xlm-roberta-base", num_labels=len(label_list))

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,  # Enable mixed precision training for speed and memory optimization
    save_total_limit=2,
    logging_steps=500,
    logging_dir='./logs',
    load_best_model_at_end=True,
)

# Create a Trainer instance for fine-tuning
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,  # Function to compute metrics (e.g., F1, accuracy)
)


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [22]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
0,0.056,0.036926,1.0
1,0.0219,0.020782,1.0
2,0.0147,0.020009,1.0


TrainOutput(global_step=4023, training_loss=0.0424532676154987, metrics={'train_runtime': 5943.3884, 'train_samples_per_second': 21.672, 'train_steps_per_second': 0.677, 'total_flos': 3.3639819884199936e+16, 'train_loss': 0.0424532676154987, 'epoch': 2.9983230855226384})