In [1]:
!pip install pyarrow==10.0.1 datasets==2.4.0 seqeval


## Importing Libraries

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import XLMRobertaTokenizerFast
from datasets import Dataset, Features, Sequence, Value
from transformers import TrainingArguments
from transformers import XLMRobertaForTokenClassification, AutoModelForTokenClassification, AutoTokenizer, Trainer


## Loading File from Task-2

In [3]:
# Colab-only helper: calls files.upload() and keeps its return value in `uploaded`.
# NOTE(review): presumably this is how 'first.conll' (read further down) gets
# into the working directory — confirm; outside Colab this import fails.
from google.colab import files
uploaded = files.upload()


ModuleNotFoundError: No module named 'datasets'

In [None]:
# Function to load CoNLL formatted data
def load_conll(file_path):
    """Read a CoNLL-style file into a DataFrame with one row per sentence.

    Every non-blank line holds whitespace-separated columns: the first column
    is the token and the last column is its label. Blank lines separate
    sentences.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        A pandas DataFrame with two columns, 'tokens' and 'labels'; each cell
        is the list of tokens (or labels) of one sentence.

    Raises:
        ValueError: If a non-blank line has fewer than two columns.
    """
    sentences = []
    labels = []
    sentence = []
    label = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            columns = line.split()
            if columns:  # Non-empty line
                if len(columns) < 2:
                    raise ValueError(
                        f"{file_path}:{line_number}: expected 'token label', got {line!r}"
                    )
                # First column is the token, last column is the tag; any
                # columns in between are ignored.
                sentence.append(columns[0])
                label.append(columns[-1])
            elif sentence:
                # Blank line ends the current sentence. The `elif sentence`
                # guard keeps repeated blank lines from adding empty rows.
                sentences.append(sentence)
                labels.append(label)
                sentence = []
                label = []
    # Keep the final sentence when the file does not end with a blank line.
    if sentence:
        sentences.append(sentence)
        labels.append(label)
    return pd.DataFrame({'tokens': sentences, 'labels': labels})

# Load your CoNLL file
# The path is relative to the notebook's working directory; `df` ends up with
# the 'tokens' and 'labels' columns produced by load_conll.
df = load_conll('first.conll')


In [None]:
# Preview the first rows of the loaded sentences as a sanity check.
df.head()

## Defining Unique Labels

In [None]:
# Collect every distinct tag that occurs in the corpus.
unique_labels = set(label for sublist in df['labels'] for label in sublist)
# Enumerate the tags in sorted order so the tag -> id mapping is identical on
# every run; iterating the set directly gives an order that can change between
# kernel restarts.
label2id = {label: i for i, label in enumerate(sorted(unique_labels))}
# Inverse mapping (id -> tag), used to turn predictions back into tag names.
id2label = {i: label for label, i in label2id.items()}


In [None]:
# Swap each sentence's tag strings for their integer ids from label2id.
# This overwrites the 'labels' column, so the cell must run exactly once per
# freshly loaded `df` (a second run would look up ints in label2id and fail).
df['labels'] = df['labels'].apply(lambda tags: list(map(label2id.__getitem__, tags)))


## Convert DataFrame to Hugging Face Dataset

In [None]:
# Explicit schema: both columns hold one variable-length list per sentence.
features = Features({
    'tokens': Sequence(feature=Value(dtype='string')),   # word strings
    'labels': Sequence(feature=Value(dtype='int32')),    # integer tag ids
})

# Build the Hugging Face Dataset from just the two model-relevant columns,
# applying the schema declared above.
dataset = Dataset.from_pandas(df[['tokens', 'labels']], features=features)

## Tokenization and Label Alignment

In [None]:
# For XLM-Roberta
# This is the tokenizer the alignment function below reads through the global
# name `tokenizer`, so it is the one actually used to encode the dataset.
tokenizer = XLMRobertaTokenizerFast.from_pretrained(
    "xlm-roberta-base",
    clean_up_tokenization_spaces=True
    )

In [None]:
# For DistilBERT
# NOTE(review): no other cell visible in this notebook references
# `tokenizer_distilbert`; the dataset is encoded with `tokenizer` (XLM-R) only.
tokenizer_distilbert = AutoTokenizer.from_pretrained(
     'distilbert-base-multilingual-cased',
     clean_up_tokenization_spaces=True
     )

In [None]:
# For mBERT
# NOTE(review): no other cell visible in this notebook references
# `tokenizer_mbert`; the dataset is encoded with `tokenizer` (XLM-R) only.
tokenizer_mbert = AutoTokenizer.from_pretrained(
     'bert-base-multilingual-cased',
     clean_up_tokenization_spaces=True
     )

## Tokenization Function

In [None]:
# Tokenization and alignment function
def tokenize_and_align_labels(examples, max_length=128):
    """Tokenize pre-split sentences and align word-level labels to sub-tokens.

    Alignment uses the tokenizer's word-index mapping (`word_ids`), so it works
    for words split into several pieces and for words that occur more than once
    in a sentence. Only the first sub-token of each word carries the word's
    label; special tokens, padding and continuation pieces get -100.

    Uses the module-level `tokenizer`, which must be a *fast* tokenizer
    (only fast tokenizers provide `word_ids`).

    Args:
        examples: Batch dict with 'tokens' (list of word lists) and 'labels'
            (list of integer label lists, one label per word).
        max_length: Length every sequence is padded/truncated to.

    Returns:
        The tokenizer output with an added 'labels' entry holding one label
        list per sentence, each the same length as that sentence's input_ids.
    """
    tokenized_inputs = tokenizer(
        examples['tokens'],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=max_length,
    )
    labels = []

    for i, word_labels in enumerate(examples['labels']):
        # word_ids maps each sub-token position to the index of the word it
        # came from (None for special and padding tokens).
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        tokenized_label = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                # Special/padding token, or a continuation piece of the same word.
                tokenized_label.append(-100)
            else:
                tokenized_label.append(word_labels[word_idx])
            previous_word_idx = word_idx
        labels.append(tokenized_label)

    tokenized_inputs['labels'] = labels
    return tokenized_inputs

In [None]:
# Tokenize the dataset
# batched=True hands the function a dict of lists (many sentences per call),
# which is the shape tokenize_and_align_labels indexes into.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

In [None]:
# Display the tokenized dataset object to inspect the result of the map above.
tokenized_dataset

## Split the dataset into train and test data

In [None]:
# Split into train and validation datasets
# A fixed seed makes the 90/10 split identical on every run, so results are
# comparable across runs and across the models trained below.
# NOTE(review): this name shadows sklearn's `train_test_split` imported at the
# top of the notebook; it is kept because later cells index it as a DatasetDict.
train_test_split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)  # 90% train, 10% validation

In [None]:
# Print the lengths of input_ids, attention_mask, and labels for verification.
# Only the *distinct* lengths are shown (one short list per column) instead of
# one number per sample, which would flood the output on a real corpus.
# All three lists should be identical.
print(f"Number of samples: {len(tokenized_dataset)}")
print(f"Input IDs length: {sorted({len(x) for x in tokenized_dataset['input_ids']})}")
print(f"Attention Mask length: {sorted({len(x) for x in tokenized_dataset['attention_mask']})}")
print(f"Labels length: {sorted({len(x) for x in tokenized_dataset['labels']})}")

In [None]:
# Check the train and test split
# Displays the object returned by tokenized_dataset.train_test_split(...);
# later cells read its 'train' and 'test' entries.
train_test_split

## Setting Up Training Arguments 

In [None]:
# Shared hyper-parameters for every Trainer created below.
training_args = TrainingArguments(**{
    # --- where things are written ---
    'output_dir': './results',
    'logging_dir': './logs',
    # --- schedule ---
    'num_train_epochs': 7,
    'eval_strategy': "epoch",        # evaluate once per epoch
    'save_strategy': "epoch",        # checkpoint once per epoch
    'logging_strategy': "steps",
    'logging_steps': 50,             # log every 50 optimisation steps
    # --- optimisation ---
    'learning_rate': 2e-5,
    'weight_decay': 0.01,
    'max_grad_norm': 1.0,            # gradient clipping threshold
    'per_device_train_batch_size': 16,
    'per_device_eval_batch_size': 16,
    # --- reporting ---
    'report_to': "none",             # keep logs in the cell output only
})


## Fine Tuning the Model with different pre-trained models

In [None]:
# Initialize each of the models
# For XLM-Roberta
model_xlmr = XLMRobertaForTokenClassification.from_pretrained("xlm-roberta-base", num_labels=len(unique_labels)) # Ensure unique_labels is defined

# For DistilBERT
model_distilbert = AutoModelForTokenClassification.from_pretrained('distilbert-base-multilingual-cased', num_labels=len(unique_labels))

# For mBERT
model_distilbert = AutoModelForTokenClassification.from_pretrained('bert-base-multilingual-cased', num_labels=len(unique_labels))



## Setting Up Trainer for Each Model

In [None]:
trainer_xlmr = Trainer(
    model=model_xlmr,
    args=training_args,
    train_dataset=train_test_split['train'],
    eval_dataset=train_test_split['test'],  # Changed from validation to test based on split
)
trainer_distilbert = Trainer(
    model=model_distilbert,
    args=training_args,
    train_dataset=train_test_split['train'],
    eval_dataset=train_test_split['test'],  # Changed from validation to test based on split
)
trainer_mbert = Trainer(
    model=model_distilbert,
    args=training_args,
    train_dataset=train_test_split['train'],
    eval_dataset=train_test_split['test'],  # Changed from validation to test based on split
)

## Evaluating and Training each model

In [None]:
# Train each model in turn and keep its evaluation metrics. Previously only
# the last evaluate() call was displayed (as the cell's final expression) and
# the other two results were discarded.
evaluation_by_model = {}

# Fine-tune XLM-Roberta
trainer_xlmr.train()
evaluation_by_model['xlm-roberta'] = trainer_xlmr.evaluate()

# Fine-tune DistilBERT
trainer_distilbert.train()
evaluation_by_model['distilbert'] = trainer_distilbert.evaluate()

# Fine-tune mBERT
trainer_mbert.train()
evaluation_by_model['mbert'] = trainer_mbert.evaluate()

# Show the metrics of all three models side by side.
evaluation_by_model


## Saving the trained model

In [None]:
# Save the model
# No variable called `model` is defined in the cells above. `tokenizer` is the
# XLM-R tokenizer, so the matching fine-tuned model to store next to it is
# `model_xlmr`.
model_xlmr.save_pretrained("./fine_tuned_ner_model")
tokenizer.save_pretrained("./fine_tuned_ner_model")

## Evaluations

In [None]:
# Evaluate the XLM-R trainer on the held-out split. No variable called
# `trainer` is defined in the cells above; `trainer_xlmr` is used because it
# wraps the model that was saved in the previous section.
eval_results = trainer_xlmr.evaluate()
print(eval_results)

In [None]:
from datasets import Dataset
from transformers import XLMRobertaTokenizer, XLMRobertaForTokenClassification, Trainer, TrainingArguments
import pandas as pd

# Load your labeled data (replace with your actual data loading method)


# Load the tokenizer
# The fast tokenizer class (imported at the top of the notebook) is required:
# the alignment below relies on word_ids(), which only fast tokenizers provide.
tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")

# Tokenization and alignment function
# NOTE(review): this re-defines the function declared earlier in the notebook;
# the later definition wins once this cell runs.
def tokenize_and_align_labels(examples, max_length=128):
    """Tokenize pre-split sentences and align word-level labels to sub-tokens.

    Alignment uses the tokenizer's word-index mapping (`word_ids`), so it works
    for words split into several pieces and for words that occur more than once
    in a sentence. Only the first sub-token of each word carries the word's
    label; special tokens, padding and continuation pieces get -100.

    Args:
        examples: Batch dict with 'tokens' (list of word lists) and 'labels'
            (list of integer label lists, one label per word).
        max_length: Length every sequence is padded/truncated to.

    Returns:
        The tokenizer output with an added 'labels' entry holding one label
        list per sentence, each the same length as that sentence's input_ids.
    """
    tokenized_inputs = tokenizer(
        examples['tokens'],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=max_length,
    )
    labels = []

    for i, word_labels in enumerate(examples['labels']):
        # word_ids maps each sub-token position to the index of the word it
        # came from (None for special and padding tokens).
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        tokenized_label = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                # Special/padding token, or a continuation piece of the same word.
                tokenized_label.append(-100)
            else:
                tokenized_label.append(word_labels[word_idx])
            previous_word_idx = word_idx
        labels.append(tokenized_label)

    tokenized_inputs['labels'] = labels
    return tokenized_inputs

# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# Split into train and validation datasets
# A fixed seed keeps the 90/10 split identical on every run.
train_test_split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)  # 90% train, 10% validation

# Print the lengths of input_ids, attention_mask, and labels for verification.
# Only the distinct lengths are shown rather than one number per sample;
# the three lists should be identical.
print(f"Number of samples: {len(tokenized_dataset)}")
print(f"Input IDs length: {sorted({len(x) for x in tokenized_dataset['input_ids']})}")
print(f"Attention Mask length: {sorted({len(x) for x in tokenized_dataset['attention_mask']})}")
print(f"Labels length: {sorted({len(x) for x in tokenized_dataset['labels']})}")

# Set up training arguments with adjustments
# Uses `eval_strategy`, the same keyword as the TrainingArguments cell earlier
# in this notebook, instead of the older `evaluation_strategy` spelling, so
# both cells work against the same transformers version.
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    learning_rate=1e-5,  # Reduced learning rate
    per_device_train_batch_size=4,  # Reduced batch size
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    max_grad_norm=1.0,  # Gradient clipping
)



# Train the model
# No `trainer` is defined anywhere above, so one is built here from the
# objects this cell has just (re)created: the reduced-size training_args and
# the fresh train/test split.
# NOTE(review): a freshly initialised XLM-R model is assumed because this cell
# loads the XLM-R tokenizer — confirm that this is the intended model.
model = XLMRobertaForTokenClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=len(unique_labels),
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_test_split['train'],
    eval_dataset=train_test_split['test'],
)
trainer.train()


