In [None]:
# Colab-only step: attach the user's Google Drive so that files stored there
# can be read from this runtime under /content/gdrive.
from google.colab import drive

drive.mount(mountpoint='/content/gdrive')

Mounted at /content/gdrive


In [None]:
import sys
import os
import pandas as pd
import numpy as np
# Make modules stored in the Drive root importable. The membership check keeps
# the cell idempotent: re-running it no longer appends a duplicate entry to
# sys.path each time.
if '/content/gdrive/MyDrive' not in sys.path:
    sys.path.append('/content/gdrive/MyDrive')

In [None]:
import os
import sys

from datasets import load_dataset, Dataset
from transformers import (
    TrainingArguments,
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
)

In [None]:
# Read the token/label CSV from Drive. All rows end up under the "train" key,
# which is the split the next statement indexes.
data = load_dataset(
    "csv",
    data_files="/content/gdrive/MyDrive/final_tokens_labels.csv",
)

# Keep 80% of the rows for training and hold the rest out for evaluation;
# the fixed seed makes the shuffle-and-split repeatable.
train_test_datasets = data["train"].train_test_split(train_size=0.8, seed=42)

In [None]:
# Sanity check: display the column names and row counts of both splits.
train_test_datasets

DatasetDict({
    train: Dataset({
        features: ['Token', 'Label'],
        num_rows: 121209
    })
    test: Dataset({
        features: ['Token', 'Label'],
        num_rows: 30303
    })
})

In [None]:
# Tokenizer matching the checkpoint that is fine-tuned further down.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="xlm-roberta-base",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]



In [None]:
# Build the label vocabulary from the full (pre-split) dataset so both splits
# share it. Sorting gives every run the same label -> id assignment: iterating
# a bare set of strings can yield a different order in each Python process,
# which would make the ids baked into a saved checkpoint impossible to decode
# after a kernel restart. key=str keeps the sort well-defined even if the
# column holds mixed or missing values.
labels = sorted(set(data['train']['Label']), key=str)
label_mapping = {label: i for i, label in enumerate(labels)}

In [None]:
def tokenize(example):
    """Tokenize one row and build its per-position label vector.

    Args:
        example: A row with a ``'Token'`` text field and a ``'Label'`` field
            whose value is a key of the module-level ``label_mapping``.

    Returns:
        The tokenizer output, padded/truncated to 128 positions, with an added
        ``'labels'`` list of the same length. Positions produced from the row's
        text carry the row's label id; special tokens and padding carry -100.

    Raises:
        KeyError: If the row's label is not present in ``label_mapping``.
    """
    tokenized_inputs = tokenizer(example['Token'], truncation=True, padding='max_length', max_length=128)

    # Every sub-word piece of the row's text shares the row's single label.
    label_index = label_mapping[example["Label"]]

    # word_ids() is None for positions that do not originate from the input
    # text (special tokens and padding). Those positions get -100, the ignore
    # index of the cross-entropy loss, so they no longer contribute to training.
    # The previous condition (`example['Token'] == example['Token']`) was always
    # true and labelled all 128 positions, padding included.
    # Any metric computed on these labels must skip the -100 entries as well.
    # NOTE: word_ids() is only available on "fast" tokenizers.
    labels = [
        -100 if word_id is None else label_index
        for word_id in tokenized_inputs.word_ids()
    ]

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
# Apply tokenize() to every row of both splits; the result keeps the original
# columns and gains the tokenizer outputs plus the aligned "labels" column.
tokenized_dataset = train_test_datasets.map(function=tokenize)

Map:   0%|          | 0/121209 [00:00<?, ? examples/s]

Map:   0%|          | 0/30303 [00:00<?, ? examples/s]

In [None]:
# Sanity check: confirm the tokenizer columns and "labels" were added to both splits.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['Token', 'Label', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 121209
    })
    test: Dataset({
        features: ['Token', 'Label', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 30303
    })
})

In [None]:
# Hyper-parameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./results",
    seed=42,
    # --- optimisation ---
    num_train_epochs=1,
    learning_rate=5e-5,
    weight_decay=0.1,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,  # 8 x 4 = 32 examples per optimizer step per device
    fp16=True,
    # --- evaluation / logging / checkpoint cadence (in optimizer steps) ---
    evaluation_strategy="steps",
    eval_steps=500,
    per_device_eval_batch_size=8,
    logging_steps=100,
    save_steps=500,  # kept equal to eval_steps so each evaluation has a checkpoint
    # --- checkpoint selection ---
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)




In [None]:
# Load the pretrained encoder with a freshly initialised token-classification head.
# id2label / label2id are written into the model config, so the checkpoint saved
# later carries the real label names instead of generic LABEL_<n> placeholders
# and its predictions can be decoded without this notebook's in-memory mapping.
# Labels are passed through str() because the config is serialised as JSON.
model = AutoModelForTokenClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=len(labels),
    id2label={i: str(label) for label, i in label_mapping.items()},
    label2id={str(label): i for label, i in label_mapping.items()},
)

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Collator that batches examples into PyTorch tensors, padding inputs and the
# per-token labels together.
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    padding=True,
    return_tensors="pt",
)

In [None]:
import numpy as np
import evaluate

# Accuracy scorer from the `evaluate` library; compute_metrics() calls it below.
metric = evaluate.load(path="accuracy")

In [None]:
def compute_metrics(eval_pred):
    """Compute token-level accuracy, skipping positions labelled -100.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` has one score per
            class along its last axis; ``labels`` holds the reference class id
            for each position, with -100 marking positions to ignore.

    Returns:
        The dict produced by the module-level ``metric.compute(...)``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1).flatten()
    references = np.asarray(labels).flatten()

    # -100 is the ignore label (special tokens / padding). A prediction can
    # never equal -100, so leaving those positions in would count each of them
    # as an error and understate the accuracy.
    keep = references != -100

    # Cast to int32, the integer type the metric was previously fed.
    return metric.compute(
        predictions=predictions[keep].astype(np.int32),
        references=references[keep].astype(np.int32),
    )

In [None]:
# Wire the model, data, batching and metric together for training/evaluation.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    # data
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    # scoring used at every evaluation step
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the configuration above. The call is the cell's last
# expression, so its return value (the training summary) is displayed.
# NOTE(review): in the recorded run the validation accuracy drops back to
# 0.7156 from step 2000 onward and stays there — worth investigating before
# trusting the final weights.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
500,0.9318,0.932662,0.715606
1000,0.7774,0.795324,0.765964
1500,0.8226,0.783728,0.765964
2000,0.899,0.935352,0.715606
2500,0.9216,0.934221,0.715606
3000,0.9418,0.931584,0.715606
3500,0.8916,0.934255,0.715606


TrainOutput(global_step=3788, training_loss=0.8955632917732472, metrics={'train_runtime': 2607.3449, 'train_samples_per_second': 46.488, 'train_steps_per_second': 1.453, 'total_flos': 7918094402000640.0, 'train_loss': 0.8955632917732472, 'epoch': 1.0})

In [None]:
# Score the model on the eval_dataset that was passed to the Trainer and print
# the returned metrics dict.
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

Evaluation results: {'eval_loss': 0.7953243255615234, 'eval_accuracy': 0.7659637659637659, 'eval_runtime': 110.1102, 'eval_samples_per_second': 275.206, 'eval_steps_per_second': 34.402, 'epoch': 1.0}


In [None]:
# Persist the fine-tuned weights and the tokenizer files side by side so the
# directory can be reloaded as a single checkpoint.
# NOTE(review): this is a path relative to the runtime's working directory,
# not the mounted Drive — confirm that is the intended location.
trainer.save_model(output_dir='./fine_tuned_model')
tokenizer.save_pretrained(save_directory='./fine_tuned_model')

('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/sentencepiece.bpe.model',
 './fine_tuned_model/added_tokens.json',
 './fine_tuned_model/tokenizer.json')