In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem so files stored there can be read.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd

def load_conll_to_dataframe(file_path, include_sentence_id=False):
    """Read a whitespace-separated CoNLL file into a token-level DataFrame.

    Every non-blank line with at least two whitespace-separated fields yields
    one row: the first field is the token and the second field is the label.
    Lines with fewer than two fields are skipped.

    Args:
        file_path: Path of the UTF-8 encoded CoNLL file.
        include_sentence_id: When True, add an integer ``Sentence`` column.
            The id starts at 0 and increases by one after each run of blank
            lines that follows at least one kept token, so rows that share an
            id came from the same blank-line-delimited block. Defaults to
            False, which returns exactly the ``Token``/``Label`` columns.

    Returns:
        pandas.DataFrame with columns ``Token`` and ``Label`` (plus
        ``Sentence`` when requested), one row per kept line, in file order.
    """
    rows = []
    sentence_id = 0
    sentence_has_tokens = False  # avoids bumping the id on consecutive blank lines
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            fields = line.split()  # split() ignores surrounding whitespace
            if not fields:
                # Blank line: close the current block if it produced any rows.
                if sentence_has_tokens:
                    sentence_id += 1
                    sentence_has_tokens = False
                continue
            if len(fields) < 2:
                # No label column on this line; nothing usable to keep.
                continue
            rows.append((fields[0], fields[1], sentence_id))
            sentence_has_tokens = True

    df = pd.DataFrame(rows, columns=['Token', 'Label', 'Sentence'])
    if not include_sentence_id:
        df = df[['Token', 'Label']]
    return df

In [None]:
# Load the labelled CoNLL export from the mounted Drive into a Token/Label frame
conll_file_path = '/content/drive/MyDrive/@mertteka_labeled_data.conll'
df = load_conll_to_dataframe(file_path=conll_file_path)

# Show the first rows as this cell's output
df.head()

Unnamed: 0,Token,Label
0,ይሄንን,O
1,ተጭነው,O
2,ያድርጉ፣,O
3,ቤተሰብ,O
4,ይሁኑ,O


In [None]:
# Hold out a fixed-size test portion of the data
from sklearn.model_selection import train_test_split

# NOTE(review): each row of `df` is a single token, so this samples 200 token
# rows (shuffled by default) rather than whole sentences - confirm intended.
df_train, df_test = train_test_split(
    df,
    test_size=200,
    random_state=21,
)

In [None]:
import pandas as pd
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# The token/label frame `df` is expected to exist already at this point.

# Assign each distinct label an integer id, in order of first appearance
label_list = df['Label'].unique().tolist()
label_map = dict(zip(label_list, range(len(label_list))))
num_labels = len(label_map)

# Encode tokens and labels
def encode_data(df):
    """Tokenize groups of rows and build label tensors aligned to word-pieces.

    Rows are grouped into runs of consecutive rows that share the same label;
    each run is tokenized as one pre-split sequence, padded/truncated to the
    tokenizer's maximum length.

    Labels are aligned with the tokenizer output through ``word_ids()``:
    the first word-piece of every word receives that word's label id, while
    special tokens, padding and continuation word-pieces receive -100 (the
    default ``ignore_index`` of PyTorch's cross-entropy loss) so they do not
    contribute to the loss. Any code computing metrics from these labels must
    likewise skip positions whose label is -100.

    Uses the module-level ``tokenizer`` (must be a fast tokenizer, since
    ``word_ids()`` is required) and ``label_map``.

    Args:
        df: DataFrame with ``Token`` and ``Label`` columns.

    Returns:
        (tokens, labels): a list of tokenizer encodings (tensors with a
        leading batch dimension of 1) and a parallel list of 1-D label
        tensors whose length equals the encoded sequence length.
    """
    ignore_index = -100
    tokens = []
    labels = []
    # NOTE(review): a group is a run of identical consecutive labels, not a
    # sentence, so every sequence carries a single label and depends on row
    # order - confirm whether sentence-level grouping was intended.
    run_ids = (df['Label'] != df['Label'].shift()).cumsum()
    for _, group in df.groupby(run_ids):
        tokenized_input = tokenizer(list(group['Token']),
                                    is_split_into_words=True,
                                    padding='max_length',
                                    truncation=True,
                                    return_tensors='pt')
        tokens.append(tokenized_input)

        word_label_ids = [label_map[label] for label in group['Label']]
        label_ids = []
        previous_word_idx = None
        # word_ids() maps each encoded position back to its source word index
        # (None for special/padding tokens). Iterating over it also guarantees
        # the label tensor has exactly the encoded length, even after truncation.
        for word_idx in tokenized_input.word_ids(batch_index=0):
            if word_idx is None or word_idx == previous_word_idx:
                label_ids.append(ignore_index)
            else:
                label_ids.append(word_label_ids[word_idx])
            previous_word_idx = word_idx
        labels.append(torch.tensor(label_ids))
    return tokens, labels

# Initialize the mBERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')

# Split your dataset into training and validation sets.
# random_state is fixed (same seed as the earlier train/test split) so the
# partition, and therefore the reported validation loss, is reproducible
# after a kernel restart.
train_df, val_df = train_test_split(df_train, test_size=0.2, random_state=21)
train_tokens, train_labels = encode_data(train_df)
val_tokens, val_labels = encode_data(val_df)

# Create a dataset class for PyTorch DataLoader
class NERDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with label tensors.

    Args:
        tokens: Sequence of encodings; each item is indexable by
            ``'input_ids'`` and ``'attention_mask'`` and holds tensors with a
            leading batch dimension of size 1.
        labels: Sequence of label tensors, parallel to ``tokens``.
    """

    def __init__(self, tokens, labels):
        self.tokens = tokens
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label list.
        return len(self.labels)

    def __getitem__(self, idx):
        encoding = self.tokens[idx]
        # squeeze(0) removes only the leading batch dimension. A bare
        # squeeze() would also collapse a length-1 sequence to a 0-d tensor.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': self.labels[idx]
        }

train_dataset = NERDataset(train_tokens, train_labels)
val_dataset = NERDataset(val_tokens, val_labels)

# Initialize the mBERT model for token classification.
# id2label/label2id are stored in the model config so the saved checkpoint
# reports the real tag names instead of generic LABEL_<n> placeholders.
id2label = {idx: label for label, idx in label_map.items()}
model = BertForTokenClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=num_labels,
    id2label=id2label,
    label2id=label_map,
)

# Set up the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,  # Adjust as needed
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,  # Accumulate gradients over 2 batches
    warmup_steps=500,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),  # Mixed precision only when a CUDA GPU is present
    logging_dir='./logs',
    evaluation_strategy="epoch",  # Evaluate after each epoch
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Start training the model (last expression, so the TrainOutput is displayed)
trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss
1,0.0019,0.001811
2,0.0018,0.001746
3,0.0018,0.001742


TrainOutput(global_step=7356, training_loss=0.009672672261421676, metrics={'train_runtime': 4247.0817, 'train_samples_per_second': 27.707, 'train_steps_per_second': 1.732, 'total_flos': 3.07500417904896e+16, 'train_loss': 0.009672672261421676, 'epoch': 3.0})

In [None]:
# Prepare a test set for evaluation (assuming you have a test DataFrame)
test_tokens, test_labels = encode_data(df_test)
test_dataset = NERDataset(test_tokens, test_labels)

# Run predictions on the test dataset
predictions, label_ids, metrics = trainer.predict(test_dataset)

# Convert logits to predicted label IDs
predicted_labels = np.argmax(predictions, axis=2)

# Score only positions that carry a real label. Padding positions
# (attention_mask == 0) and positions marked with the ignore index -100 are
# dropped; counting them would inflate every metric with trivially correct
# predictions. Genuine 'O' tokens are kept, since they are real labels.
IGNORE_INDEX = -100
attention = np.stack(
    [encoding['attention_mask'].reshape(-1).numpy() for encoding in test_tokens]
)
keep = (attention == 1) & (label_ids != IGNORE_INDEX)
true_labels_flat = label_ids[keep]
predicted_labels_flat = predicted_labels[keep]

# Calculate accuracy, precision, recall, and F1 score.
# zero_division=0 keeps the score at 0 for classes with no predicted/true
# samples without emitting an UndefinedMetricWarning.
accuracy = accuracy_score(true_labels_flat, predicted_labels_flat)
precision = precision_score(true_labels_flat, predicted_labels_flat, average='weighted', zero_division=0)
recall = recall_score(true_labels_flat, predicted_labels_flat, average='weighted', zero_division=0)
f1 = f1_score(true_labels_flat, predicted_labels_flat, average='weighted', zero_division=0)

# Print the evaluation metrics
print(f"Test Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Test Accuracy: 99.94%
Precision: 1.00
Recall: 1.00
F1 Score: 1.00


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Persist the fine-tuned weights and the tokenizer files side by side
output_dir = "./models"

# Model weights and config
trainer.save_model(output_dir=output_dir)

# Tokenizer files; left as the last expression so the written paths are shown
tokenizer.save_pretrained(save_directory=output_dir)

('./models/tokenizer_config.json',
 './models/special_tokens_map.json',
 './models/vocab.txt',
 './models/added_tokens.json',
 './models/tokenizer.json')