In [None]:
# Setup (Google Colab only): mount Google Drive at /content/drive so that the
# data files stored on Drive can be read by the cells below.
from google.colab import drive
# force_remount=True asks Colab to mount again even if a mount already exists.
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [None]:
!pip install transformers[torch]
!pip install datasets
!pip install tqdm
!pip install seqeval


Collecting transformers[torch]
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers[torch])
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers[torch])
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m57.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m6

In [None]:
# Cell 1: Import necessary libraries
import warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="transformers")
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings("ignore", category=UndefinedMetricWarning, module="seqeval")
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np
import torch
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
import gc
import os
from tqdm import tqdm
import re


In [None]:
# Cell 2: Global configuration -- where the dataset lives on Google Drive,
# which language folders to process, and how much of every file to read.
data_dir = '/content/drive/MyDrive/database/multiconer2023'

# One entry per dataset sub-folder; the first two characters (lower-cased)
# are used as the file-name prefix when the files are located in Cell 5.
languages = [
    'EN-English',
    'ES-Spanish',
    'HI-Hindi',
    'BN-Bangla',
    'ZH-Chinese',
    'SV-Swedish',
    'FA-Farsi',
    'FR-French',
    'IT-Italian',
    'PT-Portuguese',
    'UK-Ukrainian',
    'DE-German',
]

# Fraction of the lines of each file that read_file() consumes (1 = all lines).
percentage = 1


In [None]:
# Cell 3: Define a dictionary for grouping labels into broader categories
# Each broad category lists the fine-grained entity types that fold into it.
_COARSE_TO_FINE = {
    'Medical': ('Disease', 'Symptom', 'AnatomicalStructure', 'MedicalProcedure', 'Medication/Vaccine'),
    'Product': ('OtherPROD', 'Drink', 'Food', 'Vehicle', 'Clothing'),
    'Person': ('OtherPER', 'SportsManager', 'Cleric', 'Politician', 'Athlete', 'Artist', 'Scientist'),
    'Group': ('MusicalGRP', 'PublicCorp', 'PrivateCorp', 'AerospaceManufacturer', 'SportsGRP', 'CarManufacturer', 'ORG'),
    'CW': ('VisualWork', 'MusicalWork', 'WrittenWork', 'ArtWork', 'Software'),
    'Location': ('Facility', 'OtherLOC', 'HumanSettlement', 'Station'),
}

# Label grouping dictionary: fine-grained tag -> broad-category tag, with the
# B-/I- prefix carried over unchanged (e.g. 'B-Disease' -> 'B-Medical').
labels = {
    f'{prefix}-{fine}': f'{prefix}-{coarse}'
    for coarse, fine_types in _COARSE_TO_FINE.items()
    for fine in fine_types
    for prefix in ('B', 'I')
}
# The outside tag maps to itself.
labels['O'] = 'O'


def convert_to_general_label(label):
    """Translate a fine-grained NER tag into its broad-category tag.

    The tag is looked up in the module-level ``labels`` dictionary.

    Args:
        label: Fine-grained tag such as ``'B-Disease'`` or ``'O'``.

    Returns:
        The mapped broad-category tag, or ``None`` when ``label`` is not a
        key of ``labels``.
    """
    if label in labels:
        return labels[label]
    return None


In [None]:
# Cell 4: Define a function to read data from a file and extract tokens and NER tags
def read_file(file_path, percentage):
    """Read a CoNLL-style file and return its sentences as token/tag lists.

    Every non-blank line is split on whitespace; the first field is taken as
    the token and the last field as its NER tag. Blank lines separate
    sentences.

    A sentence that contains a line whose tag is not recognised is dropped in
    full. Lines starting with ``#`` that appear before the first token of a
    sentence and do not end in a valid tag are skipped as comment/metadata
    lines (presumably the ``# id ...`` sentence headers of the dataset --
    TODO confirm against the data files).

    Args:
        file_path: Path of the file to read (decoded as UTF-8).
        percentage: Fraction of the file to use; only the first
            ``int(number_of_lines * percentage)`` lines are processed. The
            cut can fall inside a sentence, in which case the partial
            sentence read so far is still returned.

    Returns:
        A tuple ``(tokens_all, nertags_all, unique_labels)``:
        ``tokens_all`` is a list of sentences (lists of tokens),
        ``nertags_all`` the parallel list of tag lists, and
        ``unique_labels`` the set of tags occurring in the returned sentences.
        If the file does not exist a message is printed and three empty
        containers are returned.
    """
    tokens_all = []
    nertags_all = []
    unique_labels = set()

    # Used with .match(), i.e. anchored at the start of the tag: accept either
    # a tag that begins with "B-"/"I-" followed by at least one word character
    # (a longer tail such as "/Vaccine" is tolerated), or exactly "O".
    nertag_pattern = re.compile(r'^(?:(?:B|I)-\w+|O$)')

    try:
        # Explicit encoding: the data is multilingual and must not depend on
        # the platform's default locale.
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except FileNotFoundError:
        print(f"File {file_path} not found. Skipping...")
        return tokens_all, nertags_all, unique_labels

    num_lines = int(len(lines) * percentage)

    tokens = []
    nertags = []
    discard_sentence = False  # True once the current sentence had a bad tag

    def _commit_sentence():
        # Store the sentence collected so far, unless it is empty or was
        # marked as invalid.
        if tokens and nertags and not discard_sentence:
            tokens_all.append(tokens)
            nertags_all.append(nertags)
            unique_labels.update(nertags)

    for line in lines[:num_lines]:
        parts = line.strip().split()

        # End of sentence detected
        if not parts:
            _commit_sentence()
            tokens = []
            nertags = []
            discard_sentence = False
            continue

        # The sentence is already known to be invalid: ignore the rest of it.
        if discard_sentence:
            continue

        token = parts[0]
        nertag = parts[-1]

        if nertag_pattern.match(nertag):
            tokens.append(token)
            nertags.append(nertag)
        elif not tokens and token.startswith('#'):
            # Comment/metadata line ahead of the sentence body: not a token.
            continue
        else:
            # Removing the sentence: drop what was collected and skip the
            # remaining lines up to the next blank line.
            tokens = []
            nertags = []
            discard_sentence = True

    # Add the last sentence if it's valid (file may not end with a blank line)
    _commit_sentence()

    return tokens_all, nertags_all, unique_labels


In [None]:
# Cell 5: Read data for each language and store it in global dictionaries
# NOTE(review): the train, test and dev files are pooled into one list per
# language here. Cell 9 trains on this pooled data and evaluates on the dev
# file, so the dev sentences are also part of the training data -- confirm
# this is intended before quoting the dev metrics as held-out scores.
global_tokens = {}      # language -> list of sentences (each a list of tokens)
global_nertags = {}     # language -> list of per-sentence NER tag lists
unique_labels = set()   # every tag returned by read_file for any language/file

for lang in tqdm(languages, desc='Reading files for all languages'):
    global_tokens[lang] = []
    global_nertags[lang] = []
    # Files are named "<two-letter code>_<split>.conll" inside the language folder.
    files = [os.path.join(data_dir, f'{lang}/{lang[:2].lower()}_{ftype}.conll') for ftype in ['train', 'test', 'dev']]
    for file in files:
        tokens, nertags, file_unique_labels = read_file(file, percentage)
        global_tokens[lang].extend(tokens)
        global_nertags[lang].extend(nertags)
        unique_labels.update(file_unique_labels)

# Debugging summary after reading all files. Both lists hold one entry per
# sentence, so the numbers reported are sentence counts, not token counts.
for lang in languages:
    print(f"For language {lang}, found {len(global_tokens[lang])} sentences and {len(global_nertags[lang])} NER tag sequences.")



Reading files for all languages: 100%|██████████| 2/2 [00:10<00:00,  5.01s/it]

For language UK-Ukrainian, found 255576 tokens and 255576 NER tags.
For language DE-German, found 30442 tokens and 30442 NER tags.





In [None]:
import pandas as pd

# Tally, for every language, how often each broad-category tag occurs.
class_counts = {}
for lang, lang_sentences in global_nertags.items():
    tag_totals = {}
    for fine_tag in (tag for sentence in lang_sentences for tag in sentence):
        # Convert the label to a broader category before counting it
        general_tag = convert_to_general_label(fine_tag)
        tag_totals[general_tag] = tag_totals.get(general_tag, 0) + 1
    class_counts[lang] = tag_totals

# One column per language, one row per broad-category tag.
df = pd.DataFrame(class_counts)

# Path of the Excel file the table is exported to.
excel_file = './excel_file.xlsx'
df.to_excel(excel_file)


In [None]:
# Cell 6: Initialize the tokenizer and model
# Sort the labels so that the label -> index mapping is the same on every run.
# Iterating a set of strings directly yields an order that can change between
# Python sessions, which would make the class indices of saved models
# impossible to reproduce.
classes = sorted(unique_labels)
label_to_index = {label: index for index, label in enumerate(classes)}

# Multilingual cased BERT with a token-classification head sized to the number
# of distinct tags found in the data.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')
model = BertForTokenClassification.from_pretrained('bert-base-multilingual-cased', num_labels=len(classes))


Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Longest tokenized sentence per language; Cell 9 passes this value as the
# max_length used when the training and dev data are encoded.
max_lengths = {}
for lang in languages:
    # Tokenize the text for the current language. No padding is requested:
    # only the sequence lengths are needed, and padding every sentence to the
    # longest one would build a full padded copy of the corpus in memory
    # without changing the maximum.
    tokenized_inputs = tokenizer(global_tokens[lang], truncation=True, is_split_into_words=True)

    # Compute the maximum length of the tokenized sequences
    max_length = max(len(input_ids) for input_ids in tokenized_inputs['input_ids'])

    # Store the maximum length for the current language
    max_lengths[lang] = max_length

# Print the maximum lengths for all languages
for lang, max_length in max_lengths.items():
    print(f"Maximum length for {lang}: {max_length}")


Maximum length for UK-Ukrainian: 78
Maximum length for DE-German: 78


In [None]:
# Cell 7: Define a function to tokenize the text and align the labels
def tokenize_and_align_labels(examples, max_length=256):
    """Tokenize pre-split sentences and build per-token label ids.

    Args:
        examples: Mapping with a ``'tokens'`` entry (list of sentences, each a
            list of words) and a parallel ``'nertags'`` entry (list of tag
            lists).
        max_length: Maximum sequence length handed to the tokenizer.

    Returns:
        The tokenizer output with an added ``'labels'`` entry. For each
        sequence, the first sub-token of every word carries the index of that
        word's tag (via the module-level ``label_to_index``); positions that
        belong to no word and the remaining sub-tokens of a word carry -100.
    """
    encoded = tokenizer(
        examples['tokens'],
        truncation=True,
        padding=True,
        max_length=max_length,
        is_split_into_words=True,
    )

    aligned_labels = []
    for row, word_tags in enumerate(examples['nertags']):
        row_label_ids = []
        last_word = None
        for word in encoded.word_ids(batch_index=row):
            # Only the first sub-token of a word receives the word's label.
            starts_new_word = word is not None and word != last_word
            row_label_ids.append(label_to_index[word_tags[word]] if starts_new_word else -100)
            last_word = word
        aligned_labels.append(row_label_ids)

    encoded['labels'] = aligned_labels
    return encoded


In [None]:
# Cell 8: Define a function to compute evaluation metrics
def compute_metrics(p):
    """Score predictions at the broad-category level with seqeval.

    Args:
        p: Object exposing ``predictions`` (arg-maxed over axis 2 to obtain
            class indices) and ``label_ids`` (gold class indices, with -100 at
            positions to ignore).

    Returns:
        Dict with ``"precision"``, ``"recall"`` and ``"f1"``. The seqeval
        classification report is printed as a side effect.
    """
    predictions = np.argmax(p.predictions, axis=2)
    true_labels = p.label_ids

    # Class index -> broad-category tag (fine tag first, then its category).
    index_to_general = {
        index: convert_to_general_label(label)
        for label, index in label_to_index.items()
    }

    true_label_list = []
    pred_label_list = []
    for gold_row, pred_row in zip(true_labels, predictions):
        keep = gold_row != -100  # Ignore padding and other special tokens
        true_label_list.append([index_to_general[idx] for idx in gold_row[keep]])
        pred_label_list.append([index_to_general[idx] for idx in pred_row[keep]])

    # Use seqeval for evaluation
    results = {
        "precision": precision_score(true_label_list, pred_label_list),
        "recall": recall_score(true_label_list, pred_label_list),
        "f1": f1_score(true_label_list, pred_label_list),
    }

    # Detailed per-category report
    print(classification_report(true_label_list, pred_label_list))

    return results



In [None]:
import pickle

# Cell 9: Train the model for each language
# NOTE(review): global_tokens[lang] was filled in Cell 5 from the train, test
# AND dev files, so the dev set evaluated below is also part of the training
# data; the reported dev metrics are therefore not held-out scores.
save_root = '/content/drive/MyDrive/mbert_fin_models'

for lang in languages:
    print(f"\nTraining  {lang} model: ")

    # Start every language from freshly loaded pretrained weights. Loading the
    # model here (instead of reusing a module-level one) keeps the cell
    # re-runnable after an interrupted run and avoids loading an extra,
    # unused model after the last language. The `model` object created in
    # Cell 6 is left untouched.
    lang_model = BertForTokenClassification.from_pretrained('bert-base-multilingual-cased', num_labels=len(classes))

    # Prepare data
    train_encodings = tokenize_and_align_labels({'tokens': global_tokens[lang], 'nertags': global_nertags[lang]}, max_length=max_lengths[lang])
    train_dataset = Dataset.from_dict(train_encodings)

    # Read and process data from dev.conll file
    dev_tokens, dev_nertags, _ = read_file(os.path.join(data_dir, f'{lang}/{lang[:2].lower()}_dev.conll'), percentage)
    dev_encodings = tokenize_and_align_labels({'tokens': dev_tokens, 'nertags': dev_nertags}, max_length=max_lengths[lang])
    dev_dataset = Dataset.from_dict(dev_encodings)

    # Define training arguments (created per language so each run gets its
    # own arguments object)
    training_args = TrainingArguments(
        output_dir='/dev/shm/dummy_dir', # Directory in RAM
        evaluation_strategy='epoch',
        learning_rate=2.5e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=2,
        weight_decay=0.01,
        logging_steps=10,
        logging_dir=None,
        save_steps=100000, # large number to reduce saving frequency
        save_total_limit=1, # only keep the most recent checkpoint
        lr_scheduler_type='linear',
        warmup_steps=100
    )

    # Create trainer
    trainer = Trainer(
        model=lang_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=dev_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    # Train the model
    trainer.train()

    # Save the model, tokenizer and label mapping for this language with pickle.
    # NOTE(review): pickling the whole model object ties the files to the
    # installed library versions and to the device the weights are on;
    # `save_pretrained` would be the portable alternative, but it changes the
    # artifact format, so it is not switched here.
    label_to_index_path = os.path.join(save_root, lang, 'label_mapping')
    model_save_path = os.path.join(save_root, lang, 'model')
    tokenizer_save_path = os.path.join(save_root, lang, 'tokenizer')

    os.makedirs(model_save_path, exist_ok=True)
    os.makedirs(tokenizer_save_path, exist_ok=True)
    os.makedirs(label_to_index_path, exist_ok=True)

    with open(os.path.join(label_to_index_path, 'label_mapping.pkl'), 'wb') as f:
        pickle.dump(label_to_index, f)

    with open(os.path.join(model_save_path, 'model.pkl'), 'wb') as f:
        pickle.dump(lang_model, f)

    with open(os.path.join(tokenizer_save_path, 'tokenizer.pkl'), 'wb') as f:
        pickle.dump(tokenizer, f)

    # Evaluate model on dev dataset. The call goes through compute_metrics,
    # which prints the final per-category report for this language.
    predictions, label_ids, _ = trainer.predict(dev_dataset)
    predictions = np.argmax(predictions, axis=-1)

    # Clear up memory. The trainer holds references to the model (and its
    # optimizer state), so it has to be dropped together with the model;
    # garbage is collected first so that empty_cache() can actually hand the
    # freed GPU blocks back.
    del trainer
    del lang_model
    del train_dataset
    del dev_dataset
    gc.collect()
    torch.cuda.empty_cache()



Training  UK-Ukrainian model: 


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1405,0.136896,0.904275,0.857269,0.880145
2,0.1199,0.097722,0.924887,0.900441,0.9125


              precision    recall  f1-score   support

          CW       0.91      0.71      0.80       146
       Group       0.89      0.89      0.89       151
    Location       0.93      0.88      0.91       294
     Medical       0.79      0.71      0.75        86
      Person       0.95      0.95      0.95       341
     Product       0.79      0.77      0.78       117

   micro avg       0.90      0.86      0.88      1135
   macro avg       0.88      0.82      0.85      1135
weighted avg       0.90      0.86      0.88      1135

              precision    recall  f1-score   support

          CW       0.91      0.84      0.87       146
       Group       0.93      0.93      0.93       151
    Location       0.94      0.91      0.93       294
     Medical       0.84      0.78      0.81        86
      Person       0.96      0.97      0.97       341
     Product       0.84      0.81      0.83       117

   micro avg       0.92      0.90      0.91      1135
   macro avg       0.90

              precision    recall  f1-score   support

          CW       0.91      0.84      0.87       146
       Group       0.93      0.93      0.93       151
    Location       0.94      0.91      0.93       294
     Medical       0.84      0.78      0.81        86
      Person       0.96      0.97      0.97       341
     Product       0.84      0.81      0.83       117

   micro avg       0.92      0.90      0.91      1135
   macro avg       0.90      0.87      0.89      1135
weighted avg       0.92      0.90      0.91      1135



Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training  DE-German model: 


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.3142,0.266704,0.825301,0.814507,0.819868
2,0.2295,0.204311,0.871084,0.859691,0.86535


              precision    recall  f1-score   support

          CW       0.80      0.76      0.78       127
       Group       0.83      0.80      0.81       177
    Location       0.86      0.85      0.85       117
     Medical       0.80      0.82      0.81        62
      Person       0.90      0.90      0.90       280
     Product       0.58      0.58      0.58        78

   micro avg       0.83      0.81      0.82       841
   macro avg       0.79      0.78      0.79       841
weighted avg       0.82      0.81      0.82       841

              precision    recall  f1-score   support

          CW       0.85      0.83      0.84       127
       Group       0.85      0.84      0.85       177
    Location       0.89      0.93      0.91       117
     Medical       0.86      0.90      0.88        62
      Person       0.91      0.90      0.90       280
     Product       0.76      0.67      0.71        78

   micro avg       0.87      0.86      0.87       841
   macro avg       0.86

              precision    recall  f1-score   support

          CW       0.85      0.83      0.84       127
       Group       0.85      0.84      0.85       177
    Location       0.89      0.93      0.91       117
     Medical       0.86      0.90      0.88        62
      Person       0.91      0.90      0.90       280
     Product       0.76      0.67      0.71        78

   micro avg       0.87      0.86      0.87       841
   macro avg       0.86      0.85      0.85       841
weighted avg       0.87      0.86      0.86       841



Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
