In [None]:
# !pip install tokenizers
# !pip install transformers
# !pip install torch
# !pip install seqeval
# !pip install torchcrf

In [None]:
# https://github.com/NielsRogge/Transformers-Tutorials/blob/master/BERT/Custom_Named_Entity_Recognition_with_BERT.ipynb
# https://www.kaggle.com/code/pemagrg/named-entity-recognition-using-bert

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import pandas as pd
import nltk
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.nn import functional as F
# import torchcrf
from transformers import BertTokenizer, BertConfig, BertForTokenClassification, BertModel
from sklearn.model_selection import train_test_split

# **Data preprocessing**

In [None]:
data = pd.read_csv('/content/drive/MyDrive/ner/corpus.csv', encoding = 'UTF-8')

# Inspect the column names to get the label names
label_names = data.columns.tolist()
print(label_names)

['Word', 'Acutal', 'IOB', 'ner_tag', 'Sentence #']


  data = pd.read_csv('/content/drive/MyDrive/ner/corpus.csv', encoding = 'UTF-8')


In [None]:
data.head()

Unnamed: 0,Word,Acutal,IOB,ner_tag,Sentence #
0,الجامع,Book,B-Book,15,1
1,المسند,Book,I-Book,16,1
2,الصحيح,Book,I-Book,16,1
3,المختصر,Book,I-Book,16,1
4,من,Book,I-Book,16,1


In [None]:
data.count()

Word          258241
Acutal        258241
IOB           258241
ner_tag       258241
Sentence #    258241
dtype: int64

In [None]:
# different NER tags, and their frequency:
print("Number of tags: {}".format(len(data.IOB.unique())))
frequencies = data.IOB.value_counts()
frequencies

Number of tags: 40


O            186040
I-Pers        20916
B-Pers        18243
B-Allah        7706
B-Number       7424
I-Number       6283
B-Prophet      4490
I-Prophet      2012
B-Loc          1189
B-NatOb         628
B-Clan          449
I-Clan          318
B-Date          301
I-Date          295
B-Para          287
B-Hell          237
B-Rlig          182
I-Loc           160
B-Crime         151
B-Meas          130
B-Book          123
B-Mon           121
I-Allah         105
B-Time           64
B-Month          63
I-Crime          61
I-Book           60
I-NatOb          42
I-Time           38
B-Day            31
I-Mon            18
I-Meas           17
B-Sect           15
I-Month          14
I-Hell            8
I-Para            7
I-Org             6
B-Org             3
I-Sect            2
I-Rlig            2
Name: IOB, dtype: int64

In [None]:
# Let's print actual NER tags by frequency (highest to lowest):
tags = {}
for tag, count in zip(frequencies.index, frequencies):
    if tag != "O":
        if tag[2:8] not in tags.keys():
            tags[tag[2:8]] = count
        else:
            tags[tag[2:8]] += count
    continue

print(sorted(tags.items(), key=lambda x: x[1], reverse=True))

[('Pers', 39159), ('Number', 13707), ('Allah', 7811), ('Prophe', 6502), ('Loc', 1349), ('Clan', 767), ('NatOb', 670), ('Date', 596), ('Para', 294), ('Hell', 245), ('Crime', 212), ('Rlig', 184), ('Book', 183), ('Meas', 147), ('Mon', 139), ('Time', 102), ('Month', 77), ('Day', 31), ('Sect', 17), ('Org', 9)]


In [None]:
# Let's print NER tags with IOB by frequency (highest to lowest) based on the IOB column:
tags = {}
for iob, count in zip(data['IOB'].unique(), data['IOB'].value_counts()):
    if iob != "O":
        if iob[0:5] not in tags.keys():
            tags[iob[0:7]] = count
        else:
            tags[iob[0:7]] += count

print(sorted(tags.items(), key=lambda x: x[1], reverse=True))

[('B-Book', 186040), ('I-Book', 20916), ('B-Pers', 7706), ('I-Pers', 7424), ('B-Org', 6283), ('I-Org', 4490), ('B-Date', 2012), ('B-Numbe', 1189), ('B-Allah', 628), ('I-Allah', 449), ('I-Date', 318), ('B-Meas', 301), ('I-Meas', 295), ('B-Proph', 287), ('I-Proph', 237), ('B-Rlig', 182), ('I-Numbe', 160), ('B-Sect', 151), ('I-Sect', 130), ('I-Rlig', 123), ('B-Loc', 121), ('I-Loc', 105), ('B-Month', 64), ('B-NatOb', 63), ('B-Clan', 61), ('I-Clan', 60), ('I-NatOb', 42), ('B-Crime', 38), ('I-Crime', 31), ('B-Time', 18), ('I-Time', 17), ('B-Hell', 15), ('B-Para', 14), ('I-Para', 8), ('I-Hell', 7), ('B-Day', 6), ('I-Month', 3), ('B-Mon', 2), ('I-Mon', 2)]


In [None]:
# NEEDED: person(Pers),God(Allah),prophet  (Prophet), location (Loc), clan (Clan), date (Date), natural object (NatOb) and other (O)
# named entities since the rest are insuf cient to train the model.
entities_to_remove = ["B-Para", "I-Para", "B-Hell", "I-Hell", "B-Crime", "I-Crime", "B-Rlig", "I-Rlig", "B-Book", "I-Book", "B-Meas", "I-Meas", "B-Mon", "I-Mon", "B-Time", "I-Time", "B-Month", "I-Month", "B-Day", "I-Day", "B-Sect", "I-Sect", "B-Org", "I-Org"]
data = data[~data.IOB.isin(entities_to_remove)]
data.head()

Unnamed: 0,Word,Acutal,IOB,ner_tag,Sentence #
16,المؤلف,O,O,0,1
17,محمد,Pers,B-Pers,5,1
18,بن,Pers,I-Pers,6,1
19,إسماعيل,Pers,I-Pers,6,1
20,أبو,Pers,I-Pers,6,1


Now, we have to ask ourself the question: what is a training example in the case of NER, which is provided in a single forward pass? A training example is typically a sentence, with corresponding IOB tags. Let's group the words and corresponding tags by sentence:

In [None]:
# pandas has a very handy "forward fill" function to fill missing values based on the last upper non-nan value
data = data.fillna(method='ffill')
data.head()

Unnamed: 0,Word,Acutal,IOB,ner_tag,Sentence #
16,المؤلف,O,O,0,1
17,محمد,Pers,B-Pers,5,1
18,بن,Pers,I-Pers,6,1
19,إسماعيل,Pers,I-Pers,6,1
20,أبو,Pers,I-Pers,6,1


In [None]:
# let's create a new column called "sentence" which groups the words by sentence
# data['sentence'] = data[['Sentence #','Word','IOB']].groupby(['Sentence #'])['Word'].transform(lambda x: ' '.join(x))
data['sentence'] = data[['Sentence #','Word','IOB']].groupby(['Sentence #'])['Word'].transform(lambda x: ' '.join(str(word) for word in x))
# let's also create a new column called "word_labels" which groups the tags by sentence
data['word_labels'] = data[['Sentence #','Word','IOB']].groupby(['Sentence #'])['IOB'].transform(lambda x: ','.join(x))
data.head()

Unnamed: 0,Word,Acutal,IOB,ner_tag,Sentence #,sentence,word_labels
16,المؤلف,O,O,0,1,المؤلف محمد بن إسماعيل أبو عبد الله البخاري ال...,"O,B-Pers,I-Pers,I-Pers,I-Pers,I-Pers,I-Pers,I-..."
17,محمد,Pers,B-Pers,5,1,المؤلف محمد بن إسماعيل أبو عبد الله البخاري ال...,"O,B-Pers,I-Pers,I-Pers,I-Pers,I-Pers,I-Pers,I-..."
18,بن,Pers,I-Pers,6,1,المؤلف محمد بن إسماعيل أبو عبد الله البخاري ال...,"O,B-Pers,I-Pers,I-Pers,I-Pers,I-Pers,I-Pers,I-..."
19,إسماعيل,Pers,I-Pers,6,1,المؤلف محمد بن إسماعيل أبو عبد الله البخاري ال...,"O,B-Pers,I-Pers,I-Pers,I-Pers,I-Pers,I-Pers,I-..."
20,أبو,Pers,I-Pers,6,1,المؤلف محمد بن إسماعيل أبو عبد الله البخاري ال...,"O,B-Pers,I-Pers,I-Pers,I-Pers,I-Pers,I-Pers,I-..."


Let's have a look at the different NER tags.

We create 2 dictionaries: one that maps individual tags to indices, and one that maps indices to their individual tags. This is necessary in order to create the labels (as computers work with numbers = indices, rather than words = tags) - see further in this notebook.

In [None]:
label2id = {k: v for v, k in enumerate(data.IOB.unique())}
id2label = {v: k for v, k in enumerate(data.IOB.unique())}
label2id

{'O': 0,
 'B-Pers': 1,
 'I-Pers': 2,
 'B-Date': 3,
 'B-Number': 4,
 'B-Allah': 5,
 'I-Allah': 6,
 'I-Date': 7,
 'B-Prophet': 8,
 'I-Prophet': 9,
 'I-Number': 10,
 'B-Loc': 11,
 'I-Loc': 12,
 'B-NatOb': 13,
 'B-Clan': 14,
 'I-Clan': 15,
 'I-NatOb': 16}

As we can see, there are now only 10 different tags.

Let's only keep the "sentence" and "word_labels" columns, and drop duplicates:

In [None]:
data = data[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
data.head()

Unnamed: 0,sentence,word_labels
0,المؤلف محمد بن إسماعيل أبو عبد الله البخاري ال...,"O,B-Pers,I-Pers,I-Pers,I-Pers,I-Pers,I-Pers,I-..."
1,بن ناصر الناصر الناشر مصورة عن السلطانية بإضاف...,"I-Pers,I-Pers,I-Pers,O,O,O,O,O,O,O,B-Pers,I-Pe..."
2,وهو ضمن خدمة التخريج ومتن مرتبط بشرحه مع الكتا...,"O,O,O,O,O,O,O,O,O,O,O,O,B-Pers,I-Pers,I-Pers,O..."
3,في ط البغا يليه تعليقه ثم أطرافه مقدمة د مصطفى...,"O,O,B-Pers,O,O,O,O,O,O,B-Pers,I-Pers,O,B-Allah..."
4,والسلام على سيدنا محمد بن عبد الله الذي أرسله ...,"O,O,O,B-Pers,I-Pers,I-Pers,I-Pers,O,O,B-Allah,..."


In [None]:
len(data)

9223

In [None]:
data.iloc[4].sentence

'والسلام على سيدنا محمد بن عبد الله الذي أرسله الله تعالى رحمة للناس وآتاه الحكمة وجوامع الكلم وعلمه ما لم يكن يعلم وكان فضل الله عليه عظيما وعلى'

In [None]:
data.iloc[4].word_labels

'O,O,O,B-Pers,I-Pers,I-Pers,I-Pers,O,O,B-Allah,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-Allah,O,O,O'

In [None]:
# Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Label Encoding
label2id = {label: idx for idx, label in enumerate(data['word_labels'].str.split(',', expand=True).stack().unique())}
data['word_labels'] = data['word_labels'].apply(lambda x: [label2id[label] for label in x.split(',')])

In [None]:
# Train/Validation/Test Split
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.1, random_state=42)

In [None]:
# Define a Dataset class
class NERDataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence = self.data.iloc[idx]['sentence']
        labels = self.data.iloc[idx]['word_labels']

        encoding = self.tokenizer(sentence, padding='max_length', truncation=True, max_length=self.max_len, return_tensors='pt')
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()

        labels = torch.tensor(labels, dtype=torch.long)

        return input_ids, attention_mask, labels

In [None]:
# Define constants
MAX_LEN = 128

# Create DataLoader objects
train_dataset = NERDataset(train_data, tokenizer, MAX_LEN)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

val_dataset = NERDataset(val_data, tokenizer, MAX_LEN)
val_loader = DataLoader(val_dataset, batch_size=32)

test_dataset = NERDataset(test_data, tokenizer, MAX_LEN)
test_loader = DataLoader(test_dataset, batch_size=32)

In [None]:
# Step 2: Model Creation
class BERT_BiLSTM_CRF(nn.Module):
    def __init__(self, bert_model, num_labels, hidden_size, num_layers, bidirectional, dropout, device):
        super(BERT_BiLSTM_CRF, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model)
        self.dropout = nn.Dropout(dropout)
        self.lstm = nn.LSTM(input_size=self.bert.config.hidden_size,
                            hidden_size=hidden_size,
                            num_layers=num_layers,
                            bidirectional=bidirectional,
                            batch_first=True)
        self.hidden2tag = nn.Linear(hidden_size * 2 if bidirectional else hidden_size, num_labels)
        self.crf = torchcrf.CRF(num_labels, batch_first=True)
        self.device = device

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        lstm_output, _ = self.lstm(sequence_output)
        lstm_output = self.dropout(lstm_output)
        emissions = self.hidden2tag(lstm_output)
        return emissions

In [None]:
# Step 3: Training, Validation, Testing
class Trainer:
    def __init__(self, model, train_loader, valid_loader, test_loader, optimizer, criterion, device):
        self.model = model
        self.train_loader = train_loader
        self.valid_loader = valid_loader
        self.test_loader = test_loader
        self.optimizer = optimizer
        self.criterion = criterion
        self.device = device

    def train(self, epochs):
        # Training loop
        pass

    def validate(self):
        # Validation loop
        pass

    def test(self):
        # Testing loop
        pass

# Step 4: Hyperparameter Tuning

class HyperparameterTuner:
    def __init__(self, model, train_loader, valid_loader, test_loader, device):
        self.model = model
        self.train_loader = train_loader
        self.valid_loader = valid_loader
        self.test_loader = test_loader
        self.device = device

    def tune_hyperparameters(self):
        # Hyperparameter tuning logic
        pass

In [None]:
# Step 6: Main Program
if __name__ == "__main__":
    # Load and preprocess data
    data_preprocessor = DataPreprocessor(data_path=data)
    data = data_preprocessor.load_data()
    preprocessed_data = data_preprocessor.preprocess_data(data)

    # Split data into train, validation, and test sets
    train_data, test_data = train_test_split(preprocessed_data, test_size=0.2)
    train_data, valid_data = train_test_split(train_data, test_size=0.1)

    # Define hyperparameters
    # hyperparameters = {...}

    # Create model
    model = BERT_BiLSTM_CRF(bert_model='bert-base-uncased', num_labels=...,
                            hidden_size=..., num_layers=..., bidirectional=..., dropout=..., device='cuda')

    # Define optimizer and criterion
    optimizer = optim.Adam(model.parameters(), lr=...)
    criterion = nn.CrossEntropyLoss()

    # Create data loaders
    train_loader = DataLoader(...)
    valid_loader = DataLoader(...)
    test_loader = DataLoader(...)

    # Train the model
    trainer = Trainer(model=model, train_loader=train_loader, valid_loader=valid_loader,
                      test_loader=test_loader, optimizer=optimizer, criterion=criterion, device='cuda')
    trainer.train(epochs=10)

    # Validate the model
    trainer.validate()

    # Test the model
    trainer.test()

    # Hyperparameter tuning
    tuner = HyperparameterTuner(model=model, train_loader=train_loader, valid_loader=valid_loader,
                                test_loader=test_loader, device='cuda')
    tuner.tune_hyperparameters()

NameError: name 'DataPreprocessor' is not defined