In [43]:
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import time
import re
from sklearn.model_selection import train_test_split
import warnings
warnings.simplefilter('ignore')

In [27]:
# Location of the NER corpus (CSV) and the pretrained checkpoint to fine-tune.
file_path = "/Users/desidero/Desktop/Kodlar/NLP/NER/ner_dataset.csv"
model_name = 'bert-base-uncased'

# Pick the best available accelerator instead of assuming Apple-Silicon MPS,
# so the notebook also runs on CUDA or CPU-only machines.
_mps_backend = getattr(torch.backends, "mps", None)  # absent on old torch builds
if _mps_backend is not None and _mps_backend.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [82]:
class Preprocess(object):
    """Load the NER corpus and expose it as per-sentence word / POS / tag lists.

    Attributes set by ``__init__``:
        data: the raw DataFrame, extended with integer ``PosId`` / ``TagId``
            columns and a forward-filled ``Sentence`` column.
        pos2ids, id2pos, tags2ids, id2tag: label <-> integer id mappings.
            Real ids start at 1; id 0 is left free (``NerDataset`` uses 0 for
            padding and special tokens).
        n_pos, n_tags: number of label ids *including* the reserved id 0, i.e.
            the number of outputs a classifier head needs so that every id in
            ``0..len(labels)`` is a valid class index.
        pre_sentences, pos, tags: arrays with one list per sentence (raw words,
            POS ids, tag ids), grouped by the ``Sentence`` column.
        sentences: list of lists of cleaned words; always the same length as
            the matching ``pos`` / ``tags`` lists.
    """

    # Ordered (pattern, replacement) rules applied to lower-cased text.
    # Specific contractions come before the generic "n't" rule, and bare
    # spellings are anchored with \b so they cannot fire inside longer words
    # ("time", "give", "significant", ...).
    _CONTRACTION_RULES = tuple(
        (re.compile(pattern), replacement)
        for pattern, replacement in (
            (r"\bi'm\b", "i am"),
            (r"\bim\b", "i am"),
            (r"\bive\b", "i have"),
            (r"\bhe's\b", "he is"),
            (r"\bshe's\b", "she is"),
            (r"\bthat's\b", "that is"),
            (r"\bwhat's\b", "what is"),
            (r"\bwhere's\b", "where is"),
            (r"\bhow's\b", "how is"),
            (r"\bwon'?t\b", "will not"),
            (r"\bwon t\b", "will not"),
            (r"\bdidn'?t\b", "did not"),
            (r"\bdidn t\b", "did not"),
            (r"\bcan'?t\b", "cannot"),
            (r"\bcan t\b", "cannot"),
            (r"(?<=\w)n't\b", " not"),   # "isn't" -> "is not"
            (r"n't", "not"),             # stand-alone token "n't" -> "not"
            (r"'ll", " will"),
            (r"'ve", " have"),
            (r"'re", " are"),
            (r"'d", " would"),
        )
    )
    _PUNCTUATION = re.compile(r"[-()\"#/@:<>{}+=~|.?,!;]")

    def __init__(self, file_path):
        """Read the CSV at ``file_path`` and build all derived attributes."""
        # Only truly empty cells are treated as missing. With pandas' default
        # NA list, literal words such as "None", "NA" or "null" would be parsed
        # as NaN and lost; empty "Sentence #" cells must stay NaN so they can
        # be forward-filled in ``organise``.
        self.data = pd.read_csv(
            file_path,
            encoding="unicode_escape",
            keep_default_na=False,
            na_values=[""],
        )
        self.n_tags, self.n_pos, self.pos2ids, self.tags2ids, self.id2pos, self.id2tag = self.transform()
        self.pre_sentences, self.pos, self.tags = self.organise()
        self.sentences = self.clean()

    def correction(self):
        """Legacy manual patch for two hard-coded sentence indices.

        Not called by ``__init__``. NOTE(review): it appears to have been a
        workaround for words parsed as NaN, which the CSV reader settings now
        avoid -- confirm before removing.
        """
        self.sentences[1901][0] = 'None'
        self.sentences[1956][0] = 'None'
        self.sentences = np.vectorize(lambda x: str(x))(self.sentences)

    def transform(self):
        """Build label/id mappings and add ``PosId`` / ``TagId`` columns to ``self.data``.

        Returns:
            tuple: ``(n_tags, n_pos, pos2ids, tags2ids, id2pos, id2tag)``.
            ``n_tags`` / ``n_pos`` include the reserved id 0, so the largest
            assigned id (``len(labels)``) is a valid class index.
        """
        tag_list = self.data['Tag'].unique()
        pos_list = self.data['POS'].unique()

        # Ids start at 1; 0 stays free for padding / special tokens.
        pos2ids = {pos: idx for idx, pos in enumerate(pos_list, start=1)}
        id2pos = {idx: pos for pos, idx in pos2ids.items()}
        self.data["PosId"] = self.data["POS"].map(pos2ids)

        tags2ids = {tag: idx for idx, tag in enumerate(tag_list, start=1)}
        id2tag = {idx: tag for tag, idx in tags2ids.items()}
        self.data["TagId"] = self.data["Tag"].map(tags2ids)

        # +1 accounts for id 0; without it the highest id would be out of
        # range for a classifier with ``n_tags`` / ``n_pos`` outputs.
        n_tags = len(tag_list) + 1
        n_pos = len(pos_list) + 1

        return n_tags, n_pos, pos2ids, tags2ids, id2pos, id2tag

    def clean_text(self, text):
        """Lower-case ``text``, expand common contractions and strip punctuation.

        Args:
            text (str): a word or short piece of text.

        Returns:
            str: the normalised text. It may be empty, e.g. for a token that
            consists only of punctuation.
        """
        text = text.lower()
        for pattern, replacement in self._CONTRACTION_RULES:
            text = pattern.sub(replacement, text)
        return self._PUNCTUATION.sub("", text)

    def clean(self):
        """Clean every word of every sentence in ``self.pre_sentences``.

        Each input word yields exactly one output entry so the result stays
        aligned with ``self.pos`` / ``self.tags``: missing values become an
        empty string and other non-string values are converted with ``str``.

        Returns:
            list[list[str]]: cleaned sentences.
        """
        def normalise(word):
            if isinstance(word, str):
                return self.clean_text(word)
            if pd.isna(word):
                return ""
            return self.clean_text(str(word))

        return [[normalise(word) for word in sentence] for sentence in self.pre_sentences]

    def organise(self):
        """Group words, POS ids and tag ids by sentence.

        Renames the ``Sentence #`` column to ``Sentence`` and forward-fills it,
        because only the first row of each sentence carries the sentence label.

        Returns:
            tuple: ``(sentences, pos, tags)`` -- three object arrays holding
            one list per sentence, in the group order produced by ``groupby``.
        """
        self.data = self.data.rename(columns={'Sentence #': 'Sentence'})
        self.data["Sentence"] = self.data["Sentence"].ffill()

        by_sentence = self.data.groupby('Sentence')
        sentences = by_sentence['Word'].apply(list).values
        pos = by_sentence['PosId'].apply(list).values
        tags = by_sentence['TagId'].apply(list).values

        return sentences, pos, tags
    
# Run the whole preprocessing pipeline once; later cells read `pre.sentences`,
# `pre.pos`, `pre.tags`, `pre.n_pos` and `pre.n_tags`.
pre = Preprocess(file_path)

In [83]:
# Sanity check: show the first cleaned sentence. Punctuation-only tokens are
# reduced to empty strings by `clean_text`, hence a trailing ''.
pre.sentences[0]

['thousands',
 'of',
 'demonstrators',
 'have',
 'marched',
 'through',
 'london',
 'to',
 'protest',
 'the',
 'war',
 'in',
 'iraq',
 'and',
 'demand',
 'the',
 'withdrawal',
 'of',
 'british',
 'troops',
 'from',
 'that',
 'country',
 '']

In [106]:
class NerDataset(Dataset):
    """Torch dataset turning word-level sentences into fixed-length BERT inputs.

    Every word is tokenised on its own and its POS / tag id is repeated for
    each sub-word piece, so labels stay aligned with ``input_ids``. Sequences
    are truncated to leave room for the two special tokens and then padded to
    ``max_len``. Special-token and padding positions carry label id 0.

    Args:
        sentences: sequence of sentences, each a list of word strings.
        pos_id: sequence of per-word POS id lists, parallel to ``sentences``.
        tag_id: sequence of per-word tag id lists, parallel to ``sentences``.
        tokenizer: callable tokenizer returning a mapping with ``'input_ids'``
            and exposing ``cls_token_id`` / ``sep_token_id`` / ``pad_token_id``.
        max_len (int): length of every returned tensor (must be >= 2).
        tensor_device: device for the returned tensors. When ``None`` the
            notebook-level ``device`` variable is used.

    Raises:
        ValueError: if ``max_len`` is smaller than 2.
    """

    def __init__(self, sentences, pos_id, tag_id, tokenizer, max_len=128, tensor_device=None):
        super(NerDataset, self).__init__()
        if max_len < 2:
            raise ValueError("max_len must be at least 2 to fit the special tokens")
        self.text = sentences
        self.pos_id = pos_id
        self.tag_id = tag_id
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.tensor_device = tensor_device

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.text)

    def __getitem__(self, x):
        """Encode sentence ``x``.

        Returns:
            dict: ``input_ids``, ``pos``, ``tag``, ``attention_mask`` and
            ``token_type_ids``, each a ``torch.long`` tensor of length
            ``max_len``.
        """
        ids, pos, tag = [], [], []
        # zip() keeps word / label triples together even if a list is shorter.
        for word, word_pos, word_tag in zip(self.text[x], self.pos_id[x], self.tag_id[x]):
            pieces = self.tokenizer(word, add_special_tokens=False)['input_ids']
            ids.extend(pieces)
            # Repeat the word label for every sub-word piece (possibly zero).
            pos.extend([word_pos] * len(pieces))
            tag.extend([word_tag] * len(pieces))

        # Reserve two positions for the leading and trailing special tokens.
        room = self.max_len - 2
        ids, pos, tag = ids[:room], pos[:room], tag[:room]

        ids = [self.tokenizer.cls_token_id] + ids + [self.tokenizer.sep_token_id]
        pos = [0] + pos + [0]
        tag = [0] + tag + [0]
        # Real tokens, including the two special ones, are attended to.
        attention_mask = [1] * len(ids)

        n_pad = self.max_len - len(ids)
        ids = ids + [self.tokenizer.pad_token_id] * n_pad
        pos = pos + [0] * n_pad
        tag = tag + [0] * n_pad
        attention_mask = attention_mask + [0] * n_pad
        # Single-segment input: every position belongs to segment 0.
        token_type_ids = [0] * self.max_len

        # Fall back to the notebook-level `device` only when none was given.
        target = self.tensor_device if self.tensor_device is not None else device

        return {'input_ids': torch.tensor(ids, dtype=torch.long, device=target),
                'pos': torch.tensor(pos, dtype=torch.long, device=target),
                'tag': torch.tensor(tag, dtype=torch.long, device=target),
                'attention_mask': torch.tensor(attention_mask, dtype=torch.long, device=target),
                'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long, device=target)}

In [29]:
# Hold out 10% of the sentences for validation. A fixed random_state keeps the
# split identical across kernel restarts so results are comparable.
train_sentence, valid_sentence, train_pos, valid_pos, train_tags, valid_tags = train_test_split(
    pre.sentences, pre.pos, pre.tags, test_size=0.1, random_state=42
)

In [30]:
# Load the pretrained tokenizer that matches the checkpoint named by `model_name`.
tokenizer = BertTokenizer.from_pretrained(model_name)

In [107]:
# Training batches are reshuffled every epoch; the incomplete final batch is
# still dropped so every training step sees a full batch of 16.
train_dataset = NerDataset(train_sentence, train_pos, train_tags, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, drop_last=True)

# Validation keeps a fixed order and every example (no dropped tail batch),
# so the reported losses cover the whole validation split.
valid_dataset = NerDataset(valid_sentence, valid_pos, valid_tags, tokenizer)
valid_dataloader = DataLoader(valid_dataset, batch_size=16, shuffle=False, drop_last=False)

In [114]:
# Peek at the first training batch only and report the shape of each field.
for sample_batch in train_dataloader:
    for field in ('input_ids', 'pos', 'tag', 'attention_mask', 'token_type_ids'):
        print(sample_batch[field].size())
    break

torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])


In [None]:
class NerModel(nn.Module):
    """BERT encoder with two per-token classification heads (POS and NER tag).

    Args:
        bert_model_name (str): checkpoint name passed to
            ``BertModel.from_pretrained``.
        n_pos (int): number of output classes of the POS head.
        n_tags (int): number of output classes of the tag head.
    """

    def __init__(self, bert_model_name, n_pos, n_tags):
        super(NerModel, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.pos_dropout = nn.Dropout(0.1)
        self.tag_dropout = nn.Dropout(0.1)
        self.fc_pos = nn.Linear(self.bert.config.hidden_size, n_pos)
        self.fc_tag = nn.Linear(self.bert.config.hidden_size, n_tags)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return ``(pos_logits, tag_logits)`` for every token position.

        Each output has the encoder's ``last_hidden_state`` shape with the
        final dimension replaced by ``n_pos`` / ``n_tags`` respectively.
        """
        # Keyword arguments guard against positional-order mistakes.
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Per-token hidden states (not the pooled sentence vector).
        token_states = outputs.last_hidden_state

        # Each head has its own dropout layer.
        x_pos = self.fc_pos(self.pos_dropout(token_states))
        x_tag = self.fc_tag(self.tag_dropout(token_states))

        return x_pos, x_tag

In [None]:
# Optimisation hyper-parameters.
learning_rate = 2e-5
num_epochs = 5

# Two-headed tagger, sized from the label counts computed during preprocessing.
model = NerModel(model_name, pre.n_pos, pre.n_tags)
model = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# The scheduler is stepped once per batch, so it spans every batch of every epoch.
total_steps = num_epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

In [None]:
def loss_fn(prediction, target, masks, criterion, n_classes):
    """Compute ``criterion`` over the positions where ``masks`` equals 1.

    Args:
        prediction: logits whose elements can be viewed as ``(-1, n_classes)``.
        target: integer class ids, one per logit row after flattening.
        masks: same number of elements as ``target``; positions whose value is
            not 1 are excluded from the loss.
        criterion: loss module exposing ``ignore_index``.
        n_classes (int): size of the class dimension of ``prediction``.

    Returns:
        The value returned by ``criterion`` for the flattened inputs.
    """
    flat_mask = masks.view(-1)
    flat_target = target.view(-1)
    # Overwrite excluded positions with the id the criterion is told to skip.
    active_target = flat_target.masked_fill(flat_mask != 1, criterion.ignore_index)
    flat_logits = prediction.view(-1, n_classes)
    return criterion(flat_logits, active_target)

In [None]:
def train(model, data_loader, optimizer, scheduler, criterion):
    """Run one training epoch over ``data_loader``.

    For every batch the POS and tag losses are computed with the notebook's
    ``loss_fn``, averaged, back-propagated, and followed by one optimizer step
    and one scheduler step.

    Args:
        model: module returning ``(pos_logits, tag_logits)`` when called with
            ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        data_loader: iterable of batches (dicts with keys ``input_ids``,
            ``attention_mask``, ``token_type_ids``, ``pos`` and ``tag``).
        optimizer: optimizer updating ``model``'s parameters.
        scheduler: learning-rate scheduler stepped once per batch.
        criterion: loss module handed to ``loss_fn``.

    Returns:
        float: mean combined loss over the epoch (``nan`` if there were no
        batches).
    """
    model.train()
    running_loss = 0.0
    n_batches = 0
    for batch in data_loader:
        optimizer.zero_grad()
        pos_output, tag_output = model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            token_type_ids=batch['token_type_ids'],
        )
        # Class counts are read from the logits themselves, so they always
        # match the heads that produced them.
        pos_loss = loss_fn(pos_output, batch['pos'], batch['attention_mask'], criterion, pos_output.size(-1))
        tag_loss = loss_fn(tag_output, batch['tag'], batch['attention_mask'], criterion, tag_output.size(-1))
        loss = (pos_loss + tag_loss) / 2
        loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()
        n_batches += 1

    return running_loss / n_batches if n_batches else float('nan')

In [None]:
def evaluate(model, data_loader):
    """Compute the mean POS and tag losses of ``model`` over ``data_loader``.

    Runs in eval mode without gradient tracking. Uses the notebook-level
    ``criterion`` and ``loss_fn``.

    Args:
        model: module returning ``(pos_logits, tag_logits)`` when called with
            ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        data_loader: iterable of batches (dicts with keys ``input_ids``,
            ``attention_mask``, ``token_type_ids``, ``pos`` and ``tag``).

    Returns:
        tuple[float, float]: ``(mean_pos_loss, mean_tag_loss)`` averaged over
        batches; ``(nan, nan)`` when ``data_loader`` yields no batches.
    """
    model.eval()
    pos_loss_list = []
    tag_loss_list = []
    with torch.no_grad():
        for batch in data_loader:
            pos_output, tag_output = model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                token_type_ids=batch['token_type_ids'],
            )
            # Class counts come from the logits so they always match the heads.
            pos_loss = loss_fn(pos_output, batch['pos'], batch['attention_mask'], criterion, pos_output.size(-1))
            tag_loss = loss_fn(tag_output, batch['tag'], batch['attention_mask'], criterion, tag_output.size(-1))
            # Store plain floats rather than tensors.
            pos_loss_list.append(pos_loss.item())
            tag_loss_list.append(tag_loss.item())

    if not pos_loss_list:
        return float('nan'), float('nan')
    return sum(pos_loss_list) / len(pos_loss_list), sum(tag_loss_list) / len(tag_loss_list)

In [None]:
# One pass per epoch: train, then report validation losses and wall-clock time.
for epoch in range(num_epochs):
    epoch_started = time.time()
    print("-----------------------------------")
    print(f"Epoch {epoch + 1}")

    train(model, train_dataloader, optimizer, scheduler, criterion)
    pos_loss, tag_loss = evaluate(model, valid_dataloader)

    print("POS: ", pos_loss)
    print("TAG: ", tag_loss)
    print("-----------------")

    epoch_elapsed = time.time() - epoch_started
    print("epoch time: ", epoch_elapsed)