<a href="https://colab.research.google.com/github/Chuck2Win/NER/blob/main/NER_BERT_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install transformers
from google.colab import drive
drive.mount('/content/gdrive/')
import os
os.chdir('./gdrive/My Drive/ner')
import re
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from sklearn.metrics import confusion_matrix, classification_report
from transformers import BertTokenizer
from transformers import BertModel
import pandas as pd
import numpy as np

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 5.7MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 33.9MB/s 
[?25hCollecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 49.7MB/s 
Installing c

In [68]:
class preprocessing(object):
    def __init__(self, max_length = 64):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased',do_lower_case = False)
        self.label2idx = {i:_ for _,i in enumerate(["O", "B-MISC", "I-MISC",  "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"])}
        self.max_length = max_length    
    
    def get_labels(self):
        return self.label2idx
 
    def read_file(self):
        f = open('./train.txt','r')
        tagged_sentences = []
        sentences = []
        for i in f:
            if i.startswith('-DOCSTART') or i=='\n': 
                if len(sentences)>0:
                    tagged_sentences.append(sentences)
                    sentences = []
                continue
            x=i.split()
            x = [re.sub('\n','',j) for j in x] # 단어, 품사태그, 청크 태크, 개체명 태크
            sentences.append([x[0],x[-1]]) # 단어, 개체명 태그
        ner_tags=[]
        words = []
        for sentence in tagged_sentences:
            word, tag = zip(*sentence)
            words.append(list(word))
            ner_tags.append(list(tag))
        #return words, ner_tags
        # ner tag를 idx로 변환
        idx_tags = []
        for i in ner_tags:
            t = []
            for j in i:
                t.append(self.label2idx[j])
            idx_tags.append(t)
        # return idx_tags
        self.words = words
        self.ner_tags = idx_tags
        # sub word token화 시키고, ner tag를 extend시킴('X'추가)
        tokenized_words = []
        extend_ner_tags = []
        subword_first_tags = []
        for sentence,ner_tag in zip(self.words, self.ner_tags):
            t = []
            ts = []
            sf = []
            for word,tag in zip(sentence,ner_tag):
                o = self.tokenizer.tokenize(word)
                t.extend(o)
                ts.extend([tag]*(len(o))) 
                sf.extend([1]+[0]*(len(o)-1))
            tokenized_words.append(t)
            extend_ner_tags.append(ts)
            subword_first_tags.append(sf)
        self.data = pd.DataFrame()
        self.data['words'] = tokenized_words
        self.data['labels'] = extend_ner_tags
        self.data['labels_mask'] = subword_first_tags
        # truncation
        self.data['labels'] = self.data['labels'].apply(lambda i : i[:self.max_length-2]) 
        self.data['labels_mask'] = self.data['labels_mask'].apply(lambda i : i[:self.max_length-2]) # first subword만 1이고 나머진 0
        # pad
        self.data['labels'] = self.data['labels'].apply(lambda i : [-1]+i+[-1]*(self.max_length-len(i)-1)) # [CLS],[SEP],[PAD]의 위치에는 -1을 기록
        self.data['labels_mask'] = self.data['labels_mask'].apply(lambda i : [0]+i+[0]*(self.max_length-len(i)-1)) # first subword만 1 나머진 0
        self.data['ids'] = self.data['words'].apply(lambda  i : self.tokenizer.encode(i,padding = 'max_length', max_length = self.max_length, truncation = True))

      # assert np.array(self.data.tags.tolist()).shape[1]==self.max_length
    
    def make_data_loader(self, batch_size = 32):
        # ids
        # attention mask
        # segment ids
        # labels
        
        ids = torch.LongTensor(self.data.ids.tolist())
        # bert model에선 mask할 곳이 False 안할 곳이 True
        attention_masks = ids.eq(self.tokenizer.pad_token_id)
        attention_masks = (attention_masks==False).long() 
        # segment가 다 0이므로
        token_type_ids = torch.zeros_like(ids)

        # label 관련
        labels = torch.LongTensor(self.data.labels.tolist())
        labels_mask = torch.LongTensor(self.data.labels_mask.tolist())
        
        dataset = TensorDataset(ids,attention_masks,token_type_ids,labels,labels_mask)
        data_loader = DataLoader(dataset,batch_size = batch_size, shuffle = True)
        return data_loader

In [69]:
p = preprocessing()
p.read_file()
data = p.data
data_loader = p.make_data_loader()

In [66]:
batch = next(iter(data_loader))

In [67]:
batch = batch.to('cuda')

AttributeError: ignored

In [41]:
l=batch['labels']
m=batch['labels_mask']

# Bert token classification 
pooling layer를 통과하지 않은 final hidden layer를 사용할 생각임.  
물론 실제 논문에서는 마지막 4개 layer를 concat한 경우가 가장 성능이 좋았음.(Feature based approach에서)  
나는 Fine tunning 방식으로 하고, Last hidden layer만을 활용할 것이다.(pooling layer를 통과하지 않고)  

In [45]:
class my_model(nn.Module):
    def __init__(self,bert,n_labels):
        super().__init__()
        self.bert = bert
        self.linear = nn.Linear(768,n_labels)
        self.droput = nn.Dropout(0.1)
    def forward(self, ids, attention_mask, token_type_ids):
        output = self.bert.forward(input_ids = ids, attention_mask = attention_mask, token_type_ids= token_type_ids)
        output = output.last_hidden_state
        out = self.linear.forward(self.droput.forward((output)))
        return out        

In [80]:
epochs  = 10
bert = BertModel.from_pretrained('bert-base-cased',add_pooling_layer = False)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = my_model(bert,9).to(device)
optimizer = torch.optim.Adam(model.parameters(),lr= 1e-5)
criterion = nn.CrossEntropyLoss(reduction='sum')


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'bert.pooler.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'bert.pooler.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [81]:
for epoch in tqdm(range(1,epochs+1),mininterval=60):
    model.train()
    check = []
    total_loss = 0
    predicted = []
    actual = []
    l = 0
    for data in data_loader:
        optimizer.zero_grad()
        data = tuple(i.to(device) for i in data)
        out = model.forward(data[0],data[1],data[2])
        out = out[data[-1]==1]
        labels = data[3][data[-1]==1]
        l+=len(labels)
        loss = criterion(out,labels)
        loss.backward()
        optimizer.step()
        predicted.extend(out.argmax(-1).reshape(-1).cpu().tolist())
        actual.extend(labels.reshape(-1).cpu().tolist())
        check.append(loss.item())
        total_loss+=loss.item()
    total_loss=total_loss/l
    # print(total_loss)
    if epoch % 5==0:
        print(total_loss)
        print(classification_report(actual,predicted))

 40%|████      | 4/10 [05:53<08:50, 88.43s/it]

0.013311877141907425


 50%|█████     | 5/10 [07:22<07:22, 88.49s/it]

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    169286
           1       0.97      0.97      0.97      3435
           2       0.96      0.96      0.96      1155
           3       0.99      0.99      0.99      6580
           4       0.99      1.00      1.00      4506
           5       0.98      0.98      0.98      6312
           6       0.98      0.99      0.99      3697
           7       0.99      0.99      0.99      7132
           8       0.99      0.98      0.98      1157

    accuracy                           1.00    203260
   macro avg       0.98      0.98      0.98    203260
weighted avg       1.00      1.00      1.00    203260



 90%|█████████ | 9/10 [13:15<01:28, 88.41s/it]

0.004075519219048202


100%|██████████| 10/10 [14:44<00:00, 88.45s/it]

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    169286
           1       0.99      0.99      0.99      3435
           2       0.99      0.99      0.99      1155
           3       1.00      1.00      1.00      6580
           4       1.00      1.00      1.00      4506
           5       0.99      0.99      0.99      6312
           6       1.00      1.00      1.00      3697
           7       1.00      1.00      1.00      7132
           8       1.00      1.00      1.00      1157

    accuracy                           1.00    203260
   macro avg       0.99      0.99      0.99    203260
weighted avg       1.00      1.00      1.00    203260






In [121]:
with torch.no_grad():
    model.eval()
    predicted = []
    actual = []
    for data in data_loader:
        #optimizer.zero_grad()
        input_ids = data['input_ids'].to('cuda')
        attention_mask = data['attention_mask'].to('cuda')
        token_type_ids = data['segment_ids'].to('cuda')
        labels = data['labels'].to('cuda')
        out = model.forward(input_ids,attention_mask, token_type_ids)
        loss = criterion(out.transpose(1,2),labels)
        #loss.backward()
        #optimizer.step()
        predicted.extend(out.argmax(-1).reshape(-1).cpu().tolist())
        actual.extend(labels.reshape(-1).cpu().tolist())
        #check.append(loss.item())
        #total_loss+=loss.item()