In [2]:
%load_ext autoreload
%autoreload 2
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import transformers as ts
from torch.utils.data import Dataset,DataLoader

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
def load_data(path,train=True):
    if train:
        name='train.conll'
    else:
        name='dev.conll'
    texts=[]
    labels=[]
    with open(path+'/'+name) as file:
        for line in file:
            if line == '' or line == '\n':
                if texts:
                    yield{
                        'text':texts,
                        'label':labels
                    }
                    texts=[]
                    labels=[]
            else:
                sprilts=line.split()
                texts.append(sprilts[0])
                labels.append(sprilts[1])
        if texts:
            yield{
                'text':texts,
                'label':labels
            }
    file.close()

def get_entities(text,label):
    entities=[]
    cur_entities={}
    for t,l in zip(text,label):
        if l[0] in 'BOS' and cur_entities:
            entities.append(cur_entities)
            cur_entities={}
        if l[0] in 'BS':
            cur_entities={
                'text':t,
                'entities':[l[2:]]
            }
        elif l[0] in 'IE':
            cur_entities['text']+=t
            cur_entities['entities'].append(l[2:])
    if cur_entities:
        entities.append(cur_entities)
    return entities

def makedata(data):
    sentences=[]
    labels=[]
    for _,d in enumerate(data):
        entities=get_entities(d['text'],d['label'])
        sentence=''
        label=[]
        for e in entities :
            sentence+=e['text']
            label.extend(e['entities'])
        sentences.append([sentence])
        labels.append(label)
    return {'text':sentences,'label':labels}

train_data=load_data('./datasets/')
train_data=makedata(train_data)
val_data=load_data('./datasets/',False)
val_data=makedata(val_data)
train_data['text'][0],train_data['label'][0]

(['浙江杭州市江干区九堡镇三村村一区'],
 ['prov',
  'prov',
  'city',
  'city',
  'city',
  'district',
  'district',
  'district',
  'town',
  'town',
  'town',
  'community',
  'community',
  'community',
  'poi',
  'poi'])

In [4]:
def label2int(labels):
    ldict=['O']
    for l in labels:
        for i in l:
            if i not in ldict:
                ldict.append(i)
    return ldict
ldict=label2int(train_data['label']+val_data['label'])
ldict

['O',
 'prov',
 'city',
 'district',
 'town',
 'community',
 'poi',
 'road',
 'roadno',
 'subpoi',
 'devzone',
 'houseno',
 'intersection',
 'assist',
 'cellno',
 'floorno',
 'distance',
 'village_group']

In [59]:
class Addr(Dataset):
    """docstring for Addr."""
    def __init__(self,data,tokenizer,ldict):
        self.text=[]
        for t in data['text']:
            self.text.append(list(t[0]))
        self.encodings=tokenizer(self.text,is_split_into_words=True,padding=True)
        labels=[]
        for i,l in enumerate(data['label']):
            label=[0,]
            for t in l:
                label.append(ldict.index(t))
            for _ in range(0,len(self.encodings['input_ids'][i])-len(label)):
                label.append(0)
            labels.append(label)
        self.labels=labels
        
    def __getitem__(self, idx):
        input_ids = torch.LongTensor(self.encodings['input_ids'][idx])
        attention_mask = torch.LongTensor(self.encodings['attention_mask'][idx])
        labels = torch.LongTensor(self.labels[idx])
        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

    def __len__(self):
        return len(self.text)

In [108]:
tokenizer=ts.AutoTokenizer.from_pretrained('../../models/chinese-roberta-wwm-ext/')

In [109]:
train_datasets=Addr(train_data,tokenizer,ldict)
val_datasets=Addr(val_data,tokenizer,ldict)
val_datasets[1273]

{'input_ids': tensor([ 101, 3851, 3736, 4689, 2123, 3797, 2356, 3851, 3736, 4689, 2123, 3797,
         2356, 6969, 2336, 1277, 7674, 1298, 6125, 6887, 1921, 4997, 1298, 6662,
          121,  121,  121,  121, 1384, 4384, 4413, 7213, 3805, 1814,  121,  121,
         2231, 7824, 5440, 1767, 7674, 1298, 3862, 7623, 1324,  102]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'labels': tensor([ 0,  1,  1,  1,  2,  2,  2,  1,  1,  1,  2,  2,  2,  3,  3,  3,  4,  4,
          4,  4,  7,  7,  7,  7,  8,  8,  8,  8,  8,  6,  6,  6,  6,  6, 15, 15,
         15,  9,  9,  9,  9,  9,  9,  9,  9,  0])}

In [72]:
def load_test_data(path):
    texts=[]
    with open(path+'/final_test.txt') as file:
        for line in file:
            splits=line.split('\x01')
            texts.append(list(splits[1].rsplit()[0]))
        file.close()
    return texts

class AddrTest(Dataset):
    def __init__(self,data,tokenizer):
        self.text=data
        self.encodings=tokenizer(data,is_split_into_words=True,padding=True)
    def __len__(self):
        return len(self.text)
    def __getitem__(self, idx):
        input_ids = torch.LongTensor(self.encodings['input_ids'][idx])
        attention_mask = torch.LongTensor(self.encodings['attention_mask'][idx])
        return {'input_ids': input_ids, 'attention_mask': attention_mask}
    
test_data=load_test_data('./datasets/')
test_datasets=AddrTest(test_data,tokenizer)
test_datasets[0]

{'input_ids': tensor([ 101, 3308, 7345, 1277, 2207, 1068, 1266, 7027,  121,  121,  121,  118,
          121, 1384,  102,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [112]:
classfier=ts.AutoModelForTokenClassification.from_pretrained('../../models/chinese-roberta-wwm-ext/',num_labels=len(ldict))
device=torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
classfier.to(device)

Some weights of the model checkpoint at ../../models/chinese-roberta-wwm-ext/ were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model check

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [113]:

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = (labels == preds).sum()/len(labels)
    return {
        'accuracy': acc,
    }
training_args = ts.TrainingArguments(
    output_dir='./results',         # output directory 结果输出地址
    num_train_epochs=2,             # total # of training epochs 训练总批次
    per_device_train_batch_size=64,  # batch size per device during training 训练批大小
    per_device_eval_batch_size=64,   # batch size for evaluation 评估批大小
    logging_dir='./logs/',    # directory for storing logs 日志存储位置
    learning_rate=1e-3,             # 学习率
    save_steps=False,               # 不保存检查点
    disable_tqdm=True
)
trainer=ts.Trainer(model=classfier,
                   args=training_args,
                   train_dataset=train_datasets,
                   eval_dataset=val_datasets,
                   compute_metrics=compute_metrics,
                   tokenizer=None)

In [116]:
print(training_args.device)

cuda:0


In [102]:
trainer.train()



  0%|          | 0/278 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [67]:
trainer.evaluate()

  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 1.5310684442520142,
 'eval_accuracy': 29.446192893401015,
 'eval_runtime': 2.5566,
 'eval_samples_per_second': 770.55,
 'eval_steps_per_second': 12.125,
 'epoch': 2.0}

In [73]:
preds=trainer.predict(test_datasets)

  0%|          | 0/782 [00:00<?, ?it/s]

In [99]:
import numpy as np
pp=preds.predictions[0]
pp[7],np.argmax(pp[7])

(array([ 3.6413693 ,  0.29189742,  0.4956774 ,  0.7139142 ,  0.33139053,
        -0.63918626,  1.1995587 ,  0.73807526,  0.46714488, -0.7222633 ,
        -0.48162812, -0.28190982, -3.45267   , -2.7361784 , -1.817907  ,
        -1.4205768 , -4.5757823 , -3.2565966 ], dtype=float32),
 0)