In [245]:
import re
import time
import math
import random
import jieba
import jieba.analyse
import jieba.posseg
import numpy as np
filepath = r"D:\DEV\businessInfomationProject\NER\BosonNLP_NER_6C.txt"

In [246]:
#统计加入用户词典
#company_name person_name location org_name
#正常分句子，清洗，匹配tag
#训练
#predict

def to_ix_processing(pairs):
    word_to_ix = {}
    tag_to_ix= {"<START>":1,"<STOP>":2,"<UNK>":3}
    for sentence, tags in pairs:
        for word in sentence:
            if word not in word_to_ix:
                word_to_ix[word] = len(word_to_ix)
        for tag in tags:
            if tag not in tag_to_ix:
                tag_to_ix[tag] = len(tag_to_ix)+1
    return word_to_ix,tag_to_ix
    

def load_samplefile(filepath):
    with open(filepath,encoding="utf-8") as f:
        data = f.read()
        data = re.sub(r"\n\n","",data)
        data = re.sub(r"\d+","MM",data)
        for i in (["一","二","三","四","五","六","七","八","九","十"]):
            data = re.sub(i,"MM",data)   
    return data

def cut_sentence(data):
    sentLst = data.split("。")    
    return sentLst

#解析数据，输出用户字典和实体标注名称列表
def user_dict_count(data):
    regxLst = re.findall(r'\b{{[a-z].*?}}',data)
    dictWord_dict = {}
    for item in regxLst:   
        word = item[2:-2].split(":")[1]
        entity = item[2:-2].split(":")[0]
        if word not in dictWord_dict.keys():
            dictWord_dict[word] = entity
        
    del dictWord_dict['MM']
    return dictWord_dict

#添加用户字典
def add_userdict(user_dict):
    jieba.load_userdict(user_dict)
    return

def filterdata(text,entitys):
    text = text.replace("\n","").replace(" ","")
    # 去除标点符号
    punctuation = """{}➊➋➌➍➎➏➐➑➒➓+，◆。！？｡＂＃＄％＆＇\/()（）:＊＋*"－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏"""+"".join(entitys)
    re_punctuation = "[{}]+".format(punctuation)
    text = re.sub(re_punctuation, "", text)  
    return text

#返回分词结果和词性  
#分词
#词和词对应的词性返回
def dosegment_all(sentence,dictWord_dict):
    '''
    带词性标注，对句子进行分词，不排除停词等
    :param sentence:输入字符
    :return:
    '''
    sentence_seged = jieba.posseg.cut(sentence.strip())
    wordLst = []
    flagLst = []
    for x in sentence_seged:      
        wordLst.append(x.word)
        if x.word in dictWord_dict.keys():
            flagLst.append(dictWord_dict[x.word])
        else:
            flagLst.append(x.flag)    
    return [wordLst,flagLst]
    
 
def pairsPrepare(filepath):
    #读取数据
    data = load_samplefile(filepath) 
    sentLst = cut_sentence(data)
    user_dict = user_dict_count(data)
    add_userdict(list(user_dict.keys()))
    pairs = []
    entitys = list(set(user_dict.values()))
    for sent in sentLst:
        text = filterdata(sent,entitys)
        pairs.append(dosegment_all(text,user_dict))
    return pairs,entitys,user_dict

def pairs_trim(pairs,min_len = 3):
    pairs_trimed = []
    for i ,(w,t) in enumerate(pairs):
        if len(w) > min_len:
            pairs_trimed.append(pairs[i])

    return pairs_trimed


In [247]:
#pairs,entitys,user_dict = pairsPrepare(filepath)

In [248]:
#pairs = pairs_trim(pairs,min_len = 3)

In [249]:
# Author: Robert Guthrie

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.data as data

torch.manual_seed(1935)


#取向量最大值
def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return idx.item()

#将词映射成id
def prepare_sequence(seq, to_ix):   
    idxs = []
    for w in seq:
        if w not in to_ix.keys():
            w = UNK_TAG 
        idxs.append(to_ix[w])
    return torch.tensor(idxs, dtype=torch.long)


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))


class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2),
                torch.randn(2, 1, self.hidden_dim // 2))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])
        if feats.size(0) == 0:return 0
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):    
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        #print(forward_score)
        gold_score = self._score_sentence(feats, tags)

        return  forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq


In [250]:
#to_ix_w,to_ix_l = to_ix_processing(pairs)  
START_TAG = "<START>"
STOP_TAG = "<STOP>"
UNK_TAG  = "<UNK>"

EPOCHES =200
EMBEDDING_DIM = 300
HIDDEN_DIM = 500
BATCH_SIZE = 64
# Make up some training data
pairs,entitys,user_dict = pairsPrepare(filepath)
pairs = pairs_trim(pairs,min_len = 3)
to_ix_w,to_ix_l = to_ix_processing(pairs)
training_batch_data = [[random.choice(pairs[:-300]) for _ in range(BATCH_SIZE)] for _ in range(EPOCHES)]
test_batch_data = pairs[-300:]   



In [251]:
lr=0.0005
model = BiLSTM_CRF(len(to_ix_w), to_ix_l, EMBEDDING_DIM, HIDDEN_DIM)
optimizer = optim.Adam(model.parameters(), lr=lr)

In [252]:

best_val_loss = float("inf")
best_model = None


def train(model,training_batch,to_ix_w,to_ix_l):
    import time
    start_time = time.time()
    model.train() # Turn on the train mode
    total_loss = 0.
    
    for i,pair in enumerate(training_batch): 
        
        
        
        sentence, tags = pair[0],pair[1]
        #print(sentence)
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = prepare_sequence(sentence, to_ix_w)

        targets = torch.tensor([to_ix_l[t] for t in tags], dtype=torch.long)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets)
        

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()/len(sentence_in)
        
        
        log_interval = 20
        if i % log_interval == 0 and i != 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | '
                  'lr {:02.6f} | ms/batch {:5.2f} | '
                  'loss {:5.5f} |'.format(
                    epoch, lr,
                    elapsed * 1000 / log_interval,
                    cur_loss))
            total_loss = 0
            start_time = time.time()
            
            
def evaluate(eval_model, data_source,to_ix_w, to_ix_l):
    eval_model.eval()  # Turn on the evaluation mode
    total_loss = 0.
    with torch.no_grad():
        for x_k, y_k in data_source:
            sentence_k = prepare_sequence(x_k, to_ix_w)
            targets_k = torch.tensor([to_ix_l[t] for t in y_k], dtype=torch.long)
            loss_test = eval_model.neg_log_likelihood(sentence_k, targets_k)
            total_loss += float(loss_test)/len(sentence_k)
    return total_loss

    

In [253]:

# Make sure prepare_sequence from earlier in the LSTM section is loaded
for epoch in range(EPOCHES):  # again, normally you would NOT do 300 epochs, it is toy data    
    training_batch = training_batch_data[epoch]
    epoch_start_time = time.time()
    train(model,training_batch,to_ix_w,to_ix_l)
    val_loss = evaluate(model, test_batch_data,to_ix_w,to_ix_l)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '.format(epoch, (time.time() - epoch_start_time),
                                     val_loss))
    print('-' * 89)
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model
    optimizer.step()
    

| epoch   0 | lr 0.000500 | ms/batch 327.45 | loss 4.41432 |
| epoch   0 | lr 0.000500 | ms/batch 331.72 | loss 3.98526 |
| epoch   0 | lr 0.000500 | ms/batch 453.04 | loss 3.33100 |
-----------------------------------------------------------------------------------------
| end of epoch   0 | time: 42.55s | valid loss 882.30 | 
-----------------------------------------------------------------------------------------
| epoch   1 | lr 0.000500 | ms/batch 367.47 | loss 3.12512 |
| epoch   1 | lr 0.000500 | ms/batch 336.85 | loss 2.82217 |
| epoch   1 | lr 0.000500 | ms/batch 350.61 | loss 2.73320 |
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 41.59s | valid loss 769.27 | 
-----------------------------------------------------------------------------------------
| epoch   2 | lr 0.000500 | ms/batch 477.02 | loss 2.75770 |
| epoch   2 | lr 0.000500 | ms/batch 364.18 | loss 2.53614 |
| epoch   2 | lr 0.000500 | ms/batch 3

| epoch  20 | lr 0.000500 | ms/batch 372.60 | loss 1.41039 |
| epoch  20 | lr 0.000500 | ms/batch 290.72 | loss 1.36266 |
| epoch  20 | lr 0.000500 | ms/batch 319.10 | loss 1.42403 |
-----------------------------------------------------------------------------------------
| end of epoch  20 | time: 40.97s | valid loss 421.33 | 
-----------------------------------------------------------------------------------------
| epoch  21 | lr 0.000500 | ms/batch 510.36 | loss 1.31350 |
| epoch  21 | lr 0.000500 | ms/batch 340.54 | loss 1.46281 |
| epoch  21 | lr 0.000500 | ms/batch 434.46 | loss 1.35527 |
-----------------------------------------------------------------------------------------
| end of epoch  21 | time: 46.11s | valid loss 416.02 | 
-----------------------------------------------------------------------------------------
| epoch  22 | lr 0.000500 | ms/batch 384.72 | loss 1.49031 |
| epoch  22 | lr 0.000500 | ms/batch 389.01 | loss 1.12445 |
| epoch  22 | lr 0.000500 | ms/batch 3

| epoch  40 | lr 0.000500 | ms/batch 400.18 | loss 0.95975 |
| epoch  40 | lr 0.000500 | ms/batch 317.40 | loss 0.95380 |
| epoch  40 | lr 0.000500 | ms/batch 365.32 | loss 0.84854 |
-----------------------------------------------------------------------------------------
| end of epoch  40 | time: 41.88s | valid loss 331.15 | 
-----------------------------------------------------------------------------------------
| epoch  41 | lr 0.000500 | ms/batch 346.72 | loss 1.09421 |
| epoch  41 | lr 0.000500 | ms/batch 313.16 | loss 1.16849 |
| epoch  41 | lr 0.000500 | ms/batch 389.63 | loss 0.79530 |
-----------------------------------------------------------------------------------------
| end of epoch  41 | time: 40.63s | valid loss 330.49 | 
-----------------------------------------------------------------------------------------
| epoch  42 | lr 0.000500 | ms/batch 369.16 | loss 1.04238 |
| epoch  42 | lr 0.000500 | ms/batch 397.24 | loss 0.96892 |
| epoch  42 | lr 0.000500 | ms/batch 3

| epoch  60 | lr 0.000500 | ms/batch 410.65 | loss 0.69811 |
| epoch  60 | lr 0.000500 | ms/batch 466.55 | loss 0.80276 |
| epoch  60 | lr 0.000500 | ms/batch 368.76 | loss 0.82386 |
-----------------------------------------------------------------------------------------
| end of epoch  60 | time: 45.00s | valid loss 279.04 | 
-----------------------------------------------------------------------------------------
| epoch  61 | lr 0.000500 | ms/batch 429.10 | loss 0.77614 |
| epoch  61 | lr 0.000500 | ms/batch 414.49 | loss 0.73145 |
| epoch  61 | lr 0.000500 | ms/batch 422.92 | loss 0.62094 |
-----------------------------------------------------------------------------------------
| end of epoch  61 | time: 45.14s | valid loss 276.79 | 
-----------------------------------------------------------------------------------------
| epoch  62 | lr 0.000500 | ms/batch 443.41 | loss 0.84205 |
| epoch  62 | lr 0.000500 | ms/batch 502.21 | loss 0.75246 |
| epoch  62 | lr 0.000500 | ms/batch 4

| epoch  80 | lr 0.000500 | ms/batch 383.62 | loss 0.57594 |
| epoch  80 | lr 0.000500 | ms/batch 402.67 | loss 0.73664 |
| epoch  80 | lr 0.000500 | ms/batch 382.68 | loss 0.78619 |
-----------------------------------------------------------------------------------------
| end of epoch  80 | time: 43.41s | valid loss 246.16 | 
-----------------------------------------------------------------------------------------
| epoch  81 | lr 0.000500 | ms/batch 448.30 | loss 0.53830 |
| epoch  81 | lr 0.000500 | ms/batch 404.02 | loss 0.59424 |
| epoch  81 | lr 0.000500 | ms/batch 386.47 | loss 0.79046 |
-----------------------------------------------------------------------------------------
| end of epoch  81 | time: 44.47s | valid loss 245.90 | 
-----------------------------------------------------------------------------------------
| epoch  82 | lr 0.000500 | ms/batch 415.44 | loss 0.70662 |
| epoch  82 | lr 0.000500 | ms/batch 435.78 | loss 0.63210 |
| epoch  82 | lr 0.000500 | ms/batch 4

| epoch 100 | lr 0.000500 | ms/batch 460.87 | loss 0.45511 |
| epoch 100 | lr 0.000500 | ms/batch 385.92 | loss 0.53882 |
| epoch 100 | lr 0.000500 | ms/batch 401.28 | loss 0.74621 |
-----------------------------------------------------------------------------------------
| end of epoch 100 | time: 44.77s | valid loss 226.56 | 
-----------------------------------------------------------------------------------------
| epoch 101 | lr 0.000500 | ms/batch 440.07 | loss 0.57099 |
| epoch 101 | lr 0.000500 | ms/batch 440.57 | loss 0.72194 |
| epoch 101 | lr 0.000500 | ms/batch 440.57 | loss 0.65714 |
-----------------------------------------------------------------------------------------
| end of epoch 101 | time: 46.16s | valid loss 224.49 | 
-----------------------------------------------------------------------------------------
| epoch 102 | lr 0.000500 | ms/batch 372.05 | loss 0.54069 |
| epoch 102 | lr 0.000500 | ms/batch 429.80 | loss 0.54287 |
| epoch 102 | lr 0.000500 | ms/batch 4

| epoch 120 | lr 0.000500 | ms/batch 396.44 | loss 0.63606 |
| epoch 120 | lr 0.000500 | ms/batch 434.04 | loss 0.56071 |
| epoch 120 | lr 0.000500 | ms/batch 438.03 | loss 0.54617 |
-----------------------------------------------------------------------------------------
| end of epoch 120 | time: 45.82s | valid loss 206.69 | 
-----------------------------------------------------------------------------------------
| epoch 121 | lr 0.000500 | ms/batch 412.85 | loss 0.46705 |
| epoch 121 | lr 0.000500 | ms/batch 495.92 | loss 0.34813 |
| epoch 121 | lr 0.000500 | ms/batch 440.92 | loss 0.60096 |
-----------------------------------------------------------------------------------------
| end of epoch 121 | time: 47.17s | valid loss 202.08 | 
-----------------------------------------------------------------------------------------
| epoch 122 | lr 0.000500 | ms/batch 414.59 | loss 0.55747 |
| epoch 122 | lr 0.000500 | ms/batch 594.81 | loss 0.41163 |
| epoch 122 | lr 0.000500 | ms/batch 4

| epoch 140 | lr 0.000500 | ms/batch 456.33 | loss 0.35584 |
| epoch 140 | lr 0.000500 | ms/batch 446.41 | loss 0.40410 |
| epoch 140 | lr 0.000500 | ms/batch 388.51 | loss 0.74612 |
-----------------------------------------------------------------------------------------
| end of epoch 140 | time: 46.66s | valid loss 194.06 | 
-----------------------------------------------------------------------------------------
| epoch 141 | lr 0.000500 | ms/batch 420.42 | loss 0.45145 |
| epoch 141 | lr 0.000500 | ms/batch 379.58 | loss 0.43998 |
| epoch 141 | lr 0.000500 | ms/batch 415.99 | loss 0.31830 |
-----------------------------------------------------------------------------------------
| end of epoch 141 | time: 44.16s | valid loss 193.75 | 
-----------------------------------------------------------------------------------------
| epoch 142 | lr 0.000500 | ms/batch 437.03 | loss 0.40292 |
| epoch 142 | lr 0.000500 | ms/batch 403.42 | loss 0.31630 |
| epoch 142 | lr 0.000500 | ms/batch 4

| epoch 160 | lr 0.000500 | ms/batch 377.34 | loss 0.36342 |
| epoch 160 | lr 0.000500 | ms/batch 477.67 | loss 0.35965 |
| epoch 160 | lr 0.000500 | ms/batch 450.74 | loss 0.28547 |
-----------------------------------------------------------------------------------------
| end of epoch 160 | time: 45.98s | valid loss 181.11 | 
-----------------------------------------------------------------------------------------
| epoch 161 | lr 0.000500 | ms/batch 444.16 | loss 0.25984 |
| epoch 161 | lr 0.000500 | ms/batch 408.66 | loss 0.25359 |
| epoch 161 | lr 0.000500 | ms/batch 381.53 | loss 0.17943 |
-----------------------------------------------------------------------------------------
| end of epoch 161 | time: 44.48s | valid loss 181.67 | 
-----------------------------------------------------------------------------------------
| epoch 162 | lr 0.000500 | ms/batch 475.58 | loss 0.35968 |
| epoch 162 | lr 0.000500 | ms/batch 408.81 | loss 0.37550 |
| epoch 162 | lr 0.000500 | ms/batch 4

| epoch 180 | lr 0.000500 | ms/batch 438.93 | loss 0.41120 |
| epoch 180 | lr 0.000500 | ms/batch 402.02 | loss 0.37884 |
| epoch 180 | lr 0.000500 | ms/batch 386.32 | loss 0.30962 |
-----------------------------------------------------------------------------------------
| end of epoch 180 | time: 44.68s | valid loss 180.39 | 
-----------------------------------------------------------------------------------------
| epoch 181 | lr 0.000500 | ms/batch 456.18 | loss 0.29806 |
| epoch 181 | lr 0.000500 | ms/batch 439.62 | loss 0.26197 |
| epoch 181 | lr 0.000500 | ms/batch 512.73 | loss 0.24158 |
-----------------------------------------------------------------------------------------
| end of epoch 181 | time: 47.96s | valid loss 178.99 | 
-----------------------------------------------------------------------------------------
| epoch 182 | lr 0.000500 | ms/batch 453.14 | loss 0.28396 |
| epoch 182 | lr 0.000500 | ms/batch 468.50 | loss 0.48154 |
| epoch 182 | lr 0.000500 | ms/batch 4

In [254]:
#存储数据
import os
torch.save(best_model.state_dict(), os.path.join(os.getcwd(),"model"))

In [255]:
#存储用户字典
with open(os.path.join(os.getcwd(),"user_dict.txt"),"w",encoding="utf-8") as ud:
    ud.write(str(list(user_dict.keys())))

In [256]:

#存储word_id表
import json
import datetime
import numpy as np

class JsonEncoder(json.JSONEncoder):

    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        elif isinstance(obj, datetime):                                 
            return obj.__str__()
        else:
            return super(MyEncoder, self).default(obj)

def save_dict(filename, dic):
    '''save dict into json file'''
    with open(filename,'w',encoding="utf-8") as json_file:
        json.dump(dic, json_file, ensure_ascii=False, cls=JsonEncoder)
        
def load_dict(filename):
    '''load dict from json file'''
    with open(filename,"r",encoding="utf-8") as json_file:
	    dic = json.load(json_file)
    return dic
    





In [257]:
save_dict(os.path.join(os.getcwd(),"word_id.txt"),to_ix_w)
save_dict(os.path.join(os.getcwd(),"tag_id.txt"),to_ix_l)
#存储最优model
#evalue
#读取数据
#

In [258]:
#predict
EMBEDDING_DIM = 300
HIDDEN_DIM = 500
to_ix_w = load_dict(os.path.join(os.getcwd(),"word_id.txt"))
to_ix_l = load_dict(os.path.join(os.getcwd(),"tag_id.txt"))
OrderedDict = torch.load(os.path.join(os.getcwd(),"model"))
len_ = len(OrderedDict["word_embeds.weight"])
to_l_ix = {v: k for k, v in to_ix_l.items()}
model = BiLSTM_CRF(len_, to_ix_l, EMBEDDING_DIM, HIDDEN_DIM)
model.load_state_dict(torch.load(os.path.join(os.getcwd(),"model")))
model.eval()

BiLSTM_CRF(
  (word_embeds): Embedding(32915, 300)
  (lstm): LSTM(300, 250, bidirectional=True)
  (hidden2tag): Linear(in_features=500, out_features=64, bias=True)
)

In [259]:
#读取数据，分词

In [278]:
def stopwordslist(filepath=r"D:\DEV\businessInfomationProject\NER\stopwords.txt"):  
    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]  
    return stopwords


stopwords = stopwordslist()
import codecs
PATH = r"D:\DEV\businessInfomationProject\TextRank4ZH_v2\news\newsin"
for parent, dirnames, filenames in os.walk(PATH):
    for filename in filenames:
        print("transforming.....", filename)
        file_path_in = os.path.join(parent, filename)
        file_path_out = os.path.join(parent ,r"..\\NERout\\ner_{}".format(filename))
        test_d = codecs.open(file_path_in, 'r', 'utf-8').read()
        test_d = re.sub(r"\n\n","",test_d)
        test_d = re.sub(r"\d+","MM",test_d)
        for i in (["一","二","三","四","五","六","七","八","九","十"]):
            test_d = re.sub(i,"MM",test_d) 
        test_d_Lst = cut_sentence(test_d)
        
        result = set()
        for sent in test_d_Lst:
            if len(sent) == 0 :continue
            test_d_ = filterdata(sent,[])
            test_w,_ = dosegment_all(test_d_,user_dict)
                    
             
            test_id = [to_ix_w[i] if i in to_ix_w.keys() else 3 for i in test_w]
            if len(set(test_id))<=2:continue
            test_res_ = torch.tensor(test_id,dtype=torch.long)
            predict_res = model(test_res_)[1]
            

            for ix,k in enumerate(predict_res):
                if to_l_ix[k] in ["org_name","company_name","person_name"]:
                    if test_w[ix] not in stopwords:
                        result.add(test_w[ix]+" "+to_l_ix[k])
        with codecs.open(file_path_out, 'w+', 'utf-8') as surveyp:
            surveyp.write(",\n".join(result))
            
                    
                    

transforming..... 57037793_in.txt
transforming..... 57037794_in.txt
transforming..... 57037795_in.txt
transforming..... 57037796_in.txt
transforming..... 57037797_in.txt
transforming..... 57037798_in.txt
transforming..... 57037799_in.txt
transforming..... 57037800_in.txt
transforming..... 57037801_in.txt
transforming..... 57037802_in.txt
transforming..... 57037803_in.txt
transforming..... 57037804_in.txt
transforming..... 57037805_in.txt
transforming..... 57037806_in.txt
transforming..... 57037807_in.txt
transforming..... 57037808_in.txt
transforming..... 57037809_in.txt
transforming..... 57037810_in.txt
transforming..... 57037811_in.txt
transforming..... 57037812_in.txt
transforming..... 57037813_in.txt
transforming..... 57037814_in.txt
transforming..... 57037815_in.txt
transforming..... 57037816_in.txt
transforming..... 57037817_in.txt
transforming..... 57037818_in.txt
transforming..... 57037819_in.txt
transforming..... 57037820_in.txt
transforming..... 57037821_in.txt
transforming..

In [275]:
set(test_w_)

{'MM天',
 '《',
 '东方',
 '为',
 '为期',
 '之',
 '交响音乐会',
 '从',
 '以及',
 '众多',
 '到',
 '即墨',
 '国乐',
 '国际',
 '在',
 '多部',
 '大型',
 '大师',
 '师生',
 '带来',
 '期间',
 '汇聚',
 '海内外',
 '海洋',
 '的',
 '知名',
 '等',
 '红旗',
 '西方',
 '观众',
 '阎师',
 '阎维文',
 '青岛',
 '音',
 '音乐会',
 '音乐季',
 '音乐家',
 '颂',
 '高山流水',
 '高徒'}

In [146]:
print(len(training_batch))
for i,v in training_batch:
    print(len(i) == len(v))

64
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


torch.Size([64, 500])