In [1]:
import warnings
import os
import pandas as pd
import logging
import sklearn
from torch.utils.data import Dataset, DataLoader
import time
import random
from transformers import BertModel,AlbertTokenizer,AutoModelForTokenClassification, TrainingArguments, Trainer,trainer_callback,DataCollatorForTokenClassification
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support,classification_report
import json
import gc
import sys
import re
from seqeval.metrics import accuracy_score, classification_report
warnings.filterwarnings('ignore')
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
def ClassEncoder(Class):
    if Class == 'Capacitors':
        return 1
    elif Class == 'Resistors':
        return 2
    else:
        return 0
    
def ClassDecoder(Class):
    if Class == 1:
        return 'Capacitors'
    elif Class == 2:
        return 'Resistors'
    else:
        return 'Others'
        
def attrEncoder(all_class_list,item_class,attr):
    if item_class in all_class_list and attr in all_class_list[item_class]:
        if item_class == 'Capacitors':
            return all_class_list[item_class][attr]
        if item_class == 'Resistors':
            return all_class_list[item_class][attr]-10
    if attr == '-100':
        return -100
    return 0

def attrDecoder(item_class,attr):
    if attr==0 or attr==-100:
        return 'others'
    else:
        if item_class == 'Resistors':
            attr+=10
        global all_attrs_dict
        return all_attrs_dict[attr]

In [3]:
class nerClass(torch.nn.Module):
    def __init__(self,config):
        super(nerClass, self).__init__()
        self.num_labels1 = config['num_labels1']
        self.num_labels2 = config['num_labels2']
        self.l1 = BertModel.from_pretrained(config['model'])
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier1 = torch.nn.Linear(128, config['num_labels1'])
        self.classifier2 = torch.nn.Linear(128, config['num_labels2'])

    def forward(self, input_ids, attention_mask=None,classes=None,device=None):
        output = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output[0]
        pooler = self.dropout(hidden_state)
        
        mask1 = torch.eq(classes,1)
        mask2 = torch.eq(classes,2)
#        print('pooler[mask1].size()',pooler[mask1].size())
#        print('pooler[mask2].size()',pooler[mask2].size())
        output1 = self.classifier1(pooler[mask1])
        output2 = self.classifier2(pooler[mask2])
#         print('output1.size()',output1.size())
#         print('output2.size()',output2.size())
    
        return output1,output2

In [6]:
def group_sub_entities(entities):
    """
    Group together the adjacent tokens with the same entity predicted.
    Args:
        entities (:obj:`dict`): The entities predicted by the pipeline.
    """
    # Get the first entity in the entity group
    entity = entities[0]["entity"]
    scores = np.sum([entity["score"] for entity in entities])
    tokens = [entity["word"] for entity in entities]

    entity_group = {
        "pred": entity,
        "score": scores,
        "word": tokenizer.convert_tokens_to_string(tokens),
        "subtoken_num":len(entities),
        "index":entities[0]["index"]
    }
    return entity_group

def group_entities(entities):
    """
    Find and group together the adjacent tokens with the same entity predicted.
    Args:
        entities (:obj:`dict`): The entities predicted by the pipeline.
    """

    entity_groups = []
    entity_group_disagg = []

    if entities:
        last_idx = len(entities)-1

    for entity in entities:
        is_last_idx = entity["index"] == last_idx
        if not entity_group_disagg:
            entity_group_disagg += [entity]
            if is_last_idx:
                entity_groups += [group_sub_entities(entity_group_disagg)]
            continue

        # If the current entity is similar and adjacent to the previous entity, append it to the disaggregated entity group
        # The split is meant to account for the "B" and "I" suffixes
        if (entity["entity"] == entity_group_disagg[-1]["entity"] and entity["index"] == entity_group_disagg[-1]["index"] + 1):
            entity_group_disagg += [entity]
            # Group the entities at the last entity
            if is_last_idx:
                entity_groups += [group_sub_entities(entity_group_disagg)]
        # If the current entity is different from the previous entity, aggregate the disaggregated entity group
        else:
            entity_groups += [group_sub_entities(entity_group_disagg)]
            entity_group_disagg = [entity]
            # If it's the last entity, add it to the entity groups
            if is_last_idx:
                entity_groups += [group_sub_entities(entity_group_disagg)]

    return entity_groups

In [7]:
def judge_result(entities,input_tokens):
    #print(entities)
    initial_pred=[]
    for item in entities:
        initial_pred.append(item['pred'])
    
    if len(input_tokens)==len(initial_pred):
        #return dict(zip(initial_pred, input_tokens))
        return initial_pred, input_tokens, entities
#     else:
#         return None,None,None

    if len(input_tokens)>len(initial_pred):
        pad_num=len(input_tokens)-len(initial_pred)
        for i in range(pad_num):
            initial_pred.append('others')
        print('case1')
        print('entities',entities)
        print('input_tokens',input_tokens)
        print('initial_pred',initial_pred)
        #return dict(zip(initial_pred, input_tokens))
        return initial_pred, input_tokens,entities
#     else:
#         return None,None,None
    
    #预测的结果多于输入的token
    if len(input_tokens)<len(initial_pred):
        print('case2')
        del_num= len(initial_pred)-len(input_tokens)
        entities.sort(key=lambda item: item['score'],reverse=True)
        entities = entities[:-del_num]
        entities.sort(key=lambda item: item['index'],reverse=False)
        
        final_pred=[]
        for item in entities:
            final_pred.append(item['pred'])
        return final_pred, input_tokens,entities
#     else:
#         return None,None,None

In [24]:
def predict(input_tokens,input_class):
    """
        - **word** (:obj:`str`) -- The token/word classified.
        - **score** (:obj:`float`) -- The corresponding probability for :obj:`entity`.
        - **entity** (:obj:`str`) -- The entity predicted for that token/word.
        - **index** (:obj:`int`, only present when ``self.grouped_entities=False``) -- The index of the
          corresponding token in the sentence.
    """
    #for sentence in inputs:
    tokens = tokenizer(input_tokens, is_split_into_words=True,add_special_tokens=False,
                    padding=True,truncation=False,return_tensors="pt")
    tokens['classes']=torch.tensor(input_class)
    #print(tokens)
    # Forward
    with torch.no_grad():
        ids = tokens['input_ids'].to(device, dtype = torch.long)
        attention_mask = tokens['attention_mask'].to(device, dtype = torch.long)
        classes = tokens['classes'].to(device, dtype = torch.long)

        entities1,entities2= model(ids, attention_mask,classes,device)

        if input_class==1:
            print(1)
            entities1 = torch.squeeze(entities1, dim=0)
            entities1 = torch.squeeze(entities1, dim=0)
            entities = entities1.cpu().data.numpy()
        elif input_class==2:
            print(2)
            entities2 = torch.squeeze(entities2, dim=0)
            entities2 = torch.squeeze(entities2, dim=0)
            entities = entities2.cpu().data.numpy()

        input_ids = tokens["input_ids"].cpu().numpy()[0]

    score = np.exp(entities) / np.exp(entities).sum(-1, keepdims=True)
    labels_idx = score.argmax(axis=-1)
    print('score',score.shape)
    entities = []
    for idx, label_idx in enumerate(labels_idx):
        entity = {
            "word": tokenizer.convert_ids_to_tokens(int(input_ids[idx])),
            "score": score[idx][label_idx].item(),
            "entity": attrDecoder(ClassDecoder(input_class),label_idx),
            "index": idx}

        entities += [entity]
    return judge_result(group_entities(entities),input_tokens)

In [12]:
def compute(predictions, references, suffix=False):
    report = classification_report(y_true=references, y_pred=predictions, suffix=suffix, output_dict=True)
    report.pop("macro avg")
    report.pop("weighted avg")
    overall_score = report.pop("micro avg")

    scores = {
        type_name: {
            "precision": score["precision"],
            "recall": score["recall"],
            "f1": score["f1-score"],
            "number": score["support"],
        }
        for type_name, score in report.items()
    }
    scores["overall_precision"] = overall_score["precision"]
    scores["overall_recall"] = overall_score["recall"]
    scores["overall_f1"] = overall_score["f1-score"]
    scores["overall_accuracy"] = accuracy_score(y_true=references, y_pred=predictions)

    return scores

In [4]:
# label处理
cap_attrs = {'Capacitance':1,'SizeCode':2,'RatedDCVoltageURdc':3,'PositiveTolerance':4,'NegativeTolerance':5,'TemperatureCharacteristicsCode':6,'MfrPartNumber':7,'input class':8}
res_attrs = {'Resistance':11,'SizeCode':12,'WorkingVoltage':13,'Tolerance':14,'RatedPowerDissipationP':15,'MfrPartNumber':16,'input class':17}
all_class_list = {'Capacitors':cap_attrs,'Resistors':res_attrs}

cap_attrs_dict = dict(zip(cap_attrs.values(), cap_attrs.keys()))
res_attrs_dict = dict(zip(res_attrs.values(), res_attrs.keys()))
all_attrs_dict ={**cap_attrs_dict,**res_attrs_dict}

# 读取数据
with open(r'preprocess/description_with_label.json', 'r', errors='ignore', encoding='utf-8') as f:
    js = f.read()
    real_data = json.loads(js, strict=False)
#317条
real_input=[]
real_labels=[]
real_classes=[]
problem_set=[]
for i in range(len(real_data)):
    tmp_real_input=[]
    tmp_real_labels=[]
    for key,val in real_data[i].items():
        if key not in ['description','class', 'others', 'labels','Capacitance', 'RatedDCVoltageURdc', 'PositiveTolerance', 'NegativeTolerance',
                'Resistance', 'WorkingVoltage', 'RatedPowerDissipationP', 'Tolerance']:
            tmp_real_input.append(val.lower())
            if ' (User input)' in key:
                tmp_real_labels.append(key.replace(' (User input)',''))
            else:
                tmp_real_labels.append(key)
    
    if len(tmp_real_input)==len(tmp_real_labels) and len(tmp_real_input)!=0 and len(set(tmp_real_input))==len(tmp_real_labels):
        real_input.append(tmp_real_input)
        real_labels.append(tmp_real_labels)
        real_classes.append(ClassEncoder(real_data[i]['class']))
    else:
        problem_set.append(real_data[i])

# model init
model_checkpoint = r'C:\Users\coldkiller\Desktop\supplyframe\ner_prediction\model_final_albert'
tokenizer = AlbertTokenizer.from_pretrained(model_checkpoint)
model = torch.load(model_checkpoint+r'\model.bin',map_location='cpu')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

In [27]:
# get output
inputs=[]
preds=[]
labels=[]
entity_list=[]
for i in range(len(real_input)):
    tmp_preds,tmp_inputs,entity = predict(real_input[i],real_classes[i])
    if tmp_preds:
        preds.append(tmp_preds)
        inputs.append(tmp_inputs)
        entity_list.append(entity)
        labels.append(real_labels[i])
#predict(real_input[12],real_classes[12])

1
score (13, 9)
1
score (13, 9)
1
score (12, 9)
1
score (12, 9)
1
score (10, 9)
1
score (11, 9)
1
score (11, 9)
1
score (13, 9)
1
score (15, 9)
1
score (15, 9)
1
score (16, 9)
1
score (16, 9)
1
score (14, 9)
2
score (14, 8)
2
score (12, 8)
2
score (10, 8)
case2
2
score (11, 8)
2
score (11, 8)
1
score (13, 9)
1
score (13, 9)
1
score (12, 9)
1
score (12, 9)
2
score (11, 8)
1
score (12, 9)
1
score (12, 9)
1
score (11, 9)
1
score (12, 9)
1
score (12, 9)
1
score (11, 9)
1
score (11, 9)
1
score (11, 9)
1
score (10, 9)
1
score (10, 9)
1
score (10, 9)
1
score (10, 9)
2
score (7, 8)
2
score (7, 8)
2
score (7, 8)
2
score (7, 8)
2
score (8, 8)
2
score (8, 8)
2
score (8, 8)
2
score (8, 8)
2
score (10, 8)
2
score (8, 8)
2
score (8, 8)
2
score (8, 8)
2
score (8, 8)
2
score (7, 8)
2
score (7, 8)
2
score (7, 8)
2
score (6, 8)
2
score (9, 8)
2
score (6, 8)
1
score (12, 9)
1
score (13, 9)
1
score (10, 9)
1
score (12, 9)
1
score (12, 9)
1
score (12, 9)
1
score (13, 9)
1
score (10, 9)
2
score (10, 8)
2
sc

In [28]:
# evaluate
compute(predictions=preds, references=labels)

{'Capacitance': {'precision': 0.9818181818181818,
  'recall': 0.9818181818181818,
  'f1': 0.9818181818181818,
  'number': 110},
 'MfrPartNumber': {'precision': 0.8762886597938144,
  'recall': 0.8762886597938144,
  'f1': 0.8762886597938144,
  'number': 97},
 'PositiveTolerance': {'precision': 1.0,
  'recall': 0.984375,
  'f1': 0.9921259842519685,
  'number': 64},
 'RatedDCVoltageURdc': {'precision': 0.9908256880733946,
  'recall': 0.9818181818181818,
  'f1': 0.9863013698630138,
  'number': 110},
 'RatedPowerDissipationP': {'precision': 0.9801980198019802,
  'recall': 0.9801980198019802,
  'f1': 0.9801980198019802,
  'number': 101},
 'Resistance': {'precision': 0.9528795811518325,
  'recall': 0.9238578680203046,
  'f1': 0.9381443298969072,
  'number': 197},
 'SizeCode': {'precision': 0.9730639730639731,
  'recall': 0.9863481228668942,
  'f1': 0.9796610169491526,
  'number': 293},
 'TemperatureCharacteristicsCode': {'precision': 1.0,
  'recall': 0.9714285714285714,
  'f1': 0.9855072463768

In [29]:
# save result
real_res = pd.DataFrame({"description":inputs,"true_labels":labels,"predicted_labels":preds,'entity_list':entity_list})
real_res.to_csv('ner_result.csv',encoding='utf_8_sig')