In [1]:
import sampling
import os
import pandas as pd
import logging
import sklearn
import time
import random
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support,classification_report
from sklearn.preprocessing import LabelEncoder
import json
from simpletransformers.ner import NERModel,NERArgs

In [2]:
config_class_features = sampling.read_data("config/config-class-features.json")
config_class_name = sampling.read_data("config/config-class-name.json")
config_classinfo = sampling.read_data("config/config-classinfo.json")
config_numeric_fields = sampling.read_data("config/config-numeric-fields.json")
config_dynamic_units = sampling.read_data("config/config-dynamic-units.json")
pair_params = sampling.read_data("config/pair-params")

In [3]:

def get_rs_list_of_all_category():
    """
    input_size: the number of items to sample
    returns the input for simpletransformer
    """
    # 读取所有catogory文件
    path = os.getcwd()
    files= os.listdir('./formatData') #得到文件夹下的所有文件名称
    rs_list = []
    
    for file in files: #遍历文件夹
        file_path = os.path.join(path, 'formatData/'+file)
        if os.path.isfile(file_path): #判断是否是文件夹，不是文件夹才打开
                rs = sampling.sampling(file,0.6)
                rs_list.append((rs,file))
                
    print('num_of_all_categories',len(rs_list))
    
    with open('running_output.txt','a') as f:
        f.write('num_of_all_categories'+str(len(rs_list))+'\n')
        
        
    return rs_list

def get_input_from_sampling(input_size: int):
    """
    input_size: the number of items to sample
    returns the input for simpletransformer
    """
    rs_list = get_rs_list_of_all_category()
    
    # input_size为1表示从每一个category抽取一个样本
    text_list = []
    label_list = []
    problematic = set()
    
    for i in range(input_size):
        for rs in rs_list:
            try:
                item=rs[0].random_sampling()
            except Exception as e:
                problematic.add(rs[1])
                continue
            # print(item)
            tmp_list = []
            for key, val in item.items():
                if key == 'class':
                    continue
                elif key == 'category':
                    label_list.append(val)
                else:
                    tmp_list.append(str(val))
            text_list.append(tmp_list)
    print('query数量：',len(text_list))
    print('label数量：',len(label_list))
    print('problematic',problematic)
    
    with open('running_output.txt','a') as f:
        f.write('query数量'+str(len(text_list))+'\n')
        f.write('label数量'+str(len(label_list))+'\n')
        f.write('problematic'+str(problematic)+'\n')
    return text_list,label_list

a,b=get_input_from_sampling(1)
print(a)
print(b)

num_of_all_categories 37
query数量： 37
label数量： 37
problematic set()
[['2/2kHz', 'BANDPASS', 'NO', '50 OHM', 'Active Filters', 'e0', '-13.0mV'], ['8/4µF', '4141', 'POLARIZED', '9/7', 'e3', 'IEC60384-4'], ['+-5', '75.5MHz', 'NO', 'MIL-STD-883', 'e4'], ['Converters', '0.015%', '12.0', '3.33µs', 'YES', 'NOT SPECIFIED', '-2.5', '9.57mm', 'DIP'], ['5.0V', 'BIPOLAR', '12.0', 'CHIP CARRIER', 'LDCC28,.5SQ'], ['+-6', '5.0uV', 'VOLTAGE-MODE', 'CAN ALSO OPERATE FROM A 15V NOMINAL SUPPLY', '260', 'L BEND'], ['1.0', 'BIDIRECTIONAL', 'OPEN-COLLECTOR', 'PLASTIC/EPOXY', '3.375kV', 'Tin/Lead (Sn/Pb)'], ['2.161e-05µF', '9/7%', '33.0kV', '10/1', 'SMT', 'BUSSED C NETWORK'], ['232.5ohm', '1/9%', 'THIN FILM', '75.0mV', 'GULL WING'], ['连接辅助', 'PROTECTIVE COVER', 'FEMALE', '362.5', 'COAX', 'NO', 'NO', '75.0', '175.0°C', '-65.0°C'], ['BIPOLAR', '3.3V', 'STS-1/OC-1', '51840.0', 'J BEND', '1', 'SONET;SDH', '85.0°C', '-40.0°C'], ['22.25', '0.905%', 'HTSSOP', '85.0', 'Consumer Circuits', 'INDUSTRIAL', 'Not Qualified

In [4]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

In [5]:
def input_process(config_classinfo,model_checkpoint,tokenizer,train_size,test_size):
    """
    """
    files= os.listdir('./inputData')
    gpu_available = torch.cuda.is_available()
    print('torch.cuda.is_available()',gpu_available)
    
    with open('running_output.txt','w') as f:
        f.write(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())+'\n')
        f.write('torch.cuda.is_available():'+str(gpu_available)+'\n')
    
    #准备训练数据
    print('train dataset\n')
    if 'train_encodings.pt' in files and 'train_labels.csv' in files:
        print('read train dataset\n')
        train_encodings = torch.load('./inputData/train_encodings.pt')
        train_labels = pd.read_csv('./inputData/train_labels.csv',index_col=0)
        train_labels = train_labels['0'].to_list()
    else:
        print('generate train dataset\n')
        train_dataset,train_labels = get_input_from_sampling(train_size)
        train_encodings = tokenizer(train_dataset, is_split_into_words=True,add_special_tokens=False,
                        padding=True,truncation=False,return_tensors="pt")
        torch.save(train_encodings, './inputData/train_encodings.pt')
        pd.DataFrame(train_labels).to_csv('./inputData/train_labels.csv')
        print('train dataset saved')
        
    print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
    
    print('test dataset')
    if 'test_encodings.pt' in files and 'test_labels.csv' in files:
        print('read test dataset\n')
        test_encodings = torch.load('./inputData/test_encodings.pt')
        test_labels = pd.read_csv('./inputData/test_labels.csv',index_col=0)
        test_labels = test_labels['0'].to_list()
    else:
        print('generate test dataset\n')
        test_dataset,test_labels = get_input_from_sampling(test_size)
        test_encodings = tokenizer(test_dataset, is_split_into_words=True,add_special_tokens=False,
                        padding=True,truncation=False,return_tensors="pt")
        torch.save(test_encodings, './inputData/test_encodings.pt')
        pd.DataFrame(test_labels).to_csv('./inputData/test_labels.csv')
        print('test dataset saved\n')

    # encode the labels
    categories = []
#     for val in config_classinfo.values():
#         categories.extend(val)
    le = LabelEncoder()
    categories = set(train_labels)|set(test_labels)
    le.fit(list(categories))
    label_cnt = le.classes_.shape[0]

    train_labels_encoded = list(le.transform(train_labels))
    print(len(train_labels_encoded))

    test_labels_encoded = list(le.transform(test_labels))
    print(len(test_labels_encoded))
    
    with open('running_output.txt','a') as f:
        f.write('category数量'+str(label_cnt)+'\n')
    
    return le,label_cnt,train_encodings,test_encodings,train_labels_encoded,test_labels_encoded


In [6]:

class processDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)



In [7]:
def getClass(dict,cat):
    for key, val in dict.items():
        if cat in val:
            return key
    return 'unknown_class'

In [8]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='micro')
    acc = accuracy_score(labels, preds)
    
    global le,config_classinfo
    cat_labels = list(le.inverse_transform(labels))
    # print(preds)
    cat_preds = list(le.inverse_transform(preds))
    cat_report = classification_report(cat_labels, cat_preds,output_dict=True)
    cat_report2 = classification_report(cat_labels, cat_preds)
    cat_result = {'accuracy': acc,
                'f1': f1,
                'precision': precision,
                'recall': recall}            
    #需要继续改
    class_preds = [getClass(config_classinfo,item) for item in cat_preds]
    class_labels = [getClass(config_classinfo,item) for item in cat_labels]
    class_report = classification_report(class_labels, class_preds,output_dict=True)
    class_report2 = classification_report(class_labels, class_preds)
    class_result = {'accuracy': acc,
                'f1': f1,
                'precision': precision,
                'recall': recall}
    
    
    with open('cat_classification_report.txt','w') as f: 
        js = json.dumps(cat_report) 
        f.write(js)
    with open('cat_classification_report_string_format.txt','w') as f: 
        f.write(cat_report2)
    with open('running_output.txt','a') as f:
        f.write('cat_result:'+str(cat_result)+'\n')
    
    with open('class_classification_report.txt','w') as f: 
        js = json.dumps(class_report) 
        f.write(js)
    with open('class_classification_report_string_format.txt','w') as f: 
        f.write(class_report2)
    with open('class_running_output.txt','w') as f:
        f.write('class_result:'+str(class_result)+'\n')

    return class_result

In [9]:
if __name__=="__main__":

    set_seed(1024)
    model_checkpoint = "albert-base-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    le,label_cnt,train_encodings,test_encodings,train_labels_encoded,test_labels_encoded = input_process(config_classinfo,
                                                                                                         model_checkpoint,tokenizer,10000,2000)

    print('input_process_finished\n')
    train_dataset = processDataset(train_encodings, train_labels_encoded)
    test_dataset = processDataset(test_encodings, test_labels_encoded)

    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=label_cnt)

    training_args = TrainingArguments(
        output_dir='./results',
        learning_rate=5e-3,
        weight_decay=0.01,
        adam_beta1=0.9,
        adam_beta2=0.999,
        adam_epsilon=1e-8,
        num_train_epochs=10,
        logging_steps=150000,
        save_total_limit=5,
        no_cuda=False,
        seed=1024,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        warmup_steps=500,
        logging_dir='./logs',
        load_best_model_at_end=True,
        disable_tqdm=False
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=test_dataset
    )

    # Train the model
    print('Training begins\n')
    start = time.time()
    trainer.train()
    end = time.time()
    print(f"training time: {end - start}")

    train_result = trainer.evaluate(train_dataset)
    print(train_result)
    test_result = trainer.evaluate(test_dataset)
    print(test_result)

    trainer.save_model()
    with open('running_output.txt','a') as f:
        f.write(f"training time: {end - start}"+'\n')
        f.write('train_dataset'+str(train_result)+'\n')        
        f.write('test_dataset'+str(test_result)+'\n')

train dataset
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
test dataset
37
37


Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You sho

Step,Training Loss,Validation Loss


training time: 43.221421003341675


  _warn_prf(average, modifier, msg_start, len(result))


cat_preds ['Cavity Oscillators', 'Cavity Oscillators', 'Cavity Oscillators', 'Cavity Oscillators', 'Cavity Oscillators', 'Cavity Oscillators', 'Cavity Oscillators', 'Cavity Oscillators', 'Cavity Oscillators', 'Cavity Oscillators']
cat_labels ['Active Filters', 'Aluminum Electrolytic Capacitors', 'Analog Computational Functions', 'Analog to Digital Converters', 'Analog Transmission Interfaces', 'Analog Waveform Generation Functions', 'Arithmetic Circuits', 'Array/Network Capacitors', 'Array/Network Resistors', 'Assembly Items']
class_preds ['Oscillators', 'Oscillators', 'Oscillators', 'Oscillators', 'Oscillators', 'Oscillators', 'Oscillators', 'Oscillators', 'Oscillators', 'Oscillators']
class_labels ['Filters', 'Capacitors', 'Signal Circuits', 'Converters', 'Telecommunication Circuits', 'Signal Circuits', 'Logic', 'Capacitors', 'Resistors', 'Connector Support']
{'eval_loss': 3.6519815921783447, 'eval_accuracy': 0.02702702702702703, 'eval_f1': 0.02702702702702703, 'eval_precision': 0.02

In [10]:
# def show_prediction_result(model):
#     """
#     model: the trained model
#     returns the predicted result
#     """
#     rs_list = get_rs_list_of_all_category()
#     rs = random.choice(rs_list)
#     item = rs.random_sampling()
#     truth_result = pd.DataFrame.from_dict(item,orient='index').reset_index()
#     truth_result.columns = ['output_truth','input']
#     truth_result['input'] = truth_result['input']
#     print(truth_result)
#     input_sentence = list(item.values())
#     input_sentence = [str(x) for x in input_sentence]
#     predictions, raw_outputs = model.predict([input_sentence],split_on_space= False )
    
#     pred_dict={}
#     for item in predictions[0]:
#         pred_dict.update(item)
#     pred_result = pd.DataFrame.from_dict(pred_dict,orient='index').reset_index()
#     pred_result.columns = ['input','output_pred']
#     print(pred_result)
#     show_result = pd.merge(pred_result,truth_result,how='right',on='input')
#     return show_result
# show_prediction_result(model)

In [11]:
# import json

# with open('classification_report.txt', 'r') as f:
#     js = f.read()
#     dic = json.loads(js)   
#     print(dic) 
