In [1]:
import sampling
import os
import pandas as pd
import logging
import sklearn
import time
import random
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support,classification_report
from sklearn.preprocessing import LabelEncoder
import json
from simpletransformers.ner import NERModel,NERArgs
import gc

In [2]:
# Load the project configuration through the project-local ``sampling`` helper.
# NOTE(review): the exact structure returned by ``sampling.read_data`` is not
# visible in this notebook; of these objects only ``config_classinfo`` is used
# below, where its keys are treated as class names.
config_class_features = sampling.read_data("config/config-class-features.json")
config_class_name = sampling.read_data("config/config-class-name.json")
config_classinfo = sampling.read_data("config/config-classinfo.json")
config_numeric_fields = sampling.read_data("config/config-numeric-fields.json")
config_dynamic_units = sampling.read_data("config/config-dynamic-units.json")
# NOTE(review): unlike the other config paths this one has no ".json" suffix - confirm it is intended.
pair_params = sampling.read_data("config/pair-params")
# ``class_dict`` maps a class name to its integer label id. It is read by
# ClassEncoder / ClassDecoder and is the file written by save_class_mapping(),
# so 'class_dict.txt' must already exist when this cell runs.
with open('class_dict.txt', 'r') as f:
    js = f.read()
    class_dict = json.loads(js)

In [3]:
# Original note: 19 Resistors files, 12 Capacitors files, 374 files in total.
def get_rs_list_of_all_category():
    """Create the sampler objects for every category file in ``./formatData``.

    Each regular file in ``./formatData`` yields one or more
    ``sampling.sampling(file_name, 0.6)`` objects: files whose name contains
    ``'Resistors'`` yield 9, files whose name contains ``'Capacitors'`` yield
    14, and every other file yields 1.  Sub-directories are skipped.

    NOTE(review): the meaning of the ``0.6`` argument and the reason for the
    9 / 14 repetition counts are not visible here (presumably class
    re-balancing) - confirm against the ``sampling`` module.

    Returns:
        list[tuple]: ``(sampler, file_name)`` pairs, shuffled in place with
        the global ``random`` generator.
    """
    data_dir = os.path.join(os.getcwd(), 'formatData')
    sample_ratio = 0.6
    # (substring of the file name, number of samplers to create); the first
    # matching entry wins, mirroring the original if/elif order.
    repeat_rules = (('Resistors', 9), ('Capacitors', 14))

    rs_list = []
    # os.listdir() returns entries in arbitrary, filesystem-dependent order.
    # Sorting makes the result of the seeded shuffle below reproducible
    # across machines.
    for file in sorted(os.listdir('./formatData')):
        if not os.path.isfile(os.path.join(data_dir, file)):
            continue  # only regular files are category files
        repeats = next((count for tag, count in repeat_rules if tag in file), 1)
        for _ in range(repeats):
            rs_list.append((sampling.sampling(file, sample_ratio), file))

    random.shuffle(rs_list)
    return rs_list

def get_input_from_sampling(input_size: int, train_flag: bool = True):
    """Sample items from every category and convert them to (text, label) lists.

    For each of ``input_size`` rounds, one item is drawn from every sampler
    returned by ``get_rs_list_of_all_category()``.  An item is treated as a
    dict: its ``'class'`` value becomes the label, its ``'category'`` value is
    dropped, and all remaining values are shuffled and concatenated (each
    prefixed by a single space) to form the query text.

    Side effects:
        * prints a short summary,
        * appends the same summary to ``running_output.txt``,
        * when ``train_flag`` is true, appends the raw items of the first
          five rounds to ``input_saved.txt`` (one JSON object per line).

    Args:
        input_size: number of sampling rounds; 1 means one sample per sampler.
        train_flag: whether the data is for training (controls the dump to
            ``input_saved.txt`` and is echoed in the summary).

    Returns:
        tuple[list[str], list]: ``(text_list, label_list)`` of equal length.
    """
    rs_list = get_rs_list_of_all_category()

    text_list = []
    label_list = []
    problematic = set()   # file names whose sampler failed or gave no label
    saved_lines = []      # JSON dumps of the items to persist

    for i in range(input_size):
        for sampler, file_name in rs_list:
            try:
                item = sampler.random_sampling()
            except Exception:
                # Deliberate best-effort: a failing sampler is reported in the
                # summary instead of aborting the whole data generation.
                problematic.add(file_name)
                continue

            if 'class' not in item:
                # Without a label the text must not be emitted either,
                # otherwise text_list and label_list get out of step.
                problematic.add(file_name)
                continue

            # Keep a dump of the sampled data for the first five rounds.
            if i < 5 and train_flag:
                saved_lines.append(json.dumps(item))

            # Shuffle the field order so the text has no fixed field positions.
            shuffled_item = list(item.items())
            random.shuffle(shuffled_item)
            text = ''.join(
                ' ' + str(val)
                for key, val in shuffled_item
                if key not in ('category', 'class')
            )

            text_list.append(text)
            label_list.append(item['class'])

    if saved_lines:
        # One JSON object per line so the file can be parsed back.
        with open('input_saved.txt', 'a', encoding='utf-8') as f:
            f.write('\n'.join(saved_lines) + '\n')

    print('train_flag',train_flag)
    print('problematic',problematic)
    print('query数量：',len(text_list))
    print('label数量：',len(label_list))

    # The summary contains non-ASCII text, so the encoding is pinned rather
    # than left to the platform default.
    with open('running_output.txt','a', encoding='utf-8') as f:
        f.write('train_flag:'+str(train_flag)+'\n')
        f.write('query数量:'+str(len(text_list))+'\n')
        f.write('label数量:'+str(len(label_list))+'\n')
        f.write('problematic:'+str(problematic)+'\n')
    return text_list,label_list

# a,b=get_input_from_sampling(1,True)
# print(a)
# print(b)

train_flag True
problematic set()
query数量： 84
label数量： 84
[' Circuit Protection 55.0uV 10.025nA LONG 240.0V e4 TRIP FREE TEB', ' 电容 1206 0.025µF CHASSIS MOUNT C0G MIL-PRF-55681 YES', ' 连接器 CIRCULAR CONNECTOR ADAPTER EMI SHIELDED, HIGH DENSITY 4 ALUMINUM PANEL', ' WRAPAROUND 3.5% CERAMIC 2/2µF Capacitors 900.0 20.5kV', ' 6.0ppm/°C -55.0°C 125.0°C 82.0kΩ METAL GLAZE/THICK FILM BUSSED 38415.0Ohm 1.05W 50ppm/°C', ' 11.965mm SIGNAL 8000 OHMS CT 15.0kHz THROUGH HOLE MOUNTED 0.35kHz 3/10', ' SMA FEMALE 1.5 5 50 OHM 7.0', ' 1/8°C YES 8.75 62.5uV C0G cap 9/9µF 8/5°C CERAMIC', ' WRAPAROUND e4 325.0V 0.003µF 1812', ' MECHANICAL TUNED CAVITY OSCILLATOR 10.0kHz -13.5 -20.0uV 2000.0% -20.0', ' 0.78mm X7R 0.005µF W1A 12.5% YES', ' 5.125kV 2.0µs 6/6°C GULL WING 4.0V INDUSTRIAL 2.5 70.0°C Converters', ' Res SPD16B2002CL 50ppm/°C MIL-PRF-83401 0.24mW Array/Network Resistors 6.74K THIN FILM Resistors', ' 3.79 0.001µF 4.5mm X7R CERAMIC SILVER PALLADIUM 10.0% Yes', ' 9/3µF 0.1 NO 10/6mΩ 52.5V 7.65', ' TR, 

In [8]:
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random generators with ``seed``.

    When CUDA is available, every CUDA device generator is seeded as well.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

In [9]:
def _load_or_build_split(split, cached_files, tokenizer, size, train_flag, saved_message):
    """Return ``(encodings, labels)`` for one split, reusing the cache when both cache files exist."""
    encodings_name = split + '_encodings.pt'
    labels_name = split + '_labels.csv'
    encodings_path = './class_inputData/' + encodings_name
    labels_path = './class_inputData/' + labels_name

    if encodings_name in cached_files and labels_name in cached_files:
        print('read ' + split + ' dataset\n')
        # NOTE(security): torch.load unpickles the file; only load cache
        # files that this notebook produced itself.
        encodings = torch.load(encodings_path)
        # to_csv() of an unnamed single-column frame names the column '0'.
        labels = pd.read_csv(labels_path, index_col=0)['0'].to_list()
    else:
        print('generate ' + split + ' dataset\n')
        texts, labels = get_input_from_sampling(size, train_flag=train_flag)
        encodings = tokenizer(texts, padding=True, truncation=False, return_tensors="pt")
        torch.save(encodings, encodings_path)
        pd.DataFrame(labels).to_csv(labels_path)
        print(saved_message)
    return encodings, labels


def input_process(config_classinfo,model_checkpoint,tokenizer,train_size,test_size,class_dict):
    """Prepare tokenized train/test inputs and their integer-encoded labels.

    For each split the cached files ``./class_inputData/<split>_encodings.pt``
    and ``./class_inputData/<split>_labels.csv`` are reused when both exist;
    otherwise the split is generated with ``get_input_from_sampling``,
    tokenized and written to the cache.  When a cache is reused the
    corresponding ``*_size`` argument has no effect.

    Also (re)creates ``running_output.txt`` with the current timestamp.

    Args:
        config_classinfo: unused; kept for interface compatibility.
        model_checkpoint: unused; kept for interface compatibility.
        tokenizer: callable applied to a list of texts with
            ``padding=True, truncation=False, return_tensors="pt"``.
        train_size: number of sampling rounds for the training split.
        test_size: number of sampling rounds for the test split.
        class_dict: unused; labels are encoded through ``ClassEncoder``,
            which reads the module-level ``class_dict``.

    Returns:
        tuple: ``(train_encodings, test_encodings, train_labels_encoded,
        test_labels_encoded)``.
    """
    # Create the cache directory on first use; listing it would otherwise fail.
    os.makedirs('./class_inputData', exist_ok=True)
    files= os.listdir('./class_inputData')

    with open('running_output.txt','w') as f:
        f.write(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())+'\n')

    # Prepare the training data.
    print('train dataset\n')
    train_encodings, train_labels = _load_or_build_split(
        'train', files, tokenizer, train_size, True, 'train dataset saved')

    print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')

    print('test dataset')
    test_encodings, test_labels = _load_or_build_split(
        'test', files, tokenizer, test_size, False, 'test dataset saved\n')

    # Encode the class names as integer label ids.
    train_labels_encoded = [ClassEncoder(label) for label in train_labels]
    print(len(train_labels_encoded))
    test_labels_encoded = [ClassEncoder(label) for label in test_labels]
    print(len(test_labels_encoded))

    return train_encodings,test_encodings,train_labels_encoded,test_labels_encoded


In [10]:
class processDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with integer labels.

    ``encodings`` is a mapping from field name to an indexable collection
    (one entry per example); ``labels`` is an indexable collection with one
    label per example.  Each item is a dict holding one tensor per encoding
    field plus a ``'labels'`` tensor.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor_copy(value):
        """Return an independent tensor copy of ``value``."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(existing_tensor) emits a UserWarning on every call;
            # clone().detach() is the recommended copy for tensors.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        item = {key: self._to_tensor_copy(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor_copy(self.labels[idx])
        return item

    def __len__(self):
        # The dataset length is defined by the number of labels.
        return len(self.labels)

In [12]:
def ClassEncoder(Class):
    """Return the integer label id stored for ``Class`` in the module-level ``class_dict``.

    Raises:
        KeyError: if ``Class`` is not a key of ``class_dict``.
    """
    label_id = class_dict[Class]
    return label_id

In [14]:
def ClassDecoder(Class):
    """Return the first class name in the module-level ``class_dict`` whose id equals ``Class``.

    Returns ``None`` when no entry has that id.
    """
    matching_names = (name for name, label_id in class_dict.items() if label_id == Class)
    return next(matching_names, None)

In [27]:
def compute_metrics(pred):
    """Compute micro-averaged metrics for a prediction object and dump reports to disk.

    ``pred`` must expose ``label_ids`` and ``predictions``; the predicted class
    is the argmax over the last axis of ``predictions``.

    Files written (each one overwritten on every call):
        * ``test_set_result.csv`` - predicted and true label ids,
        * ``class_classification_report.txt`` - per-class report as JSON,
        * ``class_classification_report_string_format.txt`` - the same report as text,
        * ``class_running_output.txt`` - the returned metrics dict.

    Returns:
        dict: ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    scores = precision_recall_fscore_support(y_true, y_pred, average='micro')
    precision, recall, f1 = scores[0], scores[1], scores[2]
    acc = accuracy_score(y_true, y_pred)

    print('compute_metrics:',len(y_true))

    pd.DataFrame({"preds":y_pred,"labels":y_true}).to_csv('test_set_result.csv')

    # Translate label ids back to class names for the per-class report.
    class_preds = list(map(ClassDecoder, y_pred))
    class_labels = list(map(ClassDecoder, y_true))
    report_as_dict = classification_report(class_labels, class_preds,output_dict=True)
    report_as_text = classification_report(class_labels, class_preds)
    class_result = dict(accuracy=acc, f1=f1, precision=precision, recall=recall)

    outputs = (
        ('class_classification_report.txt', json.dumps(report_as_dict)),
        ('class_classification_report_string_format.txt', report_as_text),
        ('class_running_output.txt', 'class_result:'+str(class_result)+'\n'),
    )
    for path, content in outputs:
        with open(path,'w') as f:
            f.write(content)

    return class_result

In [None]:
# Training entry point: build the datasets, fine-tune the sequence classifier,
# evaluate it on both splits and persist the model and a run summary.
if __name__=="__main__":

    set_seed(1024)
    model_checkpoint = "albert-base-v2"
    # model_checkpoint = r'C:\Users\coldkiller\Desktop\supplyframe\checkpoint-3500'
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    gpu_available = torch.cuda.is_available()
    
    #logging.basicConfig(level=logging.DEBUG,format='%(asctime)s %(message)s')
    
    print('torch.cuda.is_available()',gpu_available)
    
    # 7 sampling rounds for the training split, 1 round for the test split
    # (only used when no cached split exists in ./class_inputData).
    train_encodings,test_encodings,train_labels_encoded,test_labels_encoded = input_process(config_classinfo,
                                                                                    model_checkpoint,tokenizer,7,1,class_dict)

    print('input_process_finished\n')
    train_dataset = processDataset(train_encodings, train_labels_encoded)
    test_dataset = processDataset(test_encodings, test_labels_encoded)

    # Size the classification head from the label mapping instead of a
    # hard-coded 35: label ids are the values of class_dict, so the head needs
    # max(id) + 1 outputs or the highest id falls outside the logits.
    num_labels = max(class_dict.values()) + 1
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

    training_args = TrainingArguments(
        output_dir='./results',
        learning_rate=1e-5,
        weight_decay=0.01,
        adam_beta1=0.9,
        adam_beta2=0.999,
        adam_epsilon=1e-8,
        num_train_epochs=3,
        logging_steps=50,
        save_steps=500000,
        no_cuda= not gpu_available,
        seed=1024,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=64,
        warmup_steps=500,
        logging_dir='./logs',
        # No evaluation runs during training, so there is no "best" checkpoint
        # to reload; current transformers versions reject True here unless the
        # save and evaluation strategies match.
        load_best_model_at_end=False,
        save_total_limit=5,
        disable_tqdm=True
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=test_dataset
    )

    # Train the model
    print('Training begins\n')
    start = time.time()
    trainer.train()
    end = time.time()
    print(f"training time: {end - start}")

    # Note: compute_metrics overwrites its report files on every call, so the
    # files left on disk describe the test split evaluated last.
    train_result = trainer.evaluate(train_dataset)
    print(train_result)
    test_result = trainer.predict(test_dataset).metrics
    print(test_result)

    trainer.save_model()
    with open('running_output.txt','a') as f:
        f.write(f"training time: {end - start}"+'\n')
        f.write('train_dataset'+str(train_result)+'\n')        
        f.write('test_dataset'+str(test_result)+'\n')

In [13]:
def save_class_mapping():
    """Write the class-name -> label-id mapping to ``class_dict.txt`` as JSON.

    The keys of the module-level ``config_classinfo`` are numbered from 1 in
    iteration order, and ``'unknown_class'`` is assigned id 0.
    """
    # Original note: including the unknown class there are 36 classes.
    mapping = {name: label_id
               for label_id, name in enumerate(config_classinfo.keys(), start=1)}
    mapping['unknown_class'] = 0

    serialized = json.dumps(mapping)
    with open('class_dict.txt', 'w') as out:
        out.write(serialized)

In [None]:
#     training_args = TrainingArguments(
#         output_dir='./results',
#         learning_rate=1e-5,
#         weight_decay=0.01,
#         adam_beta1=0.9,
#         adam_beta2=0.999,
#         adam_epsilon=1e-8,
#         num_train_epochs=1,
#         logging_steps=5,
#         evaluation_strategy='steps',
#         save_steps=500000,
#         no_cuda=False,
#         seed=1024,
#         per_device_train_batch_size=16,
#         per_device_eval_batch_size=64,
#         warmup_steps=500,
#         logging_dir='./logs',
#         load_best_model_at_end=True,
#         save_total_limit=5,
#         disable_tqdm=True
#     )