In [1]:
import sampling
import os
import pandas as pd
import logging
import sklearn
import time
import random
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support,classification_report
import json
import gc
import sys
import re
import generate
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
# Read the project configuration through the project-local `sampling.read_data` helper.
# NOTE(review): the paths are relative — presumably resolved against the notebook's
# working directory; confirm inside `sampling.read_data`.
config_class_features = sampling.read_data("config/config-class-features.json")
config_class_name = sampling.read_data("config/config-class-name.json")
config_classinfo = sampling.read_data("config/config-classinfo.json")
config_numeric_fields = sampling.read_data("config/config-numeric-fields.json")
config_dynamic_units = sampling.read_data("config/config-dynamic-units.json")
# NOTE(review): unlike the five paths above, this one has no ".json" extension — confirm the file name.
pair_params = sampling.read_data("config/pair-params")
# Class name -> integer id. ClassEncoder / ClassDecoder hard-code the same ids
# (note that ClassDecoder spells the fallback class 'Others', not 'others').
class_dict = {'Resistors':1,'Capacitors':2,'others':0}

In [9]:
def get_data_from_file(path,inputClass,data_num,all_data):
    """Append generated descriptions for the leading records of a JSON file.

    Args:
        path: JSON file to read (opened as UTF-8). Its top-level value is sliced,
            so it is expected to be a list of records.
        inputClass: class name handed to `generate.generate` for every record.
        data_num: upper bound of the slice, i.e. how many leading records to use.
        all_data: list that receives the 'description' entry of each generated
            result; it is mutated in place.

    Returns:
        None. Results are delivered through `all_data`.
    """
    with open(path, mode='r', encoding='utf-8') as handle:
        records = json.load(handle)

    for record in records[:data_num]:
        generated = generate.generate(record, inputClass)
        all_data.append(generated['description'])
# Collect the generated descriptions of the first 10 capacitor records and the
# first 10 resistor records into one flat list (capacitors first).
all_data=[]
get_data_from_file('preprocess/standard_cap.json','Capacitors',10,all_data)
get_data_from_file('preprocess/standard_res.json','Resistors',10,all_data)

In [4]:
# generate data
cap_data, res_data = [], []
for item in standard_cap[:10]:
    cap_data.append(generate.generate(item, 'Capacitors'))
    
for item in standard_res[:10]:
    res_data.append(generate.generate(item, 'Resistors'))

In [5]:
# Inspect the generated capacitor samples (the notebook renders the cell's last expression).
cap_data

[{'description': 'CERAMIC/2221_1/8F#0.36kV',
  'labels': ['others', 'SizeCode', 'Capacitance', 'RatedDCVoltageURdc'],
  'class': 'Capacitors',
  'SizeCode': '2221',
  'Capacitance': '1/8F',
  'RatedDCVoltageURdc': '0.36kV'},
 {'description': '7053889124_0.33F-1/8uV',
  'labels': ['others',
   'others',
   'Capacitance',
   'TemperatureCharacteristicsCode',
   'SizeCode',
   'RatedDCVoltageURdc'],
  'class': 'Capacitors',
  'Capacitance': '0.33F',
  'TemperatureCharacteristicsCode': '',
  'SizeCode': '',
  'RatedDCVoltageURdc': '1/8uV'},
 {'description': '1218263915-1210;30pF/0.7V:NP0',
  'labels': ['others',
   'SizeCode',
   'Capacitance',
   'RatedDCVoltageURdc',
   'TemperatureCharacteristicsCode'],
  'class': 'Capacitors',
  'SizeCode': '1210',
  'Capacitance': '30pF',
  'RatedDCVoltageURdc': '0.7V',
  'TemperatureCharacteristicsCode': 'NP0'},
 {'description': '0.98F,77V',
  'labels': ['Capacitance', 'RatedDCVoltageURdc', 'others'],
  'class': 'Capacitors',
  'Capacitance': '0.98F'

In [3]:
# 19 Resistors files * 18, 12 Capacitors files * 29; 374 files in total.
def convert2str(label_list,item):
    """Flatten one sampled record into a delimiter-separated query string.

    The record's fields are visited in random order. The 'category', 'labels'
    and 'description' fields are ignored, the 'class' field is appended to
    `label_list`, and every other value is stringified and followed by a
    single-character delimiter.

    Args:
        label_list: list that receives the record's 'class' value (if present);
            mutated in place.
        item: dict mapping field name -> value for one record.

    Returns:
        The concatenated values with the final trailing delimiter removed, or
        '' when no field contributes a value.
    """
    deliminter_list = ['#', ',', '/', ';', ':', '-', '_',' ']
    skipped_keys = ('category', 'labels', 'description')

    # The order of random draws (shuffle, default delimiter, then per-field draws)
    # is kept as-is, and random.sample is kept as the drawing call, so a seeded
    # run produces the same strings as before.
    shuffled_item = list(item.items())
    random.shuffle(shuffled_item)
    deliminter = random.sample(deliminter_list, 1)[0]

    pieces = []
    for (key, val) in shuffled_item:
        if key in skipped_keys:
            continue
        if key == 'class':
            label_list.append(val)
            continue
        # NOTE(review): `> 0.1` means a freshly drawn delimiter is used ~90% of the
        # time and the per-call default only ~10%. If the reverse was intended,
        # flip the comparison; behavior is deliberately left unchanged here.
        if random.uniform(0, 1) > 0.1:
            separator = random.sample(deliminter_list, 1)[0]
        else:
            separator = deliminter
        pieces.append(str(val) + separator)

    # Every piece ends with exactly one single-character delimiter, so dropping the
    # last character removes the trailing delimiter ('' stays '').
    return ''.join(pieces)[:-1]

def _repeats_for_file(file_name, input_size):
    """Return how many samples to draw from one category file.

    Files whose name contains 'Resistors' are drawn 18x as often and files whose
    name contains 'Capacitors' 29x as often as any other category file.
    """
    if 'Resistors' in file_name:
        return 18 * input_size
    if 'Capacitors' in file_name:
        return 29 * input_size
    return input_size


def get_input_from_sampling(input_size: int, train_flag: bool = True):
    """Build a shuffled list of synthetic query strings with their class labels.

    Every regular file in ./formatData is treated as one category file. For each
    of them a `sampling.sampling(file, 0.6)` sampler is created and records are
    drawn from it and flattened with `convert2str`.

    Args:
        input_size: base number of records to draw per category file
            (1 means one record per ordinary category; Resistors/Capacitors
            files are multiplied, see `_repeats_for_file`).
        train_flag: accepted for callers that distinguish train/test; it is
            currently not used by this function.

    Returns:
        (text_list, label_list): two equally long lists, shuffled together so
        that text_list[i] belongs to label_list[i].

    Side effects:
        Prints progress to stdout and appends the counts and the set of
        problematic files to 'running_output.txt'.
    """
    # Read every category file.
    path = os.getcwd()
    files = os.listdir('./formatData')  # names of all entries in the folder

    text_list = []
    label_list = []
    problematic = set()

    print('进入数据生成循环')
    for file in files:
        file_path = os.path.join(path, 'formatData/'+file)
        if not os.path.isfile(file_path):  # skip sub-directories
            continue

        rs = sampling.sampling(file,0.6)
        for _ in range(_repeats_for_file(file, input_size)):
            labels_before = len(label_list)
            try:
                # The whole sample is materialized before taking its first record.
                item = [item for item in rs.random_sampling()][0]
                text = convert2str(label_list,item)
            except Exception:
                # Best effort: remember the file and keep going. Undo a label that
                # convert2str may already have appended before it failed.
                del label_list[labels_before:]
                problematic.add(file)
                continue

            if len(label_list) != labels_before + 1:
                # The record carried no 'class' entry; keeping its text would shift
                # every later text/label pair by one.
                del label_list[labels_before:]
                problematic.add(file)
                continue
            text_list.append(text)

    # Shuffle texts and labels together so the pairs stay aligned.
    pairs = list(zip(text_list, label_list))
    random.shuffle(pairs)
    text_list = [text for text, _ in pairs]
    label_list = [label for _, label in pairs]

    print('problematic',problematic)
    print('query数量：',len(text_list))
    print('label数量：',len(label_list))

    # Explicit UTF-8: the log lines contain non-ASCII text, which the platform
    # default encoding is not guaranteed to be able to encode.
    with open('running_output.txt','a',encoding='utf-8') as f:
        f.write('query数量:'+str(len(text_list))+'\n')
        f.write('label数量:'+str(len(label_list))+'\n')
        f.write('problematic:'+str(problematic)+'\n')
    return text_list,label_list

# a,b=get_input_from_sampling(1,True)
# for i in range(len(b)):
#     print((a[i],b[i]),'\n')

进入数据生成循环
problematic set()
query数量： 1033
label数量： 1033
('3141000.0Ohm,1.25:2512-Dxx342M07B3F83-P:300-Rectangular', 'Resistors') 

('538:SMT,55.0;6/4:4/5e-05_cap;1816_e3#NP0', 'Capacitors') 

('20.0%-500.135µF:45.5uV-Capacitors 20.0%;REFERENCE STANDARD: IEC60384-4 ALUMINUM (WET)_330.0', 'Capacitors') 

('TUBE-MIL-STD-883B,5/6;Amp_0.0-2/6_4/3', 'Amplifier Circuits') 

('Variable Resistors -150,150#15.0%:250100.0O#VERTICAL', 'Resistors') 

('METAL FILM;PTF,0.3kW-Resistors:1235.0Ω:25ppm/°C-TUBULAR PACKAGE', 'Resistors') 

('15.0%:0.022µF WRAPAROUND/1200.0#0201#5000000.0 STANDARD: MIL-PRF-55681, MIL-PRF-123', 'Capacitors') 

('2HP@250VAC 100000.0Cycle(s),283.5;Random/14.0-Power/Signal Relays/545.0Ω/Relays 12.0', 'Relays') 

('6/1-2.54:Display Controllers/8/9MHz-3.5-DISPLAY CONTROLLER, DOT MATRIX LCD DISPLAY', 'Microcontrollers and Processors') 

('THROUGH HOLE MOUNT#3.0uW_2010/4.2/1506.0MΩ 45.0s_Fusible Resistors', 'Resistors') 

('FLATPACK,80166;TIN LEAD_16.0/NOT SPECIFIED_6/1', 'Microcont


('54.575K-2.525%/-300,300ppm/°C-WRAPAROUND 2-PRECISION', 'Resistors') 

('33.97kΩ NO,20ppm/°C/250.0°C-9/8°C_0.375%', 'Resistors') 

('3/3,1206,0.377,FIXED RESISTOR:THIN FILM#Resistors:5935.0MΩ', 'Resistors') 

('Cylindrical_-100-res-5044:182.503kΩ#8.13', 'Resistors') 

('Capacitors,cap:-47.5°C-630.0uV_0.101µF#1/4°C;NON-POLARIZED', 'Capacitors') 

('3.465-2.12;100ppm/°C/METAL GLAZE/THICK FILM:TR; TUBE:565.0MΩ', 'Resistors') 

('Matte Tin (Sn)#0.5uW_SWITCH OPTION AVAILABLE:4400.0ohm,res/100', 'Resistors') 

('642000.0O#2/10/2.035_5.04-3/7%:5ppm/°C', 'Resistors') 

('RF and Microwave-YES-1/7-RF#9/5dB-DIRECTIONAL COUPLER', 'RF and Microwave') 

('8.755_1430.0nm_1100.0nm_-40.0°C/0.875A/W:40.0 8/10nm:85.0°C', 'Fiber Optics') 

('6/3_4.15mV#3.6/5V 0.5 SMALL OUTLINE, VERY THIN PROFILE', 'Microcontrollers and Processors') 

('0.002µF_0/+70ppm/Celppm/°C;MIL-PRF-39001_500.0V_2:MICA', 'Capacitors') 

('5.0/41250800.0GΩ_3.5uW/38.1mm:电阻-WIRE;Cylindrical_8739', 'Resistors') 

('TANTALUM (DRY/SOLID)_

In [4]:
# # Save the list, one element per line
# with open('example.txt','w',encoding='utf-8') as f:
#     f.write('\n'.join(a))

In [4]:
def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch with the same value.

    When CUDA is available, the seed is also applied to all CUDA devices.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

In [6]:
def train_input_process(config_classinfo,model_checkpoint,train_size,class_dict,tokenizer):
    """Sample a fresh training batch, tokenize it and integer-encode its labels.

    Args:
        config_classinfo: not used by this function.
        model_checkpoint: not used by this function.
        train_size: passed to `get_input_from_sampling` as the per-category sample count.
        class_dict: not used by this function; labels are encoded with `ClassEncoder`.
        tokenizer: callable invoked as `tokenizer(texts, padding=True, truncation=False)`.

    Returns:
        (train_encodings, train_labels_encoded): the tokenizer output and the list
        of integer class ids, in matching order.

    Side effects:
        Overwrites 'running_output.txt' with the current timestamp (mode 'w'
        truncates whatever an earlier call logged) and prints a status line.
    """
    with open('running_output.txt','w') as log_file:
        log_file.write(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())+'\n')

    # Prepare the training data.
    texts, raw_labels = get_input_from_sampling(train_size, train_flag=True)
    train_encodings = tokenizer(texts, padding=True, truncation=False)

    # Encode the class names as integer ids.
    train_labels_encoded = [ClassEncoder(label) for label in raw_labels]

    print('training dataset is ok\n')
    return train_encodings, train_labels_encoded

def test_input_process(config_classinfo,model_checkpoint,test_size,class_dict,tokenizer):
    """Sample a fresh test batch, tokenize it and integer-encode its labels.

    Args:
        config_classinfo: not used by this function.
        model_checkpoint: not used by this function.
        test_size: passed to `get_input_from_sampling` as the per-category sample count.
        class_dict: not used by this function; labels are encoded with `ClassEncoder`.
        tokenizer: callable invoked as `tokenizer(texts, padding=True, truncation=False)`.

    Returns:
        (test_encodings, test_labels_encoded): the tokenizer output and the list
        of integer class ids, in matching order.
    """
    print('generate test dataset\n')
    texts, raw_labels = get_input_from_sampling(test_size, train_flag=False)
    test_encodings = tokenizer(texts, padding=True, truncation=False)

    # Encode the class names as integer ids.
    test_labels_encoded = [ClassEncoder(label) for label in raw_labels]

    print('testing dataset is ok\n')
    return test_encodings, test_labels_encoded


In [7]:
class processDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    `encodings` is a mapping of field name -> per-sample sequences and `labels`
    is a sequence with one entry per sample; the dataset length is `len(labels)`.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, plus the label under the key 'labels'.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [8]:
def ClassEncoder(Class):
    """Map a class name to its integer id: 'Resistors' -> 1, 'Capacitors' -> 2, anything else -> 0."""
    for code, name in ((1, 'Resistors'), (2, 'Capacitors')):
        if Class == name:
            return code
    return 0
    
def ClassDecoder(Class):
    """Map an integer class id back to its name: 1 -> 'Resistors', 2 -> 'Capacitors', anything else -> 'Others'."""
    for code, name in ((1, 'Resistors'), (2, 'Capacitors')):
        if Class == code:
            return name
    return 'Others'

In [9]:
def compute_metrics(pred):
    """Score one batch of predictions and persist the results to disk.

    Args:
        pred: object exposing `label_ids` (true class ids) and `predictions`
            (per-class scores; the predicted id is the arg-max over the last axis).

    Returns:
        dict with 'accuracy', 'f1', 'precision' and 'recall'; the last three are
        micro-averaged.

    Side effects:
        * prints the number of evaluated samples;
        * overwrites 'test_set_result.csv' with the first 1000 prediction/label pairs;
        * appends the per-class classification report to
          'class_classification_report.json'.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='micro')
    acc = accuracy_score(labels, preds)

    print('compute_metrics:',len(labels))

    res = pd.DataFrame({"preds":preds,"labels":labels})
    res[:1000].to_csv('test_set_result.csv')

    # Per-class report on decoded class names instead of integer ids.
    class_preds = [ClassDecoder(item) for item in preds]
    class_labels = [ClassDecoder(item) for item in labels]
    class_report = classification_report(class_labels, class_preds,output_dict=True)
    class_result = {'accuracy': acc,
                'f1': f1,
                'precision': precision,
                'recall': recall}

    # NOTE(review): mode 'a' appends one JSON document per call, so after several
    # calls the file holds concatenated documents and is no longer a single valid
    # JSON value. Left unchanged because consumers of this file are not visible here.
    with open('class_classification_report.json','a',errors='ignore') as f:
        json.dump(class_report,f,ensure_ascii=False, indent = 4)

    return class_result

In [15]:
if __name__=="__main__":

    set_seed(1024)
    #model_checkpoint = "albert-base-v2"
    model_checkpoint = "prajjwal1/bert-tiny"

    gpu_available = torch.cuda.is_available()
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    print('torch.cuda.is_available()',gpu_available)

    # Three output classes, matching ClassEncoder: 0 = others, 1 = Resistors, 2 = Capacitors.
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=3)

    # NOTE(review): load_best_model_at_end=True is set although no eval_dataset is
    # passed to the Trainer below — confirm this is accepted (and meaningful) for
    # the installed transformers version.
    training_args = TrainingArguments(
        output_dir='./results',
        dataloader_num_workers=7,
        do_train = True,
        learning_rate=1e-5,
        weight_decay=0.01,
        adam_beta1=0.9,
        adam_beta2=0.999,
        adam_epsilon=1e-8,
        num_train_epochs=100,
        logging_steps=100,
        save_steps=500000,
        no_cuda= not gpu_available,
        seed=1024,
        per_device_train_batch_size=256,
        per_device_eval_batch_size=256,
        warmup_steps=5,
        logging_dir='./logs',
        load_best_model_at_end=True,
        save_total_limit=5,
        disable_tqdm=True
    )

    # Train on 25 freshly sampled synthetic batches; the same `model` object is
    # handed to a new Trainer for every batch.
    for i in range(25):
        print(f'\n\nbegin generate train_dataset{i}\n\n')
        train_data,train_labels_encoded = train_input_process(config_classinfo,model_checkpoint,200,class_dict,tokenizer)
        print('sys.getsizeof(train_data)',sys.getsizeof(train_data))
        train_dataset = processDataset(train_data, train_labels_encoded)
        print('sys.getsizeof(train_dataset)',sys.getsizeof(train_dataset))

        trainer = Trainer(
            model=model,
            args=training_args,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
            train_dataset=train_dataset
        )

        # Train the model
        print('Training begins\n')
        start = time.time()
        trainer.train()
        end = time.time()
        print(f"training time: {end - start}")

        # Prepare for the next batch of training data: no warm-up after the first run.
        trainer.args.warmup_steps=0
        gc.collect()

    # Save the model (from the Trainer of the last batch).
    trainer.save_model()

    # generate test data

    test_data,test_labels_encoded = test_input_process(config_classinfo,model_checkpoint,100,class_dict,tokenizer)
    print('sys.getsizeof(test_data)',sys.getsizeof(test_data))
    test_dataset = processDataset(test_data, test_labels_encoded)

    # Real-world descriptions. The file is already decoded as UTF-8 by open();
    # json.loads() no longer accepts an `encoding` keyword (removed in Python 3.9,
    # ignored before that), so it is not passed.
    with open(r'testDesc(cap&res).json', 'r', errors='ignore', encoding='utf-8') as f:
        js = f.read()
        real_input = json.loads(js, strict=False)['descriptions']
    # Placeholder labels: the dataset needs exactly one label per description, so
    # the count is derived from the input instead of being hard-coded.
    labels = [1] * len(real_input)

    real_encodings = tokenizer(real_input,padding=True,truncation=False)
    real_dataset = processDataset(real_encodings,labels)


    # train_dataset is the last sampled training batch.
    train_result = trainer.predict(train_dataset).metrics
    print(train_result)
    test_result = trainer.predict(test_dataset).metrics
    print(test_result)

    # Map every real description to its predicted class name
    # (duplicate descriptions collapse into one key).
    real_result = trainer.predict(real_dataset).predictions.argmax(-1)
    class_result = {}
    for i in range(len(real_input)):
        class_result[real_input[i]]=ClassDecoder(real_result[i])

    with open('running_output.txt','a') as f:
        # `end - start` covers only the last training batch.
        f.write(f"training time: {end - start}"+'\n')
        f.write('train_dataset'+str(train_result)+'\n')
        f.write('test_dataset'+str(test_result)+'\n')

    with open('real_description_output1.json','w',errors='ignore') as f:
        json.dump(class_result,f,ensure_ascii=False, indent = 4)

torch.cuda.is_available() False
test dataset
generate test dataset

进入数据生成循环
train_flag False
problematic set()
query数量： 37
label数量： 37
begin test tokenizer
finish test tokenizer
37
input_process_finished

sys.getsizeof(test_data) 56


Some weights of the model checkpoint at prajjwal1/bert-tiny were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initia

script running

train dataset

generate train dataset

进入数据生成循环
train_flag True
problematic set()
query数量： 37
label数量： 37
begin train tokenizer
finish train tokenizer
37
sys.getsizeof(train_data) 56
Training begins

{'loss': 0.9313957691192627, 'learning_rate': 3.3333333333333333e-06, 'epoch': 0.1}
{'loss': 1.009919285774231, 'learning_rate': 6.666666666666667e-06, 'epoch': 0.2}
{'loss': 0.942491888999939, 'learning_rate': 1e-05, 'epoch': 0.3}
{'loss': 0.9817836284637451, 'learning_rate': 8.571428571428571e-06, 'epoch': 0.4}
{'loss': 0.9721612930297852, 'learning_rate': 7.1428571428571436e-06, 'epoch': 0.5}
{'loss': 1.0546274185180664, 'learning_rate': 5.7142857142857145e-06, 'epoch': 0.6}
{'loss': 1.0171985626220703, 'learning_rate': 4.2857142857142855e-06, 'epoch': 0.7}
{'loss': 1.0142345428466797, 'learning_rate': 2.8571428571428573e-06, 'epoch': 0.8}
{'loss': 0.9331746101379395, 'learning_rate': 1.4285714285714286e-06, 'epoch': 0.9}
{'loss': 1.0118846893310547, 'learning_rate': 0.0

In [None]:
# Change the ratio, the amount of training data, epoch, dataloader_num_workers

In [26]:
#trainer.model