In [5]:
import sampling
import os
import pandas as pd
import logging
import sklearn
import time
import random
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support,classification_report
from sklearn.preprocessing import LabelEncoder
import json
from simpletransformers.ner import NERModel,NERArgs

In [6]:
# Project configuration, loaded through the project-local `sampling.read_data` helper.
config_class_features = sampling.read_data("config/config-class-features.json")
config_class_name = sampling.read_data("config/config-class-name.json")
config_classinfo = sampling.read_data("config/config-classinfo.json")
config_numeric_fields = sampling.read_data("config/config-numeric-fields.json")
config_dynamic_units = sampling.read_data("config/config-dynamic-units.json")
# NOTE(review): unlike the other config paths this one has no ".json" extension — confirm the file name.
pair_params = sampling.read_data("config/pair-params")
# Maps class-name labels to integer label ids; passed to input_process and read by compute_metrics.
# NOTE(review): only three classes are listed here, while the recorded run output of this
# notebook reports 35 distinct label ids — the kernel presumably held a different mapping
# when that output was produced. Confirm which mapping is intended.
class_dict = {'Resistors':1,'Capacitors':2,'others':0}

In [7]:
def get_rs_list_of_all_category():
    """Build one sampler per category file found in ``./formatData``.

    Every regular file directly inside ``./formatData`` is passed (by file
    name only) to ``sampling.sampling(file_name, 0.6)``; sub-directories are
    skipped. File names are visited in sorted order because ``os.listdir``
    returns them in arbitrary order, which would otherwise make the sampler
    order differ between runs and machines.

    Returns:
        list[tuple]: ``(sampler, file_name)`` pairs, one per category file.

    Side effects:
        Prints the number of samplers and appends the same count to
        ``running_output.txt``.
    """
    data_dir = './formatData'
    rs_list = []

    # Sorted so that the order of samplers is deterministic.
    for file in sorted(os.listdir(data_dir)):
        # Only regular files are category files; skip directories.
        if os.path.isfile(os.path.join(data_dir, file)):
            # NOTE(review): the meaning of 0.6 is defined by sampling.sampling — confirm there.
            rs = sampling.sampling(file, 0.6)
            rs_list.append((rs, file))

    print('num_of_all_categories', len(rs_list))

    with open('running_output.txt', 'a') as f:
        f.write('num_of_all_categories' + str(len(rs_list)) + '\n')

    return rs_list

def get_input_from_sampling(input_size: int, rs_list=None):
    """Draw ``input_size`` random items from every category sampler.

    Args:
        input_size: number of sampling rounds; each round draws one item
            from every sampler (so 1 means one sample per category).
        rs_list: optional list of ``(sampler, file_name)`` pairs, in the
            shape returned by ``get_rs_list_of_all_category``. When omitted,
            that function is called to build the list.

    Returns:
        tuple[list[list[str]], list]: ``(text_list, label_list)``.
        ``text_list[i]`` holds the stringified values of every field of the
        i-th sampled item except ``'category'`` and ``'class'``;
        ``label_list[i]`` is that item's ``'class'`` value. Both lists always
        have the same length.

    Side effects:
        Prints the counts and the set of problematic files, and appends the
        same information to ``running_output.txt``.
    """
    if rs_list is None:
        rs_list = get_rs_list_of_all_category()

    text_list = []
    label_list = []
    problematic = set()

    for _ in range(input_size):
        for sampler, file_name in rs_list:
            try:
                item = sampler.random_sampling()
            except Exception:
                # Best effort: a failing sampler is recorded and skipped so
                # that one bad category does not abort the whole run.
                problematic.add(file_name)
                continue

            # An item without a label would otherwise add a text row with no
            # matching label and shift every later (text, label) pair.
            if 'class' not in item:
                problematic.add(file_name)
                continue

            text_list.append([
                str(val) for key, val in item.items()
                if key not in ('category', 'class')
            ])
            label_list.append(item['class'])

    print('query数量：', len(text_list))
    print('label数量：', len(label_list))
    print('problematic', problematic)

    with open('running_output.txt', 'a') as f:
        f.write('query数量' + str(len(text_list)) + '\n')
        f.write('label数量' + str(len(label_list)) + '\n')
        f.write('problematic' + str(problematic) + '\n')
    return text_list, label_list

# Smoke test: draw one item per category and print the raw texts (a) and labels (b).
a,b=get_input_from_sampling(1)
print(a)
print(b)

In [8]:
def set_seed(seed):
    """Seed Python's ``random``, NumPy and PyTorch (CPU, plus all CUDA devices if present)."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

In [9]:
def _encode_labels(labels, class_dict, split):
    """Map class-name labels to integer ids, naming any label absent from ``class_dict``."""
    unknown = sorted({str(label) for label in labels if label not in class_dict})
    if unknown:
        raise KeyError(f"{split} labels missing from class_dict: {unknown}")
    return [class_dict[label] for label in labels]


def input_process(config_classinfo,model_checkpoint,tokenizer,train_size,test_size,class_dict):
    """Load (or generate and cache) the tokenized train/test splits and encode their labels.

    For each split, if both ``<split>_encodings.pt`` and ``<split>_labels.csv``
    exist in ``./class_inputData`` they are loaded; otherwise the split is
    sampled with ``get_input_from_sampling``, tokenized, and saved there.
    Note that an existing cache is used as-is: ``train_size`` / ``test_size``
    only take effect when the corresponding split has to be generated.

    Args:
        config_classinfo: unused; kept for interface compatibility.
        model_checkpoint: unused; kept for interface compatibility.
        tokenizer: callable applied to the list of word lists; only needed
            when a split has to be generated.
        train_size: sampling rounds for the training split.
        test_size: sampling rounds for the test split.
        class_dict: mapping from class-name label to integer label id.

    Returns:
        tuple: ``(label_cnt, train_encodings, test_encodings,
        train_labels_encoded, test_labels_encoded)`` where ``label_cnt`` is
        the number of distinct label ids observed across both splits (which
        may be smaller than the number of classes in ``class_dict``).

    Raises:
        KeyError: if a label is not a key of ``class_dict``.

    Side effects:
        Truncates ``running_output.txt`` and writes a timestamp and the class
        count to it; creates ``./class_inputData`` and cache files as needed.
    """
    cache_dir = './class_inputData'
    # On a fresh checkout the cache directory does not exist yet; create it so
    # that both the listing below and the later saves work.
    os.makedirs(cache_dir, exist_ok=True)
    files = os.listdir(cache_dir)

    # 'w' on purpose: this starts a new run log.
    with open('running_output.txt','w') as f:
        f.write(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())+'\n')

    # Prepare the training data.
    print('train dataset\n')
    if 'train_encodings.pt' in files and 'train_labels.csv' in files:
        print('read train dataset\n')
        # NOTE: torch.load unpickles the file; only load caches written by this notebook.
        train_encodings = torch.load('./class_inputData/train_encodings.pt')
        train_labels = pd.read_csv('./class_inputData/train_labels.csv',index_col=0)
        # The labels were saved from an unnamed single-column frame, hence column '0'.
        train_labels = train_labels['0'].to_list()
    else:
        print('generate train dataset\n')
        train_dataset,train_labels = get_input_from_sampling(train_size)
        train_encodings = tokenizer(train_dataset, is_split_into_words=True,add_special_tokens=False,
                        padding=True,truncation=False,return_tensors="pt")
        torch.save(train_encodings, './class_inputData/train_encodings.pt')
        pd.DataFrame(train_labels).to_csv('./class_inputData/train_labels.csv')
        print('train dataset saved')

    print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')

    # Prepare the test data (same load-or-generate logic as above).
    print('test dataset')
    if 'test_encodings.pt' in files and 'test_labels.csv' in files:
        print('read test dataset\n')
        # NOTE: torch.load unpickles the file; only load caches written by this notebook.
        test_encodings = torch.load('./class_inputData/test_encodings.pt')
        test_labels = pd.read_csv('./class_inputData/test_labels.csv',index_col=0)
        test_labels = test_labels['0'].to_list()
    else:
        print('generate test dataset\n')
        test_dataset,test_labels = get_input_from_sampling(test_size)
        test_encodings = tokenizer(test_dataset, is_split_into_words=True,add_special_tokens=False,
                        padding=True,truncation=False,return_tensors="pt")
        torch.save(test_encodings, './class_inputData/test_encodings.pt')
        pd.DataFrame(test_labels).to_csv('./class_inputData/test_labels.csv')
        print('test dataset saved\n')

    # Encode the class-name labels as integer ids.
    train_labels_encoded = _encode_labels(train_labels, class_dict, 'train')
    print(len(train_labels_encoded))
    test_labels_encoded = _encode_labels(test_labels, class_dict, 'test')
    print(len(test_labels_encoded))
    classes = set(train_labels_encoded)|set(test_labels_encoded)
    label_cnt = len(classes)
    print(label_cnt)

    with open('running_output.txt','a') as f:
        f.write('class数量'+str(label_cnt)+'\n')

    return label_cnt,train_encodings,test_encodings,train_labels_encoded,test_labels_encoded


In [10]:
class processDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: mapping from field name to an indexable collection (a
            tensor or a list) with one entry per example.
        labels: sequence of labels, one per example.

    Each item is a dict holding one tensor per encoding field plus a
    ``'labels'`` tensor.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return an independent tensor copy of ``value``.

        ``torch.tensor`` on an existing tensor emits a copy-construct
        ``UserWarning``; ``clone().detach()`` is the recommended equivalent.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        # The number of examples is defined by the labels.
        return len(self.labels)

In [11]:
def getClass(dict,cat):
    """Reverse lookup: return the first key of ``dict`` whose value equals ``cat``.

    Falls back to ``'unknown_class'`` when no value matches.
    """
    matching_names = (name for name, code in dict.items() if cat == code)
    return next(matching_names, 'unknown_class')

In [25]:
def compute_metrics(pred):
    """Compute evaluation metrics for a Trainer prediction and dump reports to disk.

    Args:
        pred: object exposing ``label_ids`` (true label ids) and
            ``predictions`` (per-class scores; argmax over the last axis is
            taken as the predicted id).

    Returns:
        dict with keys ``'accuracy'``, ``'f1'``, ``'precision'``, ``'recall'``.

    Side effects (each file is overwritten on every call):
        ``test_set_result.csv`` — predicted vs. true ids;
        ``class_classification_report.txt`` — per-class report as JSON;
        ``class_classification_report_string_format.txt`` — same report as text;
        ``class_running_output.txt`` — the returned metrics dict.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # NOTE(review): precision/recall/f1 are micro-averaged; in the recorded run
    # they come out equal to accuracy — confirm this averaging is intended.
    micro_scores = precision_recall_fscore_support(y_true, y_pred, average='micro')
    class_result = {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': micro_scores[2],
        'precision': micro_scores[0],
        'recall': micro_scores[1],
    }

    pd.DataFrame({"preds": y_pred, "labels": y_true}).to_csv('test_set_result.csv')

    # Translate ids back to class names (via the module-level class_dict) for the reports.
    predicted_names = [getClass(class_dict, label_id) for label_id in y_pred]
    true_names = [getClass(class_dict, label_id) for label_id in y_true]
    report_as_dict = classification_report(true_names, predicted_names, output_dict=True)
    report_as_text = classification_report(true_names, predicted_names)

    with open('class_classification_report.txt', 'w') as json_out:
        json_out.write(json.dumps(report_as_dict))
    with open('class_classification_report_string_format.txt', 'w') as text_out:
        text_out.write(report_as_text)
    with open('class_running_output.txt', 'w') as summary_out:
        summary_out.write('class_result:' + str(class_result) + '\n')

    return class_result

In [26]:
# Training driver: build the datasets, fine-tune the sequence classifier for
# one epoch, evaluate on both splits, and save the model and a run log.
if __name__=="__main__":

    set_seed(1024)
    model_checkpoint = "albert-base-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    gpu_available = torch.cuda.is_available()
    print('torch.cuda.is_available()',gpu_available)

    label_cnt,train_encodings,test_encodings,train_labels_encoded,test_labels_encoded = input_process(config_classinfo,
                                                                                    model_checkpoint,tokenizer,1,1,class_dict)

    print('input_process_finished\n')
    train_dataset = processDataset(train_encodings, train_labels_encoded)
    test_dataset = processDataset(test_encodings, test_labels_encoded)

    # Size the classification head from the label mapping instead of a
    # hard-coded constant: every encoded label is a value of class_dict, so
    # max id + 1 outputs always cover them. label_cnt is not used here because
    # it only counts the ids that happen to occur in the sampled data.
    num_labels = max(class_dict.values()) + 1
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

    # NOTE(review): learning_rate=5e-3 is far above the range normally used to
    # fine-tune pretrained transformers, and warmup_steps=500 likely exceeds the
    # total number of optimizer steps (the recorded run had 373 training rows at
    # batch size 16 for one epoch). The recorded accuracy is about 0.13 — revisit
    # these values.
    # load_best_model_at_end is intentionally not set: no evaluation is scheduled
    # during training, so there is no "best" checkpoint to reload, and recent
    # transformers releases raise when the eval and save strategies differ.
    training_args = TrainingArguments(
        output_dir='./results',
        learning_rate=5e-3,
        weight_decay=0.01,
        adam_beta1=0.9,
        adam_beta2=0.999,
        adam_epsilon=1e-8,
        num_train_epochs=1,
        logging_steps=150000,
        save_steps=500000,
        no_cuda=False,
        seed=1024,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        warmup_steps=500,
        logging_dir='./logs',
        disable_tqdm=True
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=test_dataset
    )

    # Train the model
    print('Training begins\n')
    start = time.time()
    trainer.train()
    end = time.time()
    print(f"training time: {end - start}")

    # Evaluate on both splits. compute_metrics overwrites its report files on
    # every call, so the files on disk describe the test split (evaluated last).
    train_result = trainer.evaluate(train_dataset)
    print(train_result)
    test_result = trainer.evaluate(test_dataset)
    print(test_result)

    trainer.save_model()
    with open('running_output.txt','a') as f:
        f.write(f"training time: {end - start}"+'\n')
        f.write('train_dataset'+str(train_result)+'\n')        
        f.write('test_dataset'+str(test_result)+'\n')

torch.cuda.is_available() False
train dataset

read train dataset

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
test dataset
read test dataset

373
374
35
input_process_finished



Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You sho

Training begins



Step,Training Loss


training time: 136.9040653705597


[(2, 25), (2, 4), (2, 15), (2, 24), (2, 24), (2, 29), (2, 15), (2, 21), (2, 4), (2, 1)]
{'eval_loss': 3.300023317337036, 'eval_accuracy': 0.128686327077748, 'eval_f1': 0.128686327077748, 'eval_precision': 0.128686327077748, 'eval_recall': 0.128686327077748, 'epoch': 1.0}
[(2, 25), (2, 4), (2, 15), (2, 24), (2, 24), (2, 29), (2, 15), (2, 21), (2, 4), (2, 1)]
{'eval_loss': 3.304050922393799, 'eval_accuracy': 0.12834224598930483, 'eval_f1': 0.12834224598930483, 'eval_precision': 0.12834224598930483, 'eval_recall': 0.12834224598930483, 'epoch': 1.0}


In [27]:
# Run inference on the test split; relies on `trainer` and `test_dataset`
# created by the training cell above.
trainer.predict(test_dataset)

  import sys


KeyboardInterrupt: 

In [13]:
def save_class_mapping():
    """Write a class-name -> label-id mapping to ``class_dict.txt`` as JSON.

    The keys of the module-level ``config_classinfo`` are numbered from 1 in
    iteration order, and ``'unknown_class'`` is mapped to 0. Per the original
    note, this yields 36 classes including the unknown one (this depends on
    the contents of ``config_classinfo``).
    """
    mapping = {
        part_class: label_id
        for label_id, part_class in enumerate(config_classinfo.keys(), start=1)
    }
    mapping['unknown_class'] = 0

    with open('class_dict.txt', 'w') as out:
        out.write(json.dumps(mapping))