In [1]:
import csv

import numpy as np
import pandas as pd

import config
import NLP_model
import utl

  from .autonotebook import tqdm as notebook_tqdm


# Prepare test input data, training data, and evaluation data

In [69]:
# Load both workbooks as NumPy arrays. In each of them column 0 feeds the
# feature lists and column 1 feeds the scenario lists built below.
# ref_data: reference data used for manual result checking; also the training source.
ref_data = pd.read_excel('./data/s2f_v2.xlsx').values
# ref_data2: a different workbook, used for the test and evaluation sets.
ref_data2 = pd.read_excel('./data/Book2.xlsx').values

# Test data: every row of ref_data2.
F_data, S_data = (ref_data2[:, col].tolist() for col in (0, 1))

# Evaluation data: the first 1000 rows of ref_data2 (a subset of the test rows).
Feature_eval, Scenario_eval = (ref_data2[:1000, col].tolist() for col in (0, 1))

# Training data: every row of ref_data.
Feature_list, Scenario_list = (ref_data[:, col].tolist() for col in (0, 1))

In [70]:
# Text validation and cleaning.
# `dic` lists the special characters handed to utl.data_check together with
# each (feature list, scenario list) pair; the checked lists replace the originals.
dic = [
    '~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')',
    '_', '+', '=', '.', '[', ']', '\\', '/', ';', ':',
]
F_data, S_data = utl.data_check(F_data, S_data, dic)                    # test set
Feature_eval, Scenario_eval = utl.data_check(Feature_eval, Scenario_eval, dic)  # evaluation set
Feature_list, Scenario_list = utl.data_check(Feature_list, Scenario_list, dic)  # training set

# Prepare model config and model

In [71]:
# Available models: all-MiniLM-L6-v2, bert-base-chinese, ms-marco-MiniLM-L-6-v2,
# multi-qa-MiniLM-L6-cos-v1, paraphrase-MiniLM-L6-v2.
# The fine-tuned model is what is ultimately used; among the models listed above,
# bert-base-chinese performed best.
model_name = 'bert-base-chinese'

ct = config.config(
    model_name=model_name,              # which model to use (see the list above)
    initial=False,                      # chooses between the pretrained and the fine-tuned model; False loads the fine-tuned one
    pretrainFile='./pretrain_model/',   # where pretrained models are stored
    finetuneFile='./finetune_model/',   # where fine-tuned models are stored
    resultsFile='./Results/',           # where results are written
    max_expand=1.5,                     # multiplier used during manual review to boost the similarity of manually
                                        # selected matching text pairs; chosen from case tests, tune as more data arrives
    ref_data=ref_data,
)
Bert_model = NLP_model.NLP_sim(ct)  # initialise the model wrapper

# Model prediction

In [42]:
# Compute the model's similarity table: one axis holds every input feature, the
# other every scenario. The table is saved into the results folder set in the config.
#
# Arguments of Get_SimMatrix, in order:
#   1. whether to use the normalised computation
#   2. the input feature data (one or many features)
#   3. all scenarios
#   4. the file name to save under
#
# `sim` is the matrix of similarities between all features and all scenarios.
# The values are cosine similarities between embedding vectors, not a
# probability distribution.
# NOTE(review): list(set(...)) order can differ between kernel sessions, so the
# scenario order of the saved table is not reproducible across restarts.
scenario_pool = list(set(S_data))
sim, embeddings_F, embeddings_S = Bert_model.Get_SimMatrix(
    True,
    F_data,
    scenario_pool,
    model_name + '_original.csv',
)

In [43]:
# Building on the similarity results above, select the final scenarios using the
# multi-path output asked for in the requirements.
# k1, k2 and b are the parameters of the BM25-style filter (written "BM2.5" in
# the original note); values follow https://zhuanlan.zhihu.com/p/79202151
k1 = k2 = 1.5
b = 0.75
candidate_scenarios = list(set(S_data))
order_scenario = Bert_model.Select_Scenario(
    F_data,
    candidate_scenarios,
    sim,
    k1,
    k2,
    b,
    model_name + '_orderedScenario.csv',
)

## 补充单个feature输入的计算结果。
## 以'加速踏板防误踩 AMAP  (Anti-maloperation for Accelerator Pedal)'为例，可以替换为其他feature

In [44]:
# The same call also works for a single input: pass one feature and get its
# similarity to every scenario.
test_feature = ['加速踏板防误踩 AMAP  (Anti-maloperation for Accelerator Pedal)']

# sim_single holds the similarity of the given feature to all scenarios (a matrix).
single_scenario_pool = list(set(S_data))
sim_single, embeddings_Fs, embeddings_Ss = Bert_model.Get_SimMatrix(
    True,
    test_feature,
    single_scenario_pool,
    model_name + '_single.csv',
)

# 由于业务部门提出“需要查看给定的feature与scenario对的相似度结果以进行检验”，以下代码为此专门编写
## 输入一个feature，一个scenario，输出这一对的相似度。循环计算所有文本对的相似度，保存为csv以方便查看

In [72]:
# Prepare the test text pairs straight from ref_data2
# (column 0 -> features, column 1 -> scenarios).
F_data = ref_data2[:, 0].tolist()
S_data = ref_data2[:, 1].tolist()
# Embed both sides of the pairs.
Fembedding = Bert_model._GetEmbedding_(F_data)
Sembedding = Bert_model._GetEmbedding_(S_data)

# Normalisation needs a pool of standard scenarios: the first column of
# S_Data.xlsx merged with the scenarios above, de-duplicated.
add_S = ref_data2[:, 1].tolist()
standard_scenarios = pd.read_excel('./data/S_Data.xlsx').values[:, 0].tolist()
SS_data = list(set(standard_scenarios + add_S))
# Embed the standard scenario pool.
SSembedding = Bert_model._GetEmbedding_(SS_data)

# Rows are collected in `results` (first row = column titles) and written to CSV below.
results = [['Scenario', 'Feature', 'sim number original', 'sim number softmax', 'sim number normalized']]

# For every given (feature, scenario) pair compute the raw, softmax-scaled and
# min-max normalised similarity.
for scenario_text, feature_text, feature_emb, scenario_emb in zip(S_data, F_data, Fembedding, Sembedding):
    # Cosine similarity of the pair itself and of the feature against the whole pool.
    cos_scores = Bert_model._GetSim_cos_(feature_emb, scenario_emb)
    base_scores = Bert_model._GetSim_cos_(feature_emb, SSembedding)

    # Softmax over the pool; the pair's own score shares the pool's denominator.
    base_exp = np.exp(base_scores.numpy())
    base_exp_total = np.sum(base_exp)
    softmax_sim_base = base_exp / base_exp_total
    softmax_sim = np.exp(cos_scores.numpy()) / base_exp_total

    # Min-max normalise the pair's value against the pool's softmax range.
    pool_min = softmax_sim_base.min()
    pool_max = softmax_sim_base.max()
    hc_sim = (softmax_sim - pool_min) / (pool_max - pool_min)

    results.append([scenario_text, feature_text, cos_scores[0], softmax_sim[0], hc_sim[0]])

pd.DataFrame(results).to_csv('./Results/'+model_name+'_Final_results_2.csv', encoding="utf_8_sig")

# Model fine-tune

In [41]:
# Fine-tune the model on the training pairs, passing the evaluation pairs alongside.
# The third result is bound to `eval_result` rather than `eval`, so the built-in
# `eval` is not shadowed for the rest of the kernel session.
# The last argument (10) is presumably the number of epochs -- the recorded run
# logged 10 epochs; confirm against NLP_model.Fine_tune.
train_loss, evaluator, eval_result = Bert_model.Fine_tune(
    Feature_list,
    Scenario_list,
    Feature_eval,
    Scenario_eval,
    10,
)

Iteration: 100%|██████████| 132/132 [06:19<00:00,  2.88s/it]
Iteration: 100%|██████████| 132/132 [06:10<00:00,  2.81s/it]
Iteration: 100%|██████████| 132/132 [06:09<00:00,  2.80s/it]
Iteration: 100%|██████████| 132/132 [06:07<00:00,  2.78s/it]
Iteration: 100%|██████████| 132/132 [06:10<00:00,  2.81s/it]
Iteration: 100%|██████████| 132/132 [06:11<00:00,  2.81s/it]
Iteration: 100%|██████████| 132/132 [06:11<00:00,  2.81s/it]
Iteration: 100%|██████████| 132/132 [06:08<00:00,  2.79s/it]
Iteration: 100%|██████████| 132/132 [06:11<00:00,  2.81s/it]
Iteration: 100%|██████████| 132/132 [06:08<00:00,  2.79s/it]
Epoch: 100%|██████████| 10/10 [1:03:08<00:00, 378.85s/it]


# 应上一轮修改需求，检验模型对特殊字符的适应性，Test special vocab
## 所有字符（包括特殊字符）数据给定在'./data/vocab.txt'中

In [None]:
# Check how the model handles special characters and save the result to CSV.
#
# The vocabulary file must be read literally, otherwise pandas rewrites tokens:
#   dtype=str               keeps numeric-looking tokens as text
#   keep_default_na=False   keeps tokens such as "NA", "null", "nan" instead of turning them into NaN
#   quoting=csv.QUOTE_NONE  treats quote characters as ordinary tokens rather than field quoting
# NOTE(review): as before, the first line of the file is consumed as the header
# row; pass header=None if vocab.txt has no header line.
data = pd.read_table(
    './data/vocab.txt',
    dtype=str,
    keep_default_na=False,
    quoting=csv.QUOTE_NONE,
).values

# Embed each token on its own (first column of every row).
Vembedding = [Bert_model._GetEmbedding_(dt[0]).tolist() for dt in data]

# Transposed before saving, as in the original layout.
pdv = pd.DataFrame(np.array(Vembedding).T)
pdv.to_csv('./Results/vacab_results.csv')

# Pretrain示例
### 由于补充收集了“相关数据”，而该补充数据由于缺少人工的“相似与否”标定，只能用于预训练，因此补充预训练
### 由于预训练需要大量数据与计算资源，而目前收集的补充数据数量上无法达到这个标准，因此目前收集的数据并不能有效支持该过程取得完美效果，
### 但该接口是可用的，故保留，以便后续数据足够时使用

In [10]:
# Build the pretraining corpora.
# The pretraining data comes from two sources: data obtained from Hugging Face
# and data collected from Zhihu columns. Column 1 of each CSV is used, and the
# two files of each split are concatenated in order.
pre_train_data = [
    text
    for csv_path in ('./data/pretrain_traindata.csv', './data/pretrain_traindata2.csv')
    for text in pd.read_csv(csv_path).iloc[:, 1].values.tolist()
]
pre_test_data = [
    text
    for csv_path in ('./data/pretrain_testdata.csv', './data/pretrain_testdata2.csv')
    for text in pd.read_csv(csv_path).iloc[:, 1].values.tolist()
]

In [11]:
# Remove empty values from the pretraining data.
# Entries are kept only when they are real text. Comparing each entry's type
# with the type of the first entry (the previous approach) keeps the nulls and
# drops all the text whenever the first row happens to be empty.
tem_pre_train_data = [item for item in pre_train_data if isinstance(item, str)]
tem_pre_test_data = [item for item in pre_test_data if isinstance(item, str)]

pre_train_data = tem_pre_train_data
pre_test_data = tem_pre_test_data

In [12]:
# Pretraining hyperparameters.
# Because of the acceptance deadline these are simply the values recommended by
# the existing code; no cross-validation was done, so treat them as benchmark
# hyperparameters.
pre_ct = config.pretrain_config(
    num_train_epochs=1,              # number of epochs
    per_device_train_batch_size=64,  # batch size
    max_length=512,                  # max length for a text input
    mlm_prob=0.15,                   # probability that a word is replaced by a [MASK] token
    do_whole_word_mask=True,         # if True, whole words are masked
    use_fp16=False,                  # set to True if the GPU supports FP16 operations
    save_steps=1,                    # save the model every given number of steps
)

In [13]:
# The TrainingArguments used for pretraining currently cannot handle Chinese
# characters, so the model save path is passed in explicitly and must not
# contain any Chinese characters (do not save under a Chinese-named directory).
# NOTE(review): this is a machine-specific absolute path.
save_path = 'C:/testmodel/'

# Only the first 100 training texts are used for this run.
pretrain_subset = pre_train_data[:100]
Bert_model.Pre_train(pretrain_subset, pre_test_data, pre_ct, save_path)

Some weights of the model checkpoint at ./pretrain_model/bert-base-chinese were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'cls.seq_relationship.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/2 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
 50%|█████     | 1/2 [00:14<00:14, 14.21s/it]

{'loss': 1.6962, 'learning_rate': 2.5e-05, 'epoch': 0.5}



 50%|█████     | 1/2 [01:29<00:14, 14.21s/it]    

{'eval_loss': 1.5047575235366821, 'eval_runtime': 75.6461, 'eval_samples_per_second': 15.017, 'eval_steps_per_second': 1.877, 'epoch': 0.5}


100%|██████████| 2/2 [01:42<00:00, 58.01s/it]

{'loss': 1.7433, 'learning_rate': 0.0, 'epoch': 1.0}



100%|██████████| 2/2 [02:58<00:00, 58.01s/it]    

{'eval_loss': 1.4644050598144531, 'eval_runtime': 75.2966, 'eval_samples_per_second': 15.087, 'eval_steps_per_second': 1.886, 'epoch': 1.0}


100%|██████████| 2/2 [02:59<00:00, 89.76s/it]


{'train_runtime': 179.5064, 'train_samples_per_second': 0.557, 'train_steps_per_second': 0.011, 'train_loss': 1.7197410464286804, 'epoch': 1.0}
