In [89]:
# Install `transformers`: source of the BERT/RoBERTa model and tokenizer classes,
# `Trainer` and `TrainingArguments` imported further down in this notebook.
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [90]:
# Install `nlp`: source of the `load_dataset` function and the `Dataset` class
# imported further down in this notebook.
!pip install nlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [91]:
# Install `openai`: imported in the import cell below.
# NOTE(review): no call into `openai` is visible in this part of the notebook -
# confirm it is still needed.
!pip install openai

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [92]:
import pandas as pd
from sklearn.utils import shuffle
import torch
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification, Trainer, TrainingArguments,BertTokenizerFast
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification,Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, log_loss
from scipy.special import softmax
from nlp import load_dataset
from nlp import Dataset
import numpy as np
import openai

In [93]:
def compute_metrics(pred):
    """Compute binary-classification metrics from a prediction object.

    Args:
        pred: Object exposing ``label_ids`` (the true labels) and
            ``predictions`` (per-class scores); the arg-max over the last
            axis of ``predictions`` is used as the predicted class.

    Returns:
        dict: ``accuracy``, ``f1``, ``precision`` and ``recall`` computed with
        ``average='binary'``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 yields the same 0.0 scores as the default when the model
    # predicts no positive samples, but without emitting a warning on every
    # evaluation step.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [94]:
# Settings shared by both fine-tuning rounds; only the number of epochs differs.
_shared_training_kwargs = dict(
    output_dir='/content/results',      # directory where result files are stored
    overwrite_output_dir=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=5e-5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",   # criterion used to pick the model loaded at the end
    weight_decay=0.01,
    warmup_steps=1,
    evaluation_strategy="steps",        # evaluate on a step schedule ("epoch" is the alternative)
    logging_strategy="steps",
    save_strategy='steps',
    logging_steps=1,
    seed=2023,
    logging_dir='/content/logs',        # directory where logs are stored
)

# Round 0 trains for three epochs, round 1 for a single epoch.
training_args1 = TrainingArguments(num_train_epochs=3, **_shared_training_kwargs)
training_args2 = TrainingArguments(num_train_epochs=1, **_shared_training_kwargs)

# Indexed by round number in the training loop.
training_args = [training_args1, training_args2]

In [95]:
def tokenize_data(X_train,y_train):
  """Tokenize the training texts and attach their labels.

  Uses the notebook-level ``tokenizer`` and ``max_len``. As a diagnostic, the
  largest ``len(tokenizer.tokenize(text))`` over all texts is printed, so it
  can be compared against ``max_len``.

  Args:
    X_train: Texts; must provide ``.tolist()`` (a pandas Series in this notebook).
    y_train: Labels; must provide ``.values``.

  Returns:
    The tokenizer output (padded, truncated to ``max_len``) with an extra
    ``'label'`` entry holding ``y_train.values``.
  """
  # Convert once; the original re-built this list on every loop iteration.
  texts = X_train.tolist()
  train_dataset = tokenizer(texts, padding=True, truncation=True, max_length=max_len)
  token_counts = [len(tokenizer.tokenize(text)) for text in texts]
  # default=0 avoids a ValueError from max() when there are no texts.
  print(max(token_counts, default=0))
  train_dataset['label'] = y_train.values
  return train_dataset

In [96]:
def reply(index_hard,index_simple,train_data):
    """Replace texts of selected rows with a 'harder' rewrite.

    For every row label in ``index_hard`` the candidate rewrites stored in the
    notebook-level ``support_hard`` frame (columns ``hard0``, ``hard1``,
    ``hard2``) are tried in order. The first candidate for which
    ``train_bert(..., 'test', ...)`` predicts a non-zero label replaces
    ``train_data.loc[i, 'content']``. Processing stops once
    ``np.round(len(index_hard)/2)`` rows have been replaced.

    Args:
        index_hard: Row labels (valid for both ``train_data`` and
            ``support_hard``) to try to replace.
        index_simple: Currently unused; the symmetric pass over
            ``support_simple`` / ``simpleN`` columns is disabled.
        train_data: DataFrame whose ``content`` column is modified in place.

    Returns:
        None. Progress is reported via ``print``.
    """
    max_candidates = 3  # columns hard0 .. hard2
    change = 0
    for i in index_hard:
        accepted = False
        for count in range(max_candidates):
            answer = support_hard.loc[i, 'hard' + str(count)]
            # Predicted label of the current model for this single candidate.
            pred_train_label = train_bert(answer, None, None, None, 'test', None)
            if pred_train_label != 0:
                # Track success explicitly: the previous `count < 3` check
                # discarded a rewrite that only succeeded on the last candidate.
                accepted = True
                break
        if accepted:
            train_data.loc[i, 'content'] = answer
            print(train_data.loc[i, 'label'])
            change += 1
        if change >= np.round(len(index_hard)/2):
            break
    print('changing',change,'to hard.')
    # NOTE: the analogous "make simpler" pass over index_simple is intentionally
    # not run at the moment.


In [97]:
# Training corpus: drop incomplete rows, then force integer dtypes on the id and
# label columns.
train_data = (
    pd.read_csv('/content/data_bio_read.csv')
    .dropna()
    .astype({'index_article': 'int64', 'index_paragraph': 'int64', 'label': 'int64'})
)

# Auxiliary frames: initial rewrites plus the "harder"/"simpler" candidates.
init_re = pd.read_csv('/content/init_re.csv')
support_hard = pd.read_csv('/content/harder.csv')
support_simple = pd.read_csv('/content/simpler.csv')

# Every init_re row gets label 0; article/paragraph ids are copied from
# train_data (pandas aligns these assignments on the row index).
init_re = init_re.assign(
    label=0,
    index_article=train_data['index_article'],
    index_paragraph=train_data['index_paragraph'],
)

# init_re rows come first; reset_index() keeps the old index as an 'index' column.
train_data = pd.concat([init_re, train_data]).reset_index()

In [98]:
# --- Evaluation data (these CSV files must be generated beforehand) ---
# NOTE(review): val_data is not passed through dropna(), unlike the other two.
val_data = pd.read_csv('/content/val_bio_data.csv')
test_data = pd.read_csv('/content/test_bio_data.csv').dropna()
OOD_data = pd.read_csv('/content/data_OOD.csv').dropna()

# --- Reproducibility ---
seed = 2023
torch.manual_seed(seed)            # seed the CPU generator
torch.cuda.manual_seed(seed)       # seed the current GPU
torch.cuda.manual_seed_all(seed)   # seed every GPU when running multi-GPU
np.random.seed(seed)               # seed NumPy
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# A fixed validation file is used instead of a random split:
# X_train, X_val, y_train, y_val = train_test_split(train_data['content'],train_data['label'], test_size=0.1, random_state=2023)
X_val = val_data['content']
y_val = val_data['label']

# --- Model and tokenizer: BERT ---
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2,output_hidden_states = False)
# RoBERTa alternative:
# tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
# model = RobertaForSequenceClassification.from_pretrained('roberta-base',num_labels=2,output_hidden_states = False)

max_len = 216


def _encode(texts):
  """Tokenize a column of texts with padding and truncation to ``max_len``."""
  return tokenizer(texts.tolist(), padding=True, truncation=True, max_length=max_len)


# Only the validation set carries labels into its Dataset.
val_dataset = _encode(X_val)
val_dataset['label'] = y_val.values
val_dataset = Dataset.from_dict(val_dataset)

test_dataset = Dataset.from_dict(_encode(test_data['content']))
OOD_dataset = Dataset.from_dict(_encode(OOD_data['content']))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [99]:
def _predict_labels(trainer, dataset):
  """Return the arg-max class index per example of ``dataset``."""
  scores = trainer.predict(test_dataset=dataset).predictions
  return softmax(scores, axis = 1).argmax(-1)


def train_bert(train_dataset,val_dataset,test_dataset,OOD, status,training_args):
  """Fine-tune the notebook-level ``model`` or classify a single paragraph.

  Args:
    train_dataset: With ``status == 'train'``, the tokenized training dataset.
      With any other status, a single raw paragraph to classify.
    val_dataset: Evaluation dataset handed to the ``Trainer`` (may be None).
    test_dataset: Dataset predicted after training ('train' mode only).
    OOD: Out-of-distribution dataset predicted after training ('train' mode only).
    status: ``'train'`` to train and predict; anything else selects
      single-paragraph inference.
    training_args: Arguments forwarded to ``Trainer`` (may be None).

  Returns:
    In 'train' mode a tuple ``(train_labels, test_labels, ood_labels)`` of
    predicted class indices. Otherwise an array with the predicted class index
    of the single paragraph.
  """
  if status == 'train':
    trainer = Trainer(model=model,args=training_args,compute_metrics=compute_metrics,train_dataset=train_dataset,eval_dataset=val_dataset,tokenizer=tokenizer)
    trainer.train()
    pred_label_train = _predict_labels(trainer, train_dataset)
    pred_label = _predict_labels(trainer, test_dataset)
    OOD_pred_label = _predict_labels(trainer, OOD)
    return pred_label_train,pred_label, OOD_pred_label

  # Inference mode: `train_dataset` is one raw paragraph. Tokenize it as a
  # one-element batch and do not hand the raw text to the Trainer as a
  # training dataset.
  encoded = tokenizer([train_dataset],padding=True,truncation=True,max_length=max_len)
  trainer = Trainer(model=model,args=training_args,compute_metrics=compute_metrics,eval_dataset=val_dataset,tokenizer=tokenizer)
  return _predict_labels(trainer, Dataset.from_dict(encoded))


In [100]:
def judge(y_train,y_train_pred):
  """Split the label-0 training rows by whether the model got them right.

  Labels and predictions are matched by position. When ``y_train`` is a pandas
  Series the returned identifiers are its index labels (so they can be used
  with ``.loc`` even if the index is not 0..n-1, e.g. after ``dropna()``);
  for other sequences they are plain positions.

  Args:
    y_train: True labels (pandas Series or any sequence).
    y_train_pred: Predicted labels, positionally aligned with ``y_train``.

  Returns:
    tuple: ``(index_hard, index_simple)`` where ``index_hard`` lists the rows
    with true label 0 that were predicted as 0 and ``index_simple`` lists the
    rows with true label 0 that were predicted as something else.
  """
  if isinstance(y_train, pd.Series):
    keys = y_train.index.tolist()
    truths = y_train.tolist()
  else:
    truths = list(y_train)
    keys = list(range(len(truths)))
  # np.asarray makes the predictions positional even if a Series is passed.
  preds = np.asarray(y_train_pred)

  index_hard = []
  index_simple = []
  for pos, (key, truth) in enumerate(zip(keys, truths)):
    if truth != 0:
      continue
    if truth == preds[pos]:
      index_hard.append(key)
    else:
      index_simple.append(key)
  return index_hard,index_simple


In [101]:
def _report_split(round_idx, header, y_true, y_pred):
  """Print accuracy, then precision / recall / F1, for one evaluation split."""
  acc = accuracy_score(y_true, y_pred)
  precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
  print('in round',round_idx,header,acc)
  print(precision,recall,f1)


# One training round per entry of `training_args`. The loop variable is not
# called `round`, so the builtin of that name stays usable in later cells.
for round_idx, round_args in enumerate(training_args):
  train_data = train_data.dropna()
  X_train = train_data['content']
  y_train = train_data['label']
  train_dataset = tokenize_data(X_train,y_train)
  train_dataset = Dataset.from_dict(train_dataset)
  train_label,test_label, OOD_label = train_bert(train_dataset,val_dataset,test_dataset,OOD_dataset,'train',round_args)
  _report_split(round_idx, 'testing acc is:', test_data['label'], test_label)
  _report_split(round_idx, 'OOD acc is:', OOD_data['label'], OOD_label)
  # Pick the label-0 rows the model classified correctly and try to replace
  # their text with a harder rewrite before the next round.
  index_hard, index_simple = judge(train_data['label'],train_label)
  reply(index_hard,index_simple,train_data)

368


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6879,0.703475,0.5,0.663507,0.5,0.985915
2,0.7303,0.719696,0.514085,0.168675,0.583333,0.098592
3,0.7941,0.711322,0.507042,0.027778,1.0,0.014085
4,0.6372,0.711232,0.5,0.0,0.0,0.0
5,0.777,0.717497,0.5,0.0,0.0,0.0
6,0.7869,0.711808,0.5,0.0,0.0,0.0
7,0.6465,0.710771,0.5,0.0,0.0,0.0
8,0.7937,0.705189,0.5,0.0,0.0,0.0
9,0.7266,0.698886,0.5,0.0,0.0,0.0
10,0.6709,0.694916,0.5,0.0,0.0,0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


in round 0 testing acc is: 0.7
0.9666666666666667 0.4142857142857143 0.5800000000000001
in round 0 OOD acc is: 0.5963041933191187
0.9688581314878892 0.19900497512437812 0.33018867924528306


0


0


0


0


0


0


0


0


0


0


0


0


changing 12 to hard.
368




Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2324,0.637861,0.704225,0.637931,0.822222,0.521127
2,0.2961,0.518239,0.746479,0.731343,0.777778,0.690141
3,0.5176,0.675807,0.690141,0.584906,0.885714,0.43662
4,0.3428,1.160646,0.535211,0.153846,0.857143,0.084507
5,0.9174,1.200044,0.514085,0.054795,1.0,0.028169
6,1.0353,1.063973,0.542254,0.177215,0.875,0.098592
7,0.5922,0.805077,0.591549,0.355556,0.842105,0.225352
8,0.545,0.536068,0.71831,0.672131,0.803922,0.577465
9,0.5288,0.495991,0.802817,0.802817,0.802817,0.802817
10,0.53,0.496687,0.802817,0.802817,0.802817,0.802817


in round 1 testing acc is: 0.7071428571428572
0.8918918918918919 0.4714285714285714 0.6168224299065421
in round 1 OOD acc is: 0.7061122956645345
0.9559748427672956 0.43212508884150674 0.5952031326480666


0


0


0


0


0


0


0


0


0


0


0


0


0


0


0


changing 15 to hard.
