In [105]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [106]:
!pip install nlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [107]:
!pip install openai

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [108]:
import pandas as pd
from sklearn.utils import shuffle
import torch
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification, Trainer, TrainingArguments, BertTokenizerFast
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, log_loss
from scipy.special import softmax
from nlp import load_dataset
from nlp import Dataset
import numpy as np
import openai

In [109]:
def compute_metrics(pred):
    """Compute binary-classification metrics for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (gold labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is taken as the
            predicted label).

    Returns:
        dict: ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 reports 0.0 for an undefined precision/recall (e.g. no
    # positive predictions early in training) without emitting a warning on
    # every such evaluation step.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [110]:
# Settings shared by every fine-tuning round; only the number of epochs differs.
_shared_training_kwargs = dict(
    output_dir='/content/results',  # directory where result files are stored
    overwrite_output_dir=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=5e-5,
    load_best_model_at_end=True,
    # Criterion for the "best" model reloaded at the end: highest accuracy.
    metric_for_best_model="accuracy",
    weight_decay=0.01,
    warmup_steps=1,
    # Evaluate on a step schedule ("epoch" would evaluate once per epoch instead).
    # NOTE(review): no eval_steps/save_steps are given; per the transformers docs
    # eval_steps then falls back to logging_steps (1 here) - confirm that
    # checkpoints are saved often enough for load_best_model_at_end to matter.
    evaluation_strategy="steps",
    logging_strategy="steps",
    save_strategy='steps',
    logging_steps=1,
    seed=2023,
    logging_dir='/content/logs',  # directory where logs are stored
)

# Round 0 trains for 3 epochs, round 1 for 2 epochs.
training_args1 = TrainingArguments(num_train_epochs=3, **_shared_training_kwargs)
training_args2 = TrainingArguments(num_train_epochs=2, **_shared_training_kwargs)

training_args = [training_args1, training_args2]

In [111]:
def tokenize_data(X_train,y_train):
  """Tokenize the training texts and attach their labels.

  Relies on the notebook-level ``tokenizer`` and ``max_len``. As a diagnostic,
  prints the largest per-text token count returned by ``tokenizer.tokenize``
  (0 when there are no texts).

  Args:
    X_train: pandas Series of texts (``.tolist()`` is called on it).
    y_train: pandas Series of labels (``.values`` is stored under ``'label'``).

  Returns:
    The tokenizer output with an added ``'label'`` entry.
  """
  # Materialise the texts once instead of on every loop iteration.
  texts = X_train.tolist()
  train_dataset = tokenizer(texts, padding=True, truncation=True, max_length=max_len)
  token_counts = [len(tokenizer.tokenize(text)) for text in texts]
  print(max(token_counts, default=0))
  train_dataset['label'] = y_train.values
  return train_dataset

In [112]:
def reply(index_hard,index_simple,train_data,n_candidates=3):
    """Replace training paragraphs with pre-generated "harder" rewrites.

    For each row label in ``index_hard`` the candidates ``hard0``, ``hard1``, ...
    from the notebook-level ``support_hard`` table are tried in order. The first
    candidate whose predicted label (``train_bert`` in ``'test'`` mode) is not 0
    overwrites ``train_data.loc[i, 'content']``. Processing stops once
    ``np.round(len(index_hard)/2)`` rows have been replaced.

    Args:
        index_hard: Row labels (used with ``.loc``) eligible for replacement.
        index_simple: Unused; kept for interface compatibility. The symmetric
            "simpler" substitution based on ``support_simple`` is disabled.
        train_data: DataFrame with a ``'content'`` column; modified in place.
        n_candidates: Number of ``hard<k>`` columns to try per row.

    Returns:
        None. Prints how many rows were changed.
    """
    change = 0
    quota = np.round(len(index_hard)/2)
    for i in index_hard:
        for attempt in range(n_candidates):
            answer = support_hard.loc[i,'hard'+str(attempt)]
            # Predicted label of this single candidate paragraph.
            pred_train_label = train_bert(answer,None,None,'test',None)
            if pred_train_label != 0:
                # Accept the candidate on whichever attempt it succeeds; the
                # previous counter-based check dropped a success on the last one.
                train_data.loc[i,'content'] = answer
                change += 1
                break
        if change >= quota:
            break
    print('changing',change,'to hard.')


In [113]:
# Read the training paragraphs and the auxiliary tables used by later cells.
train_data = pd.read_csv('/content/data_bio_read.csv')
init_re = pd.read_csv('/content/init_re.csv')
support_hard = pd.read_csv('/content/harder.csv')
support_simple = pd.read_csv('/content/simpler.csv')

# Drop incomplete rows, then store the id and label columns as int64.
train_data = train_data.dropna().astype(
    {'index_article': 'int64', 'index_paragraph': 'int64', 'label': 'int64'}
)

# Every init_re row gets label 0 and takes its article/paragraph ids from the
# train_data row with the same index label (pandas aligns on the index).
init_re = init_re.assign(
    label=0,
    index_article=train_data['index_article'],
    index_paragraph=train_data['index_paragraph'],
)

# Stack init_re on top of train_data; reset_index() renumbers the rows and keeps
# the previous index as an 'index' column.
train_data = pd.concat([init_re, train_data]).reset_index()

In [114]:
# Load the validation and test splits, fix the random seeds, load the
# tokenizer/model, and tokenize both splits for later use.
val_data = pd.read_csv('/content/val_bio_data.csv')# must be generated beforehand
test_data = pd.read_csv('/content/test_bio_data.csv')# must be generated beforehand
# NOTE(review): only test_data is passed through dropna(); val_data is used as read - confirm this is intended.
test_data = test_data.dropna()
seed = 2023
torch.manual_seed(seed) # seed the CPU random number generator
torch.cuda.manual_seed(seed) # seed the current GPU
torch.cuda.manual_seed_all(seed)  # seed all GPUs (multi-GPU setups)
np.random.seed(seed)  # NumPy's global random state
# cuDNN settings chosen for repeatable runs.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

X_val = val_data['content'] # validation texts
y_val = val_data['label']
# Earlier alternative kept for reference: carve the validation split out of train_data.
#X_train, X_val, y_train, y_val = train_test_split(train_data['content'],train_data['label'], test_size=0.1, random_state=2023)

# BERT alternative (disabled):
# tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2,output_hidden_states = False)
# RoBERTa (active): two-class sequence classifier; loaded after the seeds above are set.
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base',num_labels=2,output_hidden_states = False)

# Maximum token length passed to the tokenizer; longer inputs are truncated.
max_len = 216
val_dataset = tokenizer(X_val.tolist(), padding=True, truncation=True, max_length=max_len)
val_dataset['label'] = y_val.values
# No 'label' entry is attached to the test encodings; test labels are read from test_data later.
test_dataset = tokenizer(test_data['content'].tolist(),padding=True, truncation=True, max_length=max_len)
val_dataset = Dataset.from_dict(val_dataset)
test_dataset = Dataset.from_dict(test_dataset)
# Bare expression so the notebook displays the dataset summary.
test_dataset

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Dataset(features: {'input_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}, num_rows: 140)

In [115]:
def train_bert(train_dataset,val_dataset,test_dataset,status,training_args):
  """Fine-tune the notebook-level ``model`` or classify a single paragraph.

  Args:
    train_dataset: With ``status == 'train'`` the tokenized training dataset;
      otherwise a single paragraph to classify.
    val_dataset: Evaluation dataset handed to the ``Trainer``.
    test_dataset: Dataset predicted after training (``'train'`` mode only).
    status: ``'train'`` to train and predict; any other value predicts only.
    training_args: Arguments forwarded to the ``Trainer``.

  Returns:
    ``'train'`` mode: ``(train_labels, test_labels)`` as arrays of predicted
    class indices. Otherwise: the predicted class index array for the paragraph.
  """
  trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
  )

  def _predict_labels(dataset):
    # Raw scores -> class probabilities -> index of the most probable class.
    scores = trainer.predict(test_dataset=dataset).predictions
    return softmax(scores, axis = 1).argmax(-1)

  if status != 'train':
    # The input is one paragraph: wrap it in a one-row frame and tokenize it.
    single = pd.DataFrame(columns=['content'], index=[0])
    single.loc[0,'content'] = train_dataset
    encoded = tokenizer(single['content'].tolist(),padding=True,truncation=True,max_length=max_len)
    return _predict_labels(Dataset.from_dict(encoded))

  trainer.train()
  pred_label_train = _predict_labels(train_dataset)
  pred_label = _predict_labels(test_dataset)
  return pred_label_train,pred_label


In [116]:
def judge(y_train,y_train_pred):
  """Split the label-0 training rows by whether the model predicted them as 0.

  Gold labels and predictions are paired by position. When ``y_train`` is a
  pandas Series the returned identifiers are its index labels, so they stay
  valid for ``.loc`` lookups even if the index has gaps (e.g. after
  ``dropna``); for other sequences they are positions.

  Args:
    y_train: Gold labels (pandas Series or any sequence).
    y_train_pred: Predicted labels, indexable by position.

  Returns:
    ``(index_hard, index_simple)``: identifiers of label-0 rows predicted as 0,
    and of label-0 rows predicted as anything else.
  """
  if isinstance(y_train, pd.Series):
    keys = y_train.index.tolist()
    truth = y_train.tolist()
  else:
    truth = list(y_train)
    keys = list(range(len(truth)))

  index_hard = []
  index_simple = []
  for pos, (key, label) in enumerate(zip(keys, truth)):
    if label != 0:
      continue  # only label-0 rows are classified into the two lists
    if label == y_train_pred[pos]:
      index_hard.append(key)
    else:
      index_simple.append(key)
  return index_hard,index_simple


In [117]:
# Iterative fine-tuning: one pass per entry of `training_args`. Each pass
# trains on the current training frame, reports test metrics, then swaps some
# correctly classified label-0 paragraphs for harder rewrites before the next pass.
# The loop variable is deliberately not called `round`, which would shadow the
# builtin for every later cell in the kernel.
for round_idx, round_args in enumerate(training_args):
  train_data = train_data.dropna()
  X_train = train_data['content']
  y_train = train_data['label']
  train_dataset = tokenize_data(X_train,y_train)
  train_dataset = Dataset.from_dict(train_dataset)
  train_label,test_label = train_bert(train_dataset,val_dataset,test_dataset,'train',round_args)
  acc = accuracy_score(test_data['label'], test_label)
  precision, recall, f1, _ = precision_recall_fscore_support(test_data['label'], test_label, average='binary')
  print('in round',round_idx,'testing acc is:',acc)
  print(precision,recall,f1)
  index_hard, index_simple = judge(train_data['label'],train_label)
  # Modifies train_data in place; the change is picked up by the next pass.
  reply(index_hard,index_simple,train_data)

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


370


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.7219,0.716214,0.5,0.666667,0.5,1.0
2,0.7301,0.701103,0.5,0.666667,0.5,1.0
3,0.6682,0.695877,0.5,0.666667,0.5,1.0
4,0.7532,0.693618,0.492958,0.608696,0.495575,0.788732
5,0.6999,0.693555,0.5,0.0,0.0,0.0
6,0.706,0.690709,0.514085,0.054795,1.0,0.028169
7,0.7019,0.690296,0.507042,0.027778,1.0,0.014085
8,0.7023,0.689257,0.697183,0.718954,0.670732,0.774648
9,0.6895,0.688078,0.5,0.666667,0.5,1.0
10,0.6911,0.685058,0.5,0.666667,0.5,1.0


  _warn_prf(average, modifier, msg_start, len(result))


in round 0 testing acc is: 0.7357142857142858
0.8113207547169812 0.6142857142857143 0.6991869918699187


changing 33 to hard.
370




Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4576,0.598775,0.676056,0.634921,0.727273,0.56338
2,0.5618,0.73764,0.640845,0.690909,0.606383,0.802817
3,0.8395,0.67259,0.612676,0.682081,0.578431,0.830986
4,0.8253,0.631247,0.661972,0.578947,0.767442,0.464789
5,0.677,0.659752,0.640845,0.523364,0.777778,0.394366
6,0.7091,0.652324,0.647887,0.545455,0.769231,0.422535
7,0.6427,0.654,0.647887,0.537037,0.783784,0.408451
8,0.7564,0.650105,0.65493,0.550459,0.789474,0.422535
9,0.6573,0.636312,0.661972,0.578947,0.767442,0.464789
10,0.6184,0.621437,0.683099,0.615385,0.782609,0.507042


in round 1 testing acc is: 0.7357142857142858
0.7894736842105263 0.6428571428571429 0.7086614173228346


changing 21 to hard.
