# Install dependencies 

In [1]:
!pip install -q transformers
!pip install -q numpy
!pip install -q pandas
!pip install -q argparse
!pip install -q torch
# A dependency of the preprocessing for BERT inputs
!pip install -q -U tensorflow-text
!pip install -q tf-models-official

In [2]:
#Import needed classes
import sys
import numpy as np
import pandas as pd
import os
import warnings
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')
# Move the working directory up one level, presumably so that `from Bert import Bert`
# and the relative `data/...` paths used later resolve.
# NOTE: the path is relative to the current directory, so every re-run of this cell
# moves up one more level. Run it only once per kernel session.
os.chdir('..')
from Bert import Bert


# Import correct dataset and set hyperparameters for GPT2

Set `data_name` to the name of your dataset. This must correspond to a folder in /data/, which should be generated by the generate_data.ipynb notebook. `num_classes` must be set manually to the number of classes in your dataset.

In [3]:
# Name of the dataset; used below to build paths of the form data/{data_name}/...
data_name = "imdb"
# Number of classes in the dataset; passed to the Bert classifier.
num_classes = 2

# Other hyperparameters and choices, passed to GPT2Tuner.py as command-line arguments
epochs = 5
batch_size = 2
device = "cuda"
repeat_num = 10

# Fine tune GPT2

This cell runs the GPT2Tuner.py script with different arguments in order to fine-tune GPT2 on the data and then generate the sentences. The sentences are saved to the correct folder so that BERT can later select the good ones.

In [4]:
def generate_run_string(data_name,sample_number,epochs,batch_size,device,repeat_num,samples_per_class):
    """Build the shell command line that runs GPT2Tuner.py for one training-set size.

    Every interpolated value is quoted for a POSIX shell, so a value containing
    spaces or shell metacharacters (e.g. a dataset folder named "my data") is passed
    as a single argument instead of breaking or altering the command. Values made
    only of safe characters are left unchanged.

    Args:
        data_name: Name of the dataset folder under data/.
        sample_number: Suffix of the train_labeled_<n>.csv input file and of the
            generated_samples_<n>.txt output file.
        epochs: Value for --epochs.
        batch_size: Value for --batch_size.
        device: Value for --device (e.g. "cuda").
        repeat_num: Value for --repeat_num.
        samples_per_class: Value for --samples_per_class.

    Returns:
        The complete command line as a single string.
    """
    # Local import keeps this cell self-contained.
    from shlex import quote

    data_dir = f"data/{data_name}"
    arguments = [
        ("--train_data_path", f"{data_dir}/train_labeled_{sample_number}.csv"),
        ("--output_name", f"generated_samples_{sample_number}.txt"),
        ("--output_dir", data_dir),
        ("--epochs", epochs),
        ("--batch_size", batch_size),
        ("--device", device),
        # Fixed seeds so that repeated runs use the same seed values
        ("--torch_seed", 1),
        ("--numpy_seed", 2),
        ("--random_seed", 3),
        ("--repeat_num", repeat_num),
        ("--samples_per_class", samples_per_class),
    ]

    command = ["python", "GPT2Tuner.py"]
    for flag, value in arguments:
        command += [flag, quote(str(value))]
    return " ".join(command)

# Fine-tune GPT2 and generate samples once per training-set size. IPython evaluates the
# {...} expression and runs the resulting command line in a shell. The current values of
# the notebook-level `epochs` and `batch_size` variables are used.
for datapoints in [5, 10, 25, 50]:
    !{generate_run_string(data_name = data_name, sample_number = datapoints, epochs = epochs, batch_size = batch_size, device = device, repeat_num = repeat_num, samples_per_class = datapoints)}


Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.
Starting training:
avg_train_loss 43.82683486938477
elapsed time for 1 training epoch :  0:00:03
avg_train_loss 4.536903810501099
elapsed time for 1 training epoch :  0:00:03
avg_train_loss 3.741558790206909
elapsed time for 1 training epoch :  0:00:03
avg_train_loss 3.2865145206451416
elapsed time for 1 training epoch :  0:00:03
avg_train_loss 2.968233323097229
elapsed time for 1 training epoch :  0:00:03
Generating sequences
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_i

# Set hyperparameters for BERT

These are the hyperparameters used for the BERT classifier that selects which of the synthesized sentences we keep.

In [5]:
# NOTE: `batch_size` and `epochs` rebind the names set in the GPT2 hyperparameter cell,
# so re-running the GPT2 fine-tuning cell after this one would use these values instead.
batch_size = 4
seed = 0
learning_rate = 5e-5
epochs=5

In [6]:
 def get_sentences(train_path, synthesized_path, sentences_per_label):
        """Train a BERT classifier and use it to pick the best synthesized sentences.

        A classifier is trained on the labeled data at train_path using the
        notebook-level num_classes, seed, learning_rate, batch_size and epochs. Each
        line of synthesized_path is split on its first run of whitespace into a label
        and a sentence. A sentence is a candidate only when the classifier predicts
        the same label it was generated for.

        Args:
            train_path: Path to a CSV file with "text" and "label" columns.
            synthesized_path: Path to the text file written by GPT2, one
                "<label> <sentence>" entry per line.
            sentences_per_label: Maximum number of sentences to keep per label.

        Returns:
            A pandas DataFrame with the columns "text", "label" and "confidence",
            holding for each label the candidates with the highest confidence.
        """
        # Train the BERT classifier that will be used to choose the sentences later
        classifier = Bert(num_classes = num_classes, random_state = seed)
        train = pd.read_csv(train_path)
        classifier.train(train.text, train.label,learning_rate=learning_rate,batch_size=batch_size,epochs=epochs)

        # Read the data synthesized by GPT2
        with open(synthesized_path, "r") as file:
            sentences = file.readlines()

        # Split the data into labels and sentences
        labels = []
        cleaned_sentences = []
        for sentence in sentences:
            # We split on the first whitespace, as we know everything before it is the label
            sentence_parts = sentence.split(maxsplit = 1)
            # Blank lines and label-only lines yield fewer than two parts; skip them
            # instead of raising IndexError. Very short sentences are dropped as well
            # (readlines keeps the trailing newline, which counts toward the length).
            if len(sentence_parts) < 2 or len(sentence_parts[1]) < 10:
                continue
            labels.append(str(sentence_parts[0]))
            cleaned_sentences.append(sentence_parts[1])

        # Get confidence and prediction from BERT, and put it all into a pandas dataframe.
        # Each prediction is indexed as [0] -> predicted label, [1] -> confidence.
        predictions = classifier.predict_label_proba(pd.DataFrame(cleaned_sentences, columns=["text"]))
        predicted_labels = [str(prediction[0]) for prediction in predictions]
        confidences = [prediction[1] for prediction in predictions]
        df = pd.DataFrame({
            "text": cleaned_sentences,
            "label": labels,
            "predicted label": predicted_labels,
            "confidence": confidences,
        })

        # Keep only sentences whose predicted label matches the generated label, then
        # take the sentences_per_label most confident ones from each label
        candidates = df.loc[df["label"] == df["predicted label"]]
        candidates = candidates.sort_values(["confidence"],ascending=False).groupby(["label"]).head(sentences_per_label)

        # Return the dataframe but drop the "predicted label" as it will always be equal to label
        return candidates[["text", "label", "confidence"]]

# Getting the best synthesized data

Using the function above, we filter out the bad data and save the good data to a .csv file that we can later use to train our final classifier.

In [7]:
# For every training-set size, select the best synthesized sentences and store them
# next to the labeled data they were generated from.
for datapoints in [5,10,25,50]:
    labeled_csv = f"data/{data_name}/train_labeled_{datapoints}.csv"
    generated_txt = f"data/{data_name}/generated_samples_{datapoints}.txt"
    filtered_csv = f"data/{data_name}/filtered_data_{datapoints}.csv"

    # Keep at most `datapoints` sentences per label
    best_sentences = get_sentences(labeled_csv, generated_txt, datapoints)
    best_sentences.to_csv(filtered_csv)


BERT model selected           : https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3


2021-11-11 19:10:02.227696: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 10800 MB memory:  -> device: 0, name: Tesla K80, pci bus id: 0000:06:00.0, compute capability: 3.7
2021-11-11 19:10:05.464888: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
BERT model selected           : https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
BERT model selected           : https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
BERT model selected           : https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Train the final classifier on D_train and D_synthesized

Here we train the final classifier on all the data together and assess its performance on the test set. We save the results to a .csv file for later inspection.

In [9]:
# Collect one record per training-set size and build the DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0, so growing the frame row by row with it
# fails on current pandas. Built this way, n_per_class also stays an integer column.
records = []

for datapoints in [5,10,25,50]:
    #Train the classifier on the labeled data followed by the filtered synthesized data
    h = Bert(num_classes = num_classes, random_state = seed)
    synthesized = pd.read_csv(f"data/{data_name}/filtered_data_{datapoints}.csv")
    labeled = pd.read_csv(f"data/{data_name}/train_labeled_{datapoints}.csv")
    train = pd.concat([labeled, synthesized])
    h.train(train.text, train.label, learning_rate=learning_rate,batch_size=batch_size,epochs=epochs)

    #Evaluate and save; element [1] of the evaluation result is recorded as the accuracy
    performance = h.evaluate_from_path(f"data/{data_name}/test.csv")[1]
    records.append({"n_per_class" : datapoints, "accuracy": performance})

data = pd.DataFrame(records, columns=["n_per_class", "accuracy"])

data

BERT model selected           : https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
BERT model selected           : https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
BERT model selected           : https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
BERT model selected           : https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Unnamed: 0,n_per_class,accuracy
0,5.0,0.496
1,10.0,0.648
2,25.0,0.674
3,50.0,0.784


# Save the results

Here we save the results to the correct folder so that everything is in one place and well organized

In [10]:

# Create results/{data_name}, including the parent results/ folder, if it does not
# exist yet. exist_ok=True makes re-running this cell safe.
result_path = f'results/{data_name}'
os.makedirs(result_path, exist_ok=True)
data.to_csv(f"{result_path}/LAMBADA_results.csv", index=False)