In [1]:
!pip install transformers==4.3.2

!pip install numpy
!pip install pandas



In [2]:
import sys
import numpy as np
import pandas as pd
import os
os.chdir('..')
from GanBert import GanBert



In [None]:
## gloabal variable for changing dataset.
## data_name possible values: "imdb", "medical"
data_name = "imdb"

In [3]:
## method for convcerting the dataset to the right format for the ganbert model
def data_to_ganbert(data):    
    data = list(zip(data.iloc[:,0],data.iloc[:,1]))
    return data

In [4]:
## get the path for the labeled training data, 5, 10, 25, and 50 per label
data_path = 'data/'+data_name
files = os.listdir(data_path)
labeled_files = [data_path+"/"+file for file in files if "train_labeled" in file]

## get the path for the unlabeled training data, 5, 10, 25, and 50 per label
unlabeled = data_to_ganbert(pd.read_csv("data/train_unlabeled.csv"))

## get the path for the test set
test = data_to_ganbert(pd.read_csv("data/test.csv"))

In [6]:
# hyper parameters 
batch_size = 64
max_seq_length = 128
seed = 0
learning_rate = 5e-5
epochs=5

# create a data frame to store the results
results=pd.DataFrame(columns=["n_per_class", "accuracy"])

In [7]:
## train and evaluate bert for each data set. 
for n_per_class in [5,10,25,50]:
    data_file = ""
    result = {"n_per_class":n_per_class}
    # create model
    ganbert = GanBert(batch_size=batch_size,max_seq_length= max_seq_length,epochs = epochs,
                      learning_rate_discriminator = learning_rate,learning_rate_generator = learning_rate,
                      print_each_n_step = 100,random_state = seed)
    ## find correct file
    for file in labeled_files:
        if f"data/train_labeled_{n_per_class}.csv" == file:
            data_file = file
            break
    print(data_file)
    labeled = data_to_ganbert(pd.read_csv(data_file))
    ## train and evaluate the model
    performance = ganbert.train(labeled,unlabeled, test)
    ## add to resutl data frame
    result["accuracy"] = performance
    results = results.append(result,ignore_index=True)

There are 1 GPU(s) available.
We will use the GPU: Tesla K80
data/train_labeled_5.csv

Training...

  Average training loss generetor: 0.712
  Average training loss discriminator: 1.181
  Training epcoh took: 0:04:53

Running Test...
  Accuracy: 0.571
  Test Loss: 0.816
  Test took: 0:00:06

Training...

  Average training loss generetor: 0.723
  Average training loss discriminator: 0.755
  Training epcoh took: 0:04:54

Running Test...
  Accuracy: 0.592
  Test Loss: 1.525
  Test took: 0:00:06

Training...

  Average training loss generetor: 0.712
  Average training loss discriminator: 0.727
  Training epcoh took: 0:04:54

Running Test...
  Accuracy: 0.558
  Test Loss: 2.347
  Test took: 0:00:06

Training...

  Average training loss generetor: 0.706
  Average training loss discriminator: 0.719
  Training epcoh took: 0:04:54

Running Test...
  Accuracy: 0.520
  Test Loss: 2.991
  Test took: 0:00:06

Training...

  Average training loss generetor: 0.702
  Average training loss discriminat

In [8]:
## print the result
results

Unnamed: 0,n_per_class,accuracy
0,5.0,0.533482
1,10.0,0.526786
2,25.0,0.732143
3,50.0,0.792411


In [9]:
# write the result to file
if not os.path.exists('results'):
      os.mkdir('results')
result_path = f'results/{data_name}'
if not os.path.exists(result_path):
      os.mkdir(result_path)
results.to_csv(f"{result_path}/GanBert_results.csv", index=False)