In [None]:
# Silence library warnings: they were checked beforehand and are not
# problematic (they come from the libraries' internal checks).
import tensorflow as tf
from transformers import logging as hf_logging

# Only messages at error level are still reported by either library.
hf_logging.set_verbosity_error()
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

Configure the GPU: choose which device to use and enable memory growth.


In [None]:
# Restrict TensorFlow to the first physical GPU (when one is present) and
# turn on memory growth for that device.
gpus = tf.config.list_physical_devices('GPU')
if len(gpus) > 0:
  first_gpu = gpus[0]
  try:
    tf.config.set_visible_devices(first_gpu, 'GPU')
    tf.config.experimental.set_memory_growth(first_gpu, True)

    # Report how many devices exist versus how many remain usable.
    logical_gpus = tf.config.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as err:
    # Memory growth has to be configured before the GPUs are initialized;
    # if that is no longer possible, just report the problem.
    print(err)

## Import preprocessed datasets

The datasets are configured in `data_preparation.ipynb`.

In [None]:
import pandas as pd


def _load_flat(csv_path):
  """Read a CSV file and return all of its values as a 1-D numpy array."""
  return pd.read_csv(csv_path).to_numpy().reshape(-1)


# Training split: key points, arguments and labels.
kps_train = _load_flat('../dataset_KPA_2021/kps_train.csv')
args_train = _load_flat('../dataset_KPA_2021/args_train.csv')
labels_train = _load_flat('../dataset_KPA_2021/labels_train.csv')

# Validation/dev split: key points, arguments and labels.
kps_dev = _load_flat('../dataset_KPA_2021/kps_dev.csv')
args_dev = _load_flat('../dataset_KPA_2021/args_dev.csv')
labels_dev = _load_flat('../dataset_KPA_2021/labels_dev.csv')


## Tokenize input

In our case, we only use BERT uncased; in particular, the base version is used here.

In [None]:
from transformers import BertTokenizer

# Name of the pretrained checkpoint whose tokenizer is loaded.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

In [None]:
# Every argument / key point is padded to this many tokens. We checked in the
# data analysis notebook that no sentence (arg or kp) is longer than 250 tokens.
MAX_LEN = 250


def _tokenize(sentences):
  """Tokenize an array of sentences into fixed-length TensorFlow tensors.

  Args:
    sentences: array of strings (converted with `.tolist()` before tokenizing).

  Returns:
    The tokenizer output, with every sequence padded to MAX_LEN tokens.
  """
  return tokenizer(
    sentences.tolist(),
    max_length=MAX_LEN,
    padding='max_length',
    # Make MAX_LEN a hard limit. Without truncation a longer sentence would be
    # left at its full length, and the batch could then not be returned as a
    # single rectangular tensor.
    truncation=True,
    return_tensors='tf',
  )


# Training split
tokenized_args_train = _tokenize(args_train)
tokenized_kps_train = _tokenize(kps_train)

# Validation/dev split
tokenized_args_dev = _tokenize(args_dev)
tokenized_kps_dev = _tokenize(kps_dev)

## Dataset preparation
(zipping input with desired output)

### Split training and validation samples

In [None]:
## TRAINING
# Token ids and attention masks of the arguments (1) and of the key points (2).
toks1_input_train, atts1_input_train = (
  tokenized_args_train.input_ids,
  tokenized_args_train.attention_mask,
)
toks2_input_train, atts2_input_train = (
  tokenized_kps_train.input_ids,
  tokenized_kps_train.attention_mask,
)

print('training data')
print(
  'sizes of data: training',
  *(tensor.shape for tensor in (
    toks1_input_train, atts1_input_train, toks2_input_train, atts2_input_train
  ))
)


## VALIDATION/DEV
# Same four tensors for the validation/dev split.
toks1_input_dev, atts1_input_dev = (
  tokenized_args_dev.input_ids,
  tokenized_args_dev.attention_mask,
)
toks2_input_dev, atts2_input_dev = (
  tokenized_kps_dev.input_ids,
  tokenized_kps_dev.attention_mask,
)

print('validation/dev data')
print(
  'sizes of data: validation',
  *(tensor.shape for tensor in (
    toks1_input_dev, atts1_input_dev, toks2_input_dev, atts2_input_dev
  ))
)


Testing the tokenization by decoding the tokenized sentences

In [None]:
# Sanity check on the training split: decode the first three argument /
# key point pairs back to text (first 100 characters only) and print each
# pair followed by its label.
for sample_idx in range(3):
  arg_text = tokenizer.decode(toks1_input_train[sample_idx])
  print(arg_text[:100])
  kp_text = tokenizer.decode(toks2_input_train[sample_idx])
  print(kp_text[:100])
  print(labels_train[sample_idx])

In [None]:
# Sanity check on the validation/dev split: decode the first three argument /
# key point pairs back to text (first 100 characters only) and print each
# pair followed by its label.
for sample_idx in range(3):
  arg_text = tokenizer.decode(toks1_input_dev[sample_idx])
  print(arg_text[:100])
  kp_text = tokenizer.decode(toks2_input_dev[sample_idx])
  print(kp_text[:100])
  print(labels_dev[sample_idx])

### Composing the dataset

In [None]:
# Model inputs for the training split: token ids and attention masks of the
# arguments (1) followed by those of the key points (2).
inputs_train = (
   toks1_input_train,
   atts1_input_train,
   toks2_input_train,
   atts2_input_train
  )
# Pair every input 4-tuple with its label.
input_dataset_train = tf.data.Dataset.from_tensor_slices( inputs_train )
output_dataset_train = tf.data.Dataset.from_tensor_slices( labels_train )
dataset_train = tf.data.Dataset.zip( (input_dataset_train, output_dataset_train) )
# Size the shuffle buffer from the data instead of a hard-coded sample count,
# so the buffer keeps covering the whole training set if the CSV files change.
# `max(..., 1)` only protects against an invalid buffer size of 0.
dataset_train = dataset_train.shuffle(
  buffer_size=max(len(labels_train), 1),
  reshuffle_each_iteration=True,
  seed=0
)

# Same structure for the validation/dev split; it is not shuffled.
inputs_dev = (
   toks1_input_dev,
   atts1_input_dev,
   toks2_input_dev,
   atts2_input_dev
  )
input_dataset_dev = tf.data.Dataset.from_tensor_slices( inputs_dev )
output_dataset_dev = tf.data.Dataset.from_tensor_slices( labels_dev )
dataset_dev = tf.data.Dataset.zip( (input_dataset_dev, output_dataset_dev) )


## Grid Search

We ran an initial grid search over a total of ~80 configurations to find interesting intervals for the hyperparameters; we then tested the best intervals on 5 dataset initializations.

In [None]:
import sys

# Make the project's helper module importable. The membership test keeps
# sys.path free of duplicate entries when this cell is run more than once.
if '../src/' not in sys.path:
  sys.path.append('../src/')
from utils import grid_search_iteration

# Single location used both to reload earlier results and to save new ones.
RESULTS_PATH = './results.csv'

# get previously computed results to append the new ones in the same file
try:
  results = pd.read_csv(RESULTS_PATH)
  results = results.to_numpy().reshape(-1).tolist()
  print(results)
except FileNotFoundError:
  # nothing has been saved yet: start from an empty list
  results = []

batch_size = 16

# Coarse grid: every combination of the values listed below is evaluated once.
for cls_token_activate in [False]:
  for num_epochs in [1]:
    for lr in [(4e-06, 2e-06), (4e-06, 1e-06), (3e-06, 3e-06), (3e-06, 2e-06), (3e-06, 1e-06)]:
      # choose the configuration
      config = {
      'cls_token_activate' : cls_token_activate,
      'num_epochs' : num_epochs,
      'lr' : lr
      }

      # NOTE(review): grid_search_iteration lives in ../src/utils.py; it
      # presumably trains and evaluates one configuration - confirm there.
      res_grid_iteration =  grid_search_iteration(config, dataset_train, batch_size, inputs_dev)

      # save precision together with config; the file is rewritten after every
      # configuration so partial results survive an interrupted run
      results.append(res_grid_iteration)
      pd.DataFrame(results).to_csv(RESULTS_PATH, index=False)

Fine-grained grid search with averaged precisions

In [None]:
import sys

# Make the project's helper module importable. The membership test keeps
# sys.path free of duplicate entries when this cell is run more than once.
if '../src/' not in sys.path:
  sys.path.append('../src/')
from utils import grid_search_iteration

# Single location used both to reload earlier results and to save new ones.
RESULTS_FINE_GRAINED_PATH = './results_fine_grained.csv'

# get previously computed results to append the new ones in the same file
try:
  results_fine_grained = pd.read_csv(RESULTS_FINE_GRAINED_PATH)
  results_fine_grained = results_fine_grained.to_numpy().reshape(-1).tolist()
  print(results_fine_grained)
except FileNotFoundError:
  # nothing has been saved yet: start from an empty list
  results_fine_grained = []

batch_size = 16

# Fine-grained grid: only the learning-rate pairs listed below are evaluated.
for cls_token_activate in [False]:
  for num_epochs in [1]:
    for lr in [(3e-06, 2e-06), (3e-06, 1e-06)]:
      # choose the configuration
      config = {
      'cls_token_activate' : cls_token_activate,
      'num_epochs' : num_epochs,
      'lr' : lr
      }

      # NOTE(review): grid_search_iteration lives in ../src/utils.py;
      # num_tests = 5 presumably repeats each configuration five times and
      # averages the precision - confirm there.
      res_grid_iteration =  grid_search_iteration(config, dataset_train, batch_size, inputs_dev, num_tests = 5)

      # save precision together with config; the file is rewritten after every
      # configuration so partial results survive an interrupted run
      results_fine_grained.append(res_grid_iteration)
      pd.DataFrame(results_fine_grained).to_csv(RESULTS_FINE_GRAINED_PATH, index=False)