<a href="https://colab.research.google.com/github/ElFosco/NLP_argument_creation/blob/main/Inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installation

In [1]:
!pip install -q tf-models-official

In [2]:
!pip install tensorflow-text



# Import

In [3]:
import os
import shutil

import re
import numpy as np
import math

import pandas as pd

import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

import tensorflow as tf

import tensorflow_hub as hub
import tensorflow_text as text
import tensorflow_addons as tfa

import keras
from keras import backend as K
from keras.layers import concatenate
from keras import Sequential

from sklearn import metrics
from sklearn.preprocessing import LabelEncoder, OneHotEncoder  # one-hot encoding

from official.nlp import optimization  # to create AdamW optimizer

from urllib import request
import zipfile

import seaborn as sns
from matplotlib import pyplot as plt

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Data Extraction

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# The trained model is written to a "drive/MyDrive/..." path at the end of the
# notebook, so this cell has to run before that save.
# NOTE(review): only works inside Google Colab (google.colab is not available elsewhere).
from google.colab import drive
drive.mount('/content/drive')

In [4]:
# Where the SNLI archive comes from and where it is stored / unpacked locally.
url = 'https://nlp.stanford.edu/projects/snli/snli_1.0.zip'
dataset_folder = os.path.join(os.getcwd(), "Datasets")
dataset_path = os.path.join(dataset_folder, "snli_1.0.zip")
dataset_unzip = os.path.join(dataset_folder,"snli_1.0")

# Each step is skipped when its result is already on disk, so re-running the
# cell neither downloads nor extracts twice.
if not os.path.exists(dataset_folder):
  os.makedirs(dataset_folder)

if not os.path.exists(dataset_path):
  print("Downloading dataset...")
  request.urlretrieve(url, dataset_path)
  print("Download complete!")

if not os.path.exists(dataset_unzip):
  print("Extracting dataset... (it may take a while...)")
  with zipfile.ZipFile(dataset_path) as snli_archive:
    snli_archive.extractall(path=dataset_folder)
  print("Extraction completed!")

# Paths of the three JSON-lines split files inside the extracted folder.
dataset_train, dataset_dev, dataset_test = [
    os.path.join(dataset_unzip, file_name)
    for file_name in ("snli_1.0_train.jsonl", "snli_1.0_dev.jsonl", "snli_1.0_test.jsonl")
]


In [16]:
# Load every split from its JSON-lines file, keeping only the two sentences
# and the gold label.
snli_columns = ['sentence1','sentence2','gold_label']
df_train = pd.read_json(dataset_train, lines=True)[snli_columns]
df_dev = pd.read_json(dataset_dev, lines=True)[snli_columns]
df_test = pd.read_json(dataset_test, lines=True)[snli_columns]

In [18]:
def append_sentences(sentence_1,sentence_2):
  """Join two sentences into a single lower-cased string.

  The first sentence is made to end with exactly one '. ' separator: trailing
  whitespace is removed, a final '.', '?' or '!' (if any) is dropped, and
  '. ' is appended before the second sentence.

  Args:
    sentence_1: first sentence (str).
    sentence_2: second sentence (str), appended unchanged apart from lower-casing.

  Returns:
    The lower-cased string "<sentence_1>. <sentence_2>".
  """
  # Without this, "foo. " would not match the end-of-string anchor and the
  # result would contain a doubled separator ("foo. . bar").
  sentence_1 = sentence_1.rstrip()
  # Raw string: no invalid "\." escape. No '|' inside the character class:
  # there it is a literal pipe, not an alternation operator.
  sentence_1 = re.sub(r'[.?!]$', '', sentence_1)
  return (sentence_1 + '. ' + sentence_2).lower()

In [19]:
def _with_joined_sentences(frame):
  """Return a copy of `frame` where 'sentence1'/'sentence2' are replaced by a
  single 'sentences' column built with `append_sentences`."""
  joined = frame.apply(lambda row: append_sentences(row['sentence1'],row['sentence2']), axis = 1)
  return frame.assign(sentences=joined).drop(columns=['sentence1','sentence2'])


df_train = _with_joined_sentences(df_train)
df_dev = _with_joined_sentences(df_dev)
df_test = _with_joined_sentences(df_test)

In [80]:
# Discard the rows whose gold_label is '-'.
# NOTE(review): in SNLI '-' presumably marks pairs without a majority label;
# confirm against the dataset README.
df_train = df_train[df_train.gold_label != '-']
df_dev = df_dev[df_dev.gold_label != '-']
df_test = df_test[df_test.gold_label != '-']

# 10% sub-samples used only by the grid search. The seed is fixed so that the
# same rows are drawn on every run and grid-search results are comparable.
GRID_SAMPLE_SEED = 42
df_train_grid = df_train.sample(frac=0.1, random_state=GRID_SAMPLE_SEED)
df_dev_grid = df_dev.sample(frac=0.1, random_state=GRID_SAMPLE_SEED)

# Model inputs (x_*: joined sentence strings) and targets (Y_*: string labels).
x_train = df_train['sentences']
Y_train = df_train['gold_label']

x_train_grid = df_train_grid['sentences']
Y_train_grid = df_train_grid['gold_label']

x_dev = df_dev['sentences']
Y_dev = df_dev['gold_label']

x_dev_grid = df_dev_grid['sentences']
Y_dev_grid = df_dev_grid['gold_label']

x_test = df_test['sentences']
Y_test = df_test['gold_label']

## One-hot encoding

In [81]:
class OneHotEncoderWrapper(object):
  """Maps labels to one-hot vectors and back, using a scikit-learn LabelEncoder."""

  def __init__(self):
    # Fitted by get_one_hot_encoding; required by the two inverse lookups.
    self.label_encoder = LabelEncoder()

  def get_one_hot_encoding(self, list_pos):
    """Fit the label encoder on `list_pos` and build the one-hot lookup table.

    Args:
      list_pos: iterable of labels (duplicates are allowed).

    Returns:
      dict mapping each label to a 1-D float array with a single 1.0 at the
      integer index the label encoder assigned to that label.
    """
    integer_encoded = self.label_encoder.fit_transform(list_pos)
    # Row i of the identity matrix is the one-hot vector of class index i.
    # This avoids OneHotEncoder(sparse=False), whose `sparse` keyword was
    # renamed to `sparse_output` and later removed from scikit-learn.
    identity = np.eye(len(self.label_encoder.classes_))
    # zip() walks the values positionally, so it also works when `list_pos`
    # is a pandas Series with a non-default index.
    return {label: identity[index] for label, index in zip(list_pos, integer_encoded)}
  
  def get_inverse(self, encoded):
    """Return the original label (as a length-1 array) of a one-hot or
    probability vector, i.e. the label at its arg-max position."""
    return self.label_encoder.inverse_transform([np.argmax(encoded)])

  def get_pos_from_label(self, label):
    """Return the original label (as a length-1 array) of an integer class index."""
    return self.label_encoder.inverse_transform([label])

In [82]:
# Build the label -> one-hot lookup from the labels present in the training split.
one_hot_encoder = OneHotEncoderWrapper()
set_label = df_train['gold_label'].unique()
ris = one_hot_encoder.get_one_hot_encoding(set_label)


def _labels_to_one_hot(labels):
  """Look up the one-hot vector of every label and stack them, one row per label."""
  return np.asarray([ris[label] for label in labels])


Y_train_encoded = _labels_to_one_hot(Y_train)
Y_train_encoded_grid = _labels_to_one_hot(Y_train_grid)

Y_dev_encoded = _labels_to_one_hot(Y_dev)
Y_dev_encoded_grid = _labels_to_one_hot(Y_dev_grid)

Y_test_encoded = _labels_to_one_hot(Y_test)


# BERT

In [75]:
# @title Choose a BERT model to fine-tune
# Short name of the encoder to fine-tune; it is the lookup key for both
# `map_name_to_handle` and `map_model_to_preprocess` below.
bert_model_name = 'albert_en_base'  # @param ["bert_en_uncased_L-24_H-1024_A-16","bert_en_uncased_L-12_H-768_A-12", "bert_en_cased_L-12_H-768_A-12", "bert_multi_cased_L-12_H-768_A-12", "small_bert/bert_en_uncased_L-2_H-128_A-2", "small_bert/bert_en_uncased_L-2_H-256_A-4", "small_bert/bert_en_uncased_L-2_H-512_A-8", "small_bert/bert_en_uncased_L-2_H-768_A-12", "small_bert/bert_en_uncased_L-4_H-128_A-2", "small_bert/bert_en_uncased_L-4_H-256_A-4", "small_bert/bert_en_uncased_L-4_H-512_A-8", "small_bert/bert_en_uncased_L-4_H-768_A-12", "small_bert/bert_en_uncased_L-6_H-128_A-2", "small_bert/bert_en_uncased_L-6_H-256_A-4", "small_bert/bert_en_uncased_L-6_H-512_A-8", "small_bert/bert_en_uncased_L-6_H-768_A-12", "small_bert/bert_en_uncased_L-8_H-128_A-2", "small_bert/bert_en_uncased_L-8_H-256_A-4", "small_bert/bert_en_uncased_L-8_H-512_A-8", "small_bert/bert_en_uncased_L-8_H-768_A-12", "small_bert/bert_en_uncased_L-10_H-128_A-2", "small_bert/bert_en_uncased_L-10_H-256_A-4", "small_bert/bert_en_uncased_L-10_H-512_A-8", "small_bert/bert_en_uncased_L-10_H-768_A-12", "small_bert/bert_en_uncased_L-12_H-128_A-2", "small_bert/bert_en_uncased_L-12_H-256_A-4", "small_bert/bert_en_uncased_L-12_H-512_A-8", "small_bert/bert_en_uncased_L-12_H-768_A-12", "albert_en_base", "electra_small", "electra_base", "experts_pubmed", "experts_wiki_books", "talking-heads_base"]

# TF Hub encoder handles, grouped by model family and then merged (in the
# order full-size BERT, small BERT, other encoders) into one lookup table
# keyed by the short model name.
_full_size_bert_handles = {
    'bert_en_uncased_L-24_H-1024_A-16': 'https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/4',
    'bert_en_uncased_L-12_H-768_A-12': 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3',
    'bert_en_cased_L-12_H-768_A-12': 'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3',
    'bert_multi_cased_L-12_H-768_A-12': 'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3',
}

_small_bert_handles = {
    'small_bert/bert_en_uncased_L-2_H-128_A-2': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-2_H-256_A-4': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-2_H-512_A-8': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-2_H-768_A-12': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-4_H-128_A-2': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-4_H-256_A-4': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-4_H-512_A-8': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-4_H-768_A-12': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-6_H-128_A-2': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-6_H-256_A-4': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-6_H-512_A-8': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-6_H-768_A-12': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-8_H-128_A-2': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-8_H-256_A-4': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-8_H-512_A-8': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-8_H-768_A-12': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-10_H-128_A-2': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-10_H-256_A-4': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-10_H-512_A-8': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-10_H-768_A-12': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-12_H-128_A-2': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-12_H-256_A-4': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-12_H-512_A-8': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-12_H-768_A-12': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-768_A-12/1',
}

_other_encoder_handles = {
    'albert_en_base': 'https://tfhub.dev/tensorflow/albert_en_base/3',
    'electra_small': 'https://tfhub.dev/google/electra_small/2',
    'electra_base': 'https://tfhub.dev/google/electra_base/2',
    'experts_pubmed': 'https://tfhub.dev/google/experts/bert/pubmed/2',
    'experts_wiki_books': 'https://tfhub.dev/google/experts/bert/wiki_books/2',
    'talking-heads_base': 'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1',
}

map_name_to_handle = {
    **_full_size_bert_handles,
    **_small_bert_handles,
    **_other_encoder_handles,
}

# Preprocessing model to pair with each encoder, keyed by the short model name.
# Most encoders share the uncased English BERT preprocessor, so that URL is
# written once and fanned out with dict.fromkeys; the remaining entries are
# listed explicitly. Key order matches the hand-written table it replaces.
_uncased_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

map_model_to_preprocess = {
    **dict.fromkeys([
        'bert_en_uncased_L-24_H-1024_A-16',
        'bert_en_uncased_L-12_H-768_A-12',
    ], _uncased_preprocess),
    'bert_en_cased_L-12_H-768_A-12': 'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3',
    **dict.fromkeys([
        'small_bert/bert_en_uncased_L-2_H-128_A-2',
        'small_bert/bert_en_uncased_L-2_H-256_A-4',
        'small_bert/bert_en_uncased_L-2_H-512_A-8',
        'small_bert/bert_en_uncased_L-2_H-768_A-12',
        'small_bert/bert_en_uncased_L-4_H-128_A-2',
        'small_bert/bert_en_uncased_L-4_H-256_A-4',
        'small_bert/bert_en_uncased_L-4_H-512_A-8',
        'small_bert/bert_en_uncased_L-4_H-768_A-12',
        'small_bert/bert_en_uncased_L-6_H-128_A-2',
        'small_bert/bert_en_uncased_L-6_H-256_A-4',
        'small_bert/bert_en_uncased_L-6_H-512_A-8',
        'small_bert/bert_en_uncased_L-6_H-768_A-12',
        'small_bert/bert_en_uncased_L-8_H-128_A-2',
        'small_bert/bert_en_uncased_L-8_H-256_A-4',
        'small_bert/bert_en_uncased_L-8_H-512_A-8',
        'small_bert/bert_en_uncased_L-8_H-768_A-12',
        'small_bert/bert_en_uncased_L-10_H-128_A-2',
        'small_bert/bert_en_uncased_L-10_H-256_A-4',
        'small_bert/bert_en_uncased_L-10_H-512_A-8',
        'small_bert/bert_en_uncased_L-10_H-768_A-12',
        'small_bert/bert_en_uncased_L-12_H-128_A-2',
        'small_bert/bert_en_uncased_L-12_H-256_A-4',
        'small_bert/bert_en_uncased_L-12_H-512_A-8',
        'small_bert/bert_en_uncased_L-12_H-768_A-12',
    ], _uncased_preprocess),
    'bert_multi_cased_L-12_H-768_A-12': 'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3',
    'albert_en_base': 'https://tfhub.dev/tensorflow/albert_en_preprocess/3',
    **dict.fromkeys([
        'electra_small',
        'electra_base',
        'experts_pubmed',
        'experts_wiki_books',
        'talking-heads_base',
    ], _uncased_preprocess),
}

# Resolve the TF Hub handles of the selected encoder and of the preprocessing
# model registered for it, then echo both choices.
tfhub_handle_encoder, tfhub_handle_preprocess = (
    map_name_to_handle[bert_model_name],
    map_model_to_preprocess[bert_model_name],
)

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

BERT model selected           : https://tfhub.dev/tensorflow/albert_en_base/3
Preprocess model auto-selected: https://tfhub.dev/tensorflow/albert_en_preprocess/3


In [76]:
# Stand-alone instance of the preprocessing layer loaded from TF Hub.
# NOTE(review): build_classifier_model creates its own preprocessing layer and
# nothing visible in this notebook reads `bert_preprocess_model` - confirm it is
# still needed.
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess) #preprocessing layer

In [77]:
# Stand-alone instance of the encoder loaded from TF Hub.
# NOTE(review): build_classifier_model creates its own (trainable) encoder layer
# and nothing visible in this notebook reads `bert_model` - confirm it is still
# needed.
bert_model = hub.KerasLayer(tfhub_handle_encoder)               #bert model

In [78]:
def build_classifier_model(dense_size=100, num_classes=3):
  """Build the sentence-pair classifier on top of the selected TF Hub encoder.

  Pipeline: raw string -> TF Hub preprocessing -> trainable encoder ->
  pooled output -> Dense(dense_size, relu) -> Dense(num_classes, softmax).
  The handles are read from the module-level `tfhub_handle_preprocess` and
  `tfhub_handle_encoder`.

  Args:
    dense_size: number of units of the hidden fully-connected layer 'fc_1'.
    num_classes: number of output classes of the 'classifier' layer
      (defaults to 3, the value that was previously hard-coded).

  Returns:
    An uncompiled tf.keras.Model mapping a batch of strings to class
    probabilities.
  """
  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
  preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
  encoder_inputs = preprocessing_layer(text_input)
  # trainable=True: the encoder weights are fine-tuned together with the head.
  encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
  outputs = encoder(encoder_inputs)
  net = outputs['pooled_output']
  # Activations are given by name so that only tf.keras objects are mixed
  # (no dependency on the separately imported `keras` package here).
  net = tf.keras.layers.Dense(dense_size, activation='relu', name='fc_1')(net)
  net = tf.keras.layers.Dense(num_classes, activation='softmax', name='classifier')(net)
  return tf.keras.Model(text_input, net)

# Grid Search

In [83]:
# Sanity check: size of the sub-sampled training set used by the grid search.
x_train_grid.shape 

(54937,)

In [84]:
# Exhaustive grid search over epochs / batch size / learning rate / dense-layer
# width. Every candidate is trained on the sub-sampled training set and scored
# on the sub-sampled dev set; the configuration with the highest dev accuracy
# is remembered in `best_params`.
parameters = {'epochs': [1,2], 
              'batch_size':[16,32,64],
              'init_lr': [3e-5],
              'dense_size' : [100,200,300]
              }
best_scores = -1    # best dev accuracy so far (-1: nothing evaluated yet)
best_params = {}    # hyper-parameters and dev loss of the best run so far

for epochs in parameters['epochs']:
  print(" Epochs: ", epochs)
  for init_lr in parameters['init_lr']:
    print("  Start Learning Rate: ", init_lr)
    for batch_size in parameters['batch_size']:
      print("   Batch Size: ", batch_size)
      for dense_size in parameters['dense_size']:
        print("    Dense size: ", dense_size)
        # Drop the Keras state of the previous candidate so that memory does
        # not grow with every model built inside the loop.
        tf.keras.backend.clear_session()
        # fit() also processes the final partial batch, hence the ceiling;
        # this also keeps the step counts integral for the LR schedule.
        steps_per_epoch = math.ceil(x_train_grid.shape[0] / batch_size)
        num_train_steps = steps_per_epoch * epochs
        # Linear warm-up over the first 10% of the training steps.
        num_warmup_steps = int(0.1 * num_train_steps)
        optimizer = optimization.create_optimizer(init_lr=init_lr, 
                                                    num_train_steps=num_train_steps, 
                                                    num_warmup_steps=num_warmup_steps, 
                                                    optimizer_type='adamw')
        classifier_model = build_classifier_model(dense_size)
        classifier_model.compile(optimizer=optimizer, loss='categorical_crossentropy', 
                                   metrics=['accuracy'])
        history = classifier_model.fit(x=x_train_grid, y=Y_train_encoded_grid, epochs=epochs, 
                                         batch_size=batch_size)
        loss_calculated, accuracy = classifier_model.evaluate(x=x_dev_grid, y=Y_dev_encoded_grid)
        print("     Loss: ", loss_calculated)
        print("     Accuracy: ", accuracy)
        if accuracy > best_scores:
          # Update the same variable that is compared above and printed below
          # (it used to be written to a differently spelled name, so the best
          # score never changed and the last run always "won").
          best_scores = accuracy
          best_params = {'epochs': epochs, 
                          'batch_size': batch_size, 
                          'start_lr': init_lr,  
                          'dense_size': dense_size,
                          'loss': loss_calculated}
print(best_scores)
print(best_params)


 Epochs:  1
  Start Learning Rate:  3e-05
   Batch Size:  16
    Dense size:  100
     Loss:  0.37172091007232666
     Accuracy:  0.8760162591934204
    Dense size:  200
     Loss:  0.39507368206977844
     Accuracy:  0.8546748161315918
    Dense size:  300

KeyboardInterrupt: ignored

# Training

In [None]:
# Best parameter found on grid search
parameters = {'epochs': 1, 
              'batch_size': ,
              'init_lr': 3e-5,
              'dense_size': ,
              'loss': 
              }

epochs = parameters['epochs']
batch_size = parameters['batch_size']
init_lr = parameters['init_lr']
dense_size = parameters['dense_size']
loss = parameters['loss']

x_final = x_train.append(x_dev)
Y_final_encoded = Y_train_encoded.append(Y_dev_encoded)

steps_per_epoch = (x_final).shape[0] / batch_size 
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(epochs * (x_final).shape[0]) * 0.1 / batch_size)
optimizer = optimization.create_optimizer(init_lr=init_lr, 
                                          num_train_steps=num_train_steps, 
                                          num_warmup_steps=num_warmup_steps, 
                                          optimizer_type='adamw')
classifier_model = build_classifier_model(dense_size)
classifier_model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
history = classifier_model.fit(x=(x_final).reset_index(drop=True), y=(Y_final_encoded).reset_index(drop=True), 
                               epochs=epochs, batch_size=batch_size)
loss_calculated, pearson, mse = classifier_model.evaluate(x_test, Y_test)
print("Pearson: ", pearson)
print("MSE: ", mse)

In [None]:
# Persist the trained classifier as an HDF5 file.
# NOTE(review): the path is relative, so it only lands on Google Drive when the
# working directory is the one the drive was mounted under - confirm.
model_output_path = "drive/MyDrive/Colab Notebooks/NLP/classifierNLI.h5"
classifier_model.save(model_output_path)