# Installation

In [None]:
!pip install -q tf-models-official

In [None]:
!pip install tensorflow-text

# Import

In [None]:
import os
import shutil

import re
import numpy as np

import pandas as pd

import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import tensorflow_addons as tfa

import keras
from keras import backend as K

from sklearn import metrics

from official.nlp import optimization  # to create AdamW optimizer

import seaborn as sns
from matplotlib import pyplot as plt

# Data Exploration

In [None]:
# Mount Google Drive into the Colab runtime so the dataset can be read from it.
from google.colab import drive
drive.mount('/content/drive')

# Location of the dataset inside the mounted drive. Both paths are relative, so
# they presumably rely on the working directory being /content -- TODO confirm.
# Alternative location used previously:
# dir_path = "drive/MyDrive/NLP_project/Datasets/"
dir_path = "drive/MyDrive/NLP/"
# File name of the CSV read in the next cell.
dataset = "arg_quality_rank_30k.csv"

In [None]:
# Read the CSV found at <dir_path><dataset> into a DataFrame and preview
# the first rows.
df = pd.read_csv(f"{dir_path}{dataset}")
df.head()

In [None]:
# Distinct values of the `topic` column, plus an empty dict that the next
# cell fills with one row count per topic.
set_topic = df['topic'].unique()
dict_topic = dict()

In [None]:
# Count the rows that belong to each topic.
for topic in set_topic:
  dict_topic[topic] = len(df.loc[df['topic'] == topic, 'topic'])
# Show (topic, count) pairs, most frequent topic first.
sorted(dict_topic.items(), key=lambda item: item[1], reverse=True)

# Data Preprocessing

In [None]:
lemmatizer = WordNetLemmatizer()


def clean_text(text):
  """Normalise one raw argument string.

  Steps, in order:
    1. replace `"`, `-`, backslash, backtick and newlines with a space;
    2. drop any dots at the very start of the string;
    3. surround `?`, `.`, `!` and `,` with spaces so each is its own token;
    4. replace `&` with the word "and";
    5. collapse whitespace, then merge runs of space-separated dots
       (e.g. an ellipsis) into a single dot;
    6. lemmatise every whitespace-separated token with `lemmatizer`.

  Args:
    text: the raw argument as a `str`.

  Returns:
    The cleaned string, with tokens joined by single spaces.
  """
  text = re.sub(r'["\-\\`]', ' ', text)
  text = text.replace('\n', ' ')
  text = re.sub(r'^\.+', '', text)
  text = re.sub(r'([?.!,])', r' \1 ', text)
  text = text.replace('&', ' and ')
  # Collapse whitespace BEFORE merging dots: the spacing step above leaves at
  # least two spaces between neighbouring dots, so a ". ." pattern can only
  # match once the whitespace has been normalised to single spaces.
  text = ' '.join(text.split())
  text = re.sub(r'\.(?: \.)+', '.', text)
  return ' '.join(lemmatizer.lemmatize(token) for token in text.split())

In [None]:
# Clean every argument. Series.map calls clean_text once per value, which
# avoids building a row object for each record as a row-wise
# DataFrame.apply(axis=1) does.
df['argument'] = df['argument'].map(clean_text)
# Hand-written replacement for the argument at index label 2; presumably the
# automatic cleaning is unsatisfactory for this record -- confirm against the
# raw data.
df.loc[2, "argument"] = "zero tolerance policy in schools should not be adopted as circumstances are often not black and white, being more nuanced. no one should be written off due to a mistake of judgement."

In [None]:
# Display the `argument` column after cleaning.
df['argument']

## Data Split

In [None]:
# Boolean masks derived from the `set` column, which labels each row as
# 'train', 'dev' or 'test'.
is_training_data = df['set'].eq('train')
is_validation_data = df['set'].eq('dev')
is_test_data = df['set'].eq('test')

# One sub-frame per split.
training_data, validation_data, test_data = (
    df.loc[mask]
    for mask in (is_training_data, is_validation_data, is_test_data)
)

# Model input is the `argument` text; the regression target is `MACE-P`.
x_train, Y_train = training_data['argument'], training_data['MACE-P']
x_val, Y_val = validation_data['argument'], validation_data['MACE-P']
x_test, Y_test = test_data['argument'], test_data['MACE-P']

In [None]:
# Number of training examples, shown as the shape of the training inputs.
x_train.shape

# [BERT](https://colab.research.google.com/github/tensorflow/text/blob/master/docs/tutorials/classify_text_with_bert.ipynb)



In [None]:
# @title Choose a BERT model to fine-tune

# Name of the encoder to use. It is looked up in both `map_name_to_handle` and
# `map_model_to_preprocess` below, so it must be a key of both tables.
bert_model_name = 'bert_en_uncased_L-12_H-768_A-12'  # @param ["bert_en_uncased_L-24_H-1024_A-16","bert_en_uncased_L-12_H-768_A-12", "bert_en_cased_L-12_H-768_A-12", "bert_multi_cased_L-12_H-768_A-12", "small_bert/bert_en_uncased_L-2_H-128_A-2", "small_bert/bert_en_uncased_L-2_H-256_A-4", "small_bert/bert_en_uncased_L-2_H-512_A-8", "small_bert/bert_en_uncased_L-2_H-768_A-12", "small_bert/bert_en_uncased_L-4_H-128_A-2", "small_bert/bert_en_uncased_L-4_H-256_A-4", "small_bert/bert_en_uncased_L-4_H-512_A-8", "small_bert/bert_en_uncased_L-4_H-768_A-12", "small_bert/bert_en_uncased_L-6_H-128_A-2", "small_bert/bert_en_uncased_L-6_H-256_A-4", "small_bert/bert_en_uncased_L-6_H-512_A-8", "small_bert/bert_en_uncased_L-6_H-768_A-12", "small_bert/bert_en_uncased_L-8_H-128_A-2", "small_bert/bert_en_uncased_L-8_H-256_A-4", "small_bert/bert_en_uncased_L-8_H-512_A-8", "small_bert/bert_en_uncased_L-8_H-768_A-12", "small_bert/bert_en_uncased_L-10_H-128_A-2", "small_bert/bert_en_uncased_L-10_H-256_A-4", "small_bert/bert_en_uncased_L-10_H-512_A-8", "small_bert/bert_en_uncased_L-10_H-768_A-12", "small_bert/bert_en_uncased_L-12_H-128_A-2", "small_bert/bert_en_uncased_L-12_H-256_A-4", "small_bert/bert_en_uncased_L-12_H-512_A-8", "small_bert/bert_en_uncased_L-12_H-768_A-12", "albert_en_base", "electra_small", "electra_base", "experts_pubmed", "experts_wiki_books", "talking-heads_base"]

# Lookup table: model name -> TF-Hub URL of the encoder.
map_name_to_handle = {
    'bert_en_uncased_L-24_H-1024_A-16':
        'https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/4',
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3',
    # The 24 small-BERT variants follow one naming scheme: every combination
    # of layer count L and hidden size H, with A = H // 64, all at version 1.
    **{
        f'small_bert/bert_en_uncased_L-{layers}_H-{hidden}_A-{hidden // 64}':
            'https://tfhub.dev/tensorflow/small_bert/'
            f'bert_en_uncased_L-{layers}_H-{hidden}_A-{hidden // 64}/1'
        for layers in (2, 4, 6, 8, 10, 12)
        for hidden in (128, 256, 512, 768)
    },
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_base/2',
    'electra_small':
        'https://tfhub.dev/google/electra_small/2',
    'electra_base':
        'https://tfhub.dev/google/electra_base/2',
    'experts_pubmed':
        'https://tfhub.dev/google/experts/bert/pubmed/2',
    'experts_wiki_books':
        'https://tfhub.dev/google/experts/bert/wiki_books/2',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1',
}

# Lookup table: model name -> TF-Hub URL of the matching preprocessing model.
# Every model uses the uncased English preprocessor except the cased,
# multilingual-cased and ALBERT entries.
map_model_to_preprocess = {
    'bert_en_uncased_L-24_H-1024_A-16':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3',
    # The 24 small-BERT variants (every L x H combination, A = H // 64).
    **{
        f'small_bert/bert_en_uncased_L-{layers}_H-{hidden}_A-{hidden // 64}':
            'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
        for layers in (2, 4, 6, 8, 10, 12)
        for hidden in (128, 256, 512, 768)
    },
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_preprocess/3',
    **dict.fromkeys(
        ('electra_small', 'electra_base', 'experts_pubmed',
         'experts_wiki_books', 'talking-heads_base'),
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'),
}

# Resolve the selected model name to its encoder URL and its preprocessing URL.
tfhub_handle_encoder, tfhub_handle_preprocess = (
    map_name_to_handle[bert_model_name],
    map_model_to_preprocess[bert_model_name],
)

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

In [None]:
# Wrap the selected TF-Hub preprocessing model as a Keras layer; it is used by
# get_sentence_embeding() to turn raw strings into encoder inputs.
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

In [None]:
# Wrap the selected TF-Hub encoder as a Keras layer. `trainable` is not passed
# here, unlike the encoder created inside build_classifier_model().
bert_model = hub.KerasLayer(tfhub_handle_encoder)

In [None]:
def get_sentence_embeding(sentences):
    """Return one embedding vector per input sentence.

    The sentences are run through `bert_preprocess_model` and then through
    `bert_model`. From the last entry of the encoder's 'encoder_outputs',
    the vector at position 0 of the second axis is taken for every sentence
    (presumably the [CLS] token of a BERT-style encoder -- confirm).

    Args:
        sentences: a batch of raw strings accepted by the preprocessing layer.

    Returns:
        The slice `[:, 0, :]` of the last encoder output.
    """
    encoder_inputs = bert_preprocess_model(sentences)
    last_layer_output = bert_model(encoder_inputs)['encoder_outputs'][-1]
    return last_layer_output[:, 0, :]


# Quick smoke test on two example sentences.
get_sentence_embeding([
    "500$ discount. hurry up",
    "Bhavin, are you up for a volleybal game tomorrow?",
])

In [None]:
def build_classifier_model(dense_size=100):
  """Build the text-to-score regression model.

  Pipeline: raw string input -> TF-Hub preprocessing layer -> trainable TF-Hub
  encoder -> 'pooled_output' -> Dense(dense_size, relu) -> Dense(1, sigmoid).

  Args:
    dense_size: number of units in the hidden fully connected layer 'fc'.

  Returns:
    An uncompiled `tf.keras.Model` mapping a batch of strings to one
    sigmoid-activated value per string.
  """
  raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

  # Raw strings -> encoder input dict.
  tokenized = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')(raw_text)

  # trainable=True: the encoder weights are fine-tuned together with the head.
  bert_outputs = hub.KerasLayer(
      tfhub_handle_encoder, trainable=True, name='BERT_encoder')(tokenized)

  hidden = tf.keras.layers.Dense(
      dense_size, activation=keras.activations.relu, name='fc'
  )(bert_outputs['pooled_output'])
  score = tf.keras.layers.Dense(
      1, activation=keras.activations.sigmoid, name='classifier')(hidden)
  return tf.keras.Model(raw_text, score)

In [None]:
classifier_model = build_classifier_model()

In [None]:
classifier_model.summary()

In [None]:
def _pearson_r(y_true, y_pred):
    """Pearson correlation coefficient between targets and predictions.

    Both tensors are centred on their mean over axis 0; the coefficient is the
    sum of the element-wise products divided by the square root of the product
    of the two sums of squares.

    Assumes `y_true` and `y_pred` have the same shape, since they are
    multiplied element-wise -- TODO confirm against how Keras calls this.
    """
    xm = y_true - K.mean(y_true, axis=0)
    ym = y_pred - K.mean(y_pred, axis=0)
    r_num = K.sum(xm * ym)
    # K.epsilon() goes inside the square root: when either tensor is constant
    # over the batch the product of the sums of squares is 0, and without it
    # the ratio would be 0/0 (NaN), as would the gradient of sqrt at 0.
    r_den = K.sqrt(K.sum(xm * xm) * K.sum(ym * ym) + K.epsilon())
    return K.mean(r_num / r_den)


def pearson_loss(y_true, y_pred):
    """Loss to minimise: the negated Pearson correlation of the batch."""
    return -_pearson_r(y_true, y_pred)


def pearson_metric(y_true, y_pred):
    """Metric to report: the Pearson correlation of the batch."""
    return _pearson_r(y_true, y_pred)

# Mean squared error is the active training loss. (`pearson_loss` was assigned
# here first and then immediately overwritten; assign it instead of the MSE
# object to optimise the correlation directly.)
loss = tf.keras.losses.MeanSquaredError()

# Metrics reported during training and evaluation, in this order.
metric_pearson = pearson_metric
metric_mse = tf.keras.metrics.MeanSquaredError()

In [None]:
epochs = 1
batch_size = 32
# Whole number of optimizer steps per epoch: the last, possibly partial, batch
# still counts as one step, hence the ceiling instead of a float division.
steps_per_epoch = int(np.ceil(x_train.shape[0] / batch_size))
num_train_steps = steps_per_epoch * epochs
# Learning-rate warm-up over the first 10% of the training steps.
num_warmup_steps = int(0.1 * num_train_steps)

# AdamW optimizer whose schedule is sized by the step counts above.
init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

In [None]:
# Attach the optimizer, the training loss and the two reported metrics
# (Pearson correlation first, then MSE) to the model.
classifier_model.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=[metric_pearson, metric_mse])

In [None]:
# Fine-tune on the training split only; no validation data is passed here.
print(f'Training model with {tfhub_handle_encoder}')
history = classifier_model.fit(x=x_train, y=Y_train, epochs=epochs, 
                               batch_size=batch_size)

In [None]:
# Bind the loss value to `test_loss`, not `loss`: `loss` holds the loss
# function passed to compile(), and overwriting it with a float would break a
# re-run of the compile cell. metric_1 / metric_2 follow the order given to
# compile(): Pearson correlation, then MSE.
test_loss, metric_1, metric_2 = classifier_model.evaluate(x=x_test, y=Y_test)

In [None]:
# Predict on the test split and show text, gold score and prediction together.
Y_predicted = classifier_model.predict(x_test)
ris = pd.DataFrame(
    {'x_test': x_test,
     'Y_test': Y_test,
     # The model ends in Dense(1), so predict() yields one single-element row
     # per input; flatten it so the column holds plain floats rather than
     # 1-element arrays.
     'Y_predicted': Y_predicted.ravel()
    })
ris

# Grid Search

In [None]:
# Exhaustive search over the hyper-parameter grid below. Each configuration
# trains a fresh model on the training split and is scored by its Pearson
# correlation on the validation split.
parameters = {'epochs': [1, 2, 3],
              'batch_size': [32],
              'init_lr': [3e-6, 3e-5],
              'dense_size': [100, 200],
              'loss': [tf.keras.losses.MeanSquaredError(), pearson_loss]
              }

# Best validation Pearson seen so far (-1 is the lower bound of a correlation)
# and the configuration that produced it.
best_scores = -1
best_params = {}

for loss in parameters['loss']:
  print("Loss: ", loss)
  for epochs in parameters['epochs']:
    print(" Epochs: ", epochs)
    for init_lr in parameters['init_lr']:
      print("  Start Learning Rate: ", init_lr)
      for batch_size in parameters['batch_size']:
        print("   Batch Size: ", batch_size)
        for dense_size in parameters['dense_size']:
          print("    Dense size: ", dense_size)
          # Whole number of optimizer steps; the last partial batch counts.
          steps_per_epoch = int(np.ceil(x_train.shape[0] / batch_size))
          num_train_steps = steps_per_epoch * epochs
          num_warmup_steps = int(0.1 * num_train_steps)
          optimizer = optimization.create_optimizer(init_lr=init_lr,
                                                    num_train_steps=num_train_steps,
                                                    num_warmup_steps=num_warmup_steps,
                                                    optimizer_type='adamw')
          classifier_model = build_classifier_model(dense_size)
          classifier_model.compile(optimizer=optimizer, loss=loss,
                                   metrics=[metric_pearson, metric_mse])
          history = classifier_model.fit(x=x_train, y=Y_train, epochs=epochs,
                                         batch_size=batch_size)
          loss_calculated, pearson, mse = classifier_model.evaluate(x=x_val,
                                                                    y=Y_val)
          print("     Pearson: ", pearson)
          print("     MSE: ", mse)
          if pearson > best_scores:
            # Update the same variable the comparison reads; assigning to a
            # differently named variable would leave the threshold at -1 and
            # let every later configuration overwrite the best one.
            best_scores = pearson
            best_params = {'epochs': epochs,
                           'batch_size': batch_size,
                           'start_lr': init_lr,
                           'dense_size': dense_size,
                           # 'loss' keeps the validation loss value;
                           # 'loss_fn' records which loss function was used.
                           'loss': loss_calculated,
                           'loss_fn': loss}
print(best_scores)
print(best_params)

# Training

In [None]:
# Best parameter found on grid search
parameters = {'epochs': 2, 
              'batch_size': 32,
              'init_lr': 3e-5,
              'dense_size': 100,
              'loss': pearson_loss
              }

epochs = parameters['epochs']
batch_size = parameters['batch_size']
init_lr = parameters['init_lr']
dense_size = parameters['dense_size']
loss = parameters['loss']

steps_per_epoch = x_train.shape[0] / batch_size 
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(epochs * x_train.shape[0] * 0.1 / batch_size)
optimizer = optimization.create_optimizer(init_lr=init_lr, 
                                          num_train_steps=num_train_steps, 
                                          num_warmup_steps=num_warmup_steps, 
                                          optimizer_type='adamw')
classifier_model = build_classifier_model(dense_size)
classifier_model.compile(optimizer=optimizer, loss=loss, metrics=[metric_pearson, 
                                                                  metric_mse])
history = classifier_model.fit(x=x_train.append(x_val), y=Y_train.append(Y_val), validation_data=(x_test, Y_test), 
                               epochs=epochs, batch_size=batch_size)
loss_calculated, pearson, mse = classifier_model.evaluate(x_test, Y_test)
print("Pearson: ", pearson)
print("MSE: ", mse)

# Save model

In [None]:
# Persist the trained model to Google Drive; the .h5 suffix selects the HDF5
# format. NOTE(review): this folder differs from `dir_path` used for the
# dataset, and reloading presumably needs the custom loss/metric functions and
# hub.KerasLayer passed as custom objects -- confirm.
classifier_model.save("drive/MyDrive/Colab Notebooks/NLP/classifierIBM30k.h5")