<a href="https://colab.research.google.com/github/ElFosco/NLP_argument_creation/blob/main/Predictor_score_jpynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installation

In [1]:
!pip install -q tf-models-official

In [2]:
!pip install tensorflow-text



In [3]:
pip install torch>=1.6.0 transformers>=4.11.3 sentencepiece

In [4]:
pip install numpy requests nlpaug

Collecting nlpaug
  Downloading nlpaug-1.1.10-py3-none-any.whl (410 kB)
[K     |████████████████████████████████| 410 kB 5.4 MB/s 
Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.10


# Import

In [5]:
import os
import shutil

import re
import numpy as np

import pandas as pd

import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import tensorflow_addons as tfa

import keras
from keras import backend as K

from sklearn import metrics

from official.nlp import optimization  # to create AdamW optimizer

import nlpaug.augmenter.word as naw #data augmentation

import seaborn as sns
from matplotlib import pyplot as plt

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


 The versions of TensorFlow you are currently using is 2.8.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


#Data Exploration

In [6]:
# Using google drive to upload the data
from google.colab import drive
drive.mount('/content/drive')

# dir_path = "drive/MyDrive/NLP_project/Datasets/"
dir_path = "drive/MyDrive/Magistrale/NLP/Project/Data/"
dataset = "arg_quality_rank_30k.csv"

Mounted at /content/drive


In [7]:
df = pd.read_csv(dir_path + dataset)
df.head()

Unnamed: 0,argument,topic,set,WA,MACE-P,stance_WA,stance_WA_conf
0,"""marriage"" isn't keeping up with the times. a...",We should abandon marriage,train,0.846165,0.297659,1,1.0
1,.a multi-party system would be too confusing a...,We should adopt a multi-party system,train,0.891271,0.726133,-1,1.0
2,\ero-tolerance policy in schools should not be...,We should adopt a zero-tolerance policy in sch...,dev,0.721192,0.396953,-1,1.0
3,`people reach their limit when it comes to the...,Assisted suicide should be a criminal offence,train,0.730395,0.225212,-1,1.0
4,"100% agree, should they do that, it would be a...",We should abolish safe spaces,train,0.236686,0.004104,1,0.805517


In [8]:
set_topic = df.topic.unique()
dict_topic = {}

#Data Preprocessing

In [9]:
lemmatizer = WordNetLemmatizer()
def clean_text(text):
  text = re.sub('\"|-|\\\\|`', ' ', text)  # delete this chars from the string ["-\`]
  text = re.sub('\n', ' ', text)
  text = re.sub('^[.]+', '', text)         # delete dots at the beginning of the sentence
  text = re.sub("([?.!,])", r" \1 ", text)
  text = re.sub('\. \.', '.', text)        # delete . .
  text = re.sub('&', ' and ', text)        # replace & with and
  text = re.sub(' +', ' ', text)           # delete additional whitespace
  text = text.rstrip()                  
  text = text.lstrip()
  text = " ".join([lemmatizer.lemmatize(x) for x in text.split()])
  return text

In [10]:
df.loc[2, "argument"] = "zero tolerance policy in schools should not be adopted as circumstances are often not black and white, being more nuanced. no one should be written off due to a mistake of judgement."
df['argument'] = df.apply(lambda row : clean_text(row['argument']), axis = 1)


##Data Split

In [11]:
is_training_data =  df['set']=='train'
is_validation_data =  df['set']=='dev'
is_test_data =  df['set']=='test'

training_data = df[is_training_data]
validation_data = df[is_validation_data]
test_data  = df[is_test_data ]

x_train = training_data['argument'].reset_index(drop=True)
Y_train = training_data['MACE-P'].reset_index(drop=True)

x_val = validation_data['argument'].reset_index(drop=True)
Y_val = validation_data['MACE-P'].reset_index(drop=True)

x_test = test_data['argument'].reset_index(drop=True)
Y_test = test_data['MACE-P'].reset_index(drop=True)

#Data Augmentation

In [None]:
back_translation_aug = naw.BackTranslationAug(
    from_model_name='facebook/wmt19-en-de', 
    to_model_name='facebook/wmt19-de-en'
)

Downloading:   0%|          | 0.00/825 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/825 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/67.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/829k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/829k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/308k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/67.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/829k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/829k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/308k [00:00<?, ?B/s]

In [None]:
def create_augmented_data(x,Y):
  tmp_df = pd.DataFrame()
  for i in range(6000,int(x.shape[0])):
    print('['+str(i+1)+'/'+str(int(x.shape[0])+1)+']')
    new_argument = back_translation_aug.augment(x[i])
    score = Y[i]
    print(score)
    tmp = {'argument': new_argument, 'score': score}
    tmp_df = tmp_df.append(tmp,ignore_index=True)
    if ((i+1)%500)==0:
      print("Print on file:"+str(int(i+1)))
      tmp_df.to_csv(dir_path+"aug_"+str(int(i+1))+".csv", index=False, encoding='utf-8-sig')
      tmp_df = pd.DataFrame()
  print("Print on file")
  tmp_df.to_csv(dir_path+"aug.csv", index=False, encoding='utf-8-sig')


In [None]:
create_augmented_data(x_train,Y_train)

[6001/20975]
0.34637527
[6002/20975]
0.850216064
[6003/20975]
0.163752849
[6004/20975]
0.941575939
[6005/20975]
0.91257727
[6006/20975]
0.732177813
[6007/20975]
0.730394001
[6008/20975]
0.985196334
[6009/20975]
0.998134129
[6010/20975]
0.757758172
[6011/20975]
0.242041083
[6012/20975]
0.999585877
[6013/20975]
0.984383215
[6014/20975]
0.886625591
[6015/20975]
0.363580527
[6016/20975]
0.927668615
[6017/20975]
0.249409156
[6018/20975]
0.849970942
[6019/20975]
0.682275376
[6020/20975]
0.999619695
[6021/20975]
0.895690318
[6022/20975]
0.893136072
[6023/20975]
0.950949274
[6024/20975]
0.118880122
[6025/20975]
0.79330275
[6026/20975]
0.996884228
[6027/20975]
0.836000043
[6028/20975]
0.825804012
[6029/20975]
0.85565157
[6030/20975]
0.102515775
[6031/20975]
0.897728562
[6032/20975]
0.990721385
[6033/20975]
0.963533273
[6034/20975]
0.185723079
[6035/20975]
0.821555297
[6036/20975]
0.999674756
[6037/20975]
0.975135938
[6038/20975]
0.818659736
[6039/20975]
0.498553244
[6040/20975]
0.442044533
[604

##Read augmented data

In [19]:
def read_aug_data():
  ris = pd.DataFrame()
  for i in range(500,8500,500):
    df = pd.read_csv(dir_path+"aug_"+str(int(i))+".csv")
    ris = ris.append(df)
  return ris

In [20]:
ris_aug = read_aug_data()

In [21]:
x_train_aug = ris_aug['argument']
Y_train_aug = ris_aug['score']

#Data from UKP

In [None]:
ukp_path = dir_path + "UKPConvArg1-Ranking-CSV/" # need to download https://tudatalib.ulb.tu-darmstadt.de/handle/tudatalib/2427 in the given directory
i=0
ukp_dataset_train = pd.DataFrame()
ukp_dataset_valid = pd.DataFrame()
ukp_dataset_test = pd.DataFrame()
for csv in os.listdir(ukp_path):
  if i<=20:
    df = pd.read_csv(ukp_path+csv,sep='\t')
    ukp_dataset_train = ukp_dataset_train.append(df)
  elif 20<i<=25 :
    df = pd.read_csv(ukp_path+csv,sep='\t')
    ukp_dataset_valid = ukp_dataset_valid.append(df)
  else:
    df = pd.read_csv(ukp_path+csv,sep='\t')
    ukp_dataset_test = ukp_dataset_test.append(df)
  i+=1

print(ukp_dataset_train.shape)
print(ukp_dataset_valid.shape)
print(ukp_dataset_test.shape)


(702, 3)
(160, 3)
(190, 3)


In [None]:
lemmatizer = WordNetLemmatizer()
def clean_text_ukp(text):
  text = text.lower()
  text = re.sub('\"|-|\\\\|`|/|\'', ' ', text)  # delete this chars from the string ["-\`]
  text = re.sub('<br/>', ' ', text)
  text = re.sub(':\)', ' ', text)
  text = re.sub('[\.]+[\.]+', ' ', text)         # delete ...
  #text = re.sub("([?.!,])", r" \1 ", text)
  text = re.sub('&', ' and ', text)        # replace & with and
  text = re.sub(' +', ' ', text)           # delete additional whitespace
  text = text.rstrip()                  
  text = text.lstrip()
  text = " ".join([lemmatizer.lemmatize(x) for x in text.split()])
  return text
  

In [None]:
ukp_dataset_train['argument'] = ukp_dataset_train.apply(lambda row : clean_text_ukp(row['argument']), axis = 1)
ukp_dataset_valid['argument'] = ukp_dataset_valid.apply(lambda row : clean_text_ukp(row['argument']), axis = 1)
ukp_dataset_test['argument'] = ukp_dataset_test.apply(lambda row : clean_text_ukp(row['argument']), axis = 1)

x_train_ukp = ukp_dataset_train['argument']
Y_train_ukp = ukp_dataset_train['rank']

x_val_ukp = ukp_dataset_valid['argument']
Y_val_ukp = ukp_dataset_valid['rank']

x_test_ukp = ukp_dataset_test['argument']
Y_test_ukp = ukp_dataset_test['rank']

#Final Dataset

In [22]:
x_train = (x_train.append(x_train_aug)).reset_index(drop=True)
Y_train = (Y_train.append(Y_train_aug)).reset_index(drop=True)

x_val = (x_val).reset_index(drop=True)
Y_val = (Y_val).reset_index(drop=True)

x_test = (x_test).reset_index(drop=True)
Y_test = (Y_test).reset_index(drop=True)

#[Bert](https://colab.research.google.com/github/tensorflow/text/blob/master/docs/tutorials/classify_text_with_bert.ipynb)



In [23]:
# @title Choose a BERT model to fine-tune

bert_model_name = 'bert_en_uncased_L-12_H-768_A-12'  # @param ["bert_en_uncased_L-24_H-1024_A-16","bert_en_uncased_L-12_H-768_A-12", "bert_en_cased_L-12_H-768_A-12", "bert_multi_cased_L-12_H-768_A-12", "small_bert/bert_en_uncased_L-2_H-128_A-2", "small_bert/bert_en_uncased_L-2_H-256_A-4", "small_bert/bert_en_uncased_L-2_H-512_A-8", "small_bert/bert_en_uncased_L-2_H-768_A-12", "small_bert/bert_en_uncased_L-4_H-128_A-2", "small_bert/bert_en_uncased_L-4_H-256_A-4", "small_bert/bert_en_uncased_L-4_H-512_A-8", "small_bert/bert_en_uncased_L-4_H-768_A-12", "small_bert/bert_en_uncased_L-6_H-128_A-2", "small_bert/bert_en_uncased_L-6_H-256_A-4", "small_bert/bert_en_uncased_L-6_H-512_A-8", "small_bert/bert_en_uncased_L-6_H-768_A-12", "small_bert/bert_en_uncased_L-8_H-128_A-2", "small_bert/bert_en_uncased_L-8_H-256_A-4", "small_bert/bert_en_uncased_L-8_H-512_A-8", "small_bert/bert_en_uncased_L-8_H-768_A-12", "small_bert/bert_en_uncased_L-10_H-128_A-2", "small_bert/bert_en_uncased_L-10_H-256_A-4", "small_bert/bert_en_uncased_L-10_H-512_A-8", "small_bert/bert_en_uncased_L-10_H-768_A-12", "small_bert/bert_en_uncased_L-12_H-128_A-2", "small_bert/bert_en_uncased_L-12_H-256_A-4", "small_bert/bert_en_uncased_L-12_H-512_A-8", "small_bert/bert_en_uncased_L-12_H-768_A-12", "albert_en_base", "electra_small", "electra_base", "experts_pubmed", "experts_wiki_books", "talking-heads_base"]

map_name_to_handle = {
    'bert_en_uncased_L-24_H-1024_A-16':
        'https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/4',
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3',
    'small_bert/bert_en_uncased_L-2_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-2_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-2_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-2_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-4_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-4_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-4_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-4_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-6_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-6_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-6_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-6_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-8_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-8_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-8_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-8_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-10_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-10_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-10_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-10_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-12_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-12_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-12_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-768_A-12/1',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_base/2',
    'electra_small':
        'https://tfhub.dev/google/electra_small/2',
    'electra_base':
        'https://tfhub.dev/google/electra_base/2',
    'experts_pubmed':
        'https://tfhub.dev/google/experts/bert/pubmed/2',
    'experts_wiki_books':
        'https://tfhub.dev/google/experts/bert/wiki_books/2',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1',
}

map_model_to_preprocess = {
    'bert_en_uncased_L-24_H-1024_A-16':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_preprocess/3',
    'electra_small':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'electra_base':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'experts_pubmed':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'experts_wiki_books':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
}

tfhub_handle_encoder = map_name_to_handle[bert_model_name]
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

BERT model selected           : https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3


In [25]:
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

In [26]:
bert_model = hub.KerasLayer(tfhub_handle_encoder)

In [27]:
def get_sentence_embeding(sentences):
    preprocessed_text = bert_preprocess_model(sentences)
    return bert_model(preprocessed_text)['encoder_outputs'][-1][:,0,:]

get_sentence_embeding([
    "500$ discount. hurry up", 
    "Bhavin, are you up for a volleybal game tomorrow?"]
)

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-1.94766521e-02, -1.63640231e-01,  3.24933901e-02, ...,
        -3.22463185e-01, -1.03215784e-01,  2.76807010e-01],
       [-3.08533795e-02, -3.36306602e-01, -1.65708363e-04, ...,
        -4.87345725e-01,  5.70318043e-01,  4.68657613e-01]], dtype=float32)>

In [28]:
def build_classifier_model(dense_size=100):
  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
  preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
  encoder_inputs = preprocessing_layer(text_input)
  encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
  outputs = encoder(encoder_inputs)
  net = outputs['pooled_output']
  net = tf.keras.layers.Dense(dense_size, activation=keras.activations.relu, name='fc_1')(net)
  net = tf.keras.layers.Dense(64, activation=keras.activations.relu, name='fc_2')(net)
  net = tf.keras.layers.Dense(32, activation=keras.activations.relu, name='fc_3')(net)
  net = tf.keras.layers.Dense(16, activation=keras.activations.relu, name='fc_4')(net)
  net = tf.keras.layers.Dense(1, activation=keras.activations.sigmoid, name='classifier')(net)
  return tf.keras.Model(text_input, net)

In [16]:
classifier_model = build_classifier_model()

NameError: ignored

In [None]:
classifier_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 preprocessing (KerasLayer)     {'input_word_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_type_ids':                                                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128)}                                                      

In [29]:
def pearson_loss(y_true, y_pred):
    x = y_true
    y = y_pred
    mx = K.mean(x, axis=0)
    my = K.mean(y, axis=0)
    xm, ym = x - mx, y - my
    r_num = K.sum(xm * ym)
    x_square_sum = K.sum(xm * xm)
    y_square_sum = K.sum(ym * ym)
    r_den = K.sqrt(x_square_sum * y_square_sum)
    r = -r_num / r_den
    return K.mean(r)

def pearson_metric(y_true, y_pred):
    x = y_true
    y = y_pred
    mx = K.mean(x, axis=0)
    my = K.mean(y, axis=0)
    xm, ym = x - mx, y - my
    r_num = K.sum(xm * ym)
    x_square_sum = K.sum(xm * xm)
    y_square_sum = K.sum(ym * ym)
    r_den = K.sqrt(x_square_sum * y_square_sum)
    r = r_num / r_den
    return K.mean(r)

loss = pearson_loss
loss = tf.keras.losses.MeanSquaredError()

metric_pearson = pearson_metric
metric_mse = tf.keras.metrics.MeanSquaredError()

In [None]:
epochs = 1
batch_size = 32
steps_per_epoch = x_train.shape[0] / batch_size 
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(epochs * x_train.shape[0] * 0.1 / batch_size)

# solution 1

init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

In [None]:
classifier_model.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=[metric_pearson, metric_mse])

In [None]:
print(f'Training model with {tfhub_handle_encoder}')
history = classifier_model.fit(x=x_train, y=Y_train, epochs=epochs, 
                               batch_size=batch_size)

In [None]:
loss, metric_1, metric_2 = classifier_model.evaluate(x=x_test, y=Y_test)

In [None]:
Y_predicted = classifier_model.predict(x_test)
ris = pd.DataFrame(
    {'x_test': x_test,
     'Y_test': Y_test,
     'Y_predicted': list(Y_predicted)
    })
ris

#Grid Search

In [None]:
parameters = {'epochs': [1, 2, 3], 
              'batch_size':[32],
              'init_lr': [3e-6, 3e-5],
              'dense_size' : [100,200],
              'loss' : [tf.keras.losses.MeanSquaredError(), pearson_loss]
              }

best_scores = -1
best_params = {1: dict()}

for loss in parameters['loss']:
  print("Loss: ", loss)
  for epochs in parameters['epochs']:
    print(" Epochs: ", epochs)
    for init_lr in parameters['init_lr']:
      print("  Start Learning Rate: ", init_lr)
      for batch_size in parameters['batch_size']:
        print("   Batch Size: ", batch_size)
        for dense_size in parameters['dense_size']:
          print("    Dense size: ", dense_size)
          steps_per_epoch = x_train.shape[0] / batch_size 
          num_train_steps = steps_per_epoch * epochs
          num_warmup_steps = int(epochs * x_train.shape[0] * 0.1 / batch_size)
          optimizer = optimization.create_optimizer(init_lr=init_lr, 
                                                    num_train_steps=num_train_steps, 
                                                    num_warmup_steps=num_warmup_steps, 
                                                    optimizer_type='adamw')
          classifier_model = build_classifier_model(dense_size)
          classifier_model.compile(optimizer=optimizer, loss=loss, 
                                   metrics=[metric_pearson, metric_mse])
          history = classifier_model.fit(x=x_train, y=Y_train, epochs=epochs, 
                                         batch_size=batch_size)
          loss_calculated, pearson ,mse = classifier_model.evaluate(x=x_val, 
                                                                    y=Y_val)
          print("     Pearson: ", pearson)
          print("     MSE: ", mse)
          if pearson > best_scores:                 
            best_score = pearson
            best_params = {'epochs': epochs, 
                           'batch_size': batch_size, 
                           'start_lr': init_lr,  
                           'dense_size': dense_size,
                           'loss': loss_calculated}
print(best_scores)
print(best_params)

# Training

In [None]:
# Best parameter found on grid search
parameters = {'epochs': 2, 
              'batch_size': 32,
              'init_lr': 3e-5,
              'dense_size': 100,
              'loss': pearson_loss
              }

epochs = parameters['epochs']
batch_size = parameters['batch_size']
init_lr = parameters['init_lr']
dense_size = parameters['dense_size']
loss = parameters['loss']

steps_per_epoch = (x_train.append(x_val)).shape[0] / batch_size 
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(epochs * ((x_train.append(x_val)).shape[0]) * 0.1 / batch_size)
optimizer = optimization.create_optimizer(init_lr=init_lr, 
                                          num_train_steps=num_train_steps, 
                                          num_warmup_steps=num_warmup_steps, 
                                          optimizer_type='adamw')
classifier_model = build_classifier_model(dense_size)
classifier_model.compile(optimizer=optimizer, loss=loss, metrics=[metric_pearson, 
                                                                  metric_mse])
history = classifier_model.fit(x=(x_train.append(x_val)).reset_index(drop=True), y=(Y_train.append(Y_val)).reset_index(drop=True), 
                               epochs=epochs, batch_size=batch_size)
loss_calculated, pearson, mse = classifier_model.evaluate(x_test, Y_test)
print("Pearson: ", pearson)
print("MSE: ", mse)

Epoch 1/2

# Save model

In [None]:
classifier_model.save("drive/MyDrive/Colab Notebooks/NLP/classifierIBM30k.h5")