<a href="https://colab.research.google.com/github/ElFosco/NLP_argument_creation/blob/main/Predictor_score.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installation

In [None]:
!pip install -q tf-models-official

In [None]:
!pip install tensorflow-text

In [None]:
pip install torch>=1.6.0 transformers>=4.11.3 sentencepiece

In [None]:
pip install numpy requests nlpaug

# Import

In [None]:
import os
import shutil

import re
import numpy as np
import math

import pandas as pd

import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

import tensorflow as tf

import tensorflow_hub as hub
import tensorflow_text as text
import tensorflow_addons as tfa

import keras
from keras import backend as K
from keras.layers import concatenate
from keras import Sequential

from sklearn import metrics

from official.nlp import optimization  # to create AdamW optimizer

import nlpaug.augmenter.word as naw #data augmentation

import seaborn as sns
from matplotlib import pyplot as plt

# Data Exploration

In [None]:
# Using google drive to upload the data
# Mount the Colab user's Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Both values are concatenated as plain strings by the loading cells, so
# dir_path must keep its trailing slash.
# NOTE(review): "Data/" is relative to the kernel's working directory, not to
# the Drive mount point above — confirm the working directory is the project folder.
dir_path = "Data/"  # Point to project folder
dataset = "arg_quality_rank_30k.csv"

In [None]:
# Load the argument-quality dataset. Later cells read its 'argument', 'topic',
# 'set' and 'MACE-P' columns.
df = pd.read_csv(dir_path + dataset)
df.head()

# Data Preprocessing

In [None]:
lemmatizer = WordNetLemmatizer()
def clean_text(text,topic):
  """Normalise an argument and append its topic as a second text segment.

  Steps, in order: replace the characters " - \\ ` and newlines with spaces,
  drop leading dots, collapse ". ." into ".", spell out "&" as " and ",
  squeeze repeated spaces, trim, lemmatise every whitespace-separated token,
  then replace the final punctuation mark with " [SEP]" (or append " [SEP]"
  when there is none) and add the lower-cased topic after it.

  Args:
    text: raw argument string. Inside this function the name shadows the
      module-level `text` alias of tensorflow_text.
    topic: topic string the argument belongs to.

  Returns:
    A string of the form "<cleaned argument> [SEP] <topic in lower case>".
  """
  # Raw strings keep the regex escapes valid; the previous non-raw literals
  # ('\.') are invalid string escapes and warn on recent Python versions.
  text = re.sub(r'["\-\\`]', ' ', text)    # delete these chars from the string ["-\`]
  text = text.replace('\n', ' ')
  text = re.sub(r'^\.+', '', text)         # delete dots at the beginning of the sentence
  text = re.sub(r'\. \.', '.', text)       # delete . .
  text = text.replace('&', ' and ')        # replace & with and
  text = re.sub(r' +', ' ', text)          # delete additional whitespace
  text = text.strip()
  text = " ".join(lemmatizer.lemmatize(token) for token in text.split())

  # Swap the closing punctuation for the separator, or append the separator
  # when the argument does not end with punctuation.
  # NOTE(review): the character class also matches a literal '|' (inherited
  # from the original pattern '[\.|?|!]'); kept so the cleaned dataset stays
  # identical — drop the '|' if that was unintended.
  text, replaced = re.subn(r'[.|?!]$', ' [SEP]', text)
  if not replaced:
    text = text + ' [SEP]'
  return text + " " + topic.lower()

In [None]:
# Replace the text of row 2 with a hand-written version before cleaning.
# NOTE(review): the reason for patching this row is not visible here —
# presumably the original entry is malformed; confirm against the raw CSV.
df.loc[2, "argument"] = "zero tolerance policy in schools should not be adopted as circumstances are often not black and white, being more nuanced. no one should be written off due to a mistake of judgement."
# Clean every argument and append its topic. The column is overwritten in
# place, so re-running this cell appends "[SEP] <topic>" a second time;
# reload df first if the cell has to be executed again.
df['argument'] = df.apply(lambda row : clean_text(row['argument'],row['topic']), axis = 1)


## Data Split

In [None]:
# Boolean masks for the partitions recorded in the dataset's 'set' column.
is_training_data = df['set'].eq('train')
is_validation_data = df['set'].eq('dev')
is_test_data = df['set'].eq('test')

training_data = df.loc[is_training_data]
validation_data = df.loc[is_validation_data]
test_data = df.loc[is_test_data]

# Inputs are the cleaned arguments and targets the MACE-P scores; every
# series is re-indexed from 0 so positional access works in later cells.
x_train, Y_train = (training_data[column].reset_index(drop=True)
                    for column in ('argument', 'MACE-P'))
x_val, Y_val = (validation_data[column].reset_index(drop=True)
                for column in ('argument', 'MACE-P'))
x_test, Y_test = (test_data[column].reset_index(drop=True)
                  for column in ('argument', 'MACE-P'))

In [None]:
# Placeholders for the optional augmented / UKP series, so that the call to
# generate_final_dataset still has every argument defined when the cells
# that load those datasets are skipped.
x_train_aug=Y_train_aug=x_train_ukp=Y_train_ukp=x_val_ukp=Y_val_ukp=x_test_ukp=Y_test_ukp=None

# Data Augmentation (not used in the final project)

In [None]:
def create_augmented_data(x,Y,start=8000,chunk_size=500):
  """Create back-translated (en -> de -> en) copies of the arguments.

  Rows start .. len(x)-1 are augmented one at a time. Each time the 1-based
  row number reaches a multiple of chunk_size, the rows collected since the
  previous flush are written to "<dir_path>aug_<row number>.csv"; whatever is
  left at the end goes to "<dir_path>aug.csv". Every file has the columns
  'argument' and 'score'.

  Args:
    x: arguments, indexable by integer position (x[i]) and exposing .shape.
    Y: scores aligned with x.
    start: first row to augment. The default of 8000 resumes after the
      roughly 8k rows that have already been generated.
    chunk_size: number of rows per intermediate CSV file.

  Returns:
    None; results are only written to disk.
  """
  back_translation_aug = naw.BackTranslationAug(
    from_model_name='facebook/wmt19-en-de',
    to_model_name='facebook/wmt19-de-en'
  )

  total = int(x.shape[0])
  # Rows are collected in a plain list and turned into a frame once per flush:
  # DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
  records = []
  for i in range(start,total):
    print('['+str(i+1)+'/'+str(total)+']')
    # NOTE(review): depending on the nlpaug version, augment() may return a
    # list instead of a string — confirm what ends up in the CSV.
    new_argument = back_translation_aug.augment(x[i])
    records.append({'argument': new_argument, 'score': Y[i]})
    if ((i+1)%chunk_size)==0:
      print("Print on file:"+str(int(i+1)))
      pd.DataFrame(records).to_csv(dir_path+"aug_"+str(int(i+1))+".csv", index=False, encoding='utf-8-sig')
      records = []
  pd.DataFrame(records).to_csv(dir_path+"aug.csv", index=False, encoding='utf-8-sig')


In [None]:
# Disabled on purpose: change the condition to True to generate additional
# augmented data.
if False:
  create_augmented_data(x_train,Y_train)        # run these two cells if you want to create more augmented data
                                                # so far, around 8k new rows have been created

## Read augmented data

In [None]:
def read_aug_data(stop=8500,step=500,data_dir=None):
  """Load the augmented chunks "aug_<n>.csv" and stack them into one frame.

  Files are read for n = step, 2*step, ... while n < stop.

  Args:
    stop: exclusive upper bound for n. The default of 8500 covers the 8k rows
      created so far; raise it if additional data has been generated.
    step: number of rows per chunk file, which is also the spacing of n.
    data_dir: directory prefix (including the trailing separator) that holds
      the chunk files. Defaults to the notebook-level dir_path.

  Returns:
    DataFrame with the rows of every chunk in file order. Each chunk keeps
    its own row index, so index labels repeat. Empty DataFrame when the range
    selects no file.
  """
  base = dir_path if data_dir is None else data_dir
  chunks = [pd.read_csv(base+"aug_"+str(int(i))+".csv") for i in range(step,stop,step)]
  if not chunks:
    return pd.DataFrame()
  # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
  return pd.concat(chunks)

In [None]:
ris_aug = read_aug_data()

In [None]:
# Replace the None placeholders with the augmented arguments and their scores.
x_train_aug = ris_aug['argument']
Y_train_aug = ris_aug['score']

# Data from UKP (not used in the final implementation)

In [None]:
# Read the UKP ranking files (tab-separated) and split them by file into
# training, validation and test sets: the first 21 files go to training, the
# next 5 to validation and the remaining ones to test.
ukp_path = dir_path + "UKPConvArg1-Ranking-CSV/"
ukp_train_frames = []
ukp_valid_frames = []
ukp_test_frames = []
# os.listdir returns entries in arbitrary order; sorting makes the split
# reproducible across machines and runs.
for i, ukp_file in enumerate(sorted(os.listdir(ukp_path))):
  # The frame is deliberately not named `df`: that name still holds the main
  # dataset, which the per-topic analysis at the end of the notebook reads.
  ukp_frame = pd.read_csv(ukp_path+ukp_file,sep='\t')
  if i<=20:
    ukp_train_frames.append(ukp_frame)
  elif i<=25:
    ukp_valid_frames.append(ukp_frame)
  else:
    ukp_test_frames.append(ukp_frame)

# pd.concat replaces DataFrame.append (removed in pandas 2.0). It rejects an
# empty list, so a split without files falls back to an empty frame.
ukp_dataset_train = pd.concat(ukp_train_frames) if ukp_train_frames else pd.DataFrame()
ukp_dataset_valid = pd.concat(ukp_valid_frames) if ukp_valid_frames else pd.DataFrame()
ukp_dataset_test = pd.concat(ukp_test_frames) if ukp_test_frames else pd.DataFrame()

print(ukp_dataset_train.shape)
print(ukp_dataset_valid.shape)
print(ukp_dataset_test.shape)

In [None]:
lemmatizer = WordNetLemmatizer()
def clean_text_ukp(text):
  """Normalise a UKP argument.

  Lower-cases the text, replaces "<br/>" tags, the characters " - \\ ` / ',
  the ":)" smiley and runs of two or more dots with spaces, spells out "&" as
  " and ", squeezes repeated spaces, trims, and lemmatises every
  whitespace-separated token.

  Args:
    text: raw argument string.

  Returns:
    The cleaned, lemmatised string.
  """
  text = text.lower()
  # The tag has to be handled before '/' is stripped below; in the reverse
  # order "<br/>" has already become "<br >" and is never matched.
  text = text.replace('<br/>', ' ')
  text = re.sub(r'["\-\\`/\']', ' ', text)   # delete these chars from the string ["-\`/']
  text = text.replace(':)', ' ')
  text = re.sub(r'\.{2,}', ' ', text)        # delete ...
  text = text.replace('&', ' and ')          # replace & with and
  text = re.sub(r' +', ' ', text)            # delete additional whitespace
  text = text.strip()
  text = " ".join(lemmatizer.lemmatize(token) for token in text.split())
  return text

In [None]:
# Clean the 'argument' column of each UKP split in place.
for ukp_split in (ukp_dataset_train, ukp_dataset_valid, ukp_dataset_test):
  ukp_split['argument'] = ukp_split.apply(lambda record : clean_text_ukp(record['argument']), axis = 1)

# Inputs are the cleaned arguments, targets the 'rank' column.
x_train_ukp, Y_train_ukp = ukp_dataset_train['argument'], ukp_dataset_train['rank']
x_val_ukp, Y_val_ukp = ukp_dataset_valid['argument'], ukp_dataset_valid['rank']
x_test_ukp, Y_test_ukp = ukp_dataset_test['argument'], ukp_dataset_test['rank']

# Final Dataset

In [None]:
is_augmented_added = False        #flag indicating if you want to add the augmented dataset
is_ukp_added = False              #flag indicating if you want to add the ukp dataset

In [None]:
def generate_final_dataset(x_train,Y_train,x_val,Y_val,x_test,Y_test,
                           x_train_aug,Y_train_aug,
                           x_train_ukp,Y_train_ukp,x_val_ukp,Y_val_ukp,x_test_ukp,Y_test_ukp,
                           is_augmented_added,is_ukp_added):
  """Optionally extend the base splits with the augmented and/or UKP data.

  Args:
    x_train, Y_train, x_val, Y_val, x_test, Y_test: base splits (pandas Series).
    x_train_aug, Y_train_aug: augmented training data; only read when
      is_augmented_added is true.
    x_train_ukp, Y_train_ukp, x_val_ukp, Y_val_ukp, x_test_ukp, Y_test_ukp:
      UKP splits; only read when is_ukp_added is true.
    is_augmented_added: append the augmented data to the training split.
    is_ukp_added: append each UKP split to the matching base split.

  Returns:
    Tuple (x_train, Y_train, x_val, Y_val, x_test, Y_test). Splits that were
    extended are re-indexed from 0; the others are returned untouched.

  Raises:
    ValueError: if a flag is set but the corresponding data is None.
  """
  def _extend(base, extra, label):
    # pd.concat silently skips None entries, so a requested-but-missing
    # dataset is reported explicitly instead of being ignored.
    if extra is None:
      raise ValueError(label + " was requested but has not been loaded")
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    return pd.concat([base, extra]).reset_index(drop=True)

  if is_augmented_added:
    x_train = _extend(x_train, x_train_aug, "x_train_aug")
    Y_train = _extend(Y_train, Y_train_aug, "Y_train_aug")

  if is_ukp_added:
    # The training split takes the UKP training data here; the previous code
    # appended the augmented data a second time instead.
    x_train = _extend(x_train, x_train_ukp, "x_train_ukp")
    Y_train = _extend(Y_train, Y_train_ukp, "Y_train_ukp")

    x_val = _extend(x_val, x_val_ukp, "x_val_ukp")
    Y_val = _extend(Y_val, Y_val_ukp, "Y_val_ukp")

    x_test = _extend(x_test, x_test_ukp, "x_test_ukp")
    Y_test = _extend(Y_test, Y_test_ukp, "Y_test_ukp")

  return x_train,Y_train,x_val,Y_val,x_test,Y_test

In [None]:
# Build the final splits. The base variables are overwritten with the result,
# so with either flag enabled re-running this cell appends the extra data
# again; re-run the data-split cell first in that case.
x_train,Y_train,x_val,Y_val,x_test,Y_test= generate_final_dataset(x_train,Y_train,
                                                                  x_val,Y_val,
                                                                  x_test,Y_test,
                                                                  x_train_aug,Y_train_aug,
                                                                  x_train_ukp,Y_train_ukp,
                                                                  x_val_ukp,Y_val_ukp,
                                                                  x_test_ukp,Y_test_ukp,
                                                                  is_augmented_added,is_ukp_added)

# [BERT](https://colab.research.google.com/github/tensorflow/text/blob/master/docs/tutorials/classify_text_with_bert.ipynb)



In [None]:
# @title Choose a BERT model to fine-tune

bert_model_name = 'bert_en_uncased_L-12_H-768_A-12'  # @param ["bert_en_uncased_L-24_H-1024_A-16","bert_en_uncased_L-12_H-768_A-12", "bert_en_cased_L-12_H-768_A-12", "bert_multi_cased_L-12_H-768_A-12", "small_bert/bert_en_uncased_L-2_H-128_A-2", "small_bert/bert_en_uncased_L-2_H-256_A-4", "small_bert/bert_en_uncased_L-2_H-512_A-8", "small_bert/bert_en_uncased_L-2_H-768_A-12", "small_bert/bert_en_uncased_L-4_H-128_A-2", "small_bert/bert_en_uncased_L-4_H-256_A-4", "small_bert/bert_en_uncased_L-4_H-512_A-8", "small_bert/bert_en_uncased_L-4_H-768_A-12", "small_bert/bert_en_uncased_L-6_H-128_A-2", "small_bert/bert_en_uncased_L-6_H-256_A-4", "small_bert/bert_en_uncased_L-6_H-512_A-8", "small_bert/bert_en_uncased_L-6_H-768_A-12", "small_bert/bert_en_uncased_L-8_H-128_A-2", "small_bert/bert_en_uncased_L-8_H-256_A-4", "small_bert/bert_en_uncased_L-8_H-512_A-8", "small_bert/bert_en_uncased_L-8_H-768_A-12", "small_bert/bert_en_uncased_L-10_H-128_A-2", "small_bert/bert_en_uncased_L-10_H-256_A-4", "small_bert/bert_en_uncased_L-10_H-512_A-8", "small_bert/bert_en_uncased_L-10_H-768_A-12", "small_bert/bert_en_uncased_L-12_H-128_A-2", "small_bert/bert_en_uncased_L-12_H-256_A-4", "small_bert/bert_en_uncased_L-12_H-512_A-8", "small_bert/bert_en_uncased_L-12_H-768_A-12", "albert_en_base", "electra_small", "electra_base", "experts_pubmed", "experts_wiki_books", "talking-heads_base"]

map_name_to_handle = {
    'bert_en_uncased_L-24_H-1024_A-16':
        'https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/4',
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3',
    'small_bert/bert_en_uncased_L-2_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-2_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-2_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-2_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-4_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-4_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-4_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-4_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-6_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-6_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-6_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-6_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-8_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-8_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-8_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-8_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-10_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-10_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-10_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-10_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-12_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-12_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-12_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-768_A-12/1',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_base/2',
    'electra_small':
        'https://tfhub.dev/google/electra_small/2',
    'electra_base':
        'https://tfhub.dev/google/electra_base/2',
    'experts_pubmed':
        'https://tfhub.dev/google/experts/bert/pubmed/2',
    'experts_wiki_books':
        'https://tfhub.dev/google/experts/bert/wiki_books/2',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1',
}

map_model_to_preprocess = {
    'bert_en_uncased_L-24_H-1024_A-16':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_preprocess/3',
    'electra_small':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'electra_base':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'experts_pubmed':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'experts_wiki_books':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
}

# Resolve the TF-Hub URLs of the selected encoder and of its matching
# preprocessing model. A name missing from either table raises KeyError.
tfhub_handle_encoder = map_name_to_handle[bert_model_name]
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

In [None]:
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess) #preprocessing layer

In [None]:
bert_model = hub.KerasLayer(tfhub_handle_encoder)               #bert model

In [None]:
def build_classifier_model(dense_size=100):
  """Build the Keras model used to compute the score of an argument.

  Pipeline: raw string input -> TF-Hub preprocessing layer -> trainable
  TF-Hub encoder -> ReLU dense layer with dense_size units -> one sigmoid
  unit.

  Args:
    dense_size: number of units of the hidden dense layer 'fc_1'.

  Returns:
    A tf.keras.Model mapping a batch of strings to one sigmoid output each.
  """
  raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

  # The handles are read from the notebook-level model-selection cell.
  tokenized = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')(raw_text)
  encoded = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')(tokenized)

  # Regression head on top of the encoder's pooled output.
  hidden = tf.keras.layers.Dense(dense_size, activation=keras.activations.relu, name='fc_1')(encoded['pooled_output'])
  score = tf.keras.layers.Dense(1, activation=keras.activations.sigmoid, name='classifier')(hidden)
  return tf.keras.Model(raw_text, score)

In [None]:
classifier_model = build_classifier_model()

In [None]:
classifier_model.summary()

In [None]:
def pearson_metric(y_true, y_pred):
    """Pearson correlation between targets and predictions of one batch.

    Both tensors are flattened first, so a rank mismatch between them cannot
    silently broadcast the products to a matrix. A small epsilon is added to
    the denominator so that a batch with zero variance on either side yields
    0 instead of NaN.

    Args:
        y_true: tensor of target scores.
        y_pred: tensor of predicted scores with the same number of elements.

    Returns:
        Scalar tensor holding the correlation coefficient.
    """
    x = K.flatten(y_true)
    y = K.flatten(y_pred)
    xm = x - K.mean(x)
    ym = y - K.mean(y)
    r_num = K.sum(xm * ym)
    r_den = K.sqrt(K.sum(xm * xm) * K.sum(ym * ym))
    return r_num / (r_den + K.epsilon())

# Training objective shared by the grid-search and training cells.
loss = tf.keras.losses.MeanSquaredError()

# Metrics reported next to the loss.
metric_pearson = pearson_metric
metric_mse = tf.keras.metrics.MeanSquaredError()

# Grid Search

In [None]:
# Exhaustive search over the values below. Every combination trains a fresh
# model on the training split and is scored on the validation split; the
# combination with the highest validation Pearson correlation is kept.
parameters = {'epochs': [1,2,3], 
              'batch_size':[32],
              'init_lr': [3e-6,3e-5],
              'dense_size' : [100,200,300],
              'loss' : [tf.keras.losses.MeanSquaredError()]
              }

best_scores = -1      # lower bound of the Pearson correlation
best_params = {}

for loss in parameters['loss']:
  print("Loss: ", loss)
  for epochs in parameters['epochs']:
    print(" Epochs: ", epochs)
    for init_lr in parameters['init_lr']:
      print("  Start Learning Rate: ", init_lr)
      for batch_size in parameters['batch_size']:
        print("   Batch Size: ", batch_size)
        for dense_size in parameters['dense_size']:
          print("    Dense size: ", dense_size)
          # Learning-rate schedule: warm-up over the first 10% of the steps.
          steps_per_epoch = x_train.shape[0] / batch_size 
          num_train_steps = steps_per_epoch * epochs
          num_warmup_steps = int(epochs * x_train.shape[0] * 0.1 / batch_size)
          optimizer = optimization.create_optimizer(init_lr=init_lr, 
                                                    num_train_steps=num_train_steps, 
                                                    num_warmup_steps=num_warmup_steps, 
                                                    optimizer_type='adamw')
          classifier_model = build_classifier_model(dense_size)
          classifier_model.compile(optimizer=optimizer, loss=loss, 
                                   metrics=[metric_pearson, metric_mse])
          history = classifier_model.fit(x=x_train, y=Y_train, epochs=epochs, 
                                         batch_size=batch_size)
          loss_calculated, pearson ,mse = classifier_model.evaluate(x=x_val, 
                                                                    y=Y_val)
          print("     Pearson: ", pearson)
          print("     MSE: ", mse)
          if pearson > best_scores:
            # Update the same variable that is compared above and printed
            # below. The previous code assigned to `best_score`, so the
            # threshold stayed at -1 and best_params held the last run.
            best_scores = pearson
            best_params = {'epochs': epochs, 
                           'batch_size': batch_size, 
                           'start_lr': init_lr,  
                           'dense_size': dense_size,
                           'loss': loss_calculated}   # validation loss of the best run
print(best_scores)
print(best_params)

# Training

In [None]:
# Best parameter found on grid search
# Best parameter found on grid search
parameters = {'epochs': 2, 
              'batch_size': 32,
              'init_lr': 3e-5,
              'dense_size': 100,
              'loss': tf.keras.losses.MeanSquaredError()
              }

epochs = parameters['epochs']
batch_size = parameters['batch_size']
init_lr = parameters['init_lr']
dense_size = parameters['dense_size']
loss = parameters['loss']

# The final model is trained on training + validation data. The combined
# series are built once with pd.concat (Series.append was removed in
# pandas 2.0) and reused for the schedule and for fitting.
x_train_val = pd.concat([x_train, x_val]).reset_index(drop=True)
Y_train_val = pd.concat([Y_train, Y_val]).reset_index(drop=True)

# Learning-rate schedule: warm-up over the first 10% of the steps.
steps_per_epoch = x_train_val.shape[0] / batch_size 
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(epochs * x_train_val.shape[0] * 0.1 / batch_size)
optimizer = optimization.create_optimizer(init_lr=init_lr, 
                                          num_train_steps=num_train_steps, 
                                          num_warmup_steps=num_warmup_steps, 
                                          optimizer_type='adamw')
classifier_model = build_classifier_model(dense_size)
classifier_model.compile(optimizer=optimizer, loss=loss, metrics=[metric_pearson, 
                                                                  metric_mse])
history = classifier_model.fit(x=x_train_val, y=Y_train_val, 
                               epochs=epochs, batch_size=batch_size)

# Held-out evaluation on the test split.
loss_calculated, pearson, mse = classifier_model.evaluate(x_test, Y_test)
print("Pearson: ", pearson)
print("MSE: ", mse)

# Save model

In [None]:
classifier_model.save("Models/<model_name>")

# Data analysis

In [None]:
ris = classifier_model.predict(x_test)

In [None]:
# Mean squared error grouped by predicted score: bucket k collects the
# predictions p with k/10 < p <= (k+1)/10.
n_buckets = 10
mse = [0] * n_buckets
mse_size = [0] * n_buckets
for i in range(len(Y_test)):
  # Clamp the index: a prediction of exactly 0 would give -1 and wrap around
  # to the last bucket.
  bucket = min(max(math.ceil(ris[i][0]*n_buckets)-1, 0), n_buckets-1)
  mse[bucket] += (ris[i][0]-Y_test[i])**2
  mse_size[bucket] +=1
for i in range(len(mse)):
  # Buckets without any prediction are reported as NaN instead of raising
  # ZeroDivisionError.
  mse[i] = mse[i]/mse_size[i] if mse_size[i] else float('nan')

In [None]:
# Bar chart of the per-bucket MSE. Each label is the upper bound of its
# bucket; the labels must be unique, because matplotlib draws bars that share
# a category label at the same position.
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])
ax.set_title('MSE by value range')
ax.set_xlabel("Predicted score (upper bound of the range)")
ax.set_ylabel("MSE")
ax.bar(['0.1','0.2','0.3','0.4','0.5','0.6','0.7','0.8','0.9','1.0'],mse)
plt.show()

In [None]:
# Per-topic error on the test split. Each entry of mse_topic is
# [sum of squared errors, number of arguments, mean squared error].
test_data = df.loc[is_test_data].reset_index(drop=True)
mse_topic = {topic: [0,0,0] for topic in test_data['topic'].unique()}

# Accumulate the squared error and the count of every topic.
for i, topic in enumerate(test_data['topic']):
  topic_stats = mse_topic[topic]
  topic_stats[0] += (ris[i][0]-Y_test[i])**2
  topic_stats[1] += 1

# Every topic has at least one row, so the count is never zero.
for topic_stats in mse_topic.values():
  topic_stats[2] = topic_stats[0]/topic_stats[1]

In [None]:
pd.DataFrame.from_dict(mse_topic,orient='index',columns=['tot_mse', 'size', 'mse']).drop(['tot_mse','size'],axis=1)