In [1]:
import glob
import pandas as pd
import csv

Load Keras_Bert and Tokenizer

In [2]:
from keras_bert import Tokenizer, load_vocabulary
import os
pretrained_path = 'uncased_L-12_H-768_A-12'
config_path = os.path.join(pretrained_path, 'bert_config.json')
checkpoint_path = os.path.join(pretrained_path, 'bert_model.ckpt')

vocab_path = os.path.join(pretrained_path, 'vocab.txt')
token_dict = load_vocabulary(vocab_path)
tokenizer = Tokenizer(token_dict)

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [32]:
from keras_bert import load_trained_model_from_checkpoint
SEQ_LEN=64
model = load_trained_model_from_checkpoint(config_path, checkpoint_path,
                                          training = True,
                                          trainable = True,
                                          seq_len=SEQ_LEN)

Use given case as testing data and use the rest three cases as training data. (Generalize Performance)

In [4]:
import numpy as np
import keras
SEQ_LENGTH = 64
def get_data(case_name):
    header_list = ["file_name", "ignore","persuasion","speaker","argument"]
    directoryPath = './../scratch/data/processed_csv/'
    glued_data = pd.DataFrame()
    test_data = pd.DataFrame()
    for file_name in glob.glob(directoryPath+'*.csv'):
        if file_name[32]!=case_name:
            x = pd.read_csv(file_name, low_memory=False,names=header_list)
            glued_data = pd.concat([glued_data,x],axis=0)
        else:
            x = pd.read_csv(file_name, low_memory=False,names=header_list)
            test_data = pd.concat([test_data,x],axis=0)
    return glued_data, test_data

In [33]:
case_name = 'R'
train_data, test_data = get_data(case_name)
train_arg = train_data['argument'].to_numpy()
test_arg = test_data['argument'].to_numpy()
train_y = train_data['persuasion']!=0
test_y = test_data['persuasion']!=0

In [34]:
train_X = []
train_X1 = []
test_X = []
test_X1=[]
for i in range(0,len(train_arg)):
    ids, segments = tokenizer.encode(str(train_arg[i]),max_len=SEQ_LENGTH)
    train_X.append(ids)
    train_X1.append(segments)
train_X = np.array(train_X)
train_X1 = np.array(train_X1)
for i in range(0,len(test_arg)):
    ids, segments = tokenizer.encode(str(test_arg[i]),max_len=SEQ_LENGTH)
    test_X.append(ids)
    test_X1.append(segments)
test_X = np.array(test_X)
test_X1 = np.array(test_X1)

In [35]:
import keras_metrics
from keras_radam import RAdam

inputs = model.inputs[:2]
dense = model.get_layer('NSP-Dense').output
leave_one_out_outputs = keras.layers.Dense(1, activation='sigmoid')(dense)

leave_one_out_model_R=keras.models.Model(inputs,leave_one_out_outputs)
leave_one_out_model_R.compile(
    RAdam(learning_rate = 1e-4),
    loss='binary_crossentropy',metrics=[keras_metrics.recall()],
)

tracking <tf.Variable 'Variable_6:0' shape=() dtype=int32> tp
tracking <tf.Variable 'Variable_7:0' shape=() dtype=int32> fn


In [36]:
leave_one_out_history_R = leave_one_out_model_R.fit([train_X,train_X1],train_y,epochs=3,batch_size=100,
                               validation_data=([test_X,test_X1],test_y))

Train on 13608 samples, validate on 5249 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


More epochs might increase the performance. Still need to fine tune the model later.

In [37]:
leave_one_out_y_pred_R = leave_one_out_model_R.predict([test_X,test_X1])

In [38]:
from sklearn.metrics import confusion_matrix
leave_one_out_filtered = []
for pred in leave_one_out_y_pred_R:
    if pred>=0.5:
        leave_one_out_filtered.append(1)
    else:
        leave_one_out_filtered.append(0)
cm = confusion_matrix(leave_one_out_filtered,test_y)
print(cm)
tp,fp,fn,tn=cm.ravel()
print('recall score:',tn/(fn+tn))
print('precision:',(tn/(fp+tn)))

[[4226  399]
 [ 236  388]]
recall score: 0.6217948717948718
precision: 0.49301143583227447


In [41]:
case_name = 'T'
train_data, test_data = get_data(case_name)
train_arg = train_data['argument'].to_numpy()
test_arg = test_data['argument'].to_numpy()
train_y = train_data['persuasion']!=0
test_y = test_data['persuasion']!=0

In [42]:
train_X = []
train_X1 = []
test_X = []
test_X1=[]
for i in range(0,len(train_arg)):
    ids, segments = tokenizer.encode(str(train_arg[i]),max_len=SEQ_LENGTH)
    train_X.append(ids)
    train_X1.append(segments)
train_X = np.array(train_X)
train_X1 = np.array(train_X1)
for i in range(0,len(test_arg)):
    ids, segments = tokenizer.encode(str(test_arg[i]),max_len=SEQ_LENGTH)
    test_X.append(ids)
    test_X1.append(segments)
test_X = np.array(test_X)
test_X1 = np.array(test_X1)

In [43]:
model = load_trained_model_from_checkpoint(config_path, checkpoint_path,
                                          training = True,
                                          trainable = True,
                                          seq_len=SEQ_LEN)
inputs = model.inputs[:2]
dense = model.get_layer('NSP-Dense').output
leave_one_out_outputs = keras.layers.Dense(1, activation='sigmoid')(dense)

leave_one_out_model_T=keras.models.Model(inputs,leave_one_out_outputs)
leave_one_out_model_T.compile(
    RAdam(learning_rate = 1e-4),
    loss='binary_crossentropy',metrics=[keras_metrics.recall()],
)

tracking <tf.Variable 'Variable_8:0' shape=() dtype=int32> tp
tracking <tf.Variable 'Variable_9:0' shape=() dtype=int32> fn


In [44]:
leave_one_out_history_T = leave_one_out_model_T.fit([train_X,train_X1],train_y,epochs=3,batch_size=100,
                               validation_data=([test_X,test_X1],test_y))

Train on 11944 samples, validate on 6913 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [45]:
leave_one_out_y_pred_T = leave_one_out_model_T.predict([test_X,test_X1])

In [46]:
from sklearn.metrics import confusion_matrix
leave_one_out_filtered = []
for pred in leave_one_out_y_pred_T:
    if pred>=0.5:
        leave_one_out_filtered.append(1)
    else:
        leave_one_out_filtered.append(0)
cm = confusion_matrix(leave_one_out_filtered,test_y)
print(cm)
tp,fp,fn,tn=cm.ravel()
print('recall score:',tn/(fn+tn))
print('precision:',(tn/(fp+tn)))

[[5671  448]
 [ 357  437]]
recall score: 0.5503778337531486
precision: 0.4937853107344633


In [47]:
2 * ((tn/(fp+tn)) * (tn/(fn+tn))) / ((tn/(fp+tn)) + (tn/(fn+tn)))

0.5205479452054795

Two more models needed to make.