# Relation Prediction

In this notebook, the relation prediction model is trained. 

This notebook should be run in Google Colab.

Data preprocessing and model implementation are loosely inspired by: https://keras.io/examples/pretrained_word_embeddings/

Note: The notebook is not meant to be run from top to bottom in a single pass. Each section should be run against the data separately.

In [None]:
# Mount Google Drive at /content/drive; the GloVe vectors, the labelled dataset
# and the saved models used below are all addressed under '/content/drive/My Drive'.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
!pip install tensorflow==2.0.0-alpha0
!pip install keras-tuner
!pip install scikit-learn

# Model building/processing
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.initializers import Constant
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding, Input
from tensorflow.keras.optimizers import Adam

# Merge layers
from tensorflow.keras.layers import Add, Concatenate

# split data
from sklearn.model_selection import train_test_split

# hyperparameter tuning
from kerastuner import RandomSearch, Objective

from tensorflow.keras.callbacks import Callback
import tensorflow.keras.backend as K
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

# Baseline - Dummy classifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.dummy import DummyClassifier

# saving/loading
import pickle
from tensorflow.keras.models import load_model

%load_ext tensorboard

import os, datetime
import sys
import numpy as np
import json

In [None]:
# Embedding parameters
# Directory (on the mounted Drive) that holds the GloVe text file.
GLOVE_DIR = '/content/drive/My Drive/Colab Notebooks/glove'
# Width of each embedding vector / number of columns of the embedding matrix.
EMBEDDING_DIM = 100

# Data parameters
# Every tokenised text is padded/truncated to this many token ids.
MAX_SEQUENCE_LENGTH = 200
# Upper bound on how many GloVe entries are loaded (this is also the vocabulary size).
MAX_NUM_WORDS = 50000

# Fraction of the non-test data that is held out as the validation set.
VALIDATION_SPLIT = 0.1

In [None]:
# embeddings for entire dataset

# Load at most MAX_NUM_WORDS GloVe vectors into a dict: token -> float32 vector.
# The file name is derived from EMBEDDING_DIM so the two cannot drift apart, and
# the encoding is given explicitly so that non-ASCII tokens are decoded the same
# way regardless of the runtime's default locale.
embeddings_index = {}
glove_path = os.path.join(GLOVE_DIR, 'glove.6B.{}d.txt'.format(EMBEDDING_DIM))
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        if len(embeddings_index) >= MAX_NUM_WORDS:
            break
        values = line.split()
        if not values:
            # tolerate empty lines (e.g. a trailing newline at the end of the file)
            continue
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

# Token ids start at 1; id 0 is left unused so it can serve as the padding id.
word_index = {w: i for i, w in enumerate(embeddings_index, 1)}

# Embedding matrix: row i holds the vector of the token with id i, row 0 stays all-zero.
# Every token in word_index comes from embeddings_index, so a direct lookup is safe.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_matrix[i] = embeddings_index[word][:EMBEDDING_DIM]

embedding_matrix.shape

In [None]:
# sentence tokenizer

def texts_to_sequences(texts, word_index):
    """Convert each text into the list of ids of its in-vocabulary tokens.

    Args:
        texts: iterable of strings.
        word_index: mapping from token to integer id.

    Returns:
        A list with one id list per input text; tokens that are not keys of
        ``word_index`` are dropped.
    """
    return [
        [word_index.get(token) for token in text_to_word_sequence(text) if token in word_index]
        for text in texts
    ]

def text_to_sequence(text, word_index):
    """Convert a single text into the list of ids of its in-vocabulary tokens.

    Args:
        text: the string to tokenise.
        word_index: mapping from token to integer id.

    Returns:
        The ids of the tokens, in order; tokens that are not keys of
        ``word_index`` are dropped.
    """
    ids = []
    for token in text_to_word_sequence(text):
        if token in word_index:
            ids.append(word_index.get(token))
    return ids


In [None]:
# Import dataset

# The file holds one JSON object per line. From each record only the text
# ("content") and the first annotation label are kept.
dataset = []

# The encoding is explicit because the texts contain a non-ASCII separator ('£').
with open('/content/drive/My Drive/Colab Notebooks/labelled_data/relations_dataset.json', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # skip blank lines instead of failing inside json.loads
            continue
        json_line = json.loads(line)
        dataset.append({"text": json_line["content"], "label": json_line["annotation"]["labels"][0]})

# Show the size and a small preview rather than dumping every record into the output.
print('Loaded', len(dataset), 'samples')
dataset[:3]

In [None]:
# Prepare samples and labels

# Label name -> numeric id. 'support' and 'neither' share id 1, so with this
# mapping the task is binary (attack vs. everything else).
# NOTE(review): confirm this collapse is intended; the 3-label baseline section
# asks for labels_index to be changed to three distinct ids.
labels_index = {'attack':0, 'support':1, 'neither':1}

# Each sample text holds the originator text and the responder text, in that
# order, separated by a run of seven '£' characters.
split_texts = [sample['text'].split('£££££££') for sample in dataset]
texts_originator = [parts[0] for parts in split_texts]
texts_responder = [parts[1] for parts in split_texts]

# Numeric label id for every sample, in dataset order
labels = [labels_index[sample["label"]] for sample in dataset]

# Combined text sets (responder texts first, then originator texts)
texts = texts_responder + texts_originator

In [None]:
# Originator side: token ids, then padded/truncated to MAX_SEQUENCE_LENGTH
originator_sequences = texts_to_sequences(texts_originator, word_index)
originator_data = pad_sequences(originator_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Responder side: same treatment
responder_sequences = texts_to_sequences(texts_responder, word_index)
responder_data = pad_sequences(responder_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Labels are kept as a 1-D array of integer class ids (no one-hot encoding is applied)
labels_data = np.array(labels)

print('Shape of originator data tensor:', originator_data.shape)
print('Shape of responder_data tensor:', responder_data.shape)
print('Shape of label tensor:', labels_data.shape)

In [None]:
# split data

# Hold out 10% of the samples as the test set; the remaining 90% (x_*_data, y_data)
# is later divided into training and validation sets.
# random_state is fixed so the same test set is produced on every run; without it the
# reported test scores are not comparable between runs.
x_originator_data, x_originator_test, x_responder_data, x_responder_test, y_data, y_test = train_test_split(
    originator_data, responder_data, labels_data, test_size=0.1, random_state=999)

In [None]:
# split the data into a training set and a validation set

num_validation_samples = int(VALIDATION_SPLIT * len(x_originator_data))

# Slice with an explicit boundary index. Negative-index slicing ([:-n] / [-n:]) silently
# swaps the two sets when n == 0 (tiny datasets): [:-0] is empty and [-0:] is everything.
split_at = len(x_originator_data) - num_validation_samples

# Training set: everything before the boundary
x_originator_train = x_originator_data[:split_at]
x_responder_train = x_responder_data[:split_at]
y_train = y_data[:split_at]

# Validation set: the last num_validation_samples rows
x_originator_val = x_originator_data[split_at:]
x_responder_val = x_responder_data[split_at:]
y_val = y_data[split_at:]

# Quick sanity check of the first two rows of each split
print(x_originator_train[:2])
print(x_responder_train[:2])
print(y_train[:2])
print(x_originator_val[:2])
print(x_responder_val[:2])
print(y_val[:2])

## Training (Hyperparameter Tuning)

In [None]:
# Container for search results (nothing in this section appends to it)
hyperparamter_search_results = list()

# Best validation macro-F1 seen so far and the model that achieved it;
# both are updated by the Metrics callback during the search.
best_model_f1_val, best_model = 0, None

class Metrics(Callback):
    """Keras callback that keeps the model with the best validation macro-F1.

    At the end of every epoch the model is scored on the validation inputs.
    Whenever that score beats the module-level ``best_model_f1_val``, the
    globals ``best_model_f1_val`` and ``best_model`` are updated, the model is
    also scored on the test inputs, and a JSON summary plus an .h5 copy of the
    model are written to Google Drive (overwriting the previous best).

    Args:
        validation_data: ``[inputs, targets]`` used to select the best model.
        test_data: ``[inputs, targets]`` scored only for reporting.
    """

    def __init__(self, validation_data, test_data):
        # Initialise the base Callback first so the attributes Keras expects exist,
        # then attach the evaluation data.
        super().__init__()
        self.validation_data = validation_data
        self.test_data = test_data

    def on_epoch_end(self, epoch, logs=None):
        """Score the current model and persist it if it is the best so far."""
        # Macro-F1 on the validation set; predictions are rounded to 0/1
        val_predict = (np.asarray(self.model.predict(self.validation_data[0]))).round()
        val_targ = self.validation_data[1]

        val_f1 = f1_score(val_targ, val_predict, average='macro')

        # save best model only
        global best_model_f1_val
        global best_model
        if val_f1 > best_model_f1_val:
            best_model_f1_val = val_f1
            # This is a reference to the live model, not a copy; the .h5 file
            # written below is the actual snapshot of the best weights.
            best_model = self.model

            # Macro-F1 on the test set, recorded alongside the validation score
            test_predict = (np.asarray(best_model.predict(self.test_data[0]))).round()
            test_targ = self.test_data[1]

            test_f1 = f1_score(test_targ, test_predict, average='macro')

            best_model_info = {"val_f1": best_model_f1_val,
                            "test_f1": test_f1,
                            "model_config": str(best_model.get_config())
                            }
            # Save the best model results and model h5 file
            with open('/content/drive/My Drive/Colab Notebooks/best_model_results_95.json', 'w') as f:
                json.dump(best_model_info, f)

            best_model.save('/content/drive/My Drive/Colab Notebooks/best_model_95.h5')
 
# Callback instance: validation data drives model selection, test data is scored for reporting
metrics = Metrics(
    validation_data=[[x_originator_val, x_responder_val], y_val],
    test_data=[[x_originator_test, x_responder_test], y_test],
)

In [None]:
# Best validation macro-F1 recorded so far by the Metrics callback (0 until a search has run)
best_model_f1_val

In [None]:
# Siamese network loosely inspired by: https://towardsdatascience.com/one-shot-learning-with-siamese-networks-using-keras-17f34e75bb3d

def build_model(hp):
    """Build and compile one candidate two-branch model for the tuner.

    Each branch embeds a padded token-id sequence with the frozen GloVe
    matrix and encodes it with an LSTM; the two encodings are merged
    (concatenated or added) and classified by a dense head with a single
    sigmoid output. Each branch creates its own Embedding and LSTM layers.

    Searched hyperparameters: "lstm" (units), "merge" ('cat' or 'add'),
    "dense" (units), "dense_dropout" (rate) and "learning_rate".
    LSTM dropout / recurrent dropout are not part of the search space.

    Args:
        hp: the keras-tuner hyperparameters object supplied by the tuner.

    Returns:
        The compiled Keras model taking [originator, responder] inputs.
    """
    lstm_units = hp.Int("lstm", min_value=8, max_value=64, step=8)

    def make_branch():
        # Frozen pre-trained embedding followed by an LSTM encoder
        branch = Sequential()
        branch.add(Embedding(embedding_matrix.shape[0],
                             embedding_matrix.shape[1],
                             embeddings_initializer=Constant(embedding_matrix),
                             input_length=MAX_SEQUENCE_LENGTH,
                             trainable=False))
        branch.add(LSTM(lstm_units))
        return branch

    # Branch 1 (originator) and branch 2 (responder)
    model_1 = make_branch()
    model_2 = make_branch()

    # Merge the two encodings
    merge_layer = Concatenate() if hp.Choice('merge', ['cat', 'add']) == 'cat' else Add()
    conc = merge_layer([model_1.output, model_2.output])

    # Classification head
    dense_units = hp.Int("dense", min_value=8, max_value=64, step=8)
    out = Dense(dense_units, activation='relu')(conc)
    dropout_rate = hp.Float("dense_dropout", min_value=0.0, max_value=0.9, step=0.1)
    out = Dropout(dropout_rate)(out)
    out = Dense(1, activation='sigmoid')(out)

    # Connect the inputs with the outputs
    model = Model([model_1.input, model_2.input], out)

    learning_rate = hp.Float('learning_rate', 1e-4, 1e-2, sampling='log')
    model.compile(loss='binary_crossentropy',
                  optimizer=Adam(learning_rate),
                  metrics=['acc'])

    return model

# Random search over the build_model search space. Trials are ranked by validation
# accuracy; independently of that, the Metrics callback keeps the model with the best
# validation macro-F1. The seed fixes which hyperparameter combinations are sampled so
# a search can be repeated (weight initialisation is still random).
# Results go to log/<timestamp>, so every run gets a fresh project directory.
tuner = RandomSearch(build_model,
                     objective=Objective('val_acc', direction="max"),
                     max_trials=50,
                     executions_per_trial=1,
                     seed=999,
                     directory="log",
                     project_name=datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

tuner.search(x=[x_originator_train, x_responder_train],
             y=y_train,
             epochs=5,
             batch_size=64,
             validation_data=([x_originator_val, x_responder_val], y_val),
             callbacks=[metrics])

## Training the best model

This uses the best hyperparameters and aims to find the best number of epochs to train for.

In [None]:
# Save val f1, test f1, loss and val loss for analysis.
# The Metrics callback appends one [val_f1, test_f1, loss, val_loss] row per epoch.
f1_score_epochs = []

class Metrics(Callback):
    """Keras callback that records per-epoch F1 scores and losses.

    At the end of every epoch the model is scored (macro-F1) on both the
    validation and the test inputs, and a row
    ``[val_f1, test_f1, loss, val_loss]`` is appended to the module-level
    ``f1_score_epochs`` list.

    Args:
        validation_data: ``[inputs, targets]`` for the validation score.
        test_data: ``[inputs, targets]`` for the test score.
    """

    def __init__(self, validation_data, test_data):
        # Initialise the base Callback first so the attributes Keras expects exist,
        # then attach the evaluation data.
        super().__init__()
        self.validation_data = validation_data
        self.test_data = test_data

    def on_epoch_end(self, epoch, logs=None):
        """Append this epoch's validation/test macro-F1 and losses to f1_score_epochs."""
        # Avoid a shared mutable default and tolerate being called without logs
        logs = logs or {}

        # Macro-F1 on the validation set; predictions are rounded to 0/1
        val_predict = (np.asarray(self.model.predict(self.validation_data[0]))).round()
        val_targ = self.validation_data[1]

        val_f1 = f1_score(val_targ, val_predict, average='macro')

        # Macro-F1 on the test set
        test_predict = (np.asarray(self.model.predict(self.test_data[0]))).round()
        test_targ = self.test_data[1]

        test_f1 = f1_score(test_targ, test_predict, average='macro')

        f1_score_epochs.append([val_f1, test_f1, logs.get('loss'), logs.get('val_loss')])
 
# Callback instance that scores the validation and test sets after every epoch
metrics = Metrics(
    validation_data=[[x_originator_val, x_responder_val], y_val],
    test_data=[[x_originator_test, x_responder_test], y_test],
)

In [None]:
# Siamese network loosely inspired by: https://towardsdatascience.com/one-shot-learning-with-siamese-networks-using-keras-17f34e75bb3d

# Fixed-hyperparameter version of build_model: 8 LSTM units per branch, concatenation,
# a 32-unit dense layer, dropout 0.1 and Adam with its default learning rate.

# Branch 1
model_1 = Sequential()
model_1.add(Embedding(embedding_matrix.shape[0],
                    embedding_matrix.shape[1],
                    embeddings_initializer=Constant(embedding_matrix),
                    input_length=MAX_SEQUENCE_LENGTH,
                    trainable=False))

model_1.add(LSTM(units=8))

# Branch 2
model_2 = Sequential()
model_2.add(Embedding(embedding_matrix.shape[0],
                    embedding_matrix.shape[1],
                    embeddings_initializer=Constant(embedding_matrix),
                    input_length=MAX_SEQUENCE_LENGTH,
                    trainable=False))
model_2.add(LSTM(units=8))
conc = Concatenate()([model_1.output, model_2.output])
# The hidden dense layer uses ReLU, matching the architecture that was tuned in
# build_model; without an activation it would be a purely linear layer.
out = Dense(32, activation='relu')(conc)
out = Dropout(0.1)(out)
out = Dense(1, activation='sigmoid')(out)

# Connect the inputs with the outputs
model = Model([model_1.input, model_2.input], out)

model.compile(loss='binary_crossentropy',
            optimizer=Adam(),
            metrics=['acc'])

# Train for 20 epochs; the Metrics callback records F1 scores and losses per epoch so the
# best number of epochs can be read off afterwards.
history_callback = model.fit([x_originator_train, x_responder_train], y_train,
          batch_size=64,
          epochs=20,
          validation_data=([x_originator_val, x_responder_val], y_val),
          callbacks=[metrics])


In [None]:
# Persist the trained model to Google Drive as an .h5 file
model.save('/content/drive/My Drive/Colab Notebooks/best_model_all_data.h5')

In [None]:
# Per-epoch rows of [val_f1, test_f1, loss, val_loss] collected by the Metrics callback
f1_score_epochs

## Baseline

The dummy classifiers ignore the data and predict using just the labels.

In [None]:
# Dummy classifier - 2 label

# Candidate baseline strategies; all of them ignore the input features.
parameters_dum = {'strategy': ['stratified', 'most_frequent', 'prior', 'uniform']}

# Pick the strategy with the best macro-F1 under 3-fold stratified cross-validation.
# shuffle=True is needed for random_state to have any effect; recent scikit-learn
# versions raise a ValueError when random_state is set while shuffle is False.
clf_dum = GridSearchCV(DummyClassifier(),
                       parameters_dum,
                       cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=999),
                       scoring='f1_macro',
                       n_jobs=-1)

# The features are only passed for their length; fit on the non-test portion of the data.
clf_dum.fit(x_originator_data, y_data)

clf_dum.best_params_

In [None]:
# Dummy classifier score

# Uses the f1_score already imported at the top of the notebook. Importing
# `sklearn.metrics` as `metrics` here would overwrite the `metrics` callback
# instance used by the training cells.
print("Dummy")
print("Test Score: " + str(f1_score(y_test, clf_dum.predict(x_originator_test), average='macro')))
print("Best Score: " + str(clf_dum.best_score_))

In [None]:
# Dummy classifier - 3 label
# NOTE: Make sure the labels_index is changed to contain 3 labels

# Candidate baseline strategies; all of them ignore the input features.
parameters_dum = {'strategy': ['stratified', 'most_frequent', 'prior', 'uniform']}

# Pick the strategy with the best macro-F1 under 3-fold stratified cross-validation.
# shuffle=True is needed for random_state to have any effect; recent scikit-learn
# versions raise a ValueError when random_state is set while shuffle is False.
clf_dum = GridSearchCV(DummyClassifier(),
                       parameters_dum,
                       cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=999),
                       scoring='f1_macro',
                       n_jobs=-1)

# The features are only passed for their length; fit on the non-test portion of the data.
clf_dum.fit(x_originator_data, y_data)

clf_dum.best_params_

In [None]:
# Dummy classifier score

# Uses the f1_score already imported at the top of the notebook. Importing
# `sklearn.metrics` as `metrics` here would overwrite the `metrics` callback
# instance used by the training cells.
print("Dummy")
print("Test Score: " + str(f1_score(y_test, clf_dum.predict(x_originator_test), average='macro')))
print("Best Score: " + str(clf_dum.best_score_))