In [1]:
import pandas as pd
from keras.utils.vis_utils import plot_model
from tensorflow.python.keras.callbacks import EarlyStopping
from tensorflow.python.keras.utils.np_utils import to_categorical
from tensorflow import one_hot
import load_data
import preprocess
import utils
import drqa_model
import bidaf_model
import our_model
import sys
import io
import json

from os.path import isfile
import numpy as np
from tensorflow.keras.preprocessing.text import tokenizer_from_json 
from settings import EMBEDDING_DIM, MODEL, EPOCHS, BATCH_SIZE, MODELS_DIR

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\userl\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')


In [2]:
#EMBEDDING_DIM = settings.EMBEDDING_DIM
#MODEL = settings.MODEL
#EPOCHS = settings.EPOCHS
#BATCH_SIZE = settings.BATCH_SIZE
#models_dir = settings.MODELS_DIR

In [3]:
# Load the dataset through the project loader and keep only its first 5000
# entries, then report the resulting shape.
print("Loading dataset...")
dataframe = load_data.load_dataset()[:5000]
print(dataframe.shape)

Loading dataset...
(5000, 6)


In [4]:
# Separate the loaded data into a training part and a held-out test part,
# then report both shapes.
print("Splitting train and test set...")
train_df, test_df = load_data.split_test_set(dataframe)
print(*(part.shape for part in (train_df, test_df)))

Splitting train and test set...
(4435, 6) (577, 6)


In [5]:
# Carve a validation part out of the training data (rate=0.2).
# Note: `train_df` is rebound here to the reduced training part, so this cell
# is not idempotent when re-run without re-running the previous split.
print("Splitting train and validation set...")
train_df, val_df = load_data.split_validation_set(train_df, rate=0.2)
print(*(part.shape for part in (train_df, val_df)))

Splitting train and validation set...
(3937, 6) (498, 6)


In [6]:
# Ordered list of text-cleaning steps handed to preprocess.apply_preprocessing.
PREPROCESSING_PIPELINE1 = [
    preprocess.expand_contractions,
    preprocess.tokenization_spacy,
    preprocess.remove_chars,
    preprocess.split_alpha_num_sym,
    preprocess.spell_correction,
    preprocess.lemmatization,
    preprocess.lower,
    preprocess.strip_text,
]

# Each split is preprocessed on a copy, so the raw frames stay untouched
# (later cells still read the raw `context` / `answer_start` columns).
print("Preprocessing training data...")
train_df1, train_tmp1 = preprocess.apply_preprocessing(train_df.copy(), PREPROCESSING_PIPELINE1)

print("Preprocessing validation data...")
val_df1, val_tmp1 = preprocess.apply_preprocessing(val_df.copy(), PREPROCESSING_PIPELINE1)

Preprocessing training data...
Preprocessing validation data...


In [7]:
# load the already saved content or compute from scratch

In [8]:
# Load the cached vocabulary artifacts (word listing, word<->index dictionaries,
# tokenizer, embedding matrix) when ALL of them exist in MODELS_DIR; otherwise
# compute them from the preprocessed training data and cache them for next time.
#
# Each path is defined once so the read and write sides cannot drift apart.
# (The previous version read the embedding matrix through an undefined
# `models_dir` name, which fails with NameError on a fresh kernel.)
WORD_LISTING_PATH = f"{MODELS_DIR}/word_listing.csv"
WORD2IDX_PATH = f"{MODELS_DIR}/word2idx.json"
IDX2WORD_PATH = f"{MODELS_DIR}/idx2word.json"
TOKENIZER_PATH = f"{MODELS_DIR}/tokenizer.json"
EMBEDDING_MATRIX_PATH = f"{MODELS_DIR}/embedding_matrix.csv"

load = all(isfile(path) for path in (WORD_LISTING_PATH,
                                     WORD2IDX_PATH,
                                     IDX2WORD_PATH,
                                     TOKENIZER_PATH,
                                     EMBEDDING_MATRIX_PATH))
print("load:", load)

if load:
    print("Loading matrices, tokenizers and dictionaries... ")
    df_word_listing = np.genfromtxt(WORD_LISTING_PATH, delimiter=',', encoding='utf-8', dtype='str')

    with open(WORD2IDX_PATH, encoding='utf-8') as f:
        df_word_to_idx = json.load(f)

    # JSON object keys are always strings, so the indices come back as str here.
    with open(IDX2WORD_PATH, encoding='utf-8') as f:
        df_idx_to_word = json.load(f)

    # The file is written as UTF-8 with ensure_ascii=False, so it must be read
    # as UTF-8 explicitly (the platform default encoding may differ).
    with open(TOKENIZER_PATH, encoding='utf-8') as f:
        tokenizer_json = json.load(f)
    df_tokenizer = tokenizer_from_json(tokenizer_json)

    embedding_matrix = np.genfromtxt(EMBEDDING_MATRIX_PATH, delimiter=',')
    print("Done")

else:
    print("Computing matrices, tokenizers and dictionaries... ")
    embedding_matrix, df_word_listing, df_tokenizer, df_word_to_idx, df_idx_to_word = utils.get_embedding_matrix(
        train_df1, EMBEDDING_DIM)

    np.savetxt(EMBEDDING_MATRIX_PATH, embedding_matrix, delimiter=",")
    np.savetxt(WORD_LISTING_PATH, df_word_listing, delimiter=",", fmt="%s", encoding='utf-8')

    with open(WORD2IDX_PATH, 'w', encoding='utf-8') as f:
        json.dump(df_word_to_idx, f)

    with open(IDX2WORD_PATH, 'w', encoding='utf-8') as f:
        json.dump(df_idx_to_word, f)

    # Tokenizer.to_json() already returns a JSON string; it is wrapped with
    # json.dumps so that json.load on the read side hands that same string
    # back to tokenizer_from_json.
    tokenizer_json = df_tokenizer.to_json()
    with open(TOKENIZER_PATH, 'w', encoding='utf-8') as f:
        f.write(json.dumps(tokenizer_json, ensure_ascii=False))
    print("Done")

load: True
Loading matrices, tokenizers and dictionaries... 
Done


In [9]:
# Rebuild the index->word mapping with integer keys (keys loaded from JSON are
# strings; int() leaves keys that are already integers unchanged).
df_idx_to_word = {int(index): word for index, word in df_idx_to_word.items()}

In [10]:
# Sequence lengths are derived from the training split only and reused for the
# validation split, so both are padded to the same sizes.
max_lengths = utils.get_max_length(train_df1)
MAX_CONTEXT_LENGTH, MAX_TEXT_LENGTH, MAX_QUESTION_LENGTH = max_lengths

print("Padding data...")
# Training split: context, answer text and question, in that order.
tr_context_padded, tr_answer_padded, tr_question_padded = (
    utils.pad(train_df1.context, df_tokenizer, MAX_CONTEXT_LENGTH),
    utils.pad(train_df1.text, df_tokenizer, MAX_TEXT_LENGTH),
    utils.pad(train_df1.question, df_tokenizer, MAX_QUESTION_LENGTH),
)

# Validation split: same columns, same tokenizer, same lengths.
val_context_padded, val_answer_padded, val_question_padded = (
    utils.pad(val_df1.context, df_tokenizer, MAX_CONTEXT_LENGTH),
    utils.pad(val_df1.text, df_tokenizer, MAX_TEXT_LENGTH),
    utils.pad(val_df1.question, df_tokenizer, MAX_QUESTION_LENGTH),
)

Max length for context is 518
Max length adopted for context is 569
Max length for answer is 43
Max length adopted for answer is 47
Max length for question is 30
Max length adopted for question is 33
Padding data...


In [11]:
# Derive token-level answer spans for the training and validation splits.
#   s_idx: number of whitespace-separated tokens obtained by preprocessing the
#          raw context up to the character offset `answer_start`.
#   e_idx: s_idx plus the number of tokens of the preprocessed answer text, minus one.
# The raw frame supplies `context` / `answer_start`; the results are stored on
# the preprocessed frame.
print("Computing start and end indices... ")
for raw_frame, clean_frame in ((train_df, train_df1), (val_df, val_df1)):
    clean_frame['s_idx'] = raw_frame.apply(
        lambda row: len(
            preprocess.preprocessing(row.context[:row.answer_start], PREPROCESSING_PIPELINE1).split()
        ),
        axis=1,
    )
    clean_frame['e_idx'] = clean_frame.apply(
        lambda row: row.s_idx + len(row.text.split()) - 1,
        axis=1,
    )
print("Done")

Computing start and end indices... 
Done


In [13]:
# Train the architecture selected by MODEL and save its weights only (not the
# entire model).
#
# Names left behind for later cells: `model`, `x_tr`, `x_val`, and for the
# feature-based models also `tag2idx` / `ner2idx`.
#
# Changes compared with the previous per-model copies of this code:
#   * every model saves weights via save_weights (BiDAF used model.save, which
#     stores the whole model under a "*_weights.h5" name);
#   * the architecture plot always goes to MODELS_DIR (DrQA used the absolute
#     path "/models/drqa.png");
#   * every model trains with BATCH_SIZE from settings (DrQA/BiDAF hard-coded 16);
#   * the shared feature extraction / compile / fit code exists once.

FEATURE_MODELS = ('drqa', 'our_model')          # take POS / NER / exact-match / TF inputs
TRAINABLE_MODELS = ('drqa', 'bidaf', 'our_model')

if MODEL is None or MODEL == 'basemodel':
    # Nothing is trained for the base model; `model`, `x_tr` and `x_val`
    # are therefore NOT defined by this cell.
    pass

elif MODEL in TRAINABLE_MODELS:
    # Inputs common to every architecture.
    x_tr = {'context': tr_context_padded, 'question': tr_question_padded}
    x_val = {'context': val_context_padded, 'question': val_question_padded}

    if MODEL in FEATURE_MODELS:
        tag2idx, idx2tag = utils.create_pos_dicts()
        ner2idx, idx2ner = utils.create_ner_dicts()

        # One-hot rows, one per tag index, used as fixed tag embeddings.
        pos_embedding_matrix = to_categorical(list(idx2tag.keys()))
        ner_embedding_matrix = to_categorical(list(idx2ner.keys()))

        print("Extracting features for Train Set")
        train_em_input = utils.compute_exact_match(train_df1, MAX_CONTEXT_LENGTH)
        train_tf_input = utils.compute_tf(train_df1, MAX_CONTEXT_LENGTH)
        train_pos_input = utils.compute_pos(train_df1, tag2idx, MAX_CONTEXT_LENGTH)
        train_ner_input = utils.compute_ner(train_df1, ner2idx, MAX_CONTEXT_LENGTH)

        print("Extracting features for Validation Set")
        val_em_input = utils.compute_exact_match(val_df1, MAX_CONTEXT_LENGTH)
        val_tf_input = utils.compute_tf(val_df1, MAX_CONTEXT_LENGTH)
        val_pos_input = utils.compute_pos(val_df1, tag2idx, MAX_CONTEXT_LENGTH)
        val_ner_input = utils.compute_ner(val_df1, ner2idx, MAX_CONTEXT_LENGTH)

        x_tr.update({'pos': train_pos_input, 'ner': train_ner_input,
                     'em': train_em_input, 'tf': train_tf_input})
        x_val.update({'pos': val_pos_input, 'ner': val_ner_input,
                      'em': val_em_input, 'tf': val_tf_input})

    # Build the selected architecture.
    if MODEL == 'drqa':
        model = drqa_model.build_model(MAX_QUESTION_LENGTH, MAX_CONTEXT_LENGTH, EMBEDDING_DIM,
                                       embedding_matrix, pos_embedding_matrix, ner_embedding_matrix)

    elif MODEL == 'bidaf':
        char_embedding_matrix = utils.get_char_embeddings(df_word_listing, df_word_to_idx)
        model = bidaf_model.build_model(MAX_QUESTION_LENGTH, MAX_CONTEXT_LENGTH, EMBEDDING_DIM,
                                        embedding_matrix, char_embedding_matrix)

    else:  # MODEL == 'our_model'
        # Character embeddings are cached on disk for this model.
        char_embedding_path = f"{MODELS_DIR}/char_embedding_matrix.csv"
        if isfile(char_embedding_path):
            char_embedding_matrix = np.genfromtxt(char_embedding_path, delimiter=',')
        else:
            char_embedding_matrix = utils.get_char_embeddings(df_word_listing, df_word_to_idx)
            np.savetxt(char_embedding_path, char_embedding_matrix, delimiter=",")

        model = our_model.build_model(MAX_QUESTION_LENGTH, MAX_CONTEXT_LENGTH, EMBEDDING_DIM,
                                      embedding_matrix, char_embedding_matrix,
                                      pos_embedding_matrix, ner_embedding_matrix)

    model.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['accuracy'])
    model.summary()
    plot_model(model, rankdir='TB', show_shapes=True, show_dtype=True,
               to_file=f"{MODELS_DIR}/{MODEL}.png")

    # Targets: one-hot vectors of length MAX_CONTEXT_LENGTH marking the start
    # and end token index of each answer.
    tr_s_one = one_hot(train_df1.s_idx, depth=MAX_CONTEXT_LENGTH)
    tr_e_one = one_hot(train_df1.e_idx, depth=MAX_CONTEXT_LENGTH)
    val_s_one = one_hot(val_df1.s_idx, depth=MAX_CONTEXT_LENGTH)
    val_e_one = one_hot(val_df1.e_idx, depth=MAX_CONTEXT_LENGTH)

    y_tr = {'start': tr_s_one, 'end': tr_e_one}
    y_val = {'start': val_s_one, 'end': val_e_one}

    # Stop after 5 epochs without improvement and roll back to the best weights.
    mycb = EarlyStopping(patience=5, restore_best_weights=True)
    model.fit(x_tr, y_tr, validation_data=(x_val, y_val),
              epochs=EPOCHS, batch_size=BATCH_SIZE, callbacks=[mycb])

    # File names: drqa_weights.h5 / bidaf_weights.h5 / our_model_weights.h5
    model.save_weights(f"{MODELS_DIR}/{MODEL}_weights.h5")

Creating dictionaries for POS tags...
Creating dictionaries for NER tags...
Extracting features for Train Set
Computing original exact match...
Computing lowercase exact match...
Computing lemmatized exact match...
Computing normalized TF...
Computing POS tags...
Padding POS sequences...
Computing NER tags...
Padding NER sequences...
Extracting features for Validation Set
Computing original exact match...
Computing lowercase exact match...
Computing lemmatized exact match...
Computing normalized TF...
Computing POS tags...
Padding POS sequences...
Computing NER tags...
Padding NER sequences...
Computing character-level embeddings...
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
context (InputLayer)            [(None, 569)]        0                                            
_______________________________________________



In [14]:
# Prepare the test split the same way as the training/validation splits:
# preprocess a copy, pad, compute answer spans, and build model inputs/targets.
test_df1, test_tmp1 = preprocess.apply_preprocessing(test_df.copy(), PREPROCESSING_PIPELINE1)

ts_context_padded, ts_answer_padded, ts_question_padded = (
    utils.pad(test_df1.context, df_tokenizer, MAX_CONTEXT_LENGTH),
    utils.pad(test_df1.text, df_tokenizer, MAX_TEXT_LENGTH),
    utils.pad(test_df1.question, df_tokenizer, MAX_QUESTION_LENGTH),
)

# s_idx: token count of the preprocessed raw context before `answer_start`;
# e_idx: s_idx plus the token count of the preprocessed answer, minus one.
test_df1['s_idx'] = test_df.apply(
    lambda row: len(
        preprocess.preprocessing(row.context[:row.answer_start], PREPROCESSING_PIPELINE1).split()
    ),
    axis=1,
)
test_df1['e_idx'] = test_df1.apply(
    lambda row: row.s_idx + len(row.text.split()) - 1,
    axis=1,
)

ts_s_one = one_hot(test_df1.s_idx, depth=MAX_CONTEXT_LENGTH)
ts_e_one = one_hot(test_df1.e_idx, depth=MAX_CONTEXT_LENGTH)

# Inputs/targets shared by every model; the feature-based models get four
# additional inputs below. `tag2idx` / `ner2idx` come from the training cell.
x_ts = {'context': ts_context_padded, 'question': ts_question_padded}
y_ts = {'start': ts_s_one, 'end': ts_e_one}

if MODEL in ('drqa', 'our_model'):
    ts_em_input = utils.compute_exact_match(test_df1, MAX_CONTEXT_LENGTH)
    ts_tf_input = utils.compute_tf(test_df1, MAX_CONTEXT_LENGTH)
    ts_pos_input = utils.compute_pos(test_df1, tag2idx, MAX_CONTEXT_LENGTH)
    ts_ner_input = utils.compute_ner(test_df1, ner2idx, MAX_CONTEXT_LENGTH)

    x_ts.update({'pos': ts_pos_input, 'ner': ts_ner_input,
                 'em': ts_em_input, 'tf': ts_tf_input})

Computing original exact match...
Computing lowercase exact match...
Computing lemmatized exact match...
Computing normalized TF...
Computing POS tags...
Padding POS sequences...
Computing NER tags...
Padding NER sequences...


In [15]:
# Evaluate the trained model on the test split, export predictions for every
# split as JSON, and report F1 / precision / recall on the validation split.
print("Evalutating model...")
evaluation = model.evaluate(x_ts, y_ts, batch_size=BATCH_SIZE)
print(evaluation)

# Stack the three raw splits in a fixed order (train, validation, test).
# `axis` is passed by keyword: passing it positionally is deprecated in pandas.
df = pd.concat([train_df, val_df, test_df], axis=0, ignore_index=True)

# Stack every model input in the SAME order as the rows of `df`.
# Merging the dicts with {**x_tr, **x_val, **x_ts} does not do this: the three
# dicts share their keys, so the merge silently keeps only the test inputs.
# NOTE(review): assumes utils.computing_predictions expects `x` to be
# row-aligned with `df` - confirm against its implementation.
x = {key: np.concatenate([x_tr[key], x_val[key], x_ts[key]], axis=0) for key in x_ts}
predictions = utils.computing_predictions(model, df, x, BATCH_SIZE)

print("Saving predictions as json...")
with open('predictions.json', 'w', encoding='utf-8') as outfile:
    json.dump(predictions, outfile)

f1, precision, recall = utils.evaluate_model(model, MAX_CONTEXT_LENGTH, val_df1, x_val)
print(f"F1: {f1}\t Precision: {precision}\t Recall: {recall}\t")

Evalutating model...
[8.327722549438477, 4.146247863769531, 4.181472301483154, 0.12131715565919876, 0.13344886898994446]
Preprocessing on datasets...
Applying expand_contractions2, tokenization_spacy, remove_chars, split_alpha_num_sym and strip_text.


  df = pd.concat([train_df, val_df, test_df], 0, ignore_index=True)


Calculating predictions...
Computing answers...
Saving predictions as json...
Computing F1 score, precision and recall...
F1: 0.15346663335724842	 Precision: 0.175543284805333	 Recall: 0.20210205902976985	


In [17]:
# Report whether TensorFlow can see at least one GPU.
# tf.test.is_gpu_available is deprecated; TensorFlow's own deprecation notice
# recommends tf.config.list_physical_devices('GPU') instead.
from tensorflow.config import list_physical_devices
bool(list_physical_devices('GPU'))

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


True

In [19]:
# Show the GPU devices visible to TensorFlow (the list is the cell's output).
from tensorflow.config import list_physical_devices
gpu_devices = list_physical_devices('GPU')
gpu_devices

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]