In [1]:
import pandas as pd
from keras.utils.vis_utils import plot_model
from tensorflow.python.keras.callbacks import EarlyStopping
from tensorflow.python.keras.utils.np_utils import to_categorical
from tensorflow import one_hot
import load_data
import preprocess
import utils
import drqa_model
import bidaf_model
import our_model
import sys
import io
import json

from os.path import isfile
import numpy as np
from tensorflow.keras.preprocessing.text import tokenizer_from_json 
from settings import EMBEDDING_DIM, MODEL, EPOCHS, BATCH_SIZE, MODELS_DIR

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\userl\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')


In [2]:
print("Loading dataset...")
dataframe = load_data.load_dataset()
print(dataframe.shape)

Loading dataset...
(87599, 6)


In [4]:
print("Splitting train and test set...")
train_df, test_df = load_data.split_test_set(dataframe)
print(train_df.shape, test_df.shape)

Splitting train and test set...
(78161, 6) (9438, 6)


In [5]:
print("Splitting train and validation set...")
train_df, val_df = load_data.split_validation_set(train_df, rate=0.2)
print(train_df.shape, val_df.shape)

Splitting train and validation set...
(61143, 6) (17018, 6)


In [6]:
PREPROCESSING_PIPELINE1 = [preprocess.expand_contractions,
                           preprocess.tokenization_spacy,
                           preprocess.remove_chars,
                           preprocess.split_alpha_num_sym,
                           preprocess.spell_correction,
                           preprocess.lemmatization,
                           preprocess.lower,
                           preprocess.strip_text]

print("Preprocessing training data...")
train_df1 = train_df.copy()
train_df1, train_tmp1 = preprocess.apply_preprocessing(train_df1, PREPROCESSING_PIPELINE1)

print("Preprocessing validation data...")
val_df1 = val_df.copy()
val_df1, val_tmp1 = preprocess.apply_preprocessing(val_df1, PREPROCESSING_PIPELINE1)

Preprocessing training data...
Preprocessing validation data...


In [7]:
#import importlib
#importlib.reload(utils)

<module 'utils' from 'C:\\Users\\userl\\Documents\\GitHub\\FFSquad\\utils.py'>

In [8]:
# load already saved content or compute it from scratch
load = (isfile(f"{MODELS_DIR}/word_listing.csv") and 
        isfile(f"{MODELS_DIR}/word2idx.json") and
        isfile(f"{MODELS_DIR}/idx2word.json") and
        isfile(f"{MODELS_DIR}/tokenizer.json") and
        isfile(f"{MODELS_DIR}/embedding_matrix.csv") and
        # char embedding matrix is loaded only in case of 'our_model' or 'bidaf'
        (not(MODEL == "our_model" or MODEL == 'bidaf') or
         isfile(f"{MODELS_DIR}/char_embedding_matrix.csv")))
print("load:", load)

if load:
    print("Loading matrices, tokenizers and dictionaries... ")
    #load pre-saved 
    df_word_listing = np.genfromtxt(f"{MODELS_DIR}/word_listing.csv", delimiter=',', encoding='utf-8', dtype='str')
    
    with open(f"{MODELS_DIR}/word2idx.json") as f:
        df_word_to_idx = json.load(f)

    with open(f"{MODELS_DIR}/idx2word.json") as f:
        df_idx_to_word = json.load(f)

    with open(f"{MODELS_DIR}/tokenizer.json") as f:
        tokenizer_json = json.load(f)
        df_tokenizer = tokenizer_from_json(tokenizer_json)

    embedding_matrix = np.genfromtxt(f"{MODELS_DIR}/embedding_matrix.csv", delimiter=',')
    print("Done")
          
else:
    #compute 
    print("Computing matrices, tokenizers and dictionaries... ")
    embedding_matrix, df_word_listing, df_tokenizer, df_word_to_idx, df_idx_to_word = utils.get_embedding_matrix(train_df1, EMBEDDING_DIM)
    
    np.savetxt(f"{MODELS_DIR}/embedding_matrix.csv", embedding_matrix, delimiter=",")
    np.savetxt(f"{MODELS_DIR}/word_listing.csv", df_word_listing, delimiter=",", fmt ="%s", encoding='utf-8')

    with open(f"{MODELS_DIR}/word2idx.json", 'w') as f:
        json.dump(df_word_to_idx, f)

    with open(f"{MODELS_DIR}/idx2word.json", 'w') as f:
        json.dump(df_idx_to_word, f)

    tokenizer_json = df_tokenizer.to_json()
    with io.open(f"{MODELS_DIR}/tokenizer.json", 'w', encoding='utf-8') as f:
        f.write(json.dumps(tokenizer_json, ensure_ascii=False))
    print("Done")

load: False
Computing matrices, tokenizers and dictionaries... 
Loading GloVe embedding model...
There are 53939 words for which we already know the embedding
There are 11303 oov words
Computing out-of-vocabulary embeddings...
Computing embedding matrix...
Done


In [9]:
df_idx_to_word = dict(zip([int(k) for k in df_idx_to_word.keys()], df_idx_to_word.values()))

In [10]:
MAX_CONTEXT_LENGTH, MAX_TEXT_LENGTH, MAX_QUESTION_LENGTH = utils.get_max_length(train_df1)

#MAX_CONTEXT_LENGTH, MAX_TEXT_LENGTH, MAX_QUESTION_LENGTH = 662, 43, 40
print("Padding data...")
tr_context_padded = utils.pad(train_df1.context, df_tokenizer, MAX_CONTEXT_LENGTH)
tr_answer_padded = utils.pad(train_df1.text, df_tokenizer, MAX_TEXT_LENGTH)
tr_question_padded = utils.pad(train_df1.question, df_tokenizer, MAX_QUESTION_LENGTH)

val_context_padded = utils.pad(val_df1.context, df_tokenizer, MAX_CONTEXT_LENGTH)
val_answer_padded = utils.pad(val_df1.text, df_tokenizer, MAX_TEXT_LENGTH)
val_question_padded = utils.pad(val_df1.question, df_tokenizer, MAX_QUESTION_LENGTH)

Max length for context is 662
Max length adopted for context is 728
Max length for answer is 43
Max length adopted for answer is 47
Max length for question is 40
Max length adopted for question is 44
Padding data...


In [11]:
print("Computing start and end indices... ")
train_df1['s_idx'] = train_df.apply(
    lambda x: len(preprocess.preprocessing(x.context[:x.answer_start], PREPROCESSING_PIPELINE1).split()), axis=1)
train_df1['e_idx'] = train_df1.apply(lambda x: x.s_idx + len(x.text.split()) - 1, axis=1)

val_df1['s_idx'] = val_df.apply(
    lambda x: len(preprocess.preprocessing(x.context[:x.answer_start], PREPROCESSING_PIPELINE1).split()), axis=1)
val_df1['e_idx'] = val_df1.apply(lambda x: x.s_idx + len(x.text.split()) - 1, axis=1)
print("Done")

Computing start and end indices... 
Done


In [12]:
#save weights instead of saving entire model

if MODEL == 'basemodel' or MODEL == None:
    pass

elif MODEL == 'drqa':
    tag2idx, idx2tag = utils.create_pos_dicts()
    ner2idx, idx2ner = utils.create_ner_dicts()

    pos_embedding_matrix = to_categorical(list(idx2tag.keys()))
    ner_embedding_matrix = to_categorical(list(idx2ner.keys()))

    print("Extracting features for Train Set")
    train_em_input = utils.compute_exact_match(train_df1, MAX_CONTEXT_LENGTH)
    train_tf_input = utils.compute_tf(train_df1, MAX_CONTEXT_LENGTH)
    train_pos_input = utils.compute_pos(train_df1, tag2idx, MAX_CONTEXT_LENGTH)
    train_ner_input = utils.compute_ner(train_df1, ner2idx, MAX_CONTEXT_LENGTH)

    print("Extracting features for Validation Set")
    val_em_input = utils.compute_exact_match(val_df1, MAX_CONTEXT_LENGTH)
    val_tf_input = utils.compute_tf(val_df1, MAX_CONTEXT_LENGTH)
    val_pos_input = utils.compute_pos(val_df1, tag2idx, MAX_CONTEXT_LENGTH)
    val_ner_input = utils.compute_ner(val_df1, ner2idx, MAX_CONTEXT_LENGTH)


    model = drqa_model.build_model(MAX_QUESTION_LENGTH, MAX_CONTEXT_LENGTH, EMBEDDING_DIM, embedding_matrix,
                pos_embedding_matrix, ner_embedding_matrix)

    model.compile(loss='categorical_crossentropy', optimizer='nadam', metrics='accuracy')
    model.summary()
    plot_model(model, rankdir='TB', show_shapes=True, show_dtype=True, to_file=f"{MODELS_DIR}/drqa.png")

    tr_s_one = one_hot(train_df1.s_idx, depth=MAX_CONTEXT_LENGTH)
    tr_e_one = one_hot(train_df1.e_idx, depth=MAX_CONTEXT_LENGTH)
    val_s_one = one_hot(val_df1.s_idx, depth=MAX_CONTEXT_LENGTH)
    val_e_one = one_hot(val_df1.e_idx, depth=MAX_CONTEXT_LENGTH)

    x_tr = {'context': tr_context_padded, 'question': tr_question_padded, 'pos': train_pos_input,
            'ner': train_ner_input, 'em': train_em_input, 'tf': train_tf_input}
    x_val = {'context': val_context_padded, 'question': val_question_padded, 'pos': val_pos_input,
             'ner': val_ner_input, 'em': val_em_input, 'tf': val_tf_input}

    y_tr = {'start': tr_s_one, 'end': tr_e_one}
    y_val = {'start': val_s_one, 'end': val_e_one}

    #mycb = EarlyStopping(patience=5, restore_best_weights=True)
    #model.fit(x_tr, y_tr, validation_data=(x_val, y_val), epochs=EPOCHS, batch_size=BATCH_SIZE, callbacks=[mycb])
    #model.save_weights(f"{MODELS_DIR}/drqa_weights.h5")

elif MODEL == "bidaf":

    char_embedding_matrix = utils.get_char_embeddings(df_word_listing, df_word_to_idx)

    model = bidaf_model.build_model(MAX_QUESTION_LENGTH, MAX_CONTEXT_LENGTH, EMBEDDING_DIM, embedding_matrix, char_embedding_matrix)

    model.compile(loss='categorical_crossentropy', optimizer='nadam', metrics='accuracy')
    model.summary()
    plot_model(model, rankdir='TB', show_shapes=True, show_dtype=True, to_file=f"{MODELS_DIR}/bidaf.png")

    tr_s_one = one_hot(train_df1.s_idx, depth=MAX_CONTEXT_LENGTH)
    tr_e_one = one_hot(train_df1.e_idx, depth=MAX_CONTEXT_LENGTH)
    val_s_one = one_hot(val_df1.s_idx, depth=MAX_CONTEXT_LENGTH)
    val_e_one = one_hot(val_df1.e_idx, depth=MAX_CONTEXT_LENGTH)

    x_tr = {'context': tr_context_padded, 'question': tr_question_padded}
    x_val = {'context': val_context_padded, 'question': val_question_padded}

    y_tr = {'start': tr_s_one, 'end': tr_e_one}
    y_val = {'start': val_s_one, 'end': val_e_one}

    #mycb = EarlyStopping(patience=5, restore_best_weights=True)
    #model.fit(x_tr, y_tr, validation_data=(x_val, y_val), epochs=EPOCHS, batch_size=BATCH_SIZE, callbacks=[mycb])
    #model.save(f"{MODELS_DIR}/bidaf_weights.h5")

elif MODEL == "our_model":
    tag2idx, idx2tag = utils.create_pos_dicts()
    ner2idx, idx2ner = utils.create_ner_dicts()

    pos_embedding_matrix = to_categorical(list(idx2tag.keys()))
    ner_embedding_matrix = to_categorical(list(idx2ner.keys()))

    print("Extracting features for Train Set")
    train_em_input = utils.compute_exact_match(train_df1, MAX_CONTEXT_LENGTH)
    train_tf_input = utils.compute_tf(train_df1, MAX_CONTEXT_LENGTH)
    train_pos_input = utils.compute_pos(train_df1, tag2idx, MAX_CONTEXT_LENGTH)
    train_ner_input = utils.compute_ner(train_df1, ner2idx, MAX_CONTEXT_LENGTH)

    print("Extracting features for Validation Set")
    val_em_input = utils.compute_exact_match(val_df1, MAX_CONTEXT_LENGTH)
    val_tf_input = utils.compute_tf(val_df1, MAX_CONTEXT_LENGTH)
    val_pos_input = utils.compute_pos(val_df1, tag2idx, MAX_CONTEXT_LENGTH)
    val_ner_input = utils.compute_ner(val_df1, ner2idx, MAX_CONTEXT_LENGTH)

    if isfile(f"{MODELS_DIR}/char_embedding_matrix.csv"):
        char_embedding_matrix = np.genfromtxt(f"{MODELS_DIR}/char_embedding_matrix.csv", delimiter=',')
    else:
        char_embedding_matrix = utils.get_char_embeddings(df_word_listing, df_word_to_idx)
        np.savetxt(f"{MODELS_DIR}/char_embedding_matrix.csv", char_embedding_matrix, delimiter=",")

    model = our_model.build_model(MAX_QUESTION_LENGTH, MAX_CONTEXT_LENGTH, EMBEDDING_DIM,
                                  embedding_matrix, char_embedding_matrix, pos_embedding_matrix, ner_embedding_matrix)

    model.compile(loss='categorical_crossentropy', optimizer='nadam', metrics='accuracy')
    model.summary()
    plot_model(model, rankdir='TB', show_shapes=True, show_dtype=True, to_file=f"{MODELS_DIR}/our_model.png")

    tr_s_one = one_hot(train_df1.s_idx, depth=MAX_CONTEXT_LENGTH)
    tr_e_one = one_hot(train_df1.e_idx, depth=MAX_CONTEXT_LENGTH)
    val_s_one = one_hot(val_df1.s_idx, depth=MAX_CONTEXT_LENGTH)
    val_e_one = one_hot(val_df1.e_idx, depth=MAX_CONTEXT_LENGTH)

    x_tr = {'context': tr_context_padded, 'question': tr_question_padded, 'pos': train_pos_input,
            'ner': train_ner_input, 'em': train_em_input, 'tf': train_tf_input}
    x_val = {'context': val_context_padded, 'question': val_question_padded, 'pos': val_pos_input,
             'ner': val_ner_input, 'em': val_em_input, 'tf': val_tf_input}

    y_tr = {'start': tr_s_one, 'end': tr_e_one}
    y_val = {'start': val_s_one, 'end': val_e_one}

    #mycb = EarlyStopping(patience=5, restore_best_weights=True)
    #model.fit(x_tr, y_tr, validation_data=(x_val, y_val), epochs=EPOCHS, batch_size=BATCH_SIZE, callbacks=[mycb])
    #model.save_weights(f"{MODELS_DIR}/our_model_weights.h5")

Creating dictionaries for POS tags...
Creating dictionaries for NER tags...
Extracting features for Train Set
Computing original exact match...
Computing lowercase exact match...
Computing lemmatized exact match...
Computing TF...
Computing POS tags...
Padding POS sequences...
Computing NER tags...
Padding NER sequences...
Extracting features for Validation Set
Computing original exact match...
Computing lowercase exact match...
Computing lemmatized exact match...
Computing TF...
Computing POS tags...
Padding POS sequences...
Computing NER tags...
Padding NER sequences...
Computing character-level embeddings...
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
context (InputLayer)            [(None, 728)]        0                                            
_____________________________________________________________________

In [16]:
#import importlib
#importlib.reload(our_model)

<module 'our_model' from 'C:\\Users\\userl\\Documents\\GitHub\\FFSquad\\our_model.py'>

In [17]:
model = our_model.build_model(MAX_QUESTION_LENGTH, MAX_CONTEXT_LENGTH, EMBEDDING_DIM,
                                  embedding_matrix, char_embedding_matrix, pos_embedding_matrix, ner_embedding_matrix)

model.compile(loss='categorical_crossentropy', optimizer='nadam', metrics='accuracy')
model.summary()
plot_model(model, rankdir='TB', show_shapes=True, show_dtype=True, to_file=f"{MODELS_DIR}/our_model.png")


mycb = EarlyStopping(patience=5, restore_best_weights=True)
model.fit(x_tr, y_tr, validation_data=(x_val, y_val), epochs=1, batch_size=BATCH_SIZE, callbacks=[mycb])
model.save_weights(f"{MODELS_DIR}/{MODEL}_weights.h5")

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
context (InputLayer)            [(None, 728)]        0                                            
__________________________________________________________________________________________________
question (InputLayer)           [(None, 44)]         0                                            
__________________________________________________________________________________________________
p_encoding (Embedding)          (None, 728, 200)     13048800    context[0][0]                    
__________________________________________________________________________________________________
char_p_encoding (Embedding)     (None, 728, 50)      3262200     context[0][0]                    
____________________________________________________________________________________________



In [18]:
#2nd epoch
model.fit(x_tr, y_tr, validation_data=(x_val, y_val), epochs=1, batch_size=BATCH_SIZE, callbacks=[mycb])



KeyboardInterrupt: 

In [None]:
#model.save_weights(f"{MODELS_DIR}/{MODEL}_weights.h5")

In [None]:
test_df1 = test_df.copy()
test_df1, test_tmp1 = preprocess.apply_preprocessing(test_df1, PREPROCESSING_PIPELINE1)

ts_context_padded = utils.pad(test_df1.context, df_tokenizer, MAX_CONTEXT_LENGTH)
ts_answer_padded = utils.pad(test_df1.text, df_tokenizer, MAX_TEXT_LENGTH)
ts_question_padded = utils.pad(test_df1.question, df_tokenizer, MAX_QUESTION_LENGTH)

test_df1['s_idx'] = test_df.apply(
    lambda x: len(preprocess.preprocessing(x.context[:x.answer_start], PREPROCESSING_PIPELINE1).split()), axis=1)
test_df1['e_idx'] = test_df1.apply(lambda x: x.s_idx + len(x.text.split()) - 1, axis=1)

ts_s_one = one_hot(test_df1.s_idx, depth=MAX_CONTEXT_LENGTH)
ts_e_one = one_hot(test_df1.e_idx, depth=MAX_CONTEXT_LENGTH)

if MODEL == 'drqa' or MODEL == "our_model":

    ts_em_input = utils.compute_exact_match(test_df1, MAX_CONTEXT_LENGTH)
    ts_tf_input = utils.compute_tf(test_df1, MAX_CONTEXT_LENGTH)
    ts_pos_input = utils.compute_pos(test_df1, tag2idx, MAX_CONTEXT_LENGTH)
    ts_ner_input = utils.compute_ner(test_df1, ner2idx, MAX_CONTEXT_LENGTH)

    x_ts = {'context': ts_context_padded, 'question': ts_question_padded, 'pos': ts_pos_input,
            'ner': ts_ner_input, 'em': ts_em_input, 'tf': ts_tf_input}
    y_ts = {'start': ts_s_one, 'end': ts_e_one}
else:
    x_ts = {'context': ts_context_padded, 'question': ts_question_padded}
    y_ts = {'start': ts_s_one, 'end': ts_e_one}

In [None]:
print("Evalutating model...")
evaluation = model.evaluate(x_ts, y_ts, batch_size=BATCH_SIZE)
print(evaluation)


df = pd.concat([train_df, val_df, test_df], 0, ignore_index=True)
x = {**x_tr, **x_val, **x_ts}
predictions = utils.computing_predictions(model, df, x, BATCH_SIZE)

#predictions = utils.computing_predictions(model, train_df, val_df, test_df, x_tr, x_val, x_ts)

print("Saving predictions as json...")
with open('predictions.json', 'w') as outfile:
    json.dump(predictions, outfile)

f1, precision, recall = utils.evaluate_model(model, MAX_CONTEXT_LENGTH, val_df1, x_val)
print(f"F1: {f1}\t Precision: {precision}\t Recall: {recall}\t")