# Building the Question Answering System

In [45]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from nltk.tokenize import word_tokenize
import keras

# Preprocessing

## Reading in Data

In [2]:
train_df=pd.read_json("C:/Users/Lukas Buteliauskas/Desktop/training_data.json").reset_index(drop=True)
dev_df=pd.read_json("C:/Users/Lukas Buteliauskas/Desktop/validation_data.json").reset_index(drop=True)

## Word Vectorization
To be able to use words, phrases, questions or other natural language constructs in our model we require a to provide our neural network a numerical representation of our words (as these are the elemental NLP 'particles'). The simplest implementation would be to use 'one hot encoding' and define each word as a vector the size of our dictionary (the number of unique words found in our collection of documents, our corpus). However, this approach will most likely be insufficient for the purposes of a question answering system. word2vec and GloVe are 2 popular choices sophisticated options for word embeddings that also capture word similarities. I will not go into the details of either architecture other than to say that we will not be re-training the word vectors due to the insufficient size of the dataset, and we will begin with the GloVe word embeddings due to it's superior performance in most 'downstream' modelling tasks. Having said that, given the simplicity of swapping word vector representations we will also test out performance with word2vec (providing we can do so in a time-efficient manner).

Info and download links for GloVe can be found at: https://nlp.stanford.edu/projects/glove/

### Defining Custom Functions
For the purpose of not repeating code, avoiding bugs and developing good programming practice/design.

In [55]:
def get_word_vector_dict(url_or_path):
    """Takes a URL or a local path and returns a dictionary of GloVe word vectors where the key is the word and the value is the 
    word vector with the dimension specified in the input file."""
    
    with open(url_or_path, encoding="utf8") as glove_text:
        word_lines=glove_text.readlines()
    word_embeddings=[line.split(" ") for line in word_lines]
    word_vector_dict={element[0]:list(map(float, element[1:])) for element in word_embeddings}
    
    return word_vector_dict


def get_word_vector_df(url_path_or_dict):
    """Takes a URL or path like the previous function, or can take a word vector dictionary and returns a word vector dataframe.
    Rows of the dataframe are the word vectors, columns are the dimensions of the word vector, indices are the words."""
    
    if type(url_path_or_dict) is str:
        with open(url_path_or_dict, encoding="utf8") as glove_text:
            word_lines=glove_text.readlines()
        word_embeddings=[line.split(" ") for line in word_lines]
        word_vector_dict={element[0]:list(map(float, element[1:])) for element in word_embeddings}
        word_vector_df=pd.DataFrame(word_vector_dict).transpose()
    
    else:
        word_vector_df=pd.DataFrame(url_path_or_dict).transpose()
    
    return word_vector_df


def get_unique_tokens(df):
    """Given a dataframe containing contexts, questions and answers, the function returns a list of unique tokens."""
    pieces_of_text=list(df["context"].unique()) + list(df["title"].unique()) + list(df["question"].unique()) 
    pieces_of_text+=list(df["answer_text"].unique())

    non_unique_tokens=[]
    for text in pieces_of_text:
        temp_tokens=word_tokenize(text)
        non_unique_tokens.append(temp_tokens)

    unique_tokens=set()
    for token in non_unique_tokens:
        unique_tokens.update(set(token))
    
    return [token.replace("``", '"').replace("''", '"').lower() for token in list(unique_tokens)]

def split_keep_sep(tokens, sep):
    """Takes a string or a list of tokens, and splits on 'sep' while keeping sep."""
    split_unique_tokens=[]
    for token in tokens:
        for sub_token in re.split("("+ sep + ")", token):
            split_unique_tokens.append(sub_token)
    return split_unique_tokens

### Setting up the Word Vectors
As mentioned above with regards to what model we use for the word vectors, it's important to note that the dimention of the word vectors is a hyperparameter of the Neural Networks to come, so to keep our options open we imported a few different word vectors representations and the custom functions defined above make this a 'one line of code' affair (dictionary or dataframe).


In [4]:
word_vector_50_dict=get_word_vector_dict("C:/Users/Lukas Buteliauskas/Desktop/glove.6B.50d.txt")
word_vector_50_df=get_word_vector_df(word_vector_50_dict)
vocab=np.array(word_vector_50_dict.keys()) #400k words as per the documentation.

word_vector_100_dict=get_word_vector_dict("C:/Users/Lukas Buteliauskas/Desktop/glove.6B.100d.txt")
word_vector_100_df=get_word_vector_df(word_vector_100_dict)

In [5]:
"""word_vector_200_dict=get_word_vector_dict("C:/Users/Lukas Buteliauskas/Desktop/glove.6B.200d.txt")
word_vector_200_df=get_word_vector_df(word_vector_200_dict)

word_vector_300_dict=get_word_vector_dict("C:/Users/Lukas Buteliauskas/Desktop/glove.6B.300d.txt")
word_vector_300_df=get_word_vector_df(word_vector_300_dict)"""

'word_vector_200_dict=get_word_vector_dict("C:/Users/Lukas Buteliauskas/Desktop/glove.6B.200d.txt")\nword_vector_200_df=get_word_vector_df(word_vector_200_dict)\n\nword_vector_300_dict=get_word_vector_dict("C:/Users/Lukas Buteliauskas/Desktop/glove.6B.300d.txt")\nword_vector_300_df=get_word_vector_df(word_vector_300_dict)'

## Ensuring all tokens have word embeddings

In [6]:
""" Main goal is to vectorize stuff into meaningful tokens."""
"""We need to deal with the tokens that don't have word embeddings.
    1. Deal with numbers
    2. Deal with spelling mistakes.
    3. Deal with ...
    4. Deal with words that just don't have a word embedding representation in GloVe.
    """
unique_tokens=get_unique_tokens(train_df)

In [79]:
no_embeddings=[token for token in unique_tokens if token not in word_vector_50_dict.keys()]
split_unique_tokens=split_keep_sep(unique_tokens, "-")
no_embeddings_new=[token for token in split_unique_tokens if token not in word_vector_50_dict.keys()]
print("Number of tokens with no embedding in GloVe (with the current tokenization):")
print("with '-' with words:",len(no_embeddings),"\nwith '-' seperate:", len(no_embeddings_new),"\n")
print(no_embeddings_new[200:300])


Number of tokens with no embedding in GloVe (with the current tokenization):
with '-' with words: 31521 
with '-' seperate: 25146 

['£17', 'vergeltungswaffe', 'maršal', '1595–1610', 'hamaynker', '£168.9', 'ozna', 'vented—a', 'kashua', 'parade—it', 'estuleks', 'hubway', "n'namdi", 'ˈaɪlənd/', 'malinoswki', 'znbr', 'drinkinghouse', 'icristat', 'kinseys', 'برادری\u200e', '15408', 'pratyahara', 'majahan', 'paris–along', '3,569.8', 'mamillaria', 'acetogenesis', '£320', '1up.com', '21–28', '', '73.9549', 'c.s.c', 'reggane', 'longmenshan', '', 'cohering', 'sichuans', 'annullment', 'bros.h', 'lightolier', 'opsonic', 'bryennius', 'rhinosinusitis', 'relatioship', '110,970', 'stadtstaaten', 'ii—during', 'vélhop', 'xyndas', 'endownment', 'extremetech', 'tironian', '45–116', 'prviously', 'chané', 'flag—consisting', 'cftk', 'proteobacterium', '64cu', 'centers—the', "'almond", 'chemotrophy', 'castleereagh', 'matais', 'mauveine', 'whiteheadian', '', '1938–42', 'tukharas', '440s', '−14', '67bn', 'hagk

In [74]:
print(train_df.loc[0,["context"]].values,"\n")
first_chap_tokens=word_tokenize(train_df.loc[0,["context"]][0])

print(first_chap)

[ 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'] 

['Beyoncé', 'Giselle', 'Knowles-Carter', '(', '/biːˈjɒnseɪ/', 'bee-YON-say', ')', '(', 'born', 'September', '4', ',', '1981', ')', 'is', 'an', 'American', 'singer', ',', 'songwriter', ',', 'record', 'producer', 'and', 'actress', '.', 'Born', 'and', 'raised', 'in', 'Houston', ',', 'Texas', ',', 