In [1]:
'''
Question Answering (QA) System using NLP with SQuAD
EE562 Group3 Project
Megha Chandra Nandyala
Amisha Himanshu Somaiya


APPROACH       : Traditional Machine Learning Approaches
                 - Multinomial Logistic Regression
                 - Random Forest

PRE-PROCESSING :
Using NLTK-Punkt, Textblob, InferSent and GLoVE
Before giving the inputs to the model, the data needs to be pre-processed.
First, we convert the passage to blobs of paragraphs using an instance of textblob library.
Paragraphs are then split to sentences and a maximum length is pre-determined based on model capability.
If the sentence length is more than this, it is truncated and if it is less, the sentence is padded with special padding tokens.
The sentences are then split to words and tokenized. We are using the Punkt tokenizer from NLTK.
Then a vocabulary of words is generated using glove model from Facebook that uses GLOVE word to vector embeddings.
Then, separate embeddings are generated for the passage and for the questions.
Embeddings are vectors that group words with similar context together based on their similarity score.
For eg, apple and banana will have the same grouping but apple and car will not.

HAND-CRAFTED FEATURES :
Generate features of Euclidean Distance and Cosine Similarity and append to the dataframe

FIT and PREDICT :
Use the resultant dataframes with context, question, sentence embeddings, question embeddings, and hand-crafted features to fit and predict on 2 classifiers :
multinomial logistic regression and random forest.

METRICS :
Evaluate using F1 and EM scores.

REFERENCES :
https://arxiv.org/abs/1705.02364
https://github.com/facebookresearch/InferSent
https://nlp.stanford.edu/projects/glove/
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
https://www.v7labs.com/blog/f1-score-guide
https://huggingface.co/spaces/evaluate-metric/exact_match

'''



!pip install -U scikit-learn


Collecting scikit-learn
  Downloading scikit_learn-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m77.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
Successfully installed scikit-learn-1.3.2


In [3]:
# import libraries
import pandas as pd
import numpy as np
import pickle
from textblob import TextBlob
import torch

from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import cosine_similarity

import numpy as np
from numpy import dot
from numpy.linalg import norm
import joblib


import warnings
warnings.filterwarnings('ignore')

In [4]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [8]:
import sys
# sys.path.append('/content/drive/MyDrive/qa-nlp/')  #megha
sys.path.append('/content/drive/MyDrive/EE562_Group3_Project/multinomial_and_randomforest/')    #amisha

In [6]:
# Use the Punkt unsupervised tokenizer from the NLTK (Natural Language ToolKit) library
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [10]:
# Use Infersent model from Facebook to create vocabulary and separate embeddings for question and for context
# Infersent uses GLoVE pre-trained word to vector embeddings
# Referred https://github.com/facebookresearch/InferSent
from models import InferSent

# megha
# V = 1
# MODEL_PATH = '/content/drive/MyDrive/qa-nlp/encoder/infersent%s.pkl' % V
# params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
#                 'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
# infersent = InferSent(params_model)
# infersent.load_state_dict(torch.load(MODEL_PATH))
# W2V_PATH = '/content/drive/MyDrive/qa-nlp/GloVe/glove.840B.300d.txt'
# infersent.set_w2v_path(W2V_PATH)


# amisha
V = 1
MODEL_PATH = '/content/drive/MyDrive/EE562_Group3_Project/multinomial_and_randomforest/encoder/infersent%s.pkl' % V
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
W2V_PATH = '/content/drive/MyDrive/EE562_Group3_Project/multinomial_and_randomforest/GloVe/glove.840B.300d.txt'
infersent.set_w2v_path(W2V_PATH)

In [13]:
# read training data

# megha
# train = pd.read_json('/content/drive/MyDrive/qa-nlp/data/train-v1.1.json')

# amisha
train = pd.read_json('/content/drive/MyDrive/EE562_Group3_Project/multinomial_and_randomforest/data/train-v1.1.json')

In [25]:
# create dataframes and add to them the extracted individual attributes of data
contexts, questions, answers, titles = [], [], [], [] #initializations

def get_attributes(item):  #extract and create separate lists
    data = item['data']
    title = data['title']
    for paragraph in data['paragraphs']:
        for qas in paragraph['qas']:
            answers.append(qas['answers'][0]['text'])
            questions.append(qas['question'])
            contexts.append(paragraph['context'])
            titles.append(title)



def build_dataframe(train):   #create dataframe, later add handcrafted features to this dataframe
    train.apply(get_attributes, axis = 1)
    train_df = pd.DataFrame({
    'contexts':contexts,
    'questions': questions,
    'answers': answers,
    'titles': titles
})
    return train_df

train_df = build_dataframe(train)
train_df = train_df.head(5000)
train_df.shape

(5000, 4)

In [26]:
# The blobs i.e. paragraphs are further split into sentences for processing
train_df['sentences'] = train_df['contexts'].apply(lambda x : [item.raw for item in TextBlob(x).sentences ])

In [27]:
# custom function to get the answer from sentences
def get_target(item):
    '''
    Builds the target using the index number of answer in the list of sentences
    '''
    for index, sentence in enumerate( item['sentences']):
        if item['answers'] in sentence:
            return index
    return 0

train_df['target'] = train_df.apply(get_target, axis = 1)

In [28]:
# custom function to get all sentences together in a list
def get_all_sentences(sentences):
    all_sentences = []
    sentences = sentences.tolist()
    for context_sentences in sentences:
        for setence in context_sentences:
            all_sentences.append(setence)

    all_sentences = list(dict.fromkeys(all_sentences))
    return all_sentences

In [29]:
# Generate vocabulary using InferSent which uses GLoVE inherently
paras = list(train_df["contexts"].drop_duplicates().reset_index(drop= True))
blob = TextBlob(" ".join(paras))
sentences = get_all_sentences(train_df['sentences'])
infersent.build_vocab(sentences, tokenize=True)

Found 14622(/15388) words with w2v vectors
Vocab size : 14622


In [30]:
#generate separate embeddings for context (i.e. sentences) and for the questions
#sentence Embeddings
dict_embeddings = {}
for i in range(len(sentences)):
    dict_embeddings[sentences[i]] = infersent.encode([sentences[i]], tokenize=True)[0]

#question Embeddings
questions = list(train_df["questions"])
for i in range(len(questions)):
    dict_embeddings[questions[i]] = infersent.encode([questions[i]], tokenize=True)[0]


In [31]:
#method definition to save all embeddings for sentences in a list embeddings
def get_context_embeddings(item):
    embeddings = []
    for sentence in item.sentences:
        embeddings.append(dict_embeddings[sentence])
    return embeddings

In [32]:
#add these embeddings to the dataframe
train_df['question_embedding'] = train_df['questions'].apply(lambda x : dict_embeddings[x])
train_df['context_embedding'] = train_df.apply(get_context_embeddings, axis = 1)

In [33]:
# generate hand-crafted features using Euclidean Distance and Cosine Similarity Score
from sklearn.metrics.pairwise import euclidean_distances

In [34]:
# calculate the metrics
def get_metric(item, metric):
    result = []
    for i in range(0,len(item.sentences)):
        question_embedding = [item.question_embedding]
        sentence_embedding = [item['context_embedding'][i]]

        if metric == 'cosine_similarity':
            metric = cosine_similarity(question_embedding, sentence_embedding)

        if metric == 'euclidean':
            metric = euclidean_distances(question_embedding, sentence_embedding)

        result.append(metric[0][0])
    return result


In [35]:
# add the hand-crafted features to the dataframes
train_df['cosine_similarity'] = train_df.apply(lambda item : get_metric(item, 'cosine_similarity'), axis = 1)
train_df['euclidean'] = train_df.apply(lambda item : get_metric(item, 'euclidean'), axis = 1)

In [36]:
train_df.head()

Unnamed: 0,contexts,questions,answers,titles,sentences,target,question_embedding,context_embedding,cosine_similarity,euclidean
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous,University_of_Notre_Dame,"[Architecturally, the school has a Catholic ch...",5,"[0.1101008, 0.1142294, 0.11560897, 0.054894753...","[[0.055199962, 0.05013141, 0.047870383, 0.0162...","[0.5752636, 0.5752636, 0.5752636, 0.5752636, 0...","[3.8162625, 3.8162625, 3.8162625, 3.8162625, 3..."
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,a copper statue of Christ,University_of_Notre_Dame,"[Architecturally, the school has a Catholic ch...",2,"[0.10951651, 0.11030623, 0.05210006, 0.0305399...","[[0.055199962, 0.05013141, 0.047870383, 0.0162...","[0.5459254, 0.5459254, 0.5459254, 0.5459254, 0...","[3.590196, 3.590196, 3.590196, 3.590196, 3.590..."
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,the Main Building,University_of_Notre_Dame,"[Architecturally, the school has a Catholic ch...",1,"[0.011956469, 0.14930709, 0.028481215, 0.05278...","[[0.055199962, 0.05013141, 0.047870383, 0.0162...","[0.6082523, 0.6082523, 0.6082523, 0.6082523, 0...","[3.4122276, 3.4122276, 3.4122276, 3.4122276, 3..."
3,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,a Marian place of prayer and reflection,University_of_Notre_Dame,"[Architecturally, the school has a Catholic ch...",4,"[0.0711433, 0.054118324, -0.013959841, 0.05310...","[[0.055199962, 0.05013141, 0.047870383, 0.0162...","[0.50993013, 0.50993013, 0.50993013, 0.5099301...","[3.6493201, 3.6493201, 3.6493201, 3.6493201, 3..."
4,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,a golden statue of the Virgin Mary,University_of_Notre_Dame,"[Architecturally, the school has a Catholic ch...",1,"[0.16131133, 0.15654244, 0.08214858, 0.0437286...","[[0.055199962, 0.05013141, 0.047870383, 0.0162...","[0.52223635, 0.52223635, 0.52223635, 0.5222363...","[3.7629066, 3.7629066, 3.7629066, 3.7629066, 3..."


In [37]:
train_df_copy = train_df.copy()

In [38]:
# function to find maximum number of sentences in any context
def find_max_number_of_sentences():
    max_number_of_sentences = 0
    for i in range(0, train_df.shape[0]):
        length = len(train_df.iloc[i].sentences)
        if length > max_number_of_sentences:
            max_number_of_sentences = length
    return max_number_of_sentences

max_number_of_sentences = find_max_number_of_sentences()
max_number_of_sentences


22

In [39]:
# if the length of data is less than the maximum length, add padding to it
def pad(data, max_length):
    length_of_data = len(data)
    pad_number = max_length - length_of_data
    data = data + [np.nan] * pad_number
    return data


In [40]:
# combine the features, add padding if necesssary and return the resultant data
resultant_data = []
def combine_features(item):
    length_of_sentence = len(item.sentences)
    cosine_similarity = item.cosine_similarity
    euclidean = item.euclidean

    if length_of_sentence < max_number_of_sentences:
        euclidean = pad(euclidean, max_number_of_sentences)
        cosine_similarity = pad(cosine_similarity, max_number_of_sentences)

    features = euclidean + cosine_similarity + [item.target]
    resultant_data.append(features)
train_df_copy.apply(combine_features, axis = 1)

resultant_data = pd.DataFrame(resultant_data)

In [41]:
X = resultant_data.iloc[:,:-1]
y = resultant_data.iloc[:,-1]

In [42]:
# train test split
train_x, test_x, train_y, test_y = train_test_split(X, y, train_size=0.8, random_state = 5)

In [43]:
# function to find exact match
def exact_match(y_true, y_pred):
    non_nan_mask = ~np.isnan(y_true)
    em_score = np.array_equal(y_true[non_nan_mask], y_pred[non_nan_mask])
    return em_score

In [44]:
def exact_match(y_true, y_pred):
    non_nan_mask = ~np.isnan(y_true)
    em_score = np.array_equal(y_true[non_nan_mask], y_pred[non_nan_mask])
    return 1 if em_score else 0
total_instances = len(test_y)

In [45]:
from sklearn.impute import SimpleImputer

# Create an imputer
imputer = SimpleImputer(strategy='mean')

# Fit and transform the training data
train_x_imputed = imputer.fit_transform(train_x)
test_x_imputed = imputer.fit_transform(test_x)

mul_lr = linear_model.LogisticRegression(multi_class='multinomial', solver='newton-cg')
mul_lr.fit(train_x_imputed, train_y)

print("Multinomial Logistic regression Train Accuracy : ", metrics.accuracy_score(train_y, mul_lr.predict(train_x_imputed)))
print("Multinomial Logistic regression Test Accuracy : ", metrics.accuracy_score(test_y, mul_lr.predict(test_x_imputed)))

test_y_pred = mul_lr.predict(test_x_imputed)

# Calculate F1 score for multiclass classification
f1 = metrics.f1_score(test_y, test_y_pred, average='weighted')
print("Multinomial Logistic regression Test F1 Score:", f1)

correct_predictions = sum(exact_match(y_true, y_pred) for y_true, y_pred in zip(test_y.values, test_y_pred))

em_score = correct_predictions / total_instances
print("Multinomial Logistic regression Exact Match Score:", em_score)

Multinomial Logistic regression Train Accuracy :  0.374
Multinomial Logistic regression Test Accuracy :  0.374
Multinomial Logistic regression Test F1 Score: 0.22427304569685066
Multinomial Logistic regression Exact Match Score: 0.374


In [46]:
# Random Forest Classifier
rf = RandomForestClassifier(min_samples_leaf=8, n_estimators=60)
rf.fit(train_x_imputed, train_y)

print("Random Forest Classifier Train Accuracy : ", metrics.accuracy_score(train_y, rf.predict(train_x_imputed)))
print("Random Forest Classifier Test Accuracy : ", metrics.accuracy_score(test_y, rf.predict(test_x_imputed)))
print("Random Forest Classifier F1 Score : ", metrics.f1_score(test_y, rf.predict(test_x_imputed), average='weighted'))
correct_predictions = sum(exact_match(y_true, y_pred) for y_true, y_pred in zip(test_y.values, test_y_pred))

em_score = correct_predictions / total_instances
print("Random Forest Classifier Exact Match Score:", em_score)

Random Forest Classifier Train Accuracy :  0.54925
Random Forest Classifier Test Accuracy :  0.375
Random Forest Classifier F1 Score :  0.30240551997552406
Random Forest Classifier Exact Match Score: 0.374
