In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the input directory and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [4]:
%%capture
!pip install wikipedia==1.4.0
!pip install scikit-learn==1.0.2
!pip install gensim==4.0.1

In [5]:
import json
import numpy as np
import pandas as pd

In [6]:
# based on: https://www.kaggle.com/code/sanjay11100/squad-stanford-q-a-json-to-pandas-dataframe
def squad_json_to_dataframe(file_path, record_path=['data','paragraphs','qas','answers']):
    """
    Flatten a SQuAD-style json file into a DataFrame with one row per question.

    file_path: path to the squad json file.
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']. The list is only sliced, never mutated.

    Returns a DataFrame with the columns id, question, context, answers and
    c_id, where c_id is an integer code shared by all rows with the same context.
    """
    # Context manager closes the handle deterministically; explicit UTF-8 keeps
    # decoding independent of the platform's default locale encoding.
    with open(file_path, encoding='utf-8') as fh:
        file = json.load(fh)
    # question level: one row per entry of 'qas'
    m = pd.json_normalize(file, record_path[:-1])
    # paragraph level: one row per context, with its list of questions in 'qas'
    r = pd.json_normalize(file, record_path[:-2])
    # repeat each context once per question it holds so it lines up row-for-row with m
    idx = np.repeat(r['context'].values, r.qas.str.len())
    m['context'] = idx
    data = m[['id','question','context','answers']].set_index('id').reset_index()
    # integer code per distinct context, numbered in order of first appearance
    data['c_id'] = data['context'].factorize()[0]
    return data

In [1]:
# NOTE(review): the recorded run of this command failed with "403 - Forbidden" — confirm the dataset slug and that Kaggle API credentials are configured
!kaggle datasets download -d rajpurkar/squad1


403 - Forbidden - Permission 'datasets.get' was denied


In [7]:
# loading the data
# NOTE(review): hard-coded absolute path — the file must already exist at this location for the cell to run; confirm it matches where the dataset is actually stored
file_path = '/content/train-v1.1.json'
# one row per question, with its context, answers and the context id 'c_id'
data = squad_json_to_dataframe(file_path)
data

Unnamed: 0,id,question,context,answers,c_id
0,5733be284776f41900661182,To whom did the Virgin Mary allegedly appear i...,"Architecturally, the school has a Catholic cha...","[{'answer_start': 515, 'text': 'Saint Bernadet...",0
1,5733be284776f4190066117f,What is in front of the Notre Dame Main Building?,"Architecturally, the school has a Catholic cha...","[{'answer_start': 188, 'text': 'a copper statu...",0
2,5733be284776f41900661180,The Basilica of the Sacred heart at Notre Dame...,"Architecturally, the school has a Catholic cha...","[{'answer_start': 279, 'text': 'the Main Build...",0
3,5733be284776f41900661181,What is the Grotto at Notre Dame?,"Architecturally, the school has a Catholic cha...","[{'answer_start': 381, 'text': 'a Marian place...",0
4,5733be284776f4190066117e,What sits on top of the Main Building at Notre...,"Architecturally, the school has a Catholic cha...","[{'answer_start': 92, 'text': 'a golden statue...",0
...,...,...,...,...,...
87594,5735d259012e2f140011a09d,In what US state did Kathmandu first establish...,"Kathmandu Metropolitan City (KMC), in order to...","[{'answer_start': 229, 'text': 'Oregon'}]",18890
87595,5735d259012e2f140011a09e,What was Yangon previously known as?,"Kathmandu Metropolitan City (KMC), in order to...","[{'answer_start': 414, 'text': 'Rangoon'}]",18890
87596,5735d259012e2f140011a09f,With what Belorussian city does Kathmandu have...,"Kathmandu Metropolitan City (KMC), in order to...","[{'answer_start': 476, 'text': 'Minsk'}]",18890
87597,5735d259012e2f140011a0a0,In what year did Kathmandu create its initial ...,"Kathmandu Metropolitan City (KMC), in order to...","[{'answer_start': 199, 'text': '1975'}]",18890


In [8]:
# number of distinct contexts (documents) in the dataset
data['c_id'].nunique()

18891

#### Get the Unique Documents
Let's select the unique documents in our data. This will be the list of documents to search for the answers.

In [9]:
# keep one row per distinct (context, c_id) pair and renumber the rows from 0
documents = data.loc[:, ['context', 'c_id']].drop_duplicates(ignore_index=True)
documents

Unnamed: 0,context,c_id
0,"Architecturally, the school has a Catholic cha...",0
1,"As at most other universities, Notre Dame's st...",1
2,The university is the major seat of the Congre...,2
3,The College of Engineering was established in ...,3
4,All of Notre Dame's undergraduate students are...,4
...,...,...
18886,"Institute of Medicine, the central college of ...",18886
18887,Football and Cricket are the most popular spor...,18887
18888,The total length of roads in Nepal is recorded...,18888
18889,The main international airport serving Kathman...,18889


#### Document Retrieval
In this section, we are going to explore techniques to retrieve documents. First, we create our document vectorizer, which we use to encode both the documents and the questions into vectors. Then we can search for a question by comparing its vector with the document vectors. In the end, the algorithm returns the $k$ document vectors most similar to the question vector.



#### TF-IDF
"In information retrieval, TF-IDF, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus. It is often used as a weighting factor in searches of information retrieval, text mining, and user modeling." Wikipedia

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [11]:
# settings passed to the TF-IDF vectorizer
tfidf_configs = dict(
    lowercase=True,
    analyzer='word',
    stop_words='english',
    binary=True,
    max_df=0.9,
    max_features=10_000,
)

# settings passed to the nearest-neighbour search: how many documents to
# retrieve per query and which distance to rank them by
retriever_configs = dict(n_neighbors=10, metric='cosine')

In [12]:
# defining our pipeline: `embedding` turns text into TF-IDF vectors,
# `retriever` searches the nearest document vectors for a query vector
embedding = TfidfVectorizer(**tfidf_configs)
retriever = NearestNeighbors(**retriever_configs)

In [13]:
# fit the vectorizer on the contexts and index the resulting vectors for retrieval
# NOTE(review): later cells treat the neighbour indices as 'c_id' values; they are presumably
# row positions in X, which coincide with 'c_id' only while `documents` is ordered by c_id —
# confirm whether the second argument to fit() is used at all
X = embedding.fit_transform(documents['context'])
retriever.fit(X, documents['c_id'])

NearestNeighbors(metric='cosine', n_neighbors=10)

Let's test the vectorizer: what information is our model using to build the vector?

In [14]:
def transform_text(vectorizer, text):
    '''
    Show which vocabulary terms a fitted vectorizer keeps for a text.

    vectorizer: fitted vectorizer exposing transform() and inverse_transform()
    text: str

    Prints the raw text followed by the terms recovered from its vector.
    Nothing is returned.
    '''
    print('Text:', text)
    # encode the single text, then map the vector back to its terms
    print('Vect:', vectorizer.inverse_transform(vectorizer.transform([text])))

In [15]:
import wikipedia as wiki

# number of Wikipedia page titles to request
k = 5
# example question, also used by the later retrieval cells
question = "What are the tourist hotspots in Portugal?"

# search Wikipedia with the raw question (presumably needs network access — confirm)
results = wiki.search(question, results=k)
print('Question:', question)
print('Pages:  ', results)

Question: What are the tourist hotspots in Portugal?
Pages:   ['Tourist attraction', 'Tourist tax', 'Portugal', 'Algarve', 'Porto']


In [16]:
# vectorize the question and print the terms the TF-IDF vectorizer keeps for it
transform_text(embedding, question)

Text: What are the tourist hotspots in Portugal?
Vect: [array(['tourist', 'portugal'], dtype='<U18')]


What is the most similar document to this question?

In [17]:
# predict the most similar document

X = embedding.transform([question])
# [0][0]: first (and only) query row, first returned neighbour
c_id = retriever.kneighbors(X, return_distance = False)[0][0]
# positional lookup: this matches the 'c_id' label only while the rows of
# `documents` are in c_id order
selected = documents.iloc[c_id]['context']

#vectorize the document
transform_text(embedding, selected)


Text: The two largest metropolitan areas have subway systems: Lisbon Metro and Metro Sul do Tejo in the Lisbon Metropolitan Area and Porto Metro in the Porto Metropolitan Area, each with more than 35 km (22 mi) of lines. In Portugal, Lisbon tram services have been supplied by the Companhia de Carris de Ferro de Lisboa (Carris), for over a century. In Porto, a tram network, of which only a tourist line on the shores of the Douro remain, began construction on 12 September 1895 (a first for the Iberian Peninsula). All major cities and towns have their own local urban transport network, as well as taxi services.
Vect: [array(['urban', 'transport', 'towns', 'tourist', 'systems', 'supplied',
       'subway', 'shores', 'services', 'september', 'remain', 'portugal',
       'porto', 'peninsula', 'network', 'mi', 'metropolitan', 'metro',
       'major', 'local', 'lisbon', 'lines', 'line', 'largest', 'km',
       'iberian', 'construction', 'cities', 'century', 'began', 'areas',
       'area', '35

### Evaluation

In [18]:
%%time
# retrieve the n_neighbors (10) candidate documents for each question
X = embedding.transform(data['question'])
# the true context id of every question
y_test = data['c_id']
y_pred = retriever.kneighbors(X, return_distance=False)

CPU times: user 23.7 s, sys: 17.1 s, total: 40.8 s
Wall time: 41.8 s


In [19]:
# top documents predicted for each question: one row per question,
# one column per retrieved candidate
y_pred

array([[    0,  3694, 10613, ..., 17590,  6913,  6912],
       [    7,  1469,     2, ...,    29, 14201,    17],
       [   38,  1469, 14152, ...,    28,     7, 14201],
       ...,
       [18890, 18884, 18836, ..., 12302, 18837,  4200],
       [18890,  3537, 18841, ..., 16014, 18884, 10882],
       [12592, 12591, 12598, ..., 12593, 12600, 12588]])

In [20]:
def top_accuracy(y_true, y_pred, k=None) -> float:
    """
    Fraction of queries whose true label appears among their predicted candidates.

    y_true: iterable of true labels, one per query
    y_pred: indexable of candidate sequences; y_pred[i] holds the candidates
        for the i-th query
    k: optional positive int. When given, only the first k candidates of each
        row are considered (top-k accuracy). None (default) uses every candidate.

    Returns a float between 0 and 1; 0.0 when y_true is empty.
    Raises ValueError when k is given and is smaller than 1.
    """
    if k is not None and k < 1:
        raise ValueError('k must be a positive integer or None')
    right, count = 0, 0
    for i, y_t in enumerate(y_true):
        count += 1
        candidates = y_pred[i] if k is None else y_pred[i][:k]
        if y_t in candidates:
            right += 1
    # 0.0 rather than 0 so the declared float return type holds for empty input
    return right / count if count > 0 else 0.0

In [21]:
# share of questions whose true context is among the retrieved candidates
acc = top_accuracy(y_test, y_pred)
print('Accuracy:', f'{acc:.4f}')
# round() rather than int(): acc is a float ratio, so acc * n can land just
# below the true integer count and int() would truncate it to one less
print('Quantity:', round(acc*len(y_pred)), 'from', len(y_pred))

Accuracy: 0.7148
Quantity: 62615 from 87599


#### Discussion

1. This is a difficult problem, because we have multiple documents (in this notebook, ~19k documents) and the answer can be in one or more of them. Thus, the retriever usually returns $k$ documents, because returning only one document would not be complete or fair.

2. We reach a high accuracy with top-10 (71.48%); in top-1 the accuracy is lower (43.22%) because we have a lot of documents, and some are pretty similar. Even so, these top-1 and top-10 accuracies are very good for this problem.

3. TF-IDF has some problems: (1) it can only compute similarity between questions and documents that share the same words, so it cannot capture synonyms; and (2) it cannot understand the question context or the meaning of the words.

#### Word2Vec / Embedding
"Word2vec is a technique for natural language processing published in 2013. The word2vec algorithm uses a neural network model to learn word associations from a large corpus of text. Once trained, such a model can detect synonymous words or suggest additional words for a partial sentence." Wikipedia

In [22]:
from gensim.parsing.preprocessing import preprocess_string

# create a corpus of tokens: one token list per document context
corpus = [preprocess_string(text) for text in documents['context'].tolist()]

In [23]:
from gensim.models import Word2Vec
import gensim.downloader

In [24]:
# train your own model on the tokenised contexts and keep only its word vectors (.wv)
# NOTE(review): with workers=4 the learned vectors presumably differ between runs, so the
# accuracies reported for this model may not reproduce exactly — confirm
vectorizer = Word2Vec(sentences=corpus, vector_size=300, window=5, min_count=1, workers=4).wv

In [25]:
# the 5 words most similar to 'tourist' according to the trained vectors
vectorizer.most_similar('tourist', topn=5)

[('destin', 0.9592567682266235),
 ('world’', 0.9537814259529114),
 ('visitor', 0.9493675827980042),
 ('safest', 0.9376911520957947),
 ('melbourn', 0.9325165152549744)]

In [26]:
def transform_text2(vectorizer, text, verbose=False):
    '''
    Transform the text in a vector[Word2Vec] by averaging its word vectors.

    vectorizer: word-vector lookup supporting `w in vectorizer` and
        `vectorizer[w]` (in this notebook, the `.wv` of a trained Word2Vec model)
    text: str
    verbose: when True, also print the text and the tokens found in the vocabulary

    Returns the mean of the vectors of the in-vocabulary tokens, or a float32
    zero vector when none of the tokens is in the vocabulary.
    '''
    tokens = preprocess_string(text)
    known = [w for w in tokens if w in vectorizer]
    if verbose:
        print('Text:', text)
        print('Vector:', known)
    # The vector is returned in verbose mode as well, so callers never receive None.
    if known:
        return np.mean([vectorizer[w] for w in known], axis=0)
    # Size the fallback from the model when it exposes its dimensionality,
    # falling back to 300 (the size used when training in this notebook).
    return np.zeros(getattr(vectorizer, 'vector_size', 300), dtype=np.float32)

In [27]:
# just testing our Word2Vec: print which question tokens are in the vocabulary
transform_text2(vectorizer, question, verbose=True)

Text: What are the tourist hotspots in Portugal?
Vector: ['tourist', 'hotspot', 'portug']


In [28]:
# build a second retriever with the same search settings, this time over Word2Vec vectors
retriever2 = NearestNeighbors(**retriever_configs)

# vectorize the documents, fit the retriever
# each context becomes one averaged word vector; X is a list with one vector per row of `documents`
X = documents['context'].apply(lambda x: transform_text2(vectorizer, x)).tolist()
retriever2.fit(X, documents['c_id'])


NearestNeighbors(metric='cosine', n_neighbors=10)

#### Evaluation


In [29]:
%%time
# vectorize the questions: one averaged word vector per question
X = data['question'].apply(lambda x: transform_text2(vectorizer, x)).tolist()

# retrieve the n_neighbors (10) candidate documents for each question
y_test = data['c_id']
y_pred = retriever2.kneighbors(X, return_distance=False)

CPU times: user 1min 3s, sys: 6.6 s, total: 1min 9s
Wall time: 57.8 s


In [30]:
# top documents predicted for each question: one row per question,
# one column per retrieved candidate
y_pred

array([[17575,  2637, 18125, ...,  7082,  2014,  7052],
       [ 2424,  7290,  4816, ..., 17028, 10670,  6981],
       [ 7019,     0, 10619, ...,  5560,  9839, 17525],
       ...,
       [ 7933,  9491,   686, ...,  7004,  9497, 10082],
       [ 2844,  1623, 18538, ..., 15443, 12595, 10500],
       [13316, 15409, 11381, ...,  1735, 13332, 13276]])

In [31]:
# share of questions whose true context is among the retrieved candidates
acc = top_accuracy(y_test, y_pred)
print('Accuracy:', f'{acc:.4f}')
# round() rather than int(): acc is a float ratio, so acc * n can land just
# below the true integer count and int() would truncate it to one less
print('Quantity:', round(acc*len(y_pred)), 'from', len(y_pred))

Accuracy: 0.1211
Quantity: 10611 from 87599


#### Discussion
1. We did not reach a good accuracy in top-10 (12.11%), and the top-1 accuracy was really low (3.07%). Thus, TF-IDF was better.

2. Maybe the vectorizer did not receive enough data to be trained. Thus, I suggest using pretrained models, like 'word2vec-google-news-300'.

3. Another problem: I simply compute the average of the words to compose the document/question embedding; we do have other pooling strategies to work with sentences. Or, we can try more robust embedding techniques, such as BERT, MT5, DPR, etc.

### Conclusion

1. As mentioned, this problem is really complex, due to the number of documents.

2. TF-IDF reached a great top-10 accuracy (71.48%) for this dataset, and it can increase further by returning more documents.

3. We also have other algorithms to work with Document Retrieval, such as BM25 and DPR.