In [1]:
%%capture
!pip install wikipedia==1.4.0
!pip install scikit-learn==1.0.2
!pip install gensim==4.0.1

In [3]:
import wikipedia as wiki

# number of search results to request from Wikipedia
k = 5
# example question; later cells reuse it to probe the TF-IDF and Word2Vec retrievers
question = "What are the tourist hotspots in Egypt?"

# baseline: hand the raw question to the `wikipedia` package's search
results = wiki.search(question, results=k)
print('Question:', question)
print('Pages:  ', results)

Question: What are the tourist hotspots in Egypt?
Pages:   ['Tourist attraction', 'Northern coast of Egypt', 'Tourism', 'Tourism in Pakistan', 'List of suicide locations']


<a id="data"></a>

---
# Data Exploration



In [4]:
import json
import numpy as np
import pandas as pd

In [5]:
import os

# list the available data: walk the Kaggle input directory and print the
# full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/stanford-question-answering-dataset/train-v1.1.json
/kaggle/input/stanford-question-answering-dataset/dev-v1.1.json


In [7]:
# based on: https://www.kaggle.com/code/sanjay11100/squad-stanford-q-a-json-to-pandas-dataframe
def squad_json_to_dataframe(file_path, record_path=['data','paragraphs','qas','answers']):
    """
    Flatten a SQuAD-style json file into one row per question.

    file_path: path to the squad json file.
    record_path: path to the deepest level in the json file; the default is
    ['data','paragraphs','qas','answers']. Only its prefixes are used:
    record_path[:-1] locates the question records and record_path[:-2] the
    paragraph records. The list is only sliced, never mutated.

    Returns a DataFrame with the columns 'id', 'question', 'context',
    'answers' and 'c_id', where 'c_id' numbers the distinct contexts in order
    of first appearance.
    """
    # read as UTF-8 explicitly instead of relying on the locale default, and
    # use a context manager so the file handle is always closed
    with open(file_path, encoding='utf-8') as fp:
        file = json.load(fp)
    # parsing the different levels in the json file
    questions = pd.json_normalize(file, record_path[:-1])
    paragraphs = pd.json_normalize(file, record_path[:-2])
    # repeat each paragraph's context once per question it holds, so the
    # contexts line up row-for-row with the question records
    contexts = np.repeat(paragraphs['context'].values, paragraphs['qas'].str.len())
    questions['context'] = contexts
    # combining it into a single dataframe (copy: a fresh frame we can extend)
    data = questions[['id','question','context','answers']].copy()
    data['c_id'] = data['context'].factorize()[0]
    return data

In [9]:
# loading the data: flatten the SQuAD v1.1 training file into a DataFrame
# and preview its first rows
file_path = '/kaggle/input/stanford-question-answering-dataset/train-v1.1.json'
data = squad_json_to_dataframe(file_path)
data.head()

Unnamed: 0,id,question,context,answers,c_id
0,5733be284776f41900661182,To whom did the Virgin Mary allegedly appear i...,"Architecturally, the school has a Catholic cha...","[{'answer_start': 515, 'text': 'Saint Bernadet...",0
1,5733be284776f4190066117f,What is in front of the Notre Dame Main Building?,"Architecturally, the school has a Catholic cha...","[{'answer_start': 188, 'text': 'a copper statu...",0
2,5733be284776f41900661180,The Basilica of the Sacred heart at Notre Dame...,"Architecturally, the school has a Catholic cha...","[{'answer_start': 279, 'text': 'the Main Build...",0
3,5733be284776f41900661181,What is the Grotto at Notre Dame?,"Architecturally, the school has a Catholic cha...","[{'answer_start': 381, 'text': 'a Marian place...",0
4,5733be284776f4190066117e,What sits on top of the Main Building at Notre...,"Architecturally, the school has a Catholic cha...","[{'answer_start': 92, 'text': 'a golden statue...",0


In [10]:
# number of distinct documents (contexts) behind the questions
data['c_id'].nunique()

18891

In [11]:
# one row per distinct context
documents = data[['context', 'c_id']].drop_duplicates().reset_index(drop=True)
# The retrieval cells treat the row positions returned by the nearest-neighbour
# search as document ids, which is only valid while row i holds c_id i.
# Fail loudly here if that ever stops being true.
assert (documents['c_id'] == documents.index).all(), "c_id must equal the row position"
documents

Unnamed: 0,context,c_id
0,"Architecturally, the school has a Catholic cha...",0
1,"As at most other universities, Notre Dame's st...",1
2,The university is the major seat of the Congre...,2
3,The College of Engineering was established in ...,3
4,All of Notre Dame's undergraduate students are...,4
...,...,...
18886,"Institute of Medicine, the central college of ...",18886
18887,Football and Cricket are the most popular spor...,18887
18888,The total length of roads in Nepal is recorded...,18888
18889,The main international airport serving Kathman...,18889


<a href='#top'><span class="label label-info" style="font-size: 125%">Back to Top</span></a>

## TF-IDF

"In information retrieval, TF-IDF, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus. It is often used as a weighting factor in searches of information retrieval, text mining, and user modeling." [Wikipedia](https://en.wikipedia.org/wiki/Tf%E2%80%93idf)

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# TF-IDF vectorizer settings
tfidf_configs = dict(
    lowercase=True,
    analyzer='word',
    stop_words='english',
    binary=True,
    max_df=0.9,
    max_features=10_000,
)
# nearest-neighbour search settings: how many documents to retrieve per
# query and which metric ranks them
retriever_configs = dict(n_neighbors=10, metric='cosine')

# build the (still unfitted) vectorizer and retriever from those settings
embedding = TfidfVectorizer(**tfidf_configs)
retriever = NearestNeighbors(**retriever_configs)

In [14]:
# fit TF-IDF on the document texts and index the resulting vectors
# NOTE: scikit-learn documents the `y` argument of NearestNeighbors.fit as
# ignored, so passing c_id has no effect; kneighbors() returns row positions
# of X rather than c_id values
X = embedding.fit_transform(documents['context'])
retriever.fit(X, documents['c_id'])

NearestNeighbors(metric='cosine', n_neighbors=10)

In [15]:
def transform_text(vectorizer, text):
    '''
    Show a text next to the terms the vectorizer maps it back to.
    vectorizer: fitted vectorizer exposing transform / inverse_transform
    text: str
    '''
    print('Text:', text)
    # encode the single text, then decode it again to see what was kept
    print('Vect:', vectorizer.inverse_transform(vectorizer.transform([text])))

In [16]:
# vectorize the question: print it next to the terms the fitted TF-IDF
# vectorizer maps it back to
transform_text(embedding, question)

Text: What are the tourist hotspots in Egypt?
Vect: [array(['tourist', 'egypt'], dtype='<U18')]


In [17]:
# predict the most similar document
X = embedding.transform([question])
# kneighbors returns row positions in the fitted matrix; [0][0] picks the
# first neighbour listed for the single query. The position doubles as c_id
# here because `documents` was built with row i holding c_id i.
c_id = retriever.kneighbors(X, return_distance=False)[0][0]
selected = documents.iloc[c_id]['context']

# vectorize the document
transform_text(embedding, selected)

Text: Egypt has a wide range of beaches situated on the Mediterranean and the Red Sea that extend to over 3,000 km. The Red Sea has serene waters, coloured coral reefs, rare fish and beautiful mountains. The Akba Gulf beaches also provide facilities for practising sea sports. Safaga tops the Red Sea zone with its beautiful location on the Suez Gulf. Last but not least, Sharm el-Sheikh (or City of Peace), Hurghada, Luxor (known as world's greatest open-air museum/ or City of the ⅓ of world monuments), Dahab, Ras Sidr, Marsa Alam, Safaga and the northern coast of the Mediterranean are major tourist's destinations of the recreational tourism.
Vect: [array(['zone', 'world', 'wide', 'waters', 'tourist', 'tourism', 'tops',
       'suez', 'sports', 'situated', 'sea', 'reefs', 'red',
       'recreational', 'rare', 'range', 'provide', 'peace', 'open',
       'northern', 'museum', 'mountains', 'monuments', 'mediterranean',
       'major', 'location', 'known', 'km', 'gulf', 'greatest', 'fish',
  

### Evaluation

In [18]:
%%time
# retrieve the n_neighbors nearest documents for each question
X = embedding.transform(data['question'])
# ground truth: the c_id of the context each question belongs to
y_test = data['c_id']
y_pred = retriever.kneighbors(X, return_distance=False)

CPU times: user 21.9 s, sys: 15 s, total: 37 s
Wall time: 37 s


In [19]:
# top documents predicted for each question: one row per question, holding
# the row positions of the documents retrieved for it
y_pred

array([[    0,  3694, 10613, ...,  6913,  6912, 17590],
       [    7,  1469,     2, ..., 14201,    29,    17],
       [   38,  1469, 14152, ...,    28,     7, 14201],
       ...,
       [18890, 18884, 18836, ..., 12302, 18860, 18837],
       [18890,  3537, 18841, ..., 16014, 18884, 10882],
       [12592, 12591, 12598, ..., 12593, 12600, 12588]])

In [20]:
def top_accuracy(y_true, y_pred) -> float:
    """
    Fraction of samples whose true label appears among their predictions.

    y_true: iterable of true labels, one per sample.
    y_pred: indexable collection where y_pred[i] holds the candidate labels
        predicted for the i-th sample.

    Returns a float in [0, 1]; 0.0 when y_true is empty.
    """
    total = 0
    hits = 0
    for row, target in enumerate(y_true):
        total += 1
        # a sample counts as correct if its label is anywhere in its candidates
        if target in y_pred[row]:
            hits += 1
    # always return a float, including for empty input
    return hits / total if total else 0.0

In [21]:
acc = top_accuracy(y_test, y_pred)
print('Accuracy:', f'{acc:.4f}')
# acc * len(y_pred) can land a hair below the true integer count because of
# floating-point rounding; int() would then truncate it to one too few
print('Quantity:', round(acc*len(y_pred)), 'from', len(y_pred))

Accuracy: 0.7150
Quantity: 62635 from 87599


In [22]:
from gensim.parsing.preprocessing import preprocess_string

# tokenise every document with preprocess_string's default filters
corpus = [preprocess_string(doc_text) for doc_text in documents['context']]

In [23]:
from gensim.models import Word2Vec
import gensim.downloader

# you can download a pretrained Word2Vec
# - or you can train your own model

# download a model
# 'glove-wiki-gigaword-300' (376.1 MB)
# 'word2vec-ruscorpora-300' (198.8 MB)
# 'word2vec-google-news-300' (1.6 GB)
# NOTE(review): 'word2vec-ruscorpora-300' appears to be a Russian-language
# model; for these English questions one of the other two is presumably the
# intended choice. Confirm before enabling the line below.
# vectorizer = gensim.downloader.load('word2vec-ruscorpora-300')

# train your own model on the tokenised documents: 300-dimensional vectors,
# context window of 5, every token kept (min_count=1), 4 worker threads;
# `.wv` keeps only the word-vector lookup
# NOTE(review): training is multi-threaded, so the learned vectors (and the
# scores computed from them) may differ between runs. Check the gensim docs
# if reproducibility matters.
vectorizer = Word2Vec(sentences=corpus, vector_size=300, window=5, min_count=1, workers=4).wv

In [24]:
# sanity check: the 5 words most similar to 'tourist' in the trained vectors
vectorizer.most_similar('tourist', topn=5)

[('destin', 0.9567534327507019),
 ('world’', 0.949833869934082),
 ('melbourn', 0.9431617259979248),
 ('visitor', 0.9376100897789001),
 ('seattl', 0.9344044923782349)]

In [25]:
def transform_text2(vectorizer, text, verbose=False):
    '''
    Transform the text into a single vector by averaging its word vectors.
    vectorizer: word-vector lookup supporting `word in vectorizer` and
        `vectorizer[word]` (e.g. the `.wv` of a gensim Word2Vec model)
    text: str
    verbose: when True, also print the text and the tokens found in the
        vocabulary

    Returns the mean of the known tokens' vectors, or an all-zero float32
    vector when no token is known. The vector is returned regardless of
    `verbose`.
    '''
    tokens = preprocess_string(text)
    # keep only tokens the vocabulary knows
    known = [w for w in tokens if w in vectorizer]
    if verbose:
        print('Text:', text)
        print('Vector:', known)
    if known:
        return np.mean([vectorizer[w] for w in known], axis=0)
    # no known token: fall back to a zero vector of the model's own width
    # (300 only if the lookup does not expose `vector_size`)
    dim = getattr(vectorizer, 'vector_size', 300)
    return np.zeros(dim, dtype=np.float32)

In [26]:
# quick look at which question tokens the Word2Vec vocabulary knows;
# the trailing ';' keeps the cell output limited to the printed lines
transform_text2(vectorizer, question, verbose=True);

Text: What are the tourist hotspots in Egypt?
Vector: ['tourist', 'hotspot', 'egypt']


In [27]:
# a second retriever with the same settings, this time over Word2Vec vectors
retriever2 = NearestNeighbors(**retriever_configs)

# vectorize the documents (one vector per context), fit the retriever
# NOTE: scikit-learn documents the `y` argument of NearestNeighbors.fit as
# ignored; kneighbors() returns row positions of X, which match c_id only
# because row i of `documents` holds c_id i
X = documents['context'].apply(lambda x: transform_text2(vectorizer, x)).tolist()
retriever2.fit(X, documents['c_id'])

NearestNeighbors(metric='cosine', n_neighbors=10)

### Evaluation

In [28]:
%%time
# vectorize the questions
X = data['question'].apply(lambda x: transform_text2(vectorizer, x)).tolist()

# retrieve the n_neighbors nearest documents for each question;
# ground truth is the c_id of the context each question belongs to
y_test = data['c_id']
y_pred = retriever2.kneighbors(X, return_distance=False)

CPU times: user 46.9 s, sys: 8.97 s, total: 55.9 s
Wall time: 36.9 s


In [29]:
# top documents predicted for each question: one row per question, holding
# the row positions of the documents retrieved for it
y_pred

array([[ 2637, 17575, 18125, ...,  7053,  7050,  7052],
       [ 2424,  7290,  4816, ...,  6981, 17019, 10670],
       [    0,  7019, 10636, ..., 10503,  5560, 18045],
       ...,
       [ 7933,  9491, 14938, ...,  9497,  2805, 16902],
       [ 2844,  1623, 18538, ..., 10493,   563, 14340],
       [13316, 15409, 11381, ..., 13328, 15392, 18548]])

In [30]:
acc = top_accuracy(y_test, y_pred)
print('Accuracy:', f'{acc:.4f}')
# acc * len(y_pred) can land a hair below the true integer count because of
# floating-point rounding; int() would then truncate it to one too few
print('Quantity:', round(acc*len(y_pred)), 'from', len(y_pred))

Accuracy: 0.1213
Quantity: 10625 from 87599
