# 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import json

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# 2. Convert SQuAD JSON to DataFrame

In [2]:
# Function to convert SQuAD JSON to DataFrame 
def squad_json_to_dataframe(input_file_path, record_path=['data', 'paragraphs', 'qas', 
                                                          'answers'], verbose=1):
    if verbose:
        print("Reading the JSON file")
    with open(input_file_path, 'r') as file:
        data = json.load(file)
    if verbose:
        print("Processing...")
    js = pd.json_normalize(data, record_path)
    m = pd.json_normalize(data, record_path[:-1])
    r = pd.json_normalize(data, record_path[:-2])
    idx = np.repeat(r['context'].values, r.qas.str.len())
    m['context'] = idx
    main = m[['id', 'question', 'context', 'answers']].set_index('id').reset_index()
    main['c_id'] = main['context'].factorize()[0]
    if verbose:
        print("Shape of the DataFrame is {}".format(main.shape))
        print("Done")
    return main

# 3. Load Training and Development Data

In [3]:
# training data
input_file_path = '/kaggle/input/stanford-question-answering-dataset/train-v1.1.json'
record_path = ['data','paragraphs','qas','answers']
train = squad_json_to_dataframe(input_file_path=input_file_path,record_path=record_path)

Reading the JSON file
Processing...
Shape of the DataFrame is (87599, 5)
Done


In [4]:
train.head()

Unnamed: 0,id,question,context,answers,c_id
0,5733be284776f41900661182,To whom did the Virgin Mary allegedly appear i...,"Architecturally, the school has a Catholic cha...","[{'answer_start': 515, 'text': 'Saint Bernadet...",0
1,5733be284776f4190066117f,What is in front of the Notre Dame Main Building?,"Architecturally, the school has a Catholic cha...","[{'answer_start': 188, 'text': 'a copper statu...",0
2,5733be284776f41900661180,The Basilica of the Sacred heart at Notre Dame...,"Architecturally, the school has a Catholic cha...","[{'answer_start': 279, 'text': 'the Main Build...",0
3,5733be284776f41900661181,What is the Grotto at Notre Dame?,"Architecturally, the school has a Catholic cha...","[{'answer_start': 381, 'text': 'a Marian place...",0
4,5733be284776f4190066117e,What sits on top of the Main Building at Notre...,"Architecturally, the school has a Catholic cha...","[{'answer_start': 92, 'text': 'a golden statue...",0


In [5]:
# dev data
input_file_path = '/kaggle/input/stanford-question-answering-dataset/dev-v1.1.json'
record_path = ['data','paragraphs','qas','answers']
verbose = 0
dev = squad_json_to_dataframe(input_file_path=input_file_path,record_path=record_path)

Reading the JSON file
Processing...
Shape of the DataFrame is (10570, 5)
Done


In [6]:
dev.head()

Unnamed: 0,id,question,context,answers,c_id
0,56be4db0acb8001400a502ec,Which NFL team represented the AFC at Super Bo...,Super Bowl 50 was an American football game to...,"[{'answer_start': 177, 'text': 'Denver Broncos...",0
1,56be4db0acb8001400a502ed,Which NFL team represented the NFC at Super Bo...,Super Bowl 50 was an American football game to...,"[{'answer_start': 249, 'text': 'Carolina Panth...",0
2,56be4db0acb8001400a502ee,Where did Super Bowl 50 take place?,Super Bowl 50 was an American football game to...,"[{'answer_start': 403, 'text': 'Santa Clara, C...",0
3,56be4db0acb8001400a502ef,Which NFL team won Super Bowl 50?,Super Bowl 50 was an American football game to...,"[{'answer_start': 177, 'text': 'Denver Broncos...",0
4,56be4db0acb8001400a502f0,What color was used to emphasize the 50th anni...,Super Bowl 50 was an American football game to...,"[{'answer_start': 488, 'text': 'gold'}, {'answ...",0


# 4. Prepare Documents DataFrame

This DataFrame will be used for similarity search.

In [7]:
# Prepare documents DataFrame
documents = train[['context', 'c_id']].drop_duplicates().reset_index(drop=True)
documents

Unnamed: 0,context,c_id
0,"Architecturally, the school has a Catholic cha...",0
1,"As at most other universities, Notre Dame's st...",1
2,The university is the major seat of the Congre...,2
3,The College of Engineering was established in ...,3
4,All of Notre Dame's undergraduate students are...,4
...,...,...
18886,"Institute of Medicine, the central college of ...",18886
18887,Football and Cricket are the most popular spor...,18887
18888,The total length of roads in Nepal is recorded...,18888
18889,The main international airport serving Kathman...,18889


# 5. Define and Train TF-IDF Vectorizer and Nearest Neighbors

In [8]:
# Define TF-IDF and Nearest Neighbors configurations
tfidf_configs = {
    'lowercase': True,
    'analyzer': 'word',
    'stop_words': 'english',
    'binary': True,
    'max_df': 0.9,
    'max_features': 20_000  # Increase this number
}

retriever_configs = {
    'n_neighbors': 20,  # Increase the number of neighbors
    'metric': 'cosine'
}

# Initialize TF-IDF and Nearest Neighbors
embedding = TfidfVectorizer(**tfidf_configs)
retriever = NearestNeighbors(**retriever_configs)

# Train the model
X = embedding.fit_transform(documents['context'])
retriever.fit(X, documents['c_id'])

In [9]:
pip install wikipedia

Note: you may need to restart the kernel to use updated packages.


# 6. Search Wikipedia for Relevant Information

In [10]:
import wikipedia as wiki

def search_wikipedia(question, k=5):
    """
    Search Wikipedia for the given question.

    Parameters:
    - question: The query to search for.
    - k: Number of results to return.

    Returns:
    - List of Wikipedia page titles.
    """
    results = wiki.search(question, results=k)
    return results

# Example query
question = "What are the tourist hotspots in Portugal?"
results = search_wikipedia(question)
print(f"Question: {question}")
print(f"Pages:  {results}")


Question: What are the tourist hotspots in Portugal?
Pages:  ['Tourist attraction', 'Portugal', 'Algarve', 'Porto', 'Azores']


# 7. Transform and Print TF-IDF Vectors

In [11]:
def transform_text(vectorizer, text):
    """
    Transform text to TF-IDF vector and print the vector.

    Parameters:
    - vectorizer: The TF-IDF vectorizer.
    - text: The text to transform.
    """
    vector = vectorizer.transform([text])
    vector = vectorizer.inverse_transform(vector)
    print('Text:', text)
    print('Vect:', vector)

# Vectorize the question
transform_text(embedding, question)

Text: What are the tourist hotspots in Portugal?
Vect: [array(['tourist', 'portugal'], dtype='<U18')]


# 8. Predict the Most Similar Document

In [12]:
# Predict the most similar document
X = embedding.transform([question])
c_id = retriever.kneighbors(X, return_distance=False)[0][0]
selected = documents.iloc[c_id]['context']

# Vectorize the selected document
transform_text(embedding, selected)

Text: The two largest metropolitan areas have subway systems: Lisbon Metro and Metro Sul do Tejo in the Lisbon Metropolitan Area and Porto Metro in the Porto Metropolitan Area, each with more than 35 km (22 mi) of lines. In Portugal, Lisbon tram services have been supplied by the Companhia de Carris de Ferro de Lisboa (Carris), for over a century. In Porto, a tram network, of which only a tourist line on the shores of the Douro remain, began construction on 12 September 1895 (a first for the Iberian Peninsula). All major cities and towns have their own local urban transport network, as well as taxi services.
Vect: [array(['urban', 'transport', 'tram', 'towns', 'tourist', 'taxi',
       'systems', 'supplied', 'subway', 'shores', 'services', 'september',
       'remain', 'portugal', 'porto', 'peninsula', 'network', 'mi',
       'metropolitan', 'metro', 'major', 'local', 'lisbon', 'lines',
       'line', 'largest', 'km', 'iberian', 'douro', 'construction',
       'cities', 'century', 'beg

# 9. Evaluate Model Accuracy

Computes the accuracy of the retrieval system 
by checking if the true context ID is among the retrieved nearest neighbors for each question.

In [13]:
def top_accuracy(y_true, y_pred) -> float:
    """
    Calculate the top accuracy of predictions.

    Parameters:
    - y_true: True labels.
    - y_pred: Predicted labels.

    Returns:
    - Accuracy score.
    """
    right, count = 0, 0
    for i, y_t in enumerate(y_true):
        count += 1
        if y_t in y_pred[i]:
            right += 1
    return right / count if count > 0 else 0

In [14]:
# Evaluate model
X = embedding.transform(train['question'])
y_test = train['c_id']
y_pred = retriever.kneighbors(X, return_distance=False)
acc = top_accuracy(y_test, y_pred)
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.8311


In [15]:
def mean_reciprocal_rank(y_true, y_pred):
    rr = []
    for true, pred in zip(y_true, y_pred):
        if true in pred:
            rr.append(1 / (np.where(pred == true)[0][0] + 1))
        else:
            rr.append(0)
    return np.mean(rr)

def top_k_accuracy(y_true, y_pred, k=5):
    top_k = []
    for true, pred in zip(y_true, y_pred):
        top_k.append(int(true in pred[:k]))
    return np.mean(top_k)

In [16]:
# Example usage
y_true = train['c_id'].values
y_pred = retriever.kneighbors(embedding.transform(train['question']), return_distance=False)

mrr = mean_reciprocal_rank(y_true, y_pred)
top1_acc = top_k_accuracy(y_true, y_pred, k=5)
top5_acc = top_k_accuracy(y_true, y_pred, k=20)

print(f'MRR: {mrr:.4f}')
print(f'Top-5 Accuracy: {top1_acc:.4f}')
print(f'Top-20 Accuracy: {top5_acc:.4f}')

MRR: 0.5932
Top-5 Accuracy: 0.7160
Top-20 Accuracy: 0.8311
