### Data preprocessing for the Stanford Question Answering Dataset (SQuAD)
- Brian Morris

In [21]:
import numpy as np
import pandas as pd 
import json

In [2]:
import os

# List the data directory with the standard library instead of shelling out to
# `ls`, so the cell also runs on platforms without an `ls` binary (e.g. Windows).
print("\n".join(sorted(os.listdir("./data"))))

dev-v1.1.json
dev.csv
train-v1.1.json
train.csv



In [3]:
def squad_json_to_dataframe_train(input_file_path):
    """Flatten a SQuAD-format JSON file into one row per (question, answer) pair.

    Parameters
    ----------
    input_file_path : str or path-like
        Path to a SQuAD-style JSON file (``data -> paragraphs -> qas -> answers``).

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``index`` (the question id), ``question``,
        ``context``, every field of the answer records (``answer_start`` and
        ``text`` in SQuAD v1.1, with ``text`` renamed to ``answer``), and
        ``c_id`` (an integer code identifying each distinct context string).

    Side effects
    ------------
    Prints the shape of the resulting frame.
    """
    record_path = ['data', 'paragraphs', 'qas', 'answers']

    # Read explicitly as UTF-8 rather than relying on the platform default
    # encoding, which is not UTF-8 everywhere.
    with open(input_file_path, 'r', encoding='utf-8') as f:
        squad = json.load(f)

    # One frame per nesting level of the JSON document.
    answers = pd.json_normalize(squad, record_path)
    questions = pd.json_normalize(squad, record_path[:-1])
    paragraphs = pd.json_normalize(squad, record_path[:-2])

    # Propagate each paragraph's context to its questions, and each question's
    # id to its answers, by repeating the parent value once per child.
    questions['context'] = np.repeat(
        paragraphs['context'].values, paragraphs['qas'].str.len()
    )
    answers['q_idx'] = np.repeat(
        questions['id'].values, questions['answers'].str.len()
    )

    # Join on the question id. A key-based merge (instead of an index-aligned
    # concat) also works when a question carries more than one answer, which
    # would otherwise produce duplicate index labels and fail to align.
    main = (
        questions[['id', 'question', 'context']]
        .merge(answers, how='left', left_on='id', right_on='q_idx')
        .drop(columns='q_idx')
        # Keep the historical column name 'index' for the question id.
        .rename(columns={'id': 'index', 'text': 'answer'})
    )
    main['c_id'] = main['context'].factorize()[0]

    print(f"Shape of the DataFrame is {main.shape}")

    return main

In [4]:
def squad_json_to_dataframe_dev(input_file_path):
    """Flatten a SQuAD-format JSON file into one row per question.

    Unlike the training variant, the answers are not exploded: each row keeps
    the question's raw list of answer dicts in the ``answers`` column.

    Parameters
    ----------
    input_file_path : str or path-like
        Path to a SQuAD-style JSON file (``data -> paragraphs -> qas``).

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``id``, ``question``, ``context``, ``answers``
        (list of answer dicts as found in the JSON), and ``c_id`` (an integer
        code identifying each distinct context string).

    Side effects
    ------------
    Prints the shape of the resulting frame.
    """
    qas_path = ['data', 'paragraphs', 'qas']
    paragraph_path = ['data', 'paragraphs']

    # Read explicitly as UTF-8 rather than relying on the platform default
    # encoding, which is not UTF-8 everywhere.
    with open(input_file_path, 'r', encoding='utf-8') as f:
        squad = json.load(f)

    # One frame per nesting level that is actually needed.
    questions = pd.json_normalize(squad, qas_path)
    paragraphs = pd.json_normalize(squad, paragraph_path)

    # Repeat each paragraph's context once per question it contains so the
    # values line up row-for-row with the questions frame.
    questions['context'] = np.repeat(
        paragraphs['context'].values, paragraphs['qas'].str.len()
    )

    main = questions[['id', 'question', 'context', 'answers']].reset_index(drop=True)
    main['c_id'] = main['context'].factorize()[0]  # unique integer per context

    print(f"Shape of the DataFrame is {main.shape}")
    return main

In [22]:
# Parse the training split into a flat DataFrame; the helper also prints the
# resulting shape.
input_file_path = './data/train-v1.1.json'
train_df = squad_json_to_dataframe_train(input_file_path)

Shape of the DataFrame is (87599, 6)


In [6]:
# Preview the first rows of the flattened training frame.
train_df.head()


Unnamed: 0,index,question,context,answer_start,answer,c_id
0,5733be284776f41900661182,To whom did the Virgin Mary allegedly appear i...,"Architecturally, the school has a Catholic cha...",515,Saint Bernadette Soubirous,0
1,5733be284776f4190066117f,What is in front of the Notre Dame Main Building?,"Architecturally, the school has a Catholic cha...",188,a copper statue of Christ,0
2,5733be284776f41900661180,The Basilica of the Sacred heart at Notre Dame...,"Architecturally, the school has a Catholic cha...",279,the Main Building,0
3,5733be284776f41900661181,What is the Grotto at Notre Dame?,"Architecturally, the school has a Catholic cha...",381,a Marian place of prayer and reflection,0
4,5733be284776f4190066117e,What sits on top of the Main Building at Notre...,"Architecturally, the school has a Catholic cha...",92,a golden statue of the Virgin Mary,0


In [7]:
# Persist the flattened training set; index=False keeps the positional row
# index out of the CSV.
train_df.to_csv('./data/train.csv',index=False)

In [8]:
# Parse the dev split. Note that `input_file_path` is rebound here, and that the
# dev helper keeps each question's answers as a list instead of one row per answer.
input_file_path = './data/dev-v1.1.json'
dev_df = squad_json_to_dataframe_dev(input_file_path)

Shape of the DataFrame is (10570, 5)


In [60]:
# Persist the dev set. NOTE(review): the 'answers' column holds Python lists of
# dicts, which a CSV can only store as text — they will need to be parsed back
# (e.g. with ast.literal_eval) after re-reading the file; confirm downstream usage.
dev_df.to_csv('./data/dev.csv',index=False)

In [61]:
# Print the full context paragraph of the first training row.
print(train_df['context'][0])

Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.


In [62]:
# Print the question of the first training row.
print(train_df['question'][0])

To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?


In [63]:
# Print the answer text of the first training row.
print(train_df['answer'][0])

Saint Bernadette Soubirous


In [33]:
# Show only the first row of the training frame.
train_df.head(1)

Unnamed: 0,index,question,context,answer_start,text,c_id
0,5733be284776f41900661182,To whom did the Virgin Mary allegedly appear i...,"Architecturally, the school has a Catholic cha...",515,Saint Bernadette Soubirous,0


In [36]:
# Count distinct questions: the 'index' column holds the SQuAD question id.
train_df['index'].nunique()

87599

In [37]:
# Count distinct context paragraphs via their factorized id.
train_df['c_id'].nunique()

18891

So the training set contains 87,599 questions over 18,891 distinct context paragraphs.

In [14]:
# Inspect the raw answer list of one dev question: a question can carry several
# answer dicts, each with its own 'answer_start' and 'text'.
print(dev_df['answers'][4])

[{'answer_start': 488, 'text': 'gold'}, {'answer_start': 488, 'text': 'gold'}, {'answer_start': 521, 'text': 'gold'}]


In [None]:
# Disable column-width truncation so long text cells are displayed in full.
pd.set_option('display.max_colwidth', None)

In [20]:
# Preview the first rows of the dev frame.
dev_df.head()

Unnamed: 0,id,question,context,answers,c_id
0,56be4db0acb8001400a502ec,Which NFL team represented the AFC at Super Bo...,Super Bowl 50 was an American football game to...,"[{'answer_start': 177, 'text': 'Denver Broncos...",0
1,56be4db0acb8001400a502ed,Which NFL team represented the NFC at Super Bo...,Super Bowl 50 was an American football game to...,"[{'answer_start': 249, 'text': 'Carolina Panth...",0
2,56be4db0acb8001400a502ee,Where did Super Bowl 50 take place?,Super Bowl 50 was an American football game to...,"[{'answer_start': 403, 'text': 'Santa Clara, C...",0
3,56be4db0acb8001400a502ef,Which NFL team won Super Bowl 50?,Super Bowl 50 was an American football game to...,"[{'answer_start': 177, 'text': 'Denver Broncos...",0
4,56be4db0acb8001400a502f0,What color was used to emphasize the 50th anni...,Super Bowl 50 was an American football game to...,"[{'answer_start': 488, 'text': 'gold'}, {'answ...",0


In [34]:
# Confirm the (rows, columns) of the training frame.
train_df.shape

(87599, 6)