<a href="https://colab.research.google.com/github/CorentinMAG/NLP/blob/main/BIDAF/bidaf_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In order to run this notebook, you should first:
* pip install pandas numpy tensorflow nltk gensim scikit-learn
* modify the `SQUAD_PATH` variable (path to the SQuAD file)
* modify all the other paths (where to save datasets, tokenizers...)



In [1]:
import json
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk
from nltk import word_tokenize
nltk.download('punkt')
import gensim.downloader as gloader
from sklearn.model_selection import train_test_split
import re
import pickle
import tensorflow as tf

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
EMBEDDING_SIZE = 300
SQUAD_PATH = os.path.join(os.getcwd(), 'drive/MyDrive/NLP/BIDAF/utils/data/training_set.json')

def load_dataset(path, record_path = ['data', 'paragraphs', 'qas', 'answers'], verbose = True, with_answer = True):

  """
  parse the SQUAD dataset into a dataframe (one row per question)

  path: location of the SQUAD json file, decoded as UTF-8
  record_path: nested keys leading from the root of the json to the answers
               (the list is only sliced, never modified)
  verbose: print progress messages and a few counts
  with_answer: when True the answer fields are merged into the result,
               when False only the question level fields are kept

  returns a dataframe holding the question level fields plus the columns
  `context`, `title` and `c_id` (an integer identifying each distinct
  context), and the answer fields when `with_answer` is True

  raises UnicodeDecodeError if the file is not valid UTF-8
  """

  if verbose:
      print("Reading the json file")
  # the file is decoded explicitly as UTF8 (an exception is raised otherwise)
  # and the handle is closed as soon as the content is parsed
  with open(path, encoding = 'utf-8') as handle:
    file = json.load(handle)

  if verbose:
      print("[INFO] processing...")

  # parsing different level's in the json file
  if with_answer:
    js = pd.json_normalize(file , record_path )
  m = pd.json_normalize(file, record_path[:-1] )
  r = pd.json_normalize(file, record_path[:-2])
  title = pd.json_normalize(file['data'], record_path = ['paragraphs'], meta = 'title')
  t = pd.json_normalize(file, record_path[0])

  #combining it into single dataframe
  # every paragraph level value is repeated once per question of that paragraph
  questions_per_paragraph = r['qas'].str.len()
  m['context'] = np.repeat(r['context'].values, questions_per_paragraph)
  m['title'] = np.repeat(title['title'].values, questions_per_paragraph)
  m['c_id'] = m['context'].factorize()[0]
  # a dataset without answers may not have an `answers` field at all
  m = m.drop(columns = ['answers'], errors = 'ignore')

  if with_answer:
    # js and m share the same positional index (one answer per question is assumed)
    main = js.merge(m, left_index = True, right_index = True)
  else:
    main = m
  if verbose:
      print(f"[INFO] there are {main.shape[0]} questions with single answer")
      print(f"[INFO] there are {main['c_id'].nunique()} different contexts")
      print(f"[INFO] there are {len(t)} unrelated subjects")
      print("[INFO] Done")
  return main

def download_glove_model(embedding_dimension = 50):

  """
  fetch the "glove-wiki-gigaword" vectors of the requested dimension
  with the gensim downloader and return the loaded model

  if the downloader raises a ValueError, the expected dimensions are
  printed and the error is propagated to the caller
  """

  model_name = "glove-wiki-gigaword-{}".format(embedding_dimension)
  try:
    print('[INFO] downloading glove {}'.format(embedding_dimension))
    vectors = gloader.load(model_name)
  except ValueError as error:
      print("Glove: 50, 100, 200, 300")
      raise error
  else:
    print('[INFO] done !')
  return vectors

In [3]:
squad_dataset = load_dataset(SQUAD_PATH)

Reading the json file
[INFO] processing...
[INFO] there are 87599 questions with single answer
[INFO] there are 18891 different contexts
[INFO] there are 442 unrelated subjects
[INFO] Done


In [4]:
squad_dataset.head()

Unnamed: 0,answer_start,text,question,id,context,title,c_id
0,515,Saint Bernadette Soubirous,To whom did the Virgin Mary allegedly appear i...,5733be284776f41900661182,"Architecturally, the school has a Catholic cha...",University_of_Notre_Dame,0
1,188,a copper statue of Christ,What is in front of the Notre Dame Main Building?,5733be284776f4190066117f,"Architecturally, the school has a Catholic cha...",University_of_Notre_Dame,0
2,279,the Main Building,The Basilica of the Sacred heart at Notre Dame...,5733be284776f41900661180,"Architecturally, the school has a Catholic cha...",University_of_Notre_Dame,0
3,381,a Marian place of prayer and reflection,What is the Grotto at Notre Dame?,5733be284776f41900661181,"Architecturally, the school has a Catholic cha...",University_of_Notre_Dame,0
4,92,a golden statue of the Virgin Mary,What sits on top of the Main Building at Notre...,5733be284776f4190066117e,"Architecturally, the school has a Catholic cha...",University_of_Notre_Dame,0


In [5]:
SAMPLES = squad_dataset.shape[0]

def preprocess_sentence(text):

  """
  normalise a text: convert it to lowercase, then remove the leading
  and trailing whitespace
  """

  return text.lower().strip()

def clean_dataset(dataset, with_answer = True):

  """
  preprocess the dataset

  dataset: dataframe with the columns `question`, `context` and, when
           `with_answer` is True, `text`
  with_answer: also preprocess the answer column (`text`)

  returns a preprocessed copy, the given dataframe is left untouched
  """

  _dataset = dataset.copy()

  _dataset['question'] = _dataset['question'].apply(preprocess_sentence)

  if with_answer:
    _dataset['text'] = _dataset['text'].apply(preprocess_sentence)

  # we process only different contexts and then we map the result back on
  # every row through the original context value. A positional duplication
  # (np.repeat) would only be valid if each context filled one contiguous
  # run of rows and the index was 0..n-1, otherwise rows receive the
  # context of another sample
  cleaned_by_context = {context: preprocess_sentence(context) for context in _dataset['context'].unique()}
  _dataset['context'] = _dataset['context'].map(cleaned_by_context)

  return _dataset

In [6]:
squad_dataset = clean_dataset(squad_dataset)

In [7]:
def get_tokenizer(dataset, glove_model = None):

  """
  create the word and char tokenizers and feed them 
  on the given dataset and the glove vocabulary

  dataset: dataframe with the `question` and `context` columns
  glove_model: already loaded glove vectors, when None the model of
               dimension EMBEDDING_SIZE is downloaded

  returns (word tokenizer, char tokenizer)
  """

  tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token = 'UNK', filters = '')

  # we only keep the most frequent characters (otherwise oom issue):
  # with num_words = 200 only the indices below 200 are emitted and index 1
  # is the UNK token, so the 198 most frequent characters keep their own
  # index and every other character is replaced by UNK
  char_tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level = True, filters = '', oov_token = 'UNK', num_words = 200)

  if glove_model is None:
    glove_model = download_glove_model(EMBEDDING_SIZE)

  # gensim >= 4 exposes the vocabulary as `index_to_key`,
  # older versions as `index2entity`
  if hasattr(glove_model, 'index_to_key'):
    glove_vocabulary = list(glove_model.index_to_key)
  else:
    glove_vocabulary = list(glove_model.index2entity)

  tokenized_questions = dataset['question'].apply(word_tokenize).to_list()

  # a context is shared by several questions, we only tokenize it once
  contexts = pd.Series(dataset['context'].unique())
  tokenized_contexts = contexts.apply(word_tokenize).to_list()

  sequences = glove_vocabulary + tokenized_questions + tokenized_contexts

  del glove_model # we  don't need anymore the glove model

  tokenizer.fit_on_texts(sequences)
  # the char tokenizer is fed with the raw strings
  char_tokenizer.fit_on_texts(dataset['question'].to_list() + contexts.to_list())

  return tokenizer, char_tokenizer


def update_tokenizer(dataset, tokenizer, char_tokenizer):

  """
  extend the vocabulary of both tokenizers (in place) with the questions
  and the distinct contexts of a new dataset

  the word tokenizer is fed with the word tokenized texts, the char
  tokenizer with the raw strings; nothing is returned
  """

  raw_questions = dataset['question']
  unique_contexts = pd.Series(dataset['context'].unique())

  # word level: questions first, then each distinct context once
  word_level_texts = raw_questions.apply(word_tokenize).to_list()
  word_level_texts.extend(unique_contexts.apply(word_tokenize).to_list())
  tokenizer.fit_on_texts(word_level_texts)

  # char level: same texts, not tokenized
  char_level_texts = raw_questions.to_list() + unique_contexts.to_list()
  char_tokenizer.fit_on_texts(char_level_texts)

def get_start_end(row):

  """
  get the start and end span (in words) for each sample,
  if the span cannot be found return -1

  row: a sample with the `context` and `text` fields and, optionally,
       the `answer_start` field (character offset of the answer)

  returns the row with two new fields: `start`, the number of words
  before the answer, and `end` = start + number of words of the answer
  (so `end` is exclusive)
  """

  context = row['context']
  answer = row['text']
  tok_answer = word_tokenize(answer)

  # the annotated character offset is preferred when it really points to the
  # answer: the same text may occur earlier in the context, and taking the
  # first occurrence would then label the wrong span
  offset = row.get('answer_start', None)
  if offset is not None:
    try:
      offset = int(offset)
    except (TypeError, ValueError):
      offset = None

  if offset is not None and offset >= 0 and context.startswith(answer, offset):
    _start = offset
  else:
    # no usable offset (missing, or shifted by the cleaning of the text)
    _start = context.find(answer)

  if _start == -1:
    # the answer is not in the context
    # maybe due to a typo
    row['start'] = -1
    row['end'] = -1
    return row

  # words located before the answer
  lc = word_tokenize(context[:_start])

  start = len(lc)
  end = start + len(tok_answer)

  row['start'] = start
  row['end'] = end

  return row

def tokenize(dataset, tokenizer, char_tokenizer):

  """
  return a copy of the dataset with four new columns: the questions and
  the contexts encoded at the word level (`tokenized_*`) and at the
  character level (`char_tokenized_*`, one sequence per word)
  """

  encoded = dataset.copy()

  question_words = encoded['question'].apply(word_tokenize).to_list()
  context_words = encoded['context'].apply(word_tokenize).to_list()

  # word level encoding
  word_questions = tokenizer.texts_to_sequences(question_words)
  word_contexts = tokenizer.texts_to_sequences(context_words)

  # char level encoding: each word of a text is encoded on its own
  char_pairs = [
    (char_tokenizer.texts_to_sequences(q_words), char_tokenizer.texts_to_sequences(c_words))
    for q_words, c_words in zip(question_words, context_words)
  ]

  encoded['tokenized_question'] = word_questions
  encoded['tokenized_context'] = word_contexts

  encoded['char_tokenized_question'] = [pair[0] for pair in char_pairs]
  encoded['char_tokenized_context'] = [pair[1] for pair in char_pairs]

  return encoded

def split(dataset, test_size = 0.2, random_state = 42):

  """
  cut the dataset in a training part and a validation part,
  both re-indexed from 0

  test_size and random_state are forwarded to train_test_split
  (a fixed random_state makes the split deterministic)
  """

  parts = train_test_split(dataset, test_size = test_size, random_state = random_state)
  training, validation = (part.reset_index(drop = True) for part in parts)

  return training, validation

def df_to_json(df, path, with_answer = True):

  """
  rebuild the nested SQUAD structure (title > paragraphs > qas) from the
  flat dataframe and write it, as json, to the given path

  with with_answer = True each question also carries its single answer
  (`answer_start` and `text`)
  """

  def as_qa(content):
    # one entry of the `qas` list, the keys keep the SQUAD order
    entry = {}
    if with_answer:
      entry['answers'] = [{'answer_start': content['answer_start'], 'text': content['text']}]
    entry['question'] = content['question']
    entry['id'] = content['id']
    return entry

  # samples are grouped by title, then by context
  data = [
    {
      'title': title,
      'paragraphs': [
        {'context': context, 'qas': [as_qa(content) for _, content in contents.iterrows()]}
        for context, contents in articles.groupby('context')
      ],
    }
    for title, articles in df.groupby('title')
  ]

  with open(path, 'w') as handle:
    json.dump({'data': data}, handle)

  print(f'dataset saved in {path}')

In [8]:
tr_df, vl_df = split(squad_dataset)

In [9]:
tr_df.shape[0],vl_df.shape[0]

(70079, 17520)

Our vocabulary is based on the GloVe vocabulary, to which we add the terms from the training set

In [10]:
tokenizer, char_tokenizer = get_tokenizer(tr_df)

[INFO] downloading glove 300
[INFO] done !


In [11]:
print(len(tokenizer.word_index))
len(char_tokenizer.word_index)

429064


1263

We then update our vocabulary with terms from the validation set

In [12]:
update_tokenizer(vl_df, tokenizer, char_tokenizer)

In [13]:
print(len(tokenizer.word_index))
len(char_tokenizer.word_index)

429758


1265

In [14]:
# take a while
tr_df = tr_df.apply(get_start_end, axis = 1)
vl_df = vl_df.apply(get_start_end, axis = 1)

We get rid of samples where the answer cannot be found in the context (maybe there is a typo in the answer or the context).  
To avoid discarding many samples, we could lemmatize / stem the text.   
Obviously, lemmatization is a better choice for our task, but a really accurate lemmatization requires POS tagging.

In [15]:
tr_df[tr_df['start'] == -1].shape[0], vl_df[vl_df['start'] == -1].shape[0]

(69, 15)

In [16]:
tr_df[tr_df['start'] == -1]

Unnamed: 0,answer_start,text,question,id,context,title,c_id,start,end
87,92,"march 14, 2013",on what date was the 2013 human development re...,56de31984396321400ee2672,some countries were not included for various r...,Human_Development_Index,2185,-1,-1
3133,0,catalan sociolinguistics,what field studies the placement of catalan in...,56e17f5de3433e1400422f8c,"in central catalan, unstressed vowels reduce t...",Catalan_language,3470,-1,-1
3983,168,the valencian academy of language,what is the official regulating body of valen...,56e1b97fcd28a01900c67ad8,"valencian is classified as a western dialect, ...",Catalan_language,3488,-1,-1
6198,103,catalan,what language is the regulator meant to standa...,56e1b4decd28a01900c67a91,"in alghero, the iec has adapted its standard t...",Catalan_language,3486,-1,-1
6994,94,western catalonia,where are the provinces of lleida and tarragona?,56e1b738cd28a01900c67aae,"in 2011, the aragonese government passed a dec...",Catalan_language,3487,-1,-1
...,...,...,...,...,...,...,...,...,...
66889,113,some individual caribbean islands in the west ...,what did great britain gain in the west indies...,572e8003c246551400ce425f,"many middle and small powers in europe, unlike...",Seven_Years%27_War,15282,-1,-1
66972,446,reminiscent of the more famous and compact str...,"what is the precedent for the ""second hundred ...",572e81f2cb0c0d14000f1206,"the war was successful for great britain, whic...",Seven_Years%27_War,15283,-1,-1
67376,434,"sicily, and savoy, although sided with franco-...",who would sicily and savoy normally align with?,572e8578c246551400ce42bd,"realizing that war was imminent, prussia preem...",Seven_Years%27_War,15281,-1,-1
69867,56,uniformity,what do the dialects of catalan feature?,56e180f5e3433e1400422f96,catalan sociolinguistics studies the situation...,Catalan_language,3471,-1,-1


In [17]:
vl_df[vl_df['start'] == -1]

Unnamed: 0,answer_start,text,question,id,context,title,c_id,start,end
171,166,barcelona province,in what densely populated area is it spoken?,56e18a90e3433e1400422fac,western catalan comprises the two dialects of ...,Catalan_language,3474,-1,-1
1536,3,the balearic islands,where is iec's standard used?,56e1a3cbe3433e1400423066,"standard catalan, virtually accepted by all sp...",Catalan_language,3484,-1,-1
3007,162,merged,what have a and e done in eastern dialects?,56e18710cd28a01900c679b9,the dialects of the catalan language feature a...,Catalan_language,3472,-1,-1
4925,114,la franja,where is the catalan speaking part of aragon?,56e1b4decd28a01900c67a8e,"in alghero, the iec has adapted its standard t...",Catalan_language,3486,-1,-1
5782,69,seven,how many stressed phonemes are there in catalan?,56e18bfbe3433e1400422fb5,central catalan is considered the standard pro...,Catalan_language,3468,-1,-1
5897,118,treatment of unstressed a and e,what is the major difference between the two b...,56e18710cd28a01900c679b7,the dialects of the catalan language feature a...,Catalan_language,3472,-1,-1
5937,50,vulgar latin,what is the vowel system of catalan?,56e18bfbe3433e1400422fb4,western catalan comprises the two dialects of ...,Catalan_language,3468,-1,-1
6561,3,alghero,where has the iec adapted its standard to the ...,56e1b264e3433e14004230a6,the most notable difference between both stand...,Catalan_language,3485,-1,-1
7378,246,to later conflicts like the napoleonic wars,what was a later conflict that some considered...,572e81f2cb0c0d14000f1207,"the war was successful for great britain, whic...",Seven_Years%27_War,15283,-1,-1
11252,176,section pronunciation,where do you find dialectic vowel reductions?,56e17b08cd28a01900c679af,catalan has inherited the typical vowel system...,Catalan_language,3469,-1,-1


In [18]:
# we get rid of samples where the answer doesn't match the context
tr_df = tr_df[tr_df['start'] != -1]
vl_df = vl_df[vl_df['start'] != -1]

In [19]:
tr_df = tokenize(tr_df, tokenizer, char_tokenizer)
vl_df = tokenize(vl_df, tokenizer, char_tokenizer)

In [20]:
tr_df.head()

Unnamed: 0,answer_start,text,question,id,context,title,c_id,start,end,tokenized_question,tokenized_context,char_tokenized_question,char_tokenized_context
0,154,phillies,what team had dallas green managed in 1980?,572667e6708984140094c4f9,"after over a dozen more subpar seasons, in 198...",Chicago_Cubs,8880,29,30,"[11, 309, 49, 11808, 646, 2132, 6, 2627, 9]","[61, 83, 10, 6737, 62, 70020, 1740, 3, 6, 3372...","[[20, 11, 5, 4], [4, 3, 5, 16], [11, 5, 13], [...","[[5, 17, 4, 3, 10], [8, 24, 3, 10], [5], [13, ..."
1,156,rudy giuliani,which candidate withdrew from the presidential...,56dec2483277331400b4d712,schwarzenegger's endorsement in the republican...,Arnold_Schwarzenegger,2311,23,25,"[27, 2789, 4161, 23, 2, 1534, 698, 6, 417, 4, ...","[1084, 19, 9106, 6, 2, 1467, 477, 4, 2, 420, 1...","[[20, 11, 6, 14, 11], [14, 5, 7, 13, 6, 13, 5,...","[[9, 14, 11, 20, 5, 10, 39, 3, 7, 3, 19, 19, 3..."
2,224,wild ones outside the area,captive animals can distinguish co-inhabitats ...,5726e5995951b619008f81bb,it has been observed that well-fed predator an...,Predation,9822,38,43,"[11888, 727, 65, 3733, 419169, 23, 11, 48, 136...","[30, 40, 59, 2316, 20, 63225, 4421, 727, 6, 10...","[[14, 5, 18, 4, 6, 24, 3], [5, 7, 6, 16, 5, 12...","[[6, 4], [11, 5, 9], [22, 3, 3, 7], [8, 22, 9,..."
3,919,the battle of trafalgar,the results of which battle allowed the britis...,5726486f708984140094c157,"after returning from egypt, napoleon engineere...",Napoleon,8418,158,162,"[2, 1324, 4, 27, 326, 495, 2, 132, 8, 6280, 15...","[61, 3986, 23, 598, 3, 545, 9789, 10, 2313, 6,...","[[4, 11, 3], [10, 3, 9, 15, 12, 4, 9], [8, 17]...","[[5, 17, 4, 3, 10], [10, 3, 4, 15, 10, 7, 6, 7..."
4,382,hanged,how was vesey executed in 1822?,5730299db2c2fd14005689a7,"by 1820, charleston's population had grown to ...","Charleston,_South_Carolina",15719,74,75,"[44, 13, 25121, 2181, 6, 10202, 9]","[18, 9015, 3, 1909, 19, 104, 49, 2555, 8, 2106...","[[11, 8, 20], [20, 5, 9], [24, 3, 9, 3, 21], [...","[[22, 21], [28, 40, 31, 29], [23], [14, 11, 5,..."


We display some useful stats in order to define the padding size (at the word and character level, for both question and context)

In [21]:
print(tr_df['tokenized_question'].str.len().describe())
vl_df['tokenized_question'].str.len().describe()

count    70010.000000
mean        11.275532
std          3.715821
min          1.000000
25%          9.000000
50%         11.000000
75%         13.000000
max         60.000000
Name: tokenized_question, dtype: float64


count    17505.000000
mean        11.335504
std          3.754207
min          1.000000
25%          9.000000
50%         11.000000
75%         13.000000
max         38.000000
Name: tokenized_question, dtype: float64

In [22]:
print(tr_df['tokenized_question'].str.len().quantile(0.99))
vl_df['tokenized_question'].str.len().quantile(0.99)

22.0


23.0

In [23]:
print(tr_df['tokenized_context'].str.len().describe())
vl_df['tokenized_context'].str.len().describe()

count    70010.000000
mean       137.824439
std         56.941382
min         22.000000
25%        102.000000
50%        127.000000
75%        164.000000
max        766.000000
Name: tokenized_context, dtype: float64


count    17505.000000
mean       137.211083
std         55.912622
min         22.000000
25%        102.000000
50%        126.000000
75%        163.000000
max        766.000000
Name: tokenized_context, dtype: float64

In [24]:
print(tr_df['tokenized_context'].str.len().quantile(0.99))
vl_df['tokenized_context'].str.len().quantile(0.99)

324.0


324.0

In [25]:
def len_words(dataset):

  """
  collect the length (number of characters) of every word of the dataset

  returns two series: the lengths of the question words and the lengths
  of the context words
  """

  question_lengths = []
  context_lengths = []

  for _, row in dataset.iterrows():
    question_lengths.extend(len(word) for word in row['char_tokenized_question'])
    context_lengths.extend(len(word) for word in row['char_tokenized_context'])

  return pd.Series(question_lengths), pd.Series(context_lengths)

t_q,t_c = len_words(tr_df)
v_q,v_c = len_words(vl_df)

In [26]:
print(t_q.describe())
t_c.describe()

count    789400.000000
mean          4.447926
std           2.677579
min           1.000000
25%           2.000000
50%           4.000000
75%           6.000000
max          30.000000
dtype: float64


count    9.649089e+06
mean     4.626070e+00
std      2.969605e+00
min      1.000000e+00
25%      2.000000e+00
50%      4.000000e+00
75%      7.000000e+00
max      3.700000e+01
dtype: float64

In [27]:
print(v_q.describe())
v_c.describe()

count    198428.000000
mean          4.453232
std           2.686696
min           1.000000
25%           2.000000
50%           4.000000
75%           6.000000
max          24.000000
dtype: float64


count    2.401880e+06
mean     4.629710e+00
std      2.972670e+00
min      1.000000e+00
25%      2.000000e+00
50%      4.000000e+00
75%      7.000000e+00
max      3.700000e+01
dtype: float64

In [28]:
print(t_q.quantile(0.99))
t_c.quantile(0.99)

12.0


13.0

In [29]:
print(v_q.quantile(0.99))
v_c.quantile(0.99)

12.0


13.0

There are obviously some outliers. We are compelled to get rid of some samples because of memory issues.

We will get rid of contexts that have more than 400 words and questions that have more than 25 words.

We will set the length of a word to 15 characters

**EDIT :** These numbers are huge but we won't get out of memory errors if we build a sequence generator. If you don't want to use the sequence generator, you should reduce these numbers.

**EDIT :** Now that we use a sequence generator, we could define `*_MAXLEN` variables according to the stats provided by the training set

In [30]:
QUESTION_MAXLEN = 25
CONTEXT_MAXLEN = 400
WORD_MAXLEN = 15
BATCH_SIZE = 10

In [31]:
tr_df.shape, vl_df.shape

((70010, 13), (17505, 13))

In [32]:
# `start` and `end` are one-hot encoded over CONTEXT_MAXLEN classes by the SQUAD_dataset
# iterator, so they must be strictly lower than CONTEXT_MAXLEN (valid classes are
# 0 .. CONTEXT_MAXLEN - 1); `end` is exclusive and can reach the length of the context
tr_df = tr_df[
    (tr_df['tokenized_question'].str.len() <= QUESTION_MAXLEN)
    & (tr_df['tokenized_context'].str.len() <= CONTEXT_MAXLEN)
    & (tr_df['start'] < CONTEXT_MAXLEN)
    & (tr_df['end'] < CONTEXT_MAXLEN)
].reset_index(drop = True)
vl_df = vl_df[
    (vl_df['tokenized_question'].str.len() <= QUESTION_MAXLEN)
    & (vl_df['tokenized_context'].str.len() <= CONTEXT_MAXLEN)
    & (vl_df['start'] < CONTEXT_MAXLEN)
    & (vl_df['end'] < CONTEXT_MAXLEN)
].reset_index(drop = True)

In [33]:
tr_df.shape[0], vl_df.shape[0]

(69606, 17413)

In [34]:
 print(f' we get rid of : {SAMPLES - (tr_df.shape[0] + vl_df.shape[0])} samples')

 we get rid of : 580 samples


In [35]:
# save datasets in json format
path_to_train_set = os.path.join(os.getcwd(), 'drive/MyDrive/NLP/BIDAF/utils/data/train_set.json')
df_to_json(tr_df, path_to_train_set)

path_to_valid_set = os.path.join(os.getcwd(), 'drive/MyDrive/NLP/BIDAF/utils/data/valid_set.json')
df_to_json(vl_df, path_to_valid_set)

dataset saved in /content/drive/MyDrive/NLP/BIDAF/utils/data/train_set.json
dataset saved in /content/drive/MyDrive/NLP/BIDAF/utils/data/valid_set.json


In [36]:
# we save both tokenizers
tokenizers_folder = os.path.join(os.getcwd(), 'drive/MyDrive/NLP/BIDAF/utils', 'tokenizers')
if not os.path.exists(tokenizers_folder):
  os.makedirs(tokenizers_folder)

path_word_tokenizer = os.path.join(tokenizers_folder, 'word_tokenizer.pkl')
with open(path_word_tokenizer, 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol = pickle.HIGHEST_PROTOCOL)

path_char_tokenizer = os.path.join(tokenizers_folder, 'char_tokenizer.pkl')
with open(path_char_tokenizer, 'wb') as handle:
    pickle.dump(char_tokenizer, handle, protocol = pickle.HIGHEST_PROTOCOL)

We create the iterator. The iterator allows us to work with much bigger data, because the padded batches are built only when they are requested

In [37]:
# utils/datasets/dataset.py
class SQUAD_dataset(tf.keras.utils.Sequence):

  """
  utility class to create a working dataset that
  can be given to a neural network

  the tokenized dataframe is cut into batches of `batch_size` rows, the
  padded arrays of a batch are only built when the batch is requested
  """

  def __init__(self, data, question_maxlen, context_maxlen, word_maxlen, batch_size, with_answer = True):
    """
    data: tokenized dataframe (`id`, `tokenized_*`, `char_tokenized_*`
          columns, plus `start` / `end` when with_answer is True)
    question_maxlen / context_maxlen: padded number of words
    word_maxlen: padded number of characters per word
    batch_size: number of samples per batch
    with_answer: whether the batches also hold the encoded spans
    """
    self.QUESTION_MAXLEN = question_maxlen
    self.CONTEXT_MAXLEN = context_maxlen
    self.WORD_MAXLEN = word_maxlen
    self.batch_size = batch_size
    self.with_answer = with_answer
    self.__get_batches(data)

  def __len__(self):
    # number of batches
    return len(self.batches)

  def __get_batches(self, data):
    # consecutive slices of batch_size rows, the last one may be smaller
    batches = [data[i:i+self.batch_size] for i in range(0, len(data), self.batch_size)]
    self.batches = batches

  def __repr__(self):
    template = '''SQUAD_dataset : questions : ({0}, {1}), contexts : ({0}, {2}), char_questions : ({0}, {1}, {3}), char_contexts : ({0}, {2}, {3}), id : ({0}, 1)'''.format(self.batch_size, self.QUESTION_MAXLEN, self.CONTEXT_MAXLEN, self.WORD_MAXLEN)
    return template

  @classmethod
  def from_file(cls, path):
    """
    load a dataset saved with `to_pickle` (path relative to the working directory)
    """
    path = os.path.join(os.getcwd(), path)
    # NOTE: unpickling executes arbitrary code, only load files you created yourself
    with open(path, 'rb') as handle:
      dataset = pickle.load(handle)
    return dataset

  def to_pickle(self, path):
    """
    save the dataset with pickle (path relative to the working directory),
    the missing folders are created
    """
    path = os.path.join(os.getcwd(), path)
    folder = os.path.dirname(path)

    os.makedirs(folder, exist_ok = True)

    with open(path, 'wb') as handle:
      pickle.dump(self, handle, protocol = pickle.HIGHEST_PROTOCOL)

  def _pad_characters(self, char_sequences, max_words):
    """
    build the (samples, max_words, WORD_MAXLEN) int32 array of character
    ids: words are padded / truncated at the end to WORD_MAXLEN characters,
    missing words are filled with zeros
    """
    padded = np.zeros((len(char_sequences), max_words, self.WORD_MAXLEN), dtype = np.int32)

    for position, words in enumerate(char_sequences):
      if len(words) == 0:
        continue
      chars = tf.keras.preprocessing.sequence.pad_sequences(words, padding = 'post', maxlen = self.WORD_MAXLEN, truncating = 'post')
      # a text longer than max_words keeps its last words, like the default
      # ('pre') truncation applied to the word level sequences
      chars = chars[-max_words:]
      padded[position, :chars.shape[0]] = chars

    return padded

  def __getitem__(self, idx):
    """
    return the batch number idx:
    (q_w, c_w, q_c, c_c), (y_start, y_end), (id,) with answers,
    (q_w, c_w, q_c, c_c), (id,) without
    """
    batch = self.batches[idx].reset_index(drop = True)

    ids = np.asarray(batch['id'])

    # questions and contexts words padding
    q_w = tf.keras.preprocessing.sequence.pad_sequences(batch['tokenized_question'], padding = 'post', maxlen = self.QUESTION_MAXLEN)
    c_w = tf.keras.preprocessing.sequence.pad_sequences(batch['tokenized_context'], padding = 'post', maxlen = self.CONTEXT_MAXLEN)

    # questions and contexts chars padding
    q_c = self._pad_characters(batch['char_tokenized_question'], self.QUESTION_MAXLEN)
    c_c = self._pad_characters(batch['char_tokenized_context'], self.CONTEXT_MAXLEN)

    # one hot encode start and end
    if self.with_answer:
      y_start = tf.keras.utils.to_categorical(batch['start'].values, self.CONTEXT_MAXLEN)
      y_end = tf.keras.utils.to_categorical(batch['end'].values, self.CONTEXT_MAXLEN)

      # (inputs), (outputs), (id)
      return (q_w, c_w, q_c, c_c), (y_start, y_end), (ids,)
    # (inputs), (id): the first input is the word level question (q_w)
    return (q_w, c_w, q_c, c_c), (ids,)

In [38]:
tr_data = SQUAD_dataset(tr_df, batch_size = BATCH_SIZE, question_maxlen = QUESTION_MAXLEN, context_maxlen = CONTEXT_MAXLEN, word_maxlen = WORD_MAXLEN)
vl_data = SQUAD_dataset(vl_df, batch_size = BATCH_SIZE, question_maxlen = QUESTION_MAXLEN, context_maxlen = CONTEXT_MAXLEN, word_maxlen = WORD_MAXLEN)

In [39]:
# number of batches
print(len(tr_data))
len(vl_data)

6961


1742

In [40]:
tr_data

SQUAD_dataset : questions : (10, 25), contexts : (10, 400), char_questions : (10, 25, 15), char_contexts : (10, 400, 15), id : (10, 1)

In [41]:
tr_data.to_pickle('drive/MyDrive/NLP/BIDAF/utils/datasets/train_dataset.pkl')
vl_data.to_pickle('drive/MyDrive/NLP/BIDAF/utils/datasets/valid_dataset.pkl')

Now that the preprocessing is over, we can preprocess a (mock) unseen dataset. It is basically the same as the one we have just seen, but it does not contain the start and end span (text and answer_start fields)

In [42]:
# with_answer = False to parse a dataset with no answer
unseen_dataset = load_dataset(SQUAD_PATH, with_answer = False)

Reading the json file
[INFO] processing...
[INFO] there are 87599 questions with single answer
[INFO] there are 18891 different contexts
[INFO] there are 442 unrelated subjects
[INFO] Done


In [43]:
s = unseen_dataset.shape[0]

In [44]:
unseen_dataset = clean_dataset(unseen_dataset, with_answer = False)
unseen_dataset = tokenize(unseen_dataset, tokenizer, char_tokenizer)
unseen_dataset = unseen_dataset[(unseen_dataset['tokenized_question'].str.len() <= QUESTION_MAXLEN) & (unseen_dataset['tokenized_context'].str.len() <= CONTEXT_MAXLEN)].reset_index(drop = True)
print(f' we get rid of : {s - (unseen_dataset.shape[0])} samples')

 we get rid of : 496 samples


In [45]:
unseen_path = os.path.join(os.getcwd(), 'drive/MyDrive/NLP/BIDAF/utils/data/unseen_set.json')
df_to_json(unseen_dataset, unseen_path, with_answer = False)

dataset saved in /content/drive/MyDrive/NLP/BIDAF/utils/data/unseen_set.json


In [46]:
unseen_data = SQUAD_dataset(unseen_dataset, batch_size = BATCH_SIZE, question_maxlen = QUESTION_MAXLEN, context_maxlen = CONTEXT_MAXLEN, word_maxlen = WORD_MAXLEN, with_answer = False)

In [47]:
unseen_data

SQUAD_dataset : questions : (10, 25), contexts : (10, 400), char_questions : (10, 25, 15), char_contexts : (10, 400, 15), id : (10, 1)

In [48]:
unseen_data.to_pickle('drive/MyDrive/NLP/BIDAF/utils/datasets/unseen_dataset.pkl')