<a href="https://colab.research.google.com/github/CorentinMAG/NLP/blob/main/BIDAF/bidaf_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Before running this notebook, you should:
* mount your Google Drive
* go to the end of the notebook and adjust the output paths (`numpy.save()` does not create missing folders, so the target directories must already exist)


In [1]:
import json
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk
from nltk import word_tokenize
nltk.download('punkt')
import gensim.downloader as gloader
from sklearn.model_selection import train_test_split
import re
import pickle

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
# Dimension of the GloVe vectors; get_tokenizer passes it to download_glove_model.
EMBEDDING_SIZE = 300

def load_dataset(path, record_path = ['data', 'paragraphs', 'qas', 'answers'], verbose = True):
  """Load a SQuAD-style json file into a flat dataframe with one row per answer.

  Args:
    path: location of the json file (read as UTF-8).
    record_path: keys leading from the json root down to the answers; the
      shorter prefixes are used to read the questions, the paragraphs and the
      top-level entries.
    verbose: print progress and a few statistics when True.

  Returns:
    A dataframe with the question id (column `index`), `question`, `context`,
    the answer fields (`answer_start`, `text`) and `c_id`, an integer
    identifying each distinct context.

  Note:
    Questions and answers are joined on the question id, so this presumably
    expects exactly one answer per question (as in the training set) - verify
    before using it on a file with several answers per question.
  """
  if verbose:
    print("Reading the json file")
  # Decode explicitly as UTF-8 so a wrongly encoded file raises instead of being
  # read with the platform default; the context manager closes the handle.
  with open(path, encoding = 'utf-8') as json_file:
    file = json.load(json_file)

  if verbose:
    print("[INFO] processing...")

  # parsing the different levels of the json file
  js = pd.json_normalize(file, record_path)       # one row per answer
  m = pd.json_normalize(file, record_path[:-1])   # one row per question
  r = pd.json_normalize(file, record_path[:-2])   # one row per paragraph
  t = pd.json_normalize(file, record_path[0])     # one row per top-level entry

  # combining it into a single dataframe:
  # repeat each context once per question and each question id once per answer
  idx = np.repeat(r['context'].values, r.qas.str.len())
  ndx = np.repeat(m['id'].values, m['answers'].str.len())
  m['context'] = idx
  js['q_idx'] = ndx
  # `axis` has to be given by keyword: recent pandas releases reject it positionally
  main = pd.concat(
      [m[['id', 'question', 'context']].set_index('id'), js.set_index('q_idx')],
      axis = 1,
      sort = False,
  ).reset_index()
  main['c_id'] = main['context'].factorize()[0]
  if verbose:
    print(f"[INFO] there are {main.shape[0]} questions with single answer")
    # counting the distinct ids directly avoids summing every column per group
    print(f"[INFO] there are {main['c_id'].nunique()} different contexts")
    print(f"[INFO] there are {len(t)} unrelated subjects")
    print("[INFO] Done")
  return main

def download_glove_model(embedding_dimension = 50):
  """Load the `glove-wiki-gigaword-<dimension>` vectors through the gensim downloader.

  Args:
    embedding_dimension: size of the vectors to load.

  Returns:
    The embedding model returned by `gloader.load`.

  Raises:
    ValueError: re-raised from `gloader.load` (presumably for an unknown model
      name) after printing the available sizes.
  """
  model_name = f"glove-wiki-gigaword-{embedding_dimension}"
  try:
    print(f'[INFO] downloading glove {embedding_dimension}')
    vectors = gloader.load(model_name)
    print('[INFO] done !')
  except ValueError as error:
    # remind the caller of the sizes for which a model exists, then propagate
    print("Glove: 50, 100, 200, 300")
    raise error
  return vectors

In [3]:
# Training file on the mounted drive, resolved relative to the current working directory.
dataset_path = os.path.join(os.getcwd(),'drive/MyDrive/NLP/SQUAD_project/data/training_set.json')
# Flat dataframe with one row per question / answer pair.
squad_dataset = load_dataset(dataset_path)

Reading the json file
[INFO] processing...
[INFO] there are 87599 questions with single answer
[INFO] there are 18891 different contexts
[INFO] there are 442 unrelated subjects
[INFO] Done


In [4]:
squad_dataset.head()

Unnamed: 0,index,question,context,answer_start,text,c_id
0,5733be284776f41900661182,To whom did the Virgin Mary allegedly appear i...,"Architecturally, the school has a Catholic cha...",515,Saint Bernadette Soubirous,0
1,5733be284776f4190066117f,What is in front of the Notre Dame Main Building?,"Architecturally, the school has a Catholic cha...",188,a copper statue of Christ,0
2,5733be284776f41900661180,The Basilica of the Sacred heart at Notre Dame...,"Architecturally, the school has a Catholic cha...",279,the Main Building,0
3,5733be284776f41900661181,What is the Grotto at Notre Dame?,"Architecturally, the school has a Catholic cha...",381,a Marian place of prayer and reflection,0
4,5733be284776f4190066117e,What sits on top of the Main Building at Notre...,"Architecturally, the school has a Catholic cha...",92,a golden statue of the Virgin Mary,0


In [5]:
# Number of rows before any filtering; used later to report how many samples were dropped.
SAMPLES = squad_dataset.shape[0]

def preprocess_sentence(text):
  """Normalise a string: lower-case it, then remove leading and trailing whitespace."""
  return text.lower().strip()

def clean_dataset(dataset):
  """Return a copy of `dataset` whose text columns went through `preprocess_sentence`.

  The `question`, `text` and `context` columns are cleaned; every other column
  and the index are left untouched. The input dataframe is not modified.

  Args:
    dataset: dataframe with `question`, `text` and `context` columns.

  Returns:
    The cleaned copy.
  """
  _dataset = dataset.copy()

  _dataset['question'] = _dataset['question'].apply(preprocess_sentence)
  _dataset['text'] = _dataset['text'].apply(preprocess_sentence)

  # Many rows share the same context: clean each distinct context once, then map
  # the result back row by row. Unlike rebuilding the column with np.repeat,
  # this does not depend on the rows being grouped by context nor on the index
  # being 0..n-1, so a context can never be assigned to the wrong row.
  cleaned_contexts = {
      context: preprocess_sentence(context)
      for context in _dataset['context'].unique()
  }
  _dataset['context'] = _dataset['context'].map(cleaned_contexts)

  return _dataset

In [6]:
squad = clean_dataset(squad_dataset)

In [7]:
def get_tokenizer(dataset, glove_model = None):
  """Build the word-level and character-level tokenizers.

  The word vocabulary is made of the GloVe vocabulary plus every token found in
  the questions and (distinct) contexts of `dataset`. The character tokenizer
  is fitted on the raw question and context strings.

  Args:
    dataset: dataframe with `question` and `context` columns.
    glove_model: an already loaded embedding model; when None the model of size
      EMBEDDING_SIZE is downloaded.

  Returns:
    (tokenizer, char_tokenizer)
  """
  tokenizer = tf.keras.preprocessing.text.Tokenizer(filters = '')
  # we will only keep the 200 - 1 most frequent characters (otherwise oom issue),
  # other tokens are replaced by the UNK token:
  # we keep indices 1 - 199 and 1 is the UNK token (so we keep 198 characters)
  char_tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level = True, filters = '', oov_token = 'UNK', num_words = 200)

  if glove_model is None:
    glove_model = download_glove_model(EMBEDDING_SIZE)

  # gensim >= 4 exposes the vocabulary as `index_to_key`; older releases only
  # have `index2entity`, so fall back to it when the new attribute is missing
  glove_vocabulary = getattr(glove_model, 'index_to_key', None)
  if glove_vocabulary is None:
    glove_vocabulary = glove_model.index2entity

  tokenized_questions = dataset['question'].apply(word_tokenize).to_list()

  # each context is shared by several questions: tokenize the distinct ones only
  contexts = pd.Series(dataset['context'].unique())
  tokenized_contexts = contexts.apply(word_tokenize).to_list()

  sequences = list(glove_vocabulary) + tokenized_questions + tokenized_contexts

  tokenizer.fit_on_texts(sequences)
  # element-wise concatenation: one "question + context" string per row
  char_tokenizer.fit_on_texts(dataset['question'] + dataset['context'])

  return tokenizer, char_tokenizer


def update_tokenizer(dataset, tokenizer, char_tokenizer):
  """Extend both tokenizers, in place, with the vocabulary found in `dataset`.

  Args:
    dataset: dataframe with `question` and `context` columns.
    tokenizer: word-level tokenizer, fitted on the tokenized questions and
      distinct contexts.
    char_tokenizer: character-level tokenizer, fitted on the raw
      "question + context" strings.

  Returns:
    None, the tokenizers are updated in place.

  NOTE(review): Keras' Tokenizer appears to rebuild `word_index` from the
  accumulated counts on every `fit_on_texts` call, so this should run before
  any text is converted to indices - confirm.
  """
  tokenized_questions = dataset['question'].apply(word_tokenize).to_list()

  # each context is shared by several questions: tokenize the distinct ones only
  contexts = pd.Series(dataset['context'].unique())
  tokenized_contexts = contexts.apply(word_tokenize).to_list()

  tokenizer.fit_on_texts(tokenized_questions + tokenized_contexts)

  char_tokenizer.fit_on_texts(dataset['question'] + dataset['context'])


def get_start_end(row):
  """Add the token-level answer span (`start`, `end`) to a dataset row.

  `start` is the number of tokens found before the answer in the context and
  `end` is `start` plus the number of tokens of the answer (i.e. exclusive).

  The character position of the answer is taken from `answer_start` when the
  row has one and the context really contains the answer at that offset;
  otherwise the first occurrence of the answer in the context is used. When the
  answer cannot be found at all, both `start` and `end` are set to -1 so that
  such rows can be detected and discarded.

  Args:
    row: mapping / Series with `context` and `text`, and optionally
      `answer_start`.

  Returns:
    The same row, with `start` and `end` set.
  """
  context = row['context']
  answer = row['text']

  # the annotated offset disambiguates answers that occur several times
  annotated = row.get('answer_start')
  try:
    annotated = int(annotated)
  except (TypeError, ValueError):
    annotated = None  # missing or NaN offset

  if annotated is not None and annotated >= 0 and context[annotated:annotated + len(answer)] == answer:
    char_start = annotated
  else:
    # the offset is absent or no longer valid (e.g. the text was stripped)
    char_start = context.find(answer)

  if char_start == -1:
    # slicing with -1 would silently tokenize almost the whole context
    row['start'] = -1
    row['end'] = -1
    return row

  start = len(word_tokenize(context[:char_start]))
  end = start + len(word_tokenize(answer))

  row['start'] = start
  row['end'] = end

  return row

def tokenize(dataset, tokenizer, char_tokenizer):
  """Return a copy of `dataset` with word-level and character-level index sequences.

  Added columns:
    tokenized_question / tokenized_context: one list of word indices per row.
    char_tokenized_question / char_tokenized_context: for each row, one list of
      character indices per word.

  Args:
    dataset: dataframe with `question` and `context` columns.
    tokenizer: fitted word-level tokenizer.
    char_tokenizer: fitted character-level tokenizer.

  Returns:
    The augmented copy; the input dataframe is not modified.
  """
  _dataset = dataset.copy()

  tokenized_questions = _dataset['question'].apply(word_tokenize).to_list()
  tokenized_contexts = _dataset['context'].apply(word_tokenize).to_list()

  t_q = tokenizer.texts_to_sequences(tokenized_questions)
  t_c = tokenizer.texts_to_sequences(tokenized_contexts)

  # every word is passed as a "text", which yields one character sequence per word
  c_q = [char_tokenizer.texts_to_sequences(question) for question in tokenized_questions]
  c_c = [char_tokenizer.texts_to_sequences(context) for context in tokenized_contexts]

  # The lists are in row order: attach the dataframe's own index so that the
  # assignment stays positional whatever the index is (a Series with a default
  # 0..n-1 index would be re-aligned and produce NaN / shifted rows).
  row_index = _dataset.index
  _dataset['tokenized_question'] = pd.Series(t_q, index = row_index)
  _dataset['tokenized_context'] = pd.Series(t_c, index = row_index)

  _dataset['char_tokenized_question'] = pd.Series(c_q, index = row_index)
  _dataset['char_tokenized_context'] = pd.Series(c_c, index = row_index)

  return _dataset

def split(dataset, test_size = 0.2, random_state = 42):
  """Split `dataset` into a training part and a validation part.

  Args:
    dataset: the dataframe to split.
    test_size: forwarded to `train_test_split`.
    random_state: forwarded to `train_test_split`; a fixed value makes the
      split deterministic.

  Returns:
    (train, validation), both re-indexed from 0.
  """
  train_part, valid_part = train_test_split(dataset, test_size = test_size, random_state = random_state)
  return train_part.reset_index(drop = True), valid_part.reset_index(drop = True)

def convert(context , coord = None, tokenizer = None):
  """Turn a context, or a span of it, back into readable text.

  Args:
    context: either a string or a sequence of word indices.
    coord: optional (start, end) token positions, `end` being exclusive. When
      given (and truthy) only that span is returned.
    tokenizer: object exposing `index_word`; needed when `context` is a
      sequence of indices.

  Returns:
    The words joined with single spaces and stripped; a string context without
    `coord` is returned unchanged.
  """
  is_text = type(context) is str

  if not coord:
    if is_text:
      return context
    return ' '.join(tokenizer.index_word[t] for t in context).strip()

  start, end = coord[0], coord[1]
  if is_text:
    words = word_tokenize(context)
    return ' '.join(words[start:end]).strip()

  # index position by position so that an out-of-range span still raises
  return ' '.join(tokenizer.index_word[context[i]] for i in range(start, end)).strip()

In [8]:
tr_df, vl_df = split(squad)

In [9]:
tr_df.shape[0],vl_df.shape[0]

(70079, 17520)

Our vocabulary is based on the GloVe vocabulary, to which we add the terms found in the training set.

In [10]:
tokenizer, char_tokenizer = get_tokenizer(tr_df)

[INFO] downloading glove 300


In [11]:
print(len(tokenizer.word_index))
len(char_tokenizer.word_index)

429063


1263

We then update our vocabulary with terms from the validation set

In [12]:
update_tokenizer(vl_df, tokenizer, char_tokenizer)

In [13]:
print(len(tokenizer.word_index))
len(char_tokenizer.word_index)

429757


1265

In [14]:
# take a while: get_start_end is applied row by row (axis = 1) and tokenizes text on every call

# adds the token-level 'start' and 'end' columns to both splits
tr_df = tr_df.apply(get_start_end, axis = 1)
vl_df = vl_df.apply(get_start_end, axis = 1)

We get rid of samples where the answer does not match the context (there may be a typo in the answer or in the context).  
To avoid discarding many samples, we could lemmatize / stem the text.   
Lemmatization is the better choice for our task, but a really accurate lemmatization requires POS tagging first.

In [15]:
# Number of rows flagged with start == -1 in the training / validation splits.
# NOTE(review): this only detects mismatches if get_start_end reports an unmatched answer as start == -1 - confirm.
tr_df[tr_df['start'] == -1].shape[0], vl_df[vl_df['start'] == -1].shape[0]

(0, 0)

In [16]:
tr_df = tokenize(tr_df, tokenizer, char_tokenizer)
vl_df = tokenize(vl_df, tokenizer, char_tokenizer)

In [17]:
tr_df.head()

Unnamed: 0,index,question,context,answer_start,text,c_id,start,end,tokenized_question,tokenized_context,char_tokenized_question,char_tokenized_context
0,572667e6708984140094c4f9,what team had dallas green managed in 1980?,"after over a dozen more subpar seasons, in 198...",154,phillies,8880,29,30,"[10, 308, 48, 11807, 645, 2131, 5, 2626, 8]","[60, 82, 9, 6736, 61, 70019, 1739, 2, 5, 3371,...","[[20, 11, 5, 4], [4, 3, 5, 16], [11, 5, 13], [...","[[5, 17, 4, 3, 10], [8, 24, 3, 10], [5], [13, ..."
1,56dec2483277331400b4d712,which candidate withdrew from the presidential...,schwarzenegger's endorsement in the republican...,156,rudy giuliani,2311,23,25,"[26, 2788, 4160, 22, 1, 1533, 697, 5, 416, 3, ...","[1083, 18, 9105, 5, 1, 1466, 476, 3, 1, 419, 1...","[[20, 11, 6, 14, 11], [14, 5, 7, 13, 6, 13, 5,...","[[9, 14, 11, 20, 5, 10, 39, 3, 7, 3, 19, 19, 3..."
2,5726e5995951b619008f81bb,captive animals can distinguish co-inhabitats ...,it has been observed that well-fed predator an...,224,wild ones outside the area,9822,38,43,"[11887, 726, 64, 3732, 419168, 22, 10, 47, 135...","[29, 39, 58, 2315, 19, 63224, 4420, 726, 5, 9,...","[[14, 5, 18, 4, 6, 24, 3], [5, 7, 6, 16, 5, 12...","[[6, 4], [11, 5, 9], [22, 3, 3, 7], [8, 22, 9,..."
3,5726486f708984140094c157,the results of which battle allowed the britis...,"after returning from egypt, napoleon engineere...",919,the battle of trafalgar,8418,158,162,"[1, 1323, 3, 26, 325, 494, 1, 131, 7, 6279, 15...","[60, 3985, 22, 597, 2, 544, 9788, 9, 2312, 5, ...","[[4, 11, 3], [10, 3, 9, 15, 12, 4, 9], [8, 17]...","[[5, 17, 4, 3, 10], [10, 3, 4, 15, 10, 7, 6, 7..."
4,5730299db2c2fd14005689a7,how was vesey executed in 1822?,"by 1820, charleston's population had grown to ...",382,hanged,15719,74,75,"[43, 12, 25120, 2180, 5, 10201, 8]","[17, 9014, 2, 1908, 18, 103, 48, 2554, 7, 2106...","[[11, 8, 20], [20, 5, 9], [24, 3, 9, 3, 21], [...","[[22, 21], [27, 40, 29, 28], [23], [14, 11, 5,..."


In [18]:
print(tr_df['tokenized_question'].str.len().describe())
vl_df['tokenized_question'].str.len().describe()

count    70079.000000
mean        11.273962
std          3.715555
min          1.000000
25%          9.000000
50%         11.000000
75%         13.000000
max         60.000000
Name: tokenized_question, dtype: float64


count    17520.000000
mean        11.333961
std          3.753941
min          1.000000
25%          9.000000
50%         11.000000
75%         13.000000
max         38.000000
Name: tokenized_question, dtype: float64

In [19]:
print(tr_df['tokenized_context'].str.len().describe())
vl_df['tokenized_context'].str.len().describe()

count    70079.000000
mean       137.777323
std         56.960478
min         22.000000
25%        102.000000
50%        127.000000
75%        164.000000
max        766.000000
Name: tokenized_context, dtype: float64


count    17520.000000
mean       137.163071
std         55.942698
min         22.000000
25%        102.000000
50%        126.000000
75%        163.000000
max        766.000000
Name: tokenized_context, dtype: float64

In [20]:
def len_words(dataset):
  """Collect the length (in characters) of every word of the questions and contexts.

  Args:
    dataset: dataframe with `char_tokenized_question` and
      `char_tokenized_context` columns, each cell holding one character
      sequence per word.

  Returns:
    (question_word_lengths, context_word_lengths) as two pandas Series.
  """
  question_lengths = [
      len(word)
      for words in dataset['char_tokenized_question']
      for word in words
  ]
  context_lengths = [
      len(word)
      for words in dataset['char_tokenized_context']
      for word in words
  ]
  return pd.Series(question_lengths), pd.Series(context_lengths)

# word lengths (in characters) of the questions (t_q / v_q) and contexts (t_c / v_c) of each split
t_q,t_c = len_words(tr_df)
v_q,v_c = len_words(vl_df)

In [21]:
print(t_q.describe())
t_c.describe()

count    790068.000000
mean          4.447850
std           2.677574
min           1.000000
25%           2.000000
50%           4.000000
75%           6.000000
max          30.000000
dtype: float64


count    9.655297e+06
mean     4.626008e+00
std      2.969615e+00
min      1.000000e+00
25%      2.000000e+00
50%      4.000000e+00
75%      7.000000e+00
max      3.700000e+01
dtype: float64

In [22]:
print(v_q.describe())
v_c.describe()

count    198571.000000
mean          4.453077
std           2.686618
min           1.000000
25%           2.000000
50%           4.000000
75%           6.000000
max          24.000000
dtype: float64


count    2.403097e+06
mean     4.629671e+00
std      2.972735e+00
min      1.000000e+00
25%      2.000000e+00
50%      4.000000e+00
75%      7.000000e+00
max      3.700000e+01
dtype: float64

There are obviously some outliers. We are compelled to get rid of some samples because of memory issues.

We will get rid of contexts that have more than 300 words and questions that have more than 20 words.

We will set the maximum length of a word to 20 characters (in our dataset, the maximum length encountered is 37).

In [23]:
# maximum number of words kept per question
QUESTION_MAXLEN = 20
# maximum number of words kept per context
CONTEXT_MAXLEN = 300
# number of characters kept per word in the character-level arrays
WORD_LEN = 20

In [24]:
def _within_limits(df, question_maxlen = QUESTION_MAXLEN, context_maxlen = CONTEXT_MAXLEN):
  """Keep the rows that fit the fixed input sizes, re-indexed from 0.

  A row is kept when its question has at most `question_maxlen` tokens, its
  context has at most `context_maxlen` tokens and both answer boundaries lie in
  0 .. context_maxlen - 1: they are one-hot encoded over `context_maxlen`
  classes later on, so `context_maxlen` itself would be out of range, and a
  negative boundary would not designate a real token.
  """
  keep = (
      (df['tokenized_question'].str.len() <= question_maxlen)
      & (df['tokenized_context'].str.len() <= context_maxlen)
      & df['start'].between(0, context_maxlen - 1)
      & df['end'].between(0, context_maxlen - 1)
  )
  return df[keep].reset_index(drop = True)

tr_df = _within_limits(tr_df)
vl_df = _within_limits(vl_df)

In [25]:
tr_df.shape[0], vl_df.shape[0]

(67482, 16854)

In [26]:
 print(f' we get rid of : {SAMPLES - (tr_df.shape[0] + vl_df.shape[0])} samples')

 we get rid of : 3263 samples


In [27]:
# Write the word tokenizer straight to its destination instead of pickling it in
# the working directory and moving it with a shell magic: this is plain Python
# and does not fail when the target folder is missing.
tokenizer_dir = 'drive/MyDrive/NLP/data/'
os.makedirs(tokenizer_dir, exist_ok = True)
with open(os.path.join(tokenizer_dir, 'tokenizer.pickle'), 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [28]:
# Write the character tokenizer straight to its destination instead of pickling
# it in the working directory and moving it with a shell magic: this is plain
# Python and does not fail when the target folder is missing.
tokenizer_dir = 'drive/MyDrive/NLP/data/'
os.makedirs(tokenizer_dir, exist_ok = True)
with open(os.path.join(tokenizer_dir, 'char_tokenizer.pickle'), 'wb') as handle:
    pickle.dump(char_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [29]:
# Bring every training word-index sequence to a fixed length, padding at the end ('post').
tr_padded_questions = tf.keras.preprocessing.sequence.pad_sequences(tr_df['tokenized_question'], padding = 'post', maxlen = QUESTION_MAXLEN)
tr_padded_contexts = tf.keras.preprocessing.sequence.pad_sequences(tr_df['tokenized_context'], padding = 'post', maxlen = CONTEXT_MAXLEN)

In [30]:
np.save('drive/MyDrive/NLP/data/tr_padded_questions.npy', tr_padded_questions)
np.save('drive/MyDrive/NLP/data/tr_padded_contexts.npy', tr_padded_contexts)

In [31]:
# Bring every validation word-index sequence to a fixed length, padding at the end ('post').
vl_padded_questions = tf.keras.preprocessing.sequence.pad_sequences(vl_df['tokenized_question'], padding = 'post', maxlen = QUESTION_MAXLEN)
vl_padded_contexts = tf.keras.preprocessing.sequence.pad_sequences(vl_df['tokenized_context'], padding = 'post', maxlen = CONTEXT_MAXLEN)

In [32]:
np.save('drive/MyDrive/NLP/data/vl_padded_questions.npy', vl_padded_questions)
np.save('drive/MyDrive/NLP/data/vl_padded_contexts.npy', vl_padded_contexts)

In [33]:
# Character indices of the validation questions: (samples, QUESTION_MAXLEN, WORD_LEN), zero padded.
pad_char_c = np.zeros((vl_df.shape[0], QUESTION_MAXLEN, WORD_LEN), dtype = np.int32)

# enumerate gives the row position whatever the dataframe index is
# (Series.iteritems() no longer exists in pandas 2.x)
for i, value in enumerate(vl_df['char_tokenized_question']):
  # one row of WORD_LEN character indices per word
  # NOTE(review): pad_sequences truncates from the front by default, so a word
  # longer than WORD_LEN keeps its last characters - confirm this is intended
  v = tf.keras.preprocessing.sequence.pad_sequences(value, padding = 'post', maxlen = WORD_LEN)
  # the remaining word slots keep their initial zeros
  pad_char_c[i, :v.shape[0]] = v

np.save('drive/MyDrive/NLP/data/vl_char_padded_questions.npy', pad_char_c)

In [34]:
# Character indices of the training questions: (samples, QUESTION_MAXLEN, WORD_LEN), zero padded.
pad_char_c = np.zeros((tr_df.shape[0], QUESTION_MAXLEN, WORD_LEN), dtype = np.int32)

# enumerate gives the row position whatever the dataframe index is
# (Series.iteritems() no longer exists in pandas 2.x)
for i, value in enumerate(tr_df['char_tokenized_question']):
  # one row of WORD_LEN character indices per word
  # NOTE(review): pad_sequences truncates from the front by default, so a word
  # longer than WORD_LEN keeps its last characters - confirm this is intended
  v = tf.keras.preprocessing.sequence.pad_sequences(value, padding = 'post', maxlen = WORD_LEN)
  # the remaining word slots keep their initial zeros
  pad_char_c[i, :v.shape[0]] = v

np.save('drive/MyDrive/NLP/data/tr_char_padded_questions.npy', pad_char_c)

In [35]:
del pad_char_c # to free memory

In [36]:
# Character indices of the validation contexts: (samples, CONTEXT_MAXLEN, WORD_LEN), zero padded.
# (despite its name, pad_char_q holds the contexts)
pad_char_q = np.zeros((vl_df.shape[0], CONTEXT_MAXLEN, WORD_LEN), dtype = np.int32)

# enumerate gives the row position whatever the dataframe index is
# (Series.iteritems() no longer exists in pandas 2.x)
for i, value in enumerate(vl_df['char_tokenized_context']):
  # one row of WORD_LEN character indices per word
  # NOTE(review): pad_sequences truncates from the front by default, so a word
  # longer than WORD_LEN keeps its last characters - confirm this is intended
  v = tf.keras.preprocessing.sequence.pad_sequences(value, padding = 'post', maxlen = WORD_LEN)
  # the remaining word slots keep their initial zeros
  pad_char_q[i, :v.shape[0]] = v

np.save('drive/MyDrive/NLP/data/vl_char_padded_contexts.npy', pad_char_q)

In [37]:
# Character indices of the training contexts: (samples, CONTEXT_MAXLEN, WORD_LEN), zero padded.
# (despite its name, pad_char_q holds the contexts)
# int32 like the three other character arrays: the float64 default doubles the
# memory and the size of the saved file for what are integer indices.
pad_char_q = np.zeros((tr_df.shape[0], CONTEXT_MAXLEN, WORD_LEN), dtype = np.int32)

# enumerate gives the row position whatever the dataframe index is
# (Series.iteritems() no longer exists in pandas 2.x)
for i, value in enumerate(tr_df['char_tokenized_context']):
  # one row of WORD_LEN character indices per word
  # NOTE(review): pad_sequences truncates from the front by default, so a word
  # longer than WORD_LEN keeps its last characters - confirm this is intended
  v = tf.keras.preprocessing.sequence.pad_sequences(value, padding = 'post', maxlen = WORD_LEN)
  # the remaining word slots keep their initial zeros
  pad_char_q[i, :v.shape[0]] = v

np.save('drive/MyDrive/NLP/data/tr_char_padded_contexts.npy', pad_char_q)

In [38]:
del pad_char_q # to free memory

In [39]:
# One-hot encode the answer boundaries over CONTEXT_MAXLEN classes.
# NOTE(review): to_categorical presumably requires every index to be < num_classes - confirm that 'start' and 'end' can never equal CONTEXT_MAXLEN.
num_classes = CONTEXT_MAXLEN
y_start_train = tf.keras.utils.to_categorical(tr_df['start'].values, num_classes)
y_end_train = tf.keras.utils.to_categorical(tr_df['end'].values, num_classes)

y_start_valid = tf.keras.utils.to_categorical(vl_df['start'].values, num_classes)
y_end_valid = tf.keras.utils.to_categorical(vl_df['end'].values, num_classes)

In [40]:
np.save('drive/MyDrive/NLP/data/tr_y_start.npy', y_start_train)
np.save('drive/MyDrive/NLP/data/tr_y_end.npy', y_end_train)

In [41]:
np.save('drive/MyDrive/NLP/data/vl_y_start.npy', y_start_valid)
np.save('drive/MyDrive/NLP/data/vl_y_end.npy', y_end_valid)

We have created a character tokenizer, but initially we did not work at the character level.  
Indeed, we ran out of memory (OOM) when we tried to work at this level.

**EDIT**: it seems we can work at the character level if we significantly reduce `WORD_LEN` (from 40 to 20) and keep only the most frequent characters (character indices 1–199, index 1 being the UNK token itself), replacing all the others with the UNK token.