This notebook performs the following:
1. Convert data from text to ID.
2. Convert CNN and Daily Mail data into CBT format.
3. Some analysis done on data.

# Imports

In [None]:
import os
import re
import nltk
from collections import Counter
from functools import reduce
from tensorflow.python.platform import gfile
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import random
import glob

# Global Parameters

In [None]:
"""
Tags to be added to vocabulary
Padding, Start of Sentence, End of Sentence, Unknown, and End of Question
"""
# Special vocabulary tokens: padding, start/end of sentence, unknown word, end of question
PAD, SoS, EoS, UNK, EoQ = "PAD", "SoS", "EoS", "UNK", "EoQ"

# Order matters: 'save_vocab' writes these first, so a tag's index here is its ID
tags = [PAD, SoS, EoS, UNK, EoQ]

# Tag IDs (each one equals the tag's index in 'tags')
PAD_ID, SoS_ID, EoS_ID, UNK_ID, EoQ_ID = range(5)

# Number of words to keep
vocab_size = 100000


"""
The following is for converting CNN and Daily Mail data into CBT style. The following tags are POS tags that
    are used to collect true answers from documents. Corresponding code could be found at the end of this 
    notebook under the section Converting CNN and Daily Mail Data.
NN = noun
NNS = noun plural
NNP = proper noun, singular
NNPS = proper noun, plural
VB = verb
VBG = verb gerund
VBD = verb past tense
VBN = verb past participle

"""

# POS tags (nouns and verbs) a word must carry to be picked as the true answer in 'writeData'
included_tags = ['NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBG', 'VBD', 'VBN']

# Methods

Methods in this section are based on methods in https://github.com/cairoHy/attention-sum-reader/blob/master/data_utils.py. The methods are used to convert words to IDs.

## Tokenizer

In [None]:
def default_tokenizer(sentence):
    """
    Tokenize a raw data line into lower-case tokens.

    Steps:
      1. Delete every run of digits. This is how the line number at the start of a
         line is dropped; NOTE that digits anywhere else in the line are deleted too.
      2. Turn each '|' (separator of candidate answers in a query line) into a space.
      3. Lower-case the result and split it with 'nltk.word_tokenize'.

    Parameters:
      'sentence': input string

    Returns: List of tokens
    """
    digits_removed = re.sub(r"\d+", "", sentence)
    pipes_removed = digits_removed.replace("|", " ")
    return nltk.word_tokenize(pipes_removed.lower())

## Vocabulary

In [None]:
def gen_vocab(data_file, tokenizer=default_tokenizer, old_counter=None):
  """
  This method reads input data and returns a counter object with entries as {word : its frequency}

  Parameters:
    'data_file': Path to the data file to read
    'tokenizer': Tokenizer to tokenize sentences
    'old_counter': An optional counter object with entries as {word : its frequency}. When given,
      it is updated in place and returned, so counts accumulate over several files.

  Returns: Counter object with entries as {word : its frequency}
  """

  print("Creating word_dict from data %s" % data_file)

  # Test against None (not truthiness) so that an empty counter passed by the caller is still the one updated
  word_counter = Counter() if old_counter is None else old_counter

  with gfile.GFile(data_file) as f:
    for line_number, line in enumerate(f, start=1):

      # Drop the trailing new line char before tokenizing, then count the tokens
      word_counter.update(tokenizer(line.rstrip('\n')))

      if line_number % 100000 == 0:
        print("Done processing line %d." % line_number)

  # Some statistics (they cover everything counted so far, including 'old_counter' entries)
  total_words = sum(word_counter.values())
  distinct_words = len(word_counter)

  print("Some statistics:")
  print("Total words: " + str(total_words))
  print("Total distinct words: " + str(distinct_words))

  return word_counter


def save_vocab(word_counter, vocab_file, vocab_size=None):
  """
  This method writes the vocabulary file: one entry per line, the special tags first,
    followed by the most frequent words of 'word_counter' in decreasing frequency.

  Parameters:
    'word_counter': Counter object with entries as {word : its frequency}
    'vocab_file': A file to write the vocab to
    'vocab_size': The maximum number of words to keep (None keeps all words). The tags are
      written in addition to these words.
  """

  with gfile.GFile(vocab_file, "w") as f:
    # Tags go first so that their line numbers match the tag IDs
    f.write("".join(tag + "\n" for tag in tags))

    # 'most_common' yields (word, frequency) pairs; only the word is stored
    for word, _ in word_counter.most_common(vocab_size):
      f.write(word + "\n")


def load_vocab(vocab_file):
  """
  This method loads 'vocab_file'. It returns a 'word_dict' with entries as {word : its ID},
    where the ID of a word is the zero-based number of the line it is on.

  Parameters:
    'vocab_file': Path to vocab file

  Raises: ValueError if 'vocab_file' does not exist
  """

  if not gfile.Exists(vocab_file):
    # Format the path into the message (passing it as a second argument leaves '%s' unformatted)
    raise ValueError("Vocabulary file %s not found." % vocab_file)

  word_dict = {}

  with gfile.GFile(vocab_file, "r") as f:
    for word_id, line in enumerate(f):

      # Line has a single word with trailing new line char which needs to be removed.
      # If a word occurs on several lines, the last line's ID wins.
      word_dict[line.strip()] = word_id

  return word_dict

## Main Methods

In [None]:
def sentence_to_token_ids(sentence, word_dict, tokenizer=default_tokenizer):
  """
  This method tokenizes a sentence and translates each token into its ID
  Example:
    tokens of the sentence: ["i", "have", "a", "dog"]
    word_dict: {"i": 1, "have": 2, "a": 4, "dog": 7}
    return: [1, 2, 4, 7]

  Parameters:
    'sentence': input string
    'word_dict': Word -> ID mapping dictionary
    'tokenizer': Tokenizer to tokenize sentences

    Returns: List of IDs; a token missing from 'word_dict' is mapped to 'UNK_ID'
  """

  tokens = tokenizer(sentence)
  return [word_dict.get(token, UNK_ID) for token in tokens]


def data_to_token_ids(data_file, target_file, vocab_file):
  """
  This method generates a file out of the input data such that each word is replaced 
    by its ID. Nothing is done if 'target_file' already exists.

  Each CBT example has 22 lines as:
    First 20 lines: Context with number of lines (e.g., 1 to 20)
    Line 21: Question with line number (e.g., 21) \t true answer \t \t candidate answer 1 | candidate answer 2 | ... | candidate answer 10
    Line 22: Blank

  Each written example has:
    One line of space-separated IDs per context line
    Query line: question IDs \t true answer ID \t candidate answer IDs separated by '|'
  Lines are buffered and only written together with their query line, so lines after the last
    query are not written. The blank line 22 is carried over as an empty line in front of the next example.

  Parameters:
    'data_file': Path to CBT data file
    'target_file': A file to write IDs to
    'vocab_file': vocab file
  """

  # Never overwrite an existing ID file
  if gfile.Exists(target_file):
    return

  print("Tokenizing data in {}".format(data_file))

  word_dict = load_vocab(vocab_file)

  def join_ids(ids, separator):
    # Render a list of IDs as text
    return separator.join(str(tok) for tok in ids)

  # Converted lines collected since the last query line was written
  pending_lines = []

  with gfile.GFile(data_file, mode="r") as source:
    with gfile.GFile(target_file, mode="w") as tokens_file:
      for counter, line in enumerate(source, start=1):
        if counter % 100000 == 0:
          print("Tokenizing line %d" % counter)

        if counter % 22 != 21:
          # Context line (or the blank separator line, which yields an empty line)
          pending_lines.append(join_ids(sentence_to_token_ids(line, word_dict), " ") + "\n")
          continue

        # Query line fields: query, true answer, empty string, candidate answers
        q, true_answer, _, CA = line.split("\t")

        question_ids = sentence_to_token_ids(q, word_dict)
        # Answers are looked up as whole lower-cased strings, they are not tokenized
        candidate_ids = [word_dict.get(a.lower(), UNK_ID) for a in CA.rstrip("\n").split("|")]
        answer_id = word_dict.get(true_answer.lower(), UNK_ID)

        query = "\t".join([join_ids(question_ids, " "),
                           str(answer_id),
                           join_ids(candidate_ids, "|")]) + "\n"

        tokens_file.write("".join(pending_lines) + query)
        pending_lines = []

def prepare_data(data_dir, train_file, valid_file, test_file, vocab_size, output_dir):
  """
  This method takes paths to all data files and generates corresponding ID files

  Parameters:
    'data_dir': Folder with the raw data files
    'train_file', 'valid_file', 'test_file': Names of the raw data files without the '.txt' extension
    'vocab_size': Number of words to keep; it is also part of the generated file names
    'output_dir': Sub-directory of 'data_dir' to write the vocab and ID files to

  Returns: (vocab file, train ID file, validation ID file, test ID file) paths
  """

  target_dir = os.path.join(data_dir, output_dir)
  if not gfile.Exists(target_dir):
    os.mkdir(target_dir)

  split_names = (train_file, valid_file, test_file)
  raw_files = [os.path.join(data_dir, name + '.txt') for name in split_names]
  id_files = [os.path.join(target_dir, name + ".%d.id.txt" % vocab_size) for name in split_names]
  vocab_file = os.path.join(target_dir, "vocab.%d.txt" % vocab_size)

  # Build the vocabulary over train, validation, and test data unless it is already on disk
  if not gfile.Exists(vocab_file):
    word_counter = None
    for raw_file in raw_files:
      word_counter = gen_vocab(raw_file, old_counter=word_counter)
    save_vocab(word_counter, vocab_file, vocab_size)

  # Create train, valid, and test files represented by IDs
  for raw_file, id_file in zip(raw_files, id_files):
    data_to_token_ids(raw_file, id_file, vocab_file)

  id_train_file, id_valid_file, id_test_file = id_files
  return vocab_file, id_train_file, id_valid_file, id_test_file

# Prepare Data

## CBT

In [None]:
"""
'cloze_types' is a list to hold types of missing words (e.g., Named Entity (NE), Common Noun (CN), and so on..).
  The list is needed to build paths to corresponding files. Instead of building 4 sets of paths, it is better
  to build one set and only change the part that identifies the missing word's type. So, the same set of paths 
  could be used to access all data files.

'output_dir' is the directory to store ID files. The same directory name could be used for all data types (e.g., NE, CN, and so on..)
  because 'output_dir' is a sub-directory in each data type's folder.
"""

# Missing-word types; each one becomes part of the data folder and file names built later on
cloze_types = ["CN", "NE", "V", "P"]
# Name of the sub-directory that receives the generated vocab and ID files
output_dir = "processedData"

In [None]:
for cloze_type in cloze_types:
    # Folder and file names (without '.txt') of the CBT data of this cloze type
    data_dir = "./data/{}_data/".format(cloze_type)
    train_file = "cbtest_{}_train".format(cloze_type)
    valid_file = "cbtest_{}_valid_2000ex".format(cloze_type)
    test_file = "cbtest_{}_test_2500ex".format(cloze_type)

    # Build the vocab and the ID files; files that already exist are kept
    vocab_file, id_train_file, id_valid_file, id_test_file = prepare_data(
        data_dir, train_file, valid_file, test_file, vocab_size, output_dir)

## CNN and Daily Mail

In [None]:
"""
'output_dir' is the directory to store ID files. The same directory name could be used for all datasets (e.g., CNN and DM)
  because 'output_dir' is a sub-directory in each dataset's folder.
"""

# Dataset names; each one becomes part of the data folder and file names built later on
dataset_name = ["CNN", "DM"]
# Name of the sub-directory that receives the generated vocab and ID files
output_dir = "processedData"

In [None]:
for name in dataset_name:
    # Folder and file names (without '.txt') of the converted data of this dataset
    data_dir = "./data/{}_data/".format(name)
    train_file = "{}_train".format(name)
    valid_file = "{}_valid_2000ex".format(name)
    test_file = "{}_test_2500ex".format(name)

    # Build the vocab and the ID files; files that already exist are kept
    vocab_file, id_train_file, id_valid_file, id_test_file = prepare_data(
        data_dir, train_file, valid_file, test_file, vocab_size, output_dir)

Creating word_dict from data ./data/CNN_data/CNN_train.txt
Done processing line 100000.
Done processing line 200000.
Done processing line 300000.
Done processing line 400000.
Done processing line 500000.
Done processing line 600000.
Done processing line 700000.
Done processing line 800000.
Done processing line 900000.
Done processing line 1000000.
Done processing line 1100000.
Done processing line 1200000.
Done processing line 1300000.
Some statistics:
Total words: 44884418
Total distinct words: 252787
Creating word_dict from data ./data/CNN_data/CNN_valid_2000ex.txt
Some statistics:
Total words: 45889828
Total distinct words: 255469
Creating word_dict from data ./data/CNN_data/CNN_test_2500ex.txt
Some statistics:
Total words: 47179397
Total distinct words: 259034
Tokenizing data in ./data/CNN_data/CNN_train.txt
Tokenizing line 100000
Tokenizing line 200000
Tokenizing line 300000
Tokenizing line 400000
Tokenizing line 500000
Tokenizing line 600000
Tokenizing line 700000
Tokenizing line

# Converting CNN and Daily Mail Data

In [None]:
# A method to get all file names of CNN and Daily Mail data
def getFileNames(extension):
    """
    Return a list with the paths of all files matching the glob pattern 'extension'
      (e.g., "./stories/*.story"). The list is empty if nothing matches.
    """
    return glob.glob(extension)

In [None]:
# Helper of 'writeData': turn one raw line into the query line of a CBT example
def _build_query_line(line, line_number, document):
    """
    Build the query line of a CBT example out of 'line'.

    Parameters:
      'line': raw line to build the query from.
      'line_number': number the query line is prefixed with.
      'document': the numbered context lines; candidate answers are sampled from its words.

    Returns: the query line (without new line char) as
        query \t true answer \t \t candidate answer 1 | ... | candidate answer 10,
      or None if 'line' has no word with an accepted POS tag or 'document' has less than
      10 distinct words left once the true answer is removed.
    """
    # Remove non-alphanumeric values from 'line'
    line = re.sub('[^A-Za-z0-9]+', ' ', line)

    # Remove single characters from 'line'
    line = re.sub(r"\b[a-zA-Z]\b", "", line)

    # Tokenize line, POS tag it, and shuffle the (word, tag) pairs so that the answer is picked at random
    tagged_words = nltk.pos_tag(word_tokenize(line))
    random.shuffle(tagged_words)

    # True answer = first word (after shuffling) with an accepted tag
    selected_word = next((word for word, tag in tagged_words if tag in included_tags), None)
    if not selected_word:
        return None

    # Match 'selected_word' as a whole word only, so that other words containing it are left intact
    whole_word = re.compile(r"\b" + re.escape(selected_word) + r"\b")

    # Candidate pool: distinct words of the context without the true answer
    population = list(dict.fromkeys(whole_word.sub('', document).split()))
    if len(population) <= 9:
        return None

    # Randomly sample 9 candidate answers, add the true answer, and shuffle
    candidate_answers = random.sample(population, 9)
    candidate_answers.append(selected_word)
    random.shuffle(candidate_answers)

    # Blank out the first whole-word occurrence of the answer in the query
    stripped_line = line.rstrip()
    query, replaced = whole_word.subn('XXXXX', stripped_line, count=1)
    if not replaced:
        # The tokenizer may have split the answer off a longer word; blank that part out instead
        query = stripped_line.replace(selected_word, 'XXXXX', 1)

    return str(line_number) + ' ' + query + '\t' + selected_word + '\t' + '' + '\t' + '|'.join(candidate_answers)


# A method to convert CNN and Daily Mail data into CBT style by processing a file at a time
def writeData(input_files, output_file):
    """
    This method converts CNN and Daily Mail data into CBT style by processing a file at a time. It returns the 
        number of converted files.

    For each input file, the first 20 usable lines (not empty, no '@highlight') become the numbered
        context. The first following usable line a query can be built from becomes line 21, and the
        rest of the file is not read. A file that yields no query is skipped, nothing is written for it.

    Parameters:
      'input_files': A list of paths to CNN or Daily Mail data files.
      'output_file': A file to write converted data to. It is opened in append mode, so existing
        content is kept and new examples are added after it.

    Returns: number of files an example was written for.
    """
    # Number of files that produced an example (files with too little usable text produce none)
    converted_files = 0

    # Open the output once for the whole run instead of once per input file
    with open(output_file, mode="a+") as w:
        for processed_files, file in enumerate(input_files, start=1):
            if processed_files % 1000 == 0:
                print("Processed files so far is ", processed_files)

            # Line number of the next line to collect, and the context collected so far
            line_number = 1
            document = ''

            with open(file, mode="r") as f:
                for line in f:

                    # Empty lines and '@highlight' markers are neither context nor query
                    if line == '\n' or "@highlight" in line:
                        continue

                    # Collect context of 20 lines
                    if line_number < 21:
                        # Add '\n' to 'line' if it does not have it (last line of a file)
                        line = line if '\n' in line else line + '\n'
                        document = document + str(line_number) + ' ' + line
                        line_number += 1
                        continue

                    # line #21 is query line; if this line cannot be used, the next one is tried
                    query_line = _build_query_line(line, line_number, document)
                    if query_line is None:
                        continue

                    # write 'document' and 'query', and add empty line after them
                    w.write(document + query_line + '\n\n')
                    converted_files += 1

                    # Stop reading this file
                    break

    print("Number of converted files is ", converted_files)
    return converted_files

## CNN

In [None]:
# Paths of all CNN '.story' files
cnn_files = getFileNames("./cnn_stories/cnn/stories/*.story")

In [None]:
# Sanity check: how many story files were found
print("Number of CNN data files is", len(cnn_files))

Number of CNN data files is 92579


In [None]:
# Split into train, validation, and test set

# Sizes of sets; train gets whatever is left after validation and test
test_size = 2500
val_size = 2000
train_size = len(cnn_files) - (test_size + val_size)

# Shuffle in place so that the three consecutive slices below are random and disjoint
random.shuffle(cnn_files)

train_files = cnn_files[0 : train_size]
val_files = cnn_files[train_size : (train_size + val_size)]
# The test slice spans exactly 'test_size' files after the validation slice
test_files = cnn_files[(train_size + val_size) : (train_size + val_size + test_size)]

In [None]:
# Sanity check: sizes of the three splits
print("Number of CNN train data files is", len(train_files))
print("Number of CNN validation data files is", len(val_files))
print("Number of CNN test data files is", len(test_files))

Number of CNN train data files is 88079
Number of CNN validation data files is 2000
Number of CNN test data files is 2500


In [None]:
CNN_data_directory = './cnn_stories/'
# Convert each split into CBT style. 'writeData' appends to its output file, so remove
#   old output files before re-running this cell to avoid duplicated examples.
total_train_data = writeData(train_files, CNN_data_directory + 'CNN_train6.txt')
total_val_data = writeData(val_files, CNN_data_directory + 'CNN_valid_2000ex6.txt')
total_test_data = writeData(test_files, CNN_data_directory + 'CNN_test_2500ex6.txt')

Processed files so far is  1000
Processed files so far is  2000
Processed files so far is  3000
Processed files so far is  4000
Processed files so far is  5000
Processed files so far is  6000
Processed files so far is  7000
Processed files so far is  8000
Processed files so far is  9000
Processed files so far is  10000
Processed files so far is  11000
Processed files so far is  12000
Processed files so far is  13000
Processed files so far is  14000
Processed files so far is  15000
Processed files so far is  16000
Processed files so far is  17000
Processed files so far is  18000
Processed files so far is  19000
Processed files so far is  20000
Processed files so far is  21000
Processed files so far is  22000
Processed files so far is  23000
Processed files so far is  24000
Processed files so far is  25000
Processed files so far is  26000
Processed files so far is  27000
Processed files so far is  28000
Processed files so far is  29000
Processed files so far is  30000
Processed files so 

## Daily Mail

In [None]:
# Paths of all Daily Mail '.story' files
DM_files = getFileNames("./dailymail_stories/dailymail/stories/*.story")

In [None]:
# Sanity check: how many story files were found
print("Number of Daily Mail data files is", len(DM_files))

Number of Daily Mail data files is 219506


In [None]:
# Split into train, validation, and test set

# Sizes of sets; train gets whatever is left after validation and test
test_size = 2500
val_size = 2000
train_size = len(DM_files) - (test_size + val_size)

# Shuffle in place so that the three consecutive slices below are random and disjoint
random.shuffle(DM_files)

train_files = DM_files[0 : train_size]
val_files = DM_files[train_size : (train_size + val_size)]
# The test slice spans exactly 'test_size' files after the validation slice
test_files = DM_files[(train_size + val_size) : (train_size + val_size + test_size)]

In [None]:
# Sanity check: sizes of the three splits
print("Number of Daily Mail train data files is", len(train_files))
print("Number of Daily Mail validation data files is", len(val_files))
print("Number of Daily Mail test data files is", len(test_files))

Number of Daily Mail train data files is 215006
Number of Daily Mail validation data files is 2000
Number of Daily Mail test data files is 2500


In [None]:
# To limit train data size: only the first 'maxSize' train files are converted
maxSize = 100000

DM_data_directory = './dailymail_stories/'
# Convert each split into CBT style. 'writeData' appends to its output file, so remove
#   old output files before re-running this cell to avoid duplicated examples.
total_train_data = writeData(train_files[0 : maxSize], DM_data_directory + 'DM_train.txt')
total_val_data = writeData(val_files, DM_data_directory + 'DM_valid_2000ex.txt')
total_test_data = writeData(test_files, DM_data_directory + 'DM_test_2500ex.txt')

Processed files so far is  1000
Processed files so far is  2000
Processed files so far is  3000
Processed files so far is  4000
Processed files so far is  5000
Processed files so far is  6000
Processed files so far is  7000
Processed files so far is  8000
Processed files so far is  9000
Processed files so far is  10000
Processed files so far is  11000
Processed files so far is  12000
Processed files so far is  13000
Processed files so far is  14000
Processed files so far is  15000
Processed files so far is  16000
Processed files so far is  17000
Processed files so far is  18000
Processed files so far is  19000
Processed files so far is  20000
Processed files so far is  21000
Processed files so far is  22000
Processed files so far is  23000
Processed files so far is  24000
Processed files so far is  25000
Processed files so far is  26000
Processed files so far is  27000
Processed files so far is  28000
Processed files so far is  29000
Processed files so far is  30000
Processed files so 

# Data Analysis

This section provides code to check the number of unique answers for each dataset

In [None]:
def collect_true_answers(data_file):
    """
    This method reads a data file in CBT format (examples of 22 lines) and counts how often
      each true answer occurs.

    Parameters:
    'data_file': Path to data file

    Returns: dictionary with (true answer, its frequency) as key-value pairs
    """

    answer_counts = {}

    with gfile.GFile(data_file, mode="r") as lines:
        for position, line in enumerate(lines, start=1):

            # Only line 21 of every 22-line example is a query line
            if position % 22 != 21:
                continue

            # Tab-separated fields: query, true answer, empty string, candidate answers
            answer = line.split("\t")[1]
            answer_counts[answer] = answer_counts.get(answer, 0) + 1

    return answer_counts

In [None]:
data_sets = ["CNN", "DM"]

# (name used in the printed report, file-name suffix) of each split
splits = [("train", "_train.txt"),
          ("validation", "_valid_2000ex.txt"),
          ("test", "_test_2500ex.txt")]

for data_set in data_sets:
    for split_name, file_suffix in splits:

        # Count the true answers of this split
        data_directory = "./data/" + data_set + "_data/" + data_set + file_suffix
        dictionary = collect_true_answers(data_directory)

        print("Unique answers for " + data_set + " " + split_name + " data is:", len(dictionary))
        print("total answers for " + data_set + " " + split_name + " data is:", sum(dictionary.values()))