Data obtained from:

https://cs.nyu.edu/~kcho/DMQA/

# Parse data and drop to disk

In [2]:
import glob

In [3]:
FILES = glob.glob('../data/cnn/questions/training/*')

In [4]:
FILES

['../data/cnn/questions/training/000021e9748578a514493c773809465c661a28d1.question',
 '../data/cnn/questions/training/00005db6c024901c6eaeec5e3334c66297cb0cee.question',
 '../data/cnn/questions/training/0000638b5bcbd1500b499694f758c5cbe4bbb805.question',
 '../data/cnn/questions/training/00007493c579410e5d0f5b6b0106a0bd498ea53e.question',
 '../data/cnn/questions/training/000076cf6fa641238d1e6fab691f2fd1f16c8cab.question',
 '../data/cnn/questions/training/00008b480cc9e487cecdd57bd2c83f3651d0a070.question',
 '../data/cnn/questions/training/00008dbe0204d07c38a8e4fc8aae774a088be1e2.question',
 '../data/cnn/questions/training/0000f9bca2d96f6aca145d314589fb5cff5039cf.question',
 '../data/cnn/questions/training/00013e459883688edb68c0a20b087b9ebf78811a.question',
 '../data/cnn/questions/training/000158d86f39ed3b183ef846ebaf05c6b3df1492.question',
 '../data/cnn/questions/training/000169e9e58f15e4d2d86aaf5c8bb1d7acc06cf0.question',
 '../data/cnn/questions/training/0001a2bd1cf51f2ba8c0609c9ea3838c

In [5]:
len(FILES)

380298

In [6]:
with open(FILES[0]) as f:
    content = f.read()

In [7]:
content.split('\n\n')

['http://web.archive.org/web/20150103125807id_/http://edition.cnn.com/2007/TRAVEL/getaways/12/24/offseason.europe/index.html',
 '( @entity0 ) -- each summer , @entity1 greets a stampede of sightseers and shoppers with eager cash registers . before jumping into the peak - season pig pile , consider the advantages of an off - season trip . major cities like @entity10 crackle with energy year - round . given the current weakness of our dollar overseas , the potential price - savings of an off - season trip are enough to brighten a gray winter day . airfares are often hundreds of dollars less . with fewer crowds in @entity1 , you \'ll sleep cheaper . many fine hotels drop their prices , and budget hotels have plenty of vacancies . to save some money on hotels in the off - season , arrive late without a reservation , notice how many empty rooms they have ( look for keys on the rack ) , and give the receptionist an excuse to win your business with a deep discount . explain that you \'re a se

In [8]:
class FORMAT:
    URL = 0
    CONTEXT = 1
    QUESTION = 2
    ANSWER = 3
    ENTITY_MAPPING = 4
    PLACEHOLDER_TOKEN = '@placeholder'

In [9]:
def parse(file):
    with open(file) as f:
        content = f.read()
        parts = content.split('\n\n')
        context = parts[FORMAT.CONTEXT]
        question = parts[FORMAT.QUESTION]
        answer = parts[FORMAT.ANSWER]
        summary = question.replace(FORMAT.PLACEHOLDER_TOKEN, answer)
        return '\t'.join([context, summary])

In [10]:
parsed_files = (parse(f) for f in FILES)

In [11]:
next(parsed_files)

'( @entity0 ) -- each summer , @entity1 greets a stampede of sightseers and shoppers with eager cash registers . before jumping into the peak - season pig pile , consider the advantages of an off - season trip . major cities like @entity10 crackle with energy year - round . given the current weakness of our dollar overseas , the potential price - savings of an off - season trip are enough to brighten a gray winter day . airfares are often hundreds of dollars less . with fewer crowds in @entity1 , you \'ll sleep cheaper . many fine hotels drop their prices , and budget hotels have plenty of vacancies . to save some money on hotels in the off - season , arrive late without a reservation , notice how many empty rooms they have ( look for keys on the rack ) , and give the receptionist an excuse to win your business with a deep discount . explain that you \'re a senior ( hosteller , student , artist , whatever ) with a particular price limit , and bargain from there . note that while touris

In [15]:
SUMMARY_DIRECTORY = '../data/cnn/summary'

In [16]:
import os
if not os.path.exists(SUMMARY_DIRECTORY):
    os.makedirs(SUMMARY_DIRECTORY)

In [17]:
import tqdm
for f in tqdm.tqdm(FILES):
    text = parse(f)
    dst_basename = os.path.basename(f)
    dst = os.path.join(SUMMARY_DIRECTORY, dst_basename)
    with open(dst, 'w') as f_out:
        f_out.write(text)

100%|██████████| 380298/380298 [03:32<00:00, 1793.46it/s]


In [18]:
f_out

<closed file '../data/cnn/summary/ffffd9d55dcc480eaf11fd10ab4eeafdf7641815.question', mode 'w' at 0x10adadc00>

In [19]:
with open(dst) as f:
    print(f.read())

( @entity0 ) -- @entity2 federal police have arrested a man who authorities say could have been involved in the march shooting deaths of three people with ties to the @entity9 @entity8 in @entity10 , @entity2 , the country 's public safety ministry said . @entity14 , known as " @entity15 , " is also suspected in the slaying of a federal police officer , the ministry said in a statement thursday . authorities are investigating the 32 - year - old 's " likely participation in the killings of three people linked to the @entity9 consulate " in @entity10 , the ministry said . the statement did not specify how @entity14 allegedly participated in the shootings . thursday 's statement from @entity2 security officials described @entity14 as a leader of the @entity31 gang , a street gang affiliated with the @entity10 cartel . @entity2 authorities have arrested several others they accuse of being connected to alleged killing of three people with ties to the @entity10 consulate . the shootings occ

# Collect stats on data set to determine train parameters

In [20]:
import os
import tqdm
context_lengths = []
target_lengths = []
for f in tqdm.tqdm(FILES):
    dst_basename = os.path.basename(f)
    dst = os.path.join(SUMMARY_DIRECTORY, dst_basename)
    with open(dst, 'r') as f:
        context, target = f.read().split('\t')
    context_lengths.append(len(context.split(' ')))   
    target_lengths.append(len(target.split(' ')))

100%|██████████| 380298/380298 [01:46<00:00, 3563.10it/s]


In [21]:
import pandas as pd

In [22]:
df = pd.DataFrame(
    {'x': context_lengths,
     'y': target_lengths})

In [23]:
df.describe()

Unnamed: 0,x,y
count,380298.0,380298.0
mean,761.813517,12.470486
std,389.41599,3.214081
min,8.0,1.0
25%,452.0,10.0
50%,700.0,12.0
75%,1006.0,14.0
max,2000.0,47.0
