In [1]:
from sacremoses import MosesDetokenizer
import pandas as pd
import csv


In [2]:
# Load the tab-separated question data. QUOTE_NONE makes the parser treat quote
# characters as ordinary text rather than as field delimiters.
# `warn_bad_lines=True` is intentionally not passed: it was the default value
# and had no effect unless `error_bad_lines=False` was also given, and pandas
# 2.0 removed the keyword, so passing it raises TypeError on current pandas.
data = pd.read_csv(
    'data/questions.tsv',
    sep='\t',
    engine='python',
    quoting=csv.QUOTE_NONE,
)

In [3]:
# Keep the sentence stored under row label 0 as a small sample to experiment with.
example = data.loc[0, 'Sentence']

In [4]:
# Single English detokenizer instance, created once at notebook level.
md = MosesDetokenizer(lang='en')

In [5]:
def dumb_quotes(x):
    """Return ``x`` with typographic quotes replaced by plain ASCII quotes.

    U+201C / U+201D become ``"`` and U+2018 / U+2019 become ``'``; every
    other character is left as it is.
    """
    straight_quotes = str.maketrans({
        '\u201c': '"',  # left double quotation mark
        '\u201d': '"',  # right double quotation mark
        '\u2018': "'",  # left single quotation mark
        '\u2019': "'",  # right single quotation mark
    })
    return x.translate(straight_quotes)

In [6]:
def moses_detokenize(sentence):
    """Detokenize a whitespace-tokenized sentence with the shared Moses detokenizer.

    Typographic quotes are first converted to ASCII quotes, the text is then
    split on whitespace, and the resulting tokens are handed to
    ``md.detokenize``; its result is returned unchanged.
    """
    # NOTE(review): the recorded sample output still contains "nation 's" and
    # "gun - rights", so clitics and hyphens that arrive as separate tokens do
    # not appear to be re-attached -- confirm whether that is acceptable.
    normalized = dumb_quotes(sentence)
    tokens = normalized.split()
    return md.detokenize(tokens)

In [7]:
# Inspect the sample with non-ASCII characters rendered as backslash escapes.
example.encode('unicode_escape')

b'The nation \\u2019 s largest gun - rights group is taking some Texans to task over their headline - generating demonstrations advocating the legal , open carrying of weapons .'

In [8]:
# Try the detokenizer on the sample sentence before applying it to every row.
moses_detokenize(example)

"The nation 's largest gun - rights group is taking some Texans to task over their headline - generating demonstrations advocating the legal, open carrying of weapons."

In [9]:
# Add a 'Context' column holding the detokenized form of each 'Sentence' value.
data['Context'] = data['Sentence'].map(moses_detokenize)

In [10]:
# Keep only the three columns that are exported. Explicit label selection is
# used instead of DataFrame.filter(), which silently ignores labels that do not
# exist; with .loc a missing or misspelled column raises KeyError immediately
# instead of producing an incomplete dataset.
filtered_data = data.loc[:, ['Article_Id', 'Context', 'Question']]

In [11]:
# Display the selected columns for a visual check.
filtered_data

Unnamed: 0,Article_Id,Context,Question
0,1,The nation 's largest gun - rights group is ta...,"What does \""to task\"" mean?"
1,1,The nation 's largest gun - rights group is ta...,What is this group called?
2,1,The nation 's largest gun - rights group is ta...,Which group?
3,1,The nation 's largest gun - rights group is ta...,Why don't you just come out and say the NRA?
4,1,Officials with the National Rifle Association ...,How many people is a small number?
...,...,...,...
19811,1500,John Bennardo is crisscrossing the country to ...,Why are $2 bills seen as so much more desirable?
19812,1500,John Bennardo is crisscrossing the country to ...,What magic are they referring to?
19813,1500,"""I think everyone 's curious about it,"" he said.",Why is everyone so curious about it?
19814,1500,"""I think everyone 's curious about it,"" he said.",Why do they feel everyone would be curious abo...


In [12]:
# Validation split: article ids up to and including 100, plus ids in (1050, 1100].
article_ids = filtered_data['Article_Id']
is_validation = article_ids.le(100) | (article_ids.gt(1050) & article_ids.le(1100))
validation = filtered_data.loc[is_validation]
len(validation)

1991

In [13]:
# Test split: article ids in (100, 150], (500, 550] and (1100, 1150].
# Each band is parenthesized so the grouping does not rely on `&` binding
# tighter than `|`.
is_test = (
    (article_ids.gt(100) & article_ids.le(150))
    | (article_ids.gt(500) & article_ids.le(550))
    | (article_ids.gt(1100) & article_ids.le(1150))
)
test = filtered_data.loc[is_test]
len(test)

1894

In [14]:
# Training split: article ids in (150, 500], (550, 1050] and everything above 1150.
# Each band is parenthesized so the grouping does not rely on `&` binding
# tighter than `|`.
is_train = (
    (article_ids.gt(150) & article_ids.le(500))
    | (article_ids.gt(550) & article_ids.le(1050))
    | article_ids.gt(1150)
)
train = filtered_data.loc[is_train]
len(train)

15931

In [15]:
# No row may belong to more than one split.
assert not (is_train & is_validation).any(), 'train and validation overlap'
assert not (is_validation & is_test).any(), 'validation and test overlap'
assert not (is_train & is_test).any(), 'train and test overlap'
# Together the masks must account for every row; otherwise some rows would be
# silently left out of all three exported files.
assert (is_train | is_validation | is_test).all(), 'some rows belong to no split'

# Write each split with one JSON record per line.
train.to_json('./data/train.json', orient='records', lines=True)
test.to_json('./data/test.json', orient='records', lines=True)
validation.to_json('./data/validation.json', orient='records', lines=True)