In [2]:
import re
import random
import numpy as np
import urllib
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

# Load data

In [41]:
# Use the URL to obtain the corpus from the web 
url = 'https://cogcomp.seas.upenn.edu/Data/QA/QC/train_5500.label'

try:
    url_response = urllib.request.urlopen(url) 
except urllib.error.HTTPError as e:
    e.msg += ' ' + url
    raise

In [43]:
# Obtain raw text from the given URL
# Decode the bytes using UTF-8 format
# Use the keyword 'ignore' for decoding
raw_text = url_response.read().decode('utf8', 'ignore')

In [48]:
print(type(raw_text))
raw_text[:100]

<class 'str'>


'DESC:manner How did serfdom develop in and then leave Russia ?\nENTY:cremat What films featured the c'

# Data Preprocessing

Split the tag from the dataset

In [97]:
# Remove the whitespace in the beginning and the end
# Split the string and convert it into a list of questions
ques_list = raw_text.strip().split('\n')

# Show the last five samples
for q in ques_list[-5:]: 
    print(q) 

print(f'Total tagged questions: {len(ques_list)}')

ENTY:other What 's the shape of a camel 's spine ?
ENTY:currency What type of currency is used in China ?
NUM:temp What is the temperature today ?
NUM:temp What is the temperature for cooking ?
ENTY:currency What currency is used in Australia ?
Total tagged questions: 5452


In [92]:
# Use one sample in the question list to show the data processing
rand_idx = random.randint(0, len(ques_list))
sample = ques_list[rand_idx]
print(sample)

split_sample = sample.split(':')
print(split_sample)

mark, ques = split_sample[0], split_sample[1][:-1] # Remove the question mark
print([ques, mark])

NUM:date In what year did Thatcher gain power ?
['NUM', 'date In what year did Thatcher gain power ?']
['date In what year did Thatcher gain power ', 'NUM']


In [96]:
# Data processing on the whole dataset
tagged_ques_list = [[q.split(':')[1][:-1], q.split(':')[0]] for q in ques_list]

In [95]:
assert len(tagged_ques_list) == len(ques_list), "Different length after data processing"

In [107]:
ques, tags = list(zip(*tagged_ques_list)) # Split the questions and the tags
ques, tags = list(ques), list(tags) # Convert from tuple to list

print(ques[10:15])
print(tags[10:15])

['date When was Ozzy Osbourne born ', 'reason Why do heavier objects travel downhill faster ', 'ind Who was The Pride of the Yankees ', 'ind Who killed Gandhi ', 'event What is considered the costliest disaster the insurance industry has ever faced ']
['NUM', 'DESC', 'HUM', 'HUM', 'ENTY']


Number masking for search space reduction

In [109]:
for i, q in enumerate(ques):
    ques[i] = re.sub('\d', '#', q).split()

Build word2index and tag2index

In [110]:
# Extract all words from the all questions
flatten = lambda l: [item for sublist in l for item in sublist]
vocab = list(set(flatten(ques)))

In [113]:
print(f'Total words: {len(vocab)}')
print(vocab[:10])
print(f'Total tags: {len(set(tags))}')
print(set(tags))

Total words: 9125
['limit', 'accompanied', 'freeway', 'martyrs', 'match', 'Fiedler', 'money', 'pushy', 'grades', 'dissented']
Total tags: 6
{'HUM', 'NUM', 'ABBR', 'ENTY', 'DESC', 'LOC'}
