In [1]:
import os
import random
import argparse


# Define Hyperparams

In [2]:
# Settings for this notebook. Explicit `type=` converters keep the numeric
# options numeric even when they are supplied as command-line strings rather
# than taken from the defaults.
parser = argparse.ArgumentParser()
parser.add_argument('--dataset', default='msra',
                    help='name of the sub-directory of data/ holding the dataset')
parser.add_argument('--val_ratio', type=float, default=0.05,
                    help='fraction of the training sentences moved to the validation split')
parser.add_argument('--seed', type=int, default=1234,
                    help='seed for the random train/val shuffle')
# An empty argument list is parsed, so sys.argv is not read and the defaults
# above are the values in effect.
args = parser.parse_args([])

# Locations of the raw BIO-format files for each split
data_dir = 'data/' + args.dataset
path_train = data_dir + '/train_bio'
path_val = data_dir + '/val_bio'
path_test = data_dir + '/test_bio'


# Load Original Datasets

In [3]:
def load_dataset(path_dataset, encoding='utf-8'):
    """Load a BIO-tagged dataset from a text file into memory.

    Each non-empty line holds whitespace-separated columns; the first column
    is taken as the word and the last column as its tag. A line consisting of
    only a newline ends the current sentence. Lines with fewer than two
    columns are skipped.

    Args:
        path_dataset: (string) path of the file to read
        encoding: (string) text encoding used to decode the file

    Returns:
        list of (words, tags) tuples, one per sentence, where words and tags
        are equal-length lists of strings.
    """
    dataset = []
    words, tags = [], []
    with open(path_dataset, encoding=encoding) as f:
        for line in f:
            if line == '\n':
                # Sentence boundary: flush what has been collected so far
                if words:
                    dataset.append((words, tags))
                    words, tags = [], []
                continue
            fields = line.split()
            # Fewer than two columns means there is no (word, tag) pair here
            if len(fields) < 2:
                continue
            words.append(fields[0])
            tags.append(fields[-1])
    # The file may not end with a blank line; keep the final sentence as well
    if words:
        dataset.append((words, tags))
    return dataset

def save_dataset(dataset, save_dir, encoding='utf-8'):
    """Write sentences.txt and tags.txt files in save_dir from dataset

    Each sentence becomes one line in each file: its words joined by single
    spaces in sentences.txt and its tags joined by single spaces in tags.txt.

    Args:
        dataset: ([(["a", "cat"], ["O", "O"]), ...])
        save_dir: (string) output directory, created if it does not exist
        encoding: (string) text encoding used for both output files
    """
    print('Saving in {}...'.format(save_dir))
    # exist_ok=True creates the directory when missing without the
    # check-then-create race of a separate os.path.exists() test
    os.makedirs(save_dir, exist_ok=True)

    # Export the dataset
    sentences_path = os.path.join(save_dir, 'sentences.txt')
    tags_path = os.path.join(save_dir, 'tags.txt')
    with open(sentences_path, 'w', encoding=encoding) as file_sentences, \
            open(tags_path, 'w', encoding=encoding) as file_tags:
        for words, tags in dataset:
            file_sentences.write('{}\n'.format(' '.join(words)))
            file_tags.write('{}\n'.format(' '.join(tags)))
    print('- done.')

In [8]:
# load train/val/test data, data_train/val/tesi with list type is
# the result. 0-dimension is text and 1-dimension is tags.

In [4]:
# Read the training and test splits from their BIO files
data_train = load_dataset(path_train)
data_test = load_dataset(path_test)

if not os.path.exists(path_val):
    # No separate validation file: hold out a share of the training sentences
    total_train_len = len(data_train)
    split_val_len = int(total_train_len * args.val_ratio)

    # Shuffle sentence indices with a fixed seed so the split is repeatable
    order = list(range(total_train_len))
    random.seed(args.seed)
    random.shuffle(order)

    # The first split_val_len shuffled indices form val, the remainder stay in train
    data_val = [data_train[idx] for idx in order[:split_val_len]]
    data_train = [data_train[idx] for idx in order[split_val_len:]]
else:
    data_val = load_dataset(path_val)

# Report the split sizes, then write each split to its own directory
print(len(data_train), len(data_val), len(data_test))
save_dataset(data_train, '{}/train'.format(data_dir))
save_dataset(data_val, '{}/val'.format(data_dir))
save_dataset(data_test, '{}/test'.format(data_dir))

42750 2250 3442
Saving in data/msra/train...
- done.
Saving in data/msra/val...
- done.
Saving in data/msra/test...
- done.


In [16]:
def build_tags(data_dir, tags_file):
    """Collect the distinct tags used across the train/val/test splits.

    Reads <data_dir>/<split>/tags.txt for each of the three splits, writes the
    sorted unique tags to tags_file (newline-separated) and returns them.

    Args:
        data_dir: (string) directory containing the train/val/test sub-directories
        tags_file: (string) path of the tag vocabulary file to write

    Returns:
        sorted list of the unique tag strings
    """
    unique_tags = set()
    for split in ('train', 'val', 'test'):
        with open(os.path.join(data_dir, split, 'tags.txt'), 'r') as handle:
            for row in handle:
                # Splitting on single spaces can yield empty strings; drop them
                unique_tags |= {tag for tag in row.strip().split(' ') if tag}

    ordered_tags = sorted(unique_tags)
    with open(tags_file, 'w') as handle:
        handle.write('\n'.join(ordered_tags))
    return ordered_tags
# Build the tag vocabulary from the saved splits, store it alongside them,
# and display it as the cell's output
tags = build_tags(data_dir, '{}/tags.txt'.format(data_dir))
tags

['B-LOC', 'B-ORG', 'B-PER', 'I-LOC', 'I-ORG', 'I-PER', 'O']