In [1]:
def read_data(path):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        path: path of the file to read.

    Returns:
        A list with one string per line of the file, in file order. Each
        string has leading and trailing whitespace (including the newline)
        stripped, so a blank line comes back as ``''``.
    """
    # Decode as UTF-8 explicitly instead of relying on the platform's locale
    # encoding, which is not UTF-8 everywhere.
    # NOTE(review): assumes the input files are UTF-8 encoded -- confirm.
    with open(path, encoding='utf-8') as f:
        return [line.strip() for line in f]

In [2]:
def create_split_id_map(splits):
    """Build a lookup from document id to its ``(split, cefr)`` pair.

    Args:
        splits: iterable of strings, each holding exactly three
            tab-separated fields in the order id, split, cefr.

    Returns:
        A dict mapping each id to a ``(split, cefr)`` tuple. If an id occurs
        more than once, the last occurrence wins.

    Raises:
        ValueError: if a non-blank line does not contain exactly three
            tab-separated fields.
    """
    id_split_map = {}
    for line in splits:
        # Blank lines (e.g. an empty line at the end of the splits file)
        # carry no entry; skip them instead of failing on the unpacking below.
        if not line.strip():
            continue
        doc_id, split, cefr = line.split('\t')
        id_split_map[doc_id] = (split, cefr)
    return id_split_map

In [3]:
def add_splits(data, id_split_map, output_path):
    """Write `data` to `output_path` with CEFR and split columns appended.

    The output starts with a header row, followed by one row per input line.
    An empty line is written whenever the document id changes from one row
    to the next, so rows of the same document form a contiguous group.

    Args:
        data: iterable of strings, each holding exactly four tab-separated
            fields: document info, raw, corrected, operation.
        id_split_map: mapping from document id to a ``(split, cefr)`` pair.
        output_path: path of the TSV file to write (overwritten if present).

    Raises:
        ValueError: if a line does not have exactly four tab-separated fields.
        KeyError: if a line's document id is missing from `id_split_map`.
    """
    prev_doc_id = None
    # Explicit UTF-8 so non-ASCII tokens are written the same way regardless
    # of the platform's locale encoding.
    with open(output_path, mode='w', encoding='utf-8') as f:
        f.write('Document\tRaw\tCorrected\tOperation\tCEFR\tSplit\n')
        for line in data:
            doc_info, raw, corrected, op = line.split('\t')
            # The document id is the last '-'-separated piece of whatever
            # precedes the first '.' in the document-info field.
            doc_id = doc_info.split('.')[0].split('-')[-1]
            split, cefr = id_split_map[doc_id]
            # Separate consecutive documents with a blank line (but do not
            # emit one before the very first row).
            if prev_doc_id is not None and doc_id != prev_doc_id:
                f.write('\n')
            f.write(f'{doc_info}\t{raw}\t{corrected}\t{op}\t{cefr}\t{split}\n')
            prev_doc_id = doc_id

In [4]:
# Annotate the AR alignment file with CEFR and split columns.
# [1:] drops the first line of the file (presumably a header row -- confirm).
unsplit_data = read_data('AR-all.alignment-FINAL.tsv')[1:]
# Every line of the splits file is passed on as-is (no line is dropped here);
# create_split_id_map expects tab-separated "id, split, cefr" entries.
splits = read_data('ar_splits.txt')
id_split_map = create_split_id_map(splits)
# Writes the annotated rows, with a header and blank lines between documents.
add_splits(unsplit_data, id_split_map, output_path='AR-all.alignment-FINAL.splits.tsv')

In [5]:
# Annotate the EN alignment file with CEFR and split columns.
# NOTE: this rebinds unsplit_data, splits and id_split_map.
# [1:] drops the first line of the file (presumably a header row -- confirm).
unsplit_data = read_data('EN-all.alignment-FINAL.tsv')[1:]
# Every line of the splits file is passed on as-is (no line is dropped here);
# create_split_id_map expects tab-separated "id, split, cefr" entries.
splits = read_data('en_splits.txt')
id_split_map = create_split_id_map(splits)
# Writes the annotated rows, with a header and blank lines between documents.
add_splits(unsplit_data, id_split_map, output_path='EN-all.alignment-FINAL.splits.tsv')

In [6]:
def collate_data(data):
    """Group rows into examples and sort the examples into Train/Dev/Test.

    An example is a maximal run of consecutive non-empty lines; empty lines
    act as separators. Each example is assigned to the split named in its
    last row.

    Args:
        data: iterable of strings. Each non-empty string holds exactly six
            tab-separated fields: document info, raw, corrected, operation,
            CEFR, split. An empty string ends the current example.

    Returns:
        A ``(train, dev, test)`` tuple of lists. Every element is one
        example: a list of ``(doc_info, raw, corrected, cefr, op)`` tuples
        (note that CEFR comes before the operation). Examples whose split is
        not ``'Train'``, ``'Dev'`` or ``'Test'`` are dropped.

    Raises:
        ValueError: if a non-empty line does not have exactly six
            tab-separated fields.
    """
    examples_by_split = {'Train': [], 'Dev': [], 'Test': []}
    current = []
    current_split = None

    def flush():
        # Store the pending example, if there is one. Guarding on `current`
        # keeps leading or repeated blank lines from reading an unset split
        # or from recording empty examples.
        if current and current_split in examples_by_split:
            examples_by_split[current_split].append(current)

    for line in data:
        if line:
            doc_info, raw, corrected, op, cefr, current_split = line.split('\t')
            current.append((doc_info, raw, corrected, cefr, op))
        else:
            flush()
            current = []

    # The input need not end with a separator; keep the trailing example too.
    flush()

    return (
        examples_by_split['Train'],
        examples_by_split['Dev'],
        examples_by_split['Test'],
    )

In [10]:
def write_collated_data(collated_data, output_path):
    """Write collated examples to a tab-separated file.

    The file starts with a header row. Each example is written as one row
    per sentence tuple, followed by an empty line.

    Args:
        collated_data: iterable of examples, each an iterable of
            ``(doc_info, raw, corrected, cefr, op)`` tuples.
        output_path: path of the file to write (overwritten if present).
            The containing directory is not created by this function.

    Raises:
        ValueError: if a sentence tuple does not have exactly five items.
    """
    # Explicit UTF-8 so non-ASCII tokens are written the same way regardless
    # of the platform's locale encoding.
    with open(output_path, mode='w', encoding='utf-8') as f:
        f.write('Document\tRaw\tCorrected\tCEFR\tOperation\n')
        for ex in collated_data:
            for doc_info, raw, correct, cefr, op in ex:
                f.write(f'{doc_info}\t{raw}\t{correct}\t{cefr}\t{op}\n')
            # Blank line marks the end of the example.
            f.write('\n')

In [11]:
# Reload the annotated AR file produced by add_splits; [1:] skips the header
# row that add_splits writes as the first line.
data_with_splits = read_data('AR-all.alignment-FINAL.splits.tsv')[1:]
# Group the rows into examples, one list of examples per split.
train_examples, dev_examples, test_examples = collate_data(data_with_splits)

In [12]:
# Write each split to its own file under data/ar/. This uses whatever
# train/dev/test lists were most recently returned by collate_data, so it must
# run after the AR collate step and before those names are rebound.
# write_collated_data only opens the files; the directories are not created here.
write_collated_data(train_examples, 'data/ar/train/train.tokens')
write_collated_data(dev_examples, 'data/ar/dev/dev.tokens')
write_collated_data(test_examples, 'data/ar/test/test.tokens')

In [13]:
# Reload the annotated EN file produced by add_splits; [1:] skips the header
# row that add_splits writes as the first line.
# NOTE: this rebinds data_with_splits and the three *_examples names.
data_with_splits = read_data('EN-all.alignment-FINAL.splits.tsv')[1:]
# Group the rows into examples, one list of examples per split.
train_examples, dev_examples, test_examples = collate_data(data_with_splits)

In [14]:
# Write each split to its own file under data/en/. This uses whatever
# train/dev/test lists were most recently returned by collate_data, so it must
# run after the EN collate step.
# write_collated_data only opens the files; the directories are not created here.
write_collated_data(train_examples, 'data/en/train/train.tokens')
write_collated_data(dev_examples, 'data/en/dev/dev.tokens')
write_collated_data(test_examples, 'data/en/test/test.tokens')