Import modules

In [1]:
import sys
import itertools
import os
import json
import csv

if '..' not in sys.path:
    sys.path.append('..')

from src.document import Document

Read raw data files

In [2]:
def get_data(path):
    """Load consecutively numbered problem files from *path* as Documents.

    Files are expected to be named ``problem-<i>.txt`` (raw text) and
    ``problem-<i>.truth`` (JSON), with ``i`` counting up from 1. Loading
    stops at the first index for which either file is missing.

    Args:
        path: Directory containing the ``problem-<i>`` file pairs.

    Returns:
        A list of ``Document`` objects in file-index order. The list is
        empty when ``problem-1`` does not exist.
    """
    data = []

    for i in itertools.count(start=1):
        stem = os.path.join(path, 'problem-' + str(i))
        # Only the file reads are guarded: a missing file marks the end of
        # the numbered sequence, while errors raised while building the
        # Document should surface instead of silently truncating the data.
        try:
            with open(stem + '.txt', 'r') as text_file:
                text = text_file.read()
            with open(stem + '.truth') as truth_file:
                changes = json.load(truth_file)
        except FileNotFoundError:
            break

        # The parsed .truth JSON object is passed to Document as keyword
        # arguments.
        data.append(Document(text, **changes))

    return data

# Load both raw corpora up front; each is a list of Document objects.
train = get_data(path='../data/train_raw')
validation = get_data(path='../data/validation_raw')

Split each text into chunks:
    - at the given split positions if the text has multiple authors
    - into 3 chunks of roughly equal length (by sentence count) otherwise

Each row pairs two adjacent chunks as a tuple: (first_text_chunk, second_text_chunk, different_author (T/F))

In [3]:
def _adjacent_chunk_rows(d):
    """Return the (chunk, next_chunk, has_changes) rows for one document."""
    sentences = d.sentences
    if d.has_changes:
        # Cut at the annotated sentence positions.
        cuts = [None] + d.sent_positions + [None]
    else:
        # No annotated positions: cut into three parts by sentence count.
        # NOTE: with fewer than 3 sentences the first two chunks are empty.
        third = len(sentences) // 3
        cuts = [None, third, third * 2, None]

    # ``None`` at either end makes the first/last slice open-ended.
    chunks = [
        ' '.join(sentences[start:stop])
        for start, stop in zip(cuts[:-1], cuts[1:])
    ]
    return [
        (first, second, d.has_changes)
        for first, second in zip(chunks[:-1], chunks[1:])
    ]


def gen_chunks_csv(documents, csv_out):
    """Write pairs of adjacent text chunks of each document to a CSV file.

    Every document is cut into chunks of space-joined sentences: at
    ``sent_positions`` when ``has_changes`` is truthy, otherwise into three
    parts by sentence count. Each pair of neighbouring chunks becomes one
    row ``(first_chunk, second_chunk, has_changes)``.

    Args:
        documents: Iterable of objects exposing ``sentences``,
            ``has_changes`` and ``sent_positions``.
        csv_out: Path of the CSV file to create (overwritten if present).
    """
    # Build every row before touching the file, so a failure while chunking
    # does not leave a partially written CSV behind.
    rows = []
    for d in documents:
        rows.extend(_adjacent_chunk_rows(d))

    # newline='' is required by the csv module: without it, row terminators
    # are translated again on Windows and embedded newlines get mangled.
    with open(csv_out, 'w', newline='') as out:
        csv.writer(out).writerows(rows)
            
            
# Output locations of the generated chunk-pair CSV files.
train_path = '../data/train_chunks.csv'
val_path = '../data/validation_chunks.csv'

# Generate each CSV only when it is not already on disk, so re-running the
# cell does not redo the work or overwrite existing files.
for _documents, _chunks_path in ((train, train_path), (validation, val_path)):
    if os.path.exists(_chunks_path):
        continue
    gen_chunks_csv(_documents, _chunks_path)