In [1]:
import itertools
import os
import json
import pandas as pd
import numpy as np

In [2]:
import sys
if '..' not in sys.path:
    sys.path.append('..')

from src.document import Document
from src.stylometry_extractor import StylometryExtractor
from src.text_chunk import TextChunk

In [3]:
def get_data(path):
    """Load the consecutively numbered problems stored in ``path``.

    Problems are read as pairs ``problem-<i>.txt`` / ``problem-<i>.truth``
    for ``i = 1, 2, 3, ...``. Reading stops at the first index for which
    either file is missing.

    Args:
        path: directory containing the ``problem-<i>`` files.

    Returns:
        list: one ``Document(text, **truth)`` per problem, in index order,
        where ``truth`` is the JSON object parsed from the ``.truth`` file.
        The list is empty when ``problem-1`` cannot be found.
    """
    data = []

    for i in itertools.count(start=1):
        stem = os.path.join(path, 'problem-' + str(i))
        try:
            # Context managers close both handles right away; the corpus can
            # hold many files and the handles were previously never closed.
            with open(stem + '.txt', 'r') as text_file:
                text = text_file.read()
            with open(stem + '.truth') as truth_file:
                changes = json.load(truth_file)
        except FileNotFoundError:
            # First gap in the numbering marks the end of the data set.
            break

        # Built outside the ``try`` so that a FileNotFoundError raised while
        # constructing the document is not mistaken for "no more problems".
        data.append(Document(text, **changes))

    return data

In [4]:
# Load both raw corpora (training first, then validation) as lists of documents.
train, validation = map(
    get_data,
    ('../data/train_raw', '../data/validation_raw'),
)

In [5]:
def gen_chunks_csv(documents, csv_out):
    """Write pairs of neighbouring text chunks from ``documents`` to a CSV file.

    Each document is cut into consecutive runs of sentences:

    * when ``d.has_changes`` is true, the cut points are ``d.sent_positions``;
    * otherwise the document is cut in three at ``len(d.sentences) // 3`` and
      at twice that index.

    The sentences of each run are joined with single spaces, and every pair of
    neighbouring chunks produces one row
    ``(left_chunk, right_chunk, d.has_changes)``. No header row is written.

    Args:
        documents: iterable of objects exposing ``has_changes``, ``sentences``
            and, when ``has_changes`` is true, a list ``sent_positions``.
        csv_out: path of the CSV file to create (an existing file is
            overwritten).
    """
    # Imported here because the notebook's import cell does not provide
    # ``csv``; without it the function failed with a NameError.
    import csv

    rows = []
    for d in documents:
        if d.has_changes:
            bounds = [None] + d.sent_positions + [None]
        else:
            third = len(d.sentences) // 3
            bounds = [None, third, third * 2, None]

        # ``None`` as a slice bound means "from the start" / "to the end".
        chunks = [
            ' '.join(d.sentences[start:stop])
            for start, stop in zip(bounds[:-1], bounds[1:])
        ]

        rows.extend(
            (left, right, d.has_changes)
            for left, right in zip(chunks[:-1], chunks[1:])
        )

    # newline='' lets the csv module control line endings itself, as its
    # documentation requires; otherwise Windows output gains extra '\r'.
    with open(csv_out, 'w', newline='') as out:
        # Distinct name: the ``csv_out`` parameter is no longer rebound.
        writer = csv.writer(out)
        writer.writerows(rows)

In [6]:
# Write the training chunk pairs once; the CSV is reused when it already exists.
# The file must be built from `train`: it used to be built from `validation`,
# which made it a duplicate of the validation chunks. Delete any
# '../data/train_chunks.csv' produced that way so it gets regenerated.
csv_name = '../data/train_chunks.csv'
if not os.path.exists(csv_name):
    gen_chunks_csv(train, csv_name)

In [7]:
# Write the validation chunk pairs once; skipped when the CSV already exists,
# so delete the file to force regeneration.
csv_name = '../data/validation_chunks.csv'
if not os.path.exists(csv_name):
    gen_chunks_csv(validation, csv_name)

In [8]:
def gen_features_df(documents):
    """Build one feature row per pair of neighbouring chunks in ``documents``.

    A document is cut at ``d.sent_positions`` when ``d.has_changes`` is true,
    and otherwise in three at ``len(d.sentences) // 3`` and twice that index.
    The sentences of each part are space-joined and wrapped in a ``TextChunk``.
    For every pair of neighbouring chunks, the mapping returned by
    ``left.absolute_difference_with(right)`` gets an extra
    ``'different_author'`` entry set to ``d.has_changes`` and becomes one row.

    Args:
        documents: iterable of objects exposing ``has_changes``, ``sentences``
            and, when ``has_changes`` is true, a list ``sent_positions``.

    Returns:
        pandas.DataFrame: one row per neighbouring chunk pair, in document
        order; empty when ``documents`` is empty.
    """
    feature_rows = []

    for doc in documents:
        if doc.has_changes:
            bounds = [None] + doc.sent_positions + [None]
        else:
            third = len(doc.sentences) // 3
            bounds = [None, third, third * 2, None]

        # All chunks are built first, then compared pairwise; ``None`` bounds
        # stand for the start / end of the sentence list.
        chunks = [
            TextChunk(' '.join(doc.sentences[start:stop]))
            for start, stop in zip(bounds[:-1], bounds[1:])
        ]

        for left, right in zip(chunks[:-1], chunks[1:]):
            differences = left.absolute_difference_with(right)
            differences['different_author'] = doc.has_changes
            feature_rows.append(pd.Series(differences))

    return pd.DataFrame(feature_rows)

In [9]:
# Training features: computed and saved on the first run, read back from the
# CSV on later runs. Delete the file to force a recomputation.
csv_name = '../data/train_chunks_features.csv'
df_train = None
if not os.path.exists(csv_name):
    df_train = gen_features_df(train)
    df_train.to_csv(csv_name, index=False)
else:
    df_train = pd.read_csv(csv_name)

In [10]:
# Quick check: total of the "you're" column over all rows of the training
# features (built-in sum, so any NaN in the column would make the result NaN).
sum(df_train["you're"])

5564.084117512164

In [11]:
# Validation features: computed and saved on the first run, read back from the
# CSV on later runs. Delete the file to force a recomputation.
csv_name = '../data/validation_chunks_features.csv'
df_validation = None
if not os.path.exists(csv_name):
    df_validation = gen_features_df(validation)
    df_validation.to_csv(csv_name, index=False)
else:
    df_validation = pd.read_csv(csv_name)