In [2]:
import sys
import os
import glob
import pandas as pd
import numpy as np
import text_extensions_for_pandas as tp
from download_and_correct_corpus import Dataset

In [3]:
# Relative paths to the notebook's inputs: the corrections CSV file(s) and the
# corpus file for each of the 'dev', 'test' and 'train' splits.
files = dict(
    csv_files=["../corrected_labels/all_conll_corrections_combined.csv"],
    dev="../original_corpus/eng.testa",
    test="../original_corpus/eng.testb",
    train="../original_corpus/eng.train",
)

In [4]:
# Columns kept from each corrections CSV.
columns = ['doc_offset', 'corpus_span', 'correct_span']

# Per-fold slices of the "Sentence" corrections, one entry per CSV file read.
fold_names = ('test', 'dev', 'train')
frames_by_fold = {fold: [] for fold in fold_names}

for f in files['csv_files']:
    current_df = pd.read_csv(os.path.abspath(f))
    # Compute the error-type mask once per file and reuse it for every fold.
    is_sentence = current_df["error_type"] == "Sentence"
    for fold in fold_names:
        frames_by_fold[fold].append(
            current_df.loc[is_sentence & (current_df["fold"] == fold), columns]
        )


def _combine_fold_frames(frames):
    """Stack the per-file slices of one fold into a single frame with a fresh 0..n-1 index.

    Returns an empty frame carrying the expected columns when no CSV files were read,
    because ``pd.concat`` raises ``ValueError`` on an empty list.
    """
    if not frames:
        return pd.DataFrame(columns=columns)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
    return pd.concat(frames, ignore_index=True)


test_df = _combine_fold_frames(frames_by_fold['test'])
dev_df = _combine_fold_frames(frames_by_fold['dev'])
train_df = _combine_fold_frames(frames_by_fold['train'])

In [5]:
# Write each fold's sentence corrections to its own CSV file.
for fold_frame, csv_path in (
    (test_df, "../corrected_labels/sentence_corection_test.csv"),
    (dev_df, "../corrected_labels/sentence_corection_dev.csv"),
    (train_df, "../corrected_labels/sentence_corection_train.csv"),
):
    fold_frame.to_csv(csv_path)

# Look up a fold's corrections frame by split name.
correction_df = dict(dev=dev_df, test=test_df, train=train_df)

In [6]:
splits = ['dev', 'test', 'train']

# Rows of the 'test' corrections frame whose index is >= this bound are skipped.
# NOTE(review): the cutoff was hard-coded in the original cell without a recorded
# rationale -- confirm why only the first 59 test rows are processed.
TEST_ROW_LIMIT = 59

# For each split, the indices of blank corpus lines ("\n") found inside the
# line range located for each sentence correction.
lines_to_delete = {split: [] for split in splits}

for split in splits:
    # Read the raw corpus file lines; the context manager closes the handle
    # even if reading fails.
    with open(files[split]) as f:
        lines = f.readlines()

    # Create a dataframe for the corpus file and process our corrections csv
    dataset = Dataset(files[split])
    current_df = correction_df[split]
    for i, row in current_df.iterrows():
        if split == 'test' and i >= TEST_ROW_LIMIT:
            continue
        try:
            candidate_lines = dataset.find(row["correct_span"], int(row["doc_offset"]))
        except Exception:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt / SystemExit still stop the cell.
            # Fall back to the span as it appears in the corpus, widened by one
            # line on each side.
            candidate_lines = dataset.find(row["corpus_span"], int(row["doc_offset"]))
            candidate_lines = (candidate_lines[0]-1, candidate_lines[1]+1)
            print("The correct_span did not match lines, using corpus span instead at {}, {}".format(split, i))

        # Clamp to the file's bounds: the widened fallback range could otherwise
        # reach -1 (which silently indexes the last line) or run past the end.
        first_line = max(candidate_lines[0], 0)
        last_line = min(candidate_lines[1], len(lines) - 1)
        blank_lines = [c for c in range(first_line, last_line + 1) if lines[c] == "\n"]
        lines_to_delete[split].extend(blank_lines)
        if not blank_lines:
            print("Nothing to append here! Check {}, {} again".format(split, i))

Nothing to append here! Check test, 20 again
Nothing to append here! Check test, 30 again




Nothing to append here! Check train, 37 again
Nothing to append here! Check train, 38 again
Nothing to append here! Check train, 39 again
Nothing to append here! Check train, 76 again
Nothing to append here! Check train, 77 again
Nothing to append here! Check train, 78 again
Nothing to append here! Check train, 107 again
Nothing to append here! Check train, 108 again
Nothing to append here! Check train, 111 again


In [7]:
# Drop duplicate line indices and order each split's list from highest to lowest.
for split_name, line_numbers in lines_to_delete.items():
    lines_to_delete[split_name] = sorted(set(line_numbers), reverse=True)

In [8]:
import pprint

# Pretty-print the per-split line indices using the default printer settings.
printer = pprint.PrettyPrinter()
printer.pprint(lines_to_delete)

{'dev': [42643,
         38843,
         30692,
         30675,
         30645,
         7869,
         7856,
         7843,
         7430,
         6727,
         6672,
         5414,
         4452,
         4426,
         3216,
         2783],
 'test': [49123,
          48763,
          48676,
          48357,
          48257,
          46910,
          46858,
          46839,
          46144,
          43778,
          43726,
          43649,
          42051,
          8658,
          8636,
          8628,
          8612,
          8597,
          7560,
          6829,
          6104,
          5640,
          5267,
          5047,
          3155,
          1892],
 'train': [219502,
           219329,
           217807,
           216156,
           192381,
           188610,
           188128,
           188098,
           188070,
           188055,
           187979,
           187959,
           179109,
           179107,
           179104,
           173869,
           161214,
 

In [10]:
import json

# Save the per-split line indices as indented JSON with sorted keys.
# Serialising with json.dump avoids rebinding the name `json` to a string, which
# would break any later json.* call in this kernel. The context manager closes
# the file even if writing fails.
with open("../corrected_labels/sentence_corrections.json", "w") as f:
    json.dump(lines_to_delete, f, indent=4, sort_keys=True)