In [1]:
import sys
import os
import glob
import pandas as pd
import numpy as np

sys.path.append(os.path.abspath('../../text-extensions-for-pandas'))
import text_extensions_for_pandas as tp

from download_and_correct_corpus import Dataset



In [2]:
# Locations of the inputs, relative to this notebook:
#   csv_files        - list of CSV files holding the combined label corrections
#   dev/test/train   - the original corpus file for each fold
files = dict(
    csv_files=["../corrected_labels/all_conll_corrections_combined.csv"],
    dev="../original_corpus/eng.testa",
    test="../original_corpus/eng.testb",
    train="../original_corpus/eng.train",
)

In [3]:
# Columns carried over from the combined corrections CSV(s).
columns = ['doc_offset', 'corpus_span', 'correct_span', 'correct_ent_type', 'error_type']

# Per-fold lists of filtered pieces. The pieces are concatenated once at the
# end; DataFrame.append (used previously) was removed in pandas 2.0.
token_rows_by_fold = {'test': [], 'dev': [], 'train': []}

for f in files['csv_files']:
    current_df = pd.read_csv(os.path.abspath(f))
    # Only corrections whose error type is "Token" are handled by this notebook.
    token_rows = current_df[current_df["error_type"] == "Token"]
    for fold_name, pieces in token_rows_by_fold.items():
        pieces.append(token_rows.loc[token_rows["fold"] == fold_name, columns])


def _combine_token_rows(pieces):
    """Stack the filtered pieces for one fold into a single frame.

    Returns an empty frame with the expected columns when there are no
    pieces (i.e. no CSV files were listed), because pd.concat raises on an
    empty list.
    """
    if not pieces:
        return pd.DataFrame(columns=columns)
    return pd.concat(pieces, ignore_index=True)


test_df = _combine_token_rows(token_rows_by_fold['test'])
dev_df = _combine_token_rows(token_rows_by_fold['dev'])
train_df = _combine_token_rows(token_rows_by_fold['train'])

display(train_df)
display(dev_df)
display(test_df)

Unnamed: 0,doc_offset,corpus_span,correct_span,correct_ent_type,error_type
0,80,"[44, 59): 'rebels-Interfax'","[51, 59): 'Interfax'",ORG,Token
1,115,"[17, 27): 'FOCUS-News'","[23, 27): 'News'",ORG,Token
2,163,"[220, 232): 'x-AEK Athens'","[222, 232): 'AEK Athens'",ORG,Token
3,163,"[271, 283): 'x-Olympiakos'","[273, 283): 'Olympiakos'",ORG,Token
4,163,"[308, 313): 'x-PAO'","[310, 313): 'PAO'",ORG,Token
5,169,"[50, 61): 'trip-Canada'","[55, 61): 'Canada'",LOC,Token
6,198,"[11, 20): 'WSC-India'","[11, 14): 'WSC'",ORG,Token
7,198,"[11, 20): 'WSC-India'","[15, 20): 'India'",LOC,Token
8,298,"[49, 60): '1997--Ruehe'","[55, 60): 'Ruehe'",PER,Token
9,343,"[11, 31): 'AUSTRALIAN RULES-AFL'","[11, 21): 'AUSTRALIAN'",MISC,Token


Unnamed: 0,doc_offset,corpus_span,correct_span,correct_ent_type,error_type
0,15,"[41, 51): 'CUNNINGHAM'","[33, 51): 'RANDALL CUNNINGHAM'",PER,Token
1,39,"[11, 23): 'Boxing-Bruno'","[18, 23): 'Bruno'",PER,Token
2,60,"[1358, 1371): 'Tripoli-based'","[1358, 1365): 'Tripoli'",LOC,Token
3,65,"[1125, 1134): 'asset-St.'","[1131, 1140): 'St. Louis'",LOC,Token
4,65,"[1359, 1376): 'Minneapolis-based'","[1359, 1370): 'Minneapolis'",LOC,Token
5,65,"[1419, 1429): 'Ohio-based'","[1419, 1423): 'Ohio'",LOC,Token
6,65,"[592, 607): 'St. Louis-based'","[592, 601): 'St. Louis'",LOC,Token
7,65,"[793, 802): 'Mo.-based'","[793, 796): 'Mo.'",LOC,Token
8,175,"[252, 264): 'London-based'","[252, 258): 'London'",LOC,Token
9,181,"[1761, 1774): 'Moscow-backed'","[1761, 1767): 'Moscow'",LOC,Token


Unnamed: 0,doc_offset,corpus_span,correct_span,correct_ent_type,error_type
0,3,"[21, 37): 'SKIING-WORLD CUP'","[28, 37): 'WORLD CUP'",MISC,Token
1,25,"[24, 38): 'FOOTBALL-COLTS'","[33, 38): 'COLTS'",ORG,Token
2,27,"[565, 573): 'X-DENVER'","[567, 573): 'DENVER'",ORG,Token
3,27,"[889, 900): 'Y-GREEN BAY'","[891, 900): 'GREEN BAY'",ORG,Token
4,29,"[25, 44): 'FOOTBALL-OHIO STATE'","[34, 44): 'OHIO STATE'",ORG,Token
5,39,"[1158, 1175): 'AbelardoFernandez'","[1158, 1175): 'Abelardo Fernandez'",PER,Token
6,40,"[215, 236): 'Real Madrid-Barcelona'","[215, 226): 'Real Madrid'",ORG,Token
7,40,"[215, 236): 'Real Madrid-Barcelona'","[227, 236): 'Barcelona'",ORG,Token
8,54,"[11, 27): 'INTERVIEW-ZYWIEC'","[21, 27): 'ZYWIEC'",ORG,Token
9,56,"[11, 16): 'UK-US'","[11, 13): 'UK'",LOC,Token


In [4]:
# Save each fold's token corrections to its own CSV file (index included).
for fold_frame, fold_csv_path in (
    (test_df, "../corrected_labels/token_corection_test.csv"),
    (dev_df, "../corrected_labels/token_corection_dev.csv"),
    (train_df, "../corrected_labels/token_corection_train.csv"),
):
    fold_frame.to_csv(fold_csv_path)

# Look up a fold's corrections frame by fold name.
correction_df = {'dev': dev_df, 'test': test_df, 'train': train_df}

In [5]:
# Folds to process; each name is a key of both `files` and `correction_df`.
splits = ['dev', 'test', 'train']

for split in splits:
    # Read the raw corpus file lines. The context manager closes the handle
    # when done (it was previously left open for every split).
    # `lines` is not used further in this cell.
    with open(files[split]) as corpus_file:
        lines = corpus_file.readlines()

    # Create a dataframe for the corpus file and process our corrections csv
    dataset = Dataset(files[split])
    current_df = correction_df[split]

    # These columns are added in place, so they also appear on the
    # dev_df / test_df / train_df objects stored in `correction_df`.
    current_df["line_no"] = 0
    current_df["correct_line"] = ""
    current_df["fold"] = split

    for i, row in current_df.iterrows():
        # NOTE(review): test-fold rows with index >= 59 are skipped and keep
        # line_no == 0; the reason is not visible in this notebook -- confirm.
        if split == 'test' and i >= 59:
            continue
        try:
            candidate_lines = dataset.find(row["corpus_span"], int(row["doc_offset"]))
        except Exception:
            # Looking up the corpus span failed: fall back to the corrected
            # span and widen the returned range by one line on each side.
            # `except Exception` (rather than a bare `except`) lets
            # KeyboardInterrupt / SystemExit propagate.
            candidate_lines = dataset.find(row["correct_span"], int(row["doc_offset"]))
            candidate_lines = (candidate_lines[0]-1, candidate_lines[1]+1)
            print("The correct_span did not match lines. {}, {}".format(split, i))
        # Record the first line of the matched range for this correction.
        current_df.at[i,"line_no"] = candidate_lines[0]
        print(candidate_lines[0])

4120
10619
15911
19233
19282
19291
19130
19168
44837
46457
50964
1131
5687
6545
6640
6851
8644
8709
8709
12068
13001
13001
14467
16319
16319
26720
26721
26728
28718
28719
38688
39979
40619
41169
41413
41595
41850
42385
46987
17720
26117
37914
37926
37933
38844
44449
44449
65636
75410
75410
95553
102338
139024
139024
146605
164709
172643
213462


In [8]:
# Turn off the 60-row limit for displaying dataframes
pd.options.display.max_rows = None

# Stack the three folds (test, dev, train) in that order. pd.concat replaces
# the chained DataFrame.append calls, which were removed in pandas 2.0. The
# per-fold index values are kept as-is (no ignore_index), as before.
full_df = pd.concat([test_df, dev_df, train_df])

# Keep only the columns needed downstream. The explicit copy makes full_df an
# independent frame, since later cells add columns to it.
full_df = full_df[['fold','line_no', 'doc_offset', 'corpus_span', 'correct_span', 'correct_ent_type']].copy()
full_df

Unnamed: 0,fold,line_no,doc_offset,corpus_span,correct_span,correct_ent_type
0,test,1131,3,"[21, 37): 'SKIING-WORLD CUP'","[28, 37): 'WORLD CUP'",MISC
1,test,5687,25,"[24, 38): 'FOOTBALL-COLTS'","[33, 38): 'COLTS'",ORG
2,test,6545,27,"[565, 573): 'X-DENVER'","[567, 573): 'DENVER'",ORG
3,test,6640,27,"[889, 900): 'Y-GREEN BAY'","[891, 900): 'GREEN BAY'",ORG
4,test,6851,29,"[25, 44): 'FOOTBALL-OHIO STATE'","[34, 44): 'OHIO STATE'",ORG
5,test,8644,39,"[1158, 1175): 'AbelardoFernandez'","[1158, 1175): 'Abelardo Fernandez'",PER
6,test,8709,40,"[215, 236): 'Real Madrid-Barcelona'","[215, 226): 'Real Madrid'",ORG
7,test,8709,40,"[215, 236): 'Real Madrid-Barcelona'","[227, 236): 'Barcelona'",ORG
8,test,12068,54,"[11, 27): 'INTERVIEW-ZYWIEC'","[21, 27): 'ZYWIEC'",ORG
9,test,13001,56,"[11, 16): 'UK-US'","[11, 13): 'UK'",LOC


In [9]:
# Add some document context to the corrections

# Raw lines of each fold's corpus file, keyed by fold name.
file_contents = {}
for fold_name in splits:
    corpus_path = files[fold_name]
    print(fold_name, corpus_path)
    with open(corpus_path, 'r') as corpus_file:
        file_contents[fold_name] = list(corpus_file)

# For every correction: the line at `line_no`, plus a window of up to two
# lines before and two lines after it.
target_lines, contexts = [], []
for row_fold, row_line_no in zip(full_df["fold"], full_df["line_no"]):
    corpus_lines = file_contents[row_fold]
    target_lines.append(corpus_lines[row_line_no].strip())

    window = corpus_lines[max(0, row_line_no - 2):row_line_no + 3]
    snippet = "".join(window).strip()
    # Minus at beginning of cell confuses Excel
    contexts.append(snippet.replace("-DOCSTART-", "DOCSTART"))

full_df["target_line"] = target_lines
full_df["context"] = contexts
full_df

dev ../original_corpus/eng.testa
test ../original_corpus/eng.testb
train ../original_corpus/eng.train


Unnamed: 0,fold,line_no,doc_offset,corpus_span,correct_span,correct_ent_type,target_line,context
0,test,1131,3,"[21, 37): 'SKIING-WORLD CUP'","[28, 37): 'WORLD CUP'",MISC,SKIING-WORLD NNP I-NP I-MISC,FREESTYLE NNP I-NP O\nSKIING-WORLD NNP I-NP I-...
1,test,5687,25,"[24, 38): 'FOOTBALL-COLTS'","[33, 38): 'COLTS'",ORG,FOOTBALL-COLTS NNS I-NP O,NFL NNP I-NP I-ORG\nAMERICAN NNP I-NP O\nFOOTB...
2,test,6545,27,"[565, 573): 'X-DENVER'","[567, 573): 'DENVER'",ORG,X-DENVER NNP I-NP I-MISC,PA NNP I-NP O\n\nX-DENVER NNP I-NP I-MISC\n12 ...
3,test,6640,27,"[889, 900): 'Y-GREEN BAY'","[891, 900): 'GREEN BAY'",ORG,Y-GREEN NNP I-NP I-MISC,PA NNP I-NP O\n\nY-GREEN NNP I-NP I-MISC\nBAY ...
4,test,6851,29,"[25, 44): 'FOOTBALL-OHIO STATE'","[34, 44): 'OHIO STATE'",ORG,FOOTBALL-OHIO NNP I-NP I-MISC,NCAA NNP I-NP I-ORG\nAMERICAN NNP I-NP O\nFOOT...
5,test,8644,39,"[1158, 1175): 'AbelardoFernandez'","[1158, 1175): 'Abelardo Fernandez'",PER,AbelardoFernandez NNS I-NP I-PER,25-1 NNP I-NP O\n\nAbelardoFernandez NNS I-NP ...
6,test,8709,40,"[215, 236): 'Real Madrid-Barcelona'","[215, 226): 'Real Madrid'",ORG,Real NNP I-NP I-MISC,'s POS B-NP O\nbig JJ I-NP O\nReal NNP I-NP I-...
7,test,8709,40,"[215, 236): 'Real Madrid-Barcelona'","[227, 236): 'Barcelona'",ORG,Real NNP I-NP I-MISC,'s POS B-NP O\nbig JJ I-NP O\nReal NNP I-NP I-...
8,test,12068,54,"[11, 27): 'INTERVIEW-ZYWIEC'","[21, 27): 'ZYWIEC'",ORG,INTERVIEW-ZYWIEC NNP I-NP I-MISC,DOCSTART -X- -X- O\n\nINTERVIEW-ZYWIEC NNP I-N...
9,test,13001,56,"[11, 16): 'UK-US'","[11, 13): 'UK'",LOC,UK-US NNP I-NP I-MISC,DOCSTART -X- -X- O\n\nUK-US NNP I-NP I-MISC\no...


In [10]:
# Add an empty "correct_line" column: this is where a human labeler
# needs to enter the replacement for the line
full_df = full_df.assign(correct_line="")
full_df

Unnamed: 0,fold,line_no,doc_offset,corpus_span,correct_span,correct_ent_type,target_line,context,correct_line
0,test,1131,3,"[21, 37): 'SKIING-WORLD CUP'","[28, 37): 'WORLD CUP'",MISC,SKIING-WORLD NNP I-NP I-MISC,FREESTYLE NNP I-NP O\nSKIING-WORLD NNP I-NP I-...,
1,test,5687,25,"[24, 38): 'FOOTBALL-COLTS'","[33, 38): 'COLTS'",ORG,FOOTBALL-COLTS NNS I-NP O,NFL NNP I-NP I-ORG\nAMERICAN NNP I-NP O\nFOOTB...,
2,test,6545,27,"[565, 573): 'X-DENVER'","[567, 573): 'DENVER'",ORG,X-DENVER NNP I-NP I-MISC,PA NNP I-NP O\n\nX-DENVER NNP I-NP I-MISC\n12 ...,
3,test,6640,27,"[889, 900): 'Y-GREEN BAY'","[891, 900): 'GREEN BAY'",ORG,Y-GREEN NNP I-NP I-MISC,PA NNP I-NP O\n\nY-GREEN NNP I-NP I-MISC\nBAY ...,
4,test,6851,29,"[25, 44): 'FOOTBALL-OHIO STATE'","[34, 44): 'OHIO STATE'",ORG,FOOTBALL-OHIO NNP I-NP I-MISC,NCAA NNP I-NP I-ORG\nAMERICAN NNP I-NP O\nFOOT...,
5,test,8644,39,"[1158, 1175): 'AbelardoFernandez'","[1158, 1175): 'Abelardo Fernandez'",PER,AbelardoFernandez NNS I-NP I-PER,25-1 NNP I-NP O\n\nAbelardoFernandez NNS I-NP ...,
6,test,8709,40,"[215, 236): 'Real Madrid-Barcelona'","[215, 226): 'Real Madrid'",ORG,Real NNP I-NP I-MISC,'s POS B-NP O\nbig JJ I-NP O\nReal NNP I-NP I-...,
7,test,8709,40,"[215, 236): 'Real Madrid-Barcelona'","[227, 236): 'Barcelona'",ORG,Real NNP I-NP I-MISC,'s POS B-NP O\nbig JJ I-NP O\nReal NNP I-NP I-...,
8,test,12068,54,"[11, 27): 'INTERVIEW-ZYWIEC'","[21, 27): 'ZYWIEC'",ORG,INTERVIEW-ZYWIEC NNP I-NP I-MISC,DOCSTART -X- -X- O\n\nINTERVIEW-ZYWIEC NNP I-N...,
9,test,13001,56,"[11, 16): 'UK-US'","[11, 13): 'UK'",LOC,UK-US NNP I-NP I-MISC,DOCSTART -X- -X- O\n\nUK-US NNP I-NP I-MISC\no...,


In [11]:
# Write the worksheet for the manual step below (index column omitted).
new_corrections_csv = "../corrected_labels/token_corrections_new.csv"
full_df.to_csv(new_corrections_csv, index=False)

# Manual Step

Now you need to open up `token_corrections_new.csv` in a spreadsheet 
and, for each row, manually type the replacement for the target line into the
`correct_line` column.

Once you've finished your edits, replace the existing file 
`token_corrections.csv` with the results of your edits. Then rerun
the next cell to regenerate `token_corrections.json`.

**Tip:** You can use the clipboard to copy the corrections that didn't 
change from `token_corrections.csv` to `token_corrections_new.csv`.

**Tip:** To be able to view the "context" column after opening the CSV file in
Excel, select the data rows and set the row height to 85.

**Tip:** To paste a multiline string into a cell of an Excel file, 
**double**-click the cell and then paste.

**Tip:** If you need to edit a different line from the line in the "line_no"
column, then you can change the value of the "line_no" column.
A good example of this is the correction for "Real Madrid-Barcelona" in 
document 40 of the `test` fold, where the tokenizer treated "Madrid-Barcelona"
as a single token.

In [18]:
# Put the manually-curated file into the format that downstream scripts expect:
#   1. load the curated CSV,
#   2. keep only the expected columns,
#   3. drop duplicate edits on the same source line,
#   4. use the line number as the index.
# NOTE(review): "line_no" values shared by different folds would make the
# index non-unique -- confirm this cannot happen before relying on the JSON.
token_corrections_json_df = (
    pd.read_csv("../corrected_labels/token_corrections.csv")
    .loc[:, ["line_no", "fold", "doc_offset", "corpus_span", "correct_span", "correct_line"]]
    .drop_duplicates(subset=["fold", "line_no", "doc_offset"], ignore_index=True)
    .set_index("line_no")
)

token_corrections_json_df.to_json("../corrected_labels/token_corrections.json", indent=4)
token_corrections_json_df

Unnamed: 0_level_0,fold,doc_offset,corpus_span,correct_span,correct_line
line_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1131,test,3,"[21, 37): 'SKIING-WORLD CUP'","[28, 37): 'WORLD CUP'",SKIING NNP I-NP O\n- NNP I-NP O\nWORLD NNP I-N...
5687,test,25,"[24, 38): 'FOOTBALL-COLTS'","[33, 38): 'COLTS'",FOOTBALL NNS I-NP O\n- NNS I-NP O\nCOLTS NNS I...
6545,test,27,"[565, 573): 'X-DENVER'","[567, 573): 'DENVER'",X NNP I-NP O\n- NNP I-NP O\nDENVER NNP I-NP I-...
6640,test,27,"[889, 900): 'Y-GREEN BAY'","[891, 900): 'GREEN BAY'",Y NNP I-NP O\n- NNP I-NP O\nGREEN NNP I-NP I-MISC
6851,test,29,"[25, 44): 'FOOTBALL-OHIO STATE'","[34, 44): 'OHIO STATE'",FOOTBALL NNP I-NP O\n- NNP I-NP O\nOHIO NNP I-...
8644,test,39,"[1158, 1175): 'AbelardoFernandez'","[1158, 1175): 'Abelardo Fernandez'",Abelardo NNS I-NP I-PER\nFernandez NNS I-NP I-PER
8709,test,40,"[215, 236): 'Real Madrid-Barcelona'","[215, 226): 'Real Madrid'",Real NNP I-NP I-ORG
8710,test,40,"[215, 236): 'Real Madrid-Barcelona'","[227, 236): 'Barcelona'",Madrid NNP I-NP I-ORG\n- NNP I-NP O\nBarcelona...
12068,test,54,"[11, 27): 'INTERVIEW-ZYWIEC'","[21, 27): 'ZYWIEC'",INTERVIEW NNP I-NP O\n- NNP I-NP O\nZYWIEC NNP...
13001,test,56,"[11, 16): 'UK-US'","[11, 13): 'UK'",UK NNP I-NP I-LOC\n- NNP I-NP O\nUS NNP I-NP I...
