In [19]:
def remove_spaces(text: str) -> str:
    """Return `text` with all whitespace deleted.

    `str.split()` without an argument splits on any run of whitespace
    (spaces, tabs, newlines, ...) and discards empty pieces, so joining the
    pieces with an empty separator keeps only the non-whitespace characters.
    """
    non_blank_chunks = text.split()
    return "".join(non_blank_chunks)

def sentence_in_raw_document(sentence: str, raw_document: str) -> bool:
    """Tell whether `sentence` occurs in `raw_document`, ignoring whitespace.

    Both arguments are stripped of all whitespace with `remove_spaces` before
    the substring test, so whitespace differences between the two texts
    never prevent a match.
    """
    compact_sentence = remove_spaces(sentence)
    compact_document = remove_spaces(raw_document)
    return compact_sentence in compact_document

In [20]:
import os
import json
import re
from typing import List
from glob import glob

from bs4 import BeautifulSoup

basename = os.path.basename
splitext = os.path.splitext


def get_tokenized_sentences_from_m2_files(m2_file_paths: List[str]) -> List[str]:
    """Collect the sentence text of every "S " line in the given m2 files.

    Files are read in the order given and lines in file order. Each line that
    starts with "S " contributes the text after that two-character marker,
    with surrounding whitespace (including the trailing linefeed) stripped.
    All other lines are ignored.
    """
    sentence_marker = "S "  # "S " marks the start of a sentence.
    marker_length = len(sentence_marker)

    tokenized_sentences: List[str] = []
    for m2_file_path in m2_file_paths:
        with open(m2_file_path) as m2_file:
            tokenized_sentences.extend(
                line[marker_length:].strip()
                for line in m2_file
                if line.startswith(sentence_marker)
            )

    return tokenized_sentences


def get_documents_from_json(document_file_paths: List[str], m2_file_paths: List[str]) -> List[List[str]]:
    """Group the tokenized m2 sentences into documents using JSON-lines files.

    Every non-blank line of every file in `document_file_paths` is parsed as
    a JSON object whose "text" value is one raw document. The tokenized
    sentences read from `m2_file_paths` are consumed strictly in order: as
    long as the next unconsumed sentence occurs (ignoring whitespace) in the
    current document, it is assigned to that document. The first sentence
    that does not occur ends the document and is tried again against the
    next one.

    Args:
        document_file_paths: JSON-lines files, in the same order as the
            sentences appear in the m2 files.
        m2_file_paths: m2 files providing the tokenized sentences.

    Returns:
        One list of tokenized sentences per JSON object, in input order.
        A document that matched no sentence yields an empty list.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a JSON object has no "text" key.
    """
    # Typographic quotes, dashes and full-width punctuation are mapped to an
    # ASCII counterpart before matching; the last two entries map to a space,
    # which is dropped again when whitespace is removed.
    norm_dict = {"’": "'",
                 "´": "'",
                 "‘": "'",
                 "′": "'",
                 "`": "'",
                 '“': '"',
                 '”': '"',
                 '˝': '"',
                 '¨': '"',
                 '„': '"',
                 '『': '"',
                 '』': '"',
                 '–': '-',
                 '—': '-',
                 '―': '-',
                 '¬': '-',
                 '、': ',',
                 '，': ',',
                 '：': ':',
                 '；': ';',
                 '？': '?',
                 '！': '!',
                 'ِ': ' ',
                 '\u200b': ' '}
    # Built once here instead of once per document.
    translation_table = {ord(k): v for k, v in norm_dict.items()}

    # Extracts tokenized sentences.
    tokenized_sentences = get_tokenized_sentences_from_m2_files(m2_file_paths=m2_file_paths)
    sentence_count = len(tokenized_sentences)

    # Extracts documents.
    documents: List[List[str]] = []
    next_sentence_index = 0
    for document_file_path in document_file_paths:
        with open(document_file_path) as document_file:
            # Each line in the document file is a json object.
            for raw_json_object in document_file:
                if not raw_json_object.strip():
                    continue  # A blank line carries no document.
                json_object = json.loads(raw_json_object)

                # Gets a raw document (simple text), normalizes it and drops all whitespace.
                raw_document: str = json_object["text"]
                remaining_text = remove_spaces(raw_document.translate(translation_table))

                document: List[str] = []
                # Bounding the loop by the sentence count (instead of keeping a
                # stale "current sentence" after the last one) means an empty m2
                # input does not crash and the final sentence can never be
                # matched a second time by a later document.
                while next_sentence_index < sentence_count:
                    tokenized_sentence = tokenized_sentences[next_sentence_index]
                    compact_sentence = remove_spaces(tokenized_sentence)
                    if compact_sentence not in remaining_text:
                        break
                    document.append(tokenized_sentence)

                    # Prevents sentences in the next document from being recognized as sentences in the current document.
                    # E.g. a sentence exists in 2 consecutive documents.
                    remaining_text = remaining_text.replace(compact_sentence, "", 1)

                    # Evaluates the next sentence which is extracted from a .m2 file.
                    next_sentence_index += 1

                documents.append(document)

    return documents


def get_documents_from_sgml(document_file_paths: List[str], m2_file_paths: List[str], dataset_name: str) -> List[List[str]]:
    """Group the tokenized m2 sentences into documents using SGML files.

    Every `<DOC ...>...</DOC>` span of every file in `document_file_paths`
    is one document; its text is the `<TITLE>` content (if present) followed
    by the content of its `<P>` elements. The tokenized sentences read from
    `m2_file_paths` are consumed strictly in order: as long as the next
    unconsumed sentence occurs (ignoring whitespace) in the current document,
    it is assigned to that document. The first sentence that does not occur
    ends the document and is tried again against the next one.

    Args:
        document_file_paths: SGML files, in the same order as the sentences
            appear in the m2 files.
        m2_file_paths: m2 files providing the tokenized sentences.
        dataset_name: Name of the dataset; "nucle" switches on two extra
            clean-up steps, any other value leaves the text untouched.

    Returns:
        One list of tokenized sentences per `<DOC>` span, in input order.
        A document that matched no sentence yields an empty list.
    """
    # Patterns are compiled once up front rather than once per document.
    doc_pattern = re.compile(r"<DOC.+?/DOC>", flags=re.DOTALL)
    title_pattern = re.compile(r"<TITLE>(.+?)</TITLE>", flags=re.DOTALL)
    # A paragraph ends at the next "</P>" or, if that is missing, the next "<P>".
    p_pattern = re.compile(r"<P>(.+?)</?P>", flags=re.DOTALL)
    # "<", one character that is not "(", "{", ")", "|", "^" or whitespace,
    # then lazily at least one more character up to the next ">".
    fake_tag_pattern = re.compile(r"<[^(\{)|^(\s)].+?>")
    is_nucle = dataset_name == "nucle"

    # Extracts tokenized sentences.
    tokenized_sentences = get_tokenized_sentences_from_m2_files(m2_file_paths=m2_file_paths)
    sentence_count = len(tokenized_sentences)

    # Extracts documents.
    documents: List[List[str]] = []
    next_sentence_index = 0
    for document_file_path in document_file_paths:
        with open(document_file_path) as document_file:
            raw_sgml: str = document_file.read()

        # Fixes a parsing error in nucle.
        # NOTE(review): besides dropping the leading "<", the replacement also
        # drops the "/" between "files" and "folders"; kept exactly as it was,
        # confirm against the nucle m2 file before touching it.
        if is_nucle:
            raw_sgml = raw_sgml.replace("<nuclearstreet.com/files/folders/1654/download.aspx>", "nuclearstreet.com/filesfolders/1654/download.aspx>")

        for doc in doc_pattern.findall(raw_sgml):
            remaining_text: str = ""

            # The title is optional; a document without one contributes only its paragraphs.
            title_match = title_pattern.search(doc)
            if title_match is not None:
                remaining_text += title_match.group(1).strip()

            paras: List[str] = [para.strip() for para in p_pattern.findall(doc)]
            remaining_text += " ".join(paras)

            # Fixes parsing errors in nucle: removes tag-like spans left inside the text.
            if is_nucle:
                remaining_text = fake_tag_pattern.sub("", remaining_text)

            remaining_text = remove_spaces(remaining_text)

            document: List[str] = []
            # Bounding the loop by the sentence count (instead of keeping a
            # stale "current sentence" after the last one) means an empty m2
            # input does not crash and the final sentence can never be
            # matched a second time by a later document.
            while next_sentence_index < sentence_count:
                tokenized_sentence = tokenized_sentences[next_sentence_index]
                compact_sentence = remove_spaces(tokenized_sentence)
                if compact_sentence not in remaining_text:
                    break
                document.append(tokenized_sentence)

                # Prevents sentences in the next document from being recognized as sentences in the current document.
                # E.g. a sentence exists in 2 consecutive documents.
                remaining_text = remaining_text.replace(compact_sentence, "", 1)

                # Evaluates the next sentence which is extracted from a .m2 file.
                next_sentence_index += 1

            documents.append(document)

    return documents


def get_documents_from_lang8_entries_train(document_file_paths: List[str]) -> List[List[str]]:
    """Read documents from tab-separated lang8 `entries.train`-style files.

    A line with at least five tab-separated fields contributes its fifth
    field (stripped of surrounding whitespace) as one sentence of the current
    document. Any line with fewer fields, typically the blank line between
    two documents, closes the current document.

    Args:
        document_file_paths: Files to read, in order.

    Returns:
        One list of sentences per document, in input order. Consecutive
        separator lines produce empty documents. A final document that is
        not followed by a separator line is kept rather than dropped.
    """
    sentence_column = 4

    # Extracts documents.
    documents: List[List[str]] = []
    for document_file_path in document_file_paths:
        document: List[str] = []
        with open(document_file_path) as document_file:
            for line in document_file:
                fields = line.split("\t")
                if len(fields) > sentence_column:
                    document.append(fields[sentence_column].strip())
                else:  # An '\n' separating 2 documents.
                    documents.append(document)
                    document = []

        # The file may end without a trailing separator line; without this
        # flush the sentences of its last document would be lost.
        if document:
            documents.append(document)

    return documents
    

def get_documents(document_file_paths: List[str], m2_file_paths: List[str], source_file_path: str) -> List[str]:
    """Load the documents of a dataset with the reader matching its format.

    The dataset name is the base name of `source_file_path` with its last
    extension removed. "fce", "wi.train" and "wi.dev" are read as JSON,
    "conll2013", "conll2014" and "nucle" as SGML, and every other name falls
    through to the lang8 reader, which does not use `m2_file_paths`.

    The result is whatever the selected reader returns: a list of documents,
    each being a list of sentences.
    """
    json_datasets = ("fce", "wi.train", "wi.dev")
    sgml_datasets = ("conll2013", "conll2014", "nucle")

    # Gets the name of the dataset.
    source_file_name = basename(source_file_path)
    dataset_name, _extension = splitext(source_file_name)

    if dataset_name in json_datasets:
        return get_documents_from_json(document_file_paths, m2_file_paths)

    if dataset_name in sgml_datasets:
        return get_documents_from_sgml(document_file_paths, m2_file_paths, dataset_name)

    # Anything else is treated as lang8.
    return get_documents_from_lang8_entries_train(document_file_paths)

In [21]:
def get_context(document_level_index: int, document: List[str],
                previous_sentences_number: int, following_sentences_number: int) -> str:
    """Build the context string for one sentence of a document.

    Up to `previous_sentences_number` sentences directly before position
    `document_level_index` are emitted first, each prefixed with "<prev>",
    followed by up to `following_sentences_number` sentences directly after
    it, each prefixed with "<fol>". Positions outside the document are
    skipped, so sentences near either end simply get less context. The
    sentence at `document_level_index` itself is never part of the result.
    """

    def _window(first_index: int, end_index: int) -> List[str]:
        # Both bounds are clamped at 0 because a negative slice bound would
        # count from the end of the list and pull in following sentences;
        # slicing already clamps bounds that run past the end.
        return document[max(first_index, 0):max(end_index, 0)]

    # Gets previous context.
    preceding_sentences = _window(document_level_index - previous_sentences_number,
                                  document_level_index)

    # Gets following context.
    succeeding_sentences = _window(document_level_index + 1,
                                   document_level_index + following_sentences_number + 1)

    tagged_sentences = ["<prev>" + sentence for sentence in preceding_sentences]
    tagged_sentences.extend("<fol>" + sentence for sentence in succeeding_sentences)

    return "".join(tagged_sentences)

In [22]:
def make_context(source_file_path: str, document_file_paths: List[str], m2_file_paths: List[str], previous_sentences_number: int, following_sentences_number: int):
    """Write one line of document context for every line of a source file.

    The source sentences are located in the documents returned by
    `get_documents`, walking forward through the documents and never back.
    For each source line, the surrounding sentences of its document are
    written (via `get_context`) as one line of the output file, whose path is
    `source_file_path` with its last extension replaced by ".ctx".

    Two module-level flags are read:
        save_documents: when truthy, the documents are also dumped to a
            ".documents" file next to the source file.
        verbose: when truthy, debugging details are printed for every source
            line and the function waits for Enter (`input()`) after each one.

    Args:
        source_file_path: File with one source sentence per line.
        document_file_paths: Raw document files, passed to `get_documents`.
        m2_file_paths: m2 files, passed to `get_documents`.
        previous_sentences_number: Maximum number of preceding sentences.
        following_sentences_number: Maximum number of following sentences.

    Raises:
        IndexError: if there are no documents at all, or if a source sentence
            is not found in the current or any later document.
    """
    # Gets all source sentences from `source_file_path`.
    with open(source_file_path) as source_file:
        source_sentences = source_file.readlines()

    # Gets all documents.
    documents: List[List[str]] = get_documents(document_file_paths=document_file_paths, m2_file_paths=m2_file_paths, source_file_path=source_file_path)

    # Gets the path of the source file without its extension.
    source_file_path_without_extension = splitext(source_file_path)[0]

    # Writes documents.
    if save_documents:
        document_path = f"{source_file_path_without_extension}.documents"
        with open(document_path, 'w') as documents_file:
            for document_index, document in enumerate(documents):
                documents_file.write(f"{document_index}\t{document}\n\n")

    # Gets the path of the context file.
    context_file_path = f"{source_file_path_without_extension}.ctx"

    current_document_index = 0
    current_document: List[str] = documents[current_document_index]
    # Whitespace-free copies of the current document's sentences. An entry is
    # blanked ("masked") once a source sentence has been matched to it.
    current_document_masked_spaces_removed: List[str] = [remove_spaces(document_sentence)
                                                         for document_sentence in current_document]
    previous_source_sentence = None
    previous_source_sentence_document_level_index = None
    with open(context_file_path, "w") as context_file:
        for line_number, source_sentence in enumerate(source_sentences, start=1):
            source_sentence_spaces_removed = remove_spaces(source_sentence)

            if source_sentence != previous_source_sentence:
                # In most cases the loop will be executed only once.
                # But for nucle the loop may be executed twice
                # when no sentence is wrong in a document in nucle
                # since correct sentences in the m2 file of nucle
                # are ignored when creating nucle.ori.
                while source_sentence_spaces_removed not in current_document_masked_spaces_removed:
                    current_document_index += 1
                    if current_document_index >= len(documents):
                        # Same exception type as indexing past the end of
                        # `documents`, but naming the offending sentence.
                        raise IndexError(
                            f"source sentence on line {line_number} was not found in the current "
                            f"or any later document: {source_sentence!r}"
                        )
                    current_document = documents[current_document_index]
                    current_document_masked_spaces_removed = [remove_spaces(document_sentence)
                                                              for document_sentence in current_document]

                # Gets the document-level index of the source sentence.
                # The loop above only exits once the sentence is present, so this lookup cannot fail.
                document_level_index = current_document_masked_spaces_removed.index(source_sentence_spaces_removed)
            else:
                # A line identical to the previous one reuses that line's position.
                document_level_index = previous_source_sentence_document_level_index

            previous_source_sentence = source_sentence
            previous_source_sentence_document_level_index = document_level_index

            if verbose:
                print(f"ln:\t{line_number}")
                print("---")
                print(f"document-level index:\t{document_level_index}")
                print("---")
                print(f"src sent:\t{source_sentence}")
                print(f"src sent spaces removed:\t{source_sentence_spaces_removed}")
                print("---")
                print(f"current doc masked spaces removed:\t{current_document_masked_spaces_removed}")
                print("---")
                print()
                input()

            # Prevents an already matched document sentence from being matched again
            # when the same sentence text occurs later, e.g. a sentence that
            # occurs more than once.
            current_document_masked_spaces_removed[document_level_index] = ""

            context = get_context(document_level_index=document_level_index, document=current_document,
                                  previous_sentences_number=previous_sentences_number, following_sentences_number=following_sentences_number)

            # Writes the context to file.
            context_file.write(context)
            context_file.write("\n")

In [30]:
import filepath

if __name__ == '__main__':
    # Builds the .ctx file of every dataset and compares it with a previously
    # saved reference file, printing "<dataset>: True/False" for each.

    fp = filepath.FilePath()

    # Root of the local data checkout; every raw-document path below hangs off it.
    data_root = "/home/neko/GEC/helo_word-master_restricted/data"
    bea19_root = f"{data_root}/bea19"
    wi_json_dir = f"{bea19_root}/wi+locness/json"

    # fce.
    fce_ori_path = fp.FCE_ORI
    fce_doc_paths = sorted(glob(f"{bea19_root}/fce/json/fce.*.json"))
    fce_m2_paths = sorted(glob(f'{fp.fce_m2}/*m2'))

    # wi train.
    # NOTE(review): the levels are listed as A, A, B, C, B, C rather than
    # A, B, C, presumably to follow the sentence order of the sorted train
    # m2 files; confirm before "deduplicating" this list.
    wi_train_ori_path = fp.WI_TRAIN_ORI
    wi_train_doc_paths = [f"{wi_json_dir}/{level}.train.json"
                          for level in ("A", "A", "B", "C", "B", "C")]
    wi_train_m2_paths = sorted(glob(f'{fp.wi_m2}/*train*m2'))

    # wi dev.
    wi_dev_ori_path = fp.WI_DEV_ORI
    wi_dev_doc_paths = sorted(glob(f"{wi_json_dir}/*.dev.json"))
    wi_dev_m2_paths = sorted(glob(f'{fp.wi_m2}/ABCN.dev.gold.bea19.m2'))

    # conll2013.
    conll2013_ori_path = fp.CONLL2013_ORI
    conll2013_doc_paths = sorted(glob(f"{data_root}/conll2013/release2.3.1/revised/data/official.sgml"))
    conll2013_m2_paths = sorted(glob(f'{fp.conll2013_m2}/official-preprocessed.m2'))

    # conll2014.
    conll2014_ori_path = fp.CONLL2014_ORI
    conll2014_doc_paths = sorted(glob(f"{data_root}/conll2014/conll14st-test-data/noalt/official-2014.0.sgml"))
    conll2014_m2_paths = sorted(glob(f'{fp.conll2014_m2}/official-2014.combined.m2'))

    # nucle.
    nucle_ori_path = fp.NUCLE_ORI
    nucle_doc_paths = sorted(glob(f"{bea19_root}/nucle3.3/data/nucle3.2.sgml"))
    nucle_m2_paths = sorted(glob(f'{fp.nucle_m2}/*m2'))

    # lang8.
    lang8_ori_path = fp.LANG8_ORI
    # TODO: entries.train is in lang8_en, and it should be included in the datasets.
    lang8_doc_paths = sorted(glob(f"{bea19_root}/lang8.bea19/entries.train"))
    lang8_m2_paths = None  # The lang8 reader does not use m2 files.

    # ------

    # Module-level flags read by make_context().
    verbose = False
    save_documents = True

    n_prev = 3
    n_fol = 3

    # To process a single dataset, call make_context() directly with that
    # dataset's three path variables, e.g.
    #     make_context(lang8_ori_path, lang8_doc_paths, lang8_m2_paths, n_prev, n_fol)

    # Batch testing.
    # Version suffix of the reference ".ctx_v<N>_correct" file of each dataset.
    correct_ver = {
        "fce": 2,
        "wi.train": 3,
        "wi.dev": 3,
        "conll2013": 2,
        "conll2014": 2,
        "nucle": 1,
        "lang8": 1,
    }
    # Explicit (name, inputs) table instead of eval()-ing variable names.
    datasets = [
        ("fce", fce_ori_path, fce_doc_paths, fce_m2_paths),
        ("wi.train", wi_train_ori_path, wi_train_doc_paths, wi_train_m2_paths),
        ("wi.dev", wi_dev_ori_path, wi_dev_doc_paths, wi_dev_m2_paths),
        ("conll2013", conll2013_ori_path, conll2013_doc_paths, conll2013_m2_paths),
        ("conll2014", conll2014_ori_path, conll2014_doc_paths, conll2014_m2_paths),
        ("nucle", nucle_ori_path, nucle_doc_paths, nucle_m2_paths),
        ("lang8", lang8_ori_path, lang8_doc_paths, lang8_m2_paths),
    ]
    for dataset, ori_path, doc_paths, m2_paths in datasets:
        make_context(ori_path, doc_paths, m2_paths, n_prev, n_fol)

        # Compares the freshly written context file with the saved reference.
        with open(f"{data_root}/parallel/raw/{dataset}.ctx") as f_ctx, \
                open(f'{data_root}/parallel/raw/{dataset}.ctx_v{correct_ver[dataset]}_correct') as f_ctx_correct:
            print(f"{dataset}: {f_ctx.read() == f_ctx_correct.read()}")

fce: True
wi.train: True
wi.dev: True
conll2013: True
conll2014: True
nucle: True
lang8: True
