In [97]:
import os
import json
from typing import List
from bs4 import BeautifulSoup

basename = os.path.basename
splitext = os.path.splitext


def get_documents_from_json(document_file_paths: List[str]) -> List[str]:
    """Load documents from JSON-lines files and normalize their punctuation.

    Every non-blank line of every file is parsed as one JSON object, and the
    value of its ``"text"`` key is taken as the document. Typographic and
    full-width punctuation is then mapped to plain ASCII equivalents.

    Args:
        document_file_paths: Paths of the JSON-lines files, read in the
            given order.

    Returns:
        The normalized documents, ordered by file and then by line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a JSON object has no ``"text"`` key.
    """
    # Single-character replacements applied to every document.
    norm_dict = {"’": "'",
                 "´": "'",
                 "‘": "'",
                 "′": "'",
                 "`": "'",
                 '“': '"',
                 '”': '"',
                 '˝': '"',
                 '¨': '"',
                 '„': '"',
                 '『': '"',
                 '』': '"',
                 '–': '-',
                 '—': '-',
                 '―': '-',
                 '¬': '-',
                 '、': ',',
                 '，': ',',
                 '：': ':',
                 '；': ';',
                 '？': '?',
                 '！': '!',
                 'ِ': ' ',
                 '\u200b': ' '}
    # `str.translate` expects code points as keys.
    translation_table = {ord(k): v for k, v in norm_dict.items()}

    documents = []
    for document_file_path in document_file_paths:
        # Decode explicitly as UTF-8 so the non-ASCII punctuation above is
        # read correctly regardless of the platform's locale encoding.
        with open(document_file_path, encoding="utf-8") as document_file:
            for raw_json_object in document_file:
                # A blank line (e.g. a trailing empty line) is not a JSON
                # object; skip it instead of crashing in `json.loads`.
                if not raw_json_object.strip():
                    continue

                json_object = json.loads(raw_json_object)
                documents.append(json_object["text"].translate(translation_table))

    return documents


def get_documents_from_sgml(document_file_paths: List[str]) -> List[str]:
    """Load documents from SGML files.

    Each ``<doc>`` element yields one document: the stripped text of its
    first ``<title>`` (when present) followed by the stripped text of every
    ``<p>`` inside it, concatenated without any separator.

    Args:
        document_file_paths: Paths of the SGML files, read in the given order.

    Returns:
        One string per ``<doc>`` element, ordered by file and then by
        position in the file.
    """
    documents = []
    for document_file_path in document_file_paths:
        # Decode explicitly as UTF-8 instead of relying on the locale default.
        with open(document_file_path, encoding="utf-8") as document_file:
            raw_sgml = document_file.read()

        # NOTE(review): no parser is named, so BeautifulSoup picks whichever
        # one is installed; naming one would make the result reproducible
        # across machines, but may change the parse - confirm before changing.
        sgml_object = BeautifulSoup(raw_sgml)

        for doc in sgml_object.find_all("doc"):
            pieces = []

            # Most documents in conll2013 have no titles, while all in
            # conll2014 do, so a missing title is expected, not an error.
            title = doc.find("title")
            if title is not None:
                pieces.append(title.text.strip())

            pieces.extend(p.text.strip() for p in doc.find_all("p"))
            documents.append("".join(pieces))

    return documents


def get_documents(document_file_paths: List[str], source_file_path: str) -> List[str]:
    """Load the documents of a dataset with the loader matching its format.

    The dataset name is the base name of `source_file_path` without its last
    extension, and it selects the loader.

    Args:
        document_file_paths: Paths of the document files to load.
        source_file_path: Path of the source-sentence file; only its name is
            used, to identify the dataset.

    Returns:
        The documents produced by the selected loader.

    Raises:
        NotImplementedError: If the dataset name is not one of the supported
            ones (e.g. lang8, whose xml format has no loader yet).
    """
    # Gets the name of the dataset from `source_file_path`.
    dataset_name = splitext(basename(source_file_path))[0]

    if dataset_name in ("fce", "wi.train", "wi.dev"):  # json.
        return get_documents_from_json(document_file_paths)
    if dataset_name in ("conll2013", "conll2014", "nucle"):  # sgml.
        return get_documents_from_sgml(document_file_paths)

    # lang8, xml: fail loudly here instead of returning None, which would
    # only surface later as an unrelated TypeError in the caller.
    raise NotImplementedError(
        f"No document loader for dataset '{dataset_name}' "
        f"(derived from '{source_file_path}')."
    )

In [98]:
def remove_spaces(text: str) -> str:
    """Return `text` with all whitespace removed."""
    # `split()` without a separator cuts on runs of whitespace and drops
    # empty pieces, so re-joining the pieces leaves no whitespace behind.
    non_space_chunks = text.split()
    return "".join(non_space_chunks)

# def get_document_indices(query: str, documents: List[str]) -> List[int]:
#     # Removes all spaces in every document.
#     documents_without_spaces = [remove_spaces(document)
#                                 for document in documents]
    
#     # Finds documents containing the query.
#     doc_indices = []
#     for i in range(len(documents_without_spaces)):
#         doc = documents_without_spaces[i]
        
#         # If the query is in the document, 
#         # records the index of the document.
#         if query in doc:
#             doc_indices.append(i)
    
#     return doc_indices
    

# def get_document(i: int, oris: List[str], documents: List[str]) -> str:
#     def _get_query(oris: List[str], start: int, end: int) -> str:
#         # Sentences for locating the document.
#         ori_sentences = oris[start:end]
#         # Removes all spaces.
#         ori_sentences_without_spaces = [remove_spaces(ori_sentence) 
#                                         for ori_sentence in ori_sentences]
#         # Concatenation.
#         query = "".join(ori_sentences_without_spaces)
        
#         return query
    
    
#     for j in range(1, 10):
#         # Gets the query for locating the document.
#         query = _get_query(oris, i, i+j)
#         # Gets all documents containing the query.
#         doc_indices = get_document_indices(query, documents)
#         # If only one document is found, that is the needed document.
#         if len(doc_indices) == 1:
#             doc_index = doc_indices[0]
#             return documents[doc_index]
        
#         # Gets the query for locating the document.
#         query = _get_query(oris, i-j+1, i+1)
#         # Gets all documents containing the query.
#         doc_indices = get_document_indices(query, documents)
#         # If only one document is found, that is the needed document.
#         if len(doc_indices) == 1:
#             doc_index = doc_indices[0]
#             return documents[doc_index]

In [99]:
def sentence_in_document(sentence: str, document: str) -> bool:
    """Tell whether `sentence` occurs in `document`, ignoring all whitespace."""
    # Both sides are compared with their whitespace removed.
    compact_sentence = remove_spaces(sentence)
    compact_document = remove_spaces(document)
    return compact_sentence in compact_document


def get_context(source_sentence_index: int,
                previous_sentences_number: int, following_sentences_number: int,
                source_sentences: List[str], document: str) -> str:
    """Build the context string of one source sentence.

    Neighbouring sentences are kept only if they occur in `document`
    (compared with whitespace removed). Each kept previous sentence is
    prefixed with ``<prev>`` and each kept following sentence with ``<fol>``;
    only the part of a sentence before its first line feed is emitted.

    Args:
        source_sentence_index: Index in `source_sentences` of the sentence
            whose context is wanted.
        previous_sentences_number: Maximum number of preceding sentences
            to consider.
        following_sentences_number: Maximum number of following sentences
            to consider.
        source_sentences: All source sentences, in order.
        document: The document the sentence belongs to.

    Returns:
        The tagged previous sentences followed by the tagged following
        sentences, or an empty string when there are none.
    """

    def _remove_line_feed(text: str) -> str:
        return text.split("\n")[0]

    def _collect(start: int, stop: int, tag: str) -> str:
        # Clamp the window to valid positions. Without the lower clamp a
        # negative index would silently wrap around and pull sentences from
        # the END of `source_sentences` into the context of the first ones.
        start = max(start, 0)
        stop = min(stop, len(source_sentences))

        pieces = []
        for context_sentence_index in range(start, stop):
            context_sentence = source_sentences[context_sentence_index]
            if sentence_in_document(sentence=context_sentence, document=document):
                pieces.append(tag + _remove_line_feed(context_sentence))
        return "".join(pieces)

    # Gets previous context.
    previous_context_sentences = _collect(
        source_sentence_index - previous_sentences_number,
        source_sentence_index,
        "<prev>")

    # Gets following context.
    following_context_sentences = _collect(
        source_sentence_index + 1,
        source_sentence_index + following_sentences_number + 1,
        "<fol>")

    return previous_context_sentences + following_context_sentences

In [100]:
from glob import glob


def make_context(source_file_path: str, document_file_paths: List[str], previous_sentences_number: int, following_sentences_number: int, verbose=None):
    """Write the context of every source sentence to a ``.ctx`` file.

    The source sentences are matched against the documents sequentially:
    a sentence is looked up in the not-yet-consumed text of the current
    document, and when it is not found there the next document becomes the
    current one. One context line (see `get_context`) is written per source
    sentence to the file named like `source_file_path` with its last
    extension replaced by ``.ctx``.

    Args:
        source_file_path: Path of the file holding one source sentence
            per line.
        document_file_paths: Paths of the document files of the dataset.
        previous_sentences_number: Maximum number of preceding sentences
            per context.
        following_sentences_number: Maximum number of following sentences
            per context.
        verbose: When true, prints the matching state and waits for Enter
            after every sentence. When None (the default), a module-level
            `verbose` variable is used if one exists, otherwise False.

    Raises:
        IndexError: If there is no document at all, or if a sentence is not
            found in the current document and no further document is left.
    """
    # Previously `verbose` was read as an undeclared global, which raised
    # NameError unless it had been defined beforehand; keep honouring that
    # global, but do not require it.
    if verbose is None:
        verbose = globals().get("verbose", False)

    # Gets all source sentences from `source_file_path`.
    with open(source_file_path, encoding="utf-8") as source_file:
        source_sentences = source_file.readlines()

    # Gets all documents.
    documents = get_documents(document_file_paths=document_file_paths, source_file_path=source_file_path)

    # Gets the path of the context file.
    context_file_path = f"{splitext(source_file_path)[0]}.ctx"

    current_document_index = 0
    current_document = documents[current_document_index]
    # Text of the current document that no source sentence has consumed yet.
    current_document_spaces_removed = remove_spaces(current_document)
    with open(context_file_path, "w", encoding="utf-8") as context_file:
        for source_sentence_index, source_sentence in enumerate(source_sentences):
            source_sentence_spaces_removed = remove_spaces(source_sentence)

            # Both strings are already whitespace-free, so a plain substring
            # test is enough. A miss means the sentence belongs to the next
            # document.
            if source_sentence_spaces_removed not in current_document_spaces_removed:
                current_document_index += 1
                current_document = documents[current_document_index]
                current_document_spaces_removed = remove_spaces(current_document)

            if verbose:
                print(source_sentence_spaces_removed)
                print(current_document_spaces_removed)

            # Consumes the first occurrence of the sentence, so a repeated
            # sentence is matched against a later occurrence next time.
            current_document_spaces_removed = current_document_spaces_removed.replace(source_sentence_spaces_removed, "", 1)

            if verbose:
                print("----- -----")
                print(f"ln: {source_sentence_index + 1}", current_document_spaces_removed)
                print()
                input()

            # Gets the context; the full document is used here, not the
            # partially consumed text.
            context = get_context(source_sentence_index=source_sentence_index,
                                  previous_sentences_number=previous_sentences_number, following_sentences_number=following_sentences_number,
                                  source_sentences=source_sentences, document=current_document)

            # Writes the context to file.
            context_file.write(context)
            context_file.write("\n")

In [101]:
import filepath

# Script entry point: sets up the source-file and document-file paths of every
# dataset, then builds the context file of the dataset selected at the bottom.
if __name__ == '__main__':
    
    # Source-file paths come from the project's `filepath.FilePath` object.
    fp = filepath.FilePath()
    
    # fce: every json file matching the pattern, in sorted order.
    fce_ori_path = fp.FCE_ORI
    fce_doc_paths = sorted(glob("/home/neko/GEC/helo_word-master_restricted/data/bea19/fce/json/fce.*.json"))

    # wi (train): explicit, ordered list of json files.
    # NOTE(review): A/B/C.train.json are each listed twice (A, A, B, C, B, C).
    # Presumably this mirrors the order of the sentences in the source file,
    # since documents are matched sequentially - confirm it is intentional.
    wi_train_ori_path = fp.WI_TRAIN_ORI
    wi_train_doc_paths = [
        "/home/neko/GEC/helo_word-master_restricted/data/bea19/wi+locness/json/A.train.json",
        "/home/neko/GEC/helo_word-master_restricted/data/bea19/wi+locness/json/A.train.json",
        "/home/neko/GEC/helo_word-master_restricted/data/bea19/wi+locness/json/B.train.json",
        "/home/neko/GEC/helo_word-master_restricted/data/bea19/wi+locness/json/C.train.json",
        "/home/neko/GEC/helo_word-master_restricted/data/bea19/wi+locness/json/B.train.json",
        "/home/neko/GEC/helo_word-master_restricted/data/bea19/wi+locness/json/C.train.json",
    ]
    
    # wi (dev): every `*.dev.json` file, in sorted order.
    wi_dev_ori_path = fp.WI_DEV_ORI
    wi_dev_doc_paths = sorted(glob("/home/neko/GEC/helo_word-master_restricted/data/bea19/wi+locness/json/*.dev.json"))
    
    # conll2013: a single sgml file (the pattern has no wildcard).
    conll2013_ori_path = fp.CONLL2013_ORI
    conll2013_doc_paths = sorted(glob("/home/neko/GEC/helo_word-master_restricted/data/conll2013/release2.3.1/revised/data/official.sgml"))
    
    # conll2014: a single sgml file (the pattern has no wildcard).
    conll2014_ori_path = fp.CONLL2014_ORI
    conll2014_doc_paths = sorted(glob("/home/neko/GEC/helo_word-master_restricted/data/conll2014/conll14st-test-data/noalt/official-2014.0.sgml"))
    
    # nucle: a single sgml file (the pattern has no wildcard).
    nucle_ori_path = fp.NUCLE_ORI
    nucle_doc_paths = sorted(glob("/home/neko/GEC/helo_word-master_restricted/data/bea19/nucle3.3/data/nucle3.2.sgml"))
    
    # ------
    
    # Read as a module-level global by `make_context`; True makes it print
    # its matching state and wait for Enter after every sentence.
    verbose = False
    
    # Dataset to process, and how many previous / following sentences to
    # consider for each context.
    ori_path = nucle_ori_path
    doc_paths = nucle_doc_paths
    n_prev = 3
    n_fol = 3
    
    make_context(ori_path, doc_paths, n_prev, n_fol)