In [None]:
import os
import json
import jsonlines

from IPython.display import clear_output

import pandas as pd

from spacy import displacy

import pipeline
import cndutils as ut
import visuals as viz


# --- Dataset locations ------------------------------------------------------
path = r"C:\Users\Steve\OneDrive - University of Southampton\CNDPipeline\dataset"
test_jsonl = "test_chunks.jsonl"
cust_jsonl = "cust_chunks.jsonl"
index_str = "index.json"
test_filepath = os.path.join(path, test_jsonl)
cust_filepath = os.path.join(path, cust_jsonl)
index_filepath = os.path.join(path, index_str)

# --- Source documents -------------------------------------------------------
# Read every record of the test file into memory. This read is left
# unguarded: without the source documents there is nothing to annotate.
with jsonlines.open(test_filepath) as f:
    test_chunks = list(f.iter())

# --- Previously saved annotations -------------------------------------------
# Best effort: if the file is missing or unreadable, start from an empty list.
# Only I/O and parse errors are caught, so Ctrl-C and programming errors still
# surface instead of being silently swallowed.
try:
    with jsonlines.open(cust_filepath) as f:
        cust_chunk_list = list(f.iter())
    # NOTE(review): this discards the saved annotations when the *test* file
    # is empty; kept as in the original - confirm this is the intended check.
    if not test_chunks:
        cust_chunk_list = []

except (OSError, ValueError, jsonlines.Error):
    cust_chunk_list = []

# --- Resume position --------------------------------------------------------
# Position in test_chunks at which annotation resumes; 0 when no usable index
# file exists.
try:
    with open(index_filepath, "r") as index_json:
        index = json.load(index_json)

except (OSError, ValueError):
    index = 0

# The index is later used for list indexing and comparison, so anything other
# than a non-negative integer would crash (or, if negative, silently index
# from the end of the list). Fall back to the start in that case.
if isinstance(index, bool) or not isinstance(index, int) or index < 0:
    index = 0

# NOTE(review): `cnd` is not defined in this cell; presumably it is the
# pipeline object created elsewhere in the notebook - confirm.
lookup = pipeline.ConceptMatcher(cnd.nlp)
    
def _ask(prompt, default, transform=None):
    """Prompt the user once and return the reply, or *default* if it is empty.

    Args:
        prompt: Text shown to the user by ``input``.
        default: Value returned when the user just presses Enter.
        transform: Optional callable applied to the raw reply
            (e.g. ``str.lower``) before the emptiness check.

    Returns:
        The (transformed) reply, or ``default`` unchanged when the reply is
        empty.
    """
    reply = input(prompt)
    if transform is not None:
        reply = transform(reply)
    return reply if reply else default


# Interactive annotation loop: for each remaining document, show its parse and
# noun chunks, ask the user to confirm or correct every chunk, then persist
# the result. Entering "q" as a chunk text stops the session without saving
# the document in progress.
while index < len(test_chunks):

    quit_requested = False

    line = test_chunks[index]

    clear_output(wait=True)

    # parse document; each record stores its text under its own index as key
    doc = cnd(line[str(index)])

    # display dependency parse
    displacy.render(doc, style="dep")

    # `display` is provided by the IPython/Jupyter session
    display(viz.sent_frame(doc, compact=False))

    # collate, create and display a dict of merged noun_chunks
    chunk_dict = {}
    for n, chunk in enumerate(doc.noun_chunks):
        # get the noun root; when the lemma is the pronoun placeholder, use
        # the chunk's own text instead
        root = chunk.root.lemma_.lower()
        if root == "-pron-":
            root = chunk.text.lower()

        chunk_dict[n] = {
            "text": str(chunk).lower(),
            "root": root,
            "idx": chunk.root.i,
            "concept": str(chunk._.CONCEPT),
            "attribute": str(chunk._.ATTRIBUTE),
            "ideology": str(chunk._.IDEOLOGY),
        }

    display(pd.DataFrame(chunk_dict))

    # iterate through noun chunks to get desired text and notes for each and
    # create a dict object of the new chunks data.
    new_chunk_dict = {}
    for key, value in chunk_dict.items():

        chunk_str = value["text"]
        print(f'{key} => {chunk_str}')

        # suggest the chunk text without its leading determiner, if any
        det = cnd(chunk_str)[0].pos_ == "DET"
        temp_str = " ".join(chunk_str.split()[1:]) if det else chunk_str

        # Not routed through _ask: "q" must only quit when the user actually
        # typed it, never when it happens to be the suggested default.
        chunk_str = input(f"new chunk text [{temp_str}]: ").lower()
        if not chunk_str:
            chunk_str = temp_str
        elif chunk_str == "q":
            # does the user want to quit?
            print("QUITTING")
            quit_requested = True
            break

        # get notes
        temp_notes = "remove det. concepts from root." if det else ""
        notes = _ask(f"notes [{temp_notes}]: ", temp_notes)

        # get root
        default_root = value["root"]
        root = _ask(f"root [{default_root}]: ", default_root, str.lower)

        # get idx; a chunk marked "nil" defaults to a "nil" index as well
        default_idx = "nil" if chunk_str == "nil" else value["idx"]
        idx = _ask(f"idx [{default_idx}]: ", default_idx, str.lower)

        # concept lookup
        # NOTE(review): assumes the lookup returns a string, since
        # `.lower()` is called on the result below - confirm.
        concept_lookup = lookup.get_concept(root)
        concept = _ask(f"concept [{concept_lookup}]:", concept_lookup, str.upper)

        # attribute lookup
        attribute_lookup = lookup.get_attribute(concept.lower())
        attribute = _ask(f"attribute [{attribute_lookup}]: ", attribute_lookup, str.lower)

        # ideology lookup
        ideology_lookup = lookup.get_ideology(concept.lower())
        ideology = _ask(f"ideology [{ideology_lookup}]: ", ideology_lookup, str.lower)

        new_chunk_dict[key] = {
            "text": chunk_str,
            "root": root,
            "idx": idx,
            "concept": concept,
            "attribute": attribute,
            "ideology": ideology,
            "notes": notes,
        }

    # if quit has been selected then stop; the unfinished document is not
    # saved and the stored index still points at it, so the next session
    # resumes here
    if quit_requested:
        break

    # append the original and new chunk dicts to jsonl object in json readable format
    line.update({"orig_chunks": ut.doubleQuoteDict(chunk_dict)})
    line.update({"new_chunks": ut.doubleQuoteDict(new_chunk_dict)})
    cust_chunk_list.append(line)

    # write jsonl object to disk; cust_filepath already includes the dataset
    # directory, so it is used directly rather than joined onto `path` again
    with jsonlines.open(cust_filepath, 'w') as writer:
        writer.write_all(cust_chunk_list)

    index += 1

    # Persist the resume position only after the document has been saved.
    # Writing it here (rather than at the start of the iteration) means that
    # once the last document is finished the stored index equals
    # len(test_chunks), so a later run does not annotate it again and append
    # a duplicate record.
    with open(index_filepath, "w", encoding="utf-8") as f:
        json.dump(index, f)