# Create snippets from GV documents


Running this notebook divides the total corpus of Getuigenverhalen (GV) into snippets of approximately a predefined length, whilst preserving the paragraph and/or sentence structure identified by the Newsreader (NR) pipeline. The user can choose to create snippets of the actual terms/text, or alternatively to create snippets in which the terms are replaced by the lemmas identified by the NR pipeline. I.e. it takes the output from the NR pipeline (the `.naf` files) and writes the resulting snippets to separate text files.

In [29]:
from KafNafParserPy import KafNafParser
import os
import numpy as np



### KafNaf extraction utilities

In [30]:
#get list of term ids from naf file
def get_KafNaf_term_ids_list(parser):
    """Collect the id and the first span id of every term in the parser.

    Args:
        parser: object exposing ``get_terms()``; in this notebook a
            ``KafNafParser`` instance.

    Returns:
        A tuple ``(term_id_list, term_span_id_list)`` of two parallel lists.
        Only the first entry of each term's span ids is kept.
    """
    # Walk the terms exactly once: get_terms() may be a one-shot iterator.
    id_span_pairs = [
        (term.get_id(), term.get_span_ids()[0])
        for term in parser.get_terms()
    ]

    term_id_list = [term_id for term_id, _ in id_span_pairs]
    term_span_id_list = [span_id for _, span_id in id_span_pairs]

    return term_id_list, term_span_id_list
        
        


In [31]:
#get list of lemmas for each term
def get_KafNaf_term_lemma_list(parser):
    """Return the lemma of every term, in the order ``get_terms()`` yields them.

    Args:
        parser: object exposing ``get_terms()``; in this notebook a
            ``KafNafParser`` instance.

    Returns:
        List with one lemma per term.
    """
    return [term.get_lemma() for term in parser.get_terms()]

In [32]:
#get id of sentence containing term for each term
def get_KafNaf_wf_sentence_ids_list(parser, term_span_id_list):
    """Look up the sentence id of the word form behind each span id.

    Args:
        parser: object whose ``text_layer.get_wf(span_id)`` returns a word
            form offering ``get_sent()``.
        term_span_id_list: span ids, one per term.

    Returns:
        List of sentence ids, parallel to ``term_span_id_list``.
    """
    return [
        parser.text_layer.get_wf(span_id).get_sent()
        for span_id in term_span_id_list
    ]

In [33]:
#get id of paragraph containing term for each term
def get_KafNaf_wf_paragraph_ids_list(parser, term_span_id_list):
    """Look up the paragraph id of the word form behind each span id.

    Args:
        parser: object whose ``text_layer.get_wf(span_id)`` returns a word
            form offering ``get_para()``.
        term_span_id_list: span ids, one per term.

    Returns:
        List of paragraph ids, parallel to ``term_span_id_list``.
    """
    return [
        parser.text_layer.get_wf(span_id).get_para()
        for span_id in term_span_id_list
    ]

In [34]:
#get text (word) for each term
def get_KafNaf_wf_text_list(parser, term_span_id_list):
    """Look up the text of the word form behind each span id.

    Args:
        parser: object whose ``text_layer.get_wf(span_id)`` returns a word
            form offering ``get_text()``.
        term_span_id_list: span ids, one per term.

    Returns:
        List of word-form texts, parallel to ``term_span_id_list``.
    """
    return [
        parser.text_layer.get_wf(span_id).get_text()
        for span_id in term_span_id_list
    ]

In [35]:
#retrieve information from the naf file
def get_KafNaf_info(file):
    """Parse a NAF file and gather per-term information into one dict.

    Args:
        file: whatever ``KafNafParser`` accepts as its first argument (the
            main cells of this notebook pass a file name).

    Returns:
        Dict of parallel lists, one entry per term, under the keys ``'id'``,
        ``'span_id'``, ``'sentence'``, ``'paragraph'``, ``'text'`` and
        ``'lemma'``.
    """
    # instantiate parser
    parser = KafNafParser(file)

    # term id and corresponding (first) span id for each term
    term_ids, span_ids = get_KafNaf_term_ids_list(parser)

    # The remaining columns are looked up per term, in the same order as
    # the keys below: sentence id, paragraph id, text, lemma.
    return {
        'id': term_ids,
        'span_id': span_ids,
        'sentence': get_KafNaf_wf_sentence_ids_list(parser, span_ids),
        'paragraph': get_KafNaf_wf_paragraph_ids_list(parser, span_ids),
        'text': get_KafNaf_wf_text_list(parser, span_ids),
        'lemma': get_KafNaf_term_lemma_list(parser),
    }




### Chunking utilities

In [36]:
def make_chunk(data,ids,uids,uid_start,target_size):
    """Build one snippet by concatenating whole id-groups of ``data``.

    Starting at ``uid_start``, the items of ``data`` belonging to consecutive
    unique ids are appended until the accumulated number of items exceeds
    ``target_size`` or the unique ids are exhausted. At least one id-group is
    always consumed, so the snippet is never empty-by-construction and groups
    are never split.

    Args:
        data: 1-D NumPy array of strings, one per term.
        ids: 1-D NumPy array parallel to ``data`` holding the group id
            (e.g. paragraph or sentence id) of each term.
        uids: 1-D NumPy array of the unique values of ``ids`` in document
            order.
        uid_start: the unique id at which to start; a scalar or a
            one-element array (as returned in ``uid_next``).
        target_size: number of items the snippet should reach before it is
            closed.

    Returns:
        Tuple ``(data_chunk_snippet, echo_uid_start, uid_end, uid_next,
        current_size)``: the space-joined snippet; one-element arrays with
        the first id, the last id included, and the id the next snippet
        should start at; and the number of items in the snippet. When the
        last unique id has been consumed, ``uid_next`` equals ``uid_end``,
        which callers use as the end-of-document signal.

    Raises:
        ValueError: if ``uid_start`` does not occur in ``uids``.
    """
    matches = np.flatnonzero(uids == uid_start)
    if matches.size == 0:
        raise ValueError('uid_start {!r} not found in uids'.format(uid_start))

    start_position = int(matches[0])
    position = start_position
    n_uids = len(uids)

    sub_chunks = []
    current_size = 0
    while True:
        # all items whose id equals the unique id at the current position
        data_sub_chunk = data[ids == uids[position]]
        sub_chunks.append(data_sub_chunk)
        current_size += len(data_sub_chunk)
        last_position = position
        position += 1
        if position == n_uids or current_size > target_size:
            break

    # Concatenate once at the end instead of growing an array per group.
    data_chunk_snippet = ' '.join(np.concatenate(sub_chunks))

    # Ids are returned as one-element arrays (list indexing), which is the
    # shape the rest of this notebook indexes with [0].
    echo_uid_start = uids[[start_position]]
    uid_end = uids[[last_position]]
    if position == n_uids:
        uid_next = uid_end
    else:
        uid_next = uids[[position]]

    return data_chunk_snippet, echo_uid_start, uid_end, uid_next, current_size
        

In [37]:
#create chunks of text of approximately the target size making sure to not split paragraphs
def create_chunk_snippets(data_key, id_key, term_info_dict, target_size):
    """Divide a document's terms into snippets of about ``target_size`` terms.

    Terms sharing the same value under ``id_key`` (e.g. the same paragraph or
    sentence id) are never split across snippets. A final snippet of at most
    ``target_size // 2`` terms is merged into the preceding snippet, when
    there is one.

    Args:
        data_key: key of ``term_info_dict`` holding the strings to join
            (e.g. ``'text'`` or ``'lemma'``).
        id_key: key of ``term_info_dict`` holding the grouping id of each
            term (e.g. ``'paragraph'`` or ``'sentence'``).
        term_info_dict: dict of parallel per-term lists.
        target_size: desired approximate snippet length, in terms.

    Returns:
        Dict with the parallel lists ``'chunk'`` (snippet strings),
        ``'start_id'`` and ``'end_id'`` (one-element arrays as returned by
        ``make_chunk``) and ``'size'`` (term counts), plus the settings
        ``'data_type'``, ``'preserve_type'`` and ``'target_size'``. The
        lists are empty when the document has no terms.
    """
    chunk_snippets = []
    chunk_snippets_start_id = []
    chunk_snippets_end_id = []
    chunk_snippets_size = []

    #cast to array type
    data_arr = np.array(term_info_dict[data_key])
    id_arr = np.array(term_info_dict[id_key])

    #get list of unique ids, keeping document order
    uniq_id_arr = np.array(list(dict.fromkeys(id_arr)))

    # An empty document yields no snippets instead of an IndexError.
    if uniq_id_arr.size > 0:
        start_id = uniq_id_arr[0]
        done_chunking = False

        while not done_chunking:
            chunk_snippet, echo_start_id, end_id, next_id, snippet_size = make_chunk(
                data_arr, id_arr, uniq_id_arr, start_id, target_size)

            chunk_snippets.append(chunk_snippet)
            chunk_snippets_start_id.append(echo_start_id)
            chunk_snippets_end_id.append(end_id)
            chunk_snippets_size.append(snippet_size)

            # make_chunk signals the end of the document by returning a
            # next id equal to the end id.
            done_chunking = bool(np.all(end_id == next_id))
            start_id = next_id

        # Merge a small final chunk with the preceding chunk. This is only
        # possible when a preceding chunk exists; a short single-chunk
        # document is kept as is.
        if len(chunk_snippets) > 1 and chunk_snippets_size[-1] <= target_size // 2:
            tail_chunk = chunk_snippets.pop()
            tail_end_id = chunk_snippets_end_id.pop()
            tail_size = chunk_snippets_size.pop()
            chunk_snippets_start_id.pop()

            chunk_snippets[-1] = chunk_snippets[-1] + ' ' + tail_chunk
            chunk_snippets_end_id[-1] = tail_end_id
            # Sizes are term counts, so add the counts rather than taking
            # the character length of the merged string.
            chunk_snippets_size[-1] += tail_size

    chunk_dict = {
        'chunk': chunk_snippets,
        'start_id': chunk_snippets_start_id,
        'end_id': chunk_snippets_end_id,
        'size': chunk_snippets_size,
        'data_type': data_key,
        'preserve_type': id_key,
        'target_size': target_size,
    }

    return chunk_dict

### Writing utilities

In [38]:
def write_chunk_snippets(filename,chunk_dict):
    """Write every snippet in ``chunk_dict`` to its own text file.

    Each file is named
    ``<filename>_<target_size>_<preserve_type>_<start>-<end>_<data_type>.txt``
    and is created relative to the current working directory unless
    ``filename`` contains a path. Files are written as UTF-8 so that
    non-ASCII characters do not depend on the platform's default encoding.

    Args:
        filename: prefix for the output file names.
        chunk_dict: dict with the keys ``'chunk'``, ``'start_id'``,
            ``'end_id'``, ``'target_size'``, ``'preserve_type'`` and
            ``'data_type'``; each start/end id entry is a sequence whose
            first element is the id.
    """
    # Parts of the file name that are the same for every snippet.
    name_prefix = f"{filename}_{chunk_dict['target_size']}_{chunk_dict['preserve_type']}"
    name_suffix = f"_{chunk_dict['data_type']}.txt"

    for idx, chunk in enumerate(chunk_dict['chunk']):
        first_id = chunk_dict['start_id'][idx][0]
        last_id = chunk_dict['end_id'][idx][0]
        chunk_file = f"{name_prefix}_{first_id}-{last_id}{name_suffix}"

        with open(chunk_file, 'w', encoding='utf-8') as out_file:
            out_file.write(chunk)
        
    
    

### Main()

In [27]:
# Basic inputs: text snippets preserving paragraph structure

# desired approximate length of snippets (in terms)
target_size = 200
# desired data type (text/term or lemmas)
data_type = 'text'
# desired conserved entity (sentence or paragraph)
preserve_type = 'paragraph'

# Directory structure: the output tree lives inside the input directory
inputdir = '/Users/eslt0101/Data/eScience/EviDENce/Data/NR-Teksts/EviDENce_NR_output/'
outputdir = inputdir + 'TargetSize' + str(target_size) + '/' + data_type + '_preserve_' + preserve_type + '/'

# make output directory
os.makedirs(outputdir, exist_ok=True)

# change to input directory, so the bare entry names below resolve
os.chdir(inputdir)

directory = os.fsencode(inputdir)

for file in os.listdir(directory):
    # every entry of the input directory is echoed, NAF file or not
    full_file_name = os.fsdecode(file)
    print(full_file_name)
    if not full_file_name.endswith('.naf'):
        continue

    # create a separate directory for the file (NB in output directory)
    filename, file_extension = os.path.splitext(full_file_name)
    dirname = os.path.join(outputdir, filename)
    os.makedirs(dirname, exist_ok=True)

    # extract the term information; files without terms produce no chunks
    file_term_info_dict = get_KafNaf_info(file)
    if file_term_info_dict['id'] == []:
        continue

    # create chunks for the file
    file_chunk_dict = create_chunk_snippets(data_type, preserve_type, file_term_info_dict, target_size)

    # write chunks from inside the file's own directory, then go back
    os.chdir(dirname)
    write_chunk_snippets(filename, file_chunk_dict)
    os.chdir(inputdir)
                           
                           
    

GV_ArtEsteem_Indischverzet_04_conversation_clipped.naf
GV_OVCG_Groningeninoorlogstijd_07_conversation_clipped.naf
GV_Zigma_koopvaardij_08_conversation_clipped.naf
GV_GAR_bombardement_05_conversation_clipped.naf
GV_Museon_Cantius Geysel_conversation_clipped.naf
GV_Traktor_Russenoorlog_12_conversation_clipped.naf
GV_CaleidoscoopFilm_ingekwartierd_06_conversation_clipped.naf
GV_MUMA_Molukkers_31b_clipped.naf
GV_Verhalis_kloosterzusters_05d_conversation_clipped.naf
GV_Limburg_gastkind_07_conversation_clipped.naf
GV_Wieberdink_dodenmars_01d_conversation_clipped.naf
GV_Overloon_Venray_06_conversation_clipped.naf
GV_AVA_doven_10_conversation_clipped.naf
GV_Kindermonument_Gaaspstraat_08_conversation_clipped.naf
GV_NIOD_Buchenwald_24_clipped.naf
GV_OVCG_Groningeninoorlogstijd_03_conversation_clipped.naf
GV_Oogland_KNIL_06b_conversation_clipped.naf
GV_NIOD_Buchenwald_38_clipped.naf
GV_Lumen_verzetsgroep_JL_02_conversation_clipped.naf
GV_NMKV_Sachsenhausen_09_conversation_clipped.naf
GV_Nijkerk_V

GV_Overloon_Venray_03_conversation_clipped.naf
GV_Limburg_gastkind_02_conversation_clipped.naf
GV_DeJager_ReisvandeRazzia_interview_de_Graaf_conversation_clipped.naf
GV_DeJager_ReisvandeRazzia_interview_Visser_conversation_clipped.naf
GV_Verhalis_kloosterzusters_05a_conversation_clipped.naf
GV_DeJager_ReisvandeRazzia_interview_Bergs_conversation_clipped.naf
GV_DeJager_ReisvandeRazzia_interview_Vrijdag_conversation_clipped.naf
GV_CaleidoscoopFilm_ingekwartierd_03_conversation_clipped.naf
GV_OVCG_Groningeninoorlogstijd_02_conversation_clipped.naf
GV_NIOD_Buchenwald_32_clipped.naf
GV_Amigonan_Aruba_04_clipped.naf
GV_Oogland_KNIL_06c_conversation_clipped.naf
GV_Lumen_verzetsgroep_JL_03_conversation_clipped.naf
GV_NMKV_Sachsenhausen_08_conversation_clipped.naf
GV_Schiedam_gastgezin1_09_conversation_clipped.naf
GV_Nijkerk_Vluchtelingen_05_conversation_clipped.naf
GV_ArtEsteem_Indischverzet_01_conversation_clipped.naf
GV_Limburg_gastkind_06_conversation_clipped.naf
GV_Overloon_Venray_07_conve

GV_Traktor_Russenoorlog_15_conversation_clipped.naf
GV_GAR_bombardement_02_conversation_clipped.naf
GV_Oogland_KNIL_06a_conversation_clipped.naf
GV_Lumen_verzetsgroep_JL_01_conversation_clipped.naf
GV_Nijkerk_Vluchtelingen_07_conversation_clipped.naf
GV_ArtEsteem_Indischverzet_03_conversation_clipped.naf
GV_NIOD_Buchenwald_25_clipped.naf
GV_DeJager_ReisvandeRazzia_interview_van_Geenen_conversation_clipped.naf
GV_DeJager_ReisvandeRazzia_interview_Gerard_Passchier_conversation_clipped.naf
GV_Overloon_Venray_01_conversation_clipped.naf
GV_Wieberdink_dodenmars_01c_conversation_clipped.naf
GV_Verhalis_kloosterzusters_05c_conversation_clipped.naf
GV_KNMG_medici_11_conversation_clipped.naf
GV_DeJager_ReisvandeRazzia_interview_Diepenhorst_conversation_clipped.naf
GV_NIOD_Buchenwald_26_clipped.naf
GV_Kindermonument_Gaaspstraat_06_conversation_clipped.naf
GV_DdM_Engelandvaarders_04_conversation_clipped.naf
GV_CaleidoscoopFilm_Bakkum_05a_conversation_clipped.naf
GV_NIOD_Buchenwald_08_clipped.naf


In [39]:
# Basic inputs: lemma snippets preserving paragraph structure

# desired approximate length of snippets (in terms)
target_size = 200
# desired data type (text/term or lemmas)
data_type = 'lemma'
# desired conserved entity (sentence or paragraph)
preserve_type = 'paragraph'

# Directory structure: the output tree lives inside the input directory
inputdir = '/Users/eslt0101/Data/eScience/EviDENce/Data/NR-Teksts/EviDENce_NR_output/'
outputdir = inputdir + 'TargetSize' + str(target_size) + '/' + data_type + '_preserve_' + preserve_type + '/'

# make output directory
os.makedirs(outputdir, exist_ok=True)

# change to input directory, so the bare entry names below resolve
os.chdir(inputdir)

directory = os.fsencode(inputdir)

for file in os.listdir(directory):
    # every entry of the input directory is echoed, NAF file or not
    full_file_name = os.fsdecode(file)
    print(full_file_name)
    if not full_file_name.endswith('.naf'):
        continue

    # create a separate directory for the file (NB in output directory)
    filename, file_extension = os.path.splitext(full_file_name)
    dirname = os.path.join(outputdir, filename)
    os.makedirs(dirname, exist_ok=True)

    # extract the term information; files without terms produce no chunks
    file_term_info_dict = get_KafNaf_info(file)
    if file_term_info_dict['id'] == []:
        continue

    # create chunks for the file
    file_chunk_dict = create_chunk_snippets(data_type, preserve_type, file_term_info_dict, target_size)

    # write chunks from inside the file's own directory, then go back
    os.chdir(dirname)
    write_chunk_snippets(filename, file_chunk_dict)
    os.chdir(inputdir)
                           
                           

GV_ArtEsteem_Indischverzet_04_conversation_clipped.naf
GV_OVCG_Groningeninoorlogstijd_07_conversation_clipped.naf
GV_Zigma_koopvaardij_08_conversation_clipped.naf
GV_GAR_bombardement_05_conversation_clipped.naf
GV_Museon_Cantius Geysel_conversation_clipped.naf
GV_Traktor_Russenoorlog_12_conversation_clipped.naf
GV_CaleidoscoopFilm_ingekwartierd_06_conversation_clipped.naf
GV_MUMA_Molukkers_31b_clipped.naf
GV_Verhalis_kloosterzusters_05d_conversation_clipped.naf
GV_Limburg_gastkind_07_conversation_clipped.naf
GV_Wieberdink_dodenmars_01d_conversation_clipped.naf
GV_Overloon_Venray_06_conversation_clipped.naf
GV_AVA_doven_10_conversation_clipped.naf
GV_Kindermonument_Gaaspstraat_08_conversation_clipped.naf
GV_NIOD_Buchenwald_24_clipped.naf
GV_OVCG_Groningeninoorlogstijd_03_conversation_clipped.naf
GV_Oogland_KNIL_06b_conversation_clipped.naf
GV_NIOD_Buchenwald_38_clipped.naf
GV_Lumen_verzetsgroep_JL_02_conversation_clipped.naf
GV_NMKV_Sachsenhausen_09_conversation_clipped.naf
GV_Nijkerk_V

GV_Overloon_Venray_03_conversation_clipped.naf
GV_Limburg_gastkind_02_conversation_clipped.naf
GV_DeJager_ReisvandeRazzia_interview_de_Graaf_conversation_clipped.naf
GV_DeJager_ReisvandeRazzia_interview_Visser_conversation_clipped.naf
GV_Verhalis_kloosterzusters_05a_conversation_clipped.naf
GV_DeJager_ReisvandeRazzia_interview_Bergs_conversation_clipped.naf
GV_DeJager_ReisvandeRazzia_interview_Vrijdag_conversation_clipped.naf
GV_CaleidoscoopFilm_ingekwartierd_03_conversation_clipped.naf
GV_OVCG_Groningeninoorlogstijd_02_conversation_clipped.naf
GV_NIOD_Buchenwald_32_clipped.naf
GV_Amigonan_Aruba_04_clipped.naf
GV_Oogland_KNIL_06c_conversation_clipped.naf
GV_Lumen_verzetsgroep_JL_03_conversation_clipped.naf
GV_NMKV_Sachsenhausen_08_conversation_clipped.naf
GV_Schiedam_gastgezin1_09_conversation_clipped.naf
GV_Nijkerk_Vluchtelingen_05_conversation_clipped.naf
GV_ArtEsteem_Indischverzet_01_conversation_clipped.naf
GV_Limburg_gastkind_06_conversation_clipped.naf
GV_Overloon_Venray_07_conve

GV_Traktor_Russenoorlog_15_conversation_clipped.naf
GV_GAR_bombardement_02_conversation_clipped.naf
GV_Oogland_KNIL_06a_conversation_clipped.naf
GV_Lumen_verzetsgroep_JL_01_conversation_clipped.naf
GV_Nijkerk_Vluchtelingen_07_conversation_clipped.naf
GV_ArtEsteem_Indischverzet_03_conversation_clipped.naf
GV_NIOD_Buchenwald_25_clipped.naf
GV_DeJager_ReisvandeRazzia_interview_van_Geenen_conversation_clipped.naf
GV_DeJager_ReisvandeRazzia_interview_Gerard_Passchier_conversation_clipped.naf
GV_Overloon_Venray_01_conversation_clipped.naf
GV_Wieberdink_dodenmars_01c_conversation_clipped.naf
GV_Verhalis_kloosterzusters_05c_conversation_clipped.naf
GV_KNMG_medici_11_conversation_clipped.naf
GV_DeJager_ReisvandeRazzia_interview_Diepenhorst_conversation_clipped.naf
GV_NIOD_Buchenwald_26_clipped.naf
GV_Kindermonument_Gaaspstraat_06_conversation_clipped.naf
GV_DdM_Engelandvaarders_04_conversation_clipped.naf
GV_CaleidoscoopFilm_Bakkum_05a_conversation_clipped.naf
GV_NIOD_Buchenwald_08_clipped.naf
