# Generate input for positive class

In [None]:
import json
import os
from collections import Counter

from lxml import etree
from tqdm import tqdm

from utils_functions import get_term_occurrences_from_ene, load_lexicon, get_ngrams_wt_term_outside_ene


# 1. Create the lexicon from terms embedded in ENE from Perdido XML-TEI

\<rs type="place">\<term type="place">

## 1.1 Corpus Traitement Texte

In [None]:
# Build the pivot lexicon of the Traitement_Texte corpus: gather the terms
# returned by get_term_occurrences_from_ene for every document, count them and
# save the counts as JSON.
path = '/Users/lmoncla/Documents/Data/Corpus/Choucas/Perdido'
output_filename = 'Traitement_Texte_pivot_lexicon'

words_TT = []
for doc in tqdm(sorted(os.listdir(path))):
    # Traitements_Texte layout: one sub-directory per entry, holding <doc>.xml
    filename = os.path.join(path, doc, doc + '.xml')
    words_TT.extend(get_term_occurrences_from_ene(filename))

# Term -> number of occurrences. Counter walks the list once (calling
# list.count for every element is quadratic); keys keep first-occurrence order
# and dict() turns the result back into a plain dict.
frequency_dict_geo_TT = dict(Counter(words_TT))
print('Size of the lexicon', len(frequency_dict_geo_TT))

# Save the lexicon. ensure_ascii=False writes non-ASCII characters as-is, so the
# file encoding is pinned to UTF-8 instead of the platform default.
# NOTE(review): confirm that load_lexicon reads the file back as UTF-8.
with open(output_filename + '.json', 'w', encoding='utf-8') as fp:
    json.dump(frequency_dict_geo_TT, fp, ensure_ascii=False)

## 1.2 Corpus Visorando

In [None]:
# Build the pivot lexicon of the Visorando corpus: gather the terms returned by
# get_term_occurrences_from_ene for every document, count them and save the
# counts as JSON.
path = '/Users/lmoncla/Documents/Data/Corpus/Visorando/Perdido'
output_filename = 'Visorando_pivot_lexicon'

words_viso = []
for doc in tqdm(sorted(os.listdir(path))):
    # Visorando layout: flat directory; the last four characters of the entry
    # name are replaced by '.xml'
    filename = os.path.join(path, doc[:-4] + '.xml')
    words_viso.extend(get_term_occurrences_from_ene(filename))

# Term -> number of occurrences. Counter walks the list once (calling
# list.count for every element is quadratic); keys keep first-occurrence order
# and dict() turns the result back into a plain dict.
frequency_dict_geo_viso = dict(Counter(words_viso))
print('Size of the lexicon', len(frequency_dict_geo_viso))

# Save the lexicon. ensure_ascii=False writes non-ASCII characters as-is, so the
# file encoding is pinned to UTF-8 instead of the platform default.
# NOTE(review): confirm that load_lexicon reads the file back as UTF-8.
with open(output_filename + '.json', 'w', encoding='utf-8') as fp:
    json.dump(frequency_dict_geo_viso, fp, ensure_ascii=False)

## 1.3 Lexicon TT - Viso

In [None]:
# Terms of the Visorando lexicon that do not appear in the Traitement_Texte one
# (order follows the Visorando lexicon).
words = [
    term
    for term in frequency_dict_geo_viso
    if term not in frequency_dict_geo_TT
]
# NOTE(review): hard-coded figures, presumably lexicon sizes from an earlier run - confirm
print(813 - 190)
print(len(words), words)

## 1.4 Lexicons preview

In [None]:
# Print every entry of the Visorando lexicon as "<term> <count>", one per line
for term, count in frequency_dict_geo_viso.items():
    print(term, count)

In [None]:
# Load the dict from file.
# Only necessary if you want to run the next section without running the previous one
# (adapt output_filename so that it designates the lexicon to load).
# NOTE(review): output_filename must already be defined in the session; the
# result is stored as the Visorando lexicon whatever file it points to.
frequency_dict_geo_viso = load_lexicon(output_filename)

# 2. Find occurrences of the lexicon in the corpus (outside ENE)

## 2.1 Corpus Traitement Texte

In [None]:
# Collect the n-grams returned by get_ngrams_wt_term_outside_ene for every
# document of the Traitement_Texte corpus and save them as JSON.
path = '/Users/lmoncla/Documents/Data/Corpus/Choucas/Perdido'

# Lexicon handed to the helper; switch to frequency_dict_geo_TT to use the
# Traitement_Texte lexicon instead.
#lexicon = frequency_dict_geo_TT
lexicon = frequency_dict_geo_viso

json_content = []
# NOTE(review): ngram_id is never updated in this cell, so every call receives
# 1 - confirm this is what the helper expects.
ngram_id = 1
for doc in sorted(os.listdir(path)):
    # Traitements_Texte layout: one sub-directory per entry, holding <doc>.xml
    filename = os.path.join(path, doc, doc + '.xml')
    json_content.extend(get_ngrams_wt_term_outside_ene(filename, lexicon, ngram_id))

print('number of ngram', len(json_content))

# ensure_ascii=False writes non-ASCII characters as-is, so the file encoding is
# pinned to UTF-8 instead of the platform default.
name = 'Traitement_Texte_class1'
with open(name + ".json", "w", encoding="utf-8") as outfile:
    json.dump(json_content, outfile, ensure_ascii=False)

## 2.2 Corpus Visorando

In [None]:
# Collect the n-grams returned by get_ngrams_wt_term_outside_ene for every
# document of the Visorando corpus (with the Visorando lexicon) and save them
# as JSON.
path = '/Users/lmoncla/Documents/Data/Corpus/Visorando/Perdido'
json_content = []
# NOTE(review): ngram_id is never updated in this cell, so every call receives
# 1 - confirm this is what the helper expects.
ngram_id = 1
for doc in sorted(os.listdir(path)):
    # Visorando layout: flat directory; the last four characters of the entry
    # name are replaced by '.xml'
    filename = os.path.join(path, doc[:-4] + '.xml')
    json_content.extend(get_ngrams_wt_term_outside_ene(filename, frequency_dict_geo_viso, ngram_id))

print('number of ngram', len(json_content))

# ensure_ascii=False writes non-ASCII characters as-is, so the file encoding is
# pinned to UTF-8 instead of the platform default.
name = 'Visorando_class1'
with open(name + ".json", "w", encoding="utf-8") as outfile:
    json.dump(json_content, outfile, ensure_ascii=False)

## 2.3 TT - Viso

In [None]:
# Display the size of `words` (presumably the list built in section 1.3 - it
# holds whatever the last executed cell assigned to it)
len(words)

In [None]:
# Collect the n-grams returned by get_ngrams_wt_term_outside_ene for every
# document of the Traitement_Texte corpus, using the `words` list as lexicon,
# and save them as JSON.
path = '/Users/lmoncla/Documents/Data/Corpus/Choucas/Perdido'

json_content = []
# NOTE(review): ngram_id is never updated in this cell, so every call receives
# 1 - confirm this is what the helper expects.
ngram_id = 1
for doc in sorted(os.listdir(path)):
    # Traitements_Texte layout: one sub-directory per entry, holding <doc>.xml
    filename = os.path.join(path, doc, doc + '.xml')
    json_content.extend(get_ngrams_wt_term_outside_ene(filename, words, ngram_id))

print('number of ngram', len(json_content))

# ensure_ascii=False writes non-ASCII characters as-is, so the file encoding is
# pinned to UTF-8 instead of the platform default.
name = 'TT-viso_class1'
with open(name + ".json", "w", encoding="utf-8") as outfile:
    json.dump(json_content, outfile, ensure_ascii=False)

In [None]:
# Distinct 'pivot' values among the collected n-grams, then how many there are
l = {entry['pivot'] for entry in json_content}
len(l)


# 3. Tests with ngrams with pivot at different positions


In [None]:
# Same extraction as section 2.1, but forwarding an explicit `position`
# argument to get_ngrams_wt_term_outside_ene; the value is also appended to the
# output file name.
path = '/Users/lmoncla/Documents/Data/Corpus/Choucas/Perdido'

# Lexicon handed to the helper; switch to frequency_dict_geo_viso to use the
# Visorando lexicon instead.
lexicon = frequency_dict_geo_TT
#lexicon = frequency_dict_geo_viso
position = 7
json_content = []
# NOTE(review): ngram_id is never updated in this cell, so every call receives
# 1 - confirm this is what the helper expects.
ngram_id = 1
for doc in sorted(os.listdir(path)):
    # Traitements_Texte layout: one sub-directory per entry, holding <doc>.xml
    filename = os.path.join(path, doc, doc + '.xml')
    json_content.extend(get_ngrams_wt_term_outside_ene(filename, lexicon, ngram_id, position=position))

print('number of ngram', len(json_content))

# ensure_ascii=False writes non-ASCII characters as-is, so the file encoding is
# pinned to UTF-8 instead of the platform default.
name = 'Traitement_Texte_class1_position' + str(position)
with open(name + ".json", "w", encoding="utf-8") as outfile:
    json.dump(json_content, outfile, ensure_ascii=False)

## Get the list of terms in ENE not categorized by Perdido

In [None]:
# Collect the terms of the ENE whose type is "unknown" and count how often
# each resulting phrase occurs.
# NOTE(review): `path` is not assigned in this cell; it keeps the value set by
# the last executed cell that defined it.
words = []
for doc in tqdm(sorted(os.listdir(path))):
    filename = os.path.join(path, doc, doc + '.xml')
    if not os.path.exists(filename):
        continue  # directory entry without a <doc>/<doc>.xml file
    tree = etree.parse(filename)

    for term in tree.xpath('.//rs[@type="unknown" and @subtype="ene"]/term[@type="unknown"]'):
        # Keep the lower-cased <w> tokens tagged N, PREPDET or PREP. A <w>
        # without a text node has text None and is skipped instead of raising.
        tokens = [
            w.text.lower()
            for w in term.xpath('.//w[@pos="N" or @pos="PREPDET" or @pos="PREP"]')
            if w.text is not None
        ]
        # A term with no matching token still contributes an empty phrase
        words.append(' '.join(tokens).strip())

# Phrase -> number of occurrences. Counter walks the list once (calling
# list.count for every element is quadratic); keys keep first-occurrence order.
frequency_dict_unknown = dict(Counter(words))
print('Size of the lexicon', len(frequency_dict_unknown))

In [None]:
# Display the phrase -> count mapping of the unknown-type terms
frequency_dict_unknown

## List of files with rs in term

In [None]:
# Visorando corpus directory scanned by the next cell
path = '/Users/lmoncla/Documents/Data/Corpus/Visorando/Perdido'

In [None]:
# Print the path of every document that contains an <rs> element nested inside
# a <term> (one line per match, so a file may be printed several times).
words = []
for doc in tqdm(sorted(os.listdir(path))):
    # Same name derivation as the other Visorando cells: the last four
    # characters of the entry name are replaced by '.xml'. Appending '.xml' to
    # the full entry name does not match that convention, so the existence
    # check below would presumably reject every entry.
    filename = os.path.join(path, doc[:-4] + '.xml')
    if not os.path.exists(filename):
        continue
    tree = etree.parse(filename)

    for rs in tree.xpath('.//term//rs'):
        print(filename)
        # Lower-cased text of the <w> tokens inside the nested <rs>; a <w>
        # without a text node has text None and is skipped instead of raising.
        # NOTE(review): `phrase` is built but not used yet, and `words` stays
        # empty - the original cell stops at a "list to dict with frequency"
        # placeholder.
        phrase = ' '.join(w.text.lower() for w in rs.xpath('.//w') if w.text is not None)