# Generate input for positive class

In [None]:
import json
import os
from collections import Counter

from lxml import etree
from tqdm import tqdm

# 1. Create the lexicon from terms embedded in ENE from Perdido XML-TEI

\<rs type="place">\<term type="place">

In [None]:
def get_term_occurrences_from_ene(filename):
    """Collect the place terms annotated inside place ENEs of one XML file.

    Every ``<term type="place">`` that is a direct child of an
    ``<rs type="place" subtype="ene">`` yields one lower-cased phrase built
    from its ``<w>`` descendants whose ``pos`` is N, PREP, DET or PREPDET:

    * tokens whose text is missing or at most one character long are skipped;
    * nouns are always kept;
    * determiners / prepositions are kept only when they are neither the
      first nor the last selected token of the term.

    Args:
        filename: Path of the XML file to read.

    Returns:
        One phrase per matching term, in document order (a phrase may be the
        empty string when no token qualifies). An empty list is returned when
        the file does not exist or cannot be parsed.
    """
    if not os.path.exists(filename):
        return []
    try:
        tree = etree.parse(filename)
    except (OSError, etree.LxmlError) as err:
        # Best effort: an unreadable or malformed document is reported and
        # skipped instead of being silently ignored.
        print('Skipping', filename, '-', err)
        return []

    words = []
    for term in tree.xpath('.//rs[@type="place" and @subtype="ene"]/term[@type="place"]'):
        tokens = term.xpath('.//w[@pos="N" or @pos="PREPDET" or @pos="PREP" or @pos="DET"]')
        last = len(tokens) - 1
        parts = []
        for i, w in enumerate(tokens):
            text = w.text
            # An element without text has text None; it used to raise inside
            # len() and abort the rest of the file.
            if text is None or len(text) <= 1:
                continue
            pos = w.get('pos')
            is_function_word = 'DET' in pos or 'PREP' in pos
            # Function words only count when surrounded by other tokens.
            if not is_function_word or 0 < i < last:
                parts.append(text.lower())
        words.append(' '.join(parts).strip())
    return words

## 1.1 Corpus Traitement Texte

In [None]:
# Build the place-term lexicon: each document is expected at
# <path>/<doc>/<doc>.xml.
path = '/Users/lmoncla/Documents/Data/Corpus/Choucas/Perdido'
output_filename = 'Traitement_Texte_pivot_lexicon'

words = []
for doc in tqdm(sorted(os.listdir(path))):
    filename = os.path.join(path, doc, doc+'.xml') # version Traitements_Texte
    #filename = os.path.join(path, doc[:-4]+'.xml') # version visorando
    words.extend(get_term_occurrences_from_ene(filename))

# list to dict with frequency
# Counter tallies in a single pass (the previous words.count() per element was
# quadratic) and keeps first-occurrence order, so the saved JSON is unchanged.
frequency_dict_geo = dict(Counter(words))
print('Size of the lexicon', len(frequency_dict_geo))

#save the dict in file
with open(output_filename + '.json', 'w') as fp:
    json.dump(frequency_dict_geo, fp, ensure_ascii=False)

## 1.2 Corpus Visorando

In [None]:
# Build the place-term lexicon for the Visorando directory.
path = '/Users/lmoncla/Documents/Data/Corpus/Visorando/Perdido'
output_filename = 'Visorando_pivot_lexicon'

words = []
for doc in tqdm(sorted(os.listdir(path))):
    # NOTE(review): the active line uses the <path>/<doc>/<doc>.xml layout
    # labelled "Traitements_Texte", while the line labelled "visorando" is
    # commented out -- confirm which layout this corpus really has (files
    # that do not exist are silently skipped).
    filename = os.path.join(path, doc, doc+'.xml') # version Traitements_Texte
    #filename = os.path.join(path, doc[:-4]+'.xml') # version visorando
    words.extend(get_term_occurrences_from_ene(filename))

# list to dict with frequency
# Counter tallies in a single pass (the previous words.count() per element was
# quadratic) and keeps first-occurrence order, so the saved JSON is unchanged.
frequency_dict_geo = dict(Counter(words))
print('Size of the lexicon', len(frequency_dict_geo))

#save the dict in file
with open(output_filename + '.json', 'w') as fp:
    json.dump(frequency_dict_geo, fp, ensure_ascii=False)

In [None]:
# Print every lexicon entry followed by its frequency, one pair per line.
for term_text, term_count in frequency_dict_geo.items():
    print(term_text, term_count)

In [None]:
# Read the lexicon back from the JSON file named after `output_filename`.
lexicon_file = f'{output_filename}.json'
with open(lexicon_file) as fp:
    frequency_dict_geo = json.load(fp)

# 2. Find occurrences of the lexicon in the corpus (outside ENE)

In [None]:
def _context_entry(tokens, index):
    """Return the word/POS/lemma record of ``tokens[index]``, or a '_' placeholder when out of range."""
    # Explicit bounds check: a negative index would otherwise wrap around to
    # the end of the token list instead of producing padding.
    if 0 <= index < len(tokens):
        neighbour = tokens[index]
        return {'word': neighbour.text, 'POS': neighbour.get('pos'), 'lemma': neighbour.get('lemma')}
    return {'word': '_', 'POS': '_', 'lemma': '_'}


def get_ngrams_wt_term_outside_ene(filename):
    """Build 7-token windows around every lexicon term found in one XML file.

    Each ``<w>`` element whose text is a key of the module-level
    ``frequency_dict_geo`` becomes the pivot of a record holding the three
    preceding tokens, the pivot (its POS suffixed with ``+LS``) and the three
    following tokens. Positions falling outside the document are padded with
    ``'_'`` entries. Each window is also printed, with the pivot in brackets.

    The function relies on three module-level names: ``frequency_dict_geo``
    (lexicon), ``doc`` (used to build the ``url`` field) and ``ngram_id``
    (running record number, incremented here for every record produced).

    NOTE(review): despite the function name, every ``<w>`` of the document is
    considered; no check that the token lies outside an ENE is performed.
    NOTE(review): ``url`` is always ``<doc>/<doc>.xml``, whatever ``filename``
    was passed -- confirm this is intended for flat directory layouts.

    Args:
        filename: Path of the XML file to read.

    Returns:
        The list of n-gram records (dicts) found in the file; an empty list
        when the file does not exist or cannot be parsed.
    """
    # ngram_id is rebound below, so it must be declared global; without this
    # declaration the first read raised UnboundLocalError.
    global ngram_id

    json_content = []
    if not os.path.exists(filename):
        return json_content
    try:
        tree = etree.parse(filename)
    except (OSError, etree.LxmlError) as err:
        # Best effort: report and skip unreadable or malformed documents.
        print('Skipping', filename, '-', err)
        return json_content

    tokens = tree.xpath('.//w')
    for i, token in enumerate(tokens):
        if token.text not in frequency_dict_geo:
            continue
        before = [_context_entry(tokens, i - j) for j in range(3, 0, -1)]
        after = [_context_entry(tokens, i + j) for j in range(1, 4)]
        pivot = {
            'word': token.text,
            # A missing pos attribute falls back to the '_' placeholder
            # instead of raising on None + str.
            'POS': (token.get('pos') or '_') + '+LS',
            'lemma': token.get('lemma'),
        }
        line = {
            'num': ngram_id,
            'class': '1',
            'id_phrase': '0',
            'pivot': token.text,
            'occurrence': '0',
            'url': os.path.join(doc, doc+'.xml'),
            'phrase': before + [pivot] + after,
        }
        print(*(entry['word'] for entry in before), '[', token.text, ']',
              *(entry['word'] for entry in after))
        ngram_id += 1
        json_content.append(line)
    return json_content

## 2.1 Corpus Traitement Texte

In [None]:
# Start from an empty record list and restart the n-gram numbering at 1.
# NOTE(review): `path` and `frequency_dict_geo` are whatever the previously
# executed cells left behind -- make sure they belong to this corpus.
json_content, ngram_id = [], 1
# `doc` has to remain a module-level loop variable: the n-gram function reads
# it to fill the 'url' field of every record.
for doc in sorted(os.listdir(path)):
    filename = os.path.join(path, doc, f'{doc}.xml')  # version Traitements_Texte
    json_content += get_ngrams_wt_term_outside_ene(filename)

print('nnumber of ngram', len(json_content))

# Save all records into a single JSON file.
name = 'Traitement_Texte_class1'
with open(f'{name}.json', 'w') as outfile:
    json.dump(json_content, outfile, ensure_ascii=False)

## 2.2 Corpus Visorando

In [None]:
# Start from an empty record list and restart the n-gram numbering at 1.
json_content, ngram_id = [], 1
# `doc` has to remain a module-level loop variable: the n-gram function reads
# it to fill the 'url' field of every record.
for doc in sorted(os.listdir(path)):
    # Flat layout: drop the last four characters of the entry name and
    # append '.xml'.
    filename = os.path.join(path, f'{doc[:-4]}.xml')  # version visorando
    json_content += get_ngrams_wt_term_outside_ene(filename)

print('nnumber of ngram', len(json_content))

# Save all records into a single JSON file.
name = 'Visorando_class1'
with open(f'{name}.json', 'w') as outfile:
    json.dump(json_content, outfile, ensure_ascii=False)

## Get the list of terms in ENE not categorized by Perdido

In [None]:
# Collect the terms of ENEs typed "unknown"; documents are expected at
# <path>/<doc>/<doc>.xml and missing files are skipped.
words = []
for doc in tqdm(sorted(os.listdir(path))):
    filename = os.path.join(path, doc, doc+'.xml')
    if not os.path.exists(filename):
        continue
    tree = etree.parse(filename)

    for term in tree.xpath('.//rs[@type="unknown" and @subtype="ene"]/term[@type="unknown"]'):
        # Elements without text have text None and are skipped; calling
        # .lower() on them used to abort the whole cell.
        parts = [
            w.text.lower()
            for w in term.xpath('.//w[@pos="N" or @pos="PREPDET" or @pos="PREP"]')
            if w.text is not None
        ]
        words.append(' '.join(parts).strip())

# list to dict with frequency
# Counter tallies in a single pass (words.count() per element was quadratic)
# and keeps first-occurrence order.
frequency_dict_unknown = dict(Counter(words))
print('Size of the lexicon', len(frequency_dict_unknown))

In [None]:
# Bare expression: shows the unknown-term frequency dictionary as the output
# of this notebook cell.
frequency_dict_unknown

## List of files containing an `rs` element nested inside a `term`

In [None]:
# Point `path` at the Visorando Perdido directory.
path = '/Users/lmoncla/Documents/Data/Corpus/Visorando/Perdido'

In [None]:
# Print the path of every file containing an <rs> nested inside a <term>
# (printed once per nested <rs>, so a file can appear several times).
words = []
for doc in tqdm(sorted(os.listdir(path))):
    # NOTE(review): '.xml' is appended to the full directory entry name here,
    # whereas the Visorando n-gram cell uses doc[:-4]+'.xml' -- confirm which
    # one matches the directory layout (missing files are silently skipped).
    filename = os.path.join(path, doc+'.xml')
    if not os.path.exists(filename):
        continue
    tree = etree.parse(filename)

    # './/term//rs' selects <rs> elements at any depth below a <term>.
    for term in tree.xpath('.//term//rs'):
        print(filename)
        # Text of the nested <rs>, kept for the optional debug print below;
        # elements without text are skipped so .lower() cannot fail on None.
        phrase = ''.join(
            w.text.lower() + ' ' for w in term.xpath('.//w') if w.text is not None
        )
        #print(phrase, end=' ')
        #print()
            
            #print()

        # list to dict with frequency
        