# Generate input for negative class

In [None]:
import os
from tqdm import tqdm
from lxml import etree
import json
import random 
import string
from perdido.geoparser import Geoparser


from utils_functions import load_lexicon, load_edda_dataframe, run_perdido, get_term_occurrences_from_ene, segment_sentences, get_ngrams_wt_term_outside_ene

# 1. Load lexicon

The lexicon is created with the notebook `generate_input_positive_class.ipynb`. (TODO: make an independent script)


In [None]:
# Path of the pivot lexicon: a JSON file expected in the local 'output' folder.
lexicon_filename =  os.path.join('output', 'Traitement_Texte_pivot_lexicon.json')

# load_lexicon comes from utils_functions; presumably it returns a
# term -> frequency mapping (TODO confirm against utils_functions).
frequency_dict_geo_tt = load_lexicon(lexicon_filename)

In [None]:
# Display the number of entries in the loaded lexicon.
len(frequency_dict_geo_tt)

# 2. Load EDdA dataset

Load the EDdA dataset (a TSV file) as a dataframe. Articles from the Encyclopédie will be used to generate negative n-grams, with words from the lexicon as pivots.

In [None]:
# NOTE(review): absolute, machine-specific path; adapt it before running elsewhere.
edda_dataset_path = '/Users/lmoncla/Nextcloud-LIRIS/GEODE/GEODE - Partage consortium/Corpus/EDdA/EDdA_dataset_articles_superdomainBERT_230327.tsv'
# The second argument ('Philosophie') is presumably a domain filter; confirm in utils_functions.
data = load_edda_dataframe(edda_dataset_path, 'Philosophie')
# Preview the first rows of the dataframe.
data.head()

In [None]:
# Display the dimensions of the loaded dataframe.
data.shape

In [None]:
# NOTE: only the rows returned by data.head() are segmented here. To process
# the whole dataset, apply segment_sentences to data['content'] instead.
df = data.head()
sentences_per_article = df['content'].apply(segment_sentences)

# Flatten the per-article results into one list of sentences.
all_sentences = [
    sentence
    for article_sentences in sentences_per_article
    for sentence in article_sentences
]

# Show every sentence, one per line.
for sentence in all_sentences:
    print(sentence)

# Group the sentences into consecutive batches of `batch_size` items
# (the last batch may be shorter).
batch_size = 50
batch_starts = range(0, len(all_sentences), batch_size)
batches = [all_sentences[start:start + batch_size] for start in batch_starts]

# Each batch becomes one space-separated string.
batch_strings = list(map(' '.join, batches))

In [None]:
output_dir = './output/Philosophie'
# Create the destination folder up front so that writing the XML files does
# not fail just because the directory is missing.
os.makedirs(output_dir, exist_ok=True)
geoparser = Geoparser(version='Standard', sources=['wiki_gaz'])

for batch in tqdm(batch_strings):
    # generate a random string of 10 characters (uppercase letters and digits),
    # used as the output file name for this batch
    filename = ''.join(random.choices(string.ascii_uppercase + string.digits, k=10))
    doc = run_perdido(batch, geoparser)
    try:
        doc.to_xml(os.path.join(output_dir, filename + '.xml'))
    except Exception as err:
        # Best effort: report the failing batch with its cause and carry on.
        # `Exception` (rather than a bare except) lets Ctrl-C interrupt the loop.
        print('Error', filename, err)


In [None]:
# Display the current value of `filename` (the last name generated by the
# loop in the previous cell when the cells are run in order).
filename

## 1.1 Corpus Traitement Text

In [None]:
from collections import Counter

path = '/Users/lmoncla/Documents/Data/Corpus/Choucas/Perdido'
output_filename = 'Traitement_Texte_pivot_lexicon'

# Gather the terms returned by get_term_occurrences_from_ene for every
# document folder found under `path`.
words_TT = []
for doc in tqdm(sorted(os.listdir(path))):
    filename = os.path.join(path, doc, doc+'.xml') # version Traitements_Texte
    words_TT.extend(get_term_occurrences_from_ene(filename))

# list to dict with frequency: Counter counts in a single pass (list.count in
# a comprehension is quadratic) and keeps the first-occurrence key order.
frequency_dict_geo_TT = dict(Counter(words_TT))
print('Size of the lexicon', len(frequency_dict_geo_TT))

# save the dict in file; explicit UTF-8 because ensure_ascii=False writes
# accented characters as-is
with open(output_filename + '.json', 'w', encoding='utf-8') as fp:
    json.dump(frequency_dict_geo_TT, fp, ensure_ascii=False)

## 1.2 Corpus Visorando

In [None]:
from collections import Counter

path = '/Users/lmoncla/Documents/Data/Corpus/Visorando/Perdido'
output_filename = 'Visorando_pivot_lexicon'

# Gather the terms returned by get_term_occurrences_from_ene for every entry
# found under `path`.
words_viso = []
for doc in tqdm(sorted(os.listdir(path))):
    # The last 4 characters of the entry name are replaced by '.xml'.
    filename = os.path.join(path, doc[:-4]+'.xml') # version visorando
    words_viso.extend(get_term_occurrences_from_ene(filename))

# list to dict with frequency: Counter counts in a single pass (list.count in
# a comprehension is quadratic) and keeps the first-occurrence key order.
frequency_dict_geo_viso = dict(Counter(words_viso))
print('Size of the lexicon', len(frequency_dict_geo_viso))

# save the dict in file; explicit UTF-8 because ensure_ascii=False writes
# accented characters as-is
with open(output_filename + '.json', 'w', encoding='utf-8') as fp:
    json.dump(frequency_dict_geo_viso, fp, ensure_ascii=False)

## 1.3 Lexique TT - Viso

In [None]:
# Terms present in the Visorando lexicon but absent from the TT lexicon
# (iterating a dict yields its keys, and `in` tests key membership).
words = [
    term
    for term in frequency_dict_geo_viso
    if term not in frequency_dict_geo_TT
]
print(813-190)
print(len(words), words)

## 1.4 Lexicons preview

In [None]:
# Print every entry of the Visorando lexicon as "key value".
for k, v in frequency_dict_geo_viso.items():
    entry = (k, v)
    print(*entry)

In [None]:
# load the dict from file
# only necessary if you want to run the next section and not the previous one (adapt output_filename)
# The file is read as UTF-8 because the lexicon is dumped with
# ensure_ascii=False and may therefore contain accented characters.
with open(output_filename + '.json', encoding='utf-8') as fp:
    frequency_dict_geo_viso = json.load(fp)

# 2. Find occurrences of the lexicon in the corpus (outside ENE)

In [None]:
def get_ngrams_wt_term_outside_ene(filename, frequency_dict_geo, ngram_id):
    """Build a 7-token window around every lexicon term found in an XML file.

    Every ``<w>`` element of the document is scanned. When its text belongs
    to ``frequency_dict_geo``, one record is produced holding the pivot, the
    three tokens before it and the three tokens after it. Positions that fall
    before the first token or after the last one are padded with ``'_'``.

    NOTE: the ``'url'`` field is built from the global variable ``doc``,
    which the calling loop is expected to set to the current document name.

    Args:
        filename: path of the XML file to read.
        frequency_dict_geo: container of pivot terms (only membership is tested).
        ngram_id: ``'num'`` of the first record; following records get
            consecutive values.

    Returns:
        A list of dicts, one per occurrence. The list is empty when the file
        does not exist or cannot be parsed.
    """
    context_size = 3
    json_content = []
    if not os.path.exists(filename):
        return json_content

    try:
        tokens = etree.parse(filename).xpath('.//w')
    except (OSError, etree.XMLSyntaxError) as err:
        # Unreadable or malformed file: report it instead of hiding it.
        print('Error', filename, err)
        return json_content

    def describe(index):
        # Explicit bounds check: a negative index would otherwise wrap around
        # and pick tokens from the END of the document.
        if 0 <= index < len(tokens):
            neighbour = tokens[index]
            return {'word': neighbour.text, 'POS': neighbour.get('pos'), 'lemma': neighbour.get('lemma')}
        return {'word': '_', 'POS': '_', 'lemma': '_'}

    for i, token in enumerate(tokens):
        if token.text not in frequency_dict_geo:
            continue

        before = [describe(i - j) for j in range(context_size, 0, -1)]
        after = [describe(i + j) for j in range(1, context_size + 1)]
        # A token without a 'pos' attribute gets the '_' placeholder so that
        # one incomplete token does not abort the rest of the file.
        pivot = {'word': token.text, 'POS': (token.get('pos') or '_') + '+LS', 'lemma': token.get('lemma')}

        line = {'num': ngram_id, 'class': '1', 'id_phrase': '0', 'pivot': token.text, 'occurrence': '0', 'url': os.path.join(doc, doc + '.xml')}
        line['phrase'] = before + [pivot] + after

        # Trace of the extracted window (padded positions show as '_').
        print(*(entry['word'] for entry in before), '[', token.text, ']', *(entry['word'] for entry in after))

        ngram_id += 1
        json_content.append(line)

    return json_content

## 2.1 Corpus Traitement text

In [None]:
path = '/Users/lmoncla/Documents/Data/Corpus/Choucas/Perdido'

#lexicon = frequency_dict_geo_TT
lexicon = frequency_dict_geo_viso

json_content = []
ngram_id = 1
# The loop variable must stay named `doc`: get_ngrams_wt_term_outside_ene
# reads it as a global to build the 'url' field.
for doc in sorted(os.listdir(path)):
    filename = os.path.join(path, doc, doc+'.xml') # version Traitements_Texte
    ngrams = get_ngrams_wt_term_outside_ene(filename, lexicon, ngram_id)
    json_content.extend(ngrams)
    # The function numbers its records consecutively from `ngram_id`, so
    # advance the counter; otherwise 'num' restarts at 1 for every file.
    ngram_id += len(ngrams)

print('number of ngram',len(json_content))

name = 'Traitement_Texte_class1'
# Explicit UTF-8 because ensure_ascii=False writes accented characters as-is.
with open(name + ".json", "w", encoding="utf-8") as outfile:
    json.dump(json_content,outfile, ensure_ascii=False)

## 2.2 Corpus Visorando

In [None]:
path = '/Users/lmoncla/Documents/Data/Corpus/Visorando/Perdido'
json_content = []
ngram_id = 1
# The loop variable must stay named `doc`: get_ngrams_wt_term_outside_ene
# reads it as a global to build the 'url' field.
for doc in sorted(os.listdir(path)):
    filename = os.path.join(path, doc[:-4]+'.xml') # version visorando
    ngrams = get_ngrams_wt_term_outside_ene(filename, frequency_dict_geo_viso, ngram_id)
    json_content.extend(ngrams)
    # The function numbers its records consecutively from `ngram_id`, so
    # advance the counter; otherwise 'num' restarts at 1 for every file.
    ngram_id += len(ngrams)

print('number of ngram',len(json_content))

name = 'Visorando_class1'
# Explicit UTF-8 because ensure_ascii=False writes accented characters as-is.
with open(name + ".json", "w", encoding="utf-8") as outfile:
    json.dump(json_content,outfile, ensure_ascii=False)

## 2.3 TT - Viso

In [None]:
# Display the number of items currently held in `words`.
len(words)

In [None]:
path = '/Users/lmoncla/Documents/Data/Corpus/Choucas/Perdido'

# `words` is a list; a set gives constant-time membership tests for the
# per-token lookup performed by get_ngrams_wt_term_outside_ene.
pivot_terms = set(words)

json_content = []
ngram_id = 1
# The loop variable must stay named `doc`: get_ngrams_wt_term_outside_ene
# reads it as a global to build the 'url' field.
for doc in sorted(os.listdir(path)):
    filename = os.path.join(path, doc, doc+'.xml') # version Traitements_Texte
    ngrams = get_ngrams_wt_term_outside_ene(filename, pivot_terms, ngram_id)
    json_content.extend(ngrams)
    # The function numbers its records consecutively from `ngram_id`, so
    # advance the counter; otherwise 'num' restarts at 1 for every file.
    ngram_id += len(ngrams)

print('number of ngram',len(json_content))

name = 'TT-viso_class1'
# Explicit UTF-8 because ensure_ascii=False writes accented characters as-is.
with open(name + ".json", "w", encoding="utf-8") as outfile:
    json.dump(json_content,outfile, ensure_ascii=False)

In [None]:
# Distinct pivot terms that produced at least one n-gram, and how many there are.
l = {ngram['pivot'] for ngram in json_content}
len(l)


# 3 Tests with ngrams


In [None]:
def get_ngrams_wt_term_outside_ene(filename, frequency_dict_geo, ngram_id, position='center'):
    """Build a 7-token window around every lexicon term found in an XML file.

    Every ``<w>`` element of the document is scanned. When its text belongs
    to ``frequency_dict_geo``, one record is produced. ``position`` selects
    where the pivot sits in the window:

    * ``'center'``: three tokens before the pivot and three after;
    * ``'left'``: the pivot comes first, followed by six tokens;
    * ``'right'``: six tokens, then the pivot last.

    Any other value yields records with an empty ``'phrase'``. Positions that
    fall before the first token or after the last one are padded with ``'_'``.

    NOTE: the ``'url'`` field is built from the global variable ``doc``,
    which the calling loop is expected to set to the current document name.

    Args:
        filename: path of the XML file to read.
        frequency_dict_geo: container of pivot terms (only membership is tested).
        ngram_id: ``'num'`` of the first record; following records get
            consecutive values.
        position: ``'center'``, ``'left'`` or ``'right'``.

    Returns:
        A list of dicts, one per occurrence. The list is empty when the file
        does not exist or cannot be parsed.
    """
    # (number of tokens before the pivot, number of tokens after it)
    layouts = {'center': (3, 3), 'left': (0, 6), 'right': (6, 0)}

    json_content = []
    if not os.path.exists(filename):
        return json_content

    try:
        tokens = etree.parse(filename).xpath('.//w')
    except (OSError, etree.XMLSyntaxError) as err:
        # Unreadable or malformed file: report it instead of hiding it.
        print('Error', filename, err)
        return json_content

    def describe(index):
        # Explicit bounds check: a negative index would otherwise wrap around
        # and pick tokens from the END of the document.
        if 0 <= index < len(tokens):
            neighbour = tokens[index]
            return {'word': neighbour.text, 'POS': neighbour.get('pos'), 'lemma': neighbour.get('lemma')}
        return {'word': '_', 'POS': '_', 'lemma': '_'}

    for i, token in enumerate(tokens):
        if token.text not in frequency_dict_geo:
            continue

        line = {'num': ngram_id, 'class': '1', 'id_phrase': '0', 'pivot': token.text, 'occurrence': '0', 'url': os.path.join(doc, doc + '.xml')}
        phrase = []
        if position in layouts:
            n_before, n_after = layouts[position]
            # Word, POS and lemma are all read from the same neighbouring token.
            before = [describe(i - j) for j in range(n_before, 0, -1)]
            after = [describe(i + j) for j in range(1, n_after + 1)]
            # A token without a 'pos' attribute gets the '_' placeholder so
            # that one incomplete token does not abort the rest of the file.
            pivot = {'word': token.text, 'POS': (token.get('pos') or '_') + '+LS', 'lemma': token.get('lemma')}
            phrase = before + [pivot] + after
            # Trace of the extracted window (padded positions show as '_').
            print(*(entry['word'] for entry in before), '[', token.text, ']', *(entry['word'] for entry in after))
        line['phrase'] = phrase

        ngram_id += 1
        json_content.append(line)

    return json_content

In [None]:
def get_ngrams_wt_term_outside_ene(filename, frequency_dict_geo, ngram_id, position=4, ngram_size=7):
    """Build an n-gram around every lexicon term found in an XML file.

    Every ``<w>`` element of the document is scanned. When its text belongs
    to ``frequency_dict_geo``, one record is produced with ``position - 1``
    tokens before the pivot and ``ngram_size - position`` tokens after it.
    Positions that fall before the first token or after the last one are
    padded with ``'_'``.

    NOTE: the ``'url'`` field is built from the global variable ``doc``,
    which the calling loop is expected to set to the current document name.

    Args:
        filename: path of the XML file to read.
        frequency_dict_geo: container of pivot terms (only membership is tested).
        ngram_id: ``'num'`` of the first record; following records get
            consecutive values.
        position: 1-based rank of the pivot inside the n-gram.
        ngram_size: total number of tokens in the n-gram.

    Returns:
        A list of dicts, one per occurrence. The list is empty when the file
        does not exist or cannot be parsed.
    """
    json_content = []
    if not os.path.exists(filename):
        return json_content

    try:
        tokens = etree.parse(filename).xpath('.//w')
    except (OSError, etree.XMLSyntaxError) as err:
        # Unreadable or malformed file: report it instead of hiding it.
        print('Error', filename, err)
        return json_content

    def describe(index):
        # Explicit bounds check: a negative index would otherwise wrap around
        # and pick tokens from the END of the document.
        if 0 <= index < len(tokens):
            neighbour = tokens[index]
            return {'word': neighbour.text, 'POS': neighbour.get('pos'), 'lemma': neighbour.get('lemma')}
        return {'word': '_', 'POS': '_', 'lemma': '_'}

    for i, token in enumerate(tokens):
        if token.text not in frequency_dict_geo:
            continue

        line = {'num': ngram_id, 'class': '1', 'id_phrase': '0', 'pivot': token.text, 'occurrence': '0', 'url': os.path.join(doc, doc + '.xml')}

        before = [describe(i - j) for j in range(position - 1, 0, -1)]
        # The right context reads word, POS and lemma from tokens[i + j].
        after = [describe(i + j) for j in range(1, ngram_size + 1 - position)]
        # A token without a 'pos' attribute gets the '_' placeholder so that
        # one incomplete token does not abort the rest of the file.
        pivot = {'word': token.text, 'POS': (token.get('pos') or '_') + '+LS', 'lemma': token.get('lemma')}

        # Trace of the extracted window, every item followed by a space.
        left_text = ''.join(f"{entry['word']} " for entry in before)
        right_text = ''.join(f"{entry['word']} " for entry in after)
        print(left_text + '[ ' + str(token.text) + ' ] ' + right_text)

        line['phrase'] = before + [pivot] + after

        ngram_id += 1
        json_content.append(line)

    return json_content

In [None]:
path = '/Users/lmoncla/Documents/Data/Corpus/Choucas/Perdido'

lexicon = frequency_dict_geo_TT
#lexicon = frequency_dict_geo_viso
position = 7
json_content = []
ngram_id = 1
# The loop variable must stay named `doc`: get_ngrams_wt_term_outside_ene
# reads it as a global to build the 'url' field.
for doc in sorted(os.listdir(path)):
    filename = os.path.join(path, doc, doc+'.xml') # version Traitements_Texte
    ngrams = get_ngrams_wt_term_outside_ene(filename, lexicon, ngram_id, position=position)
    json_content.extend(ngrams)
    # The function numbers its records consecutively from `ngram_id`, so
    # advance the counter; otherwise 'num' restarts at 1 for every file.
    ngram_id += len(ngrams)

print('number of ngram',len(json_content))

name = 'Traitement_Texte_class1_position'+str(position)
# Explicit UTF-8 because ensure_ascii=False writes accented characters as-is.
with open(name + ".json", "w", encoding="utf-8") as outfile:
    json.dump(json_content,outfile, ensure_ascii=False)

## Get the list of terms in ENE not categorized by Perdido

In [None]:
from collections import Counter

# Collect, for every document under `path`, the lower-cased text of the
# N / PREPDET / PREP tokens of each term typed "unknown" inside an
# rs[@type="unknown" and @subtype="ene"] element.
words = []
for doc in tqdm(sorted(os.listdir(path))):
    filename = os.path.join(path, doc, doc+'.xml')
    if os.path.exists(filename):
        tree = etree.parse(filename)

        for term in tree.xpath('.//rs[@type="unknown" and @subtype="ene"]/term[@type="unknown"]'):
            # Tokens without text are skipped so that .lower() cannot fail on None.
            parts = [
                w.text.lower()
                for w in term.xpath('.//w[@pos="N" or @pos="PREPDET" or @pos="PREP"]')
                if w.text
            ]
            words.append(' '.join(parts).strip())

# list to dict with frequency: Counter counts in a single pass (list.count in
# a comprehension is quadratic) and keeps the first-occurrence key order.
frequency_dict_unknown = dict(Counter(words))
print('Size of the lexicon', len(frequency_dict_unknown))

In [None]:
# Display the content of the frequency dictionary.
frequency_dict_unknown

## List of files with rs in term

In [None]:
# Folder whose entries are scanned by the next cell.
# NOTE(review): absolute, machine-specific path; adapt it before running elsewhere.
path = '/Users/lmoncla/Documents/Data/Corpus/Visorando/Perdido'

In [None]:
# Print the path of every file that contains an <rs> element nested inside a
# <term> element (the path is printed once per nested <rs>).
# NOTE(review): `words` is initialised but never filled in this cell.
words = []
for doc in tqdm(sorted(os.listdir(path))):
    # NOTE(review): the other Visorando cells build the name with
    # doc[:-4]+'.xml'; here '.xml' is appended to the full entry name.
    # Confirm that such files actually exist.
    filename = os.path.join(path, doc+'.xml') 
    if os.path.exists(filename):
        tree = etree.parse(filename)


        # `term` iterates over the <rs> elements matched below a <term>.
        for term in tree.xpath('.//term//rs'):
            print(filename)
            # Lower-cased text of the <w> tokens of the match; it is built
            # but not used afterwards in this cell.
            phrase = ''
            for w in term.xpath('.//w'):
                phrase += w.text.lower() + ' '
        #print(phrase, end=' ')
        #print()
            
            #print()

        # list to dict with frequency
        