In [5]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

from datetime import date
import os
import networkx as nx
import re
from tqdm import tqdm

import sys
sys.path.insert(0, '../scripts')

from network import create_nodes, create_edges, save_nodes_as_csv, save_edges_as_csv




In [2]:
# Load the tab-separated entity table and preview its first rows.
entities_tsv = '../data/sc_entities_from_geo_articles.tsv'
df = pd.read_csv(entities_tsv, delimiter='\t')
df.head()

Unnamed: 0,entity_form,entity_tag,volume,numero,paragraph_id
0,A,NP-Spatial,1,26,1
1,France,NP-Spatial,1,26,1
2,Fontaines,NP-Spatial,1,26,1
3,en,Relation,1,26,1
4,Sologne,NP-Spatial,1,26,1


In [3]:
# Keep only the rows whose tag starts with 'NP'.
# na=False treats rows with a missing tag as non-matching instead of putting
# NaN in the boolean mask (which makes the indexing raise), and .copy() gives
# an independent frame so that the columns added in later cells never write
# into a slice of `df`.
df_np = df[df['entity_tag'].str.startswith('NP', na=False)].copy()

In [6]:
# Start the normalised form as the lower-cased surface form, then preview it.
lowercased_forms = df_np['entity_form'].str.lower()
df_np['entity_form_norm'] = lowercased_forms
df_np.head()

Unnamed: 0,entity_form,entity_tag,volume,numero,paragraph_id,entity_form_norm
0,A,NP-Spatial,1,26,1,a
1,France,NP-Spatial,1,26,1,france
2,Fontaines,NP-Spatial,1,26,1,fontaines
4,Sologne,NP-Spatial,1,26,1,sologne
7,AA,NP-Spatial,1,27,1,aa


In [13]:
def remove_a_prefix(text, prefix_to_remove):
    """Strip at most one leading prefix (e.g. an article) from ``text``.

    A prefix is only removed when it is a complete leading token:

    * prefixes ending with an apostrophe (``"d'"``, ``"l'"``) or with
      whitespace are attached directly to what follows and are removed as-is;
    * any other prefix (``"le"``, ``"la"``, ...) must be followed by
      whitespace, so ``"la flandre"`` becomes ``"flandre"`` while
      ``"lausanne"`` is left untouched.

    Longer prefixes are tried first, and whitespace left in front of the
    remainder is stripped.

    Parameters
    ----------
    text : object
        Value to normalise. It is converted with ``str()`` for matching.
    prefix_to_remove : iterable of str
        Candidate prefixes. Empty strings are ignored.

    Returns
    -------
    object
        The remainder of ``str(text)`` after the prefix when one matched,
        otherwise ``text`` itself, unchanged (so non-string values such as
        NaN pass through as they are).
    """
    value = str(text)
    # Longest first so that a short prefix can never shadow a longer one
    # that starts with the same characters ("le" vs "les").
    for prefix in sorted(prefix_to_remove, key=len, reverse=True):
        if not prefix or not value.startswith(prefix):
            continue
        remainder = value[len(prefix):]
        # Elided forms and prefixes that already end with a space carry their
        # own boundary; plain words need whitespace right after them.
        attached = prefix[-1] in "'\u2019" or prefix[-1].isspace()
        if attached or remainder[:1].isspace():
            return remainder.lstrip()
    return text


In [14]:
# Leading tokens to drop from the normalised entity forms.
prefix_to_remove = ["d'", "l'","le","la","les","ce","ces","cette"]

# Recompute from the raw `entity_form` column rather than from the previous
# value of `entity_form_norm`, so re-running this cell gives the same result
# instead of stripping a second prefix from already-normalised values.
df_np['entity_form_norm'] = (
    df_np['entity_form']
    .str.lower()
    .apply(remove_a_prefix, prefix_to_remove=prefix_to_remove)
)


In [15]:
# Preview the first 20 rows to eyeball the normalised forms next to the raw ones.
df_np.head(20)

Unnamed: 0,entity_form,entity_tag,volume,numero,paragraph_id,entity_form_norm
0,A,NP-Spatial,1,26,1,a
1,France,NP-Spatial,1,26,1,france
2,Fontaines,NP-Spatial,1,26,1,fontaines
4,Sologne,NP-Spatial,1,26,1,sologne
7,AA,NP-Spatial,1,27,1,aa
9,France,NP-Spatial,1,27,1,france
11,Gravelines,NP-Spatial,1,27,1,gravelines
14,Suisse,NP-Spatial,1,27,1,suisse
15,Westphalie,NP-Spatial,1,27,1,westphalie
16,la Flandre,NP-Spatial,1,27,1,flandre


In [17]:
# Rows whose raw surface form is exactly 'Allemagne'.
df_np.loc[df_np['entity_form'].eq('Allemagne')]

Unnamed: 0,entity_form,entity_tag,volume,numero,paragraph_id,entity_form_norm
23,Allemagne,NP-Spatial,1,29,1,allemagne
35,Allemagne,NP-Spatial,1,30,1,allemagne
87,Allemagne,NP-Spatial,1,41,1,allemagne
274,Allemagne,NP-Spatial,1,174,1,allemagne
460,Allemagne,NP-Spatial,1,323,1,allemagne
...,...,...,...,...,...,...
369719,Allemagne,NP-Spatial,17,3032,2,allemagne
369745,Allemagne,NP-Spatial,17,3034,1,allemagne
369780,Allemagne,NP-Spatial,17,3036,1,allemagne
370372,Allemagne,NP-Spatial,17,3057,1,allemagne


In [18]:
# Rows whose normalised form is 'allemagne' (to compare with the raw-form lookup).
df_np.loc[df_np['entity_form_norm'].eq('allemagne')]

Unnamed: 0,entity_form,entity_tag,volume,numero,paragraph_id,entity_form_norm
23,Allemagne,NP-Spatial,1,29,1,allemagne
35,Allemagne,NP-Spatial,1,30,1,allemagne
64,d'Allemagne,NP-Spatial,1,33,1,allemagne
87,Allemagne,NP-Spatial,1,41,1,allemagne
274,Allemagne,NP-Spatial,1,174,1,allemagne
...,...,...,...,...,...,...
369745,Allemagne,NP-Spatial,17,3034,1,allemagne
369780,Allemagne,NP-Spatial,17,3036,1,allemagne
369790,d'Allemagne,NP-Spatial,17,3036,1,allemagne
370372,Allemagne,NP-Spatial,17,3057,1,allemagne


In [None]:
# Directory that is listed for input files. It defaults to the original
# local location; set the EDDA_INPUT_PATH environment variable to run the
# notebook on another machine without editing this cell.
# os.path.join(..., '') guarantees a trailing separator whatever the override
# looks like (the default value already ends with one and is left unchanged).
input_path = os.path.join(
    os.environ.get(
        'EDDA_INPUT_PATH',
        '/Users/lmoncla/Documents/Data/Corpus/EDDA/articles_geographie/perdido-22.06/',
    ),
    '',
)
# Directory prefix for every file written by this notebook.
output_path = '../output/'
# Date-stamped suffix ('v' + YYYYMMDD) appended to the output file names.
outputSuffix = 'v' + date.today().strftime("%Y%m%d")

print(outputSuffix)

# Flag forwarded to create_nodes.
# NOTE(review): presumably toggles geocoding of the nodes -- confirm in scripts/network.py.
geocoding = False


In [None]:
# Dictionary shared by create_nodes and create_edges.
# NOTE(review): presumably a headword lookup filled while creating nodes --
# confirm in scripts/network.py.
d_headwords = {}

G = nx.DiGraph()

# File names (without extension) are expected to look like "<word chars>-<digits>";
# the digits are the number handed to create_nodes / create_edges.
# Raw string: "\w" and "\d" are invalid escape sequences in a normal literal.
article_id_pattern = re.compile(r"\w+-(\d+)")

# List and parse the directory once, in sorted order, so both passes see the
# same files and the run is reproducible (os.listdir order is arbitrary).
articles = []
for doc in sorted(os.listdir(input_path)):
    file_id, extension = os.path.splitext(doc)
    if extension != '.xml':
        continue
    m = article_id_pattern.match(file_id)
    if m is None:
        # Previously this crashed with AttributeError on m.groups().
        print("Skipping " + doc + ": file name does not match <name>-<number>.xml")
        continue
    articles.append((doc, m.group(1)))

# create nodes
# All nodes are created before any edge is added (two separate passes).
print("Create nodes")
for doc, number in tqdm(articles):
    create_nodes(input_path, doc, number, G, d_headwords, geocoding)

# create edges
print("Create edges")
for doc, number in tqdm(articles):
    create_edges(input_path, doc, number, G, d_headwords)

In [None]:
# save graph
# Make sure the output directory exists before writing into it.
if output_path:
    os.makedirs(output_path, exist_ok=True)

# save graph
# os.path.join keeps the paths correct whether or not output_path ends with a separator.
nx.write_gexf(G, os.path.join(output_path, 'network-' + outputSuffix + '.gexf'))

# Edge export is currently disabled.
# save_edges_as_csv(output_path + 'edges-'+outputSuffix+'.csv', ';', G.edges)
save_nodes_as_csv(os.path.join(output_path, 'nodes-' + outputSuffix + '.tsv'), '\t', G.nodes)
