In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

from datetime import date
import os
import networkx as nx
import re
from tqdm import tqdm

import sys
sys.path.insert(0, '../scripts')

from network import create_nodes, create_edges, save_nodes_as_csv, save_edges_as_csv




done!


In [2]:
df = pd.read_csv('../data/sc_entities_from_geo_articles.tsv', sep='\t')
df.head()

Unnamed: 0,entity_form,entity_tag,volume,numero,paragraph_id
0,A,NP-Spatial,1,26,1
1,France,NP-Spatial,1,26,1
2,Fontaines,NP-Spatial,1,26,1
3,en,Relation,1,26,1
4,Sologne,NP-Spatial,1,26,1


In [3]:
df_np = df[df['entity_tag'].str.startswith('NP')]

In [4]:
df_np['entity_form_norm'] = df_np['entity_form'].str.lower()
df_np.head()

Unnamed: 0,entity_form,entity_tag,volume,numero,paragraph_id,entity_form_norm
0,A,NP-Spatial,1,26,1,a
1,France,NP-Spatial,1,26,1,france
2,Fontaines,NP-Spatial,1,26,1,fontaines
4,Sologne,NP-Spatial,1,26,1,sologne
7,AA,NP-Spatial,1,27,1,aa


In [5]:
def remove_a_prefix(text, prefix_to_remove):
    for prefix in prefix_to_remove:
        if str(text).startswith(prefix):
            return str(text)[len(prefix):]    
    return text


In [6]:
prefix_to_remove = ["d'", "l'","le","la","les","ce","ces","cette"]

df_np['entity_form_norm'] = df_np['entity_form_norm'].apply(remove_a_prefix, prefix_to_remove=prefix_to_remove)


In [7]:
df_np.head(20)

Unnamed: 0,entity_form,entity_tag,volume,numero,paragraph_id,entity_form_norm
0,A,NP-Spatial,1,26,1,a
1,France,NP-Spatial,1,26,1,france
2,Fontaines,NP-Spatial,1,26,1,fontaines
4,Sologne,NP-Spatial,1,26,1,sologne
7,AA,NP-Spatial,1,27,1,aa
9,France,NP-Spatial,1,27,1,france
11,Gravelines,NP-Spatial,1,27,1,gravelines
14,Suisse,NP-Spatial,1,27,1,suisse
15,Westphalie,NP-Spatial,1,27,1,westphalie
16,la Flandre,NP-Spatial,1,27,1,flandre


In [13]:
print('Allemagne in entity_form:',len(df_np[df_np['entity_form'] == 'Allemagne']))
print('allemagne in entity_form_norm:',len(df_np[df_np['entity_form_norm'] == 'allemagne']))

Allemagne in entity_form: 804
allemagne in entity_form_norm: 1620


In [15]:
len(df_np['entity_form_norm'].unique())

51694

In [20]:

G = nx.MultiDiGraph()
# https://networkx.org/documentation/stable/reference/classes/multidigraph.html

print("Create nodes")
for form in df_np['entity_form_norm'].unique():
    is_head = False #TODO
    G.add_node(form, type='NP-Spatial', is_head=is_head, label=form)
    
    
    



Create nodes


In [23]:
'france' in G.nodes

True

In [32]:
print("Create edges")

# edges attributes:
# related_to = "article", "paragraph", "headword"
# cooccurrence_freq = int

# related_to = "article"
related_to_values = ['article', 'paragraph']
for related_to in related_to_values:
    print('*',related_to)
    if related_to == 'article':
        df_sampled = df_np.groupby(['volume', 'numero'])
    if related_to == 'paragraph':
        df_sampled = df_np.groupby(['volume', 'numero', 'paragraph_id'])

    for _, group in df_sampled:
        #print(group['entity_form_norm'].values)
        
        # create edges between each pair of entities in the same article
        for entity1 in group['entity_form_norm'].values:
            for entity2 in group['entity_form_norm'].values:
                if entity1 != entity2:
                    if G.has_edge(entity1, entity2, related_to):
                        G.edges[entity1, entity2, related_to]['cooccurrence_freq'] += 1
                    else:
                        G.add_edge(entity1, entity2, key=related_to, related_to=related_to, cooccurrence_freq=1)

Create edges
* article
* paragraph


In [33]:
nx.write_gexf(G, '../data/network-NP-Spatial.gexf')
   

In [None]:

nx.draw(G)