In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

from datetime import date
import os
import networkx as nx
import re
from tqdm import tqdm

import sys
sys.path.insert(0, '../scripts')

from network import create_nodes, create_edges, save_nodes_as_csv, save_edges_as_csv




done!


In [2]:
# Load the tab-separated table of tagged entities and preview its first rows.
entities_path = '../data/sc_entities_from_geo_articles.tsv'
df = pd.read_csv(entities_path, sep='\t')
df.head()

Unnamed: 0,entity_form,entity_tag,volume,numero,paragraph_id
0,A,NP-Spatial,1,26,1
1,France,NP-Spatial,1,26,1
2,Fontaines,NP-Spatial,1,26,1
3,en,Relation,1,26,1
4,Sologne,NP-Spatial,1,26,1


In [3]:
# Keep only the rows whose tag starts with 'NP'.
# na=False treats a missing tag as "not NP" instead of producing a NaN mask, and
# .copy() makes df_np an independent frame, so columns can be added to it later
# without writing into a slice of df.
df_np = df[df['entity_tag'].str.startswith('NP', na=False)].copy()

In [4]:
def remove_a_prefix(text, prefix_to_remove):
    """Lower-case ``text`` and strip the first matching leading prefix.

    ``text`` is converted with ``str()`` and lower-cased first, so the prefixes
    are compared against the lower-cased string. Prefixes are tried in the order
    given and only the first one that matches is removed.

    Parameters
    ----------
    text : object
        Value to normalise; anything accepted by ``str()``.
    prefix_to_remove : iterable of str
        Candidate prefixes, in priority order.

    Returns
    -------
    str
        The lower-cased text, without the matched prefix if there was one.
    """
    lowered = str(text).lower()
    candidates = (prefix for prefix in prefix_to_remove if lowered.startswith(prefix))
    first_match = next(candidates, None)
    if first_match is None:
        return lowered
    return lowered[len(first_match):]


In [13]:
# Leading determiners / elided articles to strip. They are compared against the
# lower-cased form, so they must be lower-case; the first match in list order wins.
prefix_to_remove = ["d'", "l'","les ","le ","la ","ces ","cette ","ce "]

# .assign() returns a new frame instead of writing a column into df_np, which
# was obtained by filtering df. This avoids chained assignment and keeps the
# cell idempotent when it is re-run.
df_np = df_np.assign(
    entity_form_norm=df_np['entity_form'].apply(remove_a_prefix, prefix_to_remove=prefix_to_remove)
)
df_np.head()

Unnamed: 0,entity_form,entity_tag,volume,numero,paragraph_id,entity_form_norm
0,A,NP-Spatial,1,26,1,a
1,France,NP-Spatial,1,26,1,france
2,Fontaines,NP-Spatial,1,26,1,fontaines
4,Sologne,NP-Spatial,1,26,1,sologne
7,AA,NP-Spatial,1,27,1,aa


In [14]:
# Larger preview of the normalised forms (first 20 rows).
df_np.head(20)

Unnamed: 0,entity_form,entity_tag,volume,numero,paragraph_id,entity_form_norm
0,A,NP-Spatial,1,26,1,a
1,France,NP-Spatial,1,26,1,france
2,Fontaines,NP-Spatial,1,26,1,fontaines
4,Sologne,NP-Spatial,1,26,1,sologne
7,AA,NP-Spatial,1,27,1,aa
9,France,NP-Spatial,1,27,1,france
11,Gravelines,NP-Spatial,1,27,1,gravelines
14,Suisse,NP-Spatial,1,27,1,suisse
15,Westphalie,NP-Spatial,1,27,1,westphalie
16,la Flandre,NP-Spatial,1,27,1,flandre


In [15]:
# Compare how many rows match the raw form and how many match the normalised
# form, to see the effect of lower-casing and prefix removal.
raw_matches = df_np['entity_form'] == 'Allemagne'
norm_matches = df_np['entity_form_norm'] == 'allemagne'
print('Allemagne in entity_form:', raw_matches.sum())
print('allemagne in entity_form_norm:', norm_matches.sum())

Allemagne in entity_form: 804
allemagne in entity_form_norm: 1620


In [16]:
# Number of distinct normalised forms (missing values, if any, count as one form).
df_np['entity_form_norm'].nunique(dropna=False)

48797

In [17]:
# Distinct normalised forms per entity tag.
np_groups = df_np.groupby('entity_tag')
for tag_name, tag_rows in np_groups:
    n_forms = len(tag_rows['entity_form_norm'].unique())
    print(tag_name, n_forms)


NP-Misc 1212
NP-Person 13103
NP-Spatial 38820


## Aggregated co-occurrences: homogeneous graph of entities with consolidated edges

* Nodes are NP-Spatial entities
* Edges are aggregated co-occurrences (with frequency as attribute and type: article, paragraph, headword)


In [21]:
# Restrict the entity table to the rows tagged 'NP-Spatial'.
is_spatial = df_np['entity_tag'].eq('NP-Spatial')
df_np_spatial = df_np.loc[is_spatial]
df_np_spatial.head()

Unnamed: 0,entity_form,entity_tag,volume,numero,paragraph_id,entity_form_norm
0,A,NP-Spatial,1,26,1,a
1,France,NP-Spatial,1,26,1,france
2,Fontaines,NP-Spatial,1,26,1,fontaines
4,Sologne,NP-Spatial,1,26,1,sologne
7,AA,NP-Spatial,1,27,1,aa


In [22]:
def create_homogeneous_graph(df, tag=None):
    """Build a directed multigraph of entity co-occurrences.

    Every distinct value of ``df['entity_form_norm']`` becomes a node. For each
    scope -- 'article' (same volume and numero) and 'paragraph' (same volume,
    numero and paragraph_id) -- every ordered pair of *different* entities found
    in the same group is linked by an edge whose key is the scope name. The edge
    attribute ``cooccurrence_freq`` accumulates, over all groups, the number of
    (occurrence of entity1, occurrence of entity2) pairs. Both directions
    (a, b) and (b, a) are stored.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'entity_form_norm', 'volume', 'numero' and
        'paragraph_id'.
    tag : str, optional
        Value stored in the ``type`` attribute of every node; 'NP' when omitted.

    Returns
    -------
    networkx.MultiDiGraph
        Graph with node attributes ``type``, ``is_head``, ``label`` and edge
        attributes ``related_to``, ``cooccurrence_freq``.
    """
    G = nx.MultiDiGraph()
    # https://networkx.org/documentation/stable/reference/classes/multidigraph.html

    node_type = tag if tag is not None else 'NP'
    for form in df['entity_form_norm'].unique():
        is_head = False  # TODO
        # TODO: create nodes while creating edges and add the tag as an attribute
        G.add_node(form, type=node_type, is_head=is_head, label=form)

    print('# nodes:', len(G.nodes))

    # Columns that define one co-occurrence scope, in processing order.
    grouping_columns = {
        'article': ['volume', 'numero'],
        'paragraph': ['volume', 'numero', 'paragraph_id'],
    }
    for related_to, columns in grouping_columns.items():
        print('*', related_to)

        # Group the frame that was passed in (not a notebook global), so nodes
        # and edges always come from the same data.
        for _, group in df.groupby(columns):
            # Count each form once per group. Pairing the counts gives the same
            # frequency as looping over every pair of occurrences, without
            # repeating the edge lookup for duplicated forms.
            occurrences = {}
            for form in group['entity_form_norm'].values:
                occurrences[form] = occurrences.get(form, 0) + 1

            for entity1, count1 in occurrences.items():
                for entity2, count2 in occurrences.items():
                    if entity1 == entity2:
                        continue
                    pair_count = count1 * count2
                    if G.has_edge(entity1, entity2, related_to):
                        G.edges[entity1, entity2, related_to]['cooccurrence_freq'] += pair_count
                    else:
                        G.add_edge(entity1, entity2, key=related_to, related_to=related_to, cooccurrence_freq=pair_count)

    print('# edges:', len(G.edges))
    return G

In [23]:
# Build the co-occurrence graph of the NP-Spatial rows. No tag is passed, so the
# nodes get the default type 'NP'.
G_np_spatial = create_homogeneous_graph(df_np_spatial)

# nodes: 38820
* article
* paragraph
# edges: 1947622


In [24]:
# Spot check: the normalised form 'france' should be a node of the graph.
'france' in G_np_spatial.nodes

True

In [29]:
# Attributes of the france -> paris edge whose key is 'article'.
G_np_spatial.edges['france', 'paris', 'article']

{'related_to': 'article', 'cooccurrence_freq': 2814}

In [31]:
# Rank the edges by their cooccurrence_freq attribute, most frequent first.
sorted_edges = sorted(G_np_spatial.edges(data=True), key=lambda edge: edge[2]['cooccurrence_freq'], reverse=True)

# Display only the head of the ranking: the full list holds one entry per edge
# and is far too long to render as cell output.
sorted_edges[:20]

[('france', 'paris', {'related_to': 'article', 'cooccurrence_freq': 2814}),
 ('paris', 'france', {'related_to': 'article', 'cooccurrence_freq': 2814}),
 ('lydie', 'sardes', {'related_to': 'article', 'cooccurrence_freq': 1463}),
 ('sardes', 'lydie', {'related_to': 'article', 'cooccurrence_freq': 1463}),
 ('asie', 'sardes', {'related_to': 'article', 'cooccurrence_freq': 1393}),
 ('sardes', 'asie', {'related_to': 'article', 'cooccurrence_freq': 1393}),
 ('paris', 'athènes', {'related_to': 'article', 'cooccurrence_freq': 1285}),
 ('athènes', 'paris', {'related_to': 'article', 'cooccurrence_freq': 1285}),
 ('seine', 'paris', {'related_to': 'article', 'cooccurrence_freq': 1277}),
 ('paris', 'seine', {'related_to': 'article', 'cooccurrence_freq': 1277}),
 ('angleterre',
  'londres',
  {'related_to': 'article', 'cooccurrence_freq': 1155}),
 ('londres',
  'angleterre',
  {'related_to': 'article', 'cooccurrence_freq': 1155}),
 ('paris', 'louvre', {'related_to': 'article', 'cooccurrence_freq': 11

In [32]:
# Export the NP-Spatial co-occurrence graph to a GEXF file.
nx.write_gexf(G_np_spatial, '../data/network-NP-Spatial.gexf')

## Mixed nodes graph: heterogeneous graph of entities, paragraphs, and articles

* nodes are NP-Spatial entities, paragraphs, and articles
* edges encode co-occurrences: each article is linked to its paragraphs, and each paragraph to the entities it mentions

In [33]:
def create_homogeneous_graph(df, tag=None):
    """Build the mixed (entity / paragraph / article) undirected multigraph.

    NOTE: this definition reuses the name of the co-occurrence graph builder
    defined earlier in the notebook and replaces it once this cell has run.

    Nodes:
      * one per distinct ``df['entity_form_norm']`` value
        (attributes ``type``, ``is_head``, ``label``);
      * one per article, named '<volume>_<numero>' (``type='article'``);
      * one per paragraph, named '<volume>_<numero>_<paragraph_id>'
        (``type='paragraph'``).

    Edges (no attributes): article -- paragraph, and paragraph -- entity, one
    edge per entity occurrence. An entity repeated in a paragraph therefore
    yields parallel edges.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'entity_form_norm', 'volume', 'numero' and
        'paragraph_id'.
    tag : str, optional
        Value stored in the ``type`` attribute of entity nodes; 'NP' when omitted.

    Returns
    -------
    networkx.MultiGraph
    """
    G = nx.MultiGraph()
    # https://networkx.org/documentation/stable/reference/classes/multigraph.html

    node_type = tag if tag is not None else 'NP'
    for form in df['entity_form_norm'].unique():
        is_head = False  # TODO !!!
        # TODO: create nodes while creating edges and add the tag as an attribute
        G.add_node(form, type=node_type, is_head=is_head, label=form)

    print('# nodes of entities:', len(G.nodes))

    cpt_article = 0
    cpt_paragraph = 0
    for (volume, numero), df_article in df.groupby(['volume', 'numero']):
        node_article = str(volume) + '_' + str(numero)  # TODO: add _headword
        G.add_node(node_article, type='article', label=node_article)
        cpt_article += 1

        # Group on the bare column name so the key is always a scalar. With a
        # one-element list the key is a 1-tuple on recent pandas but a scalar on
        # older releases, which made the former `paragraph_id[0]` version-dependent.
        for paragraph_id, df_paragraph in df_article.groupby('paragraph_id'):
            node_paragraph = node_article + '_' + str(paragraph_id)  # TODO: add _headword
            G.add_node(node_paragraph, type='paragraph', label=node_paragraph)
            cpt_paragraph += 1

            G.add_edge(node_article, node_paragraph)
            for entity in df_paragraph['entity_form_norm'].values:
                G.add_edge(node_paragraph, entity)

    print('# nodes of articles:', cpt_article)
    print('# nodes of paragraphs:', cpt_paragraph)
    print('# edges:', len(G.edges))

    return G

In [34]:
# Build the mixed graph from every NP row (df_np, not only the NP-Spatial subset).
mixed_G = create_homogeneous_graph(df_np)

# nodes of entities: 48797
# nodes of articles: 15336
# nodes of paragraphs: 27113
# edges: 207829


In [36]:
# Spot check: attributes stored on the node of article volume 1, numero 26.
mixed_G.nodes['1_26']

{'type': 'article', 'label': '1_26'}

In [37]:
# Preview a few edges only: listing every edge floods the cell output.
list(mixed_G.edges)[:20]

[('a', '1_26_1', 0),
 ('a', '1_56_1', 0),
 ('a', '2_5839_17', 0),
 ('a', '7_168_115', 0),
 ('a', '7_168_117', 0),
 ('a', '7_168_133', 0),
 ('a', '7_2239_37', 0),
 ('a', '16_2341_5', 0),
 ('france', '1_26_1', 0),
 ('france', '1_27_1', 0),
 ('france', '1_142_1', 0),
 ('france', '1_890_1', 0),
 ('france', '1_1065_1', 0),
 ('france', '1_1087_1', 0),
 ('france', '1_1103_1', 0),
 ('france', '1_1105_1', 0),
 ('france', '1_1210_1', 0),
 ('france', '1_1219_1', 0),
 ('france', '1_1243_1', 0),
 ('france', '1_1244_1', 0),
 ('france', '1_1279_1', 0),
 ('france', '1_1285_1', 0),
 ('france', '1_1286_1', 0),
 ('france', '1_1292_1', 0),
 ('france', '1_1312_1', 0),
 ('france', '1_1313_1', 0),
 ('france', '1_1365_1', 0),
 ('france', '1_1369_1', 0),
 ('france', '1_1445_1', 0),
 ('france', '1_1465_1', 0),
 ('france', '1_1467_1', 0),
 ('france', '1_1468_1', 0),
 ('france', '1_1497_1', 0),
 ('france', '1_1499_1', 0),
 ('france', '1_1509_1', 0),
 ('france', '1_1535_1', 0),
 ('france', '1_1565_1', 0),
 ('franc

In [35]:
# Export the mixed graph to a GEXF file.
# NOTE(review): the file name says "NP-Spatial", but mixed_G is built from df_np
# (all NP tags) -- confirm which one is intended.
nx.write_gexf(mixed_G, '../data/network-mixed_NP-Spatial.gexf')
   

In [None]:

# NOTE(review): this cell has no execution count, so it was apparently never run.
# Drawing the whole graph (tens of thousands of nodes per the counts printed
# above) is presumably very slow and unreadable -- consider drawing a small
# subgraph instead; TODO confirm.
nx.draw(mixed_G)