# Create co-occurrence networks

This notebook creates co-occurrence networks and exports them to .gexf files: either one network per book or per chapter, or a single network for all selected books and chapters.

## User variables

In [1]:
# Which Bible passages to create co-occurrence networks for.
# Each entry maps a book name to [start_chapter, start_verse, end_chapter, end_verse].
# -1 matches the last chapter/verse. Useful when selecting a whole book/chapter.
passages = {
    "Psalmi": [3,1,3,-1]
}

# Export path for the .gexf files. Exports to the notebook path if empty.
path = ""

# What range the co-occurrence networks should have.
# Creates a new file and network for every "chapter" or every "book";
# any other value (e.g. "bible") creates one single network.
network_range = "book"

# include / exclude named entity types:
# "pers" = person
# "mens" = measurement unit
# "gens" = people
# "topo" = place
# "ppde" = demonstrative personal pronoun
# ""     = not specified (seems to be the 'gentilic' words, i.e. from which country someone is)
allowed_nametypes = ["pers", "mens", "gens", "topo", "ppde", ""]
#allowed_nametypes = ["pers"]

# Enable other_words to allow words apart from names (personal pronouns,
# demonstrative pronouns and words with a suffix) to be included in the network.
other_words = True
# allowed_prs lists the phrase functions that are recorded as the
# "phrase_function" property of a node.
allowed_prs = ['PreC', 'Pred', 'PreO', 'PreS', 'PtcO']

# Minimum weight threshold: an edge is only written when its sentence weight
# or its word weight is strictly greater than this value.
min_edge_weight = 0.1

# formula for the weight between words dependent on the distance in amount of sentences / words
def get_weight_sentence(src, tgt):
    """Return the weight for two sentence numbers: 1 / (distance + 1) squared."""
    distance = abs(src - tgt)
    return 1 / (distance + 1) ** 2
def get_weight_word(src, tgt):
    """Return the weight for two word numbers: 1 / (distance + 1)."""
    distance = abs(src - tgt)
    return 1 / (distance + 1)

## Initialize and import

Import the Python modules, including `networkx` and the LAF-Fabric module (``laf``), and initialize the ``laf`` processor.

In [2]:
import sys
import collections
import re
import sys
import os
import json
import time
import networkx as nx
from laf.fabric import LafFabric
fabric = LafFabric()

  0.00s This is LAF-Fabric 4.5.4
API reference: http://laf-fabric.readthedocs.org/en/latest/texts/API-reference.html
Feature doc: http://shebanq-doc.readthedocs.org/en/latest/texts/welcome.html



In [3]:
# Load the 'etcbc4' source with the 'lexicon' annotation package. The task name
# is built from the selected book names joined by underscores.
# The "features" entry lists the node features that the later cells read through F.
# NOTE(review): the second, empty string in "features" presumably lists edge
# features (none requested) — confirm against the LAF-Fabric API reference.
fabric.load('etcbc4', 'lexicon', '_'.join(passages.keys()),
{
    "primary": False,
    "xmlids": {"node": False, "edge": False},
    "features": ("otype book chapter verse number lex_utf8 gloss sp prs ls nametype g_prs_utf8 ps nu gn function", ""),
},)
# NOTE(review): presumably this exec defines the API shortcuts used by the later
# cells (F, NN, msg), since they are not defined anywhere else in this notebook — confirm.
exec(fabric.localnames.format(var='fabric'))

  0.00s LOADING API: please wait ... 
  0.03s INFO: USING DATA COMPILED AT: 2014-10-23T15-58-52
  0.07s INFO: USING DATA COMPILED AT: 2014-11-27T12-37-00
  7.58s LOGFILE=C:\Users\Frederik/laf-fabric-output/etcbc4/Psalmi/__log__Psalmi.txt
  7.58s INFO: DATA LOADED FROM SOURCE etcbc4 AND ANNOX lexicon FOR TASK Psalmi AT 2016-09-28T10-05-58


(Re)set all variables. If you want to run the notebook again, also run this code block again to make sure no leftovers from the previous run are used.

In [4]:
# Swap every -1 in the 'passages' lists for infinity (in place), so that the
# open end compares greater than any chapter / verse number.
for chapter_verse in passages.values():
    chapter_verse[:] = [
        float('inf') if value == -1 else value
        for value in chapter_verse
    ]

# Maps the value of the 'prs' feature (pronominal suffix) to a one-element list
# holding the person label that is stored on the node.
suffix_person_dict = {
    suffix: [person]
    for person, suffixes in (
        ('I', ('NJ', 'J')),
        ('We', ('NW',)),
        ('Myou', ('K',)),
        ('Fyou', ('K=',)),
        ('Mpyou', ('KM',)),
        ('Fpyou', ('KN',)),
        ('He', ('W', 'HW')),
        ('She', ('H', 'HJ')),
        ('Mpthey', ('HM', 'M', 'MW')),
        ('Fpthey', ('HN', 'N')),
        ('', ('absent', 'n/a')),
    )
    for suffix in suffixes
}

def get_unique_lexeme(book_name, chapter_nr, lexeme):
    """Return the key under which a lexeme is stored for the current network_range.

    With network_range "chapter" the key is "<book>_<chapter>_<lexeme>", with
    "book" it is "<book>_<lexeme>"; for any other range the lexeme itself is
    returned unchanged.
    """
    if network_range == "chapter":
        prefix_parts = (book_name, chapter_nr)
    elif network_range == "book":
        prefix_parts = (book_name,)
    else:
        return lexeme
    return "_".join("{}".format(part) for part in prefix_parts + (lexeme,))

# book name -> list of chapters that were selected in that book
bible_index = collections.defaultdict(list)
# [lexeme, english gloss] pairs that have already been handed out as labels
unique_labels = list()
# unique lexeme key -> node record; missing entries are created on access,
# four levels deep, with 0 as the innermost default
unique_nodes = collections.defaultdict(
    lambda: collections.defaultdict(
        lambda: collections.defaultdict(
            lambda: collections.defaultdict(int)
        )
    )
)
# whether the verse currently being walked lies inside the selected passages
correct_verse = False
# running counter used as the numeric id of the next node
node_id = 0

## Collect data

Walk through the relevant nodes and collect the data:

In [5]:
# Walk over all nodes delivered by NN() and store, for every relevant word inside
# the selected passages, a node record in unique_nodes.
# NOTE(review): the loop uses this_verse, sentence_nr and this_phrase from earlier
# iterations, so it relies on verse / sentence / phrase nodes being visited before
# the words they contain — presumably NN() guarantees that order; confirm.
for node in NN():
    this_type = F.otype.v(node)

    # when arriving at a new verse, check if that verse is in the allowed passages
    if this_type == "verse":
        correct_verse = False
        this_book = F.book.v(node)

        # cv = [start_chapter, start_verse, end_chapter, end_verse]
        for b, cv in passages.items():
            this_chapter = int(F.chapter.v(node))

            if this_book == b and this_chapter >= cv[0] and this_chapter <= cv[2]:
                this_verse = int(F.verse.v(node))

                # selection within a single chapter: the verse must lie between both bounds
                if cv[0] == cv[2]:
                    if this_verse >= cv[1] and this_verse <= cv[3]:
                        correct_verse = True
                        break
                # selection spanning several chapters: check the start verse only in the
                # first chapter and the end verse only in the last chapter
                elif (
                    (this_chapter == cv[0] and this_verse >= cv[1]) or
                    (this_chapter > cv[0] and this_chapter < cv[2]) or
                    (this_chapter == cv[2] and this_verse <= cv[3])
                   ):

                    correct_verse = True
                    break

    # if the verse is allowed, continue
    if correct_verse:
        if this_type == "verse":
            this_verse = int(F.verse.v(node))
            # remember which chapters of which books were selected (used for the export)
            if not this_chapter in bible_index[this_book]:
                bible_index[this_book].append(this_chapter)

        elif this_type == "sentence":
            # remember the number of the current sentence for the words that follow
            sentence_nr = int(F.number.v(node))

        elif this_type == "phrase":
            # remember the current phrase node; its function is looked up for verbs below
            this_phrase = node

        elif this_type == "word":
            word_nr = int(F.number.v(node))
            sp = F.sp.v(node)
            lexeme = F.lex_utf8.v(node)
            nametype = F.nametype.v(node)
            # gloss with every run of non-word characters removed
            english_name = re.sub(r'\W+', '', F.gloss.v(node))
            # nametype can hold several comma-separated types; one allowed type is enough
            allowed_nametype = any(x in nametype.split(',') for x in allowed_nametypes)
            unique_lexeme = get_unique_lexeme(this_book, this_chapter, lexeme)

            # add the pronominal suffix (consonantal, transliterated prs) at word level
            prs = F.g_prs_utf8.v(node)
            # raises KeyError when the prs value is not a key of suffix_person_dict
            prs_trans = suffix_person_dict[F.prs.v(node)][0]

            # if (part of speech == proper noun or lexical set == gentilic) and the
            # nametype is allowed
            if (sp == 'nmpr' or F.ls.v(node) == 'gntl') and allowed_nametype:

                # if the word is not yet in the chapter / book / bible, add it
                if unique_lexeme not in unique_nodes:
                    unique_nodes[unique_lexeme]["id"] = node_id
                    node_id += 1

                    # if the english name already exists for another lexeme,
                    # keep appending a "2" until the label is unique (for clarity)
                    if [lexeme, english_name] not in unique_labels:
                        english_names = [x[1] for x in unique_labels]
                        while english_name in english_names and [lexeme, english_name] not in unique_labels:
                            english_name += "2"
                        if [lexeme, english_name] not in unique_labels:
                            unique_labels.append([lexeme, english_name])

                    unique_nodes[unique_lexeme]["gloss"] = english_name
                    unique_nodes[unique_lexeme]["lexeme"] = lexeme
                    unique_nodes[unique_lexeme]["nametype"] = nametype
                    unique_nodes[unique_lexeme]["part_of_speech"] = sp

                    # occurrence[book][chapter] holds the parallel lists
                    # "sentence_nr", "word_nr" and "verse"
                    occurrence = collections.defaultdict(lambda: collections.defaultdict(lambda: collections.defaultdict(lambda: [])))
                    occurrence[this_book][this_chapter]["sentence_nr"] = [sentence_nr]
                    occurrence[this_book][this_chapter]["word_nr"] = [word_nr]
                    occurrence[this_book][this_chapter]["verse"] =  [this_verse]
                    unique_nodes[unique_lexeme]["occurrence"] = occurrence
                # otherwise only add the occurrence information
                else:
                    unique_nodes[unique_lexeme]["occurrence"][this_book][this_chapter]["verse"].append(this_verse)
                    unique_nodes[unique_lexeme]["occurrence"][this_book][this_chapter]["sentence_nr"].append(sentence_nr)
                    unique_nodes[unique_lexeme]["occurrence"][this_book][this_chapter]["word_nr"].append(word_nr)

            ### extensions for Christiaan Erwich

            # each personal pronoun, demonstrative pronoun and word with a suffix gets a unique node
            elif (sp == 'prps' or sp == 'ppde' or prs) and other_words:
                # the node id is appended to the key, so every occurrence gets its own node
                unique_lexeme = '{}_unique{}'.format(unique_lexeme, node_id)
                unique_nodes[unique_lexeme]["id"] = node_id
                node_id += 1

                unique_nodes[unique_lexeme]["gloss"] = english_name
                unique_nodes[unique_lexeme]["lexeme"] = lexeme
                unique_nodes[unique_lexeme]["part_of_speech"] = sp

                occurrence = collections.defaultdict(lambda: collections.defaultdict(lambda: collections.defaultdict(lambda: [])))
                occurrence[this_book][this_chapter]["sentence_nr"] = [sentence_nr]
                occurrence[this_book][this_chapter]["word_nr"] = [word_nr]
                occurrence[this_book][this_chapter]["verse"] = [this_verse]
                unique_nodes[unique_lexeme]["occurrence"] = occurrence

                # if the word has a suffix add suffix-property
                if prs:
                    unique_nodes[unique_lexeme]["suffix"] = prs
                    unique_nodes[unique_lexeme]["suffix_trans"] = prs_trans

                    # if the word is a verb (note: only reached for verbs that carry a suffix)
                    if sp == 'verb':

                        # add pgn-property: person, gender and number concatenated,
                        # with 'u' standing in for every value that is 'unknown'
                        pgn = ''
                        if F.ps.v(node) == 'unknown':
                            pgn += 'u'
                        else:
                            pgn += str(F.ps.v(node))
                        if F.gn.v(node) == 'unknown':
                            pgn += 'u'
                        else:
                            pgn += str(F.gn.v(node))
                        if F.nu.v(node) == 'unknown':
                            pgn += 'u'
                        else:
                            pgn += str(F.nu.v(node))

                        unique_nodes[unique_lexeme]["pgn"] = pgn

                        # if the phrase function is present in allowed_prs add phrase_function-property
                        phrase_function = F.function.v(this_phrase)
                        if phrase_function in allowed_prs:
                            unique_nodes[unique_lexeme]["phrase_function"] = phrase_function

msg("Done")

  3.03s Done


In [6]:
# show what's in a dict in a readable way (show the first 10000 characters)
import json
print(json.dumps(unique_nodes, sort_keys=False, indent=4)[:10000])

{
    "Psalmi_\u05d1\u05e8\u05d7[_unique1": {
        "pgn": "uuu",
        "suffix": "+\u05b9\u05d5",
        "id": 1,
        "occurrence": {
            "Psalmi": {
                "3": {
                    "sentence_nr": [
                        1
                    ],
                    "verse": [
                        1
                    ],
                    "word_nr": [
                        208
                    ]
                }
            }
        },
        "lexeme": "\u05d1\u05e8\u05d7[",
        "phrase_function": "PreS",
        "suffix_trans": "He",
        "gloss": "runaway",
        "part_of_speech": "verb"
    },
    "Psalmi_\u05d0\u05e0\u05d9_unique16": {
        "occurrence": {
            "Psalmi": {
                "3": {
                    "sentence_nr": [
                        14
                    ],
                    "verse": [
                        6
                    ],
                    "word_nr": [
                        249


## Compute and export

In [8]:
def add_edge(G, src_node, tgt_node):
    """Add a weighted edge between two node records when they co-occur closely enough.

    For every book/chapter in which both nodes occur, every pair of sentence
    numbers contributes get_weight_sentence(...) to the sentence weight and
    every pair of word numbers contributes get_weight_word(...) to the word
    weight. Both sums are rounded to two decimals; the edge weight is their
    (rounded) sum. The edge is only added when the sentence weight or the word
    weight is strictly greater than the module-level min_edge_weight.

    Parameters:
        G: graph object providing add_edge(u, v, **attributes).
        src_node, tgt_node: node records with an "id" and an "occurrence"
            mapping book -> chapter -> {"sentence_nr": [...], "word_nr": [...]}.

    Returns:
        G, the same graph object that was passed in.
    """
    sentence_weight = 0
    word_weight = 0

    # The occurrence mappings are nested defaultdicts: indexing a missing
    # book/chapter would silently insert an empty entry into the node record.
    # Use .get() for all look-ups so that comparing two nodes never changes them.
    tgt_occurrence = tgt_node["occurrence"]
    for book, src_chapters in src_node["occurrence"].items():
        tgt_chapters = tgt_occurrence.get(book)
        if not tgt_chapters:
            continue

        for chapter, src_lists in src_chapters.items():
            tgt_lists = tgt_chapters.get(chapter)
            # only compare when the target occurs in the same book -> chapter
            if not tgt_lists:
                continue

            # compare sentence nrs to get sentence_weight
            tgt_sentences = tgt_lists.get("sentence_nr", ())
            for sentence_src in src_lists.get("sentence_nr", ()):
                for sentence_tgt in tgt_sentences:
                    sentence_weight += get_weight_sentence(sentence_src, sentence_tgt)

            # compare word nrs to get word_weight
            tgt_words = tgt_lists.get("word_nr", ())
            for word_src in src_lists.get("word_nr", ()):
                for word_tgt in tgt_words:
                    word_weight += get_weight_word(word_src, word_tgt)

    sentence_weight = round(sentence_weight, 2)
    word_weight = round(word_weight, 2)
    weight = round(sentence_weight + word_weight, 2)

    if sentence_weight > min_edge_weight or word_weight > min_edge_weight:
        G.add_edge(
            src_node["id"],
            tgt_node["id"],
            weight = weight,
            sentence_weight = sentence_weight,
            word_weight = word_weight
        )

    return G

def add_node(G, node):
    """Add one node record to the graph with its properties as node attributes.

    The optional properties "nametype", "suffix", "suffix_trans" and "pgn" are
    exported as empty strings when the record does not have them. The
    "occurrence" attribute is a string of "<book>_<chapter>_<verse>," items,
    one per recorded verse (each item ends with a comma).

    Parameters:
        G: graph object providing add_node(id, **attributes).
        node: node record (mapping) with "id", "gloss", "lexeme",
            "part_of_speech" and an "occurrence" mapping
            book -> chapter -> {"verse": [...]}.

    Returns:
        G, the same graph object that was passed in.
    """
    # Node records are defaultdicts: indexing a missing property would insert
    # an empty entry into the record. Read optional properties with .get() so
    # that exporting a node leaves the record untouched.
    nametype = node.get("nametype") or ''
    suffix = node.get("suffix") or ''
    # the transliterated suffix is only exported together with a suffix
    suffix_trans = (node.get("suffix_trans") or '') if suffix else ''
    pgn = node.get("pgn") or ''

    occurrence = ''
    for book, chapters in node["occurrence"].items():
        for chapter, lists in chapters.items():
            for verse in lists.get("verse", ()):
                occurrence += '{}_{}_{},'.format(book, chapter, verse)

    G.add_node(
        node["id"],
        gloss = node["gloss"],
        lexeme = node["lexeme"],
        part_of_speech = node["part_of_speech"],
        nametype = nametype,
        suffix = suffix,
        suffix_trans = suffix_trans,
        pgn = pgn,
        occurrence = occurrence
    )

    return G

def create_network(unique_nodes, r_book='', r_chapter=''):
    """Build the co-occurrence graph for one chapter, one book or all nodes.

    Parameters:
        unique_nodes: mapping of unique lexeme key -> node record.
        r_book: book to restrict the network to; empty for no restriction.
        r_chapter: chapter (within r_book) to restrict the network to; empty
            to take the whole book.

    Node selection:
        * r_book and r_chapter both empty: every node is selected;
        * only r_book given: nodes with at least one occurrence in that book;
        * both given: nodes with at least one occurrence in that book/chapter.

    Returns:
        A networkx Graph holding the selected nodes and the edges that
        add_edge accepts between every pair of them.
    """
    def has_data(chapter_lists):
        # a chapter entry only counts when at least one of its lists is non-empty
        return bool(chapter_lists) and any(chapter_lists.values())

    def in_range(node):
        if not r_book and not r_chapter:
            return True
        # The occurrence mapping is a nested defaultdict. Indexing it with
        # [r_book][r_chapter] would create occurrence[r_book] for every node,
        # after which a "does this node occur in r_book" test succeeds for all
        # nodes. Look the entries up with .get() so nothing is inserted.
        chapters = node["occurrence"].get(r_book)
        if not chapters:
            return False
        if not r_chapter:
            return any(has_data(lists) for lists in chapters.values())
        return has_data(chapters.get(r_chapter))

    # create list with only nodes for current chapter/book/bible
    selected_nodes = [node for node in unique_nodes.values() if in_range(node)]

    # create graph: add every node once and try an edge for every unordered pair
    G = nx.Graph()
    for src_idx, src_node in enumerate(selected_nodes):
        G = add_node(G, src_node)

        for tgt_node in selected_nodes[src_idx + 1:]:
            G = add_edge(G, src_node, tgt_node)

    return G

def _export_network(G, name_parts, **gexf_options):
    """Write G to "Bible_<name parts>_<timestamp>.gexf" inside the export path."""
    file_name = "Bible_{}.gexf".format(
        "_".join(str(part) for part in list(name_parts) + [time.time()])
    )
    # os.path.join inserts the separator when 'path' lacks a trailing one and
    # returns the bare file name (notebook directory) when 'path' is empty.
    nx.write_gexf(
        G,
        os.path.join(path, file_name),
        encoding = "utf-8",
        prettyprint = True,
        **gexf_options
    )

if network_range == "chapter":
    # one network and file per selected chapter
    for r_book, r_chapters in bible_index.items():
        for r_chapter in r_chapters:
            G = create_network(unique_nodes, r_book, r_chapter)
            _export_network(G, [r_book, r_chapter])
elif network_range == "book":
    # one network and file per selected book
    for r_book in bible_index.keys():
        G = create_network(unique_nodes, r_book)
        _export_network(G, [r_book])
else:
    # one network and file for everything that was selected
    G = create_network(unique_nodes)
    # NOTE(review): only this export pins the GEXF version; the chapter and book
    # exports use the networkx default — confirm whether that is intended.
    _export_network(G, [], version='1.1draft')

msg("Done")

26m 03s Done
