# Create co-occurrence networks

This notebook creates co-occurrence networks of names in the selected Bible passages and exports them to .gexf files: one network per chapter, one per book, or a single network for all selected books and chapters.

## User variables

In [22]:
# which Bible passages to create co-occurrence networks for
# key   = book name
# value = [first chapter, first verse, last chapter, last verse]
# -1 is replaced by infinity during initialization, so use it for the last
# chapter/verse to select everything up to the end of a book/chapter
passages = {
    "Samuel_I": [1,1,-1,-1]
}

# what range the co-occurrence networks should have
# "chapter" = one network and one file per chapter
# "book"    = one network and one file per book
# any other value (e.g. "bible") = a single network and file for all selected passages
network_range = "book"

# named entity types to include (types that are not listed are excluded):
# "pers" = person
# "mens" = measurement unit
# "gens" = people
# "topo" = place
# "ppde" = demonstrative personal pronoun
# ""     = not specified (seems to be the 'gentilic' words, i.e. from which country someone is)
allowed_nametypes = ["pers"]
#allowed_nametypes = ["pers", "mens", "gens", "topo", "ppde", ""]

# weight threshold for edges: an edge is only written to the output file
# when its weight is strictly greater than this value
min_edge_weight = 0.1

def get_weight(src_sentence, tgt_sentence):
    """Return the weight contributed by two names, given their sentence numbers.

    The weight is 1 / (d + 1)^2, where d is the absolute difference between
    the two sentence numbers. Names in the same sentence therefore contribute
    1, names one sentence apart contribute 1/4, and so on.
    """
    sentence_gap = abs(src_sentence - tgt_sentence) + 1
    return 1 / sentence_gap ** 2

## Initialize and import

Import the Python modules, the plot modules, the LAF-Fabric module (``laf``) and initialize the ``laf`` processor.

In [23]:
import sys
import collections
import re
import matplotlib.pyplot as plt
import sys
import os
from laf.fabric import LafFabric
# create the LAF-Fabric processor object; the data is loaded through it in the next cell
fabric = LafFabric()

  0.00s This is LAF-Fabric 4.5.4
API reference: http://laf-fabric.readthedocs.org/en/latest/texts/API-reference.html
Feature doc: http://shebanq-doc.readthedocs.org/en/latest/texts/welcome.html



In [24]:
# Load the 'etcbc4' source with the 'lexicon' annotations. The third argument is
# the selected book names joined by underscores; the log output reports it as the
# TASK name and uses it for the output directory.
# "features" lists the node features that the later cells read through F.<name>.v(node).
fabric.load('etcbc4', 'lexicon', '_'.join(passages.keys()),
{
    "primary": False,
    "xmlids": {"node": False, "edge": False},
    "features": ("otype book chapter verse number lex_utf8 gloss sp ls nametype", ""),
},)
# NOTE(review): presumably this defines the API names used in the cells below
# (NN, F, msg, outfile, close) - they are not defined anywhere else in this
# notebook. Confirm against the LAF-Fabric API reference.
exec(fabric.localnames.format(var='fabric'))

  0.00s LOADING API: please wait ... 
  0.02s INFO: USING DATA COMPILED AT: 2014-10-23T15-58-52
  0.02s INFO: USING DATA COMPILED AT: 2014-11-27T12-37-00
    14s LOGFILE=C:\Users\Frederik/laf-fabric-output/etcbc4/Judices/__log__Judices.txt
    14s INFO: DATA LOADED FROM SOURCE etcbc4 AND ANNOX lexicon FOR TASK Judices AT 2016-03-30T14-46-40


In [25]:
# Fixed preamble written at the top of every exported .gexf file: the XML
# declaration, the opening <gexf>/<graph> elements and the declaration of the
# two node attributes ("occurrence" and "nametype") that each node fills in.
# Both namespace URIs point at the 1.2draft schema so that they agree with
# version="1.2"; the viz namespace is a well-formed URI (two slashes, not three).
data_header = '''<?xml version="1.0" encoding="UTF-8"?>
<gexf xmlns:viz="http://www.gexf.net/1.2draft/viz" xmlns="http://www.gexf.net/1.2draft" version="1.2">
<meta>
<creator>LAF-Fabric</creator>
</meta>
<graph defaultedgetype="undirected" idtype="string" type="static">
<attributes class="node" mode="static">
<attribute id="occurrence" title="occurrence" type="string"></attribute>
<attribute id="nametype" title="nametype" type="string"></attribute>
</attributes>
'''

Initialization

In [26]:
# an open-ended bound (-1) in 'passages' becomes infinity, so that every
# chapter/verse number compares as smaller than or equal to it
for bounds in passages.values():
    for position in range(len(bounds)):
        if bounds[position] == -1:
            bounds[position] = float('inf')

# books found in the selected passages, in the order they are encountered
books = []
# [lexeme, label] pairs that have been handed out so far
unique_labels = []
# properties of each node, keyed by the lexeme qualified with its network range
unique_nodes = collections.defaultdict(
    lambda: collections.defaultdict(
        lambda: collections.defaultdict(
            lambda: collections.defaultdict(lambda: 0)
        )
    )
)
def get_unique_lexeme(book_name, chapter_nr, lexeme):
    """Qualify a lexeme with the scope of the network it belongs to.

    With network_range "chapter" the result is "<book>_<chapter>_<lexeme>",
    with "book" it is "<book>_<lexeme>", and with any other setting the
    lexeme is returned unchanged.
    """
    if network_range not in ("chapter", "book"):
        return lexeme
    if network_range == "book":
        return "{}_{}".format(book_name, lexeme)
    return "{}_{}_{}".format(book_name, chapter_nr, lexeme)

# whether the verse currently being walked through lies inside a selected passage
correct_verse = False
# next free numeric id for a node
node_id = 0
# lexemes[book][chapter][sentence][lexeme] -> number of occurrences
lexemes = collections.defaultdict(
    lambda: collections.defaultdict(
        lambda: collections.defaultdict(
            lambda: collections.defaultdict(lambda: 0)
        )
    )
)
# edge_weight[source lexeme][target lexeme] -> {"weight": ..., "occurrence": ...}
edge_weight = collections.defaultdict(
    lambda: collections.defaultdict(
        lambda: collections.defaultdict(lambda: 0)
    )
)

## Collect data

Walk through the relevant nodes and collect the data:

In [27]:
# Walk over all nodes and remember, per selected book / chapter / sentence, which
# names occur there. Every name also gets a node entry in 'unique_nodes'.
# NOTE(review): this relies on NN() yielding a verse node before the sentence and
# word nodes that belong to that verse - confirm against the LAF-Fabric API reference.
for node in NN():
    this_type = F.otype.v(node)

    # when arriving at a new verse, check if that verse is in the allowed passages
    # (the result stays in effect for all following nodes until the next verse node)
    if this_type == "verse":
        correct_verse = False
        this_book = F.book.v(node)

        # cv = [first chapter, first verse, last chapter, last verse]
        for b, cv in passages.items():
            this_chapter = int(F.chapter.v(node))

            if this_book == b and this_chapter >= cv[0] and this_chapter <= cv[2]:
                this_verse = int(F.verse.v(node))

                # passage inside a single chapter: both verse bounds apply
                if cv[0] == cv[2]:
                    if this_verse >= cv[1] and this_verse <= cv[3]:
                        correct_verse = True
                        break
                # passage spanning several chapters: the first-verse bound only applies
                # in the first chapter, the last-verse bound only in the last chapter
                elif (
                    (this_chapter == cv[0] and this_verse >= cv[1]) or
                    (this_chapter > cv[0] and this_chapter < cv[2]) or
                    (this_chapter == cv[2] and this_verse <= cv[3])
                   ):

                    correct_verse = True
                    break

    # if the verse is allowed, continue
    if correct_verse:
        # register each book once, in order of first appearance
        if this_book not in books:
            books.append(this_book)

        # remember the number of the sentence the following words belong to
        if this_type == "sentence":
            this_sentence = int(F.number.v(node))

        if this_type == "word":
            lexeme = F.lex_utf8.v(node)
            nametype = F.nametype.v(node)
            # nametype is split on commas; one allowed type among the parts is enough
            allowed_nametype = any(x in nametype.split(',') for x in allowed_nametypes)
            unique_lexeme = get_unique_lexeme(this_book, this_chapter, lexeme)

            # if part of speech == proper noun or lexical set == gentilic and nametype is allowed
            if (F.sp.v(node) == 'nmpr' or F.ls.v(node) == 'gntl') and allowed_nametype:
                lexemes[this_book][this_chapter][this_sentence][lexeme] += 1
                # "<book>_<chapter>," - the trailing comma separates entries when several are concatenated
                occurrence = "{}_{},".format(this_book, this_chapter)

                # if the word is not yet in the chapter / book / bible, add it
                if unique_lexeme not in unique_nodes:
                    unique_nodes[unique_lexeme]["id"] = node_id
                    node_id += 1

                    # the label is the gloss with all non-word characters removed
                    english_name = re.sub(r'\W+', '', F.gloss.v(node))

                    # A lexeme that already has a label keeps it. Otherwise "2" is appended
                    # for as long as the label is already taken, stopping early when the
                    # extended label turns out to be registered for this same lexeme.
                    if [lexeme, english_name] not in unique_labels:
                        english_names = [x[1] for x in unique_labels]
                        while english_name in english_names and [lexeme, english_name] not in unique_labels:
                            english_name += "2"
                        if [lexeme, english_name] not in unique_labels:
                            unique_labels.append([lexeme, english_name])

                    unique_nodes[unique_lexeme]["gloss"] = english_name
                    unique_nodes[unique_lexeme]["nametype"] = nametype
                    unique_nodes[unique_lexeme]["occurrence"] = occurrence
                # otherwise add the occurrence information
                else:
                    # only append when the most recent entry is a different book/chapter
                    if not unique_nodes[unique_lexeme]["occurrence"].endswith(occurrence):
                        unique_nodes[unique_lexeme]["occurrence"] += occurrence

msg("Done")

 1m 14s END
    14s Done


## Compute and export

Group the collected names per chapter, compute the edges with their weights, and write every completed network to a .gexf file.

In [28]:
# Compute the edge weights chapter by chapter and write a .gexf file every time
# a network (chapter, book or everything, depending on 'network_range') is complete.

# Flat, ordered list of the (book, chapter) pairs that contain at least one name.
# Working from this list means "last chapter of the book" and "last chapter
# overall" are still detected when a selected book contributed no names at all.
chapter_keys = [
    (book_name, chapter_nr)
    for book_name in books
    for chapter_nr in lexemes[book_name]
]
last_index = len(chapter_keys) - 1

for idx, (book_name, chapter_nr) in enumerate(chapter_keys):
    # "<book>_<chapter>," - recorded on every edge that is seen in this chapter
    occurrence = "{}_{},".format(book_name, chapter_nr)

    # read all lexemes from the chapter as [lexeme, sentence number] pairs,
    # so the distance between two names can be calculated
    names = []
    for sentence, counts in lexemes[book_name][chapter_nr].items():
        names.extend([name, sentence] for name in counts)

    # calculate the edge weights between every pair of names in the chapter
    for src_pos, (lexemes_src, src_sentence) in enumerate(names):
        for tgt_pos in range(src_pos + 1, len(names)):
            lexemes_tgt, tgt_sentence = names[tgt_pos]
            if lexemes_src == lexemes_tgt:
                continue

            # weight depends on how many sentences the names are away from each other
            this_weight = get_weight(src_sentence, tgt_sentence)

            # the graph is undirected: if the pair was first stored the other way
            # around, keep accumulating on that existing edge
            if edge_weight[lexemes_tgt][lexemes_src]["weight"] > 0:
                edge = edge_weight[lexemes_tgt][lexemes_src]
            else:
                edge = edge_weight[lexemes_src][lexemes_tgt]

            if not edge["occurrence"]:
                edge["occurrence"] = occurrence
            elif not edge["occurrence"].endswith(occurrence):
                edge["occurrence"] += occurrence
            edge["weight"] += this_weight

    # a network is complete after every chapter ("chapter"), after the last
    # chapter of a book ("book"), or after the very last chapter (any range)
    is_last_chapter_of_book = (
        idx == last_index or chapter_keys[idx + 1][0] != book_name
    )
    network_complete = (
        network_range == "chapter"
        or (network_range == "book" and is_last_chapter_of_book)
        or idx == last_index
    )
    if not network_complete:
        continue

    # create the nodes: node keys start with the prefix of the network they
    # belong to (see get_unique_lexeme); without a prefix every node is included
    if network_range == "chapter":
        node_prefix = "{}_{}_".format(book_name, chapter_nr)
    elif network_range == "book":
        node_prefix = "{}_".format(book_name)
    else:
        node_prefix = ""

    node_data = [
        '''<node id="{}" label="{}">\n  <attvalues><attvalue for="occurrence" value="{}"/><attvalue for="nametype" value="{}"/></attvalues>\n</node>\n'''.format(
            props["id"], props["gloss"], props["occurrence"], props["nametype"])
        for node_key, props in unique_nodes.items()
        if node_key.startswith(node_prefix)
    ]

    # create the edges, skipping those that do not exceed the minimum weight
    edge_id = 0
    edge_data = []
    for src, targets in edge_weight.items():
        for tgt, edge in targets.items():
            if edge["weight"] > min_edge_weight:
                edge_id += 1
                src_node = unique_nodes[get_unique_lexeme(book_name, chapter_nr, src)]
                tgt_node = unique_nodes[get_unique_lexeme(book_name, chapter_nr, tgt)]

                edge_data.append('''<edge id="{}" source="{}" target="{}" weight="{}" label="{}"/>\n'''.
                format(edge_id, src_node["id"], tgt_node["id"], round(edge["weight"], 2), edge["occurrence"]))

    # export the file
    if network_range == "chapter":
        filename = 'Bible_{}_{}.gexf'.format(book_name, str(chapter_nr).zfill(2))
    elif network_range == "book":
        filename = 'Bible_{}.gexf'.format(book_name)
    else:
        filename = 'Bible.gexf'
    out_file = outfile(filename)

    out_file.write(data_header)

    out_file.write('''<nodes count="{}">\n\n'''.format(len(node_data)))
    out_file.write("".join(node_data))
    out_file.write("</nodes>\n")

    out_file.write('''<edges count="{}">\n'''.format(len(edge_data)))
    out_file.write("".join(edge_data))
    out_file.write("</edges>\n")
    out_file.write("</graph></gexf>\n")

    msg("chapter:  {}; nodes:  {}; edges: {}".format(chapter_nr, len(node_data), len(edge_data)))

    # clear the edge weights for the next network
    edge_weight = collections.defaultdict(lambda: collections.defaultdict(lambda: collections.defaultdict(lambda: 0)))
close()

    16s chapter:  21; nodes:  89; edges: 385
    16s Results directory:
C:\Users\Frederik/laf-fabric-output/etcbc4/Judices

Bible_Judices.gexf                    47570 Wed Mar 30 16:46:56 2016
__log__Judices.txt                      282 Wed Mar 30 16:46:40 2016
