# Constructing a ΔP Network Illustration

Using ΔP, we measure how likely a given lexeme X is to prompt the co-occurrence of another lexeme Y within the same phrase.

Model the probabilities as edges in a collostructional network using Gephi. Edge strengths are 
determined by ΔP scores.

Use a graph method to visualize neighborhoods in the network.

In [30]:
from scripts.imports import *

# df_ph = pd.read_csv(
#     paths['phrase_dataset'], 
#     index_col='node', 
#     low_memory=False
# )

# df_sgph = df_ph[
#     (df_ph.n_heads == 1)
#     & (df_ph.n_phatoms == 1)
# ].copy()

# out = Exporter(
#     paths['outdir'], 
#     'headship_tfidf'
# )

In [22]:
import collections
import pandas as pd
import numpy as np

In [6]:
from tf.app import use

# Load the 'bhsa' dataset through the tf.app loader.
bhsa = use('bhsa')

# Short aliases for the API members; F and L are used by the counting loop
# below (F.<feature>.v / F.otype.s and L.d respectively).
F = bhsa.api.F
T = bhsa.api.T
L = bhsa.api.L

In [188]:
# Co-occurrence counts between lexemes that share a Time phrase:
# col_ct[lex_a][lex_b] is the number of word pairings (a, b) observed.
# Every pairing is visited in both directions, so the counts are symmetric.
col_ct = collections.defaultdict(collections.Counter)

# Thresholds and the allowed phrase types never change between phrases,
# so they are defined once instead of on every iteration.
min_freq = 2          # skip a phrase if any of its words is rarer than this
pair_freq_floor = 9   # both lexemes of a counted pair must exceed this
phrase_types = {'NP', 'PP', 'AdjP', 'AdvP'}

for ph in F.otype.s('phrase'):

    # keep only time phrases
    if F.function.v(ph) != 'Time':
        continue

    # keep only phrases with at least two words; a single word has
    # nothing to co-occur with
    phwords = L.d(ph, 'word')
    if len(phwords) < 2:
        continue

    # keep only substantive-like phrases
    if F.typ.v(ph) not in phrase_types:
        continue

    # look up lexeme and lexeme frequency once per word rather than once
    # per pairing inside the nested loop
    lexfreqs = [(F.lex_utf8.v(w), F.freq_lex.v(w)) for w in phwords]

    # poll the lowest frequency and filter by minimum
    if min(freq for _, freq in lexfreqs) < min_freq:
        continue

    # only sufficiently frequent lexemes take part in the pair counts
    frequent = [lex for lex, freq in lexfreqs if freq > pair_freq_floor]

    for lex_a in frequent:
        for lex_b in frequent:
            # count differing lexemes only; this also rules out pairing
            # a word with itself
            if lex_a != lex_b:
                col_ct[lex_a][lex_b] += 1

In [189]:
# Turn the nested counters into a table; lexeme pairs that were never
# counted come out as missing values, so zero-fill them and cast the
# counts back to integers.
col_ct = (
    pd.DataFrame(col_ct)
    .fillna(0)
    .astype(int)
)

In [190]:
# check the dimensions (rows, columns) of the co-occurrence table
col_ct.shape

(302, 302)

In [191]:
# preview the co-occurrence count table
col_ct

Unnamed: 0,ב,ראשׁית,ה,יום,שׁביעי,ל,רוח,כל,חיים,עולם,...,חצרון,כלב,אלהים,מצה,סכה,יהוידע,זכריהו,אמת,נביא,שׁמה
ראשׁית,6,0,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ה,1908,3,0,1644,153,364,1,150,2,19,...,0,0,2,7,7,1,0,3,1,1
יום,797,0,1644,0,47,110,1,191,31,1,...,0,0,0,0,0,2,1,0,1,1
שׁביעי,77,0,153,47,0,12,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
הם,34,0,75,33,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
אישׁ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
הנה,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
כה,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
אן,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [192]:
# Derive the ΔP score table from the raw co-occurrence counts.
# NOTE(review): `sig` is not imported in any visible cell — presumably it
# comes from the star import of scripts.imports; confirm.
# NOTE(review): the meaning of the positional arguments 0 and 1 is not
# visible here — check apply_deltaP's definition, since it presumably
# determines which axis acts as cue and which as outcome.
col_dp = sig.apply_deltaP(col_ct, 0, 1)

# preview the resulting score table
col_dp

Unnamed: 0,ב,ראשׁית,ה,יום,שׁביעי,ל,רוח,כל,חיים,עולם,...,חצרון,כלב,אלהים,מצה,סכה,יהוידע,זכריהו,אמת,נביא,שׁמה
ראשׁית,0.034656,-0.000787,-0.108569,-0.104318,-0.007765,-0.048101,-0.000066,-0.012009,-0.001816,-0.006978,...,-0.000087,-0.000087,-0.000241,-0.000634,-0.000634,-0.000197,-0.000044,-0.000153,-0.000087,-0.000066
ה,0.105636,-0.000551,-0.237343,0.102815,0.011971,-0.008150,0.000060,0.006301,-0.001963,-0.005948,...,-0.000108,-0.000108,-0.000016,0.000203,0.000203,-0.000102,-0.000054,0.000234,0.000033,0.000060
יום,0.039165,-0.000878,0.170703,-0.116365,0.002340,-0.027907,0.000161,0.031315,0.005231,-0.007550,...,-0.000098,-0.000098,-0.000268,-0.000708,-0.000708,0.000249,0.000185,-0.000171,0.000136,0.000161
שׁביעי,0.085527,-0.000793,0.241039,0.028379,-0.007820,-0.014372,-0.000066,-0.012093,-0.001828,-0.007027,...,-0.000088,-0.000088,-0.000242,-0.000639,-0.000639,-0.000198,-0.000044,-0.000154,-0.000088,-0.000066
הם,0.084814,-0.000790,0.286875,0.106320,-0.007786,-0.041838,-0.000066,-0.012041,-0.001820,-0.006996,...,-0.000088,-0.000088,-0.000241,-0.000636,-0.000636,-0.000197,-0.000044,-0.000154,-0.000088,-0.000066
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
אישׁ,-0.132044,-0.000787,-0.191825,-0.104240,-0.007760,-0.048066,-0.000066,-0.012000,-0.001814,-0.006973,...,-0.000087,-0.000087,-0.000240,-0.000634,-0.000634,-0.000197,-0.000044,-0.000153,-0.000087,-0.000066
הנה,-0.132055,-0.000787,-0.191842,-0.104250,-0.007760,-0.048070,-0.000066,-0.012001,-0.001814,-0.006973,...,-0.000087,-0.000087,-0.000240,-0.000634,-0.000634,-0.000197,-0.000044,-0.000153,-0.000087,-0.000066
כה,-0.132044,-0.000787,-0.191825,-0.104240,-0.007760,-0.048066,-0.000066,-0.012000,-0.001814,-0.006973,...,-0.000087,-0.000087,-0.000240,-0.000634,-0.000634,-0.000197,-0.000044,-0.000153,-0.000087,-0.000066
אן,-0.132078,-0.000787,-0.191875,-0.104268,-0.007762,-0.048078,-0.000066,-0.012003,-0.001815,-0.006975,...,-0.000087,-0.000087,-0.000241,-0.000634,-0.000634,-0.000197,-0.000044,-0.000153,-0.000087,-0.000066


# Extract Edge Values for Gephi

In [193]:
# give every unique lexeme string (the row labels of col_dp) an integer node ID
str2id = {lexeme: node_id for node_id, lexeme in enumerate(col_dp.index)}

In [194]:
# Collect the Gephi node rows ([id, label]) and edge rows
# ([source id, target id, weight]) from the ΔP table.
nodes = []
edges = []

for lexi in col_dp.index:
    source_id = str2id[lexi]
    # NOTE(review): get_display is presumably python-bidi's helper; the
    # printed node labels show the characters in reversed order.
    nodes.append([source_id, get_display(lexi)])

    for lexj in col_dp.columns:
        # a lexeme is never linked to itself
        if lexj != lexi:
            # score stored in column lexi, row lexj, rounded to 2 decimals
            weight = col_dp[lexi][lexj].round(2)
            # drop scores that are zero or negative after rounding
            if weight <= 0:
                continue
            edges.append([source_id, str2id[lexj], weight])

In [195]:
# inspect the first ten node rows ([id, label])
nodes[:10]

[[0, 'תיׁשאר'],
 [1, 'ה'],
 [2, 'םוי'],
 [3, 'יעיבׁש'],
 [4, 'םה'],
 [5, 'הנׁש'],
 [6, 'ׁשׁש'],
 [7, 'האמ'],
 [8, 'ל'],
 [9, 'םייח']]

In [196]:
# inspect the first ten edge rows ([source id, target id, weight])
edges[:10]

[[0, 49, 0.02],
 [0, 84, 0.01],
 [0, 221, 0.05],
 [0, 109, 0.01],
 [0, 130, 0.01],
 [0, 144, 0.02],
 [0, 150, 0.12],
 [0, 152, 0.01],
 [1, 220, 0.14],
 [1, 2, 0.17]]

In [197]:
# number of edges kept after the positive-score filter
len(edges)

2228

In [198]:
# number of nodes (one per row label of col_dp)
len(nodes)

302

In [199]:
import csv

In [200]:
# Output folder and file paths for the Gephi import files.
# NOTE(review): assumes paths['outdir'] is a pathlib.Path (it is already
# used through .joinpath) — confirm.
outdir = paths['outdir'].joinpath('ΔP_gephi')

# Create the folder up front: opening a file for writing inside a missing
# directory raises FileNotFoundError.
outdir.mkdir(parents=True, exist_ok=True)

nodef = outdir.joinpath('nodes.csv')
edgef = outdir.joinpath('edges.csv')

In [201]:
# Write the node table (header row plus one [ID, Label] row per node).
# newline='' hands line-ending control to the csv module (without it, blank
# rows can appear between records on Windows); the explicit UTF-8 encoding
# keeps the Hebrew labels independent of the platform's default encoding.
with open(nodef, 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(['ID', 'Label'])
    writer.writerows(nodes)

In [202]:
# Write the edge table (header row plus one [Source, Target, Weight] row
# per edge).
# newline='' hands line-ending control to the csv module (without it, blank
# rows can appear between records on Windows); the encoding is pinned to
# UTF-8 so the file does not depend on the platform default.
with open(edgef, 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(['Source', 'Target', 'Weight'])
    writer.writerows(edges)