# Semantic Vector Space

Construct a basic semantic vector set for disambiguating coordinate relations.

In [1]:
import collections
from datetime import datetime
from tools.langtools import PositionsTF
from tools.significance import apply_fishers, contingency_table
from tools.locations import data_locations
from cxbuilders import wordConstructions

from sklearn.metrics.pairwise import pairwise_distances
from scipy.stats import chi2_contingency

from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
from tf.app import use
from tf.fabric import Fabric

# load custom BHSA data + heads
# Point Text-Fabric at every configured data location.
TF = Fabric(locations=data_locations.values())
# Features requested from Text-Fabric; joined into one space-separated
# string below because that is the form passed to TF.load.
load_features = ['g_cons_utf8', 'trailer_utf8', 'label', 'lex',
                 'role', 'rela', 'typ', 'function', 'language',
                 'pdp', 'gloss', 'vs', 'vt', 'nhead', 'head', 
                 'mother', 'nu', 'prs', 'sem_set', 'ls', 'st',
                 'kind', 'top_assoc', 'number', 'obj_prep',
                 'embed', 'freq_lex', 'sp']
api = TF.load(' '.join(load_features))
F, E, T, L = api.F, api.E, api.T, api.L # shortform TF methods

# Wrap the already-loaded API in the 'bhsa' app (used below for display via
# A.pretty and passed to the helper classes), then configure its display:
# condense results to phrases, show node numbers, and show the 'lex' feature.
A = use('bhsa', api=api, silent=True)
A.displaySetup(condenseType='phrase', withNodes=True, extraFeatures='lex')

This is Text-Fabric 7.8.12
Api reference : https://annotation.github.io/text-fabric/Api/Fabric/

123 features found and 6 ignored
  0.00s loading features ...
   |     0.00s No structure info in otext, the structure part of the T-API cannot be used
  7.15s All features loaded/computed - for details use loadLog()


## Get Context Counts Around Window (bag of words)

For every lexeme found in a timephrase, count the other lexemes that occur in its vicinity (a window of 5 words on either side) for every occurrence of that lexeme in the Hebrew Bible. This allows us to construct an approximate semantic profile that can be compared between terms.

A "bag of words" model means that we do not consider the position of a context word relative to the target word (unlike a positional "ngram" model, which does).

In [2]:
words = wordConstructions(A)

In [3]:
words.findall(2)

[CX cont {2}]

In [4]:
def get_window(word, model='bagofwords', window=5, context='sentence', confeat='lex'):
    '''
    Build a contextual window around a word, return its context words.

    Looks up the `confeat` value at every relative position from
    -window to +window (excluding 0, the word itself) via PositionsTF,
    restricted to the given `context`. Positions whose lookup returns
    a falsy value are skipped.

    Args:
        word: node of the target word.
        model: 'bagofwords' returns the bare context values;
            'ngram' prefixes each value with its relative position,
            e.g. '-2.<value>'.
        window: number of positions inspected on each side of the word.
        context: context type handed to PositionsTF.
        confeat: feature looked up for each context word.

    Returns:
        List of context strings, ordered from the furthest preceding
        position to the furthest following position.

    Raises:
        ValueError: if `model` is not 'bagofwords' or 'ngram'.
    '''
    # Fail loudly: previously an unknown model silently produced an
    # empty window, i.e. an all-zero vector for the word.
    if model not in ('bagofwords', 'ngram'):
        raise ValueError(
            f"unknown model {model!r}; expected 'bagofwords' or 'ngram'"
        )

    P = PositionsTF(word, context, A).get
    offsets = [*range(-window, 0), *range(1, window + 1)]

    conwords = []
    for pos in offsets:
        cword = P(pos, confeat)
        if not cword:
            continue
        if model == 'ngram':
            conwords.append(f'{pos}.{cword}')
        else:
            conwords.append(f'{cword}')
    return conwords

# Lexeme nodes of every word inside a time phrase whose word construction
# is named 'cont' (presumably content words -- confirm in cxbuilders).
timelexs = {
    L.u(word_node, 'lex')[0]
    for phrase in F.otype.s('timephrase')
    for word_node in L.d(phrase, 'word')
    if words.findall(word_node)[0].name == 'cont'
}

# Every word node that belongs to one of those lexemes.
timewords = {
    word_node
    for lex_node in timelexs
    for word_node in L.d(lex_node, 'word')
}

print(f'{len(timewords)} timewords ready for analysis...')

# target lexeme -> Counter of the context words seen around its occurrences
context_counts = collections.defaultdict(collections.Counter)
for word_node in timewords:
    context_counts[F.lex.v(word_node)].update(get_window(word_node))

# Rows are context words, columns are target lexemes; a context word never
# seen with a target gets a count of 0.
wordcons = pd.DataFrame(context_counts).fillna(0)

n_contexts, n_targets = wordcons.shape
print(f'{n_targets} words analyzed...')
print(f'\t{n_contexts} word contexts analyzed...')

51311 timewords ready for analysis...
215 words analyzed...
	5866 word contexts analyzed...


In [5]:
wordcons.head()

Unnamed: 0,>JC/,R>CJT/,>LHJM/,BN/,BQR=/,CMJM/,<T/,QYJR/,>RY/,PNH/,...,XRP=/,DMJ=/,XTNH/,MRWD/,BXWRWT/,CMVH/,BYRT/,MDJ=/,ZMN[,PRSJ/
<B/,1.0,0.0,0.0,0.0,2.0,3.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
<B=/,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
<B==/,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
<BD/,26.0,0.0,27.0,59.0,1.0,1.0,2.0,0.0,28.0,35.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
<BD=/,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
wordcons.shape[0] * wordcons.shape[1]

1261190

In [7]:
wordcons['CNH/'].sort_values(ascending=False).head(10)

W       606.0
B       466.0
H       408.0
L       298.0
BN/     229.0
CNH/    218.0
CLC/    146.0
MLK[    123.0
CB</    122.0
XMC/    121.0
Name: CNH/, dtype: float64

## Measure Target Word / Context Associations 

In [8]:
# contingency table
ct = contingency_table(wordcons)

### Apply ΔP 

We need an efficient (i.e. simple) normalization method for such a large dataset. ΔP is such a measure, and it incorporates contingency information [(Gries 2008)](https://www.researchgate.net/publication/233650934_Dispersions_and_adjusted_frequencies_in_corpora_further_explorations). With the contingency cells $a$, $b$, $c$, $d$ it is computed below as $\Delta P = \frac{a}{a+b} - \frac{c}{c+d}$.

In [9]:
# Contingency cells used by the ΔP formula in the next cell.
# `a` is the observed co-occurrence count matrix itself; `b`, `c` and `d`
# are taken from the result of `contingency_table`.
# NOTE(review): presumably b/c/d are the complementary 2x2 cells for each
# (context, target) pair -- confirm against tools.significance.
a = wordcons
b = ct['b']
c = ct['c']
d = ct['d']

In [10]:
# ΔP as the difference of two proportions.
# Note the precedence: .fillna(0) applies to the second proportion only.
# Any NaN in the first proportion (e.g. from 0/0) is still present in
# `deltap`; the distance cell zeroes those with np.nan_to_num.
prop_first = a / (a + b)
prop_second = (c / (c + d)).fillna(0)
deltap = prop_first - prop_second

## Calculate Cosine Distance

In [11]:
# Pairwise cosine distance between the rows of deltap.T, i.e. between the
# columns of deltap. NaN scores are replaced via np.nan_to_num first, since
# the distance computation needs finite input.
distances_raw = pairwise_distances(np.nan_to_num(deltap.T.values), metric='cosine')

In [12]:
# Label the raw distance matrix on both axes with the columns of wordcons
# so distances can be looked up by lexeme, e.g. dist['K>B/']['XLH['].
dist = pd.DataFrame(distances_raw, columns=wordcons.columns, index=wordcons.columns)

## Testing Efficacy

We want to use semantic vectors to disambiguate coordinate relations when there is more than one candidate to connect a target to.

### Hypothesis: Candidates for coordinate pairs can be distinguished by selecting the candidate with the shortest distance in semantic space from the target word.

In [13]:
def show_dist(target, compares):
    """Rank candidate lexemes by their distance to a target lexeme.

    Args:
        target: column label in `dist` for the target word.
        compares: iterable of row labels in `dist` for the candidates.

    Returns:
        List of (distance, candidate) tuples, nearest candidate first.
    """
    ranked = [(dist.at[comp, target], comp) for comp in compares]
    ranked.sort()
    return ranked

### K>B: with XLH or JWM?

In [15]:
A.pretty(777703)

In [16]:
show_dist('K>B/', ('XLH[', 'JWM/'))

[(0.6912441582349056, 'XLH['), (1.1388327444323063, 'JWM/')]

Success. The test shows that XLH is more semantically similar.

### <RPL: <NN or JWM?

In [17]:
A.pretty(817713)

In [18]:
show_dist('<RPL/', ('JWM/', '<NN/'))

[(0.704308463703047, '<NN/'), (1.081819133705937, 'JWM/')]

Success. <NN/ is correctly selected as more semantically similar.

### >PLH/: LJLH or >JCWN?

In [19]:
A.pretty(862564)

In [20]:
show_dist('>PLH/', ('LJLH/', '>JCWN/'))

[(0.4413360573890487, 'LJLH/'), (0.6507899406402325, '>JCWN/')]

Success. LJLH is the most semantically similar.

### MRWD: <NJH or JWM?

In [21]:
A.pretty(872677)

In [22]:
show_dist('MRWD/', ('<NJ=/', 'JWM/'))

[(0.6806962662270392, '<NJ=/'), (1.3884310982291002, 'JWM/')]

Success.

### >M: >B or MWT?

In [23]:
A.pretty(874237)

In [24]:
show_dist('>M/', ('>B/', 'MWT/'))

[(0.4755153921974883, '>B/'), (1.1820189773122076, 'MWT/')]

Success.

# Export Vector Resource

In [25]:
import pickle

In [26]:
dist_dict = dist.to_dict()

In [27]:
# Write the distance lookup (the dict form of `dist`) to 'semvector.pickle'
# in the current working directory.
# NOTE: pickle files must only be loaded from trusted sources.
with open('semvector.pickle', 'wb') as outfile:
    pickle.dump(dist_dict, outfile)