# Applying Markov Chains Full-Throttle 🚀🚀

In this notebook, we build Markov-chain transition matrices and apply statistical tests to linguistic features at three levels:

* word: part of speech
* phrase: function, type
* clause: type

In [1]:
import glob, re, getpass, collections, operator, scipy.stats, sklearn.metrics
from itertools import chain
from pprint import pprint
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from discreteMarkovChain import markovChain
from tf.fabric import Fabric

# format paths: Etienne works from a Windows checkout; every other user
# (Cody included) gets the home-relative clone location, so `locations`
# is always bound before Fabric() is called.
# NOTE(review): the fallback assumes BHSA is cloned under ~/github/etcbc — adjust if not
if getpass.getuser() == 'etien':
    locations = 'C:/Users/etien/Documents/github/bhsa/tf'
else:
    locations = '~/github/etcbc/bhsa/tf'

# load TF and BHSA data
TF = Fabric(locations=locations, modules='c', silent=True)
api = TF.load('''
              otype
              book chapter verse
              function domain
              typ pdp kind
              ''')
api.makeAvailableIn(globals()) # globalize TF methods (F, L, T, ... used by the cells below)

  0.00s loading features ...
   |     0.02s B otype                from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.01s B book                 from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.01s B chapter              from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.01s B verse                from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.06s B function             from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.18s T domain               from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.17s B typ                  from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.10s B pdp                  from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.02s B kind                 from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.00s Feature overview: 109 for nodes; 5 for edges; 1 configs; 7 computed
  4.32s All features loaded/computed - for details use loadLog()


# Format ETCBC Data

In [4]:
# Late Biblical Hebrew corpus
lbh_books = {
    '1_Chronicles',
    '2_Chronicles',
    'Ezra',
    'Esther',
    'Nehemiah',
}

# Standard Biblical Hebrew corpus
sbh_books = {
    'Genesis',
    'Exodus',
    'Leviticus',
    'Deuteronomy',
    'Joshua',
    'Judges',
    '1_Kings',
    '2_Kings',
    '1_Samuel',
    '2_Samuel',
}

# a clause containing a phrase with any of these functions is not loaded
exclude_functions = {
    'IntS', 'Ques', 'Exst', 'ModS', 'NCoS', 'NCop', 'Supp',
    'PrAd', 'Frnt', 'Intj', 'EPPr', "ExsS", "PrcS", "Voct",
}

# layout: data[FEATURE][BOOK] -> list of value sequences, one inner list per unit
data = collections.defaultdict(lambda: collections.defaultdict(list))

for chapter_node in F.otype.s('chapter'):

    # resolve the book; chapters outside both corpora are ignored
    book_name, _chapter, _verse = T.sectionFromNode(chapter_node)
    if book_name in lbh_books:
        corpus = 'lbh'
    elif book_name in sbh_books:
        corpus = 'sbh'
    else:
        continue
    buckets = (book_name, corpus)  # every unit is filed under its book AND its corpus

    # clause type sequence of the chapter, built from every clause (no filtering)
    chapter_clauses = L.d(chapter_node, otype='clause')
    chapter_clause_types = [F.typ.v(node) for node in chapter_clauses]

    for clause_node in chapter_clauses:

        # selection restriction 1: the clause must be narrative
        if F.domain.v(clause_node) != 'N':
            continue

        # selection restriction 2: no phrase may carry an excluded function
        clause_phrases = L.d(clause_node, otype='phrase')
        functions = [F.function.v(node) for node in clause_phrases]
        if exclude_functions.intersection(functions):
            continue

        # remaining phrase and word level sequences of the clause
        types = [F.typ.v(node) for node in clause_phrases]
        pos_tags = [F.pdp.v(node) for node in L.d(clause_node, otype='word')]

        for bucket in buckets:
            data['phrase_functions'][bucket].append(functions)
            data['phrase_types'][bucket].append(types)
            data['word_pos'][bucket].append(pos_tags)

    # one clause-type sequence per chapter
    for bucket in buckets:
        data['clause_types'][bucket].append(chapter_clause_types)
    
# report how many units were collected per feature
# NOTE: each unit is stored under its book and again under 'lbh'/'sbh',
# so these totals count every unit twice.
print('The following features are ready:\n')
for feature, per_bucket in data.items():
    total = sum(len(units) for units in per_bucket.values())
    print('{}\n\t{} datapoints'.format(feature, total))

The following features are ready:

phrase_functions
	42790 datapoints
phrase_types
	42790 datapoints
word_pos
	42790 datapoints
clause_types
	792 datapoints


In [6]:
# e.g. the phrase-function sequence of the first clause stored under the 'lbh' bucket
data['phrase_functions']['lbh'][0]

['Conj', 'Pred', 'Time']

# Analysis Functions

In [181]:
def give_unique(feature, otype='phrase'):
    '''
    List the distinct values a BHSA feature takes, most frequent first.

    feature: feature object exposing .v(node), e.g. F.function
    otype:   object type whose nodes are tallied
    '''
    tally = collections.Counter(feature.v(node) for node in F.otype.s(otype))
    return [value for value, _count in tally.most_common()]


def make_counts(feature_dict):
    '''
    Tally how often each feature value occurs per book.
    Requires a feature dict: feature[book] = list(list()*N)

    Returns (counts, proportions): two dataframes with one row per
    feature value and one column per book. The proportions are the
    counts divided by their column total.
    '''

    # flatten the units of every book and count the values
    per_book = {
        bookname: collections.Counter(chain.from_iterable(units))
        for bookname, units in feature_dict.items()
    }

    # values missing from a book count as zero
    df_Count = pd.DataFrame(per_book, columns=list(feature_dict)).fillna(0)

    # normalise every column by its own total
    column_totals = df_Count.sum(axis=0)
    df_prob_Count = df_Count.div(column_totals, axis='columns')

    return df_Count, df_prob_Count
    
    
def chi_squareTest(matrix):
    '''
    Run scipy's chi-squared contingency test on a matrix of observed
    counts and return only the p value.
    '''
    result = scipy.stats.chi2_contingency(matrix)
    return result[1]  # (chi2, p, dof, expected) -> p

def compute_testing(df, test_func=None):
    '''
    Performs the chi-squared test on each pair of columns 
    to test whether two books are similar or different. 

    df:        dataframe of counts, one column per book and one row per
               feature value.
    test_func: optional callable taking the 2 x k matrix of observed counts
               of a pair and returning a p value. Defaults to the notebook's
               chi_squareTest.

    Returns a square dataframe (books x books). The lower triangle holds the
    p values rounded to 5 decimals; the diagonal and the upper triangle are
    NaN. A p value that rounds to 0.0 is kept as 0.0, so a highly significant
    pair stays distinguishable from a pair that was not computed.
    '''
    if test_func is None:
        test_func = chi_squareTest

    books = list(df.keys())

    # start from NaN instead of zero: "not computed" must not look like p == 0
    df_test = pd.DataFrame(np.full((len(books), len(books)), np.nan),
                           columns=books, index=books)

    # test each pair of columns once (lower triangle only)
    for indexi, i in enumerate(books):
        for j in books[:indexi]:
            pair = df[[i, j]]                       # select the two columns
            pair = pair[(pair != 0).any(axis=1)]    # drop values absent from both books
            observed = pair.values.T.astype(int)    # 2 x k matrix of integer counts
            df_test.loc[i, j] = round(test_func(observed), 5)

    return df_test

def make_transitions(feature_dict, unique_values=None):
    '''
    Returns two dictionaries of transition matrices, keyed by book:
    one with frequencies and one with row-normalized counts.

    feature_dict:  feature[book] = list(list()*N); every inner list is the
                   sequence of feature values of one unit.
    unique_values: optional iterable fixing the preferred order of the states.
                   Values found in the data but missing here are appended in
                   order of first appearance, so the argument may be omitted.

    In every matrix the row is the current state and the column the next
    state. "Clause_Begin" and "Clause_End" are wrapped around each unit, and
    the units of a book are chained into one sequence, so "Clause_End" is
    followed by the next unit's "Clause_Begin". Rows and columns that are
    entirely zero for a book are dropped from that book's matrices.
    '''

    df_Transition_freq = dict() # Transition matrix with frequencies
    df_Transition_prob = dict() # Normalized transition matrix

    # ordered state list plus a position lookup (no linear list.index scans)
    nodes = list()
    node_index = dict()
    candidates = chain(
        ["Clause_Begin", "Clause_End"],
        unique_values if unique_values is not None else (),
        chain.from_iterable(chain.from_iterable(feature_dict.values())),
    )
    for node in candidates:
        if node not in node_index:
            node_index[node] = len(nodes)
            nodes.append(node)

    for bookname, clauses in feature_dict.items():
        transition_Matrix = np.zeros((len(nodes), len(nodes)))

        # chain all units of the book into one state sequence
        transitions = list()
        for clause in clauses:
            transitions.append("Clause_Begin")
            transitions.extend(clause)
            transitions.append("Clause_End")

        # count transitions using bigrams
        for current, following in zip(transitions, transitions[1:]):
            transition_Matrix[node_index[current], node_index[following]] += 1
        df_Trans = pd.DataFrame(transition_Matrix, columns=nodes, index=nodes)

        # Remove unused states: all-zero rows first, then all-zero columns
        df_Trans = df_Trans[(df_Trans != 0).any(axis=1)]
        df_Trans = df_Trans.loc[:, (df_Trans != 0).any(axis=0)]

        df_Transition_freq[bookname] = df_Trans
        # Normalized by row
        df_Transition_prob[bookname] = df_Trans.div(df_Trans.sum(axis=1), axis=0)

    return df_Transition_freq, df_Transition_prob

def get_sorted_pairs(matrix):
    '''
    Return a list of ((row, column), value) tuples holding the absolute
    value of every cell of a transition-difference matrix, largest first.

    Row labels come from the matrix index, so the pairs are labelled
    correctly when rows and columns are ordered differently or the matrix
    is not square. NaN cells are placed at the end of the list. An empty
    matrix gives an empty list.
    '''

    df_sub_abs = abs(matrix)
    pairs = dict()

    # pair every row record with its own index label
    for row_label, row_dict in zip(df_sub_abs.index, df_sub_abs.to_dict(orient="records")):
        for col_label, value in row_dict.items():
            pairs[row_label, col_label] = value

    def sort_key(item):
        value = item[1]
        # NaN != NaN: rank missing cells below every real value
        return (1, value) if value == value else (0, 0.0)

    return sorted(pairs.items(), key=sort_key, reverse=True)

# Phrase Function Analysis

In [182]:
# counts and column-normalised proportions of phrase functions, one column per book / corpus bucket
function_freq, function_prob = make_counts(data['phrase_functions'])

## Probability Counts, SBH and LBH

In [183]:
# phrase-function proportions for the SBH books only
# (sbh_books is a set, so the column order is not fixed)
function_prob[list(sbh_books)]

Unnamed: 0,Joshua,Exodus,1_Kings,2_Kings,1_Samuel,2_Samuel,Genesis,Leviticus,Judges,Deuteronomy
Adju,0.026285,0.032909,0.027991,0.025276,0.011253,0.018989,0.016512,0.018967,0.016912,0.037156
Cmpl,0.152179,0.109109,0.123292,0.134416,0.133383,0.138372,0.121257,0.162276,0.146875,0.140935
Conj,0.224349,0.231929,0.228924,0.244895,0.260015,0.256063,0.253457,0.228662,0.25386,0.214606
Loca,0.012682,0.023702,0.012662,0.013224,0.009152,0.009964,0.011738,0.00843,0.009926,0.016015
Modi,0.008301,0.010186,0.007498,0.005859,0.007352,0.0094,0.008256,0.00843,0.006801,0.005766
Nega,0.011298,0.010774,0.009497,0.012554,0.008252,0.007144,0.004775,0.001054,0.009191,0.021781
Objc,0.074937,0.109696,0.079807,0.080013,0.062266,0.061102,0.090023,0.116965,0.074265,0.100577
PreC,0.06825,0.049951,0.067977,0.040341,0.049662,0.047001,0.045061,0.024236,0.04375,0.03075
PreO,0.018215,0.011166,0.015495,0.017911,0.015154,0.01598,0.010843,0.013699,0.017279,0.035874
PreS,0.002997,0.001763,0.002666,0.003515,0.002851,0.004136,0.006068,0.003161,0.001838,0.007047


In [184]:
# phrase-function proportions for the LBH books only
# (lbh_books is a set, so the column order is not fixed)
function_prob[list(lbh_books)]

Unnamed: 0,Esther,2_Chronicles,1_Chronicles,Ezra,Nehemiah
Adju,0.0437,0.054427,0.06299,0.074669,0.035974
Cmpl,0.150765,0.124383,0.108207,0.102079,0.122912
Conj,0.203933,0.222932,0.216675,0.174858,0.2197
Loca,0.008012,0.013208,0.010455,0.016068,0.022698
Modi,0.005827,0.00566,0.006273,0.006616,0.008137
Nega,0.007283,0.010305,0.005227,0.006616,0.014561
Objc,0.080845,0.097678,0.085468,0.093573,0.075375
PreC,0.06118,0.061393,0.108468,0.125709,0.091221
PreO,0.01311,0.020464,0.015159,0.008507,0.017131
PreS,0.002913,0.00479,0.001045,0.00189,0.001713


## Comparing SBH & LBH Transition Matrices

In [185]:
# all phrase functions in the dataset, most frequent first; this fixes the state order of the matrices
unique_functions = give_unique(F.function, 'phrase')

# transition counts and row-normalised transition probabilities per book / corpus bucket
function_trans_freq, function_trans_prob = make_transitions(data['phrase_functions'], 
                                                            unique_values=unique_functions)

In [187]:
# phrase-function transition probabilities of the pooled SBH books (row = current state, column = next state)
function_trans_prob['sbh']

Unnamed: 0,Clause_Begin,Clause_End,Pred,Conj,Subj,Cmpl,Objc,PreC,Adju,Rela,Nega,PreO,Time,Modi,Loca,PreS,PtcO
Clause_Begin,0.0,0.0,0.073683,0.766376,0.027227,0.002259,0.006532,0.019779,0.004395,0.070081,0.004395,0.007448,0.003052,0.002747,0.00116,0.010744,0.000122
Clause_End,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Pred,0.0,0.199736,0.0,0.0,0.336773,0.240708,0.155971,0.01389,0.014821,0.0,0.0,0.0,0.016451,0.010941,0.010708,0.0,0.0
Conj,0.0,0.000159,0.781384,0.001193,0.087033,0.007637,0.023071,0.019809,0.002148,0.002307,0.020525,0.047176,0.002705,0.001909,0.002228,0.000716,0.0
Subj,0.0,0.319316,0.06743,0.0,0.0,0.255075,0.125725,0.158208,0.025812,0.0,0.006671,0.001595,0.021897,0.008556,0.009716,0.0,0.0
Cmpl,0.0,0.758005,0.01151,0.0,0.058138,0.03084,0.0574,0.004869,0.029364,0.0,0.002361,0.000148,0.019035,0.008116,0.020215,0.0,0.0
Objc,0.0,0.600385,0.050493,0.0,0.015148,0.178408,0.045925,0.00024,0.049531,0.0,0.00529,0.00024,0.021399,0.005049,0.027891,0.0,0.0
PreC,0.0,0.58081,0.010617,0.0,0.10696,0.1628,0.040897,0.0,0.045222,0.0,0.000393,0.0,0.015729,0.011011,0.02556,0.0,0.0
Adju,0.0,0.737174,0.037804,0.0,0.026103,0.061206,0.036004,0.013501,0.036004,0.0,0.0045,0.0027,0.019802,0.011701,0.013501,0.0,0.0
Rela,0.0,0.0,0.418862,0.0,0.02209,0.00085,0.0,0.507222,0.002549,0.0,0.024639,0.023789,0.0,0.0,0.0,0.0,0.0


In [188]:
# phrase-function transition probabilities of the pooled LBH books (row = current state, column = next state)
function_trans_prob['lbh']

Unnamed: 0,Clause_Begin,Clause_End,Pred,Conj,Subj,Cmpl,Objc,PreC,Adju,Rela,Nega,PreO,Time,Modi,Loca,PreS,PtcO
Clause_Begin,0.0,0.0,0.088751,0.665935,0.073793,0.007579,0.006382,0.036697,0.008775,0.080375,0.001795,0.007778,0.008975,0.002593,0.001795,0.008775,0.0
Clause_End,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Pred,0.0,0.142812,0.0,0.0,0.298262,0.254344,0.19684,0.025276,0.045814,0.0,0.0,0.0,0.014534,0.006951,0.01485,0.0,0.000316
Conj,0.0,0.000598,0.603649,0.000897,0.140891,0.018546,0.054741,0.046366,0.023931,0.002692,0.023033,0.055938,0.015555,0.004188,0.008077,0.000897,0.0
Subj,0.0,0.320393,0.079981,0.0,0.0,0.149205,0.090739,0.264266,0.050514,0.0,0.008887,0.004677,0.01029,0.007016,0.014032,0.0,0.0
Cmpl,0.0,0.719701,0.016017,0.0,0.051255,0.033102,0.065136,0.007475,0.065136,0.0,0.003203,0.000534,0.02456,0.004805,0.009076,0.0,0.0
Objc,0.0,0.590188,0.066378,0.0,0.004329,0.145743,0.028139,0.002165,0.099567,0.0,0.007937,0.0,0.022367,0.004329,0.02886,0.0,0.0
PreC,0.0,0.573344,0.005521,0.0,0.087539,0.160883,0.06388,0.0,0.078864,0.0,0.0,0.0,0.012618,0.007886,0.009464,0.0,0.0
Adju,0.0,0.638856,0.030989,0.0,0.066746,0.073897,0.075089,0.016687,0.06317,0.0,0.003576,0.003576,0.009535,0.003576,0.014303,0.0,0.0
Rela,0.0,0.0,0.368932,0.0,0.01699,0.0,0.0,0.56068,0.0,0.0,0.026699,0.026699,0.0,0.0,0.0,0.0,0.0


### Difference between SBH & LBH

In [189]:
# LBH minus SBH transition probabilities, rounded to 3 decimals;
# a cell present in only one of the two matrices is treated as 0 in the other
lbh_sbh_dif = (
    function_trans_prob["lbh"]
    .subtract(function_trans_prob["sbh"], fill_value=0)
    .round(3)
)

lbh_sbh_dif

Unnamed: 0,Clause_Begin,Clause_End,Pred,Conj,Subj,Cmpl,Objc,PreC,Adju,Rela,Nega,PreO,Time,Modi,Loca,PreS,PtcO
Clause_Begin,0.0,0.0,0.015,-0.1,0.047,0.005,-0.0,0.017,0.004,0.01,-0.003,0.0,0.006,-0.0,0.001,-0.002,-0.0
Clause_End,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Pred,0.0,-0.057,0.0,0.0,-0.039,0.014,0.041,0.011,0.031,0.0,0.0,0.0,-0.002,-0.004,0.004,0.0,0.0
Conj,0.0,0.0,-0.178,-0.0,0.054,0.011,0.032,0.027,0.022,0.0,0.003,0.009,0.013,0.002,0.006,0.0,0.0
Subj,0.0,0.001,0.013,0.0,0.0,-0.106,-0.035,0.106,0.025,0.0,0.002,0.003,-0.012,-0.002,0.004,0.0,0.0
Cmpl,0.0,-0.038,0.005,0.0,-0.007,0.002,0.008,0.003,0.036,0.0,0.001,0.0,0.006,-0.003,-0.011,0.0,0.0
Objc,0.0,-0.01,0.016,0.0,-0.011,-0.033,-0.018,0.002,0.05,0.0,0.003,-0.0,0.001,-0.001,0.001,0.0,0.0
PreC,0.0,-0.007,-0.005,0.0,-0.019,-0.002,0.023,0.0,0.034,0.0,-0.0,0.0,-0.003,-0.003,-0.016,0.0,0.0
Adju,0.0,-0.098,-0.007,0.0,0.041,0.013,0.039,0.003,0.027,0.0,-0.001,0.001,-0.01,-0.008,0.001,0.0,0.0
Rela,0.0,0.0,-0.05,0.0,-0.005,-0.001,0.0,0.053,-0.003,0.0,0.002,0.003,0.0,0.0,0.0,0.0,0.0


### Largest Differences

In [190]:
# keep the rows in which at least one transition differs by more than .09 in absolute value
has_large_shift = (lbh_sbh_dif.abs() > .09).any(axis=1)
lbh_sbh_dif[has_large_shift]

Unnamed: 0,Clause_Begin,Clause_End,Pred,Conj,Subj,Cmpl,Objc,PreC,Adju,Rela,Nega,PreO,Time,Modi,Loca,PreS,PtcO
Clause_Begin,0.0,0.0,0.015,-0.1,0.047,0.005,-0.0,0.017,0.004,0.01,-0.003,0.0,0.006,-0.0,0.001,-0.002,-0.0
Conj,0.0,0.0,-0.178,-0.0,0.054,0.011,0.032,0.027,0.022,0.0,0.003,0.009,0.013,0.002,0.006,0.0,0.0
Subj,0.0,0.001,0.013,0.0,0.0,-0.106,-0.035,0.106,0.025,0.0,0.002,0.003,-0.012,-0.002,0.004,0.0,0.0
Adju,0.0,-0.098,-0.007,0.0,0.041,0.013,0.039,0.003,0.027,0.0,-0.001,0.001,-0.01,-0.008,0.001,0.0,0.0
Time,0.0,-0.226,0.151,0.0,0.019,-0.019,-0.007,0.008,0.038,0.0,0.01,0.01,0.015,0.001,0.0,0.0,0.0
Modi,0.0,-0.124,0.01,0.008,0.009,0.028,0.032,0.03,0.009,0.0,0.001,-0.002,0.003,-0.002,-0.002,0.0,0.0
PreS,0.0,0.132,0.0,0.0,0.0,-0.079,-0.069,0.016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [191]:
# the ten largest absolute LBH/SBH differences in phrase-function transition probability
get_sorted_pairs(lbh_sbh_dif)[:10]

[(('Time', 'Clause_End'), 0.22600000000000001),
 (('Conj', 'Pred'), 0.17799999999999999),
 (('Time', 'Pred'), 0.151),
 (('PreS', 'Clause_End'), 0.13200000000000001),
 (('Modi', 'Clause_End'), 0.124),
 (('Subj', 'Cmpl'), 0.106),
 (('Subj', 'PreC'), 0.106),
 (('Clause_Begin', 'Conj'), 0.10000000000000001),
 (('Adju', 'Clause_End'), 0.098000000000000004),
 (('Loca', 'Pred'), 0.084000000000000005)]

# Phrase Type Analysis

## Counts

In [192]:
# counts and column-normalised proportions of phrase types, one column per book / corpus bucket
type_freq, type_prob = make_counts(data['phrase_types'])

In [193]:
# phrase-type proportions for the SBH books only (set-based selection: column order is not fixed)
type_prob[list(sbh_books)]

Unnamed: 0,Joshua,Exodus,1_Kings,2_Kings,1_Samuel,2_Samuel,Genesis,Leviticus,Judges,Deuteronomy
AdjP,0.001614,0.003526,0.004665,0.004017,0.004051,0.003572,0.003581,0.0,0.003309,0.000641
AdvP,0.02006,0.015671,0.013829,0.013894,0.017104,0.019929,0.021287,0.013699,0.017279,0.014734
CP,0.25317,0.247796,0.263246,0.279712,0.275169,0.272796,0.272854,0.243414,0.274081,0.245356
DPrP,0.005303,0.001763,0.001999,0.0,0.00015,0.001316,0.002885,0.006322,0.000184,0.000641
IPrP,0.0,0.000392,0.0,0.0,0.00015,0.0,9.9e-05,0.0,0.000184,0.0
NP,0.11252,0.142605,0.132289,0.094242,0.102776,0.107915,0.1132,0.053741,0.117279,0.071749
NegP,0.011298,0.010774,0.009497,0.012554,0.008252,0.007144,0.004775,0.001054,0.009191,0.021781
PP,0.260549,0.244466,0.236921,0.228658,0.205851,0.214514,0.20939,0.296101,0.233088,0.280589
PPrP,0.008762,0.005093,0.005998,0.006194,0.006602,0.003572,0.008057,0.005269,0.008824,0.003203
PrNP,0.077473,0.064643,0.053482,0.057918,0.092873,0.084602,0.076892,0.097998,0.049081,0.067265


In [194]:
# phrase-type proportions for the LBH books only (set-based selection: column order is not fixed)
type_prob[list(lbh_books)]

Unnamed: 0,Esther,2_Chronicles,1_Chronicles,Ezra,Nehemiah
AdjP,0.00874,0.00479,0.001307,0.00189,0.002998
AdvP,0.006555,0.012046,0.013853,0.014178,0.011135
CP,0.242535,0.253846,0.233403,0.200378,0.243255
DPrP,0.0,0.001016,0.006534,0.005671,0.00257
IPrP,0.001457,0.0,0.0,0.0,0.000857
NP,0.126001,0.134543,0.151072,0.259924,0.176874
NegP,0.007283,0.010305,0.005227,0.006616,0.014561
PP,0.276038,0.258055,0.252222,0.238185,0.244968
PPrP,0.006555,0.00508,0.00758,0.006616,0.013704
PrNP,0.05244,0.051089,0.11265,0.055766,0.039829


## Phrase Type Transition Matrices

In [196]:
# all phrase types in the dataset, most frequent first; this fixes the state order of the matrices
unique_types = give_unique(F.typ, otype='phrase')

# transition counts and row-normalised transition probabilities per book / corpus bucket
typ_trans_freq, typ_trans_prob = make_transitions(data['phrase_types'], unique_values=unique_types)

In [197]:
# phrase-type transition probabilities of the pooled SBH books (row = current state, column = next state)
typ_trans_prob['sbh']

Unnamed: 0,Clause_Begin,Clause_End,VP,PP,CP,NP,PrNP,NegP,AdvP,PPrP,AdjP,IPrP,DPrP
Clause_Begin,0.0,0.0,0.100238,0.014834,0.836457,0.025334,0.005006,0.004395,0.003358,0.006044,0.001038,0.000305,0.002991
Clause_End,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
VP,0.0,0.212602,0.001172,0.378947,0.0,0.193093,0.181511,0.0,0.029367,0.002551,0.000483,0.0,0.000276
PP,0.0,0.714043,0.029224,0.136267,0.0,0.074505,0.024977,0.003058,0.015037,0.002464,0.000425,0.0,0.0
CP,0.0,0.000145,0.810213,0.056376,0.003201,0.060086,0.026988,0.020877,0.004365,0.008802,0.006401,0.0,0.002546
NP,0.0,0.486917,0.078149,0.302027,0.0,0.063594,0.031364,0.006238,0.018888,0.005892,0.006931,0.0,0.0
PrNP,0.0,0.381593,0.072527,0.453022,0.0,0.068681,0.003846,0.004396,0.013187,0.000275,0.002473,0.0,0.0
NegP,0.0,0.008658,0.954545,0.025974,0.0,0.006494,0.0,0.0,0.0,0.0,0.004329,0.0,0.0
AdvP,0.0,0.619844,0.078038,0.120401,0.001115,0.119287,0.037904,0.008919,0.010033,0.00223,0.001115,0.0,0.001115
PPrP,0.0,0.20649,0.356932,0.147493,0.0,0.138643,0.088496,0.020649,0.014749,0.0,0.026549,0.0,0.0


In [198]:
# phrase-type transition probabilities of the pooled LBH books (row = current state, column = next state)
typ_trans_prob['lbh']

Unnamed: 0,Clause_Begin,Clause_End,VP,PP,CP,NP,PrNP,NegP,AdvP,PPrP,AdjP,IPrP,DPrP
Clause_Begin,0.0,0.0,0.119266,0.031512,0.74631,0.066813,0.015756,0.001795,0.003391,0.008576,0.001396,0.000199,0.004986
Clause_End,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
VP,0.0,0.164465,0.005698,0.444962,0.0,0.215229,0.146335,0.0,0.019425,0.002072,0.001295,0.0,0.000518
PP,0.0,0.667511,0.048657,0.155347,0.0,0.086417,0.027623,0.003548,0.008363,0.002027,0.000253,0.0,0.000253
CP,0.0,0.000533,0.676698,0.10972,0.003196,0.114248,0.03968,0.023435,0.006658,0.011984,0.008788,0.000799,0.004261
NP,0.0,0.478022,0.092139,0.259087,0.0,0.105241,0.038884,0.010989,0.010566,0.001268,0.003804,0.0,0.0
PrNP,0.0,0.478649,0.077458,0.333664,0.0,0.09434,0.004965,0.001986,0.007944,0.0,0.000993,0.0,0.0
NegP,0.0,0.007042,0.901408,0.06338,0.0,0.021127,0.0,0.0,0.0,0.0,0.007042,0.0,0.0
AdvP,0.0,0.473118,0.134409,0.209677,0.005376,0.129032,0.021505,0.010753,0.010753,0.005376,0.0,0.0,0.0
PPrP,0.0,0.160714,0.419643,0.098214,0.0,0.258929,0.035714,0.008929,0.0,0.0,0.017857,0.0,0.0


### Transition Differences

In [199]:
# LBH minus SBH phrase-type transition probabilities, rounded to 3 decimals;
# a cell present in only one of the two matrices is treated as 0 in the other
typ_dif = (
    typ_trans_prob["lbh"]
    .subtract(typ_trans_prob["sbh"], fill_value=0)
    .round(3)
)

typ_dif

Unnamed: 0,Clause_Begin,Clause_End,VP,PP,CP,NP,PrNP,NegP,AdvP,PPrP,AdjP,IPrP,DPrP
Clause_Begin,0.0,0.0,0.019,0.017,-0.09,0.041,0.011,-0.003,0.0,0.003,0.0,-0.0,0.002
Clause_End,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
VP,0.0,-0.048,0.005,0.066,0.0,0.022,-0.035,0.0,-0.01,-0.0,0.001,0.0,0.0
PP,0.0,-0.047,0.019,0.019,0.0,0.012,0.003,0.0,-0.007,-0.0,-0.0,0.0,0.0
CP,0.0,0.0,-0.134,0.053,-0.0,0.054,0.013,0.003,0.002,0.003,0.002,0.001,0.002
NP,0.0,-0.009,0.014,-0.043,0.0,0.042,0.008,0.005,-0.008,-0.005,-0.003,0.0,0.0
PrNP,0.0,0.097,0.005,-0.119,0.0,0.026,0.001,-0.002,-0.005,-0.0,-0.001,0.0,0.0
NegP,0.0,-0.002,-0.053,0.037,0.0,0.015,0.0,0.0,0.0,0.0,0.003,0.0,0.0
AdvP,0.0,-0.147,0.056,0.089,0.004,0.01,-0.016,0.002,0.001,0.003,-0.001,0.0,-0.001
PPrP,0.0,-0.046,0.063,-0.049,0.0,0.12,-0.053,-0.012,-0.015,0.0,-0.009,0.0,0.0


In [202]:
# the fifteen largest absolute LBH/SBH differences in phrase-type transition probability
get_sorted_pairs(typ_dif)[:15]

[(('IPrP', 'VP'), 0.29999999999999999),
 (('IPrP', 'NP'), 0.25),
 (('DPrP', 'NP'), 0.17299999999999999),
 (('AdvP', 'Clause_End'), 0.14699999999999999),
 (('AdjP', 'PP'), 0.13800000000000001),
 (('CP', 'VP'), 0.13400000000000001),
 (('DPrP', 'Clause_End'), 0.125),
 (('PPrP', 'NP'), 0.12),
 (('PrNP', 'PP'), 0.11899999999999999),
 (('DPrP', 'VP'), 0.104),
 (('PrNP', 'Clause_End'), 0.097000000000000003),
 (('Clause_Begin', 'CP'), 0.089999999999999997),
 (('AdvP', 'PP'), 0.088999999999999996),
 (('VP', 'PP'), 0.066000000000000003),
 (('PPrP', 'VP'), 0.063)]