# Sanity Check


That was only testing 'every' contexts though, as the "unknowns". Let's do a sanity check: how much do distributions which we *know* are negative diverge from the rest of the negative data? 

### `no` vs. all other negative cases

---

---

In [1]:
import pandas as pd
from scipy.stats import entropy
import numpy as np
import matplotlib_inline as mplot
import itertools

In [2]:
# copied from `getFrequencies.py` and then modified slightly
def basic_freq_table(data: pd.DataFrame, label: str, mode: str = 'colloc'):
    if mode == 'colloc':
        crosstab_rows = data.colloc
    elif mode == 'adv':
        crosstab_rows = data.adv  # _word
    elif mode == 'adj':
        crosstab_rows = data.adj  # _word

    # get frequency table
    by_context = pd.crosstab(
        crosstab_rows, data.context).apply(pd.to_numeric, downcast="unsigned")
    with_sum = by_context.assign(COMBINED=by_context.sum(axis=1))
    sum2 = with_sum.sum()
    sum2.name = 'SUM'
    with_sum = with_sum.append(sum2)
    with_sum = (with_sum
                .sort_values('COMBINED', ascending=False)
                .sort_values(by='SUM', axis=1, ascending=False))
    # print('>>>', name, mode, 'frequencies')
    # print(with_sum.head(11))
    new_label = f'{label[0:3].upper()}_COMBINED'
    
    with_sum.columns = with_sum.columns.str.replace('COMBINED', new_label)
    
    return with_sum


In [3]:
# load nonoverlapping hits dataframe
compiled = pd.read_pickle("compiled_hits.pkl.gz")
hits = compiled.loc[:, ['colloc', 'context', 'context_word',
                        'context_type', 'context_group', 'adv', 'adj', 'polarity']]


In [4]:
pos_hits = hits[hits.polarity == 'positive']
neg_hits = hits[hits.polarity == 'negative']

In [5]:
other_neg_hits, no_hits = [x for _, x in neg_hits.groupby(neg_hits.context_word.isin(['no','nobody','no-one']))]

other_neg_hits.context.unique().to_list()

['never_VP-adv',
 'not_VP-adv',
 'without-being_PP',
 'neither_subj-det',
 'neither_subj-NP',
 'nothing_subj-NP',
 'not-one_subj-det',
 'not-one_subj-NP',
 'not-a-single_subj-det',
 'none_subj-NP',
 'few_subj',
 'few_det-of-subj']

In [6]:
no_hits.context.unique().to_list()

['no_subj-det', 'nobody_subj-NP', 'no-one_subj-NP']

colloc-by-context frequency tables could be calculated from pre-divided groups, or all together, and then select from the count columns based on context. 

First, from all the hits together: 

In [7]:
all_counts = basic_freq_table(hits, label='all')
all_counts.head(11)

context,ALL_COMBINED,be-ADV-ADJ_positive,not_VP-adv,never_VP-adv,no_subj-det,everyone_subj,none_subj-NP,no-one_subj-NP,few_det-of-subj,everybody_subj,nothing_subj-NP,every_det-of-subj,nobody_subj-NP,few_subj,without-being_PP,neither_subj-NP,neither_subj-det,not-one_subj-NP,not-a-single_subj-det,not-one_subj-det
colloc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
SUM,641085,374956,238598,7302,4818,2881,1902,1817,1785,1605,1492,1226,689,681,589,379,323,33,5,4
immediately_clear,15805,21,15769,0,14,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
immediately_available,14363,59,11632,1,2191,0,27,314,65,0,0,0,49,1,0,11,13,0,0,0
as_good,6492,2245,3858,233,24,8,21,16,11,8,26,16,9,5,0,10,2,0,0,0
very_good,4913,2101,2665,86,4,2,17,0,6,7,0,6,6,3,1,7,2,0,0,0
so_sure,4123,122,3940,7,0,43,1,5,0,1,1,1,1,0,0,1,0,0,0,0
too_late,3871,2158,1173,537,0,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0
as_bad,3525,455,2921,90,8,1,14,4,1,0,23,0,1,0,0,6,0,1,0,0
too_early,3285,2688,347,250,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
so_bad,2638,685,1901,35,6,0,1,2,2,0,5,0,0,0,0,0,1,0,0,0


In [8]:
pos_counts = basic_freq_table(pos_hits, label='positive')
pos_counts

context,be-ADV-ADJ_positive,POS_COMBINED
colloc,Unnamed: 1_level_1,Unnamed: 2_level_1
SUM,374956,374956
too_early,2688,2688
very_important,2313,2313
as_good,2245,2245
too_late,2158,2158
...,...,...
irresponsibly_vague,1,1
irreversibly_busted,1,1
irreversibly_committed,1,1
irreversibly_constricted,1,1


In [9]:
neg_main_counts = basic_freq_table(other_neg_hits, label='negative')
neg_main_counts

context,NEG_COMBINED,not_VP-adv,never_VP-adv,none_subj-NP,few_det-of-subj,nothing_subj-NP,few_subj,without-being_PP,neither_subj-NP,neither_subj-det,not-one_subj-NP,not-a-single_subj-det,not-one_subj-det
colloc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
SUM,253093,238598,7302,1902,1785,1492,681,589,379,323,33,5,4
immediately_clear,15770,15769,0,1,0,0,0,0,0,0,0,0,0
immediately_available,11750,11632,1,27,65,0,1,0,11,13,0,0,0
as_good,4166,3858,233,21,11,26,5,0,10,2,0,0,0
so_sure,3950,3940,7,1,0,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
just_justifiable,1,1,0,0,0,0,0,0,0,0,0,0,0
just_jobless,1,1,0,0,0,0,0,0,0,0,0,0,0
just_jiggy,1,1,0,0,0,0,0,0,0,0,0,0,0
just_jewish,1,1,0,0,0,0,0,0,0,0,0,0,0


In [10]:
no_counts = basic_freq_table(no_hits, label='no')
no_counts

context,NO_COMBINED,no_subj-det,no-one_subj-NP,nobody_subj-NP
colloc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SUM,7324,4818,1817,689
immediately_available,2554,2191,314,49
quite_sure,500,0,382,118
too_small,193,193,0,0
really_sure,121,0,84,37
...,...,...,...,...
flat-out_awful,1,0,1,0
fiscally_conservative,1,0,1,0
firmly_ingrained,1,1,0,0
financially_viable,1,1,0,0


In [11]:
san_chk = pos_counts.join(neg_main_counts).join(no_counts)
san_chk = san_chk.loc[:,san_chk.columns.str.contains('COMBINED')].fillna(0)
san_chk = san_chk.assign(TOTAL=san_chk.sum(axis=1)).convert_dtypes()
san_chk.sort_values('NO_COMBINED', ascending=False)

Unnamed: 0_level_0,POS_COMBINED,NEG_COMBINED,NO_COMBINED,TOTAL
colloc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SUM,374956,253093,7324,635373
immediately_available,59,11750,2554,14363
quite_sure,93,1569,500,2162
too_small,1024,85,193,1302
really_sure,7,735,121,863
...,...,...,...,...
real_woozy,1,0,0,1
usually_unwelcome,1,0,0,1
usually_pivotal,1,0,0,1
usually_unwilling,1,0,0,1


An overly simplified sanity check perhaps, but how do the subset `COMBINED` columns compare to the superset column `TOTAL`? 

In [12]:
san_chk = san_chk.add(1)

In [13]:
def column_divergence(df, col):
    div_df = pd.DataFrame()
    for c1, c2 in itertools.permutations(col, 2): 
        l1 = df[c1].to_list()

        l2 = df[c2].to_list()
        kldiv = round(entropy(l1, l2),3)
        print(c1, 'KLdiv from', c2, '=', kldiv)
        
        div_df.loc[c1+'_KLdiv','from_'+c2] = kldiv

    return div_df.fillna(0).convert_dtypes()

In [14]:
col = san_chk.columns.to_list()
div_table = column_divergence(san_chk, col)
print('---\nKL Divergence between *all* columns')
div_table
    

POS_COMBINED KLdiv from NEG_COMBINED = 0.573
POS_COMBINED KLdiv from NO_COMBINED = 1.155
POS_COMBINED KLdiv from TOTAL = 0.106
NEG_COMBINED KLdiv from POS_COMBINED = 0.801
NEG_COMBINED KLdiv from NO_COMBINED = 1.478
NEG_COMBINED KLdiv from TOTAL = 0.146
NO_COMBINED KLdiv from POS_COMBINED = 1.038
NO_COMBINED KLdiv from NEG_COMBINED = 1.171
NO_COMBINED KLdiv from TOTAL = 1.159
TOTAL KLdiv from POS_COMBINED = 0.242
TOTAL KLdiv from NEG_COMBINED = 0.232
TOTAL KLdiv from NO_COMBINED = 1.32
---
KL Divergence between *all* columns


Unnamed: 0,from_NEG_COMBINED,from_NO_COMBINED,from_TOTAL,from_POS_COMBINED
POS_COMBINED_KLdiv,0.573,1.155,0.106,0.0
NEG_COMBINED_KLdiv,0.0,1.478,0.146,0.801
NO_COMBINED_KLdiv,1.171,0.0,1.159,1.038
TOTAL_KLdiv,0.232,1.32,0.0,0.242


We see that total doesn't diverge from itself (as should be the the case), but this also shows that the larger the subset is, the less it diverges from the superset. This also makes sense: the bigger the contribution, the better the final distribution will represent that contribution. 

But how do the subdistributions compare to each other?

In [15]:
col = san_chk.columns.to_list()
col.pop(-1)
div_table = column_divergence(san_chk, col)
print('---\nKL Divergence between data subgroup columns')
div_table

POS_COMBINED KLdiv from NEG_COMBINED = 0.573
POS_COMBINED KLdiv from NO_COMBINED = 1.155
NEG_COMBINED KLdiv from POS_COMBINED = 0.801
NEG_COMBINED KLdiv from NO_COMBINED = 1.478
NO_COMBINED KLdiv from POS_COMBINED = 1.038
NO_COMBINED KLdiv from NEG_COMBINED = 1.171
---
KL Divergence between data subgroup columns


Unnamed: 0,from_NEG_COMBINED,from_NO_COMBINED,from_POS_COMBINED
POS_COMBINED_KLdiv,0.573,1.155,0.0
NEG_COMBINED_KLdiv,0.0,1.478,0.801
NO_COMBINED_KLdiv,1.171,0.0,1.038
