In [None]:
import pandas as pd

from source.utils import FREQ_DIR, RESULT_DIR, UCS_DIR, confirm_dir
from source.utils.associate import (BINARY_ASSOC_ARGS, add_extra_am,
                                    associate_ucs, confirm_basic_ucs)
from source.utils.associate import convert_ucs_to_csv as ucs2csv
from source.utils.associate import get_associations_csv as init_am, AM_DF_DIR
from source.utils.associate import manipulate_ucs, seek_readable_ucs, adjust_assoc_columns
# Display floats in rendered frames with thousands separators and two decimal places.
pd.set_option('display.float_format', '{:,.2f}'.format)



 set parameters

In [None]:
# Unit of analysis; used in the frequency file name and as the key into
# `VOCABS`, `pos_prone`, and `neg_prone` further down.
UNIT = 'Adj'
# Pattern directory whose frequency data is analyzed (leave exactly one uncommented).
# PAT_DIR = 'POSmirror'
# PAT_DIR = 'NEGmirror'
PAT_DIR = 'RBdirect'
# PAT_DIR = 'ANYmirror'
# Minimum joint frequency; passed to `seek_readable_ucs` and `confirm_basic_ucs`
# and embedded in the `_min{FRQ_FLOOR}x` part of the file names below.
# FRQ_FLOOR = 3
# FRQ_FLOOR = 10
# FRQ_FLOOR = 20
# FRQ_FLOOR = 50
FRQ_FLOOR = 100  # BUG: a floor of 100 is applied regardless of this value, so keep it at 100 so the file names stay accurate
# Frequency table (UCS input format) for the selected pattern directory.
ADVADJ_TSV = FREQ_DIR.joinpath(
    f'{PAT_DIR}/ucs_format/Adv{UNIT}_frq-thrMIN-7.35f.tsv')
# Columns kept (when present) for the abbreviated association table built later.
FOCUS = ['f', 'unexpected_f',
         'conservative_log_ratio',
         'am_p1_given2', 'am_p2_given1',
         'am_p1_given2_simple', 'am_p2_given1_simple',
         'am_log_likelihood',
         #  'mutual_information', 'am_odds_ratio_disc', 't_score',
         'N', 'f1', 'f2', 'E11',
         'l1', 'l2']

In [None]:
def invert_set_dict(d: dict):
    """Map each member of every value collection in `d` back to its key.

    Args:
        d: mapping from a key to an iterable of hashable members.

    Returns:
        dict: `{member: key}`. When a member occurs under several keys,
        the key visited last (in `d`'s iteration order) wins.
    """
    member_to_key = {}
    for key in d:
        for member in d[key]:
            member_to_key[member] = key
    return member_to_key



 1. Run `seek_readable_ucs()` to generate consistent output path

In [None]:
# Resolve the path of the readable association table for this frequency floor
# and source counts file, then show it relative to the results directory.
readable = seek_readable_ucs(min_freq=FRQ_FLOOR,
                             ucs_subdir='adv_adj',
                             contained_counts_path=ADVADJ_TSV)
print(readable.relative_to(RESULT_DIR))

    > seeking `adv_adj/RBdirect/readable/AdvAdj_frq-thrMIN-7.35f_min100x*` frequency data and initial associations...
ucs/adv_adj/RBdirect/readable/AdvAdj_frq-thrMIN-7.35f_min100x.rsort-view_am-only.csv




 Snippet of starting frequency data (`ADVADJ_TSV`)

In [None]:
# Show the first five rows of the frequency table, aligned into columns.
! head -5 {ADVADJ_TSV} | column -t

44885  as     good
30349  very   good
27740  as     bad
26584  so     sure
26156  quite  sure




 2. Run `confirm_basic_ucs()` (if needed)

In [None]:
# Only build the initial UCS table when the readable association output is missing.
if not readable.is_file():
    # The base data set lives one directory above `readable/` and shares the
    # file stem; swap the "view" suffix for the data-set suffix. The path
    # printed by `seek_readable_ucs` above ends in `.csv` while the text view
    # ends in `.txt`, so both spellings are handled.
    basic_ucs_name = readable.name
    for view_suffix in ('.rsort-view_am-only.txt', '.rsort-view_am-only.csv'):
        basic_ucs_name = basic_ucs_name.replace(view_suffix, '.ds.gz')
    basic_ucs_path = readable.parent.parent.joinpath(basic_ucs_name)
    print(
        f'Creating initial UCS table: `{basic_ucs_path.relative_to(RESULT_DIR)}`')

    # `confirm_basic_ucs` returns the path that later cells pass to `associate_ucs`.
    basic_ucs_path = confirm_basic_ucs(
        basic_ucs_path,
        freq_floor=FRQ_FLOOR,
        contained_counts_path=ADVADJ_TSV)



 Excerpt of initial UCS table

In [None]:
# Path of the plain-text rendering of the initial UCS table:
# <UCS_DIR>/adv_adj/<PAT_DIR>/readable/<tsv name without ".tsv">_min<FRQ_FLOOR>x.init.txt
init_readable = UCS_DIR.joinpath(
    f'adv_adj/{PAT_DIR}/readable'
).joinpath(f'{ADVADJ_TSV.name.replace(".tsv","")}_min{FRQ_FLOOR}x.init.txt')
# Preview the header and the first rows of that table.
! head -7 {init_readable}

             l1  l2                    f      f2      f1        N  
---------------  ----------------  -----  ------  ------  -------  
             as  good              44889  132448  535450  3226225  
             so  good              18002  132448  346575  3226225  
            too  good               3623  132448  311944  3226225  
           very  good              30349  132448  193561  3226225  
           that  good              10655  132448  165411  3226225  




 3. Run `associate_ucs()` (if needed)

In [None]:
# Compute association measures only when the readable output is still missing.
# NOTE: `basic_ucs_path` is assigned in the preceding cell under this same
# condition, so that cell must have run in the same kernel session first.
if not readable.is_file():
    associate_ucs(basic_ucs_path)

# Glob for the log files of this pattern/unit/floor combination.
# NOTE(review): absolute, machine-specific path (with a doubled `/`); the file
# name uses `7-35f` where the data files use `7.35f`.
transform_ucs_log = f'/share/compling/projects/sanpi/logs/associate/ucs//ucs-{PAT_DIR}_Adv{UNIT}_frq-thrMIN-7-35f_min{FRQ_FLOOR}x*.log'
# Show the head and tail of the most recently modified matching log.
! head -15 `ls -t1 {transform_ucs_log} | head -1`
! echo '...'
! tail -2 `ls -t1 {transform_ucs_log} | head -1`

# Manipulating AdvAdj_frq-thrMIN-7-35f_min100x ucs table
path to this script: /share/compling/projects/sanpi/script/transform_ucs.sh
Thu May 30 14:42:52 EDT 2024
(TMP: /share/compling/projects/sanpi/results/ucs/adv_adj/RBdirect/tmp/tmp_RBdirect-20240530-144252.AdvAdj_frq-thrMIN-7-35f_min100x)
## Initial Contingency Info

DATA SET FILE:  /share/compling/projects/sanpi/results/ucs/adv_adj/RBdirect/AdvAdj_frq-thrMIN-7.35f_min100x.ds.gz

# Frequency signatures computed by the ucs-make-tables tool for relational cooccurrences.
# Sample size:  N = 3226225 tokens,  V = 182629 pair types.
# A frequency threshold of f >= 100 was applied, leaving V = 3622 pair types.

##:: size = 3622
##:: threshold = 100

...
Loading data set /share/compling/projects/sanpi/results/ucs/adv_adj/RBdirect/AdvAdj_frq-thrMIN-7.35f_min100x.rsort.gz ... 3622 rows
Script finished at Thu May 30 14:43:05 EDT 2024




 4. Run `convert_ucs_to_csv()` (imported here as `ucs2csv`) to convert `ucs/[PAT_DIR]/readable/*.txt` to a format that `pandas` can parse as a dataframe

In [None]:
# Preview the readable table, then convert it and report where the CSV lives
# (relative to the results directory).
! head -5 {readable}
csv_path = ucs2csv(readable)
print(f'CSV: `{csv_path.relative_to(RESULT_DIR)}`')

l1,l2,f,E11,am_log_likelihood,am_odds_ratio_disc,am_p1_given2,am_p2_given1,am_p1_given2_simple,am_p2_given1_simple,f1,f2,N
legally,binding,637,0.7166022,8527.099448,4.197714768,0.9304361497,0.1884469551,0.9312865497,0.1884615385,3380,684,3226225
too,late,25854,2543.3362447,118329.968464,2.766802345,0.8934870036,0.0827258485,0.9828923358,0.0828802606,311944,26304,3226225
mutually,exclusive,5509,11.4188812,73788.677919,4.772885829,0.8881280010,0.9272258791,0.8882618510,0.9274410774,5940,6202,3226225
too,early,7193,713.0894466,32089.573205,2.576235904,0.8806450530,0.0229961748,0.9753220339,0.0230586259,311944,7375,3226225
UCS table text converted & saved as /share/compling/projects/sanpi/results/ucs/adv_adj/RBdirect/readable/AdvAdj_frq-thrMIN-7.35f_min100x.rsort-view_am-only.csv
CSV: `ucs/adv_adj/RBdirect/readable/AdvAdj_frq-thrMIN-7.35f_min100x.rsort-view_am-only.csv`




 5. Load the converted CSV as a `pandas` dataframe and index it by `l1~l2` key

In [None]:
# Load the association table; `convert_dtypes()` switches the columns to
# pandas' NA-aware extension dtypes.
adx_amdf = pd.read_csv(csv_path).convert_dtypes()
adx_amdf

Unnamed: 0,l1,l2,f,E11,am_log_likelihood,am_odds_ratio_disc,am_p1_given2,am_p2_given1,am_p1_given2_simple,am_p2_given1_simple,f1,f2,N
0,legally,binding,637,0.72,8527.10,4.20,0.93,0.19,0.93,0.19,3380,684,3226225
1,too,late,25854,2543.34,118329.97,2.77,0.89,0.08,0.98,0.08,311944,26304,3226225
2,mutually,exclusive,5509,11.42,73788.68,4.77,0.89,0.93,0.89,0.93,5940,6202,3226225
3,too,early,7193,713.09,32089.57,2.58,0.88,0.02,0.98,0.02,311944,7375,3226225
4,too,shabby,5399,535.18,24062.20,2.58,0.88,0.02,0.98,0.02,311944,5535,3226225
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3617,as,able,202,2364.38,-3713.05,-1.14,-0.15,-0.00,0.01,0.00,535450,14246,3226225
3618,as,distant,111,1358.28,-2157.25,-1.16,-0.15,-0.00,0.01,0.00,535450,8184,3226225
3619,as,different,616,7595.19,-12178.64,-1.17,-0.15,-0.02,0.01,0.00,535450,45763,3226225
3620,as,available,320,9316.28,-17646.14,-1.55,-0.16,-0.02,0.01,0.00,535450,56133,3226225


In [None]:


# Index every row by its `l1~l2` key (e.g. `too~late`); `l1` and `l2` stay
# available as ordinary columns.
pair_key = (adx_amdf.l1 + '~' + adx_amdf.l2).astype('string')
adx_amdf = adx_amdf.assign(key=pair_key).set_index('key')
adx_amdf

Unnamed: 0_level_0,l1,l2,f,E11,am_log_likelihood,am_odds_ratio_disc,am_p1_given2,am_p2_given1,am_p1_given2_simple,am_p2_given1_simple,f1,f2,N
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
legally~binding,legally,binding,637,0.72,8527.10,4.20,0.93,0.19,0.93,0.19,3380,684,3226225
too~late,too,late,25854,2543.34,118329.97,2.77,0.89,0.08,0.98,0.08,311944,26304,3226225
mutually~exclusive,mutually,exclusive,5509,11.42,73788.68,4.77,0.89,0.93,0.89,0.93,5940,6202,3226225
too~early,too,early,7193,713.09,32089.57,2.58,0.88,0.02,0.98,0.02,311944,7375,3226225
too~shabby,too,shabby,5399,535.18,24062.20,2.58,0.88,0.02,0.98,0.02,311944,5535,3226225
...,...,...,...,...,...,...,...,...,...,...,...,...,...
as~able,as,able,202,2364.38,-3713.05,-1.14,-0.15,-0.00,0.01,0.00,535450,14246,3226225
as~distant,as,distant,111,1358.28,-2157.25,-1.16,-0.15,-0.00,0.01,0.00,535450,8184,3226225
as~different,as,different,616,7595.19,-12178.64,-1.17,-0.15,-0.02,0.01,0.00,535450,45763,3226225
as~available,as,available,320,9316.28,-17646.14,-1.55,-0.16,-0.02,0.01,0.00,535450,56133,3226225




 6. Save to `./results/assoc_df/`

In [None]:
# Mirror the CSV's location under `AM_DF_DIR`, dropping the `readable/`
# directory level and the `.rsort-view_am-only` infix from the path.
mirrored_rel_path = str(csv_path.relative_to(UCS_DIR))
for dropped_part in ('/readable', '.rsort-view_am-only'):
    mirrored_rel_path = mirrored_rel_path.replace(dropped_part, '')
df_csv_path = AM_DF_DIR.joinpath(mirrored_rel_path)
df_pkl_path = df_csv_path.with_suffix('.pkl.gz')

# Each format is written only if its file does not exist yet.
if not df_csv_path.is_file():
    confirm_dir(df_csv_path.parent)
    adx_amdf.to_csv(df_csv_path)
if not df_pkl_path.is_file():
    adx_amdf.to_pickle(df_pkl_path)



 7. Add additional AM via `add_extra_am()`
 Define dictionary containing relevant vocab sizes
 !!! Warning This is a `#HACK`: \
     Rather than developing a command/code to retrieve the vocab sizes programmatically,
     I simply copied the values given in the log output of `transform_ucs.sh`
     for each `PAT_DIR`+`UNIT` combination.

In [None]:
# Hand-copied vocabulary sizes, keyed by pattern directory and then by unit.
VOCABS = {
    # // 'ANYmirror': {'Adj': 83422},
    'NEGmirror': {'Adj': 40004},
    'POSmirror': {'Adj': 178159},
    'RBdirect':  {'Adj': 61860}
}  # ! #HACK
# Raises KeyError for any PAT_DIR/UNIT pair missing from the table
# (e.g. 'ANYmirror', whose entry is commented out).
VOCAB = VOCABS[PAT_DIR][UNIT]

# Print the lookup table as markdown for reference.
print(pd.DataFrame(VOCABS).convert_dtypes().to_markdown(intfmt=','))
# NOTE(review): this discards the value looked up above, so `add_extra_am`
# is called with `vocab=None`. Confirm whether the override is intentional;
# if it is, the lookup above only serves as a key check.
VOCAB = None
# Extend the association table with the requested extra metrics.
ex_adx_amdf = add_extra_am(df=adx_amdf,
                           verbose=True,
                           vocab=VOCAB,
                           metrics=['t_score', 'mutual_information']
                           ).convert_dtypes()

|     |   NEGmirror |   POSmirror |   RBdirect |
|:----|------------:|------------:|-----------:|
| Adj |      40,004 |     178,159 |     61,860 |

Preview of Extended Measures (rounded)

| key                |   t_score |   mutual_information |   deltaP_min |   deltaP_max |   deltaP_max_abs |   deltaP_product |   unexpected_f |   unexpected_ratio |
|:-------------------|----------:|---------------------:|-------------:|-------------:|-----------------:|-----------------:|---------------:|-------------------:|
| legally~binding    |     25.21 |                 2.95 |         0.19 |         0.93 |             0.93 |             0.18 |         636.28 |               1.00 |
| too~late           |    144.97 |                 1.01 |         0.08 |         0.89 |             0.89 |             0.07 |      23,310.66 |               0.90 |
| mutually~exclusive |     74.07 |                 2.68 |         0.89 |         0.93 |             0.93 |             0.82 |       5,497.58 |              



 Save extended AM tables to `extra/` subdirectory if not already saved

In [None]:
# Extended tables go into an `extra/` folder next to the base CSV, with
# `_extra` appended to the file stem.
extra_dir = df_csv_path.parent.joinpath('extra')
df_extra_csv = extra_dir.joinpath(
    df_csv_path.name.replace('.csv', '_extra.csv'))
df_extra_pkl = df_extra_csv.with_suffix('.pkl.gz')
print(df_extra_csv)

# Each format is written only if its file does not exist yet.
if not df_extra_csv.is_file():
    confirm_dir(extra_dir)
    ex_adx_amdf.to_csv(df_extra_csv)
if not df_extra_pkl.is_file():
    ex_adx_amdf.to_pickle(df_extra_pkl)

/share/compling/projects/sanpi/results/assoc_df/adv_adj/RBdirect/extra/AdvAdj_frq-thrMIN-7.35f_min100x_extra.csv


In [None]:


# Keep an untouched copy of the full extended table.
ex_adx_full = ex_adx_amdf.copy()
# Abbreviated view: only the 'polarity', 'quant', and FOCUS columns that exist
# in the table, passed through `adjust_assoc_columns` and sorted by descending
# `LRC`. Presumably `adjust_assoc_columns` renames columns to short labels
# (the displayed output shows `LRC`, `dP1`, `G2`, ...) - confirm in source.utils.
ex_adx_abbr = adjust_assoc_columns(
    ex_adx_amdf[[c for c in ['polarity', 'quant'] + FOCUS if c in ex_adx_amdf.columns]]).sort_values('LRC', ascending=False)
# Column labels of the abbreviated view.
cols = ex_adx_abbr.columns



 Define lexical items that showed a positive or negative lean in the binary environment evaluation

In [None]:
# Hand-curated lexical items, grouped by unit ('Adj', 'Adv', 'Bigr').
# The inline labels ("LRC top", "dP1 top", "G2 top") record which metric's
# ranking each run of entries was taken from.
# NOTE: 'Bigr' entries join the two words with `_`, whereas the `key` index
# built earlier in this notebook joins `l1` and `l2` with `~`.
pos_prone = {
    'Adj': [
        'unrelated',
        'unable',
        'akin',
        'larger',
        'different',
        'familiar',
        'similar',
        'likely',
        'brief',
        'unaware'
    ],
    'Adv': [
        'slightly',
        'definitely',
        'utterly',
        # LRC top
        'pretty',
        'rather',
        'plain',
        'fairly',
        'somewhat',
        'otherwise',
        'downright',
        'relatively',
        # G2 top
        # 'very',
        # 'even',
        # 'just',
        # dP1 top (and odds ratio disc)
        # ('plain' repeats the entry listed under "LRC top")
        'plain',
        'maybe'
    ],
    'Bigr': [
        # G2 top
        'completely_different',
        'too_familiar',
        'even_better',
        # dP1 top
        'quite_different',
        'too_real',
        'well_aware',
        # LRC top
        'too_common',
        'entirely_different'
    ]}
# Same structure as `pos_prone`, for the negative lean.
neg_prone = {
    'Bigr': [
        # LRC top
        'quite_sure',
        'really_sure',
        'too_early',
        'too_pleased',
        'too_fancy',
        # dP1 top
        'entirely_sure',
        'ever_easy',
        'ever_perfect',
        'particularly_surprising',
        'particularly_new',
        # G2 top
        'too_late',
        'more_important',
        'so_easy',
        'as_good',
        'too_old'
    ],
    'Adv': [
        'yet',
        # LRC top
        'ever',
        'any',
        'longer',
        'necessarily',
        'that',
        # dP1 top
        'before',
        'wise',  # ? How is this used as an adverb?
        'earthly',
        'remotely',
        'exactly',
        # G2 top
        'particularly',
        'too',
        # 'inherently'
    ],
    'Adj': [
        # LRC top
        'early',
        'late',
        'fancy',
        'alone',
        'sure',
        # dP1 top
        'shabby',
        'demoralizing',
        # ('alone' repeats the entry listed under "LRC top")
        'alone',
        'aggravating',
        'groundbreaking',
        'eventful',
        # G2 top
        'important',
        'frustrating',
        'evident',
        'certain'
    ]
}


def sort_prone_by_f2(prone_list, amdf):
    """List the `l2` values from `prone_list` found in `amdf`, highest `f2` first.

    Args:
        prone_list: collection of `l2` values to look for.
        amdf: frame with (at least) `l2` and `f2` columns.

    Returns:
        list: matching `l2` values ordered by descending `f2`. Items absent
        from `amdf` are dropped; one entry is returned per distinct
        (`f2`, `l2`) pair.
    """
    is_listed = amdf.l2.isin(prone_list)
    # Collapse the per-pair rows to one row per distinct (f2, l2) combination
    marginals = (amdf.copy()
                 .loc[is_listed, ['f2', 'l2']]
                 .drop_duplicates()
                 .reset_index(drop=True)
                 .set_index('l2')
                 .round(1))
    ranked = marginals.sort_values(['f2'], ascending=False)
    return ranked.index.to_list()


# Replace the current unit's lists with only the items present in the data,
# ordered from most to least frequent.
# NOTE: `sort_prone_by_f2` matches on `l2` and ranks by `f2`, which fits
# UNIT == 'Adj'; the 'Adv' and 'Bigr' lists are left as written above.
pos_prone[UNIT] = sort_prone_by_f2(pos_prone[UNIT], ex_adx_abbr)
neg_prone[UNIT] = sort_prone_by_f2(neg_prone[UNIT], ex_adx_abbr)
pos_prone[UNIT]

['different', 'familiar', 'likely', 'similar', 'larger', 'unrelated', 'unable']



 Strongest associations for each polarity by metric

In [None]:
def show_metric_top(amdf: pd.DataFrame,
                    metric: str,
                    k=5,
                    cols=None):
    """Return the `k` rows of `amdf` with the largest `metric` values.

    Args:
        amdf: association table with `l1` and `l2` columns and a `metric` column.
        metric: name of the column to rank by; it is moved to the first position.
        k: number of rows to keep.
        cols: columns to show, as any iterable of labels (list, tuple,
            `pd.Index`, ...). `None`, an empty iterable, or one holding only
            `None` (the previous default, `[None]`) selects every column.
            The selection must include `l1` and `l2`, which become the index.

    Returns:
        pd.DataFrame: the top rows, indexed by (`l1`, `l2`), with `metric`
        first and the remaining selected columns in their given order.

    Raises:
        KeyError: if `metric`, `l1`, `l2`, or a requested column is missing.
    """
    # Ignore `None` placeholders so the legacy `[None]` default still means "all columns".
    requested = [c for c in (cols if cols is not None else ()) if c is not None]
    if not requested:
        requested = amdf.columns.to_list()
    # Lead with the ranking metric and avoid listing it twice.
    ordered = [metric] + [c for c in requested if c != metric]
    return (amdf.nlargest(k, metric)
            .loc[:, ordered]
            .reset_index(drop=True)
            .set_index(['l1', 'l2']))



 Top conservative log ratio $LRC$ values

In [None]:
# The 8 pairs with the highest `LRC`.
exdf = show_metric_top(ex_adx_abbr, "LRC", k=8)
# NOTE(review): `update_prone` is neither defined nor imported in the cells shown here, so this stays disabled.
# pos_prone, neg_prone = update_prone(exdf, pos_prone, neg_prone)
exdf

Unnamed: 0_level_0,Unnamed: 1_level_0,LRC,f,unexp_f,dP1,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f
l1,l2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
legally,binding,12.67,637,636.28,0.93,0.19,0.93,0.19,8527.1,3226225,3380,684,0.72
seriously,hurt,11.85,252,251.86,0.7,0.21,0.7,0.21,3585.97,3226225,1219,361,0.14
mutually,exclusive,11.82,5509,5497.58,0.89,0.93,0.89,0.93,73788.68,3226225,5940,6202,11.42
long,lasting,11.48,372,371.62,0.8,0.14,0.8,0.14,4869.3,3226225,2670,465,0.38
reasonably,foreseeable,11.12,114,113.94,0.55,0.12,0.55,0.12,1583.89,3226225,935,209,0.06
clearly,erroneous,11.07,176,175.87,0.65,0.11,0.65,0.11,2354.77,3226225,1569,271,0.13
aesthetically,pleasing,10.88,794,793.37,0.43,0.72,0.43,0.72,10961.34,3226225,1104,1836,0.63
independently,wealthy,10.63,192,191.89,0.16,0.65,0.16,0.65,2670.87,3226225,295,1236,0.11




 Top $\Delta P(\texttt{adv}|\texttt{adj})$ values

In [None]:
# The 8 pairs with the highest `dP1`.
exdf = show_metric_top(ex_adx_abbr, 'dP1', k=8)
# NOTE(review): `update_prone` is neither defined nor imported in the cells shown here, so this stays disabled.
# pos_prone, neg_prone = update_prone(exdf, pos_prone, neg_prone)
exdf

Unnamed: 0_level_0,Unnamed: 1_level_0,dP1,f,unexp_f,LRC,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f
l1,l2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
legally,binding,0.93,637,636.28,12.67,0.19,0.93,0.19,8527.1,3226225,3380,684,0.72
too,late,0.89,25854,23310.66,8.75,0.08,0.98,0.08,118329.97,3226225,311944,26304,2543.34
mutually,exclusive,0.89,5509,5497.58,11.82,0.93,0.89,0.93,73788.68,3226225,5940,6202,11.42
too,early,0.88,7193,6479.91,8.03,0.02,0.98,0.02,32089.57,3226225,311944,7375,713.09
too,shabby,0.88,5399,4863.82,7.96,0.02,0.98,0.02,24062.2,3226225,311944,5535,535.18
much,older,0.81,1195,1170.85,7.68,0.02,0.83,0.02,8482.07,3226225,53921,1445,24.15
long,lasting,0.8,372,371.62,11.48,0.14,0.8,0.14,4869.3,3226225,2670,465,0.38
always,greener,0.78,426,409.04,6.29,0.0,0.81,0.0,2427.81,3226225,104605,523,16.96




 Top conditional probability $P(\texttt{adv}|\texttt{adj})$ values

In [None]:
# The 8 pairs with the highest `dP1_simple`.
exdf = show_metric_top(ex_adx_abbr, 'dP1_simple', k=8)
# NOTE(review): `update_prone` is neither defined nor imported in the cells shown here, so this stays disabled.
# pos_prone, neg_prone = update_prone(exdf, pos_prone, neg_prone)
exdf

Unnamed: 0_level_0,Unnamed: 1_level_0,dP1_simple,f,unexp_f,LRC,dP1,dP2,dP2_simple,G2,N,f1,f2,exp_f
l1,l2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
too,late,0.98,25854,23310.66,8.75,0.89,0.08,0.08,118329.97,3226225,311944,26304,2543.34
too,shabby,0.98,5399,4863.82,7.96,0.88,0.02,0.02,24062.2,3226225,311944,5535,535.18
too,early,0.98,7193,6479.91,8.03,0.88,0.02,0.02,32089.57,3226225,311944,7375,713.09
legally,binding,0.93,637,636.28,12.67,0.93,0.19,0.19,8527.1,3226225,3380,684,0.72
mutually,exclusive,0.89,5509,5497.58,11.82,0.89,0.93,0.93,73788.68,3226225,5940,6202,11.42
too,fussed,0.84,136,120.43,4.23,0.75,0.0,0.0,501.57,3226225,311944,161,15.57
much,older,0.83,1195,1170.85,7.68,0.81,0.02,0.02,8482.07,3226225,53921,1445,24.15
more,fitting,0.82,299,279.86,5.46,0.77,0.0,0.0,1425.41,3226225,169217,365,19.14




 Top $\Delta P(\texttt{adj}|\texttt{adv})$ values

In [None]:
# The 8 pairs with the highest `dP2`.
exdf = show_metric_top(ex_adx_abbr, 'dP2', k=8)
# NOTE(review): `update_prone` is neither defined nor imported in the cells shown here, so this stays disabled.
# pos_prone, neg_prone = update_prone(exdf, pos_prone, neg_prone)
exdf

Unnamed: 0_level_0,Unnamed: 1_level_0,dP2,f,unexp_f,LRC,dP1,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f
l1,l2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
ornamentally,significant,1.0,109,108.73,8.62,0.01,0.01,1.0,1312.36,3226225,109,7893,0.27
backwards,compatible,0.97,142,141.89,10.31,0.06,0.06,0.97,2025.02,3226225,146,2341,0.11
crystal,clear,0.97,117,114.33,5.4,0.0,0.0,0.99,875.54,3226225,118,72905,2.67
mutually,exclusive,0.93,5509,5497.58,11.82,0.89,0.89,0.93,73788.68,3226225,5940,6202,11.42
closely,related,0.92,819,816.26,8.25,0.08,0.08,0.92,9058.12,3226225,887,9959,2.74
terminally,ill,0.9,373,372.74,10.6,0.18,0.18,0.9,5314.49,3226225,413,2017,0.26
mobile,friendly,0.9,435,434.12,8.93,0.07,0.07,0.9,5210.44,3226225,484,5834,0.88
humanly,possible,0.88,344,341.53,7.02,0.02,0.02,0.89,3216.91,3226225,386,20623,2.47




 Top conditional probability $P(\texttt{adj}|\texttt{adv})$ values

In [None]:
# The 8 pairs with the highest `dP2_simple`.
exdf = show_metric_top(ex_adx_abbr, 'dP2_simple', k=8)
# NOTE(review): `update_prone` is neither defined nor imported in the cells shown here, so this stays disabled.
# pos_prone, neg_prone = update_prone(exdf, pos_prone, neg_prone)
exdf

Unnamed: 0_level_0,Unnamed: 1_level_0,dP2_simple,f,unexp_f,LRC,dP1,dP2,dP1_simple,G2,N,f1,f2,exp_f
l1,l2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
ornamentally,significant,1.0,109,108.73,8.62,0.01,1.0,0.01,1312.36,3226225,109,7893,0.27
crystal,clear,0.99,117,114.33,5.4,0.0,0.97,0.0,875.54,3226225,118,72905,2.67
backwards,compatible,0.97,142,141.89,10.31,0.06,0.97,0.06,2025.02,3226225,146,2341,0.11
mutually,exclusive,0.93,5509,5497.58,11.82,0.89,0.93,0.89,73788.68,3226225,5940,6202,11.42
closely,related,0.92,819,816.26,8.25,0.08,0.92,0.08,9058.12,3226225,887,9959,2.74
terminally,ill,0.9,373,372.74,10.6,0.18,0.9,0.18,5314.49,3226225,413,2017,0.26
mobile,friendly,0.9,435,434.12,8.93,0.07,0.9,0.07,5210.44,3226225,484,5834,0.88
humanly,possible,0.89,344,341.53,7.02,0.02,0.88,0.02,3216.91,3226225,386,20623,2.47




 Top log-likelihood $G^2$ values

In [None]:
# The 8 pairs with the highest `G2`.
exdf = show_metric_top(ex_adx_abbr, 'G2', k=8)
# NOTE(review): `update_prone` is neither defined nor imported in the cells shown here, so this stays disabled.
# pos_prone, neg_prone = update_prone(exdf, pos_prone, neg_prone)
exdf

Unnamed: 0_level_0,Unnamed: 1_level_0,G2,f,unexp_f,LRC,dP1,dP2,dP1_simple,dP2_simple,N,f1,f2,exp_f
l1,l2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
immediately,clear,123693.61,25276,23980.73,4.83,0.34,0.43,0.35,0.44,3226225,57319,72905,1295.27
too,late,118329.97,25854,23310.66,8.75,0.89,0.08,0.98,0.08,3226225,311944,26304,2543.34
immediately,available,106933.84,21297,20299.71,5.03,0.37,0.36,0.38,0.37,3226225,57319,56133,997.29
even,close,96653.89,17701,16984.89,5.84,0.58,0.22,0.6,0.23,3226225,78489,29435,716.11
quite,sure,77043.21,26156,22996.8,3.3,0.19,0.3,0.2,0.33,3226225,79118,128824,3159.2
always,easy,73886.15,24574,21734.43,3.5,0.26,0.21,0.28,0.23,3226225,104605,87578,2839.57
mutually,exclusive,73788.68,5509,5497.58,11.82,0.89,0.93,0.89,0.93,3226225,5940,6202,11.42
much,better,61573.77,11225,10824.8,5.62,0.46,0.2,0.47,0.21,3226225,53921,23945,400.2


In [None]:


# Keep only the pairs whose LRC magnitude exceeds 1, in either direction.
# NOTE(review): the cutoff of 1 is hard-coded; its rationale is not shown here.
sig_adx_abbr = ex_adx_abbr.loc[ex_adx_abbr.LRC.abs() > 1, :]
sig_adx_abbr

Unnamed: 0_level_0,f,unexp_f,LRC,dP1,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f,l1,l2
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
legally~binding,637,636.28,12.67,0.93,0.19,0.93,0.19,8527.10,3226225,3380,684,0.72,legally,binding
seriously~hurt,252,251.86,11.85,0.70,0.21,0.70,0.21,3585.97,3226225,1219,361,0.14,seriously,hurt
mutually~exclusive,5509,5497.58,11.82,0.89,0.93,0.89,0.93,73788.68,3226225,5940,6202,11.42,mutually,exclusive
long~lasting,372,371.62,11.48,0.80,0.14,0.80,0.14,4869.30,3226225,2670,465,0.38,long,lasting
reasonably~foreseeable,114,113.94,11.12,0.55,0.12,0.55,0.12,1583.89,3226225,935,209,0.06,reasonably,foreseeable
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
that~sure,300,-6304.90,-4.15,-0.05,-0.04,0.00,0.00,-11335.14,3226225,165411,128824,6604.90,that,sure
more~easy,147,-4446.51,-4.50,-0.05,-0.03,0.00,0.00,-8240.87,3226225,169217,87578,4593.51,more,easy
as~available,320,-8996.28,-4.75,-0.16,-0.02,0.01,0.00,-17646.14,3226225,535450,56133,9316.28,as,available
as~sure,714,-20666.66,-4.91,-0.17,-0.05,0.01,0.00,-41207.08,3226225,535450,128824,21380.66,as,sure




 Positive Prone Adverbs with significant LRC

In [None]:
# Rows whose `l1` is one of the listed `pos_prone['Adv']` items.
sig_adx_abbr.loc[sig_adx_abbr.l1.isin(pos_prone['Adv'])]

Unnamed: 0_level_0,f,unexp_f,LRC,dP1,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f,l1,l2
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
otherwise~eligible,103,102.23,6.52,0.06,0.07,0.06,0.07,819.04,3226225,1498,1648,0.77,otherwise,eligible
downright~impossible,127,125.47,5.86,0.04,0.07,0.05,0.07,886.49,3226225,1765,2795,1.53,downright,impossible
pretty~enough,544,528.4,4.91,0.02,0.23,0.03,0.23,2952.01,3226225,2319,21707,15.6,pretty,enough
otherwise~available,388,361.94,3.62,0.01,0.24,0.01,0.26,1471.42,3226225,1498,56133,26.06,otherwise,available
otherwise~possible,116,106.42,3.02,0.01,0.07,0.01,0.08,374.2,3226225,1498,20623,9.58,otherwise,possible




 Negative Prone Adverbs with significant LRC

In [None]:
# Rows whose `l1` is one of the listed `neg_prone['Adv']` items.
sig_adx_abbr.loc[sig_adx_abbr.l1.isin(neg_prone['Adv'])]

Unnamed: 0_level_0,f,unexp_f,LRC,dP1,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f,l1,l2
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
too~late,25854,23310.66,8.75,0.89,0.08,0.98,0.08,118329.97,3226225,311944,26304,2543.34,too,late
any~clearer,357,354.58,8.33,0.70,0.02,0.71,0.02,3213.03,3226225,15492,504,2.42,any,clearer
any~happier,830,823.67,8.09,0.63,0.05,0.63,0.05,7174.46,3226225,15492,1318,6.33,any,happier
too~early,7193,6479.91,8.03,0.88,0.02,0.98,0.02,32089.57,3226225,311944,7375,713.09,too,early
too~shabby,5399,4863.82,7.96,0.88,0.02,0.98,0.02,24062.20,3226225,311944,5535,535.18,too,shabby
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
too~simple,236,-2726.29,-3.35,-0.09,-0.01,0.01,0.00,-4545.21,3226225,311944,30637,2962.29,too,simple
too~effective,140,-1809.76,-3.38,-0.09,-0.01,0.01,0.00,-3067.85,3226225,311944,20165,1949.76,too,effective
too~aware,128,-1770.51,-3.44,-0.09,-0.01,0.01,0.00,-3032.95,3226225,311944,19635,1898.51,too,aware
too~clear,473,-6576.19,-3.73,-0.09,-0.02,0.01,0.00,-11388.57,3226225,311944,72905,7049.19,too,clear




 Positive Prone Adjectives with significant LRC

In [None]:
# Rows whose `l2` is one of the listed `pos_prone['Adj']` items.
sig_adx_abbr.loc[sig_adx_abbr.l2.isin(pos_prone['Adj'])]

Unnamed: 0_level_0,f,unexp_f,LRC,dP1,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f,l1,l2
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
intimately~familiar,311,309.26,7.31,0.02,0.72,0.02,0.73,2931.16,3226225,427,13156,1.74,intimately,familiar
substantially~similar,118,117.46,7.28,0.06,0.14,0.06,0.14,1061.52,3226225,866,2000,0.54,substantially,similar
much~larger,790,769.56,6.35,0.63,0.01,0.65,0.01,4900.77,3226225,53921,1223,20.44,much,larger
radically~different,430,421.67,5.54,0.01,0.72,0.01,0.73,2986.49,3226225,587,45763,8.33,radically,different
drastically~different,237,232.02,5.33,0.01,0.66,0.01,0.68,1579.05,3226225,351,45763,4.98,drastically,different
significantly~different,1604,1561.26,5.16,0.03,0.52,0.04,0.53,9582.62,3226225,3013,45763,42.74,significantly,different
already~familiar,893,869.58,5.14,0.07,0.15,0.07,0.16,4962.0,3226225,5744,13156,23.42,already,familiar
markedly~different,147,143.51,5.05,0.0,0.58,0.0,0.6,922.83,3226225,246,45763,3.49,markedly,different
remotely~similar,152,148.48,4.99,0.07,0.03,0.08,0.03,862.94,3226225,5679,2000,3.52,remotely,similar
vastly~different,140,136.45,4.94,0.0,0.55,0.0,0.56,852.16,3226225,250,45763,3.55,vastly,different




 Negative Prone Adjectives with significant LRC

In [None]:
# Rows whose `l2` is one of the listed `neg_prone['Adj']` items.
sig_adx_abbr.loc[sig_adx_abbr.l2.isin(neg_prone['Adj'])]

Unnamed: 0_level_0,f,unexp_f,LRC,dP1,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f,l1,l2
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
too~late,25854,23310.66,8.75,0.89,0.08,0.98,0.08,118329.97,3226225,311944,26304,2543.34,too,late
too~early,7193,6479.91,8.03,0.88,0.02,0.98,0.02,32089.57,3226225,311944,7375,713.09,too,early
too~shabby,5399,4863.82,7.96,0.88,0.02,0.98,0.02,24062.2,3226225,311944,5535,535.18,too,shabby
truly~alone,228,223.88,5.53,0.13,0.03,0.13,0.03,1419.43,3226225,7763,1713,4.12,truly,alone
more~frustrating,1917,1770.35,5.03,0.63,0.01,0.69,0.01,7935.19,3226225,169217,2796,146.65,more,frustrating
absolutely~certain,543,527.65,4.96,0.06,0.09,0.06,0.09,2899.94,3226225,5877,8429,15.35,absolutely,certain
more~aggravating,101,92.56,3.85,0.57,0.0,0.63,0.0,389.36,3226225,169217,161,8.44,more,aggravating
quite~sure,26156,22996.8,3.3,0.19,0.3,0.2,0.33,77043.21,3226225,79118,128824,3159.2,quite,sure
completely~alone,214,195.62,3.24,0.11,0.01,0.12,0.01,683.9,3226225,34625,1713,18.38,completely,alone
more~evident,1475,1263.62,3.17,0.31,0.01,0.37,0.01,3687.8,3226225,169217,4030,211.38,more,evident
