In [None]:
# %%

import pandas as pd

from source.utils import FREQ_DIR, RESULT_DIR, UCS_DIR, confirm_dir
from source.utils.associate import (BINARY_ASSOC_ARGS, add_extra_am,
                                    associate_ucs, confirm_basic_ucs)
from source.utils.associate import convert_ucs_to_csv as ucs2csv
from source.utils.associate import get_associations_csv as init_am, AM_DF_DIR
from source.utils.associate import manipulate_ucs, seek_readable_ucs, adjust_assoc_columns
pd.set_option('display.float_format', '{:,.2f}'.format)

# %% [markdown]

 set parameters

In [None]:
# Unit of analysis; used in the frequency-table file name and as the key into
# the per-pattern vocabulary sizes and the `pos_prone`/`neg_prone` word lists.
UNIT = 'Adj'
# Pattern category to process: names the subdirectory of `FREQ_DIR` and the
# UCS output directories. Exactly one of the alternatives should be active.
# PAT_DIR = 'POSmirror'
# PAT_DIR = 'NEGmirror'
PAT_DIR = 'ANYmirror'
# PAT_DIR = 'RBdirect'
# PAT_DIR = 'RBXadj'
# Minimum joint frequency; passed to `seek_readable_ucs()` / `confirm_basic_ucs()`
# and embedded in output names as `_min{FRQ_FLOOR}x`.
# FRQ_FLOOR = 3
# FRQ_FLOOR = 10
# FRQ_FLOOR = 20
# FRQ_FLOOR = 50
# BUG: a floor of 100 will be used regardless of this value, so it is set to 100
# here to at least keep the file naming accurate.
FRQ_FLOOR = 100
# Tab-separated starting frequency data (count, adverb, adjective per row).
ADVADJ_TSV = FREQ_DIR.joinpath(
    f'{PAT_DIR}/ucs_format/Adv{UNIT}_frq-thrMIN-7.35f.tsv')
# Columns of interest: only those present in the extended association table
# are kept when the abbreviated table is built further down.
FOCUS = ['f', 'unexpected_f',
         'conservative_log_ratio',
         'am_p1_given2', 'am_p2_given1',
         'am_p1_given2_simple', 'am_p2_given1_simple',
         'am_log_likelihood',
         #  'mutual_information', 'am_odds_ratio_disc', 't_score',
         'N', 'f1', 'f2', 'E11',
         'l1', 'l2']

In [None]:
# %%

def invert_set_dict(d: dict):
    """Flip a `label -> collection of members` mapping into `member -> label`.

    Args:
        d: dictionary whose values are iterables of hashable members.

    Returns:
        dict mapping every member to the key whose collection contains it.
        When a member appears under several keys, the key iterated last wins.
    """
    inverted = {}
    for label, members in d.items():
        for member in members:
            inverted[member] = label
    return inverted

# %% [markdown]

 1. Run `seek_readable_ucs()` to generate consistent output path

In [None]:
readable = seek_readable_ucs(min_freq=FRQ_FLOOR,
                             ucs_subdir='adv_adj',
                             contained_counts_path=ADVADJ_TSV)
print(readable.relative_to(RESULT_DIR))

    > seeking `adv_adj/ANYmirror/readable/AdvAdj_frq-thrMIN-7.35f_min100x*` frequency data and initial associations...
ucs/adv_adj/ANYmirror/readable/AdvAdj_frq-thrMIN-7.35f_min100x.rsort-view_am-only.csv


# %% [markdown]

 Snippet of starting frequency data (`ADVADJ_TSV`)

In [None]:
! head -5 {ADVADJ_TSV} | column -t

14875  as          simple
10996  more        important
8327   completely  different
7819   more        likely
7221   too         easy


# %% [markdown]

 2. Run `confirm_basic_ucs()` (if needed)

In [None]:
if not readable.is_file():
    # `readable` may name either the `.txt` view or its `.csv` conversion (the
    # path printed by `seek_readable_ucs()` above ends in `.csv`), so swap
    # whichever view suffix is present for the `.ds.gz` data set suffix.
    # Previously only `.txt` was handled, leaving a `.csv` name unchanged.
    ucs_name = readable.name
    for view_suffix in ('.rsort-view_am-only.txt', '.rsort-view_am-only.csv'):
        ucs_name = ucs_name.replace(view_suffix, '.ds.gz')
    # The data set lives one level above the `readable/` subdirectory.
    basic_ucs_path = readable.parent.parent.joinpath(ucs_name)
    print(
        f'Creating initial UCS table: `{basic_ucs_path.relative_to(RESULT_DIR)}`')

    # Keep the returned path: it is what `associate_ucs()` receives later.
    basic_ucs_path = confirm_basic_ucs(
        basic_ucs_path,
        freq_floor=FRQ_FLOOR,
        contained_counts_path=ADVADJ_TSV)

# %% [markdown]

 Excerpt of initial UCS table

In [None]:
init_readable = UCS_DIR.joinpath(
    f'adv_adj/{PAT_DIR}/readable'
).joinpath(f'{ADVADJ_TSV.name.replace(".tsv","")}_min{FRQ_FLOOR}x.init.txt')
! head -7 {init_readable}

             l1  l2                    f     f2      f1        N  
---------------  ----------------  -----  -----  ------  -------  
           more  important         23085  43776  285602  1761853  
           very  important          6209  43776  184009  1761853  
            too  important           252  43776  158082  1761853  
             so  important          1739  43776  129602  1761853  
             as  important          3417  43776  108116  1761853  


# %% [markdown]

 3. Run `associate_ucs()` (if needed)

In [None]:
if not readable.is_file():
    associate_ucs(basic_ucs_path)

# Locate the logs relative to the project root instead of hard-coding an
# absolute path. NOTE(review): assumes the `logs/` directory is a sibling of
# `RESULT_DIR`, as the paths in the captured outputs indicate — confirm.
log_dir = RESULT_DIR.parent.joinpath('logs', 'associate', 'ucs')
log_glob = f'ucs-{PAT_DIR}_Adv{UNIT}_frq-thrMIN-7-35f_min{FRQ_FLOOR}x*.log'
transform_ucs_log = str(log_dir / log_glob)

# Pick the most recently modified matching log (what `ls -t1 | head -1` did).
latest_log = max(log_dir.glob(log_glob),
                 key=lambda log: log.stat().st_mtime,
                 default=None)
if latest_log is None:
    # Previously an empty match left the `head`/`tail` calls with no file.
    print(f'No log found matching `{transform_ucs_log}`')
else:
    log_lines = latest_log.read_text(
        encoding='utf8', errors='replace').splitlines()
    # Show the first 15 and last 2 lines of the log
    print('\n'.join(log_lines[:15]))
    print('...')
    print('\n'.join(log_lines[-2:]))

# Manipulating AdvAdj_frq-thrMIN-7-35f_min100x ucs table
path to this script: /share/compling/projects/sanpi/script/transform_ucs.sh
Thu May 23 18:15:25 EDT 2024
(TMP: /share/compling/projects/sanpi/results/ucs/adv_adj/ANYmirror/tmp/tmp_ANYmirror-20240523-181525.AdvAdj_frq-thrMIN-7-35f_min100x)
## Initial Contingency Info

DATA SET FILE:  /share/compling/projects/sanpi/results/ucs/adv_adj/ANYmirror/AdvAdj_frq-thrMIN-7.35f_min100x.ds.gz

# Frequency signatures computed by the ucs-make-tables tool for relational cooccurrences.
# Sample size:  N = 1761853 tokens,  V = 195059 pair types.
# A frequency threshold of f >= 100 was applied, leaving V = 2315 pair types.

##:: size = 2315
##:: threshold = 100

...
Loading data set /share/compling/projects/sanpi/results/ucs/adv_adj/ANYmirror/AdvAdj_frq-thrMIN-7.35f_min100x.rsort.gz ... 2315 rows
Script finished at Thu May 23 18:15:29 EDT 2024


# %% [markdown]

 4. Run `convert_ucs_to_csv()` (imported as `ucs2csv`) to convert `ucs/[PAT_DIR]/readable/*.txt` to a format that `pandas` can parse as a dataframe

In [None]:
! head -5 {readable}
csv_path = ucs2csv(readable)
print(f'CSV: `{csv_path.relative_to(RESULT_DIR)}`')

l1,l2,f,E11,am_log_likelihood,am_odds_ratio_disc,am_p1_given2,am_p2_given1,am_p1_given2_simple,am_p2_given1_simple,f1,f2,N
even,triple,113,4.2193225,696.935688,2.523903988,0.8917066440,0.0018492047,0.9262295082,0.0018544959,60933,122,1761853
too,late,12778,1180.1509808,59242.424654,2.574631803,0.8883968110,0.0805976433,0.9714893941,0.0808314672,158082,13153,1761853
too,early,5158,481.8224562,23279.731355,2.405772510,0.8734588021,0.0324964473,0.9605214153,0.0326286358,158082,5370,1761853
very,least,101,11.1751451,411.488355,2.127013432,0.8395356111,0.0005450835,0.9439252336,0.0005488862,184009,107,1761853
UCS table text converted & saved as /share/compling/projects/sanpi/results/ucs/adv_adj/ANYmirror/readable/AdvAdj_frq-thrMIN-7.35f_min100x.rsort-view_am-only.csv
CSV: `ucs/adv_adj/ANYmirror/readable/AdvAdj_frq-thrMIN-7.35f_min100x.rsort-view_am-only.csv`


# %% [markdown]

 5. Load the converted CSV as a `pandas` dataframe and index it by `l1~l2` key

In [None]:
adx_amdf = pd.read_csv(csv_path).convert_dtypes()
adx_amdf

Unnamed: 0,l1,l2,f,E11,am_log_likelihood,am_odds_ratio_disc,am_p1_given2,am_p2_given1,am_p1_given2_simple,am_p2_given1_simple,f1,f2,N
0,even,triple,113,4.22,696.94,2.52,0.89,0.00,0.93,0.00,60933,122,1761853
1,too,late,12778,1180.15,59242.42,2.57,0.89,0.08,0.97,0.08,158082,13153,1761853
2,too,early,5158,481.82,23279.73,2.41,0.87,0.03,0.96,0.03,158082,5370,1761853
3,very,least,101,11.18,411.49,2.13,0.84,0.00,0.94,0.00,184009,107,1761853
4,early,next,144,0.02,2675.23,5.42,0.82,0.83,0.82,0.83,174,175,1761853
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2310,more,wrong,273,3382.45,-5408.31,-1.17,-0.15,-0.01,0.01,0.00,285602,20866,1761853
2311,more,happy,179,2327.32,-3760.40,-1.19,-0.15,-0.01,0.01,0.00,285602,14357,1761853
2312,more,available,136,2048.34,-3428.64,-1.25,-0.15,-0.01,0.01,0.00,285602,12636,1761853
2313,more,better,125,2281.76,-3979.33,-1.34,-0.15,-0.01,0.01,0.00,285602,14076,1761853


In [None]:
# %%

# Label every row as `<l1>~<l2>` and use that label as the index.
pair_key = adx_amdf['l1'] + '~' + adx_amdf['l2']
adx_amdf = adx_amdf.assign(key=pair_key.astype('string')).set_index('key')
adx_amdf

Unnamed: 0_level_0,l1,l2,f,E11,am_log_likelihood,am_odds_ratio_disc,am_p1_given2,am_p2_given1,am_p1_given2_simple,am_p2_given1_simple,f1,f2,N
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
even~triple,even,triple,113,4.22,696.94,2.52,0.89,0.00,0.93,0.00,60933,122,1761853
too~late,too,late,12778,1180.15,59242.42,2.57,0.89,0.08,0.97,0.08,158082,13153,1761853
too~early,too,early,5158,481.82,23279.73,2.41,0.87,0.03,0.96,0.03,158082,5370,1761853
very~least,very,least,101,11.18,411.49,2.13,0.84,0.00,0.94,0.00,184009,107,1761853
early~next,early,next,144,0.02,2675.23,5.42,0.82,0.83,0.82,0.83,174,175,1761853
...,...,...,...,...,...,...,...,...,...,...,...,...,...
more~wrong,more,wrong,273,3382.45,-5408.31,-1.17,-0.15,-0.01,0.01,0.00,285602,20866,1761853
more~happy,more,happy,179,2327.32,-3760.40,-1.19,-0.15,-0.01,0.01,0.00,285602,14357,1761853
more~available,more,available,136,2048.34,-3428.64,-1.25,-0.15,-0.01,0.01,0.00,285602,12636,1761853
more~better,more,better,125,2281.76,-3979.33,-1.34,-0.15,-0.01,0.01,0.00,285602,14076,1761853


# %% [markdown]

 6. Save to `./results/assoc_df/`

In [None]:
# Mirror the UCS-relative location under `AM_DF_DIR`, dropping the `readable/`
# level and the view suffix from the file name.
rel_csv = str(csv_path.relative_to(UCS_DIR))
for fragment in ('/readable', '.rsort-view_am-only'):
    rel_csv = rel_csv.replace(fragment, '')
df_csv_path = AM_DF_DIR.joinpath(rel_csv)
df_pkl_path = df_csv_path.with_suffix('.pkl.gz')

# Existing files are never overwritten.
if not df_csv_path.is_file():
    confirm_dir(df_csv_path.parent)
    adx_amdf.to_csv(df_csv_path)

if not df_pkl_path.is_file():
    adx_amdf.to_pickle(df_pkl_path)

# %% [markdown]

 7. Add additional AM via `add_extra_am()`
 Define dictionary containing relevant vocab sizes
 !!! Warning This is a `#HACK`: \
     Rather than developing a command/code to retrieve the vocab sizes programmatically,
     I simply copied the values given in the log output of `transform_ucs.sh`
     for each `PAT_DIR`+`UNIT` combination

In [None]:
# Vocabulary size for each pattern directory and unit, copied by hand from the
# `transform_ucs.sh` log output (e.g. the ANYmirror log reports
# `V = 195059 pair types`).
VOCABS = {
    'NEGmirror': {'Adj': 40004},
    'POSmirror': {'Adj': 178159},
    'ANYmirror': {'Adj': 195059},
    'RBdirect':  {'Adj': 61860},
    'RBXadj':  {'Adj': 1940305}
}  # ! #HACK
VOCAB = VOCABS[PAT_DIR][UNIT]

# Show the hard-coded sizes as a markdown table for reference.
print(pd.DataFrame(VOCABS).convert_dtypes().to_markdown(intfmt=','))
# NOTE(review): this discards the value looked up above, so `add_extra_am()`
# is always called with `vocab=None` — confirm whether that is intentional.
VOCAB = None
# Extend the association table with the additional requested metrics.
ex_adx_amdf = add_extra_am(df=adx_amdf,
                           verbose=True,
                           vocab=VOCAB,
                           metrics=['t_score', 'mutual_information']
                           ).convert_dtypes()

|     |   NEGmirror |   POSmirror |   ANYmirror |   RBdirect |    RBXadj |
|:----|------------:|------------:|------------:|-----------:|----------:|
| Adj |      40,004 |     178,159 |     195,059 |     61,860 | 1,940,305 |

Preview of Extended Measures (rounded)

| key           |   t_score |   mutual_information |   deltaP_min |   deltaP_max |   deltaP_max_abs |   deltaP_product |   unexpected_f |   unexpected_ratio |
|:--------------|----------:|---------------------:|-------------:|-------------:|-----------------:|-----------------:|---------------:|-------------------:|
| even~triple   |     10.23 |                 1.43 |         0.00 |         0.89 |             0.89 |             0.00 |         108.78 |               0.96 |
| too~late      |    102.60 |                 1.03 |         0.08 |         0.89 |             0.89 |             0.07 |      11,597.85 |               0.91 |
| too~early     |     65.11 |                 1.03 |         0.03 |         0.87 |             0.8

# %% [markdown]

 Save extended AM tables to `extra/` subdirectory if not already saved

In [None]:
# Extended tables go in an `extra/` folder beside the basic table, with
# `_extra` appended to the file stem.
extra_dir = df_csv_path.parent.joinpath('extra')
df_extra_csv = extra_dir.joinpath(
    df_csv_path.name.replace('.csv', '_extra.csv'))
df_extra_pkl = df_extra_csv.with_suffix('.pkl.gz')
print(df_extra_csv)

# Existing files are never overwritten.
if not df_extra_csv.is_file():
    confirm_dir(extra_dir)
    ex_adx_amdf.to_csv(df_extra_csv)
if not df_extra_pkl.is_file():
    ex_adx_amdf.to_pickle(df_extra_pkl)

/share/compling/projects/sanpi/results/assoc_df/adv_adj/ANYmirror/extra/AdvAdj_frq-thrMIN-7.35f_min100x_extra.csv


In [None]:
# %%

# Keep an untouched copy of the full extended table.
ex_adx_full = ex_adx_amdf.copy()

# Restrict to the columns of interest that actually exist in this table,
# then pass them through `adjust_assoc_columns()`; the result is ordered by
# its `LRC` column, strongest first.
wanted = ['polarity', 'quant'] + FOCUS
present = [name for name in wanted if name in ex_adx_amdf.columns]
ex_adx_abbr = adjust_assoc_columns(ex_adx_amdf[present])
ex_adx_abbr = ex_adx_abbr.sort_values('LRC', ascending=False)
cols = ex_adx_abbr.columns

# %% [markdown]

 Define the lexical items that showed a positive or negative lean in the binary environment evaluation

In [None]:
pos_prone = {
    'Adj': [
        'unrelated',
        'unable',
        'akin',
        'larger',
        'different',
        'familiar',
        'similar',
        'likely',
        'brief',
        'unaware'
    ],
    'Adv': [
        'slightly',
        'definitely',
        'utterly',
        # LRC top
        'pretty',
        'rather',
        'plain',
        'fairly',
        'somewhat',
        'otherwise',
        'downright',
        'relatively',
        # G2 top
        # 'very',
        # 'even',
        # 'just',
        # dP1 top (and odds ratio disc)
        'plain',
        'maybe'
    ],
    'Bigr': [
        # G2 top
        'completely_different',
        'too_familiar',
        'even_better',
        # dP1 top
        'quite_different',
        'too_real',
        'well_aware',
        # LRC top
        'too_common',
        'entirely_different'
    ]}
neg_prone = {
    'Bigr': [
        # LRC top
        'quite_sure',
        'really_sure',
        'too_early',
        'too_pleased',
        'too_fancy',
        # dP1 top
        'entirely_sure',
        'ever_easy',
        'ever_perfect',
        'particularly_surprising',
        'particularly_new',
        # G2 top
        'too_late',
        'more_important',
        'so_easy',
        'as_good',
        'too_old'
    ],
    'Adv': [
        'yet',
        # LRC top
        'ever',
        'any',
        'longer',
        'necessarily',
        'that',
        # dP1 top
        'before',
        'wise',  # ? How is this used as an adverb?
        'earthly',
        'remotely',
        'exactly',
        # G2 top
        'particularly',
        'too',
        # 'inherently'
    ],
    'Adj': [
        # LRC top
        'early',
        'late',
        'fancy',
        'alone',
        'sure',
        # dP1 top
        'shabby',
        'demoralizing',
        'alone',
        'aggravating',
        'groundbreaking',
        'eventful',
        # G2 top
        'important',
        'frustrating',
        'evident',
        'certain'
    ]
}


def sort_prone_by_f2(prone_list, amdf):
    """Order the `l2` items of `prone_list` by their `f2` value, largest first.

    Args:
        prone_list: candidate `l2` values to look for.
        amdf: dataframe with at least `l2` and `f2` columns.

    Returns:
        list of `l2` values that occur in `amdf`, sorted by descending `f2`.
        Candidates absent from `amdf` are omitted; an `l2` paired with more
        than one distinct `f2` appears once per distinct value.
    """
    is_prone = amdf.l2.isin(prone_list)
    # One row per distinct (f2, l2) combination among the matching rows
    marginals = amdf.copy().loc[is_prone, ['f2', 'l2']].drop_duplicates()
    by_l2 = marginals.reset_index(drop=True).set_index('l2').round(1)
    ranked = by_l2.sort_values(['f2'], ascending=False)
    return ranked.index.to_list()


pos_prone[UNIT] = sort_prone_by_f2(pos_prone[UNIT], ex_adx_abbr)
neg_prone[UNIT] = sort_prone_by_f2(neg_prone[UNIT], ex_adx_abbr)
pos_prone[UNIT]

['different',
 'likely',
 'familiar',
 'similar',
 'larger',
 'unable',
 'akin',
 'unaware',
 'unrelated',
 'brief']

# %% [markdown]

 Strongest associations for each polarity by metric

In [None]:
def show_metric_top(amdf: pd.DataFrame,
                    metric: str,
                    k=5,
                    cols=None):
    """Return the `k` rows with the largest `metric`, indexed by (`l1`, `l2`).

    Args:
        amdf: association table containing `metric`, `l1` and `l2` columns.
        metric: name of the column to rank by; shown as the first column.
        k: number of rows to keep.
        cols: columns to show, as any iterable of names (list, `pd.Index`, ...).
            `None`, an empty selection, or the legacy `[None]` placeholder
            selects every column of `amdf`.

    Returns:
        DataFrame of the top `k` rows with `metric` first, followed by the
        remaining selected columns, and a (`l1`, `l2`) MultiIndex.
    """
    # `None` default avoids a shared mutable default argument
    if cols is None or not any(cols):
        cols = amdf.columns
    # Plain iteration accepts lists as well as pandas Index objects
    ordered = [metric] + [name for name in cols if name != metric]
    # `l1`/`l2` are needed for the index even when not explicitly requested
    ordered += [name for name in ('l1', 'l2') if name not in ordered]
    return (amdf.nlargest(k, metric)
            .loc[:, ordered]
            .reset_index(drop=True).set_index(['l1', 'l2'])
            )

# %% [markdown]

 Top conservative log ratio $LRC$ values

In [None]:
exdf = show_metric_top(ex_adx_abbr, "LRC", k=8)
# pos_prone, neg_prone = update_prone(exdf, pos_prone, neg_prone)
exdf

Unnamed: 0_level_0,Unnamed: 1_level_0,LRC,f,unexp_f,dP1,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f
l1,l2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
early,next,14.32,144,143.98,0.82,0.83,0.82,0.83,2675.23,1761853,174,175,0.02
non,existent,11.78,108,107.95,0.67,0.22,0.68,0.22,1589.53,1761853,496,160,0.05
lightly,golden,11.47,206,205.9,0.48,0.52,0.48,0.52,2999.46,1761853,399,426,0.1
long,overdue,11.25,208,207.83,0.72,0.2,0.72,0.2,2793.45,1761853,1043,289,0.17
minimally,invasive,11.03,122,121.94,0.31,0.48,0.31,0.48,1743.16,1761853,254,392,0.06
mutually,exclusive,10.98,441,440.63,0.56,0.54,0.56,0.54,5968.69,1761853,823,794,0.37
critically,endangered,10.85,206,205.85,0.5,0.32,0.5,0.32,2768.7,1761853,645,408,0.15
grossly,negligent,10.42,112,111.92,0.42,0.2,0.42,0.2,1470.63,1761853,550,266,0.08


# %% [markdown]

 Top $\Delta P(\texttt{adv}|\texttt{adj})$ values

In [None]:
exdf = show_metric_top(ex_adx_abbr, 'dP1', k=8)
# pos_prone, neg_prone = update_prone(exdf, pos_prone, neg_prone)
exdf

Unnamed: 0_level_0,Unnamed: 1_level_0,dP1,f,unexp_f,LRC,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f
l1,l2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
even,triple,0.89,113,108.78,6.19,0.0,0.93,0.0,696.94,1761853,60933,122,4.22
too,late,0.89,12778,11597.85,8.09,0.08,0.97,0.08,59242.42,1761853,158082,13153,1180.15
too,early,0.87,5158,4676.18,7.49,0.03,0.96,0.03,23279.73,1761853,158082,5370,481.82
very,least,0.84,101,89.82,4.42,0.0,0.94,0.0,411.49,1761853,184009,107,11.18
early,next,0.82,144,143.98,14.32,0.83,0.82,0.83,2675.23,1761853,174,175,0.02
even,third,0.8,371,355.54,6.27,0.01,0.83,0.01,2096.25,1761853,60933,447,15.46
even,fourth,0.77,176,168.46,5.75,0.0,0.81,0.0,974.02,1761853,60933,218,7.54
now,live,0.77,407,405.35,9.38,0.07,0.77,0.07,4156.73,1761853,5479,530,1.65


# %% [markdown]

 Top conditional probability $P(\texttt{adv}|\texttt{adj})$ values

In [None]:
exdf = show_metric_top(ex_adx_abbr, 'dP1_simple', k=8)
# pos_prone, neg_prone = update_prone(exdf, pos_prone, neg_prone)
exdf

Unnamed: 0_level_0,Unnamed: 1_level_0,dP1_simple,f,unexp_f,LRC,dP1,dP2,dP2_simple,G2,N,f1,f2,exp_f
l1,l2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
too,late,0.97,12778,11597.85,8.09,0.89,0.08,0.08,59242.42,1761853,158082,13153,1180.15
too,early,0.96,5158,4676.18,7.49,0.87,0.03,0.03,23279.73,1761853,158082,5370,481.82
very,least,0.94,101,89.82,4.42,0.84,0.0,0.0,411.49,1761853,184009,107,11.18
even,triple,0.93,113,108.78,6.19,0.89,0.0,0.0,696.94,1761853,60933,122,4.22
more,akin,0.86,946,768.33,4.45,0.7,0.0,0.0,2622.96,1761853,285602,1096,177.67
more,galling,0.85,147,119.12,3.51,0.69,0.0,0.0,401.23,1761853,285602,172,27.88
even,third,0.83,371,355.54,6.27,0.8,0.01,0.01,2096.25,1761853,60933,447,15.46
early,next,0.82,144,143.98,14.32,0.82,0.83,0.83,2675.23,1761853,174,175,0.02


# %% [markdown]

 Top $\Delta P(\texttt{adj}|\texttt{adv})$ values

In [None]:
exdf = show_metric_top(ex_adx_abbr, 'dP2', k=8)
# pos_prone, neg_prone = update_prone(exdf, pos_prone, neg_prone)
exdf

Unnamed: 0_level_0,Unnamed: 1_level_0,dP2,f,unexp_f,LRC,dP1,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f
l1,l2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
crystal,clear,1.0,146,145.35,7.77,0.02,0.02,1.0,1584.13,1761853,146,7833,0.65
brand,new,0.96,523,518.96,7.0,0.04,0.04,0.97,4986.68,1761853,541,13145,4.04
kind,enough,0.95,354,352.1,7.49,0.04,0.04,0.95,3609.5,1761853,371,9034,1.9
at,best,0.94,139,138.86,9.84,0.08,0.08,0.95,1877.47,1761853,147,1714,0.14
humanly,possible,0.88,296,294.94,8.05,0.05,0.05,0.88,3181.85,1761853,336,5543,1.06
closely,related,0.87,703,701.24,8.79,0.18,0.18,0.87,8139.68,1761853,804,3857,1.76
wide,open,0.85,232,231.41,8.52,0.06,0.06,0.85,2632.68,1761853,273,3796,0.59
terminally,ill,0.84,255,254.56,9.13,0.1,0.1,0.84,3101.85,1761853,302,2540,0.44


# %% [markdown]

 Top conditional probability $P(\texttt{adj}|\texttt{adv})$ values

In [None]:
exdf = show_metric_top(ex_adx_abbr, 'dP2_simple', k=8)
# pos_prone, neg_prone = update_prone(exdf, pos_prone, neg_prone)
exdf

Unnamed: 0_level_0,Unnamed: 1_level_0,dP2_simple,f,unexp_f,LRC,dP1,dP2,dP1_simple,G2,N,f1,f2,exp_f
l1,l2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
crystal,clear,1.0,146,145.35,7.77,0.02,1.0,0.02,1584.13,1761853,146,7833,0.65
brand,new,0.97,523,518.96,7.0,0.04,0.96,0.04,4986.68,1761853,541,13145,4.04
kind,enough,0.95,354,352.1,7.49,0.04,0.95,0.04,3609.5,1761853,371,9034,1.9
at,best,0.95,139,138.86,9.84,0.08,0.94,0.08,1877.47,1761853,147,1714,0.14
humanly,possible,0.88,296,294.94,8.05,0.05,0.88,0.05,3181.85,1761853,336,5543,1.06
closely,related,0.87,703,701.24,8.79,0.18,0.87,0.18,8139.68,1761853,804,3857,1.76
vitally,important,0.86,270,262.22,4.98,0.01,0.84,0.01,1748.59,1761853,313,43776,7.78
wide,open,0.85,232,231.41,8.52,0.06,0.85,0.06,2632.68,1761853,273,3796,0.59


# %% [markdown]

 Top log-likelihood $G^2$ values

In [None]:
exdf = show_metric_top(ex_adx_abbr, 'G2', k=8)
# pos_prone, neg_prone = update_prone(exdf, pos_prone, neg_prone)
exdf

Unnamed: 0_level_0,Unnamed: 1_level_0,G2,f,unexp_f,LRC,dP1,dP2,dP1_simple,dP2_simple,N,f1,f2,exp_f
l1,l2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
too,late,59242.42,12778,11597.85,8.09,0.89,0.08,0.97,0.08,1761853,158082,13153,1180.15
as,simple,58922.5,15954,14396.44,4.61,0.58,0.14,0.63,0.15,1761853,108116,25382,1557.56
completely,different,34381.84,8352,7804.1,4.22,0.22,0.3,0.23,0.31,1761853,26819,35994,547.9
more,important,31883.65,23085,15988.77,2.47,0.37,0.07,0.53,0.08,1761853,285602,43776,7096.23
too,early,23279.73,5158,4676.18,7.49,0.87,0.03,0.96,0.03,1761853,158082,5370,481.82
even,better,22919.59,6103,5616.19,4.31,0.4,0.1,0.43,0.1,1761853,60933,14076,486.81
even,worse,22551.8,4970,4676.38,5.16,0.55,0.08,0.59,0.08,1761853,60933,8490,293.62
n't,right,18933.61,3808,3653.63,4.97,0.27,0.18,0.28,0.19,1761853,20187,13473,154.37


In [None]:
# %%

sig_adx_abbr = ex_adx_abbr.loc[ex_adx_abbr.LRC.abs() > 1, :]
sig_adx_abbr

Unnamed: 0_level_0,f,unexp_f,LRC,dP1,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f,l1,l2
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
early~next,144,143.98,14.32,0.82,0.83,0.82,0.83,2675.23,1761853,174,175,0.02,early,next
non~existent,108,107.95,11.78,0.67,0.22,0.68,0.22,1589.53,1761853,496,160,0.05,non,existent
lightly~golden,206,205.90,11.47,0.48,0.52,0.48,0.52,2999.46,1761853,399,426,0.10,lightly,golden
long~overdue,208,207.83,11.25,0.72,0.20,0.72,0.20,2793.45,1761853,1043,289,0.17,long,overdue
minimally~invasive,122,121.94,11.03,0.31,0.48,0.31,0.48,1743.16,1761853,254,392,0.06,minimally,invasive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
more~available,136,-1912.34,-3.59,-0.15,-0.01,0.01,0.00,-3428.64,1761853,285602,12636,2048.34,more,available
too~important,252,-3675.80,-3.68,-0.09,-0.03,0.01,0.00,-6392.11,1761853,158082,43776,3927.80,too,important
more~better,125,-2156.76,-3.84,-0.15,-0.01,0.01,0.00,-3979.33,1761853,285602,14076,2281.76,more,better
more~right,113,-2071.02,-3.90,-0.15,-0.01,0.01,0.00,-3849.38,1761853,285602,13473,2184.02,more,right


# %% [markdown]

 Positive Prone Adverbs with significant LRC

In [None]:
sig_adx_abbr.loc[sig_adx_abbr.l1.isin(pos_prone['Adv'])]

Unnamed: 0_level_0,f,unexp_f,LRC,dP1,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f,l1,l2
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
otherwise~objectionable,193,191.56,7.41,0.5,0.03,0.5,0.03,1631.17,1761853,6603,384,1.44,otherwise,objectionable
otherwise~unable,348,343.32,6.28,0.28,0.05,0.28,0.05,2436.0,1761853,6603,1248,4.68,otherwise,unable
otherwise~unavailable,165,162.77,6.08,0.27,0.02,0.28,0.02,1148.3,1761853,6603,595,2.23,otherwise,unavailable
relatively~inexpensive,137,135.25,6.04,0.24,0.03,0.24,0.03,962.26,1761853,5366,575,1.75,relatively,inexpensive
otherwise~inappropriate,199,195.84,5.84,0.23,0.03,0.24,0.03,1312.8,1761853,6603,843,3.16,otherwise,inappropriate
definitely~worth,267,262.69,5.66,0.05,0.18,0.05,0.18,1742.33,1761853,1468,5169,4.31,definitely,worth
relatively~minor,160,157.13,5.5,0.17,0.03,0.17,0.03,1005.08,1761853,5366,942,2.87,relatively,minor
plain~weird,237,231.14,5.07,0.11,0.05,0.12,0.05,1329.78,1761853,5069,2036,5.86,plain,weird
maybe~more,404,391.85,4.82,0.05,0.15,0.05,0.16,2129.39,1761853,2581,8296,12.15,maybe,more
slightly~higher,377,364.46,4.75,0.13,0.05,0.13,0.05,1902.93,1761853,7631,2896,12.54,slightly,higher


# %% [markdown]

 Negative Prone Adverbs with significant LRC

In [None]:
sig_adx_abbr.loc[sig_adx_abbr.l1.isin(neg_prone['Adv'])]

Unnamed: 0_level_0,f,unexp_f,LRC,dP1,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f,l1,l2
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
too~late,12778,11597.85,8.09,0.89,0.08,0.97,0.08,59242.42,1761853,158082,13153,1180.15,too,late
too~early,5158,4676.18,7.49,0.87,0.03,0.96,0.03,23279.73,1761853,158082,5370,481.82,too,early
before~available,175,172.93,6.11,0.01,0.60,0.01,0.61,1346.37,1761853,288,12636,2.07,before,available
any~better,416,406.59,5.24,0.03,0.35,0.03,0.35,2512.54,1761853,1178,14076,9.41,any,better
particularly~noteworthy,100,96.54,4.45,0.21,0.01,0.21,0.01,501.81,1761853,13041,468,3.46,particularly,noteworthy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
too~interested,114,-716.67,-2.37,-0.08,-0.00,0.01,0.00,-1043.40,1761853,158082,9258,830.67,too,interested
too~simple,327,-1950.40,-2.55,-0.08,-0.01,0.01,0.00,-2818.41,1761853,158082,25382,2277.40,too,simple
too~interesting,100,-1026.50,-2.96,-0.08,-0.01,0.01,0.00,-1665.57,1761853,158082,12555,1126.50,too,interesting
too~important,252,-3675.80,-3.68,-0.09,-0.03,0.01,0.00,-6392.11,1761853,158082,43776,3927.80,too,important


# %% [markdown]

 Positive Prone Adjectives with significant LRC

In [None]:
sig_adx_abbr.loc[sig_adx_abbr.l2.isin(pos_prone['Adj'])]

Unnamed: 0_level_0,f,unexp_f,LRC,dP1,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f,l1,l2
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
otherwise~unable,348,343.32,6.28,0.28,0.05,0.28,0.05,2436.0,1761853,6603,1248,4.68,otherwise,unable
much~larger,1364,1330.39,6.22,0.54,0.06,0.55,0.06,8429.52,1761853,23915,2476,33.61,much,larger
intimately~familiar,131,129.0,5.63,0.01,0.47,0.01,0.48,913.7,1761853,273,12895,2.0,intimately,familiar
eerily~similar,106,104.4,5.52,0.01,0.26,0.02,0.26,712.38,1761853,401,7011,1.6,eerily,similar
physically~unable,106,104.39,5.52,0.08,0.05,0.08,0.05,693.15,1761853,2269,1248,1.61,physically,unable
completely~unrelated,424,410.12,5.38,0.45,0.02,0.46,0.02,2310.46,1761853,26819,912,13.88,completely,unrelated
strikingly~similar,101,99.3,5.34,0.01,0.23,0.01,0.24,652.69,1761853,428,7011,1.7,strikingly,similar
totally~unrelated,200,194.42,4.99,0.21,0.02,0.22,0.02,1091.44,1761853,10779,912,5.58,totally,unrelated
substantially~similar,115,112.31,4.88,0.02,0.17,0.02,0.17,661.17,1761853,675,7011,2.69,substantially,similar
radically~different,539,520.74,4.72,0.01,0.58,0.01,0.6,3015.43,1761853,894,35994,18.26,radically,different


# %% [markdown]

 Negative Prone Adjectives with significant LRC

In [None]:
sig_adx_abbr.loc[sig_adx_abbr.l2.isin(neg_prone['Adj'])]

Unnamed: 0_level_0,f,unexp_f,LRC,dP1,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f,l1,l2
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
too~late,12778,11597.85,8.09,0.89,0.08,0.97,0.08,59242.42,1761853,158082,13153,1180.15,too,late
too~early,5158,4676.18,7.49,0.87,0.03,0.96,0.03,23279.73,1761853,158082,5370,481.82,too,early
vitally~important,270,262.22,4.98,0.01,0.84,0.01,0.86,1748.59,1761853,313,43776,7.78,vitally,important
truly~alone,190,184.12,4.86,0.23,0.01,0.24,0.01,1001.94,1761853,13083,792,5.88,truly,alone
ever~certain,143,137.59,4.27,0.07,0.03,0.08,0.03,675.45,1761853,5027,1896,5.41,ever,certain
exactly~sure,148,142.28,4.21,0.01,0.14,0.02,0.14,701.24,1761853,1034,9744,5.72,exactly,sure
absolutely~certain,154,147.79,4.21,0.08,0.03,0.08,0.03,708.98,1761853,5772,1896,6.21,absolutely,certain
quite~sure,2984,2759.65,4.09,0.28,0.07,0.31,0.07,11013.33,1761853,40566,9744,224.35,quite,sure
critically~important,209,192.97,3.34,0.0,0.3,0.0,0.32,754.81,1761853,645,43776,16.03,critically,important
more~frustrating,2269,1731.14,3.24,0.52,0.01,0.68,0.01,4500.17,1761853,285602,3318,537.86,more,frustrating
