In [None]:
import pandas as pd

from source.utils import FREQ_DIR, RESULT_DIR, UCS_DIR, confirm_dir
from source.utils.associate import (BINARY_ASSOC_ARGS, add_extra_am,
                                    associate_ucs, confirm_basic_ucs)
from source.utils.associate import convert_ucs_to_csv as ucs2csv
from source.utils.associate import get_associations_csv as init_am, AM_DF_DIR
from source.utils.associate import manipulate_ucs, seek_readable_ucs, adjust_assoc_columns
# Render floats in displayed frames with thousands separators and two decimals.
pd.set_option('display.float_format', '{:,.2f}'.format)

set parameters

In [None]:
# Unit label: interpolated into the frequency-table filename below and used
# later as a key into the `VOCABS`, `pos_prone` and `neg_prone` dictionaries.
UNIT = 'Adj'
# Pattern directory to process; keep exactly one assignment active.
PAT_DIR = 'POSmirror'
# PAT_DIR = 'NEGmirror'
# PAT_DIR = 'RBdirect'
# PAT_DIR = 'ANYmirror'
# Frequency floor passed to the UCS helpers; alternatives kept for reference.
# FRQ_FLOOR = 3
# FRQ_FLOOR = 10
# FRQ_FLOOR = 20
# FRQ_FLOOR = 50
FRQ_FLOOR = 100  # BUG 100 will be used regardless, so set it to this to at least keep the naming accurate
# Starting frequency table (TSV) under FREQ_DIR/<PAT_DIR>/ucs_format/.
ADVADJ_TSV = FREQ_DIR.joinpath(
    f'{PAT_DIR}/ucs_format/Adv{UNIT}_frq-thrMIN-7.35f.tsv')
# Columns of interest for the abbreviated association table; the list is
# filtered against the columns actually present before it is used.
FOCUS = ['f', 'unexpected_f',
         'conservative_log_ratio',
         'am_p1_given2', 'am_p2_given1',
         'am_p1_given2_simple', 'am_p2_given1_simple',
         'am_log_likelihood',
         #  'mutual_information', 'am_odds_ratio_disc', 't_score',
         'N', 'f1', 'f2', 'E11',
         'l1', 'l2']

In [None]:
def invert_set_dict(d: dict):
    """Flip a mapping of ``key -> collection of values`` into ``value -> key``.

    When the same value is listed under several keys, the key visited last
    (in the dictionary's iteration order) wins.
    """
    inverted = {}
    for key, members in d.items():
        for member in members:
            inverted[member] = key
    return inverted

## 1. Run `seek_readable_ucs()` to generate consistent output path

In [None]:
# Resolve the path of the readable association table for this frequency floor
# and counts file; later cells test whether that file already exists.
readable = seek_readable_ucs(min_freq=FRQ_FLOOR,
                             ucs_subdir='adv_adj',
                             contained_counts_path=ADVADJ_TSV)
# Print the path relative to RESULT_DIR to keep the output short.
print(readable.relative_to(RESULT_DIR))

    > seeking `adv_adj/POSmirror/readable/AdvAdj_frq-thrMIN-7.35f_min100x*` frequency data and initial associations...
ucs/adv_adj/POSmirror/readable/AdvAdj_frq-thrMIN-7.35f_min100x.rsort-view_am-only.csv




 Snippet of starting frequency data (`ADVADJ_TSV`)

In [None]:
# Peek at the first five rows of the starting frequency table, aligned into columns.
! head -5 {ADVADJ_TSV} | column -t

14875  as          simple
10996  more        important
8327   completely  different
7819   more        likely
7221   too         easy


## 2. Run `confirm_basic_ucs()` (if needed)

In [None]:
# Build the initial UCS table only when the readable association table is missing.
if not readable.is_file():
    # Derive the data-set name by cutting the readable-view suffix off the file
    # name regardless of its extension. `readable` can end in `.csv` (not only
    # `.txt`), and replacing the literal `.txt` suffix would then leave the name
    # unchanged and point at the wrong file instead of `<name>.ds.gz`.
    basic_ucs_stem = readable.name.partition('.rsort-view_am-only')[0]
    basic_ucs_path = readable.parent.parent.joinpath(
        f'{basic_ucs_stem}.ds.gz')
    print(
        f'Creating initial UCS table: `{basic_ucs_path.relative_to(RESULT_DIR)}`')

    # Keep whatever path the helper reports; the association step reads it.
    basic_ucs_path = confirm_basic_ucs(
        basic_ucs_path,
        freq_floor=FRQ_FLOOR,
        contained_counts_path=ADVADJ_TSV)

Excerpt of initial UCS table

In [None]:
# Path of the `.init.txt` text view for this pattern directory and frequency
# floor, named after the frequency table without its `.tsv` extension.
init_readable = UCS_DIR.joinpath(
    f'adv_adj/{PAT_DIR}/readable'
).joinpath(f'{ADVADJ_TSV.name.replace(".tsv","")}_min{FRQ_FLOOR}x.init.txt')
# Show the header and the first few rows.
! head -7 {init_readable}

             l1  l2                    f     f2      f1        N  
---------------  ----------------  -----  -----  ------  -------  
           more  different           138  34503  207924  1472077  
           very  different          6219  34503  175105  1472077  
            too  different           103  34503  112009  1472077  
             so  different          1844  34503  104617  1472077  
             as  different           134  34503   76987  1472077  


## 3. Run `associate_ucs()` (if needed)

In [None]:
# Run the association step only when the readable table does not exist yet.
# NOTE(review): `basic_ucs_path` is assigned only inside the matching guard of
# step 2, so that cell must have run earlier in the same session.
if not readable.is_file():
    associate_ucs(basic_ucs_path)

# Glob pattern for the log file(s) of this pattern/unit/frequency floor.
# NOTE(review): absolute, machine-specific path containing a doubled `/`;
# consider building it from a shared log-directory constant.
transform_ucs_log = f'/share/compling/projects/sanpi/logs/associate/ucs//ucs-{PAT_DIR}_Adv{UNIT}_frq-thrMIN-7-35f_min{FRQ_FLOOR}x*.log'
# Show the first 15 and last 2 lines of the most recently modified matching log.
! head -15 `ls -t1 {transform_ucs_log} | head -1`
! echo '...'
! tail -2 `ls -t1 {transform_ucs_log} | head -1`

# Manipulating AdvAdj_frq-thrMIN-7-35f_min100x ucs table
path to this script: /share/compling/projects/sanpi/script/transform_ucs.sh
Sun Jun  2 18:49:09 EDT 2024
(TMP: /share/compling/projects/sanpi/results/ucs/adv_adj/POSmirror/tmp/tmp_POSmirror-20240602-184909.AdvAdj_frq-thrMIN-7-35f_min100x)
## Initial Contingency Info

DATA SET FILE:  /share/compling/projects/sanpi/results/ucs/adv_adj/POSmirror/AdvAdj_frq-thrMIN-7.35f_min100x.ds.gz

# Frequency signatures computed by the ucs-make-tables tool for relational cooccurrences.
# Sample size:  N = 1472077 tokens,  V = 178159 pair types.
# A frequency threshold of f >= 100 was applied, leaving V = 1892 pair types.

##:: size = 1892
##:: threshold = 100

...
Loading data set /share/compling/projects/sanpi/results/ucs/adv_adj/POSmirror/AdvAdj_frq-thrMIN-7.35f_min100x.rsort.gz ... 1892 rows
Script finished at Sun Jun  2 18:49:22 EDT 2024


## 4. Run `convert_ucs_to_csv()` (imported as `ucs2csv`) to convert `ucs/[PAT_DIR]/readable/*.txt` to a format that `pandas` can parse as a dataframe

In [None]:
# Preview the readable table, then convert it and keep the resulting CSV path.
! head -5 {readable}
csv_path = ucs2csv(readable)
# Report the CSV location relative to RESULT_DIR.
print(f'CSV: `{csv_path.relative_to(RESULT_DIR)}`')

l1,l2,f,E11,am_log_likelihood,am_odds_ratio_disc,am_p1_given2,am_p2_given1,am_p1_given2_simple,am_p2_given1_simple,f1,f2,N
even,triple,113,4.8168418,667.103411,2.464213905,0.8868206947,0.0019378547,0.9262295082,0.0019442198,58121,122,1472077
very,least,101,12.3709018,403.690341,2.332278545,0.8522630785,0.0005744837,0.9711538462,0.0005767968,175105,104,1472077
early,next,144,0.0200968,2641.253161,5.418049794,0.8371902794,0.8371902794,0.8372093023,0.8372093023,172,172,1472077
even,third,371,17.5301455,2009.332874,2.092529357,0.7963434648,0.0063316068,0.8355855856,0.0063832350,58121,444,1472077
UCS table text converted & saved as /share/compling/projects/sanpi/results/ucs/adv_adj/POSmirror/readable/AdvAdj_frq-thrMIN-7.35f_min100x.rsort-view_am-only.csv
CSV: `ucs/adv_adj/POSmirror/readable/AdvAdj_frq-thrMIN-7.35f_min100x.rsort-view_am-only.csv`




## 5. Load the converted CSV into a `pandas` dataframe indexed by an `l1~l2` key

In [None]:
# Load the converted table and switch its columns to pandas' nullable dtypes.
adx_amdf = pd.read_csv(csv_path).convert_dtypes()
adx_amdf

Unnamed: 0,l1,l2,f,E11,am_log_likelihood,am_odds_ratio_disc,am_p1_given2,am_p2_given1,am_p1_given2_simple,am_p2_given1_simple,f1,f2,N
0,even,triple,113,4.82,667.10,2.46,0.89,0.00,0.93,0.00,58121,122,1472077
1,very,least,101,12.37,403.69,2.33,0.85,0.00,0.97,0.00,175105,104,1472077
2,early,next,144,0.02,2641.25,5.42,0.84,0.84,0.84,0.84,172,172,1472077
3,even,third,371,17.53,2009.33,2.09,0.80,0.01,0.84,0.01,58121,444,1472077
4,now,live,407,1.86,4083.81,3.05,0.79,0.08,0.79,0.08,5320,514,1472077
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1887,more,simple,373,3190.73,-4470.00,-1.00,-0.13,-0.02,0.02,0.00,207924,22590,1472077
1888,more,easy,204,2117.97,-3165.21,-1.08,-0.13,-0.01,0.01,0.00,207924,14995,1472077
1889,more,available,105,1614.43,-2678.88,-1.25,-0.13,-0.01,0.01,0.00,207924,11430,1472077
1890,more,better,119,1832.38,-3043.50,-1.25,-0.13,-0.01,0.01,0.00,207924,12973,1472077


In [None]:


# Index every row by a string key of the form `l1~l2`.
adx_amdf = (
    adx_amdf
    .assign(key=lambda frame: (frame['l1'] + '~' + frame['l2']).astype('string'))
    .set_index('key')
)
adx_amdf

Unnamed: 0_level_0,l1,l2,f,E11,am_log_likelihood,am_odds_ratio_disc,am_p1_given2,am_p2_given1,am_p1_given2_simple,am_p2_given1_simple,f1,f2,N
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
even~triple,even,triple,113,4.82,667.10,2.46,0.89,0.00,0.93,0.00,58121,122,1472077
very~least,very,least,101,12.37,403.69,2.33,0.85,0.00,0.97,0.00,175105,104,1472077
early~next,early,next,144,0.02,2641.25,5.42,0.84,0.84,0.84,0.84,172,172,1472077
even~third,even,third,371,17.53,2009.33,2.09,0.80,0.01,0.84,0.01,58121,444,1472077
now~live,now,live,407,1.86,4083.81,3.05,0.79,0.08,0.79,0.08,5320,514,1472077
...,...,...,...,...,...,...,...,...,...,...,...,...,...
more~simple,more,simple,373,3190.73,-4470.00,-1.00,-0.13,-0.02,0.02,0.00,207924,22590,1472077
more~easy,more,easy,204,2117.97,-3165.21,-1.08,-0.13,-0.01,0.01,0.00,207924,14995,1472077
more~available,more,available,105,1614.43,-2678.88,-1.25,-0.13,-0.01,0.01,0.00,207924,11430,1472077
more~better,more,better,119,1832.38,-3043.50,-1.25,-0.13,-0.01,0.01,0.00,207924,12973,1472077


## 6. Save to `./results/assoc_df/`

In [None]:
# Mirror the CSV's location under AM_DF_DIR, removing the `/readable`
# directory level and the `.rsort-view_am-only` infix from the relative path.
relative_csv = str(csv_path.relative_to(UCS_DIR))
for fragment in ('/readable', '.rsort-view_am-only'):
    relative_csv = relative_csv.replace(fragment, '')
df_csv_path = AM_DF_DIR.joinpath(relative_csv)

# Write each format only when it is not on disk yet.
if not df_csv_path.is_file():
    confirm_dir(df_csv_path.parent)
    adx_amdf.to_csv(df_csv_path)

df_pkl_path = df_csv_path.with_suffix('.pkl.gz')
if not df_pkl_path.is_file():
    adx_amdf.to_pickle(df_pkl_path)

## 7. Add additional AM via `add_extra_am()`
Define dictionary containing relevant vocab sizes
!!! Warning This is a `#HACK`: \
    Rather than developing a command/code to retrieve the vocab sizes programmatically,
    I simply copied the values given in the log output of `transform_ucs.sh`
    for each `PAT_DIR`+`UNIT` combination

In [None]:
# Hand-copied vocabulary sizes keyed by pattern directory, then by unit.
VOCABS = {
    # // 'ANYmirror': {'Adj': 83422},
    # //   'NEGmirror': {'Adj': 21562},
    'POSmirror': {'Adj': 178159},
    'RBdirect':  {'Adj': 61860}
}  # ! #HACK
# NOTE(review): raises KeyError when PAT_DIR is not a key of VOCABS (e.g. the
# entries that are commented out above).
VOCAB = VOCABS[PAT_DIR][UNIT]

# Show the vocabulary table as markdown with thousands separators.
print(pd.DataFrame(VOCABS).convert_dtypes().to_markdown(intfmt=','))
# NOTE(review): this overwrites the value looked up above, so `add_extra_am`
# always receives vocab=None — confirm whether that is intended.
VOCAB = None
# Extend the association table with the requested extra metrics.
ex_adx_amdf = add_extra_am(df=adx_amdf,
                           verbose=True,
                           vocab=VOCAB,
                           metrics=['t_score', 'mutual_information']
                           ).convert_dtypes()

|     |   POSmirror |   RBdirect |
|:----|------------:|-----------:|
| Adj |     178,159 |     61,860 |

Preview of Extended Measures (rounded)

| key          |   t_score |   mutual_information |   deltaP_min |   deltaP_max |   deltaP_max_abs |   deltaP_product |   unexpected_f |   unexpected_ratio |
|:-------------|----------:|---------------------:|-------------:|-------------:|-----------------:|-----------------:|---------------:|-------------------:|
| even~triple  |     10.18 |                 1.37 |         0.00 |         0.89 |             0.89 |             0.00 |         108.18 |               0.96 |
| very~least   |      8.82 |                 0.91 |         0.00 |         0.85 |             0.85 |             0.00 |          88.63 |               0.88 |
| early~next   |     12.00 |                 3.86 |         0.84 |         0.84 |             0.84 |             0.70 |         143.98 |               1.00 |
| even~third   |     18.35 |                 1.33 |         0.01



 Save extended AM tables to `extra/` subdirectory if not already saved

In [None]:
# The extended table goes into an `extra/` subdirectory next to the base CSV,
# with `_extra` inserted before the `.csv` extension.
df_extra_csv = df_csv_path.parent / 'extra' / \
    df_csv_path.name.replace('.csv', '_extra.csv')
print(df_extra_csv)
# Write each format only when it is not on disk yet; existing files are left
# untouched even if the table has changed.
if not df_extra_csv.is_file():
    confirm_dir(df_extra_csv.parent)
    ex_adx_amdf.to_csv(df_extra_csv)

df_extra_pkl = df_extra_csv.with_suffix('.pkl.gz')
if not df_extra_pkl.is_file():
    ex_adx_amdf.to_pickle(df_extra_pkl)

/share/compling/projects/sanpi/results/assoc_df/adv_adj/POSmirror/extra/AdvAdj_frq-thrMIN-7.35f_min100x_extra.csv


In [None]:


# Keep an untouched copy of the full extended table.
ex_adx_full = ex_adx_amdf.copy()

# Select the focus columns that are actually present, pass them through
# `adjust_assoc_columns`, and order the rows by LRC, highest first.
focus_present = [
    name for name in ['polarity', 'quant'] + FOCUS
    if name in ex_adx_amdf.columns
]
ex_adx_abbr = adjust_assoc_columns(
    ex_adx_amdf[focus_present]
).sort_values('LRC', ascending=False)
cols = ex_adx_abbr.columns



 Define lexical items with given lean shown in binary environment evaluation

In [None]:
# Hand-picked "positive-prone" lexical items, keyed by unit: 'Adj' and 'Adv'
# hold single words, 'Bigr' holds `adverb_adjective` strings. The inline
# comments record which metric's top list each group of items came from.
# NOTE(review): 'plain' is listed twice under 'Adv'.
pos_prone = {
    'Adj': [
        'unrelated',
        'unable',
        'akin',
        'larger',
        'different',
        'familiar',
        'similar',
        'likely',
        'brief',
        'unaware'
    ],
    'Adv': [
        'slightly',
        'definitely',
        'utterly',
        # LRC top
        'pretty',
        'rather',
        'plain',
        'fairly',
        'somewhat',
        'otherwise',
        'downright',
        'relatively',
        # G2 top
        # 'very',
        # 'even',
        # 'just',
        # dP1 top (and odds ratio disc)
        'plain',
        'maybe'
    ],
    'Bigr': [
        # G2 top
        'completely_different',
        'too_familiar',
        'even_better',
        # dP1 top
        'quite_different',
        'too_real',
        'well_aware',
        # LRC top
        'too_common',
        'entirely_different'
    ]}
# Hand-picked "negative-prone" lexical items, keyed by unit: 'Adj' and 'Adv'
# hold single words, 'Bigr' holds `adverb_adjective` strings. The inline
# comments record which metric's top list each group of items came from.
# NOTE(review): 'alone' is listed twice under 'Adj'.
neg_prone = {
    'Bigr': [
        # LRC top
        'quite_sure',
        'really_sure',
        'too_early',
        'too_pleased',
        'too_fancy',
        # dP1 top
        'entirely_sure',
        'ever_easy',
        'ever_perfect',
        'particularly_surprising',
        'particularly_new',
        # G2 top
        'too_late',
        'more_important',
        'so_easy',
        'as_good',
        'too_old'
    ],
    'Adv': [
        'yet',
        # LRC top
        'ever',
        'any',
        'longer',
        'necessarily',
        'that',
        # dP1 top
        'before',
        'wise',  # ? How is this used as an adverb?
        'earthly',
        'remotely',
        'exactly',
        # G2 top
        'particularly',
        'too',
        # 'inherently'
    ],
    'Adj': [
        # LRC top
        'early',
        'late',
        'fancy',
        'alone',
        'sure',
        # dP1 top
        'shabby',
        'demoralizing',
        'alone',
        'aggravating',
        'groundbreaking',
        'eventful',
        # G2 top
        'important',
        'frustrating',
        'evident',
        'certain'
    ]
}


def sort_prone_by_f2(prone_list, amdf):
    """Return the `l2` items of `prone_list` found in `amdf`, ordered by `f2`.

    Rows whose `l2` value is in `prone_list` are reduced to their distinct
    (`f2`, `l2`) pairs, `f2` is rounded to one decimal, and the `l2` values
    are returned as a list sorted by descending `f2`. Items that never occur
    in `amdf.l2` are therefore absent from the result.
    """
    wanted = amdf['l2'].isin(prone_list)
    pairs = amdf.loc[wanted, ['f2', 'l2']].drop_duplicates()
    ranked = (
        pairs.set_index('l2')
        .round(1)
        .sort_values(['f2'], ascending=False)
    )
    return ranked.index.to_list()


# Replace the hand-written lists for the active unit with the items that occur
# in the `l2` column of the abbreviated table, ordered by descending `f2`.
pos_prone[UNIT] = sort_prone_by_f2(pos_prone[UNIT], ex_adx_abbr)
neg_prone[UNIT] = sort_prone_by_f2(neg_prone[UNIT], ex_adx_abbr)
pos_prone[UNIT]

['different',
 'familiar',
 'likely',
 'similar',
 'larger',
 'unable',
 'akin',
 'unaware',
 'unrelated',
 'brief']



 Strongest associations for each polarity by metric

In [None]:
def show_metric_top(amdf: pd.DataFrame,
                    metric: str,
                    k=5,
                    cols=None):
    """Show the `k` rows of `amdf` with the largest `metric` values.

    Parameters
    ----------
    amdf : pd.DataFrame
        Association table; must contain `metric` and the `l1` and `l2` columns.
    metric : str
        Column to rank by; it is placed first in the result.
    k : int, default 5
        Number of rows to return.
    cols : iterable of str, optional
        Columns to show; any list-like or `pd.Index` is accepted. When omitted,
        or when every entry is falsy (such as the former default `[None]`),
        all columns of `amdf` are used. The selection must include `l1` and
        `l2`, because they become the index of the result.

    Returns
    -------
    pd.DataFrame
        The top-`k` rows indexed by (`l1`, `l2`), with `metric` as the first
        column followed by the remaining selected columns in their given order.
    """
    if cols is None or not any(cols):
        cols = amdf.columns
    # Build a plain list so list/tuple arguments work, not only `pd.Index`
    # (boolean-mask indexing on a list raises TypeError).
    others = [col for col in cols if col != metric]
    return (amdf.nlargest(k, metric)
            .loc[:, [metric] + others]
            .reset_index(drop=True).set_index(['l1', 'l2'])
            )



 Top conservative log ratio $LRC$ values

In [None]:
# Eight pairs with the highest conservative log ratio (LRC).
exdf = show_metric_top(ex_adx_abbr, "LRC", k=8)
# Disabled step, kept for reference:
# pos_prone, neg_prone = update_prone(exdf, pos_prone, neg_prone)
exdf

Unnamed: 0_level_0,Unnamed: 1_level_0,LRC,f,unexp_f,dP1,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f
l1,l2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
early,next,14.18,144,143.98,0.84,0.84,0.84,0.84,2641.25,1472077,172,172,0.02
non,existent,11.71,108,107.95,0.71,0.22,0.71,0.22,1571.6,1472077,487,153,0.05
long,overdue,11.35,204,203.84,0.72,0.25,0.72,0.25,2781.93,1472077,808,284,0.16
lightly,golden,11.3,206,205.89,0.5,0.52,0.5,0.52,2947.29,1472077,397,412,0.11
minimally,invasive,10.94,120,119.94,0.33,0.49,0.33,0.49,1700.11,1472077,245,362,0.06
critically,endangered,10.83,204,203.84,0.54,0.33,0.54,0.33,2729.85,1472077,609,381,0.16
mutually,exclusive,10.33,220,219.79,0.42,0.38,0.42,0.38,2832.8,1472077,578,526,0.21
grossly,negligent,10.3,111,110.91,0.43,0.21,0.43,0.21,1438.05,1472077,519,258,0.09




 Top $\Delta P(\texttt{adv}|\texttt{adj})$ values

In [None]:
# Eight pairs with the highest `dP1` (delta P of adverb given adjective).
exdf = show_metric_top(ex_adx_abbr, 'dP1', k=8)
# Disabled step, kept for reference:
# pos_prone, neg_prone = update_prone(exdf, pos_prone, neg_prone)
exdf

Unnamed: 0_level_0,Unnamed: 1_level_0,dP1,f,unexp_f,LRC,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f
l1,l2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
even,triple,0.89,113,108.18,6.0,0.0,0.93,0.0,667.1,1472077,58121,122,4.82
very,least,0.85,101,88.63,4.15,0.0,0.97,0.0,403.69,1472077,175105,104,12.37
early,next,0.84,144,143.98,14.18,0.84,0.84,0.84,2641.25,1472077,172,172,0.02
even,third,0.8,371,353.47,6.12,0.01,0.84,0.01,2009.33,1472077,58121,444,17.53
now,live,0.79,407,405.14,9.33,0.08,0.79,0.08,4083.81,1472077,5320,514,1.86
too,late,0.78,1922,1751.18,5.78,0.02,0.86,0.02,8132.92,1472077,112009,2245,170.82
even,fourth,0.77,176,167.43,5.58,0.0,0.81,0.0,931.08,1472077,58121,217,8.57
more,akin,0.73,945,791.33,4.74,0.0,0.87,0.0,2899.59,1472077,207924,1088,153.67




 Top conditional probability $P(\texttt{adv}|\texttt{adj})$ values

In [None]:
# Eight pairs with the highest `dP1_simple` (conditional probability of adverb
# given adjective).
exdf = show_metric_top(ex_adx_abbr, 'dP1_simple', k=8)
# Disabled step, kept for reference:
# pos_prone, neg_prone = update_prone(exdf, pos_prone, neg_prone)
exdf

Unnamed: 0_level_0,Unnamed: 1_level_0,dP1_simple,f,unexp_f,LRC,dP1,dP2,dP2_simple,G2,N,f1,f2,exp_f
l1,l2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
very,least,0.97,101,88.63,4.15,0.85,0.0,0.0,403.69,1472077,175105,104,12.37
even,triple,0.93,113,108.18,6.0,0.89,0.0,0.0,667.1,1472077,58121,122,4.82
more,akin,0.87,945,791.33,4.74,0.73,0.0,0.0,2899.59,1472077,207924,1088,153.67
too,late,0.86,1922,1751.18,5.78,0.78,0.02,0.02,8132.92,1472077,112009,2245,170.82
more,galling,0.85,132,110.11,3.65,0.71,0.0,0.0,393.62,1472077,207924,155,21.89
more,substantial,0.84,1529,1272.5,4.6,0.7,0.01,0.01,4496.77,1472077,207924,1816,256.5
early,next,0.84,144,143.98,14.18,0.84,0.84,0.84,2641.25,1472077,172,172,0.02
even,third,0.84,371,353.47,6.12,0.8,0.01,0.01,2009.33,1472077,58121,444,17.53




 Top $\Delta P(\texttt{adj}|\texttt{adv})$ values

In [None]:
# Eight pairs with the highest `dP2` (delta P of adjective given adverb).
exdf = show_metric_top(ex_adx_abbr, 'dP2', k=8)
# Disabled step, kept for reference:
# pos_prone, neg_prone = update_prone(exdf, pos_prone, neg_prone)
exdf

Unnamed: 0_level_0,Unnamed: 1_level_0,dP2,f,unexp_f,LRC,dP1,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f
l1,l2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
crystal,clear,1.0,137,136.5,8.04,0.03,0.03,1.0,1540.35,1472077,137,5395,0.5
brand,new,0.97,511,507.07,7.01,0.05,0.05,0.97,4898.44,1472077,525,11009,3.93
kind,enough,0.95,347,345.01,7.4,0.04,0.04,0.95,3493.58,1472077,364,8041,1.99
at,best,0.94,137,136.84,9.63,0.08,0.08,0.94,1811.78,1472077,145,1647,0.16
humanly,possible,0.9,285,283.92,7.97,0.06,0.06,0.9,3052.48,1472077,315,5056,1.08
closely,related,0.88,681,679.12,8.65,0.19,0.19,0.88,7766.92,1472077,774,3583,1.88
wide,open,0.87,218,217.42,8.45,0.06,0.06,0.87,2467.4,1472077,250,3419,0.58
vitally,important,0.85,260,254.07,5.31,0.01,0.01,0.87,1808.26,1472077,300,29118,5.93




 Top conditional probability $P(\texttt{adj}|\texttt{adv})$ values

In [None]:
# Eight pairs with the highest `dP2_simple` (conditional probability of
# adjective given adverb).
exdf = show_metric_top(ex_adx_abbr, 'dP2_simple', k=8)
# Disabled step, kept for reference:
# pos_prone, neg_prone = update_prone(exdf, pos_prone, neg_prone)
exdf

Unnamed: 0_level_0,Unnamed: 1_level_0,dP2_simple,f,unexp_f,LRC,dP1,dP2,dP1_simple,G2,N,f1,f2,exp_f
l1,l2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
crystal,clear,1.0,137,136.5,8.04,0.03,1.0,0.03,1540.35,1472077,137,5395,0.5
brand,new,0.97,511,507.07,7.01,0.05,0.97,0.05,4898.44,1472077,525,11009,3.93
kind,enough,0.95,347,345.01,7.4,0.04,0.95,0.04,3493.58,1472077,364,8041,1.99
at,best,0.94,137,136.84,9.63,0.08,0.94,0.08,1811.78,1472077,145,1647,0.16
humanly,possible,0.9,285,283.92,7.97,0.06,0.9,0.06,3052.48,1472077,315,5056,1.08
closely,related,0.88,681,679.12,8.65,0.19,0.88,0.19,7766.92,1472077,774,3583,1.88
wide,open,0.87,218,217.42,8.45,0.06,0.87,0.06,2467.4,1472077,250,3419,0.58
vitally,important,0.87,260,254.07,5.31,0.01,0.85,0.01,1808.26,1472077,300,29118,5.93




 Top log-likelihood $G^2$ values

In [None]:
# Eight pairs with the highest log-likelihood (`G2`).
exdf = show_metric_top(ex_adx_abbr, 'G2', k=8)
# Disabled step, kept for reference:
# pos_prone, neg_prone = update_prone(exdf, pos_prone, neg_prone)
exdf

Unnamed: 0_level_0,Unnamed: 1_level_0,G2,f,unexp_f,LRC,dP1,dP2,dP1_simple,dP2_simple,N,f1,f2,exp_f
l1,l2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
as,simple,62381.27,14875,13693.58,5.04,0.62,0.19,0.66,0.19,1472077,76987,22590,1181.42
completely,different,33964.46,8327,7756.53,4.17,0.23,0.32,0.24,0.34,1472077,24339,34503,570.47
even,better,22570.11,6091,5578.8,4.32,0.43,0.1,0.47,0.1,1472077,58121,12973,512.2
even,worse,21113.74,4867,4545.14,5.03,0.56,0.08,0.6,0.08,1472077,58121,8152,321.86
too,familiar,20013.5,7027,6061.28,3.8,0.48,0.06,0.55,0.06,1472077,112009,12692,965.72
n't,right,18353.73,3800,3631.8,4.89,0.29,0.18,0.31,0.19,1472077,19938,12419,168.2
too,easy,18030.93,7221,6080.04,3.39,0.41,0.06,0.48,0.06,1472077,112009,14995,1140.96
all,right,17420.94,2521,2476.91,6.05,0.2,0.48,0.2,0.48,1472077,5226,12419,44.09


In [None]:
# Keep the pairs whose LRC lies more than 1 away from zero in either
# direction; the following sections call these pairs "significant".
sig_adx_abbr = ex_adx_abbr.loc[ex_adx_abbr.LRC.abs() > 1, :]
sig_adx_abbr

Unnamed: 0_level_0,f,unexp_f,LRC,dP1,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f,l1,l2
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
early~next,144,143.98,14.18,0.84,0.84,0.84,0.84,2641.25,1472077,172,172,0.02,early,next
non~existent,108,107.95,11.71,0.71,0.22,0.71,0.22,1571.60,1472077,487,153,0.05,non,existent
long~overdue,204,203.84,11.35,0.72,0.25,0.72,0.25,2781.93,1472077,808,284,0.16,long,overdue
lightly~golden,206,205.89,11.30,0.50,0.52,0.50,0.52,2947.29,1472077,397,412,0.11,lightly,golden
minimally~invasive,120,119.94,10.94,0.33,0.49,0.33,0.49,1700.11,1472077,245,362,0.06,minimally,invasive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
as~different,134,-1670.45,-3.26,-0.05,-0.02,0.00,0.00,-2766.86,1472077,76987,34503,1804.45,as,different
more~available,105,-1509.43,-3.51,-0.13,-0.01,0.01,0.00,-2678.88,1472077,207924,11430,1614.43,more,available
more~better,119,-1713.38,-3.55,-0.13,-0.01,0.01,0.00,-3043.50,1472077,207924,12973,1832.38,more,better
too~different,103,-2522.30,-4.14,-0.07,-0.02,0.00,0.00,-4634.55,1472077,112009,34503,2625.30,too,different




 Positive Prone Adverbs with significant LRC

In [None]:
# Significant pairs whose `l1` value is in the positive-prone adverb list.
sig_adx_abbr.loc[sig_adx_abbr.l1.isin(pos_prone['Adv'])]

Unnamed: 0_level_0,f,unexp_f,LRC,dP1,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f,l1,l2
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
otherwise~objectionable,193,191.68,7.91,0.65,0.03,0.65,0.03,1711.43,1472077,6562,297,1.32,otherwise,objectionable
otherwise~unable,348,342.48,6.04,0.28,0.05,0.28,0.05,2322.42,1472077,6562,1239,5.52,otherwise,unable
relatively~inexpensive,136,134.02,5.87,0.25,0.03,0.25,0.03,922.45,1472077,5328,546,1.98,relatively,inexpensive
otherwise~unavailable,165,162.38,5.85,0.28,0.02,0.28,0.03,1096.19,1472077,6562,588,2.62,otherwise,unavailable
otherwise~inappropriate,199,195.55,5.75,0.25,0.03,0.26,0.03,1283.69,1472077,6562,773,3.45,otherwise,inappropriate
definitely~worth,260,255.4,5.52,0.05,0.18,0.06,0.18,1650.3,1472077,1442,4691,4.6,definitely,worth
relatively~minor,160,156.64,5.28,0.17,0.03,0.17,0.03,955.54,1472077,5328,929,3.36,relatively,minor
plain~weird,236,229.57,4.94,0.12,0.05,0.13,0.05,1281.84,1472077,5053,1872,6.43,plain,weird
maybe~more,403,389.76,4.7,0.05,0.15,0.05,0.16,2056.68,1472077,2573,7575,13.24,maybe,more
slightly~higher,376,361.5,4.54,0.13,0.05,0.13,0.05,1791.27,1472077,7524,2837,14.5,slightly,higher




 Negative Prone Adverbs with significant LRC

In [None]:
# Significant pairs whose `l1` value is in the negative-prone adverb list.
sig_adx_abbr.loc[sig_adx_abbr.l1.isin(neg_prone['Adv'])]

Unnamed: 0_level_0,f,unexp_f,LRC,dP1,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f,l1,l2
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
too~late,1922,1751.18,5.78,0.78,0.02,0.86,0.02,8132.92,1472077,112009,2245,170.82,too,late
too~much,3854,3445.33,4.75,0.64,0.03,0.72,0.03,13816.86,1472077,112009,5371,408.67,too,much
too~common,4927,4369.34,4.48,0.60,0.04,0.67,0.04,16678.14,1472077,112009,7329,557.66,too,common
too~familiar,7027,6061.28,3.80,0.48,0.06,0.55,0.06,20013.50,1472077,112009,12692,965.72,too,familiar
too~embarrassed,241,212.16,3.71,0.56,0.00,0.64,0.00,766.78,1472077,112009,379,28.84,too,embarrassed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
too~excited,168,-502.95,-1.58,-0.06,-0.00,0.02,0.00,-573.52,1472077,112009,8818,670.95,too,excited
too~simple,299,-1419.85,-2.24,-0.06,-0.01,0.01,0.00,-1908.00,1472077,112009,22590,1718.85,too,simple
too~new,112,-725.66,-2.38,-0.07,-0.01,0.01,0.00,-1056.31,1472077,112009,11009,837.66,too,new
too~important,183,-2032.56,-3.22,-0.07,-0.02,0.01,0.00,-3342.74,1472077,112009,29118,2215.56,too,important




 Positive Prone Adjectives with significant LRC

In [None]:
# Significant pairs whose `l2` value is in the positive-prone adjective list.
sig_adx_abbr.loc[sig_adx_abbr.l2.isin(pos_prone['Adj'])]

Unnamed: 0_level_0,f,unexp_f,LRC,dP1,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f,l1,l2
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
much~larger,1352,1315.89,6.1,0.54,0.06,0.55,0.06,8150.08,1472077,21712,2448,36.11,much,larger
otherwise~unable,348,342.48,6.04,0.28,0.05,0.28,0.05,2322.42,1472077,6562,1239,5.52,otherwise,unable
physically~unable,106,104.51,5.64,0.08,0.06,0.09,0.06,710.27,1472077,1772,1239,1.49,physically,unable
intimately~familiar,129,126.76,5.45,0.01,0.49,0.01,0.5,869.52,1472077,260,12692,2.24,intimately,familiar
eerily~similar,106,104.15,5.31,0.02,0.26,0.02,0.27,681.31,1472077,399,6835,1.85,eerily,similar
completely~unrelated,424,408.99,5.27,0.45,0.02,0.47,0.02,2247.17,1472077,24339,908,15.01,completely,unrelated
strikingly~similar,101,99.14,5.22,0.01,0.25,0.01,0.25,637.4,1472077,400,6835,1.86,strikingly,similar
totally~unrelated,199,192.93,4.87,0.21,0.02,0.22,0.02,1051.86,1472077,9833,908,6.07,totally,unrelated
substantially~similar,113,110.11,4.75,0.02,0.18,0.02,0.18,630.74,1472077,623,6835,2.89,substantially,similar
more~akin,945,791.33,4.74,0.73,0.0,0.87,0.0,2899.59,1472077,207924,1088,153.67,more,akin




 Negative Prone Adjectives with significant LRC

In [None]:
# Significant pairs whose `l2` value is in the negative-prone adjective list.
sig_adx_abbr.loc[sig_adx_abbr.l2.isin(neg_prone['Adj'])]

Unnamed: 0_level_0,f,unexp_f,LRC,dP1,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f,l1,l2
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
too~late,1922,1751.18,5.78,0.78,0.02,0.86,0.02,8132.92,1472077,112009,2245,170.82,too,late
vitally~important,260,254.07,5.31,0.01,0.85,0.01,0.87,1808.26,1472077,300,29118,5.93,vitally,important
not~sure,2127,1973.53,4.63,0.5,0.04,0.53,0.04,8564.19,1472077,56607,3991,153.47,not,sure
n't~sure,882,827.95,4.12,0.21,0.04,0.22,0.04,3492.78,1472077,19938,3991,54.05,n't,sure
critically~important,202,189.95,3.7,0.01,0.31,0.01,0.33,828.57,1472077,609,29118,12.05,critically,important
too~early,235,202.36,3.25,0.47,0.0,0.55,0.0,650.96,1472077,112009,429,32.64,too,early
too~evident,409,331.47,2.61,0.33,0.0,0.4,0.0,831.93,1472077,112009,1019,77.53,too,evident
equally~important,941,797.89,2.56,0.03,0.11,0.03,0.13,2064.61,1472077,7235,29118,143.11,equally,important
pretty~sure,299,232.31,1.86,0.06,0.01,0.07,0.01,448.88,1472077,24599,3991,66.69,pretty,sure
more~important,10996,6883.22,1.81,0.24,0.04,0.38,0.05,10231.89,1472077,207924,29118,4112.78,more,important
