In [None]:
# %%

import pandas as pd

from source.utils import FREQ_DIR, RESULT_DIR, UCS_DIR, confirm_dir
from source.utils.associate import (BINARY_ASSOC_ARGS, add_extra_am,
                                    associate_ucs, confirm_basic_ucs)
from source.utils.associate import convert_ucs_to_csv as ucs2csv
from source.utils.associate import get_associations_csv as init_am, AM_DF_DIR
from source.utils.associate import manipulate_ucs, seek_readable_ucs, adjust_assoc_columns
pd.set_option('display.float_format', '{:,.2f}'.format)

# %% [markdown]

 Set parameters

In [None]:
# Unit label: selects the `Adv{UNIT}` frequency table and the per-unit
# entries of the vocabulary / lexical-item dictionaries further down.
UNIT = 'Adj'
# Pattern directory to analyse; exactly one of the alternatives is active.
# PAT_DIR = 'POSmirror'
PAT_DIR = 'NEGmirror'
# PAT_DIR = 'RBdirect'
# PAT_DIR = 'ANYmirror'
# Minimum pair frequency; it is also embedded in output file names (`_min{FRQ_FLOOR}x`).
# FRQ_FLOOR = 3
# FRQ_FLOOR = 10
# FRQ_FLOOR = 20
# FRQ_FLOOR = 50
FRQ_FLOOR = 100  # BUG: a floor of 100 is reportedly applied regardless of this value, so keep it at 100 so the file naming stays accurate
# Tab-separated (count, adverb, adjective) frequency table used as input.
ADVADJ_TSV = FREQ_DIR.joinpath(
    f'{PAT_DIR}/ucs_format/Adv{UNIT}_frq-thrMIN-7.35f.tsv')
# Columns kept (when present) for the abbreviated association table.
FOCUS = ['f', 'unexpected_f',
         'conservative_log_ratio',
         'am_p1_given2', 'am_p2_given1',
         'am_p1_given2_simple', 'am_p2_given1_simple',
         'am_log_likelihood',
         #  'mutual_information', 'am_odds_ratio_disc', 't_score',
         'N', 'f1', 'f2', 'E11',
         'l1', 'l2']

In [None]:
# %%

def invert_set_dict(d: dict):
    """Flip a ``key -> iterable of values`` mapping into ``value -> key``.

    When the same value is listed under several keys, the key that comes
    last in the dictionary's iteration order is the one kept.
    """
    inverted = {}
    for group, members in d.items():
        for member in members:
            inverted[member] = group
    return inverted

# %% [markdown]

 1. Run `seek_readable_ucs()` to generate consistent output path

In [None]:
# Ask `seek_readable_ucs` for the path of the readable association table that
# corresponds to this frequency floor and input counts file, then print that
# path relative to RESULT_DIR.
readable = seek_readable_ucs(min_freq=FRQ_FLOOR,
                             ucs_subdir='adv_adj',
                             contained_counts_path=ADVADJ_TSV)
print(readable.relative_to(RESULT_DIR))

    > seeking `adv_adj/NEGmirror/readable/AdvAdj_frq-thrMIN-7.35f_min100x*` frequency data and initial associations...
ucs/adv_adj/NEGmirror/readable/AdvAdj_frq-thrMIN-7.35f_min100x.rsort-view_am-only.csv


# %% [markdown]

 Snippet of starting frequency data (`ADVADJ_TSV`)

In [None]:
! head -5 {ADVADJ_TSV} | column -t

12089  more   important
10856  too    late
4923   too    early
3437   as     good
2942   quite  sure


# %% [markdown]

 2. Run `confirm_basic_ucs()` (if needed)

In [None]:
# Only build the initial UCS table when the readable AM output is missing.
if not readable.is_file():
    # The basic table lives one directory above `readable/` and is named
    # `<stem>.ds.gz`. The readable path may end in `.txt` or `.csv` (the
    # recorded run printed a `.csv` path), so strip either view suffix;
    # matching only `.txt` would leave a `.csv` name unchanged.
    basic_ucs_name = readable.name
    for view_suffix in ('.rsort-view_am-only.txt', '.rsort-view_am-only.csv'):
        basic_ucs_name = basic_ucs_name.replace(view_suffix, '.ds.gz')
    basic_ucs_path = readable.parent.parent.joinpath(basic_ucs_name)
    print(
        f'Creating initial UCS table: `{basic_ucs_path.relative_to(RESULT_DIR)}`')

    # `confirm_basic_ucs` returns the path that later steps should use.
    basic_ucs_path = confirm_basic_ucs(
        basic_ucs_path,
        freq_floor=FRQ_FLOOR,
        contained_counts_path=ADVADJ_TSV)

# %% [markdown]

 Excerpt of initial UCS table

In [None]:
# Path of the plain-text view of the initial UCS table:
# `<UCS_DIR>/adv_adj/<PAT_DIR>/readable/<tsv name without ".tsv">_min<FRQ_FLOOR>x.init.txt`
init_readable = UCS_DIR.joinpath(
    f'adv_adj/{PAT_DIR}/readable'
).joinpath(f'{ADVADJ_TSV.name.replace(".tsv","")}_min{FRQ_FLOOR}x.init.txt')
# Show the header, separator line and first five rows.
! head -7 {init_readable}

           l1  l2                   f     f2     f1       N  
-------------  ---------------  -----  -----  -----  ------  
         more  important        12089  14658  77678  289776  
           as  important         1037  14658  31129  289776  
           so  important          529  14658  24985  289776  
       really  important          115  14658  11266  289776  
         very  important          104  14658   8904  289776  


# %% [markdown]

 3. Run `associate_ucs()` (if needed)

In [None]:
# Compute association measures only when the readable output is still missing.
# NOTE(review): `basic_ucs_path` is assigned only inside the matching
# `if not readable.is_file()` branch of step 2, so that cell must run first.
if not readable.is_file():
    associate_ucs(basic_ucs_path)

# Glob for the log files written for this pattern / unit / frequency floor.
# NOTE(review): absolute, machine-specific log directory (with a doubled `/`)
# and a hand-written `7-35f` name; confirm it matches the logging location.
transform_ucs_log = f'/share/compling/projects/sanpi/logs/associate/ucs//ucs-{PAT_DIR}_Adv{UNIT}_frq-thrMIN-7-35f_min{FRQ_FLOOR}x*.log'
# `ls -t1 ... | head -1` picks the most recently modified matching log;
# print its first 15 and last 2 lines.
! head -15 `ls -t1 {transform_ucs_log} | head -1`
! echo '...'
! tail -2 `ls -t1 {transform_ucs_log} | head -1`

# Manipulating AdvAdj_frq-thrMIN-7-35f_min100x ucs table
path to this script: /share/compling/projects/sanpi/script/transform_ucs.sh
Sun Jun  2 19:01:21 EDT 2024
(TMP: /share/compling/projects/sanpi/results/ucs/adv_adj/NEGmirror/tmp/tmp_NEGmirror-20240602-190121.AdvAdj_frq-thrMIN-7-35f_min100x)
Caught a SIGPIPE at /usr/share/perl/5.30/Pod/Perldoc.pm line 1910.
Can't close STDOUT: Broken pipe at (eval 44) line 1.
END failed--call queue aborted.
## Initial Contingency Info

DATA SET FILE:  /share/compling/projects/sanpi/results/ucs/adv_adj/NEGmirror/AdvAdj_frq-thrMIN-7.35f_min100x.ds.gz

# Frequency signatures computed by the ucs-make-tables tool for relational cooccurrences.
# Sample size:  N = 289776 tokens,  V = 40004 pair types.
# A frequency threshold of f >= 100 was applied, leaving V = 362 pair types.

...
Loading data set /share/compling/projects/sanpi/results/ucs/adv_adj/NEGmirror/AdvAdj_frq-thrMIN-7.35f_min100x.rsort.gz ... 362 rows
Script finished at Sun Jun  2 19:01:33 EDT 20

# %% [markdown]

 4. Run `ucs2csv()` (the imported alias of `convert_ucs_to_csv()`) to convert `ucs/[PAT_DIR]/readable/*.txt` to a format that `pandas` can parse as a dataframe

In [None]:
# Preview the readable table, convert it with `ucs2csv` (alias of
# `convert_ucs_to_csv`), and report the CSV path relative to RESULT_DIR.
! head -5 {readable}
csv_path = ucs2csv(readable)
print(f'CSV: `{csv_path.relative_to(RESULT_DIR)}`')

l1,l2,f,E11,am_log_likelihood,am_odds_ratio_disc,am_p1_given2,am_p2_given1,am_p1_given2_simple,am_p2_given1_simple,f1,f2,N
too,late,10856,1734.319902,41654.72274,3.15554110,0.868947302,0.235412697,0.995232857,0.235626072,46073,10908,289776
too,early,4923,785.595401,18334.89269,3.19755679,0.851887408,0.106778309,0.996357013,0.106852169,46073,4941,289776
mutually,exclusive,221,0.226589,3208.87569,4.74113935,0.824543966,0.901878485,0.824626866,0.902040816,245,268,289776
too,old,2318,395.103131,7455.93081,1.88672370,0.780494759,0.049626202,0.932796781,0.050311462,46073,2485,289776
UCS table text converted & saved as /share/compling/projects/sanpi/results/ucs/adv_adj/NEGmirror/readable/AdvAdj_frq-thrMIN-7.35f_min100x.rsort-view_am-only.csv
CSV: `ucs/adv_adj/NEGmirror/readable/AdvAdj_frq-thrMIN-7.35f_min100x.rsort-view_am-only.csv`


# %% [markdown]

## 5. Load the converted CSV into a `pandas` dataframe

In [None]:
# Load the converted association table; `convert_dtypes()` switches the
# columns to pandas' nullable extension dtypes where possible.
adx_amdf = pd.read_csv(csv_path).convert_dtypes()
adx_amdf

Unnamed: 0,l1,l2,f,E11,am_log_likelihood,am_odds_ratio_disc,am_p1_given2,am_p2_given1,am_p1_given2_simple,am_p2_given1_simple,f1,f2,N
0,too,late,10856,1734.32,41654.72,3.16,0.87,0.24,1.00,0.24,46073,10908,289776
1,too,early,4923,785.60,18334.89,3.20,0.85,0.11,1.00,0.11,46073,4941,289776
2,mutually,exclusive,221,0.23,3208.88,4.74,0.82,0.90,0.82,0.90,245,268,289776
3,too,old,2318,395.10,7455.93,1.89,0.78,0.05,0.93,0.05,46073,2485,289776
4,too,careful,660,113.20,2080.83,1.83,0.77,0.01,0.93,0.01,46073,712,289776
...,...,...,...,...,...,...,...,...,...,...,...,...,...
357,more,special,280,542.29,-199.57,-0.36,-0.13,-0.00,0.14,0.00,77678,2023,289776
358,too,easy,121,803.72,-1023.62,-0.89,-0.14,-0.02,0.02,0.00,46073,5055,289776
359,more,successful,155,349.02,-174.14,-0.43,-0.15,-0.00,0.12,0.00,77678,1302,289776
360,more,simple,176,748.43,-788.32,-0.74,-0.21,-0.01,0.06,0.00,77678,2792,289776


In [None]:
# %%

# Index every row by an `l1~l2` string key (e.g. `too~late`); `l1` and `l2`
# themselves stay available as regular columns.
adx_amdf = (
    adx_amdf
    .assign(key=(adx_amdf.l1 + '~' + adx_amdf.l2).astype('string'))
    .set_index('key')
)
adx_amdf

Unnamed: 0_level_0,l1,l2,f,E11,am_log_likelihood,am_odds_ratio_disc,am_p1_given2,am_p2_given1,am_p1_given2_simple,am_p2_given1_simple,f1,f2,N
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
too~late,too,late,10856,1734.32,41654.72,3.16,0.87,0.24,1.00,0.24,46073,10908,289776
too~early,too,early,4923,785.60,18334.89,3.20,0.85,0.11,1.00,0.11,46073,4941,289776
mutually~exclusive,mutually,exclusive,221,0.23,3208.88,4.74,0.82,0.90,0.82,0.90,245,268,289776
too~old,too,old,2318,395.10,7455.93,1.89,0.78,0.05,0.93,0.05,46073,2485,289776
too~careful,too,careful,660,113.20,2080.83,1.83,0.77,0.01,0.93,0.01,46073,712,289776
...,...,...,...,...,...,...,...,...,...,...,...,...,...
more~special,more,special,280,542.29,-199.57,-0.36,-0.13,-0.00,0.14,0.00,77678,2023,289776
too~easy,too,easy,121,803.72,-1023.62,-0.89,-0.14,-0.02,0.02,0.00,46073,5055,289776
more~successful,more,successful,155,349.02,-174.14,-0.43,-0.15,-0.00,0.12,0.00,77678,1302,289776
more~simple,more,simple,176,748.43,-788.32,-0.74,-0.21,-0.01,0.06,0.00,77678,2792,289776


# %% [markdown]

 6. Save to `./results/assoc_df/`

In [None]:
# Mirror the table's UCS-relative location under AM_DF_DIR, dropping the
# `readable/` directory level and the `.rsort-view_am-only` view tag.
relative_name = str(csv_path.relative_to(UCS_DIR))
relative_name = relative_name.replace('/readable', '')
relative_name = relative_name.replace('.rsort-view_am-only', '')
df_csv_path = AM_DF_DIR.joinpath(relative_name)
df_pkl_path = df_csv_path.with_suffix('.pkl.gz')

# Each format is written only if its file does not exist yet.
if not df_csv_path.is_file():
    confirm_dir(df_csv_path.parent)
    adx_amdf.to_csv(df_csv_path)

if not df_pkl_path.is_file():
    adx_amdf.to_pickle(df_pkl_path)

# %% [markdown]

 7. Add additional AM via `add_extra_am()`
 Define dictionary containing relevant vocab sizes
 !!! Warning This is a `#HACK`: \
     Rather than developing a command/code to retrieve the vocab sizes programmatically,
     I simply copied the values given in the log output of `transform_ucs.sh`
     for each `PAT_DIR`+`UNIT` combination

In [None]:
# Hand-copied vocabulary sizes per pattern directory and unit. The NEGmirror
# value matches the "V = 40004 pair types" line in the transform log above.
# A PAT_DIR without an entry (e.g. the commented-out 'ANYmirror') raises
# KeyError at the lookup below.
VOCABS = {
    # // 'ANYmirror': {'Adj': 83422},
    'NEGmirror': {'Adj': 40004},
    'POSmirror': {'Adj': 178159},
    'RBdirect':  {'Adj': 61860}
}  # ! #HACK
VOCAB = VOCABS[PAT_DIR][UNIT]

# Display the lookup table as markdown with thousands separators.
print(pd.DataFrame(VOCABS).convert_dtypes().to_markdown(intfmt=','))
# NOTE(review): this reassignment discards the value looked up above, so
# `add_extra_am` always receives `vocab=None`; confirm whether that is
# intentional or a leftover from debugging.
VOCAB = None
# Extend the table with the additional measures requested in `metrics`.
ex_adx_amdf = add_extra_am(df=adx_amdf,
                           verbose=True,
                           vocab=VOCAB,
                           metrics=['t_score', 'mutual_information']
                           ).convert_dtypes()

|     |   NEGmirror |   POSmirror |   RBdirect |
|:----|------------:|------------:|-----------:|
| Adj |      40,004 |     178,159 |     61,860 |

Preview of Extended Measures (rounded)

| key                |   t_score |   mutual_information |   deltaP_min |   deltaP_max |   deltaP_max_abs |   deltaP_product |   unexpected_f |   unexpected_ratio |
|:-------------------|----------:|---------------------:|-------------:|-------------:|-----------------:|-----------------:|---------------:|-------------------:|
| too~late           |     87.55 |                 0.80 |         0.24 |         0.87 |             0.87 |             0.20 |       9,121.68 |               0.84 |
| too~early          |     58.97 |                 0.80 |         0.11 |         0.85 |             0.85 |             0.09 |       4,137.40 |               0.84 |
| mutually~exclusive |     14.85 |                 2.99 |         0.82 |         0.90 |             0.90 |             0.74 |         220.77 |              

# %% [markdown]

 Save extended AM tables to `extra/` subdirectory if not already saved

In [None]:
# Extended tables go into an `extra/` directory next to the basic ones,
# with `_extra` appended to the file stem.
extra_dir = df_csv_path.parent / 'extra'
df_extra_csv = extra_dir / df_csv_path.name.replace('.csv', '_extra.csv')
df_extra_pkl = df_extra_csv.with_suffix('.pkl.gz')
print(df_extra_csv)

# Write each format only when it has not been saved before.
if not df_extra_csv.is_file():
    confirm_dir(df_extra_csv.parent)
    ex_adx_amdf.to_csv(df_extra_csv)

if not df_extra_pkl.is_file():
    ex_adx_amdf.to_pickle(df_extra_pkl)

/share/compling/projects/sanpi/results/assoc_df/adv_adj/NEGmirror/extra/AdvAdj_frq-thrMIN-7.35f_min100x_extra.csv


In [None]:
# %%

# Keep an untouched copy of the full extended table.
ex_adx_full = ex_adx_amdf.copy()

# Abbreviated view: the optional `polarity` / `quant` columns plus the FOCUS
# metrics, skipping any that the extended table does not contain. The result
# of `adjust_assoc_columns` is ordered by descending `LRC`.
wanted_columns = ['polarity', 'quant'] + FOCUS
present_columns = [
    name for name in wanted_columns if name in ex_adx_amdf.columns]
ex_adx_abbr = adjust_assoc_columns(ex_adx_amdf[present_columns])
ex_adx_abbr = ex_adx_abbr.sort_values('LRC', ascending=False)
cols = ex_adx_abbr.columns

# %% [markdown]

 Define lexical items with given lean shown in binary environment evaluation

In [None]:
# Lexical items treated as positive-leaning, keyed by unit ('Adj', 'Adv',
# 'Bigr'); per the heading above, the lean comes from the binary-environment
# evaluation. The inline "... top" comments presumably name the metric whose
# ranking supplied the entries that follow.
# NOTE(review): 'Bigr' entries join the words with `_`, whereas the `key`
# index built in this notebook uses `~`; confirm before matching against it.
pos_prone = {
    'Adj': [
        'unrelated',
        'unable',
        'akin',
        'larger',
        'different',
        'familiar',
        'similar',
        'likely',
        'brief',
        'unaware'
    ],
    'Adv': [
        'slightly',
        'definitely',
        'utterly',
        # LRC top
        'pretty',
        'rather',
        'plain',
        'fairly',
        'somewhat',
        'otherwise',
        'downright',
        'relatively',
        # G2 top
        # 'very',
        # 'even',
        # 'just',
        # dP1 top (and odds ratio disc)
        'plain',  # repeated from the "LRC top" group; harmless for `.isin()` lookups
        'maybe'
    ],
    'Bigr': [
        # G2 top
        'completely_different',
        'too_familiar',
        'even_better',
        # dP1 top
        'quite_different',
        'too_real',
        'well_aware',
        # LRC top
        'too_common',
        'entirely_different'
    ]}
# Lexical items treated as negative-leaning, keyed by unit ('Bigr', 'Adv',
# 'Adj'); per the heading above, the lean comes from the binary-environment
# evaluation. The inline "... top" comments presumably name the metric whose
# ranking supplied the entries that follow.
# NOTE(review): 'Bigr' entries join the words with `_`, whereas the `key`
# index built in this notebook uses `~`; confirm before matching against it.
neg_prone = {
    'Bigr': [
        # LRC top
        'quite_sure',
        'really_sure',
        'too_early',
        'too_pleased',
        'too_fancy',
        # dP1 top
        'entirely_sure',
        'ever_easy',
        'ever_perfect',
        'particularly_surprising',
        'particularly_new',
        # G2 top
        'too_late',
        'more_important',
        'so_easy',
        'as_good',
        'too_old'
    ],
    'Adv': [
        'yet',
        # LRC top
        'ever',
        'any',
        'longer',
        'necessarily',
        'that',
        # dP1 top
        'before',
        'wise',  # ? How is this used as an adverb?
        'earthly',
        'remotely',
        'exactly',
        # G2 top
        'particularly',
        'too',
        # 'inherently'
    ],
    'Adj': [
        # LRC top
        'early',
        'late',
        'fancy',
        'alone',
        'sure',
        # dP1 top
        'shabby',
        'demoralizing',
        'alone',  # repeated from the "LRC top" group; harmless for `.isin()` lookups
        'aggravating',
        'groundbreaking',
        'eventful',
        # G2 top
        'important',
        'frustrating',
        'evident',
        'certain'
    ]
}


def sort_prone_by_f2(prone_list, amdf):
    """List the members of ``prone_list`` found in ``amdf.l2``, by descending ``f2``.

    Items that never occur in the ``l2`` column are dropped from the result.
    ``f2`` is rounded to one decimal place before sorting.
    """
    is_listed = amdf.l2.isin(prone_list)
    unique_pairs = (
        amdf.copy()
        .loc[is_listed, ['f2', 'l2']]
        .drop_duplicates()
        .reset_index(drop=True)
    )
    by_frequency = (
        unique_pairs
        .set_index('l2')
        .round(1)
        .sort_values(['f2'], ascending=False)
    )
    return by_frequency.index.to_list()


# Replace the lists for the current UNIT with only those items that occur in
# `ex_adx_abbr.l2`, ordered by descending `f2`; other units stay as defined.
# NOTE(review): `sort_prone_by_f2` always matches against `l2`, so this is
# only meaningful when UNIT names the `l2` category ('Adj' here).
pos_prone[UNIT] = sort_prone_by_f2(pos_prone[UNIT], ex_adx_abbr)
neg_prone[UNIT] = sort_prone_by_f2(neg_prone[UNIT], ex_adx_abbr)
pos_prone[UNIT]

['different', 'likely']

# %% [markdown]

 Strongest associations for each polarity by metric

In [None]:
def show_metric_top(amdf: pd.DataFrame,
                    metric: str,
                    k=5,
                    cols=None):
    """Return the ``k`` rows with the largest ``metric``, indexed by ``(l1, l2)``.

    Args:
        amdf: association table containing ``metric``, ``l1`` and ``l2`` columns.
        metric: column to rank by; it is placed first in the result.
        k: number of rows to keep.
        cols: optional iterable of column names to show. ``None``, an empty
            iterable, or the legacy ``[None]`` placeholder selects every
            column of ``amdf``. A plain list or a pandas ``Index`` both work.

    Returns:
        A dataframe with ``metric`` as its first column, the remaining
        selected columns in their given order, and a ``(l1, l2)`` MultiIndex.

    Raises:
        KeyError: if a requested column, ``l1`` or ``l2`` is missing from ``amdf``.
    """
    requested = [] if cols is None else [c for c in cols if c is not None]
    if not requested:
        requested = amdf.columns.to_list()
    ordered = [metric] + [c for c in requested if c != metric]
    # The result is re-indexed by (l1, l2), so both must be part of the
    # selection even when the caller asked for a narrower set of columns.
    ordered += [c for c in ('l1', 'l2') if c not in ordered]
    return (amdf.nlargest(k, metric)
            .loc[:, ordered]
            .reset_index(drop=True)
            .set_index(['l1', 'l2']))

# %% [markdown]

 Top conservative log ratio $LRC$ values

In [None]:
# Eight pairs with the largest `LRC`, shown with an (l1, l2) index.
exdf = show_metric_top(ex_adx_abbr, "LRC", k=8)
# Disabled: `update_prone` is not defined in the visible part of this notebook.
# pos_prone, neg_prone = update_prone(exdf, pos_prone, neg_prone)
exdf

Unnamed: 0_level_0,Unnamed: 1_level_0,LRC,f,unexp_f,dP1,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f
l1,l2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
mutually,exclusive,11.54,221,220.77,0.82,0.9,0.82,0.9,3208.88,289776,245,268,0.23
too,late,9.26,10856,9121.68,0.87,0.24,1.0,0.24,41654.72,289776,46073,10908,1734.32
too,early,9.05,4923,4137.4,0.85,0.11,1.0,0.11,18334.89,289776,46073,4941,785.6
statistically,significant,7.77,108,107.56,0.11,0.81,0.11,0.81,1117.06,289776,133,962,0.44
politically,correct,7.77,120,119.44,0.38,0.23,0.38,0.23,1130.87,289776,521,313,0.56
long,enough,7.6,165,164.19,0.17,0.7,0.17,0.7,1616.25,289776,235,993,0.81
much,more,7.3,449,443.52,0.62,0.2,0.62,0.2,3527.05,289776,2203,721,5.48
before,available,7.07,175,173.8,0.14,0.6,0.15,0.61,1560.25,289776,288,1206,1.2


# %% [markdown]

 Top $\Delta P(\texttt{adv}|\texttt{adj})$ values

In [None]:
# Eight pairs with the largest `dP1`, shown with an (l1, l2) index.
exdf = show_metric_top(ex_adx_abbr, 'dP1', k=8)
# Disabled: `update_prone` is not defined in the visible part of this notebook.
# pos_prone, neg_prone = update_prone(exdf, pos_prone, neg_prone)
exdf

Unnamed: 0_level_0,Unnamed: 1_level_0,dP1,f,unexp_f,LRC,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f
l1,l2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
too,late,0.87,10856,9121.68,9.26,0.24,1.0,0.24,41654.72,289776,46073,10908,1734.32
too,early,0.85,4923,4137.4,9.05,0.11,1.0,0.11,18334.89,289776,46073,4941,785.6
mutually,exclusive,0.82,221,220.77,11.54,0.9,0.82,0.9,3208.88,289776,245,268,0.23
too,old,0.78,2318,1922.9,5.71,0.05,0.93,0.05,7455.93,289776,46073,2485,395.1
too,careful,0.77,660,546.8,5.19,0.01,0.93,0.01,2080.83,289776,46073,712,113.2
so,blind,0.73,181,161.77,4.46,0.01,0.81,0.01,680.15,289776,24985,223,19.23
so,glad,0.71,180,160.43,4.34,0.01,0.79,0.01,660.36,289776,24985,227,19.57
too,young,0.7,977,795.75,4.47,0.02,0.86,0.02,2730.53,289776,46073,1140,181.25


# %% [markdown]

 Top conditional probability $P(\texttt{adv}|\texttt{adj})$ values

In [None]:
# Eight pairs with the largest `dP1_simple`, shown with an (l1, l2) index.
exdf = show_metric_top(ex_adx_abbr, 'dP1_simple', k=8)
# Disabled: `update_prone` is not defined in the visible part of this notebook.
# pos_prone, neg_prone = update_prone(exdf, pos_prone, neg_prone)
exdf

Unnamed: 0_level_0,Unnamed: 1_level_0,dP1_simple,f,unexp_f,LRC,dP1,dP2,dP2_simple,G2,N,f1,f2,exp_f
l1,l2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
too,early,1.0,4923,4137.4,9.05,0.85,0.11,0.11,18334.89,289776,46073,4941,785.6
too,late,1.0,10856,9121.68,9.26,0.87,0.24,0.24,41654.72,289776,46073,10908,1734.32
too,old,0.93,2318,1922.9,5.71,0.78,0.05,0.05,7455.93,289776,46073,2485,395.1
too,careful,0.93,660,546.8,5.19,0.77,0.01,0.01,2080.83,289776,46073,712,113.2
more,frustrating,0.92,1853,1311.25,4.42,0.65,0.02,0.02,3857.11,289776,77678,2021,541.75
more,relaxing,0.91,372,262.36,3.72,0.64,0.0,0.0,755.45,289776,77678,409,109.64
more,aggravating,0.9,100,70.25,2.69,0.63,0.0,0.0,198.53,289776,77678,111,29.75
more,discouraging,0.88,107,74.56,2.64,0.62,0.0,0.0,203.87,289776,77678,121,32.44


# %% [markdown]

 Top $\Delta P(\texttt{adj}|\texttt{adv})$ values

In [None]:
# Eight pairs with the largest `dP2`, shown with an (l1, l2) index.
exdf = show_metric_top(ex_adx_abbr, 'dP2', k=8)
# Disabled: `update_prone` is not defined in the visible part of this notebook.
# pos_prone, neg_prone = update_prone(exdf, pos_prone, neg_prone)
exdf

Unnamed: 0_level_0,Unnamed: 1_level_0,dP2,f,unexp_f,LRC,dP1,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f
l1,l2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
mutually,exclusive,0.9,221,220.77,11.54,0.82,0.82,0.9,3208.88,289776,245,268,0.23
statistically,significant,0.81,108,107.56,7.77,0.11,0.11,0.81,1117.06,289776,133,962,0.44
damn,good,0.78,102,98.2,4.49,0.01,0.01,0.81,594.34,289776,126,8732,3.8
long,enough,0.7,165,164.19,7.6,0.17,0.17,0.7,1616.25,289776,235,993,0.81
before,available,0.6,175,173.8,7.07,0.14,0.15,0.61,1560.25,289776,288,1206,1.2
readily,available,0.6,122,121.16,6.95,0.1,0.1,0.61,1081.51,289776,201,1206,0.84
inherently,wrong,0.51,1488,1436.02,5.15,0.27,0.28,0.53,8470.06,289776,2829,5324,51.98
fundamentally,wrong,0.5,166,160.12,4.52,0.03,0.03,0.52,894.47,289776,320,5324,5.88


# %% [markdown]

 Top conditional probability $P(\texttt{adj}|\texttt{adv})$ values

In [None]:
# Eight pairs with the largest `dP2_simple`, shown with an (l1, l2) index.
exdf = show_metric_top(ex_adx_abbr, 'dP2_simple', k=8)
# Disabled: `update_prone` is not defined in the visible part of this notebook.
# pos_prone, neg_prone = update_prone(exdf, pos_prone, neg_prone)
exdf

Unnamed: 0_level_0,Unnamed: 1_level_0,dP2_simple,f,unexp_f,LRC,dP1,dP2,dP1_simple,G2,N,f1,f2,exp_f
l1,l2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
mutually,exclusive,0.9,221,220.77,11.54,0.82,0.9,0.82,3208.88,289776,245,268,0.23
statistically,significant,0.81,108,107.56,7.77,0.11,0.81,0.11,1117.06,289776,133,962,0.44
damn,good,0.81,102,98.2,4.49,0.01,0.78,0.01,594.34,289776,126,8732,3.8
long,enough,0.7,165,164.19,7.6,0.17,0.7,0.17,1616.25,289776,235,993,0.81
before,available,0.61,175,173.8,7.07,0.14,0.6,0.15,1560.25,289776,288,1206,1.2
readily,available,0.61,122,121.16,6.95,0.1,0.6,0.1,1081.51,289776,201,1206,0.84
inherently,wrong,0.53,1488,1436.02,5.15,0.27,0.51,0.28,8470.06,289776,2829,5324,51.98
fundamentally,wrong,0.52,166,160.12,4.52,0.03,0.5,0.03,894.47,289776,320,5324,5.88


# %% [markdown]

 Top log-likelihood $G^2$ values

In [None]:
# Eight pairs with the largest `G2`, shown with an (l1, l2) index.
exdf = show_metric_top(ex_adx_abbr, 'G2', k=8)
# Disabled: `update_prone` is not defined in the visible part of this notebook.
# pos_prone, neg_prone = update_prone(exdf, pos_prone, neg_prone)
exdf

Unnamed: 0_level_0,Unnamed: 1_level_0,G2,f,unexp_f,LRC,dP1,dP2,dP1_simple,dP2_simple,N,f1,f2,exp_f
l1,l2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
too,late,41654.72,10856,9121.68,9.26,0.87,0.24,1.0,0.24,289776,46073,10908,1734.32
more,important,21092.39,12089,8159.74,3.55,0.59,0.14,0.82,0.16,289776,77678,14658,3929.26
too,early,18334.89,4923,4137.4,9.05,0.85,0.11,1.0,0.11,289776,46073,4941,785.6
quite,sure,16330.79,2942,2818.21,5.43,0.5,0.46,0.51,0.47,289776,6235,5753,123.79
inherently,wrong,8470.06,1488,1436.02,5.15,0.27,0.51,0.28,0.53,289776,2829,5324,51.98
too,old,7455.93,2318,1922.9,5.71,0.78,0.05,0.93,0.05,289776,46073,2485,395.1
so,easy,6499.18,2620,2184.15,3.34,0.44,0.1,0.52,0.1,289776,24985,5055,435.85
even,close,5123.03,871,846.25,5.5,0.33,0.3,0.34,0.31,289776,2812,2550,24.75


In [None]:
# %%

# Keep only pairs whose LRC magnitude exceeds 1 (attracted or repelled); the
# section headings that follow refer to these as having "significant LRC".
sig_adx_abbr = ex_adx_abbr.loc[ex_adx_abbr.LRC.abs() > 1, :]
sig_adx_abbr

Unnamed: 0_level_0,f,unexp_f,LRC,dP1,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f,l1,l2
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
mutually~exclusive,221,220.77,11.54,0.82,0.90,0.82,0.90,3208.88,289776,245,268,0.23,mutually,exclusive
too~late,10856,9121.68,9.26,0.87,0.24,1.00,0.24,41654.72,289776,46073,10908,1734.32,too,late
too~early,4923,4137.40,9.05,0.85,0.11,1.00,0.11,18334.89,289776,46073,4941,785.60,too,early
statistically~significant,108,107.56,7.77,0.11,0.81,0.11,0.81,1117.06,289776,133,962,0.44,statistically,significant
politically~correct,120,119.44,7.77,0.38,0.23,0.38,0.23,1130.87,289776,521,313,0.56,politically,correct
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
very~important,104,-346.40,-1.55,-0.02,-0.04,0.01,0.01,-410.76,289776,8904,14658,450.40,very,important
really~important,115,-454.88,-1.78,-0.03,-0.04,0.01,0.01,-576.03,289776,11266,14658,569.88,really,important
more~simple,176,-572.43,-1.97,-0.21,-0.01,0.06,0.00,-788.32,289776,77678,2792,748.43,more,simple
too~easy,121,-682.72,-2.38,-0.14,-0.02,0.02,0.00,-1023.62,289776,46073,5055,803.72,too,easy


# %% [markdown]

 Positive Prone Adverbs with significant LRC

In [None]:
# Rows whose adverb (`l1`) is in the positive-prone adverb list.
sig_adx_abbr.loc[sig_adx_abbr.l1.isin(pos_prone['Adv'])]

Unnamed: 0_level_0,f,unexp_f,LRC,dP1,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f,l1,l2
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1


# %% [markdown]

 Negative Prone Adverbs with significant LRC

In [None]:
# Rows whose adverb (`l1`) is in the negative-prone adverb list.
sig_adx_abbr.loc[sig_adx_abbr.l1.isin(neg_prone['Adv'])]

Unnamed: 0_level_0,f,unexp_f,LRC,dP1,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f,l1,l2
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
too~late,10856,9121.68,9.26,0.87,0.24,1.00,0.24,41654.72,289776,46073,10908,1734.32,too,late
too~early,4923,4137.40,9.05,0.85,0.11,1.00,0.11,18334.89,289776,46073,4941,785.60,too,early
before~available,175,173.80,7.07,0.14,0.60,0.15,0.61,1560.25,289776,288,1206,1.20,before,available
any~better,381,376.94,6.82,0.34,0.35,0.35,0.36,3008.29,289776,1066,1103,4.06,any,better
too~old,2318,1922.90,5.71,0.78,0.05,0.93,0.05,7455.93,289776,46073,2485,395.10,too,old
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
too~happy,1174,629.92,1.25,0.19,0.02,0.34,0.03,705.55,289776,46073,3422,544.08,too,happy
too~rich,129,77.01,1.09,0.24,0.00,0.39,0.00,104.51,289776,46073,327,51.99,too,rich
too~long,120,71.03,1.04,0.23,0.00,0.39,0.00,94.73,289776,46073,308,48.97,too,long
too~good,517,-871.35,-1.31,-0.10,-0.02,0.06,0.01,-840.90,289776,46073,8732,1388.35,too,good


# %% [markdown]

 Positive Prone Adjectives with significant LRC

In [None]:
# Rows whose adjective (`l2`) is in the positive-prone adjective list.
sig_adx_abbr.loc[sig_adx_abbr.l2.isin(pos_prone['Adj'])]

Unnamed: 0_level_0,f,unexp_f,LRC,dP1,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f,l1,l2
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
ever~likely,103,90.27,2.55,0.12,0.02,0.13,0.02,262.87,289776,4688,787,12.73,ever,likely


# %% [markdown]

 Negative Prone Adjectives with significant LRC

In [None]:
# Rows whose adjective (`l2`) is in the negative-prone adjective list.
sig_adx_abbr.loc[sig_adx_abbr.l2.isin(neg_prone['Adj'])]

Unnamed: 0_level_0,f,unexp_f,LRC,dP1,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f,l1,l2
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
too~late,10856,9121.68,9.26,0.87,0.24,1.0,0.24,41654.72,289776,46073,10908,1734.32,too,late
too~early,4923,4137.4,9.05,0.85,0.11,1.0,0.11,18334.89,289776,46073,4941,785.6,too,early
quite~sure,2942,2818.21,5.43,0.5,0.46,0.51,0.47,16330.79,289776,6235,5753,123.79,quite,sure
absolutely~certain,108,105.85,5.23,0.09,0.19,0.09,0.2,666.75,289776,547,1138,2.15,absolutely,certain
truly~alone,187,181.51,5.12,0.32,0.07,0.33,0.07,1034.99,289776,2817,565,5.49,truly,alone
more~frustrating,1853,1311.25,4.42,0.65,0.02,0.92,0.02,3857.11,289776,77678,2021,541.75,more,frustrating
more~evident,1279,888.97,3.82,0.61,0.02,0.88,0.02,2418.23,289776,77678,1455,390.03,more,evident
more~important,12089,8159.74,3.55,0.59,0.14,0.82,0.16,21092.39,289776,77678,14658,3929.26,more,important
really~alone,194,172.03,3.15,0.31,0.02,0.34,0.02,565.27,289776,11266,565,21.97,really,alone
too~fancy,529,404.82,3.0,0.52,0.01,0.68,0.01,1054.78,289776,46073,781,124.18,too,fancy
