In [2]:
import pandas as pd

from source.utils import FREQ_DIR, RESULT_DIR, UCS_DIR, confirm_dir
from source.utils.associate import (BINARY_ASSOC_ARGS, add_extra_am,
                                    associate_ucs, confirm_basic_ucs)
from source.utils.associate import convert_ucs_to_csv as ucs2csv
from source.utils.associate import get_associations_csv as init_am, AM_DF_DIR
from source.utils.associate import manipulate_ucs, seek_readable_ucs, adjust_assoc_columns
# Display floats in rendered frames with thousands separators and two decimals.
pd.set_option('display.float_format', '{:,.2f}'.format)



Set parameters

In [3]:
# --- Run configuration -------------------------------------------------------
# Unit label: appears in the input file name (`Adv{UNIT}`) and is used as a key
# into the vocab-size and "prone" dictionaries further down.
UNIT = 'Adj'

# Pattern subdirectory to process.
# Alternatives: 'POSmirror', 'NEGmirror', 'ANYmirror', 'RBdirect'
PAT_DIR = 'RBXadj'

# Minimum joint frequency. Alternatives tried: 3, 10, 20, 50.
# BUG 100 will be used regardless, so set it to this to at least keep the naming accurate
FRQ_FLOOR = 100

# Tab-separated (count, adverb, adjective) frequency table for PAT_DIR.
ADVADJ_TSV = FREQ_DIR / f'{PAT_DIR}/ucs_format/Adv{UNIT}_frq-thrMIN-7.35f.tsv'

# Columns of interest; used later to select a subset of the extended table.
# Left out on purpose: 'mutual_information', 'am_odds_ratio_disc', 't_score'
FOCUS = [
    'f',
    'unexpected_f',
    'conservative_log_ratio',
    'am_p1_given2',
    'am_p2_given1',
    'am_p1_given2_simple',
    'am_p2_given1_simple',
    'am_log_likelihood',
    'N',
    'f1',
    'f2',
    'E11',
    'l1',
    'l2',
]

In [4]:
def invert_set_dict(d: dict):
    """Invert a mapping of ``key -> iterable of values`` into ``value -> key``.

    When the same value is listed under several keys, the key that comes
    last in the iteration order of ``d`` is the one kept.
    """
    inverted = {}
    for key, members in d.items():
        for member in members:
            inverted[member] = key
    return inverted

1. Run `seek_readable_ucs()` to generate consistent output path

In [5]:
# Resolve the path of the "readable" association table for the current settings.
# NOTE(review): whether `seek_readable_ucs` only computes the path or also
# creates files is not visible from this notebook — confirm in `source.utils.associate`.
readable = seek_readable_ucs(min_freq=FRQ_FLOOR,
                             ucs_subdir='adv_adj',
                             contained_counts_path=ADVADJ_TSV)
# Print the path relative to the results root to keep the output short.
print(readable.relative_to(RESULT_DIR))

    > seeking `adv_adj/RBXadj/readable/AdvAdj_frq-thrMIN-7.35f_min100x*` frequency data and initial associations...
ucs/adv_adj/RBXadj/readable/AdvAdj_frq-thrMIN-7.35f_min100x.rsort-view_am-only.csv


Snippet of starting frequency data (`ADVADJ_TSV`)

In [6]:
# Preview the first five rows of the raw frequency table, aligned into columns.
! head -5 {ADVADJ_TSV} | column -t

1191864  so    many
748533   most  important
614652   so    much
583184   too   much
507499   very  good




 2. Run `confirm_basic_ucs()` (if needed)

In [7]:
# Build the initial UCS table only when the readable association table is missing.
if not readable.is_file():
    # `readable` may carry either the `.txt` view suffix or the `.csv` suffix
    # (the path printed earlier in this notebook ends in `.csv`); map both
    # onto the `.ds.gz` data-set name. Replacing only the `.txt` variant left
    # a `.csv` name unchanged, so the `.ds.gz` path was never produced.
    ucs_name = readable.name
    for view_ext in ('.txt', '.csv'):
        ucs_name = ucs_name.replace(f'.rsort-view_am-only{view_ext}', '.ds.gz')
    # The data-set file lives one level above the `readable/` directory.
    basic_ucs_path = readable.parent.parent.joinpath(ucs_name)
    print(
        f'Creating initial UCS table: `{basic_ucs_path.relative_to(RESULT_DIR)}`')

    # Rebind to whatever path `confirm_basic_ucs` returns.
    basic_ucs_path = confirm_basic_ucs(
        basic_ucs_path,
        freq_floor=FRQ_FLOOR,
        contained_counts_path=ADVADJ_TSV)

Excerpt of initial UCS table

In [8]:
init_readable = UCS_DIR.joinpath(
    f'adv_adj/{PAT_DIR}/readable'
).joinpath(f'{ADVADJ_TSV.name.replace(".tsv","")}_min{FRQ_FLOOR}x.init.txt')
! head -7 {init_readable}

                  l1  l2                       f       f2        f1         N  
--------------------  -----------------  -------  -------  --------  --------  
                very  many                 21237  2212989  10051689  86330753  
                more  many                   373  2212989   9607398  86330753  
                most  many                   140  2212989   7734049  86330753  
                  so  many               1191874  2212989   5819223  86330753  
                 not  many                 58442  2212989   4511534  86330753  




 3. Run `associate_ucs()` (if needed)

In [9]:
# Run the association step only if the readable table is still missing.
# `basic_ucs_path` is assigned in the previous step under the same condition.
if not readable.is_file():
    associate_ucs(basic_ucs_path)

# Glob pattern for the log file(s) matching the current settings.
# NOTE(review): absolute, machine-specific path (and it contains a `//`);
# consider deriving it from a shared project constant.
transform_ucs_log = f'/share/compling/projects/sanpi/logs/associate/ucs//ucs-{PAT_DIR}_Adv{UNIT}_frq-thrMIN-7-35f_min{FRQ_FLOOR}x*.log'
# Show the beginning and end of the most recently modified matching log.
! head -15 `ls -t1 {transform_ucs_log} | head -1`
! echo '...'
! tail -2 `ls -t1 {transform_ucs_log} | head -1`

# Manipulating AdvAdj_frq-thrMIN-7-35f_min100x ucs table
path to this script: /share/compling/projects/sanpi/script/transform_ucs.sh
Wed May  8 19:36:58 EDT 2024
(TMP: /share/compling/projects/sanpi/results/ucs/adv_adj/RBXadj/tmp/tmp_RBXadj-20240508-193658.AdvAdj_frq-thrMIN-7-35f_min100x)
## Initial Contingency Info

DATA SET FILE:  /share/compling/projects/sanpi/results/ucs/adv_adj/RBXadj/AdvAdj_frq-thrMIN-7.35f_min100x.ds.gz

# Frequency signatures computed by the ucs-make-tables tool for relational cooccurrences.
# Sample size:  N = 86330753 tokens,  V = 1940305 pair types.
# A frequency threshold of f >= 100 was applied, leaving V = 55282 pair types.

##:: size = 55282
##:: threshold = 100

...
Loading data set /share/compling/projects/sanpi/results/ucs/adv_adj/RBXadj/AdvAdj_frq-thrMIN-7.35f_min100x.rsort.gz ... 55282 rows
Script finished at Wed May  8 19:37:30 EDT 2024




 4. Run `convert_ucs_to_csv()` (imported here as `ucs2csv`) to convert `ucs/[PAT_DIR]/readable/*.txt` to a format that `pandas` can parse as a dataframe

In [10]:
# Peek at rows 11-15 of the readable table, split into columns on commas.
! head -15 {readable} | tail -5 | column -t -s,
# Convert the readable table to CSV; the returned path is loaded with pandas below.
csv_path = ucs2csv(readable)
# Report the CSV location relative to the results root.
print(f'CSV: `{csv_path.relative_to(RESULT_DIR)}`')

fast       bowler        223  2.1621913e-02  4.0895153e+03  5.6438806403  0.97797792533  0.02723824671  0.97807017544  0.02723830463  8187    228  86330753
medically  induced       207  5.1598577e-02  3.3995261e+03  5.1945370450  0.97617410199  0.00985145549  0.97641509434  0.00985151342  21012   212  86330753
nerve      wracking      561  3.7696880e-03  1.4336513e+04  8.8419681624  0.97395828700  0.99292018023  0.97395833333  0.99292035398  565     576  86330753
only       half-joking   209  1.1667425e+00  2.1160061e+03  3.6591035362  0.95775935445  0.00045016902  0.96313364055  0.00045026219  464174  217  86330753
fully      multi-coated  145  4.2635221e-01  1.6518884e+03  3.8981875937  0.95744305077  0.00059478501  0.96026490066  0.00059485471  243757  151  86330753
UCS table text converted & saved as /share/compling/projects/sanpi/results/ucs/adv_adj/RBXadj/readable/AdvAdj_frq-thrMIN-7.35f_min100x.rsort-view_am-only.csv
CSV: `ucs/adv_adj/RBXadj/readable/AdvAdj_frq-thrMIN-7.35f_min1

In [None]:
# Load the association table and let pandas pick nullable, NA-aware dtypes.
adx_amdf = pd.read_csv(csv_path).convert_dtypes()
adx_amdf

Unnamed: 0,l1,l2,f,E11,am_log_likelihood,am_odds_ratio_disc,am_p1_given2,am_p2_given1,am_p1_given2_simple,am_p2_given1_simple,f1,f2,N
0,hip,flexor,104,0.00,2491.45,7.56,1.00,0.17,1.00,0.17,595,104,86330753
1,thoroughly,cum-dumped,175,0.09,2671.08,5.86,1.00,0.00,1.00,0.00,41945,175,86330753
2,super,duper,721,2.66,8064.58,5.11,0.99,0.00,1.00,0.00,318597,722,86330753
3,far,away,1284,6.71,13424.89,4.52,0.99,0.00,0.99,0.00,448431,1291,86330753
4,fait,accompli,264,0.00,7157.93,9.16,0.99,0.99,0.99,0.99,267,268,86330753
...,...,...,...,...,...,...,...,...,...,...,...,...,...
55277,very,impossible,129,23964.33,-49382.63,-2.32,-0.12,-0.00,0.00,0.00,10051689,205822,86330753
55278,very,easier,129,27673.63,-57254.24,-2.38,-0.12,-0.00,0.00,0.00,10051689,237680,86330753
55279,very,higher,158,31582.26,-65236.97,-2.35,-0.12,-0.00,0.00,0.00,10051689,271250,86330753
55280,very,worse,103,24935.84,-51729.40,-2.44,-0.12,-0.00,0.00,0.00,10051689,214166,86330753


In [None]:
# Label every row with an "<l1>~<l2>" key (string dtype) and use it as the index.
pair_key = (adx_amdf.l1 + '~' + adx_amdf.l2).astype('string')
adx_amdf = adx_amdf.assign(key=pair_key).set_index('key')
adx_amdf

Unnamed: 0_level_0,l1,l2,f,E11,am_log_likelihood,am_odds_ratio_disc,am_p1_given2,am_p2_given1,am_p1_given2_simple,am_p2_given1_simple,f1,f2,N
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
hip~flexor,hip,flexor,104,0.00,2491.45,7.56,1.00,0.17,1.00,0.17,595,104,86330753
thoroughly~cum-dumped,thoroughly,cum-dumped,175,0.09,2671.08,5.86,1.00,0.00,1.00,0.00,41945,175,86330753
super~duper,super,duper,721,2.66,8064.58,5.11,0.99,0.00,1.00,0.00,318597,722,86330753
far~away,far,away,1284,6.71,13424.89,4.52,0.99,0.00,0.99,0.00,448431,1291,86330753
fait~accompli,fait,accompli,264,0.00,7157.93,9.16,0.99,0.99,0.99,0.99,267,268,86330753
...,...,...,...,...,...,...,...,...,...,...,...,...,...
very~impossible,very,impossible,129,23964.33,-49382.63,-2.32,-0.12,-0.00,0.00,0.00,10051689,205822,86330753
very~easier,very,easier,129,27673.63,-57254.24,-2.38,-0.12,-0.00,0.00,0.00,10051689,237680,86330753
very~higher,very,higher,158,31582.26,-65236.97,-2.35,-0.12,-0.00,0.00,0.00,10051689,271250,86330753
very~worse,very,worse,103,24935.84,-51729.40,-2.44,-0.12,-0.00,0.00,0.00,10051689,214166,86330753


6. Save to `./results/assoc_df/`

In [None]:
# Mirror the CSV's location under UCS_DIR into AM_DF_DIR, dropping the
# `readable/` directory level and the `.rsort-view_am-only` name suffix.
relative_csv = (str(csv_path.relative_to(UCS_DIR))
                .replace('/readable', '')
                .replace('.rsort-view_am-only', ''))
df_csv_path = AM_DF_DIR.joinpath(relative_csv)
df_pkl_path = df_csv_path.with_suffix('.pkl.gz')

# Each format is written only when its file does not exist yet.
if not df_csv_path.is_file():
    confirm_dir(df_csv_path.parent)
    adx_amdf.to_csv(df_csv_path)

if not df_pkl_path.is_file():
    adx_amdf.to_pickle(df_pkl_path)

7. Add additional AM via `add_extra_am()`
   
   Define dictionary containing relevant vocab sizes
   
   !!! Warning This is a `#HACK`: \
       Rather than developing a command/code to retrieve the vocab sizes programmatically,
       I simply copied the values given in the log output of `transform_ucs.sh`
       for each `PAT_DIR`+`UNIT` combination

In [None]:
# Hand-copied vocabulary sizes, keyed by PAT_DIR and then by UNIT.
VOCABS = {
    # // 'ANYmirror': {'Adj': 83422},
    'NEGmirror': {'Adj': 40004},
    'POSmirror': {'Adj': 178159},
    'RBdirect':  {'Adj': 61860},
    'RBXadj':  {'Adj': 1940305}
}  # ! #HACK
# Look up the size for the current settings (raises KeyError for a PAT_DIR not listed above).
VOCAB = VOCABS[PAT_DIR][UNIT]

# Show the table of vocab sizes as markdown with thousands separators.
print(pd.DataFrame(VOCABS).convert_dtypes().to_markdown(intfmt=','))
# NOTE(review): this overrides the value looked up above, so `add_extra_am`
# receives `vocab=None` — confirm that this is intended.
VOCAB = None
# Extend the table with additional association measures.
ex_adx_amdf = add_extra_am(df=adx_amdf,
                           verbose=True,
                           vocab=VOCAB,
                           metrics=['t_score', 'mutual_information']
                           ).convert_dtypes()

|     |   NEGmirror |   POSmirror |   RBdirect |    RBXadj |
|:----|------------:|------------:|-----------:|----------:|
| Adj |      40,004 |     178,159 |     61,860 | 1,940,305 |

Preview of Extended Measures (rounded)

| key                   |   t_score |   mutual_information |   deltaP_min |   deltaP_max |   deltaP_max_abs |   deltaP_product |   unexpected_f |   unexpected_ratio |
|:----------------------|----------:|---------------------:|-------------:|-------------:|-----------------:|-----------------:|---------------:|-------------------:|
| hip~flexor            |     10.20 |                 5.16 |         0.17 |         1.00 |             1.00 |             0.17 |         104.00 |               1.00 |
| thoroughly~cum-dumped |     13.22 |                 3.31 |         0.00 |         1.00 |             1.00 |             0.00 |         174.91 |               1.00 |
| super~duper           |     26.75 |                 2.43 |         0.00 |         0.99 |             0.99 

Save extended AM tables to `extra/` subdirectory if not already saved

In [None]:
# The extended table is stored in an `extra/` directory next to the base table,
# with `_extra` appended to the file name.
extra_dir = df_csv_path.parent / 'extra'
df_extra_csv = extra_dir / df_csv_path.name.replace('.csv', '_extra.csv')
df_extra_pkl = df_extra_csv.with_suffix('.pkl.gz')
print(df_extra_csv)

# Write each format only if it has not been saved already.
if not df_extra_csv.is_file():
    confirm_dir(df_extra_csv.parent)
    ex_adx_amdf.to_csv(df_extra_csv)

if not df_extra_pkl.is_file():
    ex_adx_amdf.to_pickle(df_extra_pkl)

/share/compling/projects/sanpi/results/assoc_df/adv_adj/RBXadj/extra/AdvAdj_frq-thrMIN-7.35f_min100x_extra.csv


In [None]:


# Keep an untouched copy of the full extended table.
ex_adx_full = ex_adx_amdf.copy()

# Select the columns of interest that are actually present, pass them through
# `adjust_assoc_columns`, and order the rows by descending LRC.
focus_cols = [col for col in ['polarity', 'quant'] + FOCUS
              if col in ex_adx_amdf.columns]
ex_adx_abbr = adjust_assoc_columns(ex_adx_amdf[focus_cols])
ex_adx_abbr = ex_adx_abbr.sort_values('LRC', ascending=False)
cols = ex_adx_abbr.columns



 Define lexical items with given lean shown in binary environment evaluation

In [None]:
# Hand-picked lexical items, grouped by unit type ('Adj', 'Adv', 'Bigr').
# Presumably the items leaning toward positive environments — see the markdown
# note above this cell. The inline comments name the metric whose top ranks
# each group was taken from.
# NOTE: 'plain' is listed twice under 'Adv'; harmless for `isin` lookups.
pos_prone = {
    'Adj': [
        'unrelated',
        'unable',
        'akin',
        'larger',
        'different',
        'familiar',
        'similar',
        'likely',
        'brief',
        'unaware'
    ],
    'Adv': [
        'slightly',
        'definitely',
        'utterly',
        # LRC top
        'pretty',
        'rather',
        'plain',
        'fairly',
        'somewhat',
        'otherwise',
        'downright',
        'relatively',
        # G2 top
        # 'very',
        # 'even',
        # 'just',
        # dP1 top (and odds ratio disc)
        'plain',
        'maybe'
    ],
    'Bigr': [
        # G2 top
        'completely_different',
        'too_familiar',
        'even_better',
        # dP1 top
        'quite_different',
        'too_real',
        'well_aware',
        # LRC top
        'too_common',
        'entirely_different'
    ]}
# Hand-picked lexical items, grouped by unit type ('Bigr', 'Adv', 'Adj').
# Presumably the items leaning toward negative environments — see the markdown
# note above this cell. The inline comments name the metric whose top ranks
# each group was taken from.
# NOTE: 'alone' is listed twice under 'Adj'; harmless for `isin` lookups.
neg_prone = {
    'Bigr': [
        # LRC top
        'quite_sure',
        'really_sure',
        'too_early',
        'too_pleased',
        'too_fancy',
        # dP1 top
        'entirely_sure',
        'ever_easy',
        'ever_perfect',
        'particularly_surprising',
        'particularly_new',
        # G2 top
        'too_late',
        'more_important',
        'so_easy',
        'as_good',
        'too_old'
    ],
    'Adv': [
        'yet',
        # LRC top
        'ever',
        'any',
        'longer',
        'necessarily',
        'that',
        # dP1 top
        'before',
        'wise',  # ? How is this used as an adverb?
        'earthly',
        'remotely',
        'exactly',
        # G2 top
        'particularly',
        'too',
        # 'inherently'
    ],
    'Adj': [
        # LRC top
        'early',
        'late',
        'fancy',
        'alone',
        'sure',
        # dP1 top
        'shabby',
        'demoralizing',
        'alone',
        'aggravating',
        'groundbreaking',
        'eventful',
        # G2 top
        'important',
        'frustrating',
        'evident',
        'certain'
    ]
}


def sort_prone_by_f2(prone_list, amdf):
    """Return the items of ``prone_list`` found in ``amdf.l2``, by descending ``f2``.

    Rows of ``amdf`` whose ``l2`` value is in ``prone_list`` are reduced to
    their distinct ``(f2, l2)`` pairs and ordered from largest to smallest
    ``f2``; the resulting ``l2`` labels are returned as a list. Items that
    never occur in ``amdf.l2`` are dropped.
    """
    is_prone = amdf.l2.isin(prone_list)
    freq_by_item = (
        amdf.copy()
        .loc[is_prone, ['f2', 'l2']]
        .drop_duplicates()
        .reset_index(drop=True)
        .set_index('l2')
        .round(1)
    )
    ranked = freq_by_item.sort_values(['f2'], ascending=False)
    return ranked.index.to_list()


# Re-order the prone lists for the current UNIT by descending f2; items that
# do not occur in the table's `l2` column are dropped by `sort_prone_by_f2`.
pos_prone[UNIT] = sort_prone_by_f2(pos_prone[UNIT], ex_adx_abbr)
neg_prone[UNIT] = sort_prone_by_f2(neg_prone[UNIT], ex_adx_abbr)
pos_prone[UNIT]

['likely',
 'different',
 'similar',
 'familiar',
 'larger',
 'unable',
 'unaware',
 'brief',
 'unrelated',
 'akin']

Strongest associations for each polarity by metric

In [None]:
def show_metric_top(amdf: pd.DataFrame,
                    metric: str,
                    k=5,
                    cols=None):
    """Return the ``k`` rows with the largest ``metric``, indexed by ``(l1, l2)``.

    Parameters
    ----------
    amdf : pd.DataFrame
        Association table; must contain ``metric`` as well as ``l1`` and ``l2``.
    metric : str
        Column to rank by. It is placed first in the result.
    k : int, default 5
        Number of rows to keep.
    cols : iterable of str, optional
        Columns to include (any iterable: list, tuple, ``pd.Index``, ...).
        ``None`` — or the legacy all-falsy sentinel ``[None]`` — selects every
        column of ``amdf``. The selection must include ``l1`` and ``l2``,
        since they become the index.

    Returns
    -------
    pd.DataFrame
        Top-``k`` rows with ``metric`` as the first column, followed by the
        remaining selected columns in their given order.
    """
    # `None` default avoids a shared mutable default argument.
    if cols is None or not any(cols):
        cols = amdf.columns
    # A comprehension works for any iterable; the previous boolean-mask
    # indexing (`cols[cols != metric]`) failed for plain lists.
    others = [col for col in cols if col != metric]
    top = amdf.nlargest(k, metric).loc[:, [metric] + others]
    return top.reset_index(drop=True).set_index(['l1', 'l2'])

Top conservative log ratio $LRC$ values

In [None]:
# Eight pairs with the largest `LRC`.
exdf = show_metric_top(ex_adx_abbr, "LRC", k=8)
# Disabled step (`update_prone` is not defined in the visible part of this notebook):
# pos_prone, neg_prone = update_prone(exdf, pos_prone, neg_prone)
exdf

Unnamed: 0_level_0,Unnamed: 1_level_0,LRC,f,unexp_f,dP1,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f
l1,l2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
fait,accompli,20.78,264,264.0,0.99,0.99,0.99,0.99,7157.93,86330753,267,268,0.0
smack,dab,20.62,184,184.0,0.98,0.97,0.98,0.97,5096.69,86330753,189,187,0.0
nerve,wracking,20.61,561,561.0,0.97,0.99,0.97,0.99,14336.51,86330753,565,576,0.0
slam,dunk,20.56,158,158.0,0.94,0.86,0.94,0.86,4264.99,86330753,184,168,0.0
ex,officio,20.2,281,281.0,0.95,0.74,0.95,0.74,7109.1,86330753,382,295,0.0
bone,marrow,20.13,1147,1146.98,0.98,0.82,0.98,0.82,26533.24,86330753,1396,1168,0.02
bona,fide,20.09,454,454.0,0.93,0.99,0.93,0.99,11663.44,86330753,458,486,0.0
anal,retentive,17.5,233,233.0,0.52,0.68,0.52,0.68,5389.98,86330753,342,450,0.0


Top $\Delta P(\texttt{adv}|\texttt{adj})$ values

In [None]:
# Eight pairs with the largest `dP1`.
exdf = show_metric_top(ex_adx_abbr, 'dP1', k=8)
# Disabled step (`update_prone` is not defined in the visible part of this notebook):
# pos_prone, neg_prone = update_prone(exdf, pos_prone, neg_prone)
exdf

Unnamed: 0_level_0,Unnamed: 1_level_0,dP1,f,unexp_f,LRC,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f
l1,l2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
hip,flexor,1.0,104,104.0,16.69,0.17,1.0,0.17,2491.45,86330753,595,104,0.0
thoroughly,cum-dumped,1.0,175,174.91,11.31,0.0,1.0,0.0,2671.08,86330753,41945,175,0.09
super,duper,0.99,721,718.34,10.44,0.0,1.0,0.0,8064.58,86330753,318597,722,2.66
far,away,0.99,1284,1277.29,12.4,0.0,0.99,0.0,13424.89,86330753,448431,1291,6.71
fait,accompli,0.99,264,264.0,20.78,0.99,0.99,0.99,7157.93,86330753,267,268,0.0
smack,dab,0.98,184,184.0,20.62,0.97,0.98,0.97,5096.69,86330753,189,187,0.0
massively,multiplayer,0.98,169,168.96,13.79,0.01,0.98,0.01,2812.58,86330753,19291,172,0.04
bone,marrow,0.98,1147,1146.98,20.13,0.82,0.98,0.82,26533.24,86330753,1396,1168,0.02


Top conditional probability $P(\texttt{adv}|\texttt{adj})$ values

In [None]:
# Eight pairs with the largest `dP1_simple`.
exdf = show_metric_top(ex_adx_abbr, 'dP1_simple', k=8)
# Disabled step (`update_prone` is not defined in the visible part of this notebook):
# pos_prone, neg_prone = update_prone(exdf, pos_prone, neg_prone)
exdf

Unnamed: 0_level_0,Unnamed: 1_level_0,dP1_simple,f,unexp_f,LRC,dP1,dP2,dP2_simple,G2,N,f1,f2,exp_f
l1,l2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
hip,flexor,1.0,104,104.0,16.69,1.0,0.17,0.17,2491.45,86330753,595,104,0.0
thoroughly,cum-dumped,1.0,175,174.91,11.31,1.0,0.0,0.0,2671.08,86330753,41945,175,0.09
not,human-vetted,1.0,195,184.81,4.64,0.95,0.0,0.0,1151.11,86330753,4511534,195,10.19
more,infodownload,1.0,162,143.97,3.19,0.89,0.0,0.0,711.39,86330753,9607398,162,18.03
more,moderen,1.0,133,118.2,2.9,0.89,0.0,0.0,584.05,86330753,9607398,133,14.8
super,duper,1.0,721,718.34,10.44,0.99,0.0,0.0,8064.58,86330753,318597,722,2.66
far,away,0.99,1284,1277.29,12.4,0.99,0.0,0.0,13424.89,86330753,448431,1291,6.71
most,wanted,0.99,3371,3064.97,8.5,0.9,0.0,0.0,15796.15,86330753,7734049,3416,306.03


Top $\Delta P(\texttt{adj}|\texttt{adv})$ values

In [None]:
# Eight pairs with the largest `dP2`.
exdf = show_metric_top(ex_adx_abbr, 'dP2', k=8)
# Disabled step (`update_prone` is not defined in the visible part of this notebook):
# pos_prone, neg_prone = update_prone(exdf, pos_prone, neg_prone)
exdf

Unnamed: 0_level_0,Unnamed: 1_level_0,dP2,f,unexp_f,LRC,dP1,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f
l1,l2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
non-personally,identifiable,1.0,177,176.95,11.7,0.01,0.01,1.0,2881.18,86330753,177,25293,0.05
chock,full,1.0,1288,1286.41,9.66,0.01,0.01,1.0,17267.03,86330753,1288,106587,1.59
nerve,wracking,0.99,561,561.0,20.61,0.97,0.97,0.99,14336.51,86330753,565,576,0.0
awe,inspiring,0.99,140,139.94,11.1,0.0,0.0,0.99,2156.04,86330753,141,37532,0.06
scantily,clad,0.99,5367,5366.52,15.07,0.71,0.71,0.99,104950.63,86330753,5413,7606,0.48
sizey,deep,0.99,130,129.87,9.95,0.0,0.0,0.99,1794.29,86330753,131,83134,0.13
bona,fide,0.99,454,454.0,20.09,0.93,0.93,0.99,11663.44,86330753,458,486,0.0
fait,accompli,0.99,264,264.0,20.78,0.99,0.99,0.99,7157.93,86330753,267,268,0.0


Top conditional probability $P(\texttt{adj}|\texttt{adv})$ values

In [None]:
# Eight pairs with the largest `dP2_simple`.
exdf = show_metric_top(ex_adx_abbr, 'dP2_simple', k=8)
# Disabled step (`update_prone` is not defined in the visible part of this notebook):
# pos_prone, neg_prone = update_prone(exdf, pos_prone, neg_prone)
exdf

Unnamed: 0_level_0,Unnamed: 1_level_0,dP2_simple,f,unexp_f,LRC,dP1,dP2,dP1_simple,G2,N,f1,f2,exp_f
l1,l2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
non-personally,identifiable,1.0,177,176.95,11.7,0.01,1.0,0.01,2881.18,86330753,177,25293,0.05
chock,full,1.0,1288,1286.41,9.66,0.01,1.0,0.01,17267.03,86330753,1288,106587,1.59
nerve,wracking,0.99,561,561.0,20.61,0.97,0.99,0.97,14336.51,86330753,565,576,0.0
awe,inspiring,0.99,140,139.94,11.1,0.0,0.99,0.0,2156.04,86330753,141,37532,0.06
sizey,deep,0.99,130,129.87,9.95,0.0,0.99,0.0,1794.29,86330753,131,83134,0.13
scantily,clad,0.99,5367,5366.52,15.07,0.71,0.99,0.71,104950.63,86330753,5413,7606,0.48
bona,fide,0.99,454,454.0,20.09,0.93,0.99,0.93,11663.44,86330753,458,486,0.0
fait,accompli,0.99,264,264.0,20.78,0.99,0.99,0.99,7157.93,86330753,267,268,0.0


Top log-likelihood $G^2$ values

In [None]:
# Eight pairs with the largest `G2`.
exdf = show_metric_top(ex_adx_abbr, 'G2', k=8)
# Disabled step (`update_prone` is not defined in the visible part of this notebook):
# pos_prone, neg_prone = update_prone(exdf, pos_prone, neg_prone)
exdf

Unnamed: 0_level_0,Unnamed: 1_level_0,G2,f,unexp_f,LRC,dP1,dP2,dP1_simple,dP2_simple,N,f1,f2,exp_f
l1,l2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
so,many,3735386.5,1191874,1042704.95,4.0,0.48,0.19,0.54,0.2,86330753,5819223,2212989,149169.05
not,sure,1680246.0,467213,423055.38,4.47,0.51,0.1,0.55,0.1,86330753,4511534,844981,44157.62
too,much,1636073.6,583185,508918.75,3.47,0.29,0.15,0.33,0.16,86330753,3604498,1778739,74266.25
much,more,1434754.7,355655,331551.0,4.44,0.33,0.17,0.34,0.18,86330753,2015845,1032280,24104.0
much,better,1282245.6,295224,277866.87,4.77,0.38,0.14,0.4,0.15,86330753,2015845,743338,17357.13
so,much,1232042.7,614676,494778.07,2.86,0.28,0.09,0.35,0.11,86330753,5819223,1778739,119897.93
most,recent,1114531.3,313185,277177.86,5.14,0.69,0.04,0.78,0.04,86330753,7734049,401927,36007.14
most,important,1106213.5,748533,551162.29,2.38,0.26,0.08,0.34,0.1,86330753,7734049,2203136,197370.71


In [None]:
# Keep only pairs whose absolute LRC exceeds 1; the following sections refer
# to these as having "significant LRC".
sig_adx_abbr = ex_adx_abbr.loc[ex_adx_abbr.LRC.abs() > 1, :]
sig_adx_abbr

Unnamed: 0_level_0,f,unexp_f,LRC,dP1,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f,l1,l2
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
fait~accompli,264,264.00,20.78,0.99,0.99,0.99,0.99,7157.93,86330753,267,268,0.00,fait,accompli
smack~dab,184,184.00,20.62,0.98,0.97,0.98,0.97,5096.69,86330753,189,187,0.00,smack,dab
nerve~wracking,561,561.00,20.61,0.97,0.99,0.97,0.99,14336.51,86330753,565,576,0.00,nerve,wracking
slam~dunk,158,158.00,20.56,0.94,0.86,0.94,0.86,4264.99,86330753,184,168,0.00,slam,dunk
ex~officio,281,281.00,20.20,0.95,0.74,0.95,0.74,7109.10,86330753,382,295,0.00,ex,officio
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
very~available,403,-100459.05,-7.79,-0.12,-0.01,0.00,0.00,-210254.79,86330753,10051689,866272,100862.05,very,available
more~enough,115,-50385.44,-8.28,-0.11,-0.01,0.00,0.00,-105717.87,86330753,9607398,453790,50500.44,more,enough
more~many,373,-245901.54,-9.17,-0.11,-0.03,0.00,0.00,-523711.57,86330753,9607398,2212989,246274.54,more,many
most~many,140,-198113.40,-10.00,-0.09,-0.03,0.00,0.00,-418743.50,86330753,7734049,2212989,198253.40,most,many


Positive Prone Adverbs with significant LRC

In [None]:
# Rows whose adverb (`l1`) is in the `pos_prone['Adv']` list.
sig_adx_abbr.loc[sig_adx_abbr.l1.isin(pos_prone['Adv'])]

Unnamed: 0_level_0,f,unexp_f,LRC,dP1,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f,l1,l2
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
otherwise~product-specific,105,104.84,10.14,0.81,0.00,0.81,0.00,1278.72,86330753,106856,130,0.16,otherwise,product-specific
slightly~ajar,638,634.19,8.94,0.77,0.00,0.78,0.00,5985.76,86330753,400197,822,3.81,slightly,ajar
otherwise~law-abiding,500,498.30,8.45,0.36,0.00,0.36,0.00,4898.19,86330753,106856,1373,1.70,otherwise,law-abiding
slightly~stoopid,180,178.60,7.47,0.59,0.00,0.60,0.00,1528.40,86330753,400197,302,1.40,slightly,stoopid
slightly~off-center,316,313.21,7.31,0.52,0.00,0.52,0.00,2566.21,86330753,400197,602,2.79,slightly,off-center
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
pretty~ready,152,-4447.26,-4.37,-0.02,-0.00,0.00,0.00,-7953.56,86330753,1652360,240297,4599.26,pretty,ready
relatively~different,133,-6473.91,-5.03,-0.01,-0.01,0.00,0.00,-12023.06,86330753,626884,909864,6606.91,relatively,different
rather~many,173,-10148.44,-5.36,-0.00,-0.03,0.00,0.00,-19190.39,86330753,402649,2212989,10321.44,rather,many
relatively~important,281,-15716.90,-5.42,-0.01,-0.03,0.00,0.00,-29678.85,86330753,626884,2203136,15997.90,relatively,important




 Negative Prone Adverbs with significant LRC

In [None]:
# Rows whose adverb (`l1`) is in the `neg_prone['Adv']` list.
sig_adx_abbr.loc[sig_adx_abbr.l1.isin(neg_prone['Adv'])]

Unnamed: 0_level_0,f,unexp_f,LRC,dP1,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f,l1,l2
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
yet~unborn,463,462.25,10.52,0.73,0.00,0.73,0.00,5505.48,86330753,101707,635,0.75,yet,unborn
remotely~exploitable,154,153.70,8.61,0.13,0.01,0.13,0.01,1639.61,86330753,22194,1158,0.30,remotely,exploitable
exactly~alike,3040,3030.54,8.56,0.23,0.05,0.23,0.05,29939.31,86330753,61599,13261,9.46,exactly,alike
yet~unnamed,771,768.38,8.49,0.35,0.01,0.35,0.01,7535.14,86330753,101707,2227,2.62,yet,unnamed
ever~olympic,229,228.24,8.42,0.43,0.00,0.43,0.00,2273.22,86330753,124592,529,0.76,ever,olympic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
too~likely,1196,-42592.94,-5.05,-0.04,-0.01,0.00,0.00,-78883.59,86330753,3604498,1048782,43788.94,too,likely
too~necessary,127,-7697.19,-5.37,-0.04,-0.00,0.00,0.00,-14690.20,86330753,3604498,187396,7824.19,too,necessary
too~responsible,124,-8672.69,-5.57,-0.04,-0.00,0.00,0.00,-16677.58,86330753,3604498,210688,8796.69,too,responsible
too~possible,234,-14974.86,-5.62,-0.04,-0.00,0.00,0.00,-28694.70,86330753,3604498,364265,15208.86,too,possible




 Positive Prone Adjectives with significant LRC

In [None]:
# Rows whose adjective (`l2`) is in the `pos_prone['Adj']` list.
sig_adx_abbr.loc[sig_adx_abbr.l2.isin(pos_prone['Adj'])]

Unnamed: 0_level_0,f,unexp_f,LRC,dP1,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f,l1,l2
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
blissfully~unaware,3112,3109.50,10.34,0.11,0.40,0.11,0.40,39949.36,86330753,7768,27809,2.50,blissfully,unaware
blithely~unaware,255,254.72,9.46,0.01,0.29,0.01,0.29,3048.13,86330753,874,27809,0.28,blithely,unaware
mercifully~brief,290,289.43,8.62,0.01,0.15,0.01,0.15,3084.05,86330753,1927,25571,0.57,mercifully,brief
confusingly~similar,700,697.18,7.80,0.00,0.63,0.00,0.64,6914.49,86330753,1100,221410,2.82,confusingly,similar
intimately~familiar,3832,3811.43,7.48,0.02,0.45,0.02,0.46,34594.29,86330753,8385,211750,20.57,intimately,familiar
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
less~different,198,-13511.54,-5.63,-0.02,-0.01,0.00,0.00,-25691.21,86330753,1300804,909864,13709.54,less,different
most~different,1364,-80147.31,-5.84,-0.09,-0.01,0.00,0.00,-157571.34,86330753,7734049,909864,81511.31,most,different
so~larger,122,-10118.07,-5.85,-0.07,-0.00,0.00,0.00,-19880.09,86330753,5819223,151916,10240.07,so,larger
much~likely,259,-24230.33,-6.15,-0.02,-0.01,0.00,0.00,-46973.59,86330753,2015845,1048782,24489.33,much,likely




 Negative Prone Adjectives with significant LRC

In [None]:
# Rows whose adjective (`l2`) is in the `neg_prone['Adj']` list.
sig_adx_abbr.loc[sig_adx_abbr.l2.isin(neg_prone['Adj'])]

Unnamed: 0_level_0,f,unexp_f,LRC,dP1,dP2,dP1_simple,dP2_simple,G2,N,f1,f2,exp_f,l1,l2
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
too~late,136383,129584.07,6.84,0.80,0.04,0.84,0.04,728970.71,86330753,3604498,162840,6798.93,too,late
fashionably~late,384,380.38,6.41,0.00,0.20,0.00,0.20,2903.39,86330753,1919,162840,3.62,fashionably,late
amply~evident,102,101.18,6.28,0.00,0.09,0.00,0.09,789.89,86330753,1168,60888,0.82,amply,evident
plainly~evident,367,362.07,5.86,0.01,0.05,0.01,0.05,2460.79,86330753,6989,60888,4.93,plainly,evident
too~shabby,5903,5568.65,5.83,0.70,0.00,0.74,0.00,28458.74,86330753,3604498,8008,334.35,too,shabby
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
totally~important,122,-9135.96,-5.61,-0.00,-0.03,0.00,0.00,-17488.68,86330753,362777,2203136,9257.96,totally,important
extremely~sure,111,-9599.34,-5.79,-0.01,-0.01,0.00,0.00,-18410.50,86330753,992094,844981,9710.34,extremely,sure
slightly~important,106,-10106.91,-5.90,-0.00,-0.03,0.00,0.00,-19552.84,86330753,400197,2203136,10212.91,slightly,important
most~late,157,-14431.23,-6.10,-0.09,-0.00,0.00,0.00,-28830.40,86330753,7734049,162840,14588.23,most,late
