# Identifying Adverbs with Strongest Negative Environment Associations

In [51]:
from pathlib import Path

import pandas as pd
from pprint import pprint
# from source.utils import PKL_SUFF
from source.utils.associate import AM_DF_DIR, TOP_AM_DIR, adjust_assoc_columns
from source.utils.general import print_iter, snake_to_camel, timestamp_today, confirm_dir

SET_FLOOR = 300
MIR_FLOOR = 100
K = 6

TAG='NEQ'
TOP_AM_TAG_DIR = TOP_AM_DIR / TAG
confirm_dir(TOP_AM_TAG_DIR)

# for loading `polar/*/bigram/*` tables
bigram_floor = 100
mirror_floor = 50

Set columns and diplay settings

In [52]:
FOCUS = ['f',
         'am_p1_given2', 'am_p1_given2_simple', 'conservative_log_ratio',
         'am_log_likelihood',
        #  'mutual_information', 
         'am_odds_ratio_disc', 't_score',
         'N', 'f1', 'f2', 'E11', 'unexpected_f', 
         'l1', 'l2']
pd.set_option('display.max_colwidth', 20)
pd.set_option('display.max_columns', 12)
pd.set_option('display.width', 90)
pd.set_option("display.precision", 2)
pd.set_option("styler.format.precision", 2)
pd.set_option("styler.format.thousands", ",")
pd.set_option("display.float_format", '{:,.2f}'.format)
# pd.set_option("styler.render.repr", "html")

In [53]:
def force_ints(_df):
    count_cols = _df.filter(regex=r'total|^[fN]').columns
    _df[count_cols] = _df[count_cols].astype('int')
    # _df[count_cols] = _df[:, count_cols].astype('int64')
    # print(_df.dtypes.to_frame('dtypes'))
    return _df

In [54]:
def nb_show_table(df, n_dec: int = 2,
                  adjust_columns: bool = True,
                   outpath:Path=None, 
                   return_df:bool=False) -> None: 
    _df = df.copy()
    try: 
        start_0 = _df.index.start == 0
    except AttributeError: 
        pass
    else:
        _df.index.name = 'rank'
        if start_0: 
            _df.index = _df.index + 1
    if adjust_columns: 
        _df = adjust_assoc_columns(_df)
    _df.columns = [f'`{c}`' for c in _df.columns]
    _df.index = [f'**{r}**' for r in _df.index ]
    table = _df.to_markdown(floatfmt=f',.{n_dec}f', intfmt=',')
    if outpath:
        outpath.write_text(table)

    print(f'\n{table}\n')
    return (_df if return_df else None)

## Set paths and load adverb association tables

In [55]:
def update_index(df, pat_name:str = None):
    neg_env_name = df.filter(like='NEG', axis=0).l1.iloc[0]
    # > will be either `NEGATED` or `NEGMIR`
    #   both are shortened to just `NEG` for the keys in their separate dataframes
    # > replace to avoid ambiguity in `key` values when combined
    #! some filtering relies on 'NEG', so have to keep that prefix
    index_update = pat_name or ('NEGmir' if neg_env_name.endswith('MIR') else 'NEGany')
    df.index = df.index.str.replace('NEG', index_update)
    return df

In [56]:
POLAR_DIR = AM_DF_DIR.joinpath('polar')

polar_adv_dirs = []
# results/assoc_df/polar/RBdirect/adv/extra/polarized-adv_35f-7c_min5000x_extra.pkl.gz
adv_am_paths = {
    p.name: tuple(
        p.joinpath('adv/extra').glob(
            f'*{TAG}*min{SET_FLOOR}*parq')
    )[0]
    for p in POLAR_DIR.iterdir()}
pprint(adv_am_paths)

{'RBdirect': PosixPath('/share/compling/projects/sanpi/results/assoc_df/polar/RBdirect/adv/extra/polarized-adv_NEQ-direct_min300x_extra.parq'),
 'mirror': PosixPath('/share/compling/projects/sanpi/results/assoc_df/polar/mirror/adv/extra/polarized-adv_NEQ-mirror_min300x_extra.parq')}


In [57]:
setdiff_adv = update_index(pd.read_parquet(adv_am_paths['RBdirect'], columns=FOCUS))
mirror_adv = update_index(pd.read_parquet(adv_am_paths['mirror'], columns=FOCUS))
nb_show_table(setdiff_adv.sample(min(6,K)).sort_values('conservative_log_ratio', ascending=False))


|                   |    `f` |   `dP1` |   `dP1_simple` |   `LRC` |      `G2` |   `odds_r_disc` |   `t` |       `N` |      `f1` |   `f2` |   `exp_f` |   `unexp_f` | `l1`       | `l2`     |
|:------------------|-------:|--------:|---------------:|--------:|----------:|----------------:|------:|----------:|----------:|-------:|----------:|------------:|:-----------|:---------|
| **COM~once**      |  4,616 |    0.46 |           0.96 |    4.14 |  5,117.25 |            1.41 | 32.65 | 6,347,364 | 3,173,552 |  4,796 |  2,397.90 |    2,218.10 | COMPLEMENT | once     |
| **COM~vastly**    |  1,752 |    0.40 |           0.90 |    2.57 |  1,400.72 |            0.93 | 18.48 | 6,347,364 | 3,173,552 |  1,957 |    978.46 |      773.54 | COMPLEMENT | vastly   |
| **COM~severely**  |    927 |    0.35 |           0.85 |    1.85 |    574.80 |            0.74 | 12.43 | 6,347,364 | 3,173,552 |  1,097 |    548.48 |      378.52 | COMPLEMENT | severely |
| **NEGany~unduly** |    930 |    0.33 |           0.8


|                           |    `f` |   `dP1` |   `dP1_simple` |   `LRC` |       `G2` |   `odds_r_disc` |     `t` |       `N` |      `f1` |   `f2` |   `exp_f` |   `unexp_f` | `l1`       | `l2`           |
|:--------------------------|-------:|--------:|---------------:|--------:|-----------:|----------------:|--------:|----------:|----------:|-------:|----------:|------------:|:-----------|:---------------|
| **COM~certainly**         |  4,454 |    0.47 |           0.97 |    4.34 |   5,085.46 |            1.48 |   32.28 | 6,347,364 | 3,173,552 |  4,600 |  2,299.91 |    2,154.09 | COMPLEMENT | certainly      |
| **COM~partially**         |  1,027 |    0.41 |           0.91 |    2.59 |     876.99 |            1.00 |   14.42 | 6,347,364 | 3,173,552 |  1,130 |    564.98 |      462.02 | COMPLEMENT | partially      |
| **NEGany~overwhelmingly** |    905 |   -0.10 |           0.40 |   -0.24 |     -81.68 |           -0.17 |   -7.08 | 6,347,364 | 3,173,660 |  2,236 |  1,117.99 |     -212.99 | NEGATED    | overwhelmingly |
| **COM~financially**       |  2,090 |   -0.10 |           0.40 |   -0.39 |    -224.93 |           -0.18 |  -11.85 | 6,347,364 | 3,173,552 |  5,264 |  2,631.89 |     -541.89 | COMPLEMENT | financially    |
| **NEGany~ultimately**     |    509 |   -0.26 |           0.24 |   -1.32 |    -632.42 |           -0.51 |  -25.22 | 6,347,364 | 3,173,660 |  2,156 |  1,077.99 |     -568.99 | NEGATED    | ultimately     |
| **COM~particularly**      | 20,635 |   -0.23 |           0.27 |   -1.37 | -16,788.46 |           -0.43 | -121.44 | 6,347,364 | 3,173,552 | 76,162 | 38,079.44 |  -17,444.44 | COMPLEMENT | particularly   |



In [58]:
nb_show_table(mirror_adv.sample(min(6,K)).sort_values('conservative_log_ratio', ascending=False))


|                     |   `f` |   `dP1` |   `dP1_simple` |   `LRC` |     `G2` |   `odds_r_disc` |   `t` |     `N` |    `f1` |   `f2` |   `exp_f` |   `unexp_f` | `l1`   | `l2`     |
|:--------------------|------:|--------:|---------------:|--------:|---------:|----------------:|------:|--------:|--------:|-------:|----------:|------------:|:-------|:---------|
| **POS~pretty**      | 5,049 |    0.48 |           0.98 |    4.71 | 6,024.97 |            1.61 | 34.64 | 583,470 | 291,729 |  5,176 |  2,587.95 |    2,461.05 | POSMIR | pretty   |
| **POS~rather**      | 1,753 |    0.48 |           0.98 |    4.64 | 2,158.90 |            1.73 | 20.55 | 583,470 | 291,729 |  1,785 |    892.48 |      860.52 | POSMIR | rather   |
| **NEGmir~remotely** | 1,840 |    0.44 |           0.94 |    3.37 | 1,849.23 |            1.21 | 20.13 | 583,470 | 291,732 |  1,953 |    976.49 |      863.51 | NEGMIR | remotely |
| **POS~all**         | 1,114 |    0.11 |           0.61 |    0.32 |    93.57 |            0.2


|                  |    `f` |   `dP1` |   `dP1_simple` |   `LRC` |      `G2` |   `odds_r_disc` |    `t` |     `N` |    `f1` |   `f2` |   `exp_f` |   `unexp_f` | `l1`   | `l2`     |
|:-----------------|-------:|--------:|---------------:|--------:|----------:|----------------:|-------:|--------:|--------:|-------:|----------:|------------:|:-------|:---------|
| **POS~slightly** |  1,513 |    0.44 |           0.93 |    3.16 |  1,465.03 |            1.15 |  18.09 | 583,470 | 291,729 |  1,619 |    809.48 |      703.52 | POSMIR | slightly |
| **POS~very**     | 36,427 |    0.33 |           0.80 |    1.95 | 19,317.81 |            0.66 |  72.08 | 583,470 | 291,729 | 45,341 | 22,670.03 |   13,756.97 | POSMIR | very     |
| **POS~utterly**  |    435 |    0.34 |           0.84 |    1.55 |    257.92 |            0.71 |   8.39 | 583,470 | 291,729 |    520 |    259.99 |      175.01 | POSMIR | utterly  |
| **POS~mentally** |    369 |    0.30 |           0.80 |    1.19 |    176.58 |            0.60 |   7.18 | 583,470 | 291,729 |    462 |    231.00 |      138.00 | POSMIR | mentally |
| **NEGmir~as**    | 31,169 |    0.18 |           0.66 |    0.89 |  5,411.20 |            0.31 |  43.02 | 583,470 | 291,732 | 47,150 | 23,574.76 |    7,594.24 | NEGMIR | as       |
| **POS~overly**   |    589 |   -0.20 |           0.30 |   -0.89 |   -336.13 |           -0.37 | -16.54 | 583,470 | 291,729 |  1,981 |    990.48 |     -401.48 | POSMIR | overly   |



## Calculate "Most Negative" Adverbs for each Polarity Approximation

In [59]:
def get_top_vals(df: pd.DataFrame,
                 index_like: str = 'NEG',
                 metric_filter: str | list = ['am_p1_given2', 'conservative_log_ratio'],
                 k: int = 10,
                 val_col: str = None,
                 ignore_neg_adv: bool = True):
    env_df = df.copy().loc[df.conservative_log_ratio >=
                           1].filter(like=index_like, axis=0)
    if ignore_neg_adv:
        env_df = env_df.loc[~df.l2.isin(
            ("n't", 'not', 'barely', 'never', 'no', 'none')), :]
    if isinstance(metric_filter, str):
        metric_filter = [metric_filter]

    top = pd.concat([env_df.nlargest(k, m) for m in metric_filter]
                    ).drop_duplicates(keep='first')

    if val_col:
        top = top[[val_col] + metric_filter]

    return top.sort_values(metric_filter, ascending=False)


[setdiff_top15, mirror_top15] = [
    get_top_vals(adv_df, k=15)
    for adv_df in (setdiff_adv, mirror_adv)
]


In [60]:
nb_show_table(setdiff_top15.reset_index().filter(regex=r'^[^l]'))


|        | `key`              |     `f` |   `dP1` |   `dP1_simple` |   `LRC` |       `G2` |   `odds_r_disc` |    `t` |       `N` |      `f1` |    `f2` |   `exp_f` |   `unexp_f` |
|:-------|:-------------------|--------:|--------:|---------------:|--------:|-----------:|----------------:|-------:|----------:|----------:|--------:|----------:|------------:|
| **1**  | NEGany~that        | 164,768 |    0.50 |           0.99 |    6.26 | 214,504.57 |            1.96 | 200.61 | 6,347,364 | 3,173,660 | 166,676 | 83,337.42 |   81,430.58 |
| **2**  | NEGany~necessarily |  42,595 |    0.50 |           0.99 |    6.77 |  56,251.14 |            2.17 | 102.49 | 6,347,364 | 3,173,660 |  42,886 | 21,442.85 |   21,152.15 |
| **3**  | NEGany~exactly     |  43,813 |    0.49 |           0.98 |    5.71 |  54,870.72 |            1.81 | 103.01 | 6,347,364 | 3,173,660 |  44,503 | 22,251.35 |   21,561.65 |
| **4**  | NEGany~immediately |  56,099 |    0.47 |           0.97 |    4.68 |  63,920.54 |            1

### `NEQ` 15 Most Negatively Associated Adverbs for superset data (
    
_Absent Negative_ approximation) as ranked by $\Delta P(1|2)$ (`dP1`) and $LRC$

|        | `key`              |     `f` |   `dP1` |   `dP1_simple` |   `LRC` |       `G2` |   `odds_r_disc` |    `t` |       `N` |      `f1` |    `f2` |   `exp_f` |   `unexp_f` |
|:-------|:-------------------|--------:|--------:|---------------:|--------:|-----------:|----------------:|-------:|----------:|----------:|--------:|----------:|------------:|
| **1**  | NEGany~that        | 164,768 |    0.50 |           0.99 |    6.26 | 214,504.57 |            1.96 | 200.61 | 6,347,364 | 3,173,660 | 166,676 | 83,337.42 |   81,430.58 |
| **2**  | NEGany~necessarily |  42,595 |    0.50 |           0.99 |    6.77 |  56,251.14 |            2.17 | 102.49 | 6,347,364 | 3,173,660 |  42,886 | 21,442.85 |   21,152.15 |
| **3**  | NEGany~exactly     |  43,813 |    0.49 |           0.98 |    5.71 |  54,870.72 |            1.81 | 103.01 | 6,347,364 | 3,173,660 |  44,503 | 22,251.35 |   21,561.65 |
| **4**  | NEGany~immediately |  56,099 |    0.47 |           0.97 |    4.68 |  63,920.54 |            1.47 | 114.33 | 6,347,364 | 3,173,660 |  58,040 | 29,019.80 |   27,079.20 |
| **5**  | NEGany~yet         |  51,867 |    0.47 |           0.96 |    4.52 |  57,900.12 |            1.42 | 109.45 | 6,347,364 | 3,173,660 |  53,881 | 26,940.31 |   24,926.69 |
| **6**  | NEGany~before      |     308 |    0.45 |           0.95 |    2.68 |     326.41 |            1.30 |   8.35 | 6,347,364 | 3,173,660 |     323 |    161.50 |      146.50 |
| **7**  | NEGany~any         |  15,384 |    0.45 |           0.95 |    3.91 |  15,851.55 |            1.26 |  58.57 | 6,347,364 | 3,173,660 |  16,238 |  8,118.94 |    7,265.06 |
| **8**  | NEGany~anymore     |     422 |    0.45 |           0.95 |    2.76 |     431.36 |            1.24 |   9.69 | 6,347,364 | 3,173,660 |     446 |    223.00 |      199.00 |
| **9**  | NEGany~remotely    |   5,661 |    0.42 |           0.92 |    3.16 |   5,075.57 |            1.05 |  34.30 | 6,347,364 | 3,173,660 |   6,161 |  3,080.48 |    2,580.52 |
| **10** | NEGany~terribly    |  17,949 |    0.41 |           0.91 |    3.10 |  15,186.21 |            0.99 |  60.07 | 6,347,364 | 3,173,660 |  19,802 |  9,900.93 |    8,048.07 |
| **11** | NEGany~only        | 113,502 |    0.39 |           0.89 |    2.89 |  88,060.81 |            0.90 | 146.68 | 6,347,364 | 3,173,660 | 128,174 | 64,086.56 |   49,415.44 |
| **12** | NEGany~altogether  |   4,568 |    0.39 |           0.89 |    2.64 |   3,490.73 |            0.89 |  29.44 | 6,347,364 | 3,173,660 |   5,156 |  2,577.98 |    1,990.02 |
| **13** | NEGany~overly      |  24,613 |    0.38 |           0.87 |    2.67 |  17,861.62 |            0.85 |  67.23 | 6,347,364 | 3,173,660 |  28,132 | 14,065.90 |   10,547.10 |
| **14** | NEGany~entirely    |  63,321 |    0.37 |           0.87 |    2.66 |  45,040.32 |            0.83 | 106.96 | 6,347,364 | 3,173,660 |  72,811 | 36,405.25 |   26,915.75 |
| **15** | NEGany~consciously |     925 |    0.37 |           0.87 |    2.05 |     639.67 |            0.81 |  12.86 | 6,347,364 | 3,173,660 |   1,068 |    534.00 |      391.00 |
| **16** | NEGany~merely      |   5,918 |    0.34 |           0.84 |    2.19 |   3,642.62 |            0.73 |  31.33 | 6,347,364 | 3,173,660 |   7,016 |  3,507.98 |    2,410.02 |



In [61]:
nb_show_table(mirror_top15.reset_index().filter(regex=r'^[^l]'))


|        | `key`                |   `f` |   `dP1` |   `dP1_simple` |   `LRC` |     `G2` |   `odds_r_disc` |   `t` |     `N` |    `f1` |   `f2` |   `exp_f` |   `unexp_f` |
|:-------|:---------------------|------:|--------:|---------------:|--------:|---------:|----------------:|------:|--------:|--------:|-------:|----------:|------------:|
| **1**  | NEGmir~ever          | 4,709 |    0.49 |           0.98 |    5.17 | 5,883.26 |            1.79 | 33.75 | 583,470 | 291,732 |  4,786 |  2,392.98 |    2,316.02 |
| **2**  | NEGmir~any           | 1,066 |    0.47 |           0.97 |    4.00 | 1,252.02 |            1.56 | 15.88 | 583,470 | 291,732 |  1,095 |    547.49 |      518.51 |
| **3**  | NEGmir~necessarily   |   963 |    0.47 |           0.97 |    3.86 | 1,114.70 |            1.52 | 15.05 | 583,470 | 291,732 |    992 |    495.99 |      467.01 |
| **4**  | NEGmir~that          | 4,308 |    0.45 |           0.94 |    3.66 | 4,405.21 |            1.24 | 30.91 | 583,470 | 291,732 |  4,559 |

### `NEQ` 15 Most Negatively Associated Adverbs for `mirror` subset 

(_Present Positive_ approximation) as ranked by $\Delta P(1|2)$ (`dP1`) and $LRC$

|        | `key`                |   `f` |   `dP1` |   `dP1_simple` |   `LRC` |     `G2` |   `odds_r_disc` |   `t` |     `N` |    `f1` |   `f2` |   `exp_f` |   `unexp_f` |
|:-------|:---------------------|------:|--------:|---------------:|--------:|---------:|----------------:|------:|--------:|--------:|-------:|----------:|------------:|
| **1**  | NEGmir~ever          | 4,709 |    0.49 |           0.98 |    5.17 | 5,883.26 |            1.79 | 33.75 | 583,470 | 291,732 |  4,786 |  2,392.98 |    2,316.02 |
| **2**  | NEGmir~any           | 1,066 |    0.47 |           0.97 |    4.00 | 1,252.02 |            1.56 | 15.88 | 583,470 | 291,732 |  1,095 |    547.49 |      518.51 |
| **3**  | NEGmir~necessarily   |   963 |    0.47 |           0.97 |    3.86 | 1,114.70 |            1.52 | 15.05 | 583,470 | 291,732 |    992 |    495.99 |      467.01 |
| **4**  | NEGmir~that          | 4,308 |    0.45 |           0.94 |    3.66 | 4,405.21 |            1.24 | 30.91 | 583,470 | 291,732 |  4,559 |  2,279.48 |    2,028.52 |
| **5**  | NEGmir~remotely      | 1,840 |    0.44 |           0.94 |    3.37 | 1,849.23 |            1.21 | 20.13 | 583,470 | 291,732 |  1,953 |    976.49 |      863.51 |
| **6**  | NEGmir~exactly       |   813 |    0.44 |           0.94 |    2.95 |   790.27 |            1.16 | 13.27 | 583,470 | 291,732 |    869 |    434.50 |      378.50 |
| **7**  | NEGmir~particularly  | 9,243 |    0.43 |           0.92 |    3.30 | 8,516.58 |            1.08 | 43.98 | 583,470 | 291,732 | 10,029 |  5,014.45 |    4,228.55 |
| **8**  | NEGmir~inherently    | 2,864 |    0.36 |           0.86 |    2.24 | 1,899.59 |            0.78 | 22.29 | 583,470 | 291,732 |  3,342 |  1,670.98 |    1,193.02 |
| **9**  | NEGmir~intrinsically |   433 |    0.34 |           0.84 |    1.58 |   262.63 |            0.72 |  8.43 | 583,470 | 291,732 |    515 |    257.50 |      175.50 |
| **10** | NEGmir~overtly       |   391 |    0.33 |           0.83 |    1.43 |   219.61 |            0.68 |  7.81 | 583,470 | 291,732 |    473 |    236.50 |      154.50 |
| **11** | NEGmir~especially    | 1,569 |    0.28 |           0.78 |    1.43 |   658.82 |            0.54 | 14.13 | 583,470 | 291,732 |  2,019 |  1,009.49 |      559.51 |
| **12** | NEGmir~fully         | 1,664 |    0.23 |           0.73 |    1.08 |   492.40 |            0.43 | 12.75 | 583,470 | 291,732 |  2,288 |  1,143.99 |      520.01 |



### Or here, the least "negative"/most "non-negative"

In [62]:
def show_top_positive(adv_df, 
                      k:int=15, 
                      filter_and_sort:list=['conservative_log_ratio', 
                                            'am_log_likelihood', 
                                            'am_p1_given2']):
    
    _l1 = adv_df.filter(like='O', axis=0).l1.iat[0].lower().strip()
    _N = int(adv_df.N.iat[0])
    ie = '(`set_diff`, $*\complement_{N^+}$)' if _l1.startswith("com") else '(`mirror`, $@P$)'
    print(f'#### Adverbs in top {k}',
          r'for $LRC$, $G^2$, and $\Delta P(\texttt{env}|\texttt{adv})$',
          f'measuring association with *{_l1.capitalize()}* Environments {ie}', 
          end='\n'*2)
    print(f'Total Tokens in dataset: $N = {_N:,}$')
    nb_show_table(
        get_top_vals(
            adv_df.filter(items=FOCUS), 
            k=k,
            metric_filter=filter_and_sort,
            index_like='O',  # should match "POS" & "COM", but neither "NEG*"
            ).round(2).sort_values(filter_and_sort, ascending=False).set_index('l2').drop(['N', 'l1'], axis=1)
    )
    
# All data
show_top_positive(setdiff_adv, k=15)

#### Adverbs in top 15 for $LRC$, $G^2$, and $\Delta P(\texttt{env}|\texttt{adv})$ measuring association with *Complement* Environments (`set_diff`, $*\complement_{N^+}$)

Total Tokens in dataset: $N = 6,347,364$

|                    |        `f` |   `dP1` |   `dP1_simple` |   `LRC` |       `G2` |   `odds_r_disc` |    `t` |         `f1` |       `f2` |    `exp_f` |   `unexp_f` |
|:-------------------|-----------:|--------:|---------------:|--------:|-----------:|----------------:|-------:|-------------:|-----------:|-----------:|------------:|
| **increasingly**   |  17,147.00 |    0.50 |           1.00 |    7.07 |  22,976.10 |            2.37 |  65.20 | 3,173,552.00 |  17,220.00 |   8,609.65 |    8,537.35 |
| **relatively**     |  26,303.00 |    0.49 |           0.99 |    5.97 |  33,565.49 |            1.92 |  80.11 | 3,173,552.00 |  26,621.00 |  13,309.95 |   12,993.05 |
| **almost**         |  19,843.00 |    0.48 |           0.98 |    5.28 |  24,212.21 |            1.70 |  69.03 | 3

#### Adverbs in top 15 for $LRC$, $G^2$, and $\Delta P(\texttt{env}|\texttt{adv})$ measuring association with *Complement* Environments (`set_diff`, $*\complement_{N^+}$)

Total Tokens in dataset: $N = 6,347,364$

|                    |        `f` |   `dP1` |   `dP1_simple` |   `LRC` |       `G2` |   `odds_r_disc` |    `t` |         `f1` |       `f2` |    `exp_f` |   `unexp_f` |
|:-------------------|-----------:|--------:|---------------:|--------:|-----------:|----------------:|-------:|-------------:|-----------:|-----------:|------------:|
| **increasingly**   |  17,147.00 |    0.50 |           1.00 |    7.07 |  22,976.10 |            2.37 |  65.20 | 3,173,552.00 |  17,220.00 |   8,609.65 |    8,537.35 |
| **relatively**     |  26,303.00 |    0.49 |           0.99 |    5.97 |  33,565.49 |            1.92 |  80.11 | 3,173,552.00 |  26,621.00 |  13,309.95 |   12,993.05 |
| **almost**         |  19,843.00 |    0.48 |           0.98 |    5.28 |  24,212.21 |            1.70 |  69.03 | 3,173,552.00 |  20,240.00 |  10,119.59 |    9,723.41 |
| **mostly**         |   9,295.00 |    0.48 |           0.98 |    5.14 |  11,346.01 |            1.71 |  47.26 | 3,173,552.00 |   9,478.00 |   4,738.81 |    4,556.19 |
| **seemingly**      |   7,411.00 |    0.48 |           0.98 |    5.07 |   9,037.07 |            1.70 |  42.19 | 3,173,552.00 |   7,558.00 |   3,778.85 |    3,632.15 |
| **fairly**         |  17,040.00 |    0.48 |           0.98 |    5.00 |  20,307.10 |            1.61 |  63.67 | 3,173,552.00 |  17,457.00 |   8,728.14 |    8,311.86 |
| **pretty**         |  68,498.00 |    0.48 |           0.97 |    4.96 |  80,502.90 |            1.55 | 127.13 | 3,173,552.00 |  70,454.00 |  35,225.56 |   33,272.44 |
| **albeit**         |     734.00 |    0.50 |           1.00 |    4.92 |     982.83 |            2.32 |  13.49 | 3,173,552.00 |     737.00 |     368.48 |      365.52 |
| **largely**        |   7,916.00 |    0.48 |           0.98 |    4.89 |   9,476.32 |            1.63 |  43.45 | 3,173,552.00 |   8,101.00 |   4,050.33 |    3,865.67 |
| **partly**         |   3,592.00 |    0.48 |           0.98 |    4.84 |   4,379.67 |            1.70 |  29.38 | 3,173,552.00 |   3,663.00 |   1,831.42 |    1,760.58 |
| **rather**         |  16,570.00 |    0.47 |           0.97 |    4.75 |  19,253.20 |            1.53 |  62.47 | 3,173,552.00 |  17,059.00 |   8,529.15 |    8,040.85 |
| **sometimes**      |   6,493.00 |    0.47 |           0.97 |    4.58 |   7,549.67 |            1.54 |  39.12 | 3,173,552.00 |   6,682.00 |   3,340.86 |    3,152.14 |
| **also**           |  48,143.00 |    0.46 |           0.96 |    4.44 |  53,221.81 |            1.40 | 105.23 | 3,173,552.00 |  50,109.00 |  25,053.47 |   23,089.53 |
| **supposedly**     |   1,222.00 |    0.48 |           0.98 |    4.40 |   1,503.72 |            1.74 |  17.16 | 3,173,552.00 |   1,244.00 |     621.97 |      600.03 |
| **certainly**      |   4,454.00 |    0.47 |           0.97 |    4.34 |   5,085.46 |            1.48 |  32.28 | 3,173,552.00 |   4,600.00 |   2,299.91 |    2,154.09 |
| **now**            |  19,616.00 |    0.46 |           0.96 |    4.27 |  21,346.76 |            1.36 |  66.99 | 3,173,552.00 |  20,468.00 |  10,233.58 |    9,382.42 |
| **most**           | 325,174.00 |    0.47 |           0.94 |    4.03 | 344,851.77 |            1.27 | 268.29 | 3,173,552.00 | 344,378.00 | 172,181.95 |  152,992.05 |
| **allegedly**      |     694.00 |    0.48 |           0.98 |    4.00 |     850.58 |            1.71 |  12.93 | 3,173,552.00 |     707.00 |     353.49 |      340.51 |
| **understandably** |     552.00 |    0.49 |           0.99 |    4.00 |     692.56 |            1.81 |  11.58 | 3,173,552.00 |     560.00 |     279.99 |      272.01 |
| **still**          |  35,199.00 |    0.45 |           0.95 |    3.99 |  36,320.19 |            1.26 |  88.57 | 3,173,552.00 |  37,164.00 |  18,581.24 |   16,617.76 |
| **beyond**         |     639.00 |    0.48 |           0.98 |    3.94 |     782.97 |            1.71 |  12.40 | 3,173,552.00 |     651.00 |     325.49 |      313.51 |
| **admittedly**     |     574.00 |    0.48 |           0.98 |    3.91 |     708.52 |            1.74 |  11.77 | 3,173,552.00 |     584.00 |     291.99 |      282.01 |
| **undoubtedly**    |     510.00 |    0.48 |           0.98 |    3.80 |     628.74 |            1.73 |  11.09 | 3,173,552.00 |     519.00 |     259.49 |      250.51 |
| **presumably**     |     320.00 |    0.49 |           0.99 |    3.70 |     413.77 |            1.96 |   8.86 | 3,173,552.00 |     323.00 |     161.49 |      158.51 |
| **highly**         |  33,581.00 |    0.44 |           0.93 |    3.65 |  32,384.35 |            1.15 |  85.07 | 3,173,552.00 |  35,986.00 |  17,992.26 |   15,588.74 |
| **hopefully**      |     320.00 |    0.48 |           0.98 |    3.46 |     398.92 |            1.77 |   8.80 | 3,173,552.00 |     325.00 |     162.49 |      157.51 |
| **extremely**      |  41,289.00 |    0.41 |           0.91 |    3.24 |  35,860.38 |            1.02 |  91.69 | 3,173,552.00 |  45,317.00 |  22,657.57 |   18,631.43 |
| **less**           |  52,587.00 |    0.30 |           0.80 |    1.90 |  25,036.58 |            0.60 |  85.34 | 3,173,552.00 |  66,037.00 |  33,017.15 |   19,569.85 |
| **more**           | 392,003.00 |    0.23 |           0.71 |    1.25 | 107,688.86 |            0.42 | 183.76 | 3,173,552.00 | 553,922.00 | 276,949.66 |  115,053.34 |
| **very**           | 412,871.00 |    0.20 |           0.69 |    1.10 |  93,225.19 |            0.37 | 173.58 | 3,173,552.00 | 602,694.00 | 301,334.66 |  111,536.34 |



In [63]:
# Mirror Data ~ explicitly positive ~ positive trigger present
show_top_positive(mirror_adv, k=15)

#### Adverbs in top 15 for $LRC$, $G^2$, and $\Delta P(\texttt{env}|\texttt{adv})$ measuring association with *Posmir* Environments (`mirror`, $@P$)

Total Tokens in dataset: $N = 583,470$



|                |       `f` |   `dP1` |   `dP1_simple` |   `LRC` |      `G2` |   `odds_r_disc` |   `t` |       `f1` |      `f2` |   `exp_f` |   `unexp_f` |
|:---------------|----------:|--------:|---------------:|--------:|----------:|----------------:|------:|-----------:|----------:|----------:|------------:|
| **pretty**     |  5,049.00 |    0.48 |           0.98 |    4.71 |  6,024.97 |            1.61 | 34.64 | 291,729.00 |  5,176.00 |  2,587.95 |    2,461.05 |
| **rather**     |  1,753.00 |    0.48 |           0.98 |    4.64 |  2,158.90 |            1.73 | 20.55 | 291,729.00 |  1,785.00 |    892.48 |      860.52 |
| **plain**      |  1,001.00 |    0.48 |           0.98 |    4.38 |  1,240.10 |            1.76 | 15.55 | 291,729.00 |  1,018.00 |    508.99 |      492.01 |
| **somewhat**   |    937.00 |    0.48 |           0.98 |    4.34 |  1,160.12 |            1.76 | 15.04 | 291,729.00 |    953.00 |    476.49 |      460.51 |
| **fairly**     |  1,163.00 |    0.48 |           0.98 |

#### Adverbs in top 15 for $LRC$, $G^2$, and $\Delta P(\texttt{env}|\texttt{adv})$ measuring association with *Posmir* Environments (`mirror`, $@P$)

Total Tokens in dataset: $N = 583,470$

|                |       `f` |   `dP1` |   `dP1_simple` |   `LRC` |      `G2` |   `odds_r_disc` |   `t` |       `f1` |      `f2` |   `exp_f` |   `unexp_f` |
|:---------------|----------:|--------:|---------------:|--------:|----------:|----------------:|------:|-----------:|----------:|----------:|------------:|
| **pretty**     |  5,049.00 |    0.48 |           0.98 |    4.71 |  6,024.97 |            1.61 | 34.64 | 291,729.00 |  5,176.00 |  2,587.95 |    2,461.05 |
| **rather**     |  1,753.00 |    0.48 |           0.98 |    4.64 |  2,158.90 |            1.73 | 20.55 | 291,729.00 |  1,785.00 |    892.48 |      860.52 |
| **plain**      |  1,001.00 |    0.48 |           0.98 |    4.38 |  1,240.10 |            1.76 | 15.55 | 291,729.00 |  1,018.00 |    508.99 |      492.01 |
| **somewhat**   |    937.00 |    0.48 |           0.98 |    4.34 |  1,160.12 |            1.76 | 15.04 | 291,729.00 |    953.00 |    476.49 |      460.51 |
| **fairly**     |  1,163.00 |    0.48 |           0.98 |    4.31 |  1,413.04 |            1.68 | 16.70 | 291,729.00 |  1,187.00 |    593.49 |      569.51 |
| **otherwise**  |  1,426.00 |    0.47 |           0.97 |    4.07 |  1,657.11 |            1.53 | 18.33 | 291,729.00 |  1,468.00 |    733.98 |      692.02 |
| **maybe**      |    546.00 |    0.48 |           0.98 |    3.97 |    677.87 |            1.76 | 11.49 | 291,729.00 |    555.00 |    277.49 |      268.51 |
| **downright**  |    985.00 |    0.47 |           0.97 |    3.89 |  1,144.00 |            1.53 | 15.23 | 291,729.00 |  1,014.00 |    506.99 |      478.01 |
| **already**    |    860.00 |    0.47 |           0.97 |    3.76 |    989.13 |            1.50 | 14.20 | 291,729.00 |    887.00 |    443.49 |      416.51 |
| **relatively** |  1,073.00 |    0.47 |           0.97 |    3.75 |  1,210.84 |            1.45 | 15.80 | 291,729.00 |  1,111.00 |    555.49 |      517.51 |
| **almost**     |  1,065.00 |    0.46 |           0.96 |    3.67 |  1,184.44 |            1.41 | 15.69 | 291,729.00 |  1,106.00 |    552.99 |      512.01 |
| **equally**    |  1,538.00 |    0.46 |           0.95 |    3.60 |  1,642.68 |            1.32 | 18.68 | 291,729.00 |  1,611.00 |    805.48 |      732.52 |
| **perhaps**    |    732.00 |    0.47 |           0.96 |    3.52 |    819.90 |            1.43 | 13.03 | 291,729.00 |    759.00 |    379.49 |      352.51 |
| **highly**     |  1,848.00 |    0.44 |           0.93 |    3.21 |  1,788.17 |            1.15 | 19.98 | 291,729.00 |  1,978.00 |    988.98 |      859.02 |
| **slightly**   |  1,513.00 |    0.44 |           0.93 |    3.16 |  1,465.03 |            1.15 | 18.09 | 291,729.00 |  1,619.00 |    809.48 |      703.52 |
| **extremely**  |  3,575.00 |    0.42 |           0.92 |    3.14 |  3,256.93 |            1.07 | 27.34 | 291,729.00 |  3,881.00 |  1,940.46 |    1,634.54 |
| **also**       |  1,370.00 |    0.43 |           0.93 |    3.06 |  1,302.12 |            1.13 | 17.13 | 291,729.00 |  1,472.00 |    735.98 |      634.02 |
| **simply**     |  1,663.00 |    0.43 |           0.93 |    3.04 |  1,549.42 |            1.10 | 18.77 | 291,729.00 |  1,795.00 |    897.48 |      765.52 |
| **still**      |  2,706.00 |    0.41 |           0.91 |    2.93 |  2,356.33 |            1.02 | 23.50 | 291,729.00 |  2,967.00 |  1,483.47 |    1,222.53 |
| **incredibly** |  1,826.00 |    0.42 |           0.92 |    2.89 |  1,616.39 |            1.04 | 19.40 | 291,729.00 |  1,994.00 |    996.98 |      829.02 |
| **possibly**   |    339.00 |    0.46 |           0.96 |    2.88 |    366.75 |            1.34 |  8.80 | 291,729.00 |    354.00 |    177.00 |      162.00 |
| **just**       |  5,883.00 |    0.39 |           0.89 |    2.70 |  4,553.79 |            0.90 | 33.45 | 291,729.00 |  6,635.00 |  3,317.43 |    2,565.57 |
| **even**       | 12,382.00 |    0.32 |           0.81 |    1.98 |  6,616.74 |            0.65 | 42.89 | 291,729.00 | 15,220.00 |  7,609.84 |    4,772.16 |
| **very**       | 36,427.00 |    0.33 |           0.80 |    1.95 | 19,317.81 |            0.66 | 72.08 | 291,729.00 | 45,341.00 | 22,670.03 |   13,756.97 |



## Compile top NEG~adverb associations across both approximation methods

### Define the functions

In [64]:
def load_backup(
                adv_set:set,
    lower_floor: int = 100,
                loaded_path: Path = adv_am_paths['RBdirect'], 
                ) -> pd.DataFrame:
    located_paths = tuple(loaded_path.parent.glob(
        f'*ALL*min{round(SET_FLOOR//3, -2)}x*parq'))
    if any(located_paths):
        backup_df = pd.read_parquet(located_paths[0], columns=FOCUS, filters=[('l2', 'in', adv_set)])

        backup_df = backup_df.filter(like='NEG', axis=0).reset_index().set_index('l2')
        backup_df.index.name = 'adv'
        return backup_df
    else:
        return []


def uncat(df):
    cats = df.select_dtypes('category').columns
    df[cats] = df[cats].astype('string')
    # print(df.dtypes)
    return df, cats


def fill_empties(name_1, name_2, both, loaded_paths, adv_set):
    for name in (name_1, name_2):
        name = name.strip('_')
        path = loaded_paths['RBdirect'] if name == 'SET' else loaded_paths['mirror']
        if any(both[f'f_{name}'].isna()):

            floor = 100
            neg_backup = load_backup(lower_floor=floor, loaded_path=path, adv_set=adv_set)
            if not any(neg_backup):
                print('Error. Backup data not found. [in fill_empties()]')

            neg_backup.columns = (pd.Series(adjust_assoc_columns(neg_backup.columns)
                                            ) + f'_{name}').to_list()
            both, cats = uncat(both)
            neg_backup, __ = uncat(neg_backup)

            undefined_adv = both.loc[
                both[f'f_{name}'].isna(), :].index.to_list()

            both.loc[undefined_adv,
                     neg_backup.columns] = neg_backup.filter(items=undefined_adv, axis=0)

            both[cats] = both[cats].astype('category')

    return both


def combine_top(df_1: pd.DataFrame,
                name_1: str,
                df_2: pd.DataFrame,
                name_2: str,
                env_filter: str = 'NEG',
                filter_items: list = FOCUS,
                k: int = 10) -> pd.DataFrame:
    print(f'### `{TAG}` Most Negative Adverb Selections')
    top_dfs = [
        (get_top_vals(adv_df,  k=k,
                      index_like=env_filter,
                      metric_filter=['am_p1_given2',
                                     'conservative_log_ratio'])
         .sort_values('conservative_log_ratio', ascending=False))
        for adv_df in [df_1, df_2]
    ]
    for i, name in enumerate([name_1, name_2]):

        print_iter(
            [f'_{w}_' for w in top_dfs[i].l2], bullet='1.',
            header=(f'`{name}`: union of top {k} adverbs ranked by '
                    r'$LRC$ & $\Delta P(\texttt{env}|\texttt{adv})$'))
    top_adv_lists = [dx.l2.to_list() for dx in top_dfs]
    top_adv = pd.Series(top_adv_lists[0] + top_adv_lists[1]).drop_duplicates()
    # top_adv = pd.concat((top_dfs[0].l2, top_dfs[1].l2)).drop_duplicates()

    print_iter(
        [f'_{w}_' for w in top_adv], bullet='1.',
        header=f'Union of top adverbs for `{name_1}` and `{name_2}`. (Novel `{name_2}` adverbs listed last)')
    print(f'\n### `{name_1}` Adverb Associations (in initially loaded table)\n')
    df_1 = narrow_selection(df_1, top_adv, env_filter, filter_items)
    print(f'\n### `{name_2}` Adverb Associations (in initially loaded table)\n')
    df_2 = narrow_selection(df_2, top_adv, env_filter, filter_items)

    name_1, name_2 = [f"_{n.strip('_')}" for n in [name_1, name_2]]
    both = df_1.join(df_2, how="outer", lsuffix=name_1, rsuffix=name_2)

    # ! Empty cells need to be filled _before_ calculating mean
    both = fill_empties(name_1, name_2, both, adv_am_paths, adv_set=set(top_adv))
    both = force_ints(both)
    both = add_means(both)
    both = add_f_ratio(both, name_2, name_1)
    return both.sort_values('mean_dP1', ascending=False)


def add_f_ratio(df, subset_name, superset_name):
    counts = df.filter(regex=r'^[Nf][12]?').columns.str.split(
        '_').str.get(0).drop_duplicates()
    for count in counts:
        ratio_col = f'ratio_{count}{subset_name}'
        df[ratio_col] = (df[f'{count}{subset_name}']
                         / df[f'{count}{superset_name}'])
        # print(df.filter(like=count))
    return df


def add_means(both):
    for metric in (both.select_dtypes(include='number').columns.to_series()
                   .str.replace(r'_(MIR|SET)$', '', regex=True).unique()):
        both[f'mean_{snake_to_camel(metric)}'] = both.filter(
            regex=f"^{metric}").agg('mean', axis='columns')
    return both


def narrow_selection(df: pd.DataFrame,
                     top_adv: list,
                     env_filter: str = 'NEG',
                     filter_items: list = FOCUS):
    df = adjust_assoc_columns(
        df.filter(items=filter_items)
        .filter(like=env_filter, axis=0)
        .reset_index().set_index('l2')
        .filter(top_adv, axis=0)).sort_values(['LRC', 'dP1'], ascending=False)
    df.index.name = 'adv'
    nb_show_table(df.drop(['N', 'key', 'l1'], axis=1).round(
        2).sort_values(['LRC', 'dP1', ], ascending=False))

    return df

### Run it 🏃‍♀️

In [65]:
C = combine_top(setdiff_adv, 'SET',
                mirror_adv, 'MIR', k=K)

### `NEQ` Most Negative Adverb Selections

`SET`: union of top 6 adverbs ranked by $LRC$ & $\Delta P(\texttt{env}|\texttt{adv})$
1. _necessarily_
1. _that_
1. _exactly_
1. _immediately_
1. _yet_
1. _any_
1. _before_

`MIR`: union of top 6 adverbs ranked by $LRC$ & $\Delta P(\texttt{env}|\texttt{adv})$
1. _ever_
1. _any_
1. _necessarily_
1. _that_
1. _remotely_
1. _particularly_
1. _exactly_

Union of top adverbs for `SET` and `MIR`. (Novel `MIR` adverbs listed last)
1. _necessarily_
1. _that_
1. _exactly_
1. _immediately_
1. _yet_
1. _any_
1. _before_
1. _ever_
1. _remotely_
1. _particularly_

### `SET` Adverb Associations (in initially loaded table)


|                  |        `f` |   `dP1` |   `dP1_simple` |   `LRC` |       `G2` |   `odds_r_disc` |    `t` |         `f1` |       `f2` |   `exp_f` |   `unexp_f` |
|:-----------------|-----------:|--------:|---------------:|--------:|-----------:|----------------:|-------:|-------------:|-----------:|----------:|------------:|
| **neces

  both.loc[undefined_adv,


### `NEQ` Most Negative Adverb Selections

`SET`: union of top 6 adverbs ranked by $LRC$ & $\Delta P(\texttt{env}|\texttt{adv})$
1. _necessarily_
1. _that_
1. _exactly_
1. _immediately_
1. _yet_
1. _any_
1. _before_

`MIR`: union of top 6 adverbs ranked by $LRC$ & $\Delta P(\texttt{env}|\texttt{adv})$
1. _ever_
1. _any_
1. _necessarily_
1. _that_
1. _remotely_
1. _particularly_
1. _exactly_

Union of top adverbs for `SET` and `MIR`. (Novel `MIR` adverbs listed last)
1. _necessarily_
1. _that_
1. _exactly_
1. _immediately_
1. _yet_
1. _any_
1. _before_
1. _ever_
1. _remotely_
1. _particularly_

### `SET` Adverb Associations (in initially loaded table)


|                  |        `f` |   `dP1` |   `dP1_simple` |   `LRC` |       `G2` |   `odds_r_disc` |    `t` |         `f1` |       `f2` |   `exp_f` |   `unexp_f` |
|:-----------------|-----------:|--------:|---------------:|--------:|-----------:|----------------:|-------:|-------------:|-----------:|----------:|------------:|
| **necessarily**  |  42,595.00 |    0.50 |           0.99 |    6.77 |  56,251.14 |            2.17 | 102.49 | 3,173,660.00 |  42,886.00 | 21,442.85 |   21,152.15 |
| **that**         | 164,768.00 |    0.50 |           0.99 |    6.26 | 214,504.57 |            1.96 | 200.61 | 3,173,660.00 | 166,676.00 | 83,337.42 |   81,430.58 |
| **exactly**      |  43,813.00 |    0.49 |           0.98 |    5.71 |  54,870.72 |            1.81 | 103.01 | 3,173,660.00 |  44,503.00 | 22,251.35 |   21,561.65 |
| **immediately**  |  56,099.00 |    0.47 |           0.97 |    4.68 |  63,920.54 |            1.47 | 114.33 | 3,173,660.00 |  58,040.00 | 29,019.80 |   27,079.20 |
| **yet**          |  51,867.00 |    0.47 |           0.96 |    4.52 |  57,900.12 |            1.42 | 109.45 | 3,173,660.00 |  53,881.00 | 26,940.31 |   24,926.69 |
| **any**          |  15,384.00 |    0.45 |           0.95 |    3.91 |  15,851.55 |            1.26 |  58.57 | 3,173,660.00 |  16,238.00 |  8,118.94 |    7,265.06 |
| **remotely**     |   5,661.00 |    0.42 |           0.92 |    3.16 |   5,075.57 |            1.05 |  34.30 | 3,173,660.00 |   6,161.00 |  3,080.48 |    2,580.52 |
| **before**       |     308.00 |    0.45 |           0.95 |    2.68 |     326.41 |            1.30 |   8.35 | 3,173,660.00 |     323.00 |    161.50 |      146.50 |
| **particularly** |  55,527.00 |    0.23 |           0.73 |    1.37 |  16,791.84 |            0.43 |  74.04 | 3,173,660.00 |  76,162.00 | 38,080.74 |   17,446.26 |
| **ever**         |   5,932.00 |    0.05 |           0.55 |    0.12 |      91.19 |            0.08 |   6.45 | 3,173,660.00 |  10,870.00 |  5,434.96 |      497.04 |


### `MIR` Adverb Associations (in initially loaded table)


|                  |      `f` |   `dP1` |   `dP1_simple` |   `LRC` |     `G2` |   `odds_r_disc` |   `t` |       `f1` |      `f2` |   `exp_f` |   `unexp_f` |
|:-----------------|---------:|--------:|---------------:|--------:|---------:|----------------:|------:|-----------:|----------:|----------:|------------:|
| **ever**         | 4,709.00 |    0.49 |           0.98 |    5.17 | 5,883.26 |            1.79 | 33.75 | 291,732.00 |  4,786.00 |  2,392.98 |    2,316.02 |
| **any**          | 1,066.00 |    0.47 |           0.97 |    4.00 | 1,252.02 |            1.56 | 15.88 | 291,732.00 |  1,095.00 |    547.49 |      518.51 |
| **necessarily**  |   963.00 |    0.47 |           0.97 |    3.86 | 1,114.70 |            1.52 | 15.05 | 291,732.00 |    992.00 |    495.99 |      467.01 |
| **that**         | 4,308.00 |    0.45 |           0.94 |    3.66 | 4,405.21 |            1.24 | 30.91 | 291,732.00 |  4,559.00 |  2,279.48 |    2,028.52 |
| **remotely**     | 1,840.00 |    0.44 |           0.94 |    3.37 | 1,849.23 |            1.21 | 20.13 | 291,732.00 |  1,953.00 |    976.49 |      863.51 |
| **particularly** | 9,243.00 |    0.43 |           0.92 |    3.30 | 8,516.58 |            1.08 | 43.98 | 291,732.00 | 10,029.00 |  5,014.45 |    4,228.55 |
| **exactly**      |   813.00 |    0.44 |           0.94 |    2.95 |   790.27 |            1.16 | 13.27 | 291,732.00 |    869.00 |    434.50 |      378.50 |
| **yet**          |   320.00 |    0.26 |           0.76 |    0.90 |   122.77 |            0.51 |  6.18 | 291,732.00 |    419.00 |    209.50 |      110.50 |
| **immediately**  |   403.00 |    0.21 |           0.71 |    0.67 |   107.39 |            0.40 |  6.03 | 291,732.00 |    564.00 |    282.00 |      121.00 |


In [66]:
nb_show_table(C.filter(regex=r'^ratio_f2?_')
              .assign(f_minus_f2=C.ratio_f_MIR - C.ratio_f2_MIR)
              .multiply(100).round(1)
              .sort_values(['f_minus_f2', 'ratio_f_MIR'], ascending=False),
              n_dec=1, adjust_columns=False)



|                  |   `ratio_f_MIR` |   `ratio_f2_MIR` |   `f_minus_f2` |
|:-----------------|----------------:|-----------------:|---------------:|
| **ever**         |            79.4 |             44.0 |           35.4 |
| **before**       |            93.8 |             89.8 |            4.0 |
| **particularly** |            16.6 |             13.2 |            3.5 |
| **remotely**     |            32.5 |             31.7 |            0.8 |
| **any**          |             6.9 |              6.7 |            0.2 |
| **that**         |             2.6 |              2.7 |           -0.1 |
| **necessarily**  |             2.3 |              2.3 |           -0.1 |
| **exactly**      |             1.9 |              2.0 |           -0.1 |
| **yet**          |             0.6 |              0.8 |           -0.2 |
| **immediately**  |             0.7 |              1.0 |           -0.3 |




|                  |   `ratio_f_MIR` |   `ratio_f2_MIR` |   `f_minus_f2` |
|:-----------------|----------------:|-----------------:|---------------:|
| **ever**         |            79.4 |             44.0 |           35.4 |
| **before**       |            93.8 |             89.8 |            4.0 |
| **particularly** |            16.6 |             13.2 |            3.5 |
| **remotely**     |            32.5 |             31.7 |            0.8 |
| **any**          |             6.9 |              6.7 |            0.2 |
| **that**         |             2.6 |              2.7 |           -0.1 |
| **necessarily**  |             2.3 |              2.3 |           -0.1 |
| **exactly**      |             1.9 |              2.0 |           -0.1 |
| **yet**          |             0.6 |              0.8 |           -0.2 |
| **immediately**  |             0.7 |              1.0 |           -0.3 |



In [67]:
nb_show_table(
    C
    # .assign(f_percent_MIR=C.ratio_f_MIR * 100)
    .filter(regex=r'^f_.*[MS]').sort_index(axis=1, ascending=False)
    .assign(
        f_diff=C.f_SET-C.f_MIR).sort_values('f_diff', ascending=False)
    .rename(columns={'f_SET':'total negations', 
                     'f_MIR':'mirror subset negations', 
                     'f_diff': 'negations not in mirror subset'}), n_dec=0)


|                  |   `total negations` |   `mirror subset negations` |   `negations not in mirror subset` |
|:-----------------|--------------------:|----------------------------:|-----------------------------------:|
| **that**         |             164,768 |                       4,308 |                            160,460 |
| **immediately**  |              56,099 |                         403 |                             55,696 |
| **yet**          |              51,867 |                         320 |                             51,547 |
| **particularly** |              55,527 |                       9,243 |                             46,284 |
| **exactly**      |              43,813 |                         813 |                             43,000 |
| **necessarily**  |              42,595 |                         963 |                             41,632 |
| **any**          |              15,384 |                       1,066 |                             14,318 |
| **remot


|                  |   `total negations` |   `mirror subset negations` |   `negations not in mirror subset` |
|:-----------------|--------------------:|----------------------------:|-----------------------------------:|
| **that**         |             164,768 |                       4,308 |                            160,460 |
| **immediately**  |              56,099 |                         403 |                             55,696 |
| **yet**          |              51,867 |                         320 |                             51,547 |
| **particularly** |              55,527 |                       9,243 |                             46,284 |
| **exactly**      |              43,813 |                         813 |                             43,000 |
| **necessarily**  |              42,595 |                         963 |                             41,632 |
| **any**          |              15,384 |                       1,066 |                             14,318 |
| **remotely**     |               5,661 |                       1,840 |                              3,821 |
| **ever**         |               5,932 |                       4,709 |                              1,223 |
| **before**       |                 308 |                         289 |                                 19 |



#### Marginal (_Adverb Total_) Frequency Comparison


|                  |   `total adverb tokens` |   `mirror subset adverb tokens` |   `adverb tokens not in mirror subset` |
|:-----------------|------------------------:|--------------------------------:|---------------------------------------:|
| **that**         |                 166,676 |                           4,559 |                                162,117 |
| **particularly** |                  76,162 |                          10,029 |                                 66,133 |
| **immediately**  |                  58,040 |                             564 |                                 57,476 |
| **yet**          |                  53,881 |                             419 |                                 53,462 |
| **exactly**      |                  44,503 |                             869 |                                 43,634 |
| **necessarily**  |                  42,886 |                             992 |                                 41,894 |
| **any**          |                  16,238 |                           1,095 |                                 15,143 |
| **ever**         |                  10,870 |                           4,786 |                                  6,084 |
| **remotely**     |                   6,161 |                           1,953 |                                  4,208 |
| **before**       |                     323 |                             290 |                                     33 |



In [68]:
nb_show_table(
    C
    # .assign(f2_percent_MIR=C.ratio_f2_MIR * 100)
    .filter(regex=r'^f2_.*[MS]').sort_index(axis=1, ascending=False)
    .assign(
        f2_diff=C.f2_SET-C.f2_MIR).sort_values('f2_diff', ascending=False)
    .rename(columns={'f2_SET':'total adverb tokens', 
                     'f2_MIR':'mirror subset adverb tokens', 
                     'f2_diff': 'adverb tokens not in mirror subset'}), n_dec=0)


|                  |   `total adverb tokens` |   `mirror subset adverb tokens` |   `adverb tokens not in mirror subset` |
|:-----------------|------------------------:|--------------------------------:|---------------------------------------:|
| **that**         |                 166,676 |                           4,559 |                                162,117 |
| **particularly** |                  76,162 |                          10,029 |                                 66,133 |
| **immediately**  |                  58,040 |                             564 |                                 57,476 |
| **yet**          |                  53,881 |                             419 |                                 53,462 |
| **exactly**      |                  44,503 |                             869 |                                 43,634 |
| **necessarily**  |                  42,886 |                             992 |                                 41,894 |
| **any**          |   

In [69]:
full_C = C.copy()
main_cols_ordered = pd.concat((*[C.filter(like=m).columns.to_series() for m in ('LRC', 'P1', 'G2')],
                               *[C.filter(regex=f'^{f}_').columns.to_series() for f in ['f', 'f1', 'f2'] ]) 
                              ).to_list()
# print_iter([f'`{c}`' for c in main_cols_ordered], bullet='1.', header='Main Columns')
main_C = C[[c for c in main_cols_ordered if c in C.columns]]
nb_show_table(main_C.sort_values('mean_dP1', ascending=False), return_df=True)


|                  |   `LRC_SET` |   `LRC_MIR` |   `mean_LRC` |   `dP1_SET` |   `dP1_simple_SET` |   `dP1_MIR` |   `dP1_simple_MIR` |   `mean_dP1` |   `mean_dP1Simple` |   `G2_SET` |   `G2_MIR` |   `mean_G2` |    `f_SET` |   `f_MIR` |     `f1_SET` |   `f1_MIR` |   `f2_SET` |   `f2_MIR` |
|:-----------------|------------:|------------:|-------------:|------------:|-------------------:|------------:|-------------------:|-------------:|-------------------:|-----------:|-----------:|------------:|-----------:|----------:|-------------:|-----------:|-----------:|-----------:|
| **before**       |        2.68 |        6.21 |         4.44 |        0.45 |               0.95 |        0.83 |               1.00 |         0.81 |               0.98 |     326.41 |   1,006.68 |      666.55 |     308.00 |    289.00 | 3,173,660.00 | 291,732.00 |     323.00 |     290.00 |
| **necessarily**  |        6.77 |        3.86 |         5.31 |        0.50 |               0.99 |        0.47 |               0.97 

Unnamed: 0,`LRC_SET`,`LRC_MIR`,`mean_LRC`,`dP1_SET`,`dP1_simple_SET`,`dP1_MIR`,...,`f_SET`,`f_MIR`,`f1_SET`,`f1_MIR`,`f2_SET`,`f2_MIR`
**before**,2.68,6.21,4.44,0.45,0.95,0.83,...,308,289,3173660,291732,323,290
**necessarily**,6.77,3.86,5.31,0.5,0.99,0.47,...,42595,963,3173660,291732,42886,992
**that**,6.26,3.66,4.96,0.5,0.99,0.45,...,164768,4308,3173660,291732,166676,4559
**exactly**,5.71,2.95,4.33,0.49,0.98,0.44,...,43813,813,3173660,291732,44503,869
**any**,3.91,4.0,3.96,0.45,0.95,0.47,...,15384,1066,3173660,291732,16238,1095
**remotely**,3.16,3.37,3.27,0.42,0.92,0.44,...,5661,1840,3173660,291732,6161,1953
**yet**,4.52,0.9,2.71,0.47,0.96,0.26,...,51867,320,3173660,291732,53881,419
**immediately**,4.68,0.67,2.68,0.47,0.97,0.21,...,56099,403,3173660,291732,58040,564
**particularly**,1.37,3.3,2.33,0.23,0.73,0.43,...,55527,9243,3173660,291732,76162,10029
**ever**,0.12,5.17,2.65,0.05,0.55,0.49,...,5932,4709,3173660,291732,10870,4786



|                  |   `LRC_SET` |   `LRC_MIR` |   `mean_LRC` |   `dP1_SET` |   `dP1_simple_SET` |   `dP1_MIR` |   `dP1_simple_MIR` |   `mean_dP1` |   `mean_dP1Simple` |   `G2_SET` |   `G2_MIR` |   `mean_G2` |    `f_SET` |   `f_MIR` |     `f1_SET` |   `f1_MIR` |   `f2_SET` |   `f2_MIR` |
|:-----------------|------------:|------------:|-------------:|------------:|-------------------:|------------:|-------------------:|-------------:|-------------------:|-----------:|-----------:|------------:|-----------:|----------:|-------------:|-----------:|-----------:|-----------:|
| **before**       |        2.68 |        6.21 |         4.44 |        0.45 |               0.95 |        0.83 |               1.00 |         0.81 |               0.98 |     326.41 |   1,006.68 |      666.55 |     308.00 |    289.00 | 3,173,660.00 | 291,732.00 |     323.00 |     290.00 |
| **necessarily**  |        6.77 |        3.86 |         5.31 |        0.50 |               0.99 |        0.47 |               0.97 |         0.73 |               0.98 |  56,251.14 |   1,114.70 |   28,682.92 |  42,595.00 |    963.00 | 3,173,660.00 | 291,732.00 |  42,886.00 |     992.00 |
| **that**         |        6.26 |        3.66 |         4.96 |        0.50 |               0.99 |        0.45 |               0.94 |         0.72 |               0.97 | 214,504.57 |   4,405.21 |  109,454.89 | 164,768.00 |  4,308.00 | 3,173,660.00 | 291,732.00 | 166,676.00 |   4,559.00 |
| **exactly**      |        5.71 |        2.95 |         4.33 |        0.49 |               0.98 |        0.44 |               0.94 |         0.71 |               0.96 |  54,870.72 |     790.27 |   27,830.50 |  43,813.00 |    813.00 | 3,173,660.00 | 291,732.00 |  44,503.00 |     869.00 |
| **any**          |        3.91 |        4.00 |         3.96 |        0.45 |               0.95 |        0.47 |               0.97 |         0.71 |               0.96 |  15,851.55 |   1,252.02 |    8,551.79 |  15,384.00 |  1,066.00 | 3,173,660.00 | 291,732.00 |  16,238.00 |   1,095.00 |
| **remotely**     |        3.16 |        3.37 |         3.27 |        0.42 |               0.92 |        0.44 |               0.94 |         0.68 |               0.93 |   5,075.57 |   1,849.23 |    3,462.40 |   5,661.00 |  1,840.00 | 3,173,660.00 | 291,732.00 |   6,161.00 |   1,953.00 |
| **yet**          |        4.52 |        0.90 |         2.71 |        0.47 |               0.96 |        0.26 |               0.76 |         0.61 |               0.86 |  57,900.12 |     122.77 |   29,011.45 |  51,867.00 |    320.00 | 3,173,660.00 | 291,732.00 |  53,881.00 |     419.00 |
| **immediately**  |        4.68 |        0.67 |         2.68 |        0.47 |               0.97 |        0.21 |               0.71 |         0.59 |               0.84 |  63,920.54 |     107.39 |   32,013.96 |  56,099.00 |    403.00 | 3,173,660.00 | 291,732.00 |  58,040.00 |     564.00 |
| **particularly** |        1.37 |        3.30 |         2.33 |        0.23 |               0.73 |        0.43 |               0.92 |         0.58 |               0.83 |  16,791.84 |   8,516.58 |   12,654.21 |  55,527.00 |  9,243.00 | 3,173,660.00 | 291,732.00 |  76,162.00 |  10,029.00 |
| **ever**         |        0.12 |        5.17 |         2.65 |        0.05 |               0.55 |        0.49 |               0.98 |         0.52 |               0.76 |      91.19 |   5,883.26 |    2,987.23 |   5,932.00 |  4,709.00 | 3,173,660.00 | 291,732.00 |  10,870.00 |   4,786.00 |



## Save full adverb selection as `.csv`

In [70]:
combined_top_csv_output = TOP_AM_TAG_DIR / \
    f'{TAG}-Top{K}_NEG-ADV_combined.{timestamp_today()}.csv'
print(
    f'Saving Combined "Most Negative Adverbs" AM table as csv:  \n> `{combined_top_csv_output}`')

C.to_csv(combined_top_csv_output, float_format='{:.4f}'.format)

C

Saving Combined "Most Negative Adverbs" AM table as csv:  
> `/share/compling/projects/sanpi/results/top_AM/NEQ/NEQ-Top6_NEG-ADV_combined.2024-07-25.csv`


Unnamed: 0_level_0,key_SET,f_SET,dP1_SET,dP1_simple_SET,LRC_SET,G2_SET,...,mean_expF,mean_unexpF,ratio_f_MIR,ratio_N_MIR,ratio_f1_MIR,ratio_f2_MIR
adv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
before,NEGany~before,308,0.45,0.95,2.68,326.41,...,105.6,192.9,0.94,0.27,0.09,0.9
necessarily,NEGany~necessarily,42595,0.5,0.99,6.77,56251.14,...,10969.42,10809.58,0.02,0.09,0.09,0.02
that,NEGany~that,164768,0.5,0.99,6.26,214504.57,...,42808.45,41729.55,0.03,0.09,0.09,0.03
exactly,NEGany~exactly,43813,0.49,0.98,5.71,54870.72,...,11342.92,10970.08,0.02,0.09,0.09,0.02
any,NEGany~any,15384,0.45,0.95,3.91,15851.55,...,4333.22,3891.78,0.07,0.09,0.09,0.07
remotely,NEGany~remotely,5661,0.42,0.92,3.16,5075.57,...,2028.48,1722.02,0.33,0.09,0.09,0.32
yet,NEGany~yet,51867,0.47,0.96,4.52,57900.12,...,13574.91,12518.59,0.01,0.09,0.09,0.01
immediately,NEGany~immediately,56099,0.47,0.97,4.68,63920.54,...,14650.9,13600.1,0.01,0.09,0.09,0.01
particularly,NEGany~particularly,55527,0.23,0.73,1.37,16791.84,...,21547.59,10837.41,0.17,0.09,0.09,0.13
ever,NEGany~ever,5932,0.05,0.55,0.12,91.19,...,3913.97,1406.53,0.79,0.09,0.09,0.44


Save `all-columns`, `means`, and `MAIN` as markdown formatted tables

In [71]:
C.to_markdown(floatfmt=',.2f', intfmt=',', buf=TOP_AM_TAG_DIR /
              f'Top{K}_NEG-ADV_combined_all-columns_{timestamp_today()}.md')
C.filter(like='mean_').to_markdown(floatfmt=',.2f', intfmt=',', buf=TOP_AM_TAG_DIR /
                                   f'Top{K}_NEG-ADV_combined_means_{timestamp_today()}.md')
C[main_cols_ordered].to_markdown(floatfmt=',.2f', intfmt=',', buf=TOP_AM_TAG_DIR /
                                 f'Top{K}_NEG-ADV_combined_MAIN_{timestamp_today()}.md')

In [72]:
nb_show_table(C[main_cols_ordered])


|                  |   `LRC_SET` |   `LRC_MIR` |   `mean_LRC` |   `dP1_SET` |   `dP1_simple_SET` |   `dP1_MIR` |   `dP1_simple_MIR` |   `mean_dP1` |   `mean_dP1Simple` |   `G2_SET` |   `G2_MIR` |   `mean_G2` |    `f_SET` |   `f_MIR` |     `f1_SET` |   `f1_MIR` |   `f2_SET` |   `f2_MIR` |
|:-----------------|------------:|------------:|-------------:|------------:|-------------------:|------------:|-------------------:|-------------:|-------------------:|-----------:|-----------:|------------:|-----------:|----------:|-------------:|-----------:|-----------:|-----------:|
| **before**       |        2.68 |        6.21 |         4.44 |        0.45 |               0.95 |        0.83 |               1.00 |         0.81 |               0.98 |     326.41 |   1,006.68 |      666.55 |     308.00 |    289.00 | 3,173,660.00 | 291,732.00 |     323.00 |     290.00 |
| **necessarily**  |        6.77 |        3.86 |         5.31 |        0.50 |               0.99 |        0.47 |               0.97 


|                  |   `LRC_SET` |   `LRC_MIR` |   `mean_LRC` |   `dP1_SET` |   `dP1_simple_SET` |   `dP1_MIR` |   `dP1_simple_MIR` |   `mean_dP1` |   `mean_dP1Simple` |   `G2_SET` |   `G2_MIR` |   `mean_G2` |    `f_SET` |   `f_MIR` |     `f1_SET` |   `f1_MIR` |   `f2_SET` |   `f2_MIR` |
|:-----------------|------------:|------------:|-------------:|------------:|-------------------:|------------:|-------------------:|-------------:|-------------------:|-----------:|-----------:|------------:|-----------:|----------:|-------------:|-----------:|-----------:|-----------:|
| **before**       |        2.68 |        6.21 |         4.44 |        0.45 |               0.95 |        0.83 |               1.00 |         0.81 |               0.98 |     326.41 |   1,006.68 |      666.55 |     308.00 |    289.00 | 3,173,660.00 | 291,732.00 |     323.00 |     290.00 |
| **necessarily**  |        6.77 |        3.86 |         5.31 |        0.50 |               0.99 |        0.47 |               0.97 |         0.73 |               0.98 |  56,251.14 |   1,114.70 |   28,682.92 |  42,595.00 |    963.00 | 3,173,660.00 | 291,732.00 |  42,886.00 |     992.00 |
| **that**         |        6.26 |        3.66 |         4.96 |        0.50 |               0.99 |        0.45 |               0.94 |         0.72 |               0.97 | 214,504.57 |   4,405.21 |  109,454.89 | 164,768.00 |  4,308.00 | 3,173,660.00 | 291,732.00 | 166,676.00 |   4,559.00 |
| **exactly**      |        5.71 |        2.95 |         4.33 |        0.49 |               0.98 |        0.44 |               0.94 |         0.71 |               0.96 |  54,870.72 |     790.27 |   27,830.50 |  43,813.00 |    813.00 | 3,173,660.00 | 291,732.00 |  44,503.00 |     869.00 |
| **any**          |        3.91 |        4.00 |         3.96 |        0.45 |               0.95 |        0.47 |               0.97 |         0.71 |               0.96 |  15,851.55 |   1,252.02 |    8,551.79 |  15,384.00 |  1,066.00 | 3,173,660.00 | 291,732.00 |  16,238.00 |   1,095.00 |
| **remotely**     |        3.16 |        3.37 |         3.27 |        0.42 |               0.92 |        0.44 |               0.94 |         0.68 |               0.93 |   5,075.57 |   1,849.23 |    3,462.40 |   5,661.00 |  1,840.00 | 3,173,660.00 | 291,732.00 |   6,161.00 |   1,953.00 |
| **yet**          |        4.52 |        0.90 |         2.71 |        0.47 |               0.96 |        0.26 |               0.76 |         0.61 |               0.86 |  57,900.12 |     122.77 |   29,011.45 |  51,867.00 |    320.00 | 3,173,660.00 | 291,732.00 |  53,881.00 |     419.00 |
| **immediately**  |        4.68 |        0.67 |         2.68 |        0.47 |               0.97 |        0.21 |               0.71 |         0.59 |               0.84 |  63,920.54 |     107.39 |   32,013.96 |  56,099.00 |    403.00 | 3,173,660.00 | 291,732.00 |  58,040.00 |     564.00 |
| **particularly** |        1.37 |        3.30 |         2.33 |        0.23 |               0.73 |        0.43 |               0.92 |         0.58 |               0.83 |  16,791.84 |   8,516.58 |   12,654.21 |  55,527.00 |  9,243.00 | 3,173,660.00 | 291,732.00 |  76,162.00 |  10,029.00 |
| **ever**         |        0.12 |        5.17 |         2.65 |        0.05 |               0.55 |        0.49 |               0.98 |         0.52 |               0.76 |      91.19 |   5,883.26 |    2,987.23 |   5,932.00 |  4,709.00 | 3,173,660.00 | 291,732.00 |  10,870.00 |   4,786.00 |

