# Identifying Adverbs with Strongest Negative Environment Associations

In [78]:
from pathlib import Path

import pandas as pd

from source.utils import PKL_SUFF
from source.utils.associate import AM_DF_DIR, TOP_AM_DIR, adjust_assoc_columns
from source.utils.general import print_iter, snake_to_camel, timestamp_today

SET_FLOOR = 1000
MIR_FLOOR = 200
K = 5

# for loading `polar/*/bigram/*` tables
bigram_floor = 200
mirror_floor = 50

Set columns and diplay settings

In [79]:
FOCUS = ['f',
         'am_p1_given2', 'conservative_log_ratio',
         'am_log_likelihood',
        #  'mutual_information', 'am_odds_ratio_disc', 't_score',
         'N', 'f1', 'f2', 'E11', 'unexpected_f', 
         'l1', 'l2']
pd.set_option('display.max_colwidth', 20)
pd.set_option('display.max_columns', 12)
pd.set_option('display.width', 90)
pd.set_option("display.precision", 2)
pd.set_option("styler.format.precision", 2)
pd.set_option("styler.format.thousands", ",")
pd.set_option("display.float_format", '{:,.2f}'.format)
# pd.set_option("styler.render.repr", "html")

In [80]:
def force_ints(_df):
    count_cols = _df.filter(regex=r'total|^[fN]').columns
    _df[count_cols] = _df[count_cols].astype('int')
    # _df[count_cols] = _df[:, count_cols].astype('int64')
    # print(_df.dtypes.to_frame('dtypes'))
    return _df

In [81]:
def nb_show_table(df, n_dec: int = 2,
                  adjust_columns: bool = True,
                   outpath:Path=None, 
                   return_df:bool=False) -> None: 
    _df = df.copy()
    if adjust_columns: 
        _df = adjust_assoc_columns(_df)
    _df.columns = [f'`{c}`' for c in _df.columns]
    _df.index = [f'**{r}**' for r in _df.index ]
    table = _df.to_markdown(floatfmt=f',.{n_dec}f', intfmt=',')
    if outpath:
        outpath.write_text(table)

    print(f'\n{table}\n')
    return (_df if return_df else None)

## Set paths and load adverb association tables

In [82]:
def update_index(df, pat_name:str = None):
    neg_env_name = df.filter(like='NEG', axis=0).l1[0]
    # > will be either `NEGATED` or `NEGMIR`
    #   both are shortened to just `NEG` for the keys in their separate dataframes
    # > replace to avoid ambiguity in `key` values when combined
    #! some filtering relies on 'NEG', so have to keep that prefix
    index_update = pat_name or ('NEGmir' if neg_env_name.endswith('MIR') else 'NEGany')
    df.index = df.index.str.replace('NEG', index_update)
    return df

In [83]:
POLAR_DIR = AM_DF_DIR.joinpath('polar')

polar_adv_dirs = []
# results/assoc_df/polar/RBdirect/adv/extra/polarized-adv_35f-7c_min5000x_extra.pkl.gz
adv_am_paths = {
    p.name: tuple(
        p.joinpath('adv/extra').glob(
            f'*35f-7c_min{SET_FLOOR if p.name == "RBdirect" else MIR_FLOOR}x*{PKL_SUFF}')
    )[0]
    for p in POLAR_DIR.iterdir()}

setdiff_adv = update_index(pd.read_pickle(adv_am_paths['RBdirect']))
mirror_adv = update_index(pd.read_pickle(adv_am_paths['NEGmirror']))
nb_show_table(setdiff_adv.sample(K//2).sort_values('conservative_log_ratio', ascending=False)[FOCUS])


|                   |    `f` |   `dP1` |   `LRC` |    `G2` |        `N` |       `f1` |   `f2` |   `exp_f` |   `unexp_f` | `l1`       | `l2`      |
|:------------------|-------:|--------:|--------:|--------:|-----------:|-----------:|-------:|----------:|------------:|:-----------|:----------|
| **COM~initially** | 23,770 |   -0.00 |    0.00 |   -2.21 | 86,330,752 | 83,102,035 | 24,740 | 23,814.74 |      -44.74 | COMPLEMENT | initially |
| **COM~publicly**  | 32,804 |   -0.01 |   -0.31 | -177.30 | 86,330,752 | 83,102,035 | 34,594 | 33,300.21 |     -496.21 | COMPLEMENT | publicly  |



In [84]:
nb_show_table(mirror_adv.sample(K//2).sort_values('conservative_log_ratio', ascending=False)[FOCUS])


|               |    `f` |   `dP1` |   `LRC` |     `G2` |       `N` |      `f1` |   `f2` |   `exp_f` |   `unexp_f` | `l1`   | `l2`   |
|:--------------|-------:|--------:|--------:|---------:|----------:|----------:|-------:|----------:|------------:|:-------|:-------|
| **POS~maybe** |  2,573 |    0.16 |    3.43 |   846.03 | 1,761,853 | 1,472,036 |  2,581 |  2,156.44 |      416.56 | POSMIR | maybe  |
| **POS~just**  | 27,625 |    0.14 |    2.60 | 5,785.97 | 1,761,853 | 1,472,036 | 28,371 | 23,704.10 |    3,920.90 | POSMIR | just   |



## Calculate "Most Negative" Adverbs for each Polarity Approximation

In [85]:
def get_top_vals(df: pd.DataFrame,
                 index_like: str = 'NEG',
                 metric_filter: str | list = ['am_p1_given2', 'conservative_log_ratio'],
                 k: int = 10,
                 val_col: str = None,
                 ignore_neg_adv: bool = True):
    env_df = df.copy().loc[df.conservative_log_ratio >=
                           1].filter(like=index_like, axis=0)
    if ignore_neg_adv:
        env_df = env_df.loc[~df.l2.isin(
            ("n't", 'not', 'barely', 'never', 'no', 'none')), :]
    if isinstance(metric_filter, str):
        metric_filter = [metric_filter]

    top = pd.concat([env_df.nlargest(k, m) for m in metric_filter]
                    ).drop_duplicates(keep='first')

    if val_col:
        top = top[[val_col] + metric_filter]

    return top.sort_values(metric_filter, ascending=False)


[setdiff_top15, mirror_top15] = [
    get_top_vals(adv_df, k=15)
    for adv_df in (setdiff_adv, mirror_adv)
]
nb_show_table(setdiff_top15.filter(items=FOCUS).reset_index())


|        | `key`              |     `f` |   `dP1` |   `LRC` |       `G2` |        `N` |      `f1` |    `f2` |   `exp_f` |   `unexp_f` | `l1`    | `l2`        |
|:-------|:-------------------|--------:|--------:|--------:|-----------:|-----------:|----------:|--------:|----------:|------------:|:--------|:------------|
| **0**  | NEGany~necessarily |  42,708 |    0.72 |    6.23 | 219,003.46 | 86,330,752 | 3,226,213 |  56,694 |  2,118.68 |   40,589.32 | NEGATED | necessarily |
| **1**  | NEGany~exactly     |  43,635 |    0.67 |    5.90 | 214,404.20 | 86,330,752 | 3,226,213 |  61,599 |  2,301.98 |   41,333.02 | NEGATED | exactly     |
| **2**  | NEGany~that        | 165,411 |    0.63 |    5.62 | 781,016.11 | 86,330,752 | 3,226,213 | 250,392 |  9,357.24 |  156,053.76 | NEGATED | that        |
| **3**  | NEGany~immediately |  57,319 |    0.52 |    4.96 | 239,462.58 | 86,330,752 | 3,226,213 | 103,177 |  3,855.76 |   53,463.24 | NEGATED | immediately |
| **4**  | NEGany~yet         |  52,546

15 Most Negatively Associated Adverbs for full dataset (_Absent Negative_ approximation) as ranked by $\Delta P(1|2)$ (`dP1`) and $LRC$

|        | `key`              |     `f` |   `dP1` |   `LRC` |       `G2` |        `N` |      `f1` |    `f2` |   `exp_f` |   `unexp_f` | `l1`    | `l2`        |
|:-------|:-------------------|--------:|--------:|--------:|-----------:|-----------:|----------:|--------:|----------:|------------:|:--------|:------------|
| **0**  | NEGany~necessarily |  42,708 |    0.72 |    6.23 | 219,003.46 | 86,330,752 | 3,226,213 |  56,694 |  2,118.68 |   40,589.32 | NEGATED | necessarily |
| **1**  | NEGany~exactly     |  43,635 |    0.67 |    5.90 | 214,404.20 | 86,330,752 | 3,226,213 |  61,599 |  2,301.98 |   41,333.02 | NEGATED | exactly     |
| **2**  | NEGany~that        | 165,411 |    0.63 |    5.62 | 781,016.11 | 86,330,752 | 3,226,213 | 250,392 |  9,357.24 |  156,053.76 | NEGATED | that        |
| **3**  | NEGany~immediately |  57,319 |    0.52 |    4.96 | 239,462.58 | 86,330,752 | 3,226,213 | 103,177 |  3,855.76 |   53,463.24 | NEGATED | immediately |
| **4**  | NEGany~yet         |  52,546 |    0.48 |    4.74 | 209,055.78 | 86,330,752 | 3,226,213 | 101,707 |  3,800.83 |   48,745.17 | NEGATED | yet         |
| **5**  | NEGany~terribly    |  18,054 |    0.22 |    3.09 |  42,704.93 | 86,330,752 | 3,226,213 |  70,174 |  2,622.43 |   15,431.57 | NEGATED | terribly    |
| **6**  | NEGany~remotely    |   5,679 |    0.22 |    3.03 |  13,354.33 | 86,330,752 | 3,226,213 |  22,194 |    829.40 |    4,849.60 | NEGATED | remotely    |
| **7**  | NEGany~only        | 114,070 |    0.21 |    3.04 | 261,936.36 | 86,330,752 | 3,226,213 | 464,168 | 17,346.13 |   96,723.87 | NEGATED | only        |
| **8**  | NEGany~altogether  |   4,575 |    0.18 |    2.75 |   9,468.00 | 86,330,752 | 3,226,213 |  20,636 |    771.17 |    3,803.82 | NEGATED | altogether  |
| **9**  | NEGany~entirely    |  63,708 |    0.17 |    2.74 | 125,925.14 | 86,330,752 | 3,226,213 | 303,833 | 11,354.35 |   52,353.65 | NEGATED | entirely    |
| **10** | NEGany~overly      |  24,707 |    0.17 |    2.66 |  46,993.58 | 86,330,752 | 3,226,213 | 122,058 |  4,561.35 |   20,145.65 | NEGATED | overly      |
| **11** | NEGany~merely      |   5,944 |    0.13 |    2.26 |   9,223.66 | 86,330,752 | 3,226,213 |  35,608 |  1,330.68 |    4,613.32 | NEGATED | merely      |
| **12** | NEGany~any         |  15,492 |    0.13 |    2.28 |  23,683.00 | 86,330,752 | 3,226,213 |  94,152 |  3,518.50 |   11,973.50 | NEGATED | any         |
| **13** | NEGany~always      | 104,605 |    0.12 |    2.28 | 157,437.56 | 86,330,752 | 3,226,213 | 651,053 | 24,330.10 |   80,274.90 | NEGATED | always      |
| **14** | NEGany~directly    |   8,317 |    0.12 |    2.13 |  11,654.57 | 86,330,752 | 3,226,213 |  54,441 |  2,034.48 |    6,282.52 | NEGATED | directly    |

In [86]:
nb_show_table(mirror_top15.filter(items=FOCUS).reset_index())


|        | `key`                |   `f` |   `dP1` |   `LRC` |      `G2` |       `N` |    `f1` |   `f2` |   `exp_f` |   `unexp_f` | `l1`   | `l2`          |
|:-------|:---------------------|------:|--------:|--------:|----------:|----------:|--------:|-------:|----------:|------------:|:-------|:--------------|
| **0**  | NEGmir~before        |   288 |    0.84 |    1.31 |  1,039.94 | 1,761,853 | 289,770 |    288 |     47.37 |      240.63 | NEGMIR | before        |
| **1**  | NEGmir~ever          | 4,688 |    0.77 |    5.73 | 14,624.92 | 1,761,853 | 289,770 |  5,027 |    826.79 |    3,861.21 | NEGMIR | ever          |
| **2**  | NEGmir~any           | 1,066 |    0.74 |    4.88 |  3,151.64 | 1,761,853 | 289,770 |  1,178 |    193.74 |      872.26 | NEGMIR | any           |
| **3**  | NEGmir~longer        |   802 |    0.74 |    4.71 |  2,350.18 | 1,761,853 | 289,770 |    891 |    146.54 |      655.46 | NEGMIR | longer        |
| **4**  | NEGmir~necessarily   |   960 |    0.71 |    4.47 |  

### _Previous_ Before Additional Filtering

15 Most Negatively Associated Adverbs for `mirror` subset (_Present Positive_ approximation) as ranked by $\Delta P(1|2)$ (`dP1`) and $LRC$

|        | `key`                |   `f` |   `dP1` |   `LRC` |      `G2` |       `N` |    `f1` |   `f2` |   `exp_f` |   `unexp_f` | `l1`   | `l2`          |
|:-------|:---------------------|------:|--------:|--------:|----------:|----------:|--------:|-------:|----------:|------------:|:-------|:--------------|
| **0**  | NEGmir~before        |   290 |    0.84 |    5.11 |  1,080.52 | 2,032,082 | 293,963 |    294 |     42.53 |      247.47 | NEGMIR | before        |
| **1**  | NEGmir~ever          | 4,718 |    0.77 |    5.57 | 15,340.34 | 2,032,082 | 293,963 |  5,179 |    749.20 |    3,968.80 | NEGMIR | ever          |
| **2**  | NEGmir~exactly       |   813 |    0.59 |    3.51 |  1,939.47 | 2,032,082 | 293,963 |  1,114 |    161.15 |      651.85 | NEGMIR | exactly       |
| **3**  | NEGmir~any           | 1,082 |    0.57 |    3.48 |  2,511.26 | 2,032,082 | 293,963 |  1,514 |    219.02 |      862.98 | NEGMIR | any           |
| **4**  | NEGmir~remotely      | 1,846 |    0.54 |    3.35 |  4,009.84 | 2,032,082 | 293,963 |  2,717 |    393.04 |    1,452.96 | NEGMIR | remotely      |
| **5**  | NEGmir~particularly  | 9,278 |    0.48 |    3.15 | 17,999.07 | 2,032,082 | 293,963 | 14,954 |  2,163.26 |    7,114.74 | NEGMIR | particularly  |
| **6**  | NEGmir~that          | 4,338 |    0.44 |    2.86 |  7,632.21 | 2,032,082 | 293,963 |  7,472 |  1,080.91 |    3,257.09 | NEGMIR | that          |
| **7**  | NEGmir~necessarily   |   971 |    0.43 |    2.66 |  1,688.91 | 2,032,082 | 293,963 |  1,681 |    243.18 |      727.82 | NEGMIR | necessarily   |
| **8**  | NEGmir~inherently    | 2,872 |    0.36 |    2.42 |  4,160.38 | 2,032,082 | 293,963 |  5,649 |    817.19 |    2,054.81 | NEGMIR | inherently    |
| **9**  | NEGmir~overtly       |   392 |    0.29 |    1.71 |    443.78 | 2,032,082 | 293,963 |    898 |    129.91 |      262.09 | NEGMIR | overtly       |
| **10** | NEGmir~intrinsically |   432 |    0.29 |    1.73 |    487.95 | 2,032,082 | 293,963 |    991 |    143.36 |      288.64 | NEGMIR | intrinsically |
| **11** | NEGmir~especially    | 1,573 |    0.21 |    1.49 |  1,232.03 | 2,032,082 | 293,963 |  4,400 |    636.51 |      936.49 | NEGMIR | especially    |
| **12** | NEGmir~yet           |   320 |    0.21 |    1.18 |    242.23 | 2,032,082 | 293,963 |    909 |    131.50 |      188.50 | NEGMIR | yet           |
| **13** | NEGmir~fully         | 1,668 |    0.18 |    1.31 |  1,086.24 | 2,032,082 | 293,963 |  5,084 |    735.46 |      932.54 | NEGMIR | fully         |
| **14** | NEGmir~terribly      | 1,579 |    0.16 |    1.14 |    847.65 | 2,032,082 | 293,963 |  5,218 |    754.84 |      824.16 | NEGMIR | terribly      |

### Or here, the least "negative"/most "non-negative"

In [87]:
def show_top_positive(adv_df, 
                      k:int=15, 
                      filter_and_sort:list=['conservative_log_ratio', 
                                            'am_log_likelihood', 
                                            'am_p1_given2']):
    
    _l1 = adv_df.filter(like='O', axis=0).l1.iat[0].lower().strip()
    _N = int(adv_df.N.iat[0])
    ie = '(`set_diff`, $*\complement_{N^+}$)' if _l1.startswith("com") else '(`mirror`, $@P$)'
    print(f'#### Adverbs in top {k}',
          r'for $LRC$, $G^2$, and $\Delta P(\texttt{env}|\texttt{adv})$',
          f'measuring association with *{_l1.capitalize()}* Environments {ie}', 
          end='\n'*2)
    print(f'Total Tokens in dataset: $N = {_N:,}$')
    nb_show_table(
        get_top_vals(
            adv_df.filter(items=FOCUS+['am_p2_given1', 'am_p1_given2_simple']), 
            k=k,
            metric_filter=filter_and_sort,
            index_like='O',  # should match "POS" & "COM", but neither "NEG*"
            ).round(2).sort_values(filter_and_sort, ascending=False).set_index('l2').drop(['N', 'l1'], axis=1)
    )
    
# All data
show_top_positive(setdiff_adv, k=15)

#### Adverbs in top 15 for $LRC$, $G^2$, and $\Delta P(\texttt{env}|\texttt{adv})$ measuring association with *Complement* Environments (`set_diff`, $*\complement_{N^+}$)

Total Tokens in dataset: $N = 86,330,752$

|                    |          `f` |   `dP1` |   `LRC` |       `G2` |          `f1` |         `f2` |      `exp_f` |   `unexp_f` |   `dP2` |   `dP1_simple` |
|:-------------------|-------------:|--------:|--------:|-----------:|--------------:|-------------:|-------------:|------------:|--------:|---------------:|
| **increasingly**   |   404,356.00 |    0.04 |    6.00 |  29,076.69 | 83,102,035.00 |   404,521.00 |   389,392.16 |   14,963.84 |    0.00 |           1.00 |
| **relatively**     |   626,369.00 |    0.04 |    5.24 |  42,957.87 | 83,102,035.00 |   626,884.00 |   603,438.92 |   22,930.08 |    0.01 |           1.00 |
| **almost**         |   466,468.00 |    0.04 |    4.85 |  31,107.72 | 83,102,035.00 |   466,967.00 |   449,502.72 |   16,965.28 |    0.01 |           1.

#### Adverbs in top 15 for $LRC$, $G^2$, and $\Delta P(\texttt{env}|\texttt{adv})$ measuring association with *Complement* Environments (`set_diff`, $*\complement_{N^+}$)

Total Tokens in dataset: $N = 86,330,752$

|                    |          `f` |   `dP1` |   `LRC` |       `G2` |          `f1` |         `f2` |      `exp_f` |   `unexp_f` |   `dP2` |   `dP1_simple` |
|:-------------------|-------------:|--------:|--------:|-----------:|--------------:|-------------:|-------------:|------------:|--------:|---------------:|
| **increasingly**   |   404,356.00 |    0.04 |    6.00 |  29,076.69 | 83,102,035.00 |   404,521.00 |   389,392.16 |   14,963.84 |    0.00 |           1.00 |
| **relatively**     |   626,369.00 |    0.04 |    5.24 |  42,957.87 | 83,102,035.00 |   626,884.00 |   603,438.92 |   22,930.08 |    0.01 |           1.00 |
| **almost**         |   466,468.00 |    0.04 |    4.85 |  31,107.72 | 83,102,035.00 |   466,967.00 |   449,502.72 |   16,965.28 |    0.01 |           1.00 |
| **seemingly**      |   176,135.00 |    0.04 |    4.77 |  11,864.41 | 83,102,035.00 |   176,304.00 |   169,710.34 |    6,424.66 |    0.00 |           1.00 |
| **mostly**         |   212,255.00 |    0.04 |    4.71 |  14,160.67 | 83,102,035.00 |   212,478.00 |   204,531.45 |    7,723.55 |    0.00 |           1.00 |
| **pretty**         | 1,650,041.00 |    0.04 |    4.64 | 107,081.72 | 83,102,035.00 | 1,652,360.00 | 1,590,562.75 |   59,478.25 |    0.02 |           1.00 |
| **fairly**         |   401,326.00 |    0.04 |    4.50 |  25,904.34 | 83,102,035.00 |   401,879.00 |   386,848.97 |   14,477.03 |    0.00 |           1.00 |
| **partly**         |    80,461.00 |    0.04 |    4.50 |   5,418.01 | 83,102,035.00 |    80,538.00 |    77,525.93 |    2,935.07 |    0.00 |           1.00 |
| **rather**         |   402,067.00 |    0.04 |    4.44 |  25,775.15 | 83,102,035.00 |   402,648.00 |   387,589.21 |   14,477.79 |    0.00 |           1.00 |
| **largely**        |   186,382.00 |    0.04 |    4.36 |  12,018.96 | 83,102,035.00 |   186,638.00 |   179,657.85 |    6,724.15 |    0.00 |           1.00 |
| **sometimes**      |   154,738.00 |    0.04 |    4.25 |   9,894.59 | 83,102,035.00 |   154,963.00 |   149,167.48 |    5,570.52 |    0.00 |           1.00 |
| **also**           | 1,135,038.00 |    0.04 |    4.13 |  69,302.53 | 83,102,035.00 | 1,137,293.00 | 1,094,758.94 |   40,279.06 |    0.01 |           1.00 |
| **supposedly**     |    30,854.00 |    0.04 |    4.13 |   2,118.60 | 83,102,035.00 |    30,878.00 |    29,723.18 |    1,130.82 |    0.00 |           1.00 |
| **once**           |   108,130.00 |    0.04 |    4.01 |   6,779.79 | 83,102,035.00 |   108,308.00 |   104,257.35 |    3,872.65 |    0.00 |           1.00 |
| **certainly**      |   107,358.00 |    0.04 |    3.98 |   6,710.96 | 83,102,035.00 |   107,538.00 |   103,516.14 |    3,841.85 |    0.00 |           1.00 |
| **now**            |   456,039.00 |    0.04 |    3.88 |  27,026.68 | 83,102,035.00 |   457,065.00 |   439,971.05 |   16,067.95 |    0.01 |           1.00 |
| **most**           | 7,713,908.00 |    0.04 |    3.84 | 465,492.10 | 83,102,035.00 | 7,734,027.00 | 7,444,779.15 |  269,128.85 |    0.09 |           1.00 |
| **slightly**       |   399,124.00 |    0.03 |    3.63 |  22,711.33 | 83,102,035.00 |   400,193.00 |   385,226.03 |   13,897.97 |    0.00 |           1.00 |
| **still**          |   854,311.00 |    0.03 |    3.55 |  47,347.08 | 83,102,035.00 |   856,873.00 |   824,826.48 |   29,484.52 |    0.01 |           1.00 |
| **albeit**         |    17,169.00 |    0.04 |    3.53 |   1,270.78 | 83,102,035.00 |    17,172.00 |    16,529.78 |      639.22 |    0.00 |           1.00 |
| **admittedly**     |    13,998.00 |    0.04 |    3.34 |     945.10 | 83,102,035.00 |    14,011.00 |    13,487.00 |      511.00 |    0.00 |           1.00 |
| **understandably** |    13,111.00 |    0.04 |    3.24 |     879.17 | 83,102,035.00 |    13,124.00 |    12,633.17 |      477.83 |    0.00 |           1.00 |
| **highly**         |   789,705.00 |    0.03 |    3.08 |  39,233.91 | 83,102,035.00 |   793,031.00 |   763,372.13 |   26,332.87 |    0.01 |           1.00 |
| **extremely**      |   986,551.00 |    0.03 |    2.69 |  43,399.83 | 83,102,035.00 |   992,094.00 |   954,990.29 |   31,560.71 |    0.01 |           0.99 |
| **hopefully**      |     7,834.00 |    0.04 |    2.65 |     530.95 | 83,102,035.00 |     7,841.00 |     7,547.75 |      286.25 |    0.00 |           1.00 |
| **presumably**     |     8,011.00 |    0.04 |    2.59 |     568.20 | 83,102,035.00 |     8,015.00 |     7,715.24 |      295.76 |    0.00 |           1.00 |
| **less**           | 1,286,169.00 |    0.03 |    1.71 |  34,129.70 | 83,102,035.00 | 1,300,817.00 | 1,252,167.24 |   34,001.76 |    0.01 |           0.99 |
| **alternately**    |     4,148.00 |    0.04 |    1.11 |     294.82 | 83,102,035.00 |     4,150.00 |     3,994.79 |      153.21 |    0.00 |           1.00 |
| **more**           | 9,438,165.00 |    0.02 |    1.10 | 141,966.92 | 83,102,035.00 | 9,607,426.00 | 9,248,114.18 |  190,050.82 |    0.06 |           0.98 |



In [88]:
# Mirror Data ~ explicitly positive ~ positive trigger present
show_top_positive(mirror_adv, k=15)

#### Adverbs in top 15 for $LRC$, $G^2$, and $\Delta P(\texttt{env}|\texttt{adv})$ measuring association with *Posmir* Environments (`mirror`, $@P$)

Total Tokens in dataset: $N = 1,761,853$

|                  |        `f` |   `dP1` |   `LRC` |      `G2` |         `f1` |       `f2` |    `exp_f` |   `unexp_f` |   `dP2` |   `dP1_simple` |
|:-----------------|-----------:|--------:|--------:|----------:|-------------:|-----------:|-----------:|------------:|--------:|---------------:|
| **pretty**       |  24,599.00 |    0.16 |    4.59 |  7,751.98 | 1,472,036.00 |  24,729.00 |  20,661.19 |    3,937.81 |    0.02 |           0.99 |
| **rather**       |   8,259.00 |    0.16 |    4.39 |  2,671.93 | 1,472,036.00 |   8,291.00 |   6,927.17 |    1,331.83 |    0.01 |           1.00 |
| **plain**        |   5,053.00 |    0.16 |    4.15 |  1,660.56 | 1,472,036.00 |   5,069.00 |   4,235.17 |      817.83 |    0.00 |           1.00 |
| **fairly**       |   5,678.00 |    0.16 |    4.07 |  1,820.50 | 1,

#### Adverbs in top 15 for $LRC$, $G^2$, and $\Delta P(\texttt{env}|\texttt{adv})$ measuring association with *Posmir* Environments (`mirror`, $@P$)

Total Tokens in dataset: $N = 1,761,853$

|                  |        `f` |   `dP1` |   `LRC` |      `G2` |         `f1` |       `f2` |    `exp_f` |   `unexp_f` |   `dP2` |   `dP1_simple` |
|:-----------------|-----------:|--------:|--------:|----------:|-------------:|-----------:|-----------:|------------:|--------:|---------------:|
| **pretty**       |  24,599.00 |    0.16 |    4.59 |  7,751.98 | 1,472,036.00 |  24,729.00 |  20,661.19 |    3,937.81 |    0.02 |           0.99 |
| **rather**       |   8,259.00 |    0.16 |    4.39 |  2,671.93 | 1,472,036.00 |   8,291.00 |   6,927.17 |    1,331.83 |    0.01 |           1.00 |
| **plain**        |   5,053.00 |    0.16 |    4.15 |  1,660.56 | 1,472,036.00 |   5,069.00 |   4,235.17 |      817.83 |    0.00 |           1.00 |
| **fairly**       |   5,678.00 |    0.16 |    4.07 |  1,820.50 | 1,472,036.00 |   5,702.00 |   4,764.05 |      913.95 |    0.00 |           1.00 |
| **somewhat**     |   4,441.00 |    0.16 |    3.93 |  1,436.47 | 1,472,036.00 |   4,458.00 |   3,724.68 |      716.32 |    0.00 |           1.00 |
| **otherwise**    |   6,562.00 |    0.16 |    3.85 |  2,012.72 | 1,472,036.00 |   6,603.00 |   5,516.84 |    1,045.16 |    0.00 |           0.99 |
| **downright**    |   4,730.00 |    0.16 |    3.64 |  1,446.97 | 1,472,036.00 |   4,760.00 |   3,977.00 |      753.00 |    0.00 |           0.99 |
| **relatively**   |   5,328.00 |    0.16 |    3.61 |  1,603.27 | 1,472,036.00 |   5,366.00 |   4,483.32 |      844.68 |    0.00 |           0.99 |
| **already**      |   4,277.00 |    0.16 |    3.54 |  1,302.51 | 1,472,036.00 |   4,305.00 |   3,596.85 |      680.15 |    0.00 |           0.99 |
| **almost**       |   5,286.00 |    0.16 |    3.47 |  1,551.93 | 1,472,036.00 |   5,330.00 |   4,453.24 |      832.76 |    0.00 |           0.99 |
| **maybe**        |   2,573.00 |    0.16 |    3.43 |    846.03 | 1,472,036.00 |   2,581.00 |   2,156.44 |      416.56 |    0.00 |           1.00 |
| **equally**      |   7,235.00 |    0.15 |    3.36 |  2,018.36 | 1,472,036.00 |   7,314.00 |   6,110.88 |    1,124.12 |    0.00 |           0.99 |
| **perhaps**      |   3,353.00 |    0.16 |    3.22 |    989.23 | 1,472,036.00 |   3,380.00 |   2,824.00 |      528.99 |    0.00 |           0.99 |
| **highly**       |   9,133.00 |    0.15 |    3.18 |  2,407.55 | 1,472,036.00 |   9,260.00 |   7,736.77 |    1,396.23 |    0.01 |           0.99 |
| **slightly**     |   7,524.00 |    0.15 |    3.09 |  1,970.46 | 1,472,036.00 |   7,631.00 |   6,375.73 |    1,148.27 |    0.00 |           0.99 |
| **extremely**    |  17,254.00 |    0.15 |    3.06 |  4,253.50 | 1,472,036.00 |  17,559.00 |  14,670.62 |    2,583.38 |    0.01 |           0.98 |
| **also**         |   6,904.00 |    0.15 |    3.02 |  1,789.00 | 1,472,036.00 |   7,006.00 |   5,853.54 |    1,050.46 |    0.00 |           0.99 |
| **simply**       |   7,695.00 |    0.15 |    2.90 |  1,912.92 | 1,472,036.00 |   7,826.00 |   6,538.66 |    1,156.34 |    0.00 |           0.98 |
| **still**        |  13,239.00 |    0.15 |    2.85 |  3,122.90 | 1,472,036.00 |  13,504.00 |  11,282.65 |    1,956.35 |    0.01 |           0.98 |
| **incredibly**   |   8,847.00 |    0.15 |    2.81 |  2,118.18 | 1,472,036.00 |   9,016.00 |   7,532.91 |    1,314.09 |    0.01 |           0.98 |
| **just**         |  27,625.00 |    0.14 |    2.60 |  5,785.97 | 1,472,036.00 |  28,371.00 |  23,704.10 |    3,920.90 |    0.02 |           0.97 |
| **surprisingly** |   1,439.00 |    0.16 |    2.50 |    427.85 | 1,472,036.00 |   1,450.00 |   1,211.48 |      227.52 |    0.00 |           0.99 |
| **sometimes**    |   1,302.00 |    0.16 |    2.36 |    380.76 | 1,472,036.00 |   1,313.00 |   1,097.02 |      204.98 |    0.00 |           0.99 |
| **even**         |  58,121.00 |    0.12 |    1.89 |  8,471.57 | 1,472,036.00 |  60,933.00 |  50,909.79 |    7,211.21 |    0.03 |           0.95 |
| **very**         | 175,104.00 |    0.13 |    1.88 | 25,839.60 | 1,472,036.00 | 184,008.00 | 153,739.50 |   21,364.50 |    0.09 |           0.95 |
| **strangely**    |     696.00 |    0.16 |    1.56 |    202.78 | 1,472,036.00 |     702.00 |     586.52 |      109.48 |    0.00 |           0.99 |





##### _Previously_ 

`mirror` subset (_Present Positive_ approximation) 

|                    |    `f` | `dP1` | `LRC` |     `G2` |       `N` |      `f1` |   `f2` |   `exp_f` | `unexp_f` | `l1`   | `l2`       |
|:-------------------|-------:|------:|------:|---------:|----------:|----------:|-------:|----------:|----------:|:-------|:-----------|
| **POS~pretty**     | 26,788 |  0.14 |  4.48 | 7,278.87 | 2,032,082 | 1,738,105 | 26,919 | 23,024.69 |  3,763.31 | POSMIR | pretty     |
| **POS~rather**     |  9,290 |  0.14 |  4.34 | 2,607.01 | 2,032,082 | 1,738,105 |  9,322 |  7,973.41 |  1,316.59 | POSMIR | rather     |
| **POS~plain**      |  6,049 |  0.14 |  4.19 | 1,733.36 | 2,032,082 | 1,738,105 |  6,065 |  5,187.59 |    861.41 | POSMIR | plain      |
| **POS~otherwise**  |  9,368 |  0.14 |  4.12 | 2,558.73 | 2,032,082 | 1,738,105 |  9,410 |  8,048.68 |  1,319.32 | POSMIR | otherwise  |
| **POS~fairly**     |  6,184 |  0.14 |  3.97 | 1,713.96 | 2,032,082 | 1,738,105 |  6,208 |  5,309.90 |    874.10 | POSMIR | fairly     |
| **POS~somewhat**   |  4,961 |  0.14 |  3.87 | 1,391.12 | 2,032,082 | 1,738,105 |  4,978 |  4,257.84 |    703.16 | POSMIR | somewhat   |
| **POS~downright**  |  5,502 |  0.14 |  3.63 | 1,465.04 | 2,032,082 | 1,738,105 |  5,532 |  4,731.70 |    770.30 | POSMIR | downright  |
| **POS~already**    |  5,035 |  0.14 |  3.56 | 1,336.93 | 2,032,082 | 1,738,105 |  5,063 |  4,330.55 |    704.45 | POSMIR | already    |
| **POS~relatively** |  5,774 |  0.14 |  3.51 | 1,496.02 | 2,032,082 | 1,738,105 |  5,812 |  4,971.19 |    802.81 | POSMIR | relatively |
| **POS~maybe**      |  2,998 |  0.14 |  3.43 |   857.78 | 2,032,082 | 1,738,105 |  3,006 |  2,571.13 |    426.87 | POSMIR | maybe      |

## Compile top NEG~adverb associations across both approximation methods

In [89]:
def load_backup(lower_floor: int = 100,
                loaded_path: Path = adv_am_paths['RBdirect']) -> pd.DataFrame:
    located_paths = tuple(loaded_path.parent.glob(
        f'*35f-7c_min{lower_floor}x*{PKL_SUFF}'))
    if any(located_paths):
        backup_df = pd.read_pickle(located_paths[0])

        backup_df = backup_df.filter(like='NEG', axis=0).filter(
            items=FOCUS).reset_index().set_index('l2')
        backup_df.index.name = 'adv'
        return backup_df
    else:
        return []


def uncat(df):
    cats = df.select_dtypes('category').columns
    df[cats] = df[cats].astype('string')
    # print(df.dtypes)
    return df, cats


def fill_empties(name_1, name_2, both, loaded_paths):
    for name in (name_1, name_2):
        name = name.strip('_')
        path = loaded_paths['RBdirect'] if name == 'SET' else loaded_paths['NEGmirror']
        if any(both[f'f_{name}'].isna()):

            floor = 100
            neg_backup = load_backup(floor, loaded_path=path)
            if not any(neg_backup):
                print('Error. Backup data not found. [in fill_empties()]')

            neg_backup.columns = (pd.Series(adjust_assoc_columns(neg_backup.columns)
                                            ) + f'_{name}').to_list()
            both, cats = uncat(both)
            neg_backup, __ = uncat(neg_backup)

            undefined_adv = both.loc[
                both[f'f_{name}'].isna(), :].index.to_list()

            both.loc[undefined_adv,
                     neg_backup.columns] = neg_backup.filter(items=undefined_adv, axis=0)

            both[cats] = both[cats].astype('category')

    return both


def combine_top(df_1: pd.DataFrame,
                name_1: str,
                df_2: pd.DataFrame,
                name_2: str,
                env_filter: str = 'NEG',
                filter_items: list = FOCUS,
                k: int = 10) -> pd.DataFrame:
    print('### Adverb Selections')
    top_dfs = [
        (get_top_vals(adv_df,  k=k,
                      index_like=env_filter,
                      metric_filter=['am_p1_given2',
                                     'conservative_log_ratio'])
         .sort_values('conservative_log_ratio', ascending=False))
        for adv_df in [df_1, df_2]
    ]
    for i, name in enumerate([name_1, name_2]):

        print_iter(
            [f'_{w}_' for w in top_dfs[i].l2], bullet='1.',
            header=(f'`{name}`: union of top {k} adverbs ranked by '
                    r'$LRC$ & $\Delta P(\texttt{env}|\texttt{adv})$'))
    top_adv_lists = [dx.l2.to_list() for dx in top_dfs]
    top_adv = pd.Series(top_adv_lists[0] + top_adv_lists[1]).drop_duplicates()
    # top_adv = pd.concat((top_dfs[0].l2, top_dfs[1].l2)).drop_duplicates()

    print_iter(
        [f'_{w}_' for w in top_adv], bullet='1.',
        header=f'Union of top adverbs for `{name_1}` and `{name_2}`. (Novel `{name_2}` adverbs listed last)')
    print(f'\n### `{name_1}` Adverb Associations (in initially loaded table)\n')
    df_1 = narrow_selection(df_1, top_adv, env_filter, filter_items)
    print(f'\n### `{name_2}` Adverb Associations (in initially loaded table)\n')
    df_2 = narrow_selection(df_2, top_adv, env_filter, filter_items)

    name_1, name_2 = [f"_{n.strip('_')}" for n in [name_1, name_2]]
    both = df_1.join(df_2, how="outer", lsuffix=name_1, rsuffix=name_2)

    # ! Empty cells need to be filled _before_ calculating mean
    both = fill_empties(name_1, name_2, both, adv_am_paths)
    both = force_ints(both)
    both = add_means(both)
    both = add_f_ratio(both, name_2, name_1)
    return both.sort_values('mean_dP1', ascending=False)


def add_f_ratio(df, subset_name, superset_name):
    counts = df.filter(regex=r'^[Nf][12]?').columns.str.split(
        '_').str.get(0).drop_duplicates()
    for count in counts:
        ratio_col = f'ratio_{count}{subset_name}'
        df[ratio_col] = (df[f'{count}{subset_name}']
                         / df[f'{count}{superset_name}'])
        # print(df.filter(like=count))
    return df

def add_means(both):
    for metric in (both.select_dtypes(include='number').columns.to_series()
                   .str.replace(r'_(MIR|SET)$', '', regex=True).unique()):
        both[f'mean_{snake_to_camel(metric)}'] = both.filter(
            regex=f"^{metric}").agg('mean', axis='columns')
    return both


def narrow_selection(df: pd.DataFrame,
                     top_adv: list,
                     env_filter: str = 'NEG',
                     filter_items: list = FOCUS):
    df = adjust_assoc_columns(
        df.filter(items=filter_items)
        .filter(like=env_filter, axis=0)
        .reset_index().set_index('l2')
        .filter(top_adv, axis=0)).sort_values(['LRC', 'dP1'], ascending=False)
    df.index.name = 'adv'
    nb_show_table(df.drop(['N', 'key', 'l1'], axis=1).round(2).sort_values(['LRC','dP1', ], ascending=False))

    return df

In [90]:
C = combine_top(setdiff_adv, 'SET',
                mirror_adv, 'MIR', k=K)

### Adverb Selections

`SET`: union of top 5 adverbs ranked by $LRC$ & $\Delta P(\texttt{env}|\texttt{adv})$
1. _necessarily_
1. _exactly_
1. _that_
1. _immediately_
1. _yet_

`MIR`: union of top 5 adverbs ranked by $LRC$ & $\Delta P(\texttt{env}|\texttt{adv})$
1. _ever_
1. _any_
1. _longer_
1. _necessarily_
1. _that_
1. _before_

Union of top adverbs for `SET` and `MIR`. (Novel `MIR` adverbs listed last)
1. _necessarily_
1. _exactly_
1. _that_
1. _immediately_
1. _yet_
1. _ever_
1. _any_
1. _longer_
1. _before_

### `SET` Adverb Associations (in initially loaded table)


|                 |        `f` |   `dP1` |   `LRC` |       `G2` |         `f1` |       `f2` |   `exp_f` |   `unexp_f` |
|:----------------|-----------:|--------:|--------:|-----------:|-------------:|-----------:|----------:|------------:|
| **necessarily** |  42,708.00 |    0.72 |    6.23 | 219,003.46 | 3,226,213.00 |  56,694.00 |  2,118.68 |   40,589.32 |
| **exactly**     |  43,635.00 |    0.67 |    5.90 | 214,404.

### Adverb Selections

`SET`: union of top 6 adverbs ranked by $LRC$ & $\Delta P(\texttt{env}|\texttt{adv})$
1. _necessarily_
1. _exactly_
1. _that_
1. _immediately_
1. _yet_
1. _terribly_

`MIR`: union of top 6 adverbs ranked by $LRC$ & $\Delta P(\texttt{env}|\texttt{adv})$
1. _ever_
1. _any_
1. _longer_
1. _necessarily_
1. _that_
1. _remotely_
1. _before_

Union of top adverbs for `SET` and `MIR`. (Novel `MIR` adverbs listed last)
1. _necessarily_
1. _exactly_
1. _that_
1. _immediately_
1. _yet_
1. _terribly_
1. _ever_
1. _any_
1. _longer_
1. _remotely_
1. _before_

### `SET` Adverb Associations (in initially loaded table)


|                 |        `f` |   `dP1` |   `LRC` |       `G2` |         `f1` |       `f2` |   `exp_f` |   `unexp_f` |
|:----------------|-----------:|--------:|--------:|-----------:|-------------:|-----------:|----------:|------------:|
| **necessarily** |  42,708.00 |    0.72 |    6.23 | 219,003.46 | 3,226,213.00 |  56,694.00 |  2,118.68 |   40,589.32 |
| **exactly**     |  43,635.00 |    0.67 |    5.90 | 214,404.20 | 3,226,213.00 |  61,599.00 |  2,301.98 |   41,333.02 |
| **that**        | 165,411.00 |    0.63 |    5.62 | 781,016.11 | 3,226,213.00 | 250,392.00 |  9,357.24 |  156,053.76 |
| **immediately** |  57,319.00 |    0.52 |    4.96 | 239,462.58 | 3,226,213.00 | 103,177.00 |  3,855.76 |   53,463.24 |
| **yet**         |  52,546.00 |    0.48 |    4.74 | 209,055.78 | 3,226,213.00 | 101,707.00 |  3,800.83 |   48,745.17 |
| **terribly**    |  18,054.00 |    0.22 |    3.09 |  42,704.93 | 3,226,213.00 |  70,174.00 |  2,622.43 |   15,431.57 |
| **remotely**    |   5,679.00 |    0.22 |    3.03 |  13,354.33 | 3,226,213.00 |  22,194.00 |    829.40 |    4,849.60 |
| **any**         |  15,492.00 |    0.13 |    2.28 |  23,683.00 | 3,226,213.00 |  94,152.00 |  3,518.50 |   11,973.50 |
| **ever**        |   5,967.00 |    0.01 |    0.28 |     353.58 | 3,226,213.00 | 124,592.00 |  4,656.05 |    1,310.95 |
| **longer**      |   1,448.00 |   -0.03 |   -1.87 |  -4,977.41 | 3,226,213.00 | 157,984.00 |  5,903.92 |   -4,455.92 |


### `MIR` Adverb Associations (in initially loaded table)


|                 |      `f` |   `dP1` |   `LRC` |      `G2` |       `f1` |     `f2` |   `exp_f` |   `unexp_f` |
|:----------------|---------:|--------:|--------:|----------:|-----------:|---------:|----------:|------------:|
| **ever**        | 4,688.00 |    0.77 |    5.73 | 14,624.92 | 289,770.00 | 5,027.00 |    826.79 |    3,861.21 |
| **any**         | 1,066.00 |    0.74 |    4.88 |  3,151.64 | 289,770.00 | 1,178.00 |    193.74 |      872.26 |
| **longer**      |   802.00 |    0.74 |    4.71 |  2,350.18 | 289,770.00 |   891.00 |    146.54 |      655.46 |
| **necessarily** |   960.00 |    0.71 |    4.47 |  2,679.92 | 289,770.00 | 1,100.00 |    180.92 |      779.08 |
| **that**        | 4,293.00 |    0.62 |    3.95 | 10,223.36 | 289,770.00 | 5,488.00 |    902.61 |    3,390.39 |
| **remotely**    | 1,841.00 |    0.62 |    3.87 |  4,419.89 | 289,770.00 | 2,336.00 |    384.20 |    1,456.80 |
| **exactly**     |   811.00 |    0.62 |    3.66 |  1,931.41 | 289,770.00 | 1,034.00 |    170.06 |      640.94 |
| **before**      |   288.00 |    0.84 |    1.31 |  1,039.94 | 289,770.00 |   288.00 |     47.37 |      240.63 |
| **yet**         |   319.00 |    0.23 |    1.20 |    242.11 | 289,770.00 |   810.00 |    133.22 |      185.78 |
| **terribly**    | 1,571.00 |    0.18 |    1.18 |    857.86 | 289,770.00 | 4,596.00 |    755.90 |      815.10 |
| **immediately** |   403.00 |    0.17 |    0.93 |    212.93 | 289,770.00 | 1,193.00 |    196.21 |      206.79 |


---

### $\textbf{\textit{Previous}}$ `MIR` Adverb Associations (in initially loaded table)

`MIR`: union of top 5 adverbs ranked by $LRC$ & $\Delta P(\texttt{env}|\texttt{adv})$

1. _ever_
1. _before_
1. _exactly_
1. _any_
1. _remotely_

|                 | `key`              |   `f` |   `dP1` |   `LRC` |      `G2` |       `N` |    `f1` |   `f2` |   `exp_f` |   `unexp_f` | `l1`   |
|:----------------|:-------------------|------:|--------:|--------:|----------:|----------:|--------:|-------:|----------:|------------:|:-------|
| **before**      | NEGmir~before      |   290 |    0.84 |    5.11 |  1,080.52 | 2,032,082 | 293,963 |    294 |     42.53 |      247.47 | NEGMIR |
| **ever**        | NEGmir~ever        | 4,718 |    0.77 |    5.57 | 15,340.34 | 2,032,082 | 293,963 |  5,179 |    749.20 |    3,968.80 | NEGMIR |
| **exactly**     | NEGmir~exactly     |   813 |    0.59 |    3.51 |  1,939.47 | 2,032,082 | 293,963 |  1,114 |    161.15 |      651.85 | NEGMIR |
| **any**         | NEGmir~any         | 1,082 |    0.57 |    3.48 |  2,511.26 | 2,032,082 | 293,963 |  1,514 |    219.02 |      862.98 | NEGMIR |
| **remotely**    | NEGmir~remotely    | 1,846 |    0.54 |    3.35 |  4,009.84 | 2,032,082 | 293,963 |  2,717 |    393.04 |    1,452.96 | NEGMIR |
| **that**        | NEGmir~that        | 4,338 |    0.44 |    2.86 |  7,632.21 | 2,032,082 | 293,963 |  7,472 |  1,080.91 |    3,257.09 | NEGMIR |
| **necessarily** | NEGmir~necessarily |   971 |    0.43 |    2.66 |  1,688.91 | 2,032,082 | 293,963 |  1,681 |    243.18 |      727.82 | NEGMIR |
| **yet**         | NEGmir~yet         |   320 |    0.21 |    1.18 |    242.23 | 2,032,082 | 293,963 |    909 |    131.50 |      188.50 | NEGMIR |
| **immediately** | NEGmir~immediately |   407 |    0.14 |    0.79 |    181.20 | 2,032,082 | 293,963 |  1,442 |    208.60 |      198.40 | NEGMIR |



### Frequency Comparisons between Polarity Approximations: All Data vs. Mirror Subset
The following values indicate the percentage of the negated frequency (`f`) and the marginal frequency (`f2`) accounted for by the `mirror` subset for each adverb. 
That is, `ratio_f_MIR` indicates the percentage of negated tokens with the specific triggers covered by `NEGmirror`, 
and `ratio_f2_MIR` the percentage of all adverb tokens which were captured by _either_ mirror pattern, `POSmirror` or `NEGmirror`. 
The third column then indicates the discrepancy between these percentages: 
For example, 

- [ ] 🚩 **finish this discussion!**

Note that _before_ and _ever_ have a much higher proportions of their negated tokens representated in the mirror subset. 
However, the discrepancy indicated by the `difference` column, which illuminates the 

#### Percentage Comparision

|                   |  joint % MIR |  adverb % MIR | % MIR $\Delta$ |
|:------------------|-------------:|--------------:|---------------:|
| **ever**          |         79.1 |           4.2 |           74.9 |
| **before**        |         93.2 |          39.3 |           53.9 |
| **inherently**    |         41.9 |          10.3 |           31.7 |
| **intrinsically** |         40.3 |           9.9 |           30.4 |
| **remotely**      |         32.5 |          12.2 |           20.3 |
| **particularly**  |         16.6 |           2.6 |           14.0 |
| **overtly**       |         18.1 |           5.9 |           12.2 |
| **any**           |          7.0 |           1.6 |            5.4 |
| **terribly**      |          8.7 |           7.4 |            1.3 |
| **exactly**       |          1.9 |           1.8 |            0.1 |
| **entirely**      |          3.8 |           3.9 |           -0.1 |
| **yet**           |          0.6 |           0.9 |           -0.3 |
| **that**          |          2.6 |           3.0 |           -0.4 |
| **necessarily**   |          2.3 |           3.0 |           -0.7 |
| **immediately**   |          0.7 |           1.4 |           -0.7 |
| **only**          |          0.2 |           1.1 |           -1.0 |
| **altogether**    |          2.4 |           8.8 |           -6.3 |

In [91]:
nb_show_table(C.filter(regex=r'^ratio_f2?_')
              .assign(f_minus_f2=C.ratio_f_MIR - C.ratio_f2_MIR)
              .multiply(100).round(1)
              .sort_values(['f_minus_f2', 'ratio_f_MIR'], ascending=False),
              n_dec=1, adjust_columns=False)



|                 |   `ratio_f_MIR` |   `ratio_f2_MIR` |   `f_minus_f2` |
|:----------------|----------------:|-----------------:|---------------:|
| **ever**        |            78.6 |              4.0 |           74.5 |
| **longer**      |            55.4 |              0.6 |           54.8 |
| **before**      |            92.6 |             38.5 |           54.1 |
| **any**         |             6.9 |              1.3 |            5.6 |
| **that**        |             2.6 |              2.2 |            0.4 |
| **necessarily** |             2.2 |              1.9 |            0.3 |
| **exactly**     |             1.9 |              1.7 |            0.2 |
| **yet**         |             0.6 |              0.8 |           -0.2 |
| **immediately** |             0.7 |              1.2 |           -0.5 |



#### Joint (_Negated_) Frequency Comparison

|                   |   `total negations` |   `mirror subset negations` |   `negations not in mirror subset` |
|:------------------|--------------------:|----------------------------:|-----------------------------------:|
| **that**          |             165,411 |                       4,338 |                            161,073 |
| **only**          |             114,070 |                         173 |                            113,897 |
| **entirely**      |              63,708 |                       2,429 |                             61,279 |
| **immediately**   |              57,319 |                         407 |                             56,912 |
| **yet**           |              52,546 |                         320 |                             52,226 |
| **particularly**  |              55,799 |                       9,278 |                             46,521 |
| **exactly**       |              43,635 |                         813 |                             42,822 |
| **necessarily**   |              42,708 |                         971 |                             41,737 |
| **terribly**      |              18,054 |                       1,579 |                             16,475 |
| **any**           |              15,492 |                       1,082 |                             14,410 |
| **altogether**    |               4,575 |                         112 |                              4,463 |
| **inherently**    |               6,847 |                       2,872 |                              3,975 |
| **remotely**      |               5,679 |                       1,846 |                              3,833 |
| **overtly**       |               2,169 |                         392 |                              1,777 |
| **ever**          |               5,967 |                       4,718 |                              1,249 |
| **intrinsically** |               1,071 |                         432 |                                639 |
| **before**        |                 311 |                         290 |                                 21 |

In [92]:
nb_show_table(
    C
    # .assign(f_percent_MIR=C.ratio_f_MIR * 100)
    .filter(regex=r'^f_.*[MS]').sort_index(axis=1, ascending=False)
    .assign(
        f_diff=C.f_SET-C.f_MIR).sort_values('f_diff', ascending=False)
    .rename(columns={'f_SET':'total negations', 
                     'f_MIR':'mirror subset negations', 
                     'f_diff': 'negations not in mirror subset'}), n_dec=0)


|                 |   `total negations` |   `mirror subset negations` |   `negations not in mirror subset` |
|:----------------|--------------------:|----------------------------:|-----------------------------------:|
| **that**        |             165,411 |                       4,293 |                            161,118 |
| **immediately** |              57,319 |                         403 |                             56,916 |
| **yet**         |              52,546 |                         319 |                             52,227 |
| **exactly**     |              43,635 |                         811 |                             42,824 |
| **necessarily** |              42,708 |                         960 |                             41,748 |
| **any**         |              15,492 |                       1,066 |                             14,426 |
| **ever**        |               5,967 |                       4,688 |                              1,279 |
| **longer**      

#### Marginal (_Adverb Total_) Frequency Comparison

|                   |   `total adverb tokens` |   `mirror subset adverb tokens` |   `adverb tokens not in mirror subset` |
|:------------------|------------------------:|--------------------------------:|---------------------------------------:|
| **particularly**  |                 575,960 |                          14,954 |                                561,006 |
| **only**          |                 464,168 |                           5,169 |                                458,999 |
| **entirely**      |                 303,833 |                          11,803 |                                292,030 |
| **that**          |                 250,392 |                           7,472 |                                242,920 |
| **ever**          |                 124,592 |                           5,179 |                                119,413 |
| **immediately**   |                 103,177 |                           1,442 |                                101,735 |
| **yet**           |                 101,707 |                             909 |                                100,798 |
| **any**           |                  94,152 |                           1,514 |                                 92,638 |
| **terribly**      |                  70,174 |                           5,218 |                                 64,956 |
| **exactly**       |                  61,599 |                           1,114 |                                 60,485 |
| **necessarily**   |                  56,694 |                           1,681 |                                 55,013 |
| **inherently**    |                  55,088 |                           5,649 |                                 49,439 |
| **remotely**      |                  22,194 |                           2,717 |                                 19,477 |
| **altogether**    |                  20,636 |                           1,808 |                                 18,828 |
| **overtly**       |                  15,219 |                             898 |                                 14,321 |
| **intrinsically** |                  10,001 |                             991 |                                  9,010 |
| **before**        |                     748 |                             294 |                                    454 |

In [93]:
nb_show_table(
    C
    # .assign(f2_percent_MIR=C.ratio_f2_MIR * 100)
    .filter(regex=r'^f2_.*[MS]').sort_index(axis=1, ascending=False)
    .assign(
        f2_diff=C.f2_SET-C.f2_MIR).sort_values('f2_diff', ascending=False)
    .rename(columns={'f2_SET':'total adverb tokens', 
                     'f2_MIR':'mirror subset adverb tokens', 
                     'f2_diff': 'adverb tokens not in mirror subset'}), n_dec=0)


|                 |   `total adverb tokens` |   `mirror subset adverb tokens` |   `adverb tokens not in mirror subset` |
|:----------------|------------------------:|--------------------------------:|---------------------------------------:|
| **that**        |                 250,392 |                           5,488 |                                244,904 |
| **longer**      |                 157,984 |                             891 |                                157,093 |
| **ever**        |                 124,592 |                           5,027 |                                119,565 |
| **immediately** |                 103,177 |                           1,193 |                                101,984 |
| **yet**         |                 101,707 |                             810 |                                100,897 |
| **any**         |                  94,152 |                           1,178 |                                 92,974 |
| **exactly**     |            

In [94]:
full_C = C.copy()
main_cols_ordered = pd.concat((*[C.filter(like=m).columns.to_series() for m in ('LRC', 'P1', 'G2')],
                               *[C.filter(regex=f'^{f}_').columns.to_series() for f in ['f', 'f1', 'f2'] ]) 
                              ).to_list()
# print_iter([f'`{c}`' for c in main_cols_ordered], bullet='1.', header='Main Columns')
main_C = C[[c for c in main_cols_ordered if c in C.columns]]
nb_show_table(main_C.sort_values('mean_dP1', ascending=False), return_df=True)


|                 |   `LRC_SET` |   `LRC_MIR` |   `mean_LRC` |   `dP1_SET` |   `dP1_MIR` |   `mean_dP1` |   `G2_SET` |   `G2_MIR` |   `mean_G2` |    `f_SET` |   `f_MIR` |     `f1_SET` |   `f1_MIR` |   `f2_SET` |   `f2_MIR` |
|:----------------|------------:|------------:|-------------:|------------:|------------:|-------------:|-----------:|-----------:|------------:|-----------:|----------:|-------------:|-----------:|-----------:|-----------:|
| **necessarily** |        6.23 |        4.47 |         5.35 |        0.72 |        0.71 |         0.71 | 219,003.46 |   2,679.92 |  110,841.69 |  42,708.00 |    960.00 | 3,226,213.00 | 289,770.00 |  56,694.00 |   1,100.00 |
| **exactly**     |        5.90 |        3.66 |         4.78 |        0.67 |        0.62 |         0.65 | 214,404.20 |   1,931.41 |  108,167.81 |  43,635.00 |    811.00 | 3,226,213.00 | 289,770.00 |  61,599.00 |   1,034.00 |
| **that**        |        5.62 |        3.95 |         4.79 |        0.63 |        0.62 |         

Unnamed: 0,`LRC_SET`,`LRC_MIR`,`mean_LRC`,`dP1_SET`,`dP1_MIR`,`mean_dP1`,...,`f_SET`,`f_MIR`,`f1_SET`,`f1_MIR`,`f2_SET`,`f2_MIR`
**necessarily**,6.23,4.47,5.35,0.72,0.71,0.71,...,42708,960,3226213,289770,56694,1100
**exactly**,5.9,3.66,4.78,0.67,0.62,0.65,...,43635,811,3226213,289770,61599,1034
**that**,5.62,3.95,4.79,0.63,0.62,0.62,...,165411,4293,3226213,289770,250392,5488
**before**,3.65,1.31,2.48,0.38,0.84,0.61,...,311,288,3226213,289770,748,288
**any**,2.28,4.88,3.58,0.13,0.74,0.43,...,15492,1066,3226213,289770,94152,1178
**ever**,0.28,5.73,3.0,0.01,0.77,0.39,...,5967,4688,3226213,289770,124592,5027
**yet**,4.74,1.2,2.97,0.48,0.23,0.35,...,52546,319,3226213,289770,101707,810
**longer**,-1.87,4.71,1.42,-0.03,0.74,0.35,...,1448,802,3226213,289770,157984,891
**immediately**,4.96,0.93,2.95,0.52,0.17,0.35,...,57319,403,3226213,289770,103177,1193


### Combined Table of AM values for "most negative" adverbs, by descending `mean_dP1`

|                 | `LRC_SET` | `LRC_MIR` | `mean_LRC` | `dP1_SET` | `dP1_MIR` | `mean_dP1` |   `G2_SET` |  `G2_MIR` |  `mean_G2` | `f_SET` | `f_MIR` |  `f1_SET` | `f1_MIR` | `f2_SET` | `f2_MIR` |
|:----------------|----------:|----------:|-----------:|----------:|----------:|-----------:|-----------:|----------:|-----------:|--------:|--------:|----------:|---------:|---------:|---------:|
| **exactly**     |      5.90 |      3.51 |       4.71 |      0.67 |      0.59 |       0.63 | 214,404.20 |  1,939.47 | 108,171.83 |  43,635 |     813 | 3,226,213 |  293,963 |   61,599 |    1,114 |
| **before**      |      3.65 |      5.11 |       4.38 |      0.38 |      0.84 |       0.61 |   1,062.13 |  1,080.52 |   1,071.32 |     311 |     290 | 3,226,213 |  293,963 |      748 |      294 |
| **necessarily** |      6.23 |      2.66 |       4.44 |      0.72 |      0.43 |       0.57 | 219,003.46 |  1,688.91 | 110,346.18 |  42,708 |     971 | 3,226,213 |  293,963 |   56,694 |    1,681 |
| **that**        |      5.62 |      2.86 |       4.24 |      0.63 |      0.44 |       0.53 | 781,016.11 |  7,632.21 | 394,324.16 | 165,411 |   4,338 | 3,226,213 |  293,963 |  250,392 |    7,472 |
| **ever**        |      0.28 |      5.57 |       2.92 |      0.01 |      0.77 |       0.39 |     353.58 | 15,340.34 |   7,846.96 |   5,967 |   4,718 | 3,226,213 |  293,963 |  124,592 |    5,179 |
| **remotely**    |      3.03 |      3.35 |       3.19 |      0.22 |      0.54 |       0.38 |  13,354.33 |  4,009.84 |   8,682.08 |   5,679 |   1,846 | 3,226,213 |  293,963 |   22,194 |    2,717 |
| **any**         |      2.28 |      3.48 |       2.88 |      0.13 |      0.57 |       0.35 |  23,683.00 |  2,511.26 |  13,097.13 |  15,492 |   1,082 | 3,226,213 |  293,963 |   94,152 |    1,514 |
| **yet**         |      4.74 |      1.18 |       2.96 |      0.48 |      0.21 |       0.34 | 209,055.78 |    242.23 | 104,649.01 |  52,546 |     320 | 3,226,213 |  293,963 |  101,707 |      909 |
| **immediately** |      4.96 |      0.79 |       2.88 |      0.52 |      0.14 |       0.33 | 239,462.58 |    181.20 | 119,821.89 |  57,319 |     407 | 3,226,213 |  293,963 |  103,177 |    1,442 |

Save full adverb selection as `.csv`

In [95]:
C.to_csv(TOP_AM_DIR / f'Top{K}_NEG-ADV_combined.35f-7c_{timestamp_today()}.csv')

Save `all-columns`, `means`, and `MAIN` as markdown formatted tables

In [96]:
C.to_markdown(floatfmt=',.2f', intfmt=',', buf=TOP_AM_DIR / f'Top{K}_NEG-ADV_combined_all-columns.35f-7c_{timestamp_today()}.md')
C.filter(like='mean_').to_markdown(floatfmt=',.2f', intfmt=',', buf=TOP_AM_DIR / f'Top{K}_NEG-ADV_combined_means.35f-7c_{timestamp_today()}.md')
C[main_cols_ordered].to_markdown(floatfmt=',.2f', intfmt=',', buf=TOP_AM_DIR / f'Top{K}_NEG-ADV_combined_MAIN.35f-7c_{timestamp_today()}.md')

## Collect bigrams corresponding to top adverbs

In [97]:
# results/assoc_df/polar/RBdirect/bigram/polarized-bigram_35f-7c_min1000x.pkl.gz

bigram_dfs = {d.name:
              update_index(pd.read_pickle(
                  tuple(d.joinpath('bigram/extra')
                        .glob(f'*35f-7c*min{mirror_floor if d.name == "NEGmirror" else bigram_floor}x*.pkl.gz')
                        )[0]))
              for d in POLAR_DIR.iterdir()}

In [98]:
def show_adv_bigrams(sample_size, C,
                     bigram_dfs,
                     selector: str = 'dP1',
                     column_list: list = None) -> dict:
    def _force_ints(_df):
        count_cols = _df.filter(regex=r'total$|^[fN]').columns
        _df.loc[:, count_cols] = _df.loc[:,count_cols].apply(
            pd.to_numeric, downcast='unsigned')
        return _df
    bigram_k = max(sample_size + 2, 10)
    print(
        f'## Top {bigram_k} "most negative" bigrams corresponding to top {K} adverbs\n')
    print(timestamp_today())
    patterns = list(bigram_dfs.keys())
    top_adverbs = C.index
    bigram_samples = dict.fromkeys(top_adverbs)
    bigrams = []
    adj = []
    for rank, adv in enumerate(top_adverbs, start=1):
        print(f'\n### {rank}. _{adv}_\n')
        adv_top = None
        bigram_samples[adv] = dict.fromkeys(patterns + ['both', 'adj'])
        adj_for_adv = []
        for pat, bdf in bigram_dfs.items():
            # avoid KeyError while maintaining intended order
            bdf = adjust_assoc_columns(
                bdf[[c for c in FOCUS+['adj', 'adj_total', 'adv', 'adv_total']
                     if c in bdf.columns]
                    ])
            # > Force significant & positive association according to LRC
            bdf = bdf.loc[bdf.LRC >= 1, :]

            bdf = _force_ints(bdf.loc[bdf.adv == adv, :])
            top_by_metric = [bdf.nlargest(bigram_k *2, m)
                 for m in
                 ['dP1', 'LRC']
                 #  [selector, list({'LRC', 'dP1'} - {selector}[0]])
            ]
            half_k = bigram_k//2
            adv_pat_bigrams = pd.concat(
                [top_bigrams.head(half_k) for top_bigrams in top_by_metric]
            ).drop_duplicates()
            if len(bdf) >= bigram_k:
                x=0
                while len(adv_pat_bigrams) < bigram_k: 
                    x += 1
                    next_ix = half_k + x
                    adv_pat_bigrams = pd.concat((adv_pat_bigrams, 
                                                top_by_metric[0].iloc[[next_ix], :], 
                                                top_by_metric[1].iloc[[next_ix], :])
                                                ).drop_duplicates()
                adv_pat_bigrams = adv_pat_bigrams.head(bigram_k)

            if adv_pat_bigrams.empty:
                print(f'No bigrams found in loaded `{pat}` AM table.')
            else:
                print(
                    f'\n#### Top {len(adv_pat_bigrams)} `{pat}` "{adv}_*" bigrams (sorted by `{selector}`; `LRC > 1`)\n')
                column_list = column_list or bdf.columns
                nb_show_table(adv_pat_bigrams[column_list], n_dec=2)

            adj_for_adv.extend(adv_pat_bigrams.adj.drop_duplicates().to_list())

            bigram_samples[adv][pat] = adv_pat_bigrams

            adv_top = adv_pat_bigrams if adv_top is None else pd.concat(
                [adv_top, adv_pat_bigrams])

        bigram_samples[adv]['adj'] = set(adj_for_adv)
        bigrams.extend(adv_top.l2.drop_duplicates().to_list())
        adj.extend(adj_for_adv)
        bigram_samples[adv]['both'] = adv_top
    bigram_samples['bigrams'] = set(bigrams)
    bigram_samples['adj'] = set(adj)
    return bigram_samples, bigram_k


samples_dict, bigram_k = show_adv_bigrams(
    K, C, bigram_dfs,
    column_list=[
        'adj', 'adj_total',
        *pd.Series(main_cols_ordered).str.replace(
            r'mean_|_SET|_MIR', '', regex=True)
        .drop_duplicates().to_list(),
        # 't', 'MI'
    ]
)

## Top 10 "most negative" bigrams corresponding to top 5 adverbs

2024-05-23

### 1. _necessarily_


#### Top 10 `RBdirect` "necessarily_*" bigrams (sorted by `dP1`; `LRC > 1`)


|                                       | `adj`          |   `adj_total` |   `LRC` |   `dP1` |      `G2` |   `f` |      `f1` |   `f2` |
|:--------------------------------------|:---------------|--------------:|--------:|--------:|----------:|------:|----------:|-------:|
| **NEGany~necessarily_sure**           | sure           |    844,981.00 |    5.91 |    0.95 |  1,436.68 |   222 | 3,226,213 |    224 |
| **NEGany~necessarily_surprising**     | surprising     |    150,067.00 |    7.22 |    0.93 |  2,150.86 |   343 | 3,226,213 |    355 |
| **NEGany~necessarily_indicative**     | indicative     |     12,760.00 |    8.37 |    0.93 |  8,811.69 | 1,406 | 3,226,213 |  1,456 |
| **NEGany~necessarily_representative** | representative |     25,187.00 |    7.31 |    0.91 |  3,044.27 |   496 | 3,226,213 |    524 |
| **N

## Top 10 "most negative" bigrams corresponding to top 5 adverbs

2024-05-23

### 1. _necessarily_


#### Top 10 `RBdirect` "necessarily_*" bigrams (sorted by `dP1`; `LRC > 1`)


|                                       | `adj`          |   `adj_total` |   `LRC` |   `dP1` |      `G2` |   `f` |      `f1` |   `f2` |
|:--------------------------------------|:---------------|--------------:|--------:|--------:|----------:|------:|----------:|-------:|
| **NEGany~necessarily_sure**           | sure           |    844,981.00 |    5.91 |    0.95 |  1,436.68 |   222 | 3,226,213 |    224 |
| **NEGany~necessarily_surprising**     | surprising     |    150,067.00 |    7.22 |    0.93 |  2,150.86 |   343 | 3,226,213 |    355 |
| **NEGany~necessarily_indicative**     | indicative     |     12,760.00 |    8.37 |    0.93 |  8,811.69 | 1,406 | 3,226,213 |  1,456 |
| **NEGany~necessarily_representative** | representative |     25,187.00 |    7.31 |    0.91 |  3,044.27 |   496 | 3,226,213 |    524 |
| **NEGany~necessarily_available**      | available      |    866,272.00 |    6.36 |    0.89 |  1,280.24 |   213 | 3,226,213 |    230 |
| **NEGany~necessarily_easy**           | easy           |    771,307.00 |    7.26 |    0.88 |  5,448.34 |   914 | 3,226,213 |    996 |
| **NEGany~necessarily_true**           | true           |    348,994.00 |    6.89 |    0.82 | 18,199.76 | 3,238 | 3,226,213 |  3,786 |
| **NEGany~necessarily_illegal**        | illegal        |     44,028.00 |    6.48 |    0.87 |  1,659.90 |   280 | 3,226,213 |    307 |
| **NEGany~necessarily_related**        | related        |    137,661.00 |    6.74 |    0.84 |  4,271.76 |   742 | 3,226,213 |    842 |
| **NEGany~necessarily_interested**     | interested     |    364,497.00 |    6.77 |    0.87 |  2,500.26 |   422 | 3,226,213 |    463 |


#### Top 3 `NEGmirror` "necessarily_*" bigrams (sorted by `dP1`; `LRC > 1`)


|                              | `adj`   |   `adj_total` |   `LRC` |   `dP1` |   `G2` |   `f` |    `f1` |   `f2` |
|:-----------------------------|:--------|--------------:|--------:|--------:|-------:|------:|--------:|-------:|
| **NEGmir~necessarily_wrong** | wrong   |     20,866.00 |    4.27 |    0.81 | 708.98 |   209 | 289,770 |    214 |
| **NEGmir~necessarily_bad**   | bad     |     10,783.00 |    2.02 |    0.76 | 153.43 |    50 | 289,770 |     54 |
| **NEGmir~necessarily_true**  | true    |      7,402.00 |    2.18 |    0.75 | 159.07 |    53 | 289,770 |     58 |


### 2. _exactly_


#### Top 10 `RBdirect` "exactly_*" bigrams (sorted by `dP1`; `LRC > 1`)


|                               | `adj`      |   `adj_total` |   `LRC` |   `dP1` |      `G2` |   `f` |      `f1` |   `f2` |
|:------------------------------|:-----------|--------------:|--------:|--------:|----------:|------:|----------:|-------:|
| **NEGany~exactly_surprising** | surprising |    150,067.00 |    7.34 |    0.96 |  2,863.35 |   441 | 3,226,213 |    444 |
| **NEGany~exactly_cheap**      | cheap      |     83,765.00 |    8.28 |    0.95 |  4,443.27 |   693 | 3,226,213 |    704 |
| **NEGany~exactly_subtle**     | subtle     |     56,845.00 |    6.92 |    0.94 |  1,671.02 |   264 | 3,226,213 |    271 |
| **NEGany~exactly_fun**        | fun        |    224,457.00 |    6.67 |    0.94 |  1,423.92 |   225 | 3,226,213 |    231 |
| **NEGany~exactly_conducive**  | conducive  |     16,405.00 |    6.56 |    0.93 |  1,313.09 |   208 | 3,226,213 |    214 |
| **NEGany~exactly_sure**       | sure       |    844,981.00 |    8.63 |    0.92 | 54,750.58 | 8,860 | 3,226,213 |  9,301 |
| **NEGany~exactly_new**        | new        |    321,311.00 |    8.54 |    0.93 |  8,697.93 | 1,378 | 3,226,213 |  1,418 |
| **NEGany~exactly_easy**       | easy       |    771,307.00 |    8.37 |    0.93 |  6,747.64 | 1,069 | 3,226,213 |  1,100 |
| **NEGany~exactly_clear**      | clear      |    491,108.00 |    8.30 |    0.92 | 10,937.16 | 1,759 | 3,226,213 |  1,835 |
| **NEGany~exactly_happy**      | happy      |    528,511.00 |    7.16 |    0.90 |  2,694.69 |   441 | 3,226,213 |    468 |

No bigrams found in loaded `NEGmirror` AM table.

### 3. _that_


#### Top 10 `RBdirect` "that_*" bigrams (sorted by `dP1`; `LRC > 1`)


|                             | `adj`       |   `adj_total` |   `LRC` |   `dP1` |      `G2` |   `f` |      `f1` |   `f2` |
|:----------------------------|:------------|--------------:|--------:|--------:|----------:|------:|----------:|-------:|
| **NEGany~that_uncommon**    | uncommon    |     61,767.00 |    8.39 |    0.94 |  5,136.91 |   804 | 3,226,213 |    819 |
| **NEGany~that_fond**        | fond        |     39,809.00 |    7.27 |    0.94 |  2,127.94 |   334 | 3,226,213 |    341 |
| **NEGany~that_surprising**  | surprising  |    150,067.00 |    8.14 |    0.92 |  7,115.30 | 1,141 | 3,226,213 |  1,187 |
| **NEGany~that_common**      | common      |    556,435.00 |    8.12 |    0.92 |  7,564.08 | 1,216 | 3,226,213 |  1,268 |
| **NEGany~that_dissimilar**  | dissimilar  |      8,816.00 |    7.00 |    0.92 |  1,904.15 |   307 | 3,226,213 |    321 |
| **NEGany~that_hard**        | hard        |    430,990.00 |    7.96 |    0.88 | 59,642.82 | 9,966 | 3,226,213 | 10,818 |
| **NEGany~that_complicated** | complicated |    180,071.00 |    7.95 |    0.91 |  7,450.89 | 1,208 | 3,226,213 |  1,270 |
| **NEGany~that_impressed**   | impressed   |    113,281.00 |    7.57 |    0.91 |  4,207.58 |   684 | 3,226,213 |    721 |
| **NEGany~that_noticeable**  | noticeable  |     40,372.00 |    6.78 |    0.91 |  1,632.07 |   265 | 3,226,213 |    279 |
| **NEGany~that_exciting**    | exciting    |    236,396.00 |    7.48 |    0.90 |  4,892.83 |   805 | 3,226,213 |    859 |


#### Top 10 `NEGmirror` "that_*" bigrams (sorted by `dP1`; `LRC > 1`)


|                            | `adj`      |   `adj_total` |   `LRC` |   `dP1` |     `G2` |   `f` |    `f1` |   `f2` |
|:---------------------------|:-----------|--------------:|--------:|--------:|---------:|------:|--------:|-------:|
| **NEGmir~that_popular**    | popular    |      5,787.00 |    2.50 |    0.76 |   200.44 |    65 | 289,770 |     70 |
| **NEGmir~that_interested** | interested |      9,258.00 |    2.42 |    0.76 |   190.06 |    62 | 289,770 |     67 |
| **NEGmir~that_difficult**  | difficult  |     16,043.00 |    2.15 |    0.75 |   155.64 |    52 | 289,770 |     57 |
| **NEGmir~that_hard**       | hard       |      7,311.00 |    2.31 |    0.74 |   168.31 |    57 | 289,770 |     63 |
| **NEGmir~that_close**      | close      |     13,962.00 |    2.39 |    0.73 |   174.26 |    60 | 289,770 |     67 |
| **NEGmir~that_simple**     | simple     |     25,382.00 |    4.34 |    0.73 | 1,370.94 |   473 | 289,770 |    529 |
| **NEGmir~that_easy**       | easy       |     20,050.00 |    4.21 |    0.72 | 1,258.15 |   442 | 289,770 |    500 |
| **NEGmir~that_great**      | great      |      5,819.00 |    3.52 |    0.67 |   728.46 |   282 | 289,770 |    340 |
| **NEGmir~that_good**       | good       |     33,540.00 |    3.07 |    0.56 |   953.31 |   447 | 289,770 |    615 |
| **NEGmir~that_big**        | big        |      7,859.00 |    3.06 |    0.70 |   309.58 |   113 | 289,770 |    131 |


### 4. _before_

No bigrams found in loaded `RBdirect` AM table.
No bigrams found in loaded `NEGmirror` AM table.

### 5. _any_


#### Top 10 `RBdirect` "any_*" bigrams (sorted by `dP1`; `LRC > 1`)


|                          | `adj`     |   `adj_total` |   `LRC` |   `dP1` |     `G2` |   `f` |      `f1` |   `f2` |
|:-------------------------|:----------|--------------:|--------:|--------:|---------:|------:|----------:|-------:|
| **NEGany~any_happier**   | happier   |     19,501.00 |    4.65 |    0.53 | 3,488.76 |   830 | 3,226,213 |  1,472 |
| **NEGany~any_simpler**   | simpler   |     26,094.00 |    3.09 |    0.30 |   671.74 |   228 | 3,226,213 |    672 |
| **NEGany~any_clearer**   | clearer   |     13,369.00 |    3.21 |    0.30 | 1,051.22 |   357 | 3,226,213 |  1,053 |
| **NEGany~any_different** | different |    909,864.00 |    2.98 |    0.24 | 2,270.24 |   910 | 3,226,213 |  3,313 |
| **NEGany~any_younger**   | younger   |     29,805.00 |    2.37 |    0.19 |   544.17 |   256 | 3,226,213 |  1,121 |
| **NEGany~any_worse**     | worse     |    214,166.00 |    2.47 |    0.16 | 3,165.88 | 1,693 | 3,226,213 |  8,487 |
| **NEGany~any_bigger**    | bigger    |    130,470.00 |    2.27 |    0.17 |   688.06 |   357 | 3,226,213 |  1,735 |
| **NEGany~any_harder**    | harder    |     99,332.00 |    1.98 |    0.15 |   395.22 |   227 | 3,226,213 |  1,221 |
| **NEGany~any_safer**     | safer     |     26,779.00 |    1.73 |    0.12 |   346.68 |   235 | 3,226,213 |  1,471 |
| **NEGany~any_easier**    | easier    |    237,680.00 |    1.95 |    0.11 | 2,164.75 | 1,607 | 3,226,213 | 10,860 |


#### Top 4 `NEGmirror` "any_*" bigrams (sorted by `dP1`; `LRC > 1`)


|                       | `adj`   |   `adj_total` |   `LRC` |   `dP1` |     `G2` |   `f` |    `f1` |   `f2` |
|:----------------------|:--------|--------------:|--------:|--------:|---------:|------:|--------:|-------:|
| **NEGmir~any_better** | better  |     14,076.00 |    4.44 |    0.75 | 1,148.18 |   381 | 289,770 |    416 |
| **NEGmir~any_easier** | easier  |      2,409.00 |    2.42 |    0.75 |   181.98 |    61 | 289,770 |     67 |
| **NEGmir~any_worse**  | worse   |      8,490.00 |    2.87 |    0.72 |   248.63 |    88 | 289,770 |    100 |
| **NEGmir~any_closer** | closer  |        986.00 |    2.21 |    0.68 |   149.62 |    56 | 289,770 |     66 |


### 6. _ever_


#### Top 5 `RBdirect` "ever_*" bigrams (sorted by `dP1`; `LRC > 1`)


|                         | `adj`   |   `adj_total` |   `LRC` |   `dP1` |     `G2` |   `f` |      `f1` |   `f2` |
|:------------------------|:--------|--------------:|--------:|--------:|---------:|------:|----------:|-------:|
| **NEGany~ever_simple**  | simple  |    427,167.00 |    5.54 |    0.77 | 1,142.04 |   212 | 3,226,213 |    262 |
| **NEGany~ever_easy**    | easy    |    771,307.00 |    5.06 |    0.63 | 2,030.58 |   430 | 3,226,213 |    641 |
| **NEGany~ever_good**    | good    |  2,037,285.00 |    3.76 |    0.40 | 1,178.00 |   332 | 3,226,213 |    756 |
| **NEGany~ever_perfect** | perfect |    164,519.00 |    3.48 |    0.37 |   736.05 |   217 | 3,226,213 |    527 |
| **NEGany~ever_able**    | able    |    428,268.00 |    1.81 |    0.13 |   363.95 |   234 | 3,226,213 |  1,398 |


#### Top 6 `NEGmirror` "ever_*" bigrams (sorted by `dP1`; `LRC > 1`)


|                         | `adj`   |   `adj_total` |   `LRC` |   `dP1` |     `G2` |   `f` |    `f1` |   `f2` |
|:------------------------|:--------|--------------:|--------:|--------:|---------:|------:|--------:|-------:|
| **NEGmir~ever_easy**    | easy    |     20,050.00 |    3.21 |    0.83 | 1,311.83 |   367 | 289,770 |    368 |
| **NEGmir~ever_perfect** | perfect |      3,708.00 |    2.38 |    0.83 |   735.10 |   207 | 289,770 |    208 |
| **NEGmir~ever_good**    | good    |     33,540.00 |    4.72 |    0.82 | 1,034.95 |   298 | 289,770 |    302 |
| **NEGmir~ever_wrong**   | wrong   |     20,866.00 |    2.56 |    0.82 |   349.21 |   102 | 289,770 |    104 |
| **NEGmir~ever_free**    | free    |      5,043.00 |    1.97 |    0.81 |   231.61 |    69 | 289,770 |     71 |
| **NEGmir~ever_able**    | able    |      6,448.00 |    3.66 |    0.79 |   437.65 |   136 | 289,770 |    143 |


### 7. _yet_


#### Top 10 `RBdirect` "yet_*" bigrams (sorted by `dP1`; `LRC > 1`)


|                          | `adj`     |   `adj_total` |   `LRC` |   `dP1` |      `G2` |    `f` |      `f1` |   `f2` |
|:-------------------------|:----------|--------------:|--------:|--------:|----------:|-------:|----------:|-------:|
| **NEGany~yet_clear**     | clear     |    491,108.00 |   10.26 |    0.95 | 67,924.56 | 10,553 | 3,226,213 | 10,693 |
| **NEGany~yet_eligible**  | eligible  |     49,578.00 |    7.72 |    0.94 |  2,929.15 |    459 | 3,226,213 |    468 |
| **NEGany~yet_official**  | official  |      9,778.00 |    7.33 |    0.94 |  2,236.98 |    353 | 3,226,213 |    362 |
| **NEGany~yet_ready**     | ready     |    240,297.00 |    9.23 |    0.93 | 48,012.06 |  7,611 | 3,226,213 |  7,838 |
| **NEGany~yet_certain**   | certain   |    104,544.00 |    8.12 |    0.93 |  5,491.41 |    874 | 3,226,213 |    903 |
| **NEGany~yet_complete**  | complete  |    107,018.00 |    8.42 |    0.92 | 13,815.99 |  2,220 | 3,226,213 |  2,314 |
| **NEGany~yet_sure**      | sure      |    844,981.00 |    8.37 |    0.92 | 12,379.79 |  1,990 | 3,226,213 |  2,075 |
| **NEGany~yet_available** | available |    866,272.00 |    7.69 |    0.87 | 44,196.15 |  7,481 | 3,226,213 |  8,238 |
| **NEGany~yet_right**     | right     |    204,572.00 |    6.50 |    0.92 |  1,254.20 |    202 | 3,226,213 |    211 |
| **NEGany~yet_final**     | final     |      9,657.00 |    7.45 |    0.91 |  4,028.75 |    659 | 3,226,213 |    699 |

No bigrams found in loaded `NEGmirror` AM table.

### 8. _longer_


#### Top 5 `RBdirect` "longer_*" bigrams (sorted by `dP1`; `LRC > 1`)


|                          | `adj`     |   `adj_total` |   `LRC` |   `dP1` |   `G2` |    `f` |       `f1` |   `f2` |
|:-------------------------|:----------|--------------:|--------:|--------:|-------:|-------:|-----------:|-------:|
| **COM~longer_lasting**   | lasting   |     24,344.00 |    1.44 |    0.04 | 244.09 |  3,860 | 83,102,035 |  3,866 |
| **COM~longer_enough**    | enough    |    453,790.00 |    1.41 |    0.03 | 216.98 |  3,952 | 83,102,035 |  3,964 |
| **COM~longer_able**      | able      |    428,268.00 |    2.28 |    0.03 | 623.67 | 11,677 | 83,102,035 | 11,716 |
| **COM~longer_available** | available |    866,272.00 |    2.45 |    0.03 | 974.55 | 18,865 | 83,102,035 | 18,935 |
| **COM~longer_necessary** | necessary |    187,396.00 |    1.27 |    0.03 | 220.07 |  5,365 | 83,102,035 |  5,399 |

No bigrams found in loaded `NEGmirror` AM table.

### 9. _immediately_


#### Top 5 `RBdirect` "immediately_*" bigrams (sorted by `dP1`; `LRC > 1`)


|                                  | `adj`     |   `adj_total` |   `LRC` |   `dP1` |       `G2` |    `f` |      `f1` |   `f2` |
|:---------------------------------|:----------|--------------:|--------:|--------:|-----------:|-------:|----------:|-------:|
| **NEGany~immediately_possible**  | possible  |    364,265.00 |    7.68 |    0.90 |   6,269.26 |  1,027 | 3,226,213 |  1,091 |
| **NEGany~immediately_clear**     | clear     |    491,108.00 |    8.32 |    0.90 | 153,302.22 | 25,276 | 3,226,213 | 27,066 |
| **NEGany~immediately_available** | available |    866,272.00 |    5.77 |    0.66 | 102,962.94 | 21,297 | 3,226,213 | 30,725 |
| **NEGany~immediately_able**      | able      |    428,268.00 |    4.87 |    0.58 |   2,851.84 |    639 | 3,226,213 |  1,036 |
| **NEGany~immediately_obvious**   | obvious   |    193,498.00 |    4.59 |    0.49 |   9,043.23 |  2,258 | 3,226,213 |  4,305 |


#### Top 1 `NEGmirror` "immediately_*" bigrams (sorted by `dP1`; `LRC > 1`)


|                                  | `adj`     |   `adj_total` |   `LRC` |   `dP1` |   `G2` |   `f` |    `f1` |   `f2` |
|:---------------------------------|:----------|--------------:|--------:|--------:|-------:|------:|--------:|-------:|
| **NEGmir~immediately_available** | available |     12,636.00 |    1.94 |    0.43 | 254.47 |   162 | 289,770 |    274 |



In [99]:
bigram_dfs['RBdirect'].filter(like='~before_', axis=0)

Unnamed: 0_level_0,l1,l2,f,E11,am_log_likelihood,am_odds_ratio_disc,...,f1_sqrt,f2_sqrt,adv,adj,adv_total,adj_total
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1


In [100]:
for key, info in samples_dict.items():
    if key in ('bigrams', 'adj'):
        key = f'ALL {key.replace("adj", "adjectives")}'
    formatted_iter = [
        f'_{a.replace("_", " ")}_' for a
        in (info['adj'] if isinstance(info, dict)
            else info)]
    print_iter(formatted_iter,
               header=f'1. _{key}_ ({len(formatted_iter)} unique)',
               bullet='1.', indent=3)


1. _necessarily_ (12 unique)
   1. _representative_
   1. _surprising_
   1. _wrong_
   1. _bad_
   1. _available_
   1. _related_
   1. _illegal_
   1. _indicative_
   1. _true_
   1. _sure_
   1. _easy_
   1. _interested_

1. _exactly_ (10 unique)
   1. _clear_
   1. _subtle_
   1. _surprising_
   1. _fun_
   1. _new_
   1. _conducive_
   1. _cheap_
   1. _sure_
   1. _happy_
   1. _easy_

1. _that_ (19 unique)
   1. _hard_
   1. _good_
   1. _dissimilar_
   1. _simple_
   1. _noticeable_
   1. _close_
   1. _exciting_
   1. _surprising_
   1. _great_
   1. _difficult_
   1. _easy_
   1. _big_
   1. _common_
   1. _uncommon_
   1. _popular_
   1. _fond_
   1. _impressed_
   1. _complicated_
   1. _interested_

1. _before_ (0 unique)


1. _any_ (12 unique)
   1. _simpler_
   1. _worse_
   1. _better_
   1. _easier_
   1. _younger_
   1. _harder_
   1. _happier_
   1. _safer_
   1. _clearer_
   1. _bigger_
   1. _closer_
   1. _different_

1. _ever_ (7 unique)
   1. _simple_
   1. _pe

In [101]:
NEG_bigrams_sample = pd.concat(
    (ad['both'] for ad in samples_dict.values() if isinstance(ad, dict))).sort_values('LRC', ascending=False)

In [102]:
top_NEGbigram_df_path = TOP_AM_DIR.joinpath(
    f'Top{K}_NEG-ADV_top-{bigram_k}-bigrams.{timestamp_today()}.csv')
print(top_NEGbigram_df_path)
NEG_bigrams_sample.to_csv(top_NEGbigram_df_path)
nb_show_table(NEG_bigrams_sample.sort_values('LRC', ascending=False), outpath= top_NEGbigram_df_path.with_suffix('.md'))

/share/compling/projects/sanpi/results/top_AM/Top5_NEG-ADV_top-10-bigrams.2024-05-23.csv

|                                       |    `f` |   `dP1` |   `LRC` |       `G2` |        `N` |       `f1` |   `f2` |   `exp_f` |   `unexp_f` | `l1`       | `l2`                       | `adj`          |   `adj_total` | `adv`       |   `adv_total` |
|:--------------------------------------|-------:|--------:|--------:|-----------:|-----------:|-----------:|-------:|----------:|------------:|:-----------|:---------------------------|:---------------|--------------:|:------------|--------------:|
| **NEGany~yet_clear**                  | 10,553 |    0.95 |   10.26 |  67,924.56 | 86,330,752 |  3,226,213 | 10,693 |    399.60 |   10,153.40 | NEGATED    | yet_clear                  | clear          |    491,108.00 | yet         |    101,707.00 |
| **NEGany~yet_ready**                  |  7,611 |    0.93 |    9.23 |  48,012.06 | 86,330,752 |  3,226,213 |  7,838 |    292.91 |    7,318.09 | NEGATED    | ye

In [103]:
NEG_bigrams_sample.l1.value_counts()

l1
NEGATED       60
NEGMIR        24
COMPLEMENT     5
Name: count, dtype: Int64

In [104]:
nb_show_table(NEG_bigrams_sample.filter(like='O', axis=0))


|                          |    `f` |   `dP1` |   `LRC` |   `G2` |        `N` |       `f1` |   `f2` |   `exp_f` |   `unexp_f` | `l1`       | `l2`             | `adj`     |   `adj_total` | `adv`   |   `adv_total` |
|:-------------------------|-------:|--------:|--------:|-------:|-----------:|-----------:|-------:|----------:|------------:|:-----------|:-----------------|:----------|--------------:|:--------|--------------:|
| **COM~longer_available** | 18,865 |    0.03 |    2.45 | 974.55 | 86,330,752 | 83,102,035 | 18,935 | 18,226.84 |      638.16 | COMPLEMENT | longer_available | available |    866,272.00 | longer  |    157,984.00 |
| **COM~longer_able**      | 11,677 |    0.03 |    2.28 | 623.67 | 86,330,752 | 83,102,035 | 11,716 | 11,277.83 |      399.17 | COMPLEMENT | longer_able      | able      |    428,268.00 | longer  |    157,984.00 |
| **COM~longer_lasting**   |  3,860 |    0.04 |    1.44 | 244.09 | 86,330,752 | 83,102,035 |  3,866 |  3,721.41 |      138.59 | COMPLEMENT | lo