# Identifying Adverbs with Strongest Negative Environment Associations

In [1]:
from pathlib import Path

import pandas as pd

from source.utils import PKL_SUFF
from source.utils.associate import AM_DF_DIR, TOP_AM_DIR, adjust_assoc_columns
from source.utils.general import print_iter, snake_to_camel, timestamp_today

# Frequency floors used to choose which association table to load: the value is
# interpolated into the `*35f-7c_min{floor}x*` filename glob further down
# (SET_FLOOR for the `RBdirect` directory, MIR_FLOOR for every other directory).
SET_FLOOR = 2000
MIR_FLOOR = 200
# Number of top rows requested per metric (`k`) when combining both tables;
# `K//2` is also used as the size of the random preview samples.
K = 5



Set columns and display settings

In [2]:
# Columns kept (in this order) whenever an association table is displayed.
# NOTE(review): the rendered tables show shorter headers (`dP1`, `LRC`, `G2`,
# `exp_f`, `unexp_f`) — presumably the result of `adjust_assoc_columns`; confirm
# the exact mapping against that helper.
FOCUS = ['f',
         'am_p1_given2', 'conservative_log_ratio',
         'am_log_likelihood',
        #  'mutual_information', 'am_odds_ratio_disc', 't_score',
         'N', 'f1', 'f2', 'E11', 'unexpected_f', 
         'l1', 'l2']
# pandas display options: narrow cells, at most 12 columns, 90-character width.
pd.set_option('display.max_colwidth', 20)
pd.set_option('display.max_columns', 12)
pd.set_option('display.width', 90)
# Two decimal places for both the plain repr and the Styler output,
# with a comma as the thousands separator.
pd.set_option("display.precision", 2)
pd.set_option("styler.format.precision", 2)
pd.set_option("styler.format.thousands", ",")
pd.set_option("display.float_format", '{:,.2f}'.format)
# pd.set_option("styler.render.repr", "html")

In [3]:
def force_ints(_df):
    """Cast the count-like columns of `_df` to integers and return `_df`.

    A column is treated as count-like when its name contains ``total`` or
    starts with ``f`` or ``N``. The frame passed in is modified (its selected
    columns are reassigned) and the very same object is returned.
    """
    count_pattern = r'total|^[fN]'
    targets = _df.filter(regex=count_pattern).columns
    as_integers = _df[targets].astype('int')
    _df[targets] = as_integers
    return _df

In [4]:
def nb_show_table(df, n_dec: int = 2,
                  adjust_columns: bool = True,
                  outpath: Path | str | None = None,
                  return_df: bool = False) -> pd.DataFrame | None:
    """Print `df` as a markdown table (via `DataFrame.to_markdown`).

    The input is never modified; all formatting happens on a copy.

    Args:
        df: table to render.
        n_dec: number of decimal places for floats (thousands are
            comma-separated for both floats and ints).
        adjust_columns: when True, the copy is first passed through the
            project helper `adjust_assoc_columns`.
        outpath: optional file (``Path`` or ``str``) the markdown text is also
            written to, encoded as UTF-8.
        return_df: when True, return the formatted copy (column labels wrapped
            in backticks, index labels wrapped in ``**``).

    Returns:
        The formatted copy if `return_df` is True, otherwise None.
    """
    _df = df.copy()
    if adjust_columns:
        _df = adjust_assoc_columns(_df)
    # Backticks / bold markers make the labels render as code / bold text.
    _df.columns = [f'`{c}`' for c in _df.columns]
    _df.index = [f'**{r}**' for r in _df.index]
    table = _df.to_markdown(floatfmt=f',.{n_dec}f', intfmt=',')
    if outpath:
        # Explicit encoding: labels may contain non-ASCII characters and the
        # platform default encoding is not guaranteed to handle them.
        Path(outpath).write_text(table, encoding='utf8')

    print(f'\n{table}\n')
    return _df if return_df else None

## Set paths and load adverb association tables

In [5]:
def update_index(df, pat_name: str = None):
    """Replace the ambiguous ``NEG`` prefix in the index keys of `df`.

    Both polarity tables abbreviate their negative environment to ``NEG`` in
    the index, which becomes ambiguous once the tables are combined. The text
    ``NEG`` in every index label is replaced by `pat_name` if given; otherwise
    by ``NEGmir`` when the `l1` value of the first ``NEG`` row ends with
    ``MIR``, else by ``NEGany``.

    The replacement keeps the ``NEG`` prefix because later filtering relies
    on it. `df` is modified in place (its index is reassigned) and returned.
    A frame without any ``NEG`` row is returned unchanged.
    """
    neg_rows = df.filter(like='NEG', axis=0)
    if neg_rows.empty:
        # No label contains 'NEG', so the replacement would be a no-op anyway.
        return df

    if pat_name:
        index_update = pat_name
    else:
        # `.iloc[0]` = first row by position. Plain `[0]` on a label-indexed
        # Series is a deprecated positional fallback in recent pandas.
        neg_env_name = str(neg_rows['l1'].iloc[0])
        # > will be either `NEGATED` or `NEGMIR`
        index_update = 'NEGmir' if neg_env_name.endswith('MIR') else 'NEGany'

    # Literal (non-regex) replacement, independent of the pandas default.
    df.index = df.index.str.replace('NEG', index_update, regex=False)
    return df

In [32]:
POLAR_DIR = AM_DF_DIR.joinpath('polar')

polar_adv_dirs = []
# Example of a matching file (the floor in the name follows SET_FLOOR / MIR_FLOOR):
# results/assoc_df/polar/RBdirect/adv/extra/polarized-adv_35f-7c_min5000x_extra.pkl.gz
# One table path per pattern directory under POLAR_DIR, keyed by directory name.
# * only directories are considered, so a stray file in POLAR_DIR cannot
#   trigger an IndexError
# * matches are sorted, so the chosen file is deterministic when several match
adv_am_paths = {
    p.name: sorted(
        p.joinpath('adv/extra').glob(
            f'*35f-7c_min{SET_FLOOR if p.name == "RBdirect" else MIR_FLOOR}x*{PKL_SUFF}')
    )[0]
    for p in POLAR_DIR.iterdir()
    if p.is_dir()}

# NOTE: `read_pickle` executes pickled code; only load files produced by this project.
setdiff_adv = update_index(pd.read_pickle(adv_am_paths['RBdirect']))
mirror_adv = update_index(pd.read_pickle(adv_am_paths['NEGmirror']))
# Preview a random sample of K//2 rows, highest `conservative_log_ratio` first.
nb_show_table(setdiff_adv.sample(K//2).sort_values('conservative_log_ratio', ascending=False)[FOCUS])


|                          |    `f` |   `dP1` |   `LRC` |      `G2` |        `N` |       `f1` |    `f2` |   `exp_f` |   `unexp_f` | `l1`       | `l2`          |
|:-------------------------|-------:|--------:|--------:|----------:|-----------:|-----------:|--------:|----------:|------------:|:-----------|:--------------|
| **COM~reasonably**       | 63,753 |    0.02 |    1.16 |  1,226.62 | 86,330,752 | 83,102,035 |  64,688 | 62,268.71 |    1,484.29 | COMPLEMENT | reasonably    |
| **NEGany~significantly** |  3,013 |   -0.02 |   -0.67 | -1,120.96 | 86,330,752 |  3,226,213 | 139,099 |  5,198.18 |   -2,185.18 | NEGATED    | significantly |



In [35]:
# Preview a random sample of K//2 rows from the mirror table, highest
# `conservative_log_ratio` first, restricted to the FOCUS columns.
nb_show_table(mirror_adv.sample(K//2).sort_values('conservative_log_ratio', ascending=False)[FOCUS])


|                      |   `f` |   `dP1` |   `LRC` |   `G2` |       `N` |      `f1` |   `f2` |   `exp_f` |   `unexp_f` | `l1`   | `l2`       |
|:---------------------|------:|--------:|--------:|-------:|----------:|----------:|-------:|----------:|------------:|:-------|:-----------|
| **POS~absolutely**   | 5,834 |    0.06 |    0.52 | 202.52 | 2,032,082 | 1,738,105 |  6,384 |  5,460.44 |      373.56 | POSMIR | absolutely |
| **NEGmir~seriously** |   435 |   -0.05 |   -0.28 | -88.22 | 2,032,082 |   293,963 |  4,453 |    644.18 |     -209.18 | NEGMIR | seriously  |



## Calculate "Most Negative" Adverbs for each Polarity Approximation

In [44]:
def get_top_vals(df: pd.DataFrame,
                 index_like: str = 'NEG',
                 metric_filter: str | list = ['am_p1_given2', 'conservative_log_ratio'],
                 k: int = 10,
                 val_col: str = None,
                 ignore_neg_adv: bool = True):
    env_df = df.copy().loc[df.conservative_log_ratio >=
                           1].filter(like=index_like, axis=0)
    if ignore_neg_adv:
        env_df = env_df.loc[~df.l2.isin(
            ("n't", 'not', 'barely', 'never', 'no', 'none')), :]
    if isinstance(metric_filter, str):
        metric_filter = [metric_filter]

    top = pd.concat([env_df.nlargest(k, m) for m in metric_filter]
                    ).drop_duplicates(keep='first')

    if val_col:
        top = top[[val_col] + metric_filter]

    return top.sort_values(metric_filter, ascending=False)


# Rank each approximation separately, requesting 15 rows per metric.
setdiff_top15 = get_top_vals(setdiff_adv, k=15)
mirror_top15 = get_top_vals(mirror_adv, k=15)
nb_show_table(setdiff_top15.filter(items=FOCUS).reset_index())


|        | `key`              |     `f` |   `dP1` |   `LRC` |       `G2` |        `N` |      `f1` |    `f2` |   `exp_f` |   `unexp_f` | `l1`    | `l2`        |
|:-------|:-------------------|--------:|--------:|--------:|-----------:|-----------:|----------:|--------:|----------:|------------:|:--------|:------------|
| **0**  | NEGany~necessarily |  42,708 |    0.72 |    6.23 | 219,003.46 | 86,330,752 | 3,226,213 |  56,694 |  2,118.68 |   40,589.32 | NEGATED | necessarily |
| **1**  | NEGany~exactly     |  43,635 |    0.67 |    5.90 | 214,404.20 | 86,330,752 | 3,226,213 |  61,599 |  2,301.98 |   41,333.02 | NEGATED | exactly     |
| **2**  | NEGany~that        | 165,411 |    0.63 |    5.62 | 781,016.11 | 86,330,752 | 3,226,213 | 250,392 |  9,357.24 |  156,053.76 | NEGATED | that        |
| **3**  | NEGany~immediately |  57,319 |    0.52 |    4.96 | 239,462.58 | 86,330,752 | 3,226,213 | 103,177 |  3,855.76 |   53,463.24 | NEGATED | immediately |
| **4**  | NEGany~yet         |  52,546

15 Most Negatively Associated Adverbs for full dataset (_Absent Negative_ approximation) as ranked by $\Delta P(1|2)$ (`dP1`) and $LRC$

|        | `key`              |     `f` |   `dP1` |   `LRC` |       `G2` |        `N` |      `f1` |    `f2` |   `exp_f` |   `unexp_f` | `l1`    | `l2`        |
|:-------|:-------------------|--------:|--------:|--------:|-----------:|-----------:|----------:|--------:|----------:|------------:|:--------|:------------|
| **0**  | NEGany~necessarily |  42,708 |    0.72 |    6.23 | 219,003.46 | 86,330,752 | 3,226,213 |  56,694 |  2,118.68 |   40,589.32 | NEGATED | necessarily |
| **1**  | NEGany~exactly     |  43,635 |    0.67 |    5.90 | 214,404.20 | 86,330,752 | 3,226,213 |  61,599 |  2,301.98 |   41,333.02 | NEGATED | exactly     |
| **2**  | NEGany~that        | 165,411 |    0.63 |    5.62 | 781,016.11 | 86,330,752 | 3,226,213 | 250,392 |  9,357.24 |  156,053.76 | NEGATED | that        |
| **3**  | NEGany~immediately |  57,319 |    0.52 |    4.96 | 239,462.58 | 86,330,752 | 3,226,213 | 103,177 |  3,855.76 |   53,463.24 | NEGATED | immediately |
| **4**  | NEGany~yet         |  52,546 |    0.48 |    4.74 | 209,055.78 | 86,330,752 | 3,226,213 | 101,707 |  3,800.83 |   48,745.17 | NEGATED | yet         |
| **5**  | NEGany~terribly    |  18,054 |    0.22 |    3.09 |  42,704.93 | 86,330,752 | 3,226,213 |  70,174 |  2,622.43 |   15,431.57 | NEGATED | terribly    |
| **6**  | NEGany~remotely    |   5,679 |    0.22 |    3.03 |  13,354.33 | 86,330,752 | 3,226,213 |  22,194 |    829.40 |    4,849.60 | NEGATED | remotely    |
| **7**  | NEGany~only        | 114,070 |    0.21 |    3.04 | 261,936.36 | 86,330,752 | 3,226,213 | 464,168 | 17,346.13 |   96,723.87 | NEGATED | only        |
| **8**  | NEGany~altogether  |   4,575 |    0.18 |    2.75 |   9,468.00 | 86,330,752 | 3,226,213 |  20,636 |    771.17 |    3,803.82 | NEGATED | altogether  |
| **9**  | NEGany~entirely    |  63,708 |    0.17 |    2.74 | 125,925.14 | 86,330,752 | 3,226,213 | 303,833 | 11,354.35 |   52,353.65 | NEGATED | entirely    |
| **10** | NEGany~overly      |  24,707 |    0.17 |    2.66 |  46,993.58 | 86,330,752 | 3,226,213 | 122,058 |  4,561.35 |   20,145.65 | NEGATED | overly      |
| **11** | NEGany~merely      |   5,944 |    0.13 |    2.26 |   9,223.66 | 86,330,752 | 3,226,213 |  35,608 |  1,330.68 |    4,613.32 | NEGATED | merely      |
| **12** | NEGany~any         |  15,492 |    0.13 |    2.28 |  23,683.00 | 86,330,752 | 3,226,213 |  94,152 |  3,518.50 |   11,973.50 | NEGATED | any         |
| **13** | NEGany~always      | 104,605 |    0.12 |    2.28 | 157,437.56 | 86,330,752 | 3,226,213 | 651,053 | 24,330.10 |   80,274.90 | NEGATED | always      |
| **14** | NEGany~directly    |   8,317 |    0.12 |    2.13 |  11,654.57 | 86,330,752 | 3,226,213 |  54,441 |  2,034.48 |    6,282.52 | NEGATED | directly    |

In [39]:
# Same table for the mirror subset; `reset_index()` turns the row keys into an ordinary column.
nb_show_table(mirror_top15.filter(items=FOCUS).reset_index())


|        | `key`                |   `f` |   `dP1` |   `LRC` |      `G2` |       `N` |    `f1` |   `f2` |   `exp_f` |   `unexp_f` | `l1`   | `l2`          |
|:-------|:---------------------|------:|--------:|--------:|----------:|----------:|--------:|-------:|----------:|------------:|:-------|:--------------|
| **0**  | NEGmir~before        |   290 |    0.84 |    5.11 |  1,080.52 | 2,032,082 | 293,963 |    294 |     42.53 |      247.47 | NEGMIR | before        |
| **1**  | NEGmir~ever          | 4,718 |    0.77 |    5.57 | 15,340.34 | 2,032,082 | 293,963 |  5,179 |    749.20 |    3,968.80 | NEGMIR | ever          |
| **2**  | NEGmir~exactly       |   813 |    0.59 |    3.51 |  1,939.47 | 2,032,082 | 293,963 |  1,114 |    161.15 |      651.85 | NEGMIR | exactly       |
| **3**  | NEGmir~any           | 1,082 |    0.57 |    3.48 |  2,511.26 | 2,032,082 | 293,963 |  1,514 |    219.02 |      862.98 | NEGMIR | any           |
| **4**  | NEGmir~remotely      | 1,846 |    0.54 |    3.35 |  

15 Most Negatively Associated Adverbs for `mirror` subset (_Present Positive_ approximation) as ranked by $\Delta P(1|2)$ (`dP1`) and $LRC$

|        | `key`                |   `f` |   `dP1` |   `LRC` |      `G2` |       `N` |    `f1` |   `f2` |   `exp_f` |   `unexp_f` | `l1`   | `l2`          |
|:-------|:---------------------|------:|--------:|--------:|----------:|----------:|--------:|-------:|----------:|------------:|:-------|:--------------|
| **0**  | NEGmir~before        |   290 |    0.84 |    5.11 |  1,080.52 | 2,032,082 | 293,963 |    294 |     42.53 |      247.47 | NEGMIR | before        |
| **1**  | NEGmir~ever          | 4,718 |    0.77 |    5.57 | 15,340.34 | 2,032,082 | 293,963 |  5,179 |    749.20 |    3,968.80 | NEGMIR | ever          |
| **2**  | NEGmir~exactly       |   813 |    0.59 |    3.51 |  1,939.47 | 2,032,082 | 293,963 |  1,114 |    161.15 |      651.85 | NEGMIR | exactly       |
| **3**  | NEGmir~any           | 1,082 |    0.57 |    3.48 |  2,511.26 | 2,032,082 | 293,963 |  1,514 |    219.02 |      862.98 | NEGMIR | any           |
| **4**  | NEGmir~remotely      | 1,846 |    0.54 |    3.35 |  4,009.84 | 2,032,082 | 293,963 |  2,717 |    393.04 |    1,452.96 | NEGMIR | remotely      |
| **5**  | NEGmir~particularly  | 9,278 |    0.48 |    3.15 | 17,999.07 | 2,032,082 | 293,963 | 14,954 |  2,163.26 |    7,114.74 | NEGMIR | particularly  |
| **6**  | NEGmir~that          | 4,338 |    0.44 |    2.86 |  7,632.21 | 2,032,082 | 293,963 |  7,472 |  1,080.91 |    3,257.09 | NEGMIR | that          |
| **7**  | NEGmir~necessarily   |   971 |    0.43 |    2.66 |  1,688.91 | 2,032,082 | 293,963 |  1,681 |    243.18 |      727.82 | NEGMIR | necessarily   |
| **8**  | NEGmir~inherently    | 2,872 |    0.36 |    2.42 |  4,160.38 | 2,032,082 | 293,963 |  5,649 |    817.19 |    2,054.81 | NEGMIR | inherently    |
| **9**  | NEGmir~overtly       |   392 |    0.29 |    1.71 |    443.78 | 2,032,082 | 293,963 |    898 |    129.91 |      262.09 | NEGMIR | overtly       |
| **10** | NEGmir~intrinsically |   432 |    0.29 |    1.73 |    487.95 | 2,032,082 | 293,963 |    991 |    143.36 |      288.64 | NEGMIR | intrinsically |
| **11** | NEGmir~especially    | 1,573 |    0.21 |    1.49 |  1,232.03 | 2,032,082 | 293,963 |  4,400 |    636.51 |      936.49 | NEGMIR | especially    |
| **12** | NEGmir~yet           |   320 |    0.21 |    1.18 |    242.23 | 2,032,082 | 293,963 |    909 |    131.50 |      188.50 | NEGMIR | yet           |
| **13** | NEGmir~fully         | 1,668 |    0.18 |    1.31 |  1,086.24 | 2,032,082 | 293,963 |  5,084 |    735.46 |      932.54 | NEGMIR | fully         |
| **14** | NEGmir~terribly      | 1,579 |    0.16 |    1.14 |    847.65 | 2,032,082 | 293,963 |  5,218 |    754.84 |      824.16 | NEGMIR | terribly      |

### Or here, the least "negative"/most "non-negative"

In [47]:
# Repeat the ranking for the non-negative environment of each table, requesting
# 10 rows per metric and displaying them by descending `conservative_log_ratio`.
for adv_df in (setdiff_adv, mirror_adv):
    nb_show_table(
        get_top_vals(
            adv_df.filter(items=FOCUS), 
            k=10,
            index_like='O',  # should match "POS" & "COM", but neither "NEG*"
            ).sort_values('conservative_log_ratio', ascending=False)
    )


|                      |       `f` |   `dP1` |   `LRC` |       `G2` |        `N` |       `f1` |      `f2` |      `exp_f` |   `unexp_f` | `l1`       | `l2`         |
|:---------------------|----------:|--------:|--------:|-----------:|-----------:|-----------:|----------:|-------------:|------------:|:-----------|:-------------|
| **COM~increasingly** |   404,356 |    0.04 |    6.00 |  29,076.69 | 86,330,752 | 83,102,035 |   404,521 |   389,392.16 |   14,963.84 | COMPLEMENT | increasingly |
| **COM~relatively**   |   626,369 |    0.04 |    5.24 |  42,957.87 | 86,330,752 | 83,102,035 |   626,884 |   603,438.92 |   22,930.08 | COMPLEMENT | relatively   |
| **COM~almost**       |   466,468 |    0.04 |    4.85 |  31,107.72 | 86,330,752 | 83,102,035 |   466,967 |   449,502.72 |   16,965.28 | COMPLEMENT | almost       |
| **COM~seemingly**    |   176,135 |    0.04 |    4.77 |  11,864.41 | 86,330,752 | 83,102,035 |   176,304 |   169,710.34 |    6,424.66 | COMPLEMENT | seemingly    |
| **COM~m

#### 10 Most Positively/Non-Negatively Associated Adverbs  as ranked by $\Delta P(1|2)$ (`dP1`) and $LRC$ (sorted by `LRC`)

full dataset (_Absent Negation_ approximation)

|                      |       `f` |   `dP1` |   `LRC` |       `G2` |        `N` |       `f1` |      `f2` |      `exp_f` |   `unexp_f` | `l1`       | `l2`         |
|:---------------------|----------:|--------:|--------:|-----------:|-----------:|-----------:|----------:|-------------:|------------:|:-----------|:-------------|
| **COM~increasingly** |   404,356 |    0.04 |    6.00 |  29,076.69 | 86,330,752 | 83,102,035 |   404,521 |   389,392.16 |   14,963.84 | COMPLEMENT | increasingly |
| **COM~relatively**   |   626,369 |    0.04 |    5.24 |  42,957.87 | 86,330,752 | 83,102,035 |   626,884 |   603,438.92 |   22,930.08 | COMPLEMENT | relatively   |
| **COM~almost**       |   466,468 |    0.04 |    4.85 |  31,107.72 | 86,330,752 | 83,102,035 |   466,967 |   449,502.72 |   16,965.28 | COMPLEMENT | almost       |
| **COM~seemingly**    |   176,135 |    0.04 |    4.77 |  11,864.41 | 86,330,752 | 83,102,035 |   176,304 |   169,710.34 |    6,424.66 | COMPLEMENT | seemingly    |
| **COM~mostly**       |   212,255 |    0.04 |    4.71 |  14,160.67 | 86,330,752 | 83,102,035 |   212,478 |   204,531.45 |    7,723.55 | COMPLEMENT | mostly       |
| **COM~pretty**       | 1,650,041 |    0.04 |    4.64 | 107,081.72 | 86,330,752 | 83,102,035 | 1,652,360 | 1,590,562.75 |   59,478.25 | COMPLEMENT | pretty       |
| **COM~fairly**       |   401,326 |    0.04 |    4.50 |  25,904.34 | 86,330,752 | 83,102,035 |   401,879 |   386,848.97 |   14,477.03 | COMPLEMENT | fairly       |
| **COM~partly**       |    80,461 |    0.04 |    4.50 |   5,418.01 | 86,330,752 | 83,102,035 |    80,538 |    77,525.93 |    2,935.07 | COMPLEMENT | partly       |
| **COM~rather**       |   402,067 |    0.04 |    4.44 |  25,775.15 | 86,330,752 | 83,102,035 |   402,648 |   387,589.21 |   14,477.79 | COMPLEMENT | rather       |
| **COM~largely**      |   186,382 |    0.04 |    4.36 |  12,018.96 | 86,330,752 | 83,102,035 |   186,638 |   179,657.85 |    6,724.15 | COMPLEMENT | largely      |
| **COM~supposedly**   |    30,854 |    0.04 |    4.13 |   2,118.60 | 86,330,752 | 83,102,035 |    30,878 |    29,723.18 |    1,130.82 | COMPLEMENT | supposedly   |
| **COM~most**         | 7,713,908 |    0.04 |    3.84 | 465,492.10 | 86,330,752 | 83,102,035 | 7,734,027 | 7,444,779.15 |  269,128.85 | COMPLEMENT | most         |
| **COM~albeit**       |    17,169 |    0.04 |    3.53 |   1,270.78 | 86,330,752 | 83,102,035 |    17,172 |    16,529.78 |      639.22 | COMPLEMENT | albeit       |
| **COM~presumably**   |     8,011 |    0.04 |    2.59 |     568.20 | 86,330,752 | 83,102,035 |     8,015 |     7,715.24 |      295.76 | COMPLEMENT | presumably   |
| **COM~alternately**  |     4,148 |    0.04 |    1.11 |     294.82 | 86,330,752 | 83,102,035 |     4,150 |     3,994.79 |      153.21 | COMPLEMENT | alternately  |

`mirror` subset (_Present Positive_ approximation) 

|                    |    `f` |   `dP1` |   `LRC` |     `G2` |       `N` |      `f1` |   `f2` |   `exp_f` |   `unexp_f` | `l1`   | `l2`       |
|:-------------------|-------:|--------:|--------:|---------:|----------:|----------:|-------:|----------:|------------:|:-------|:-----------|
| **POS~pretty**     | 26,788 |    0.14 |    4.48 | 7,278.87 | 2,032,082 | 1,738,105 | 26,919 | 23,024.69 |    3,763.31 | POSMIR | pretty     |
| **POS~rather**     |  9,290 |    0.14 |    4.34 | 2,607.01 | 2,032,082 | 1,738,105 |  9,322 |  7,973.41 |    1,316.59 | POSMIR | rather     |
| **POS~plain**      |  6,049 |    0.14 |    4.19 | 1,733.36 | 2,032,082 | 1,738,105 |  6,065 |  5,187.59 |      861.41 | POSMIR | plain      |
| **POS~otherwise**  |  9,368 |    0.14 |    4.12 | 2,558.73 | 2,032,082 | 1,738,105 |  9,410 |  8,048.68 |    1,319.32 | POSMIR | otherwise  |
| **POS~fairly**     |  6,184 |    0.14 |    3.97 | 1,713.96 | 2,032,082 | 1,738,105 |  6,208 |  5,309.90 |      874.10 | POSMIR | fairly     |
| **POS~somewhat**   |  4,961 |    0.14 |    3.87 | 1,391.12 | 2,032,082 | 1,738,105 |  4,978 |  4,257.84 |      703.16 | POSMIR | somewhat   |
| **POS~downright**  |  5,502 |    0.14 |    3.63 | 1,465.04 | 2,032,082 | 1,738,105 |  5,532 |  4,731.70 |      770.30 | POSMIR | downright  |
| **POS~already**    |  5,035 |    0.14 |    3.56 | 1,336.93 | 2,032,082 | 1,738,105 |  5,063 |  4,330.55 |      704.45 | POSMIR | already    |
| **POS~relatively** |  5,774 |    0.14 |    3.51 | 1,496.02 | 2,032,082 | 1,738,105 |  5,812 |  4,971.19 |      802.81 | POSMIR | relatively |
| **POS~maybe**      |  2,998 |    0.14 |    3.43 |   857.78 | 2,032,082 | 1,738,105 |  3,006 |  2,571.13 |      426.87 | POSMIR | maybe      |

In [48]:
def load_backup(lower_floor: int = 100,
                loaded_path: Path = adv_am_paths['RBdirect']) -> pd.DataFrame | list:
    """Load the ``NEG`` rows of a lower-floor table stored next to `loaded_path`.

    Looks in the directory of `loaded_path` for a file matching
    ``*35f-7c_min{lower_floor}x*`` with the `PKL_SUFF` suffix and reads the
    first match in sorted order, so the choice is deterministic when several
    files match.

    Args:
        lower_floor: frequency floor interpolated into the filename pattern.
        loaded_path: path of an already-loaded table; only its parent
            directory is used.

    Returns:
        The rows whose index label contains ``NEG``, reduced to the FOCUS
        columns and re-indexed by the `l2` value (index named ``adv``).
        An empty list is returned when no file matches.
    """
    located_paths = sorted(loaded_path.parent.glob(
        f'*35f-7c_min{lower_floor}x*{PKL_SUFF}'))
    if not located_paths:
        # Sentinel kept for backward compatibility with existing callers.
        return []

    # NOTE: `read_pickle` executes pickled code; only load files produced by this project.
    backup_df = pd.read_pickle(located_paths[0])
    backup_df = (backup_df
                 .filter(like='NEG', axis=0)
                 .filter(items=FOCUS)
                 .reset_index()
                 .set_index('l2'))
    backup_df.index.name = 'adv'
    return backup_df


def uncat(df):
    """Convert every categorical column of `df` to the ``string`` dtype.

    The frame is modified (the affected columns are reassigned).

    Returns:
        A ``(df, columns)`` pair: the same frame and the labels of the columns
        that were categorical, so the caller can restore them afterwards.
    """
    categorical = df.select_dtypes('category').columns
    as_strings = df[categorical].astype('string')
    df[categorical] = as_strings
    return df, categorical


def fill_empties(name_1, name_2, both, loaded_paths):
    """Fill the rows of `both` that have no data for one of the two tables.

    For each name (leading/trailing underscores stripped), rows whose
    ``f_{name}`` value is missing are looked up in the lower-floor backup
    table returned by `load_backup` and copied into the matching
    ``*_{name}`` columns.

    Args:
        name_1, name_2: table names or suffixes; ``SET`` selects
            ``loaded_paths['RBdirect']``, anything else
            ``loaded_paths['NEGmirror']``.
        both: joined frame indexed by adverb.
        loaded_paths: mapping from pattern name to the loaded table's path.

    Returns:
        `both`, with whatever values could be recovered filled in. If no
        backup data is found for a name, an error message is printed and that
        name is skipped.
    """
    for name in (name_1, name_2):
        name = name.strip('_')
        path = loaded_paths['RBdirect'] if name == 'SET' else loaded_paths['NEGmirror']
        if not both[f'f_{name}'].isna().any():
            continue

        floor = 100
        neg_backup = load_backup(floor, loaded_path=path)
        # `load_backup` signals "nothing found" with an empty list. Carrying
        # on would fail on `neg_backup.columns`, so report and skip instead.
        if len(neg_backup) == 0:
            print('Error. Backup data not found. [in fill_empties()]')
            continue

        neg_backup.columns = (pd.Series(adjust_assoc_columns(neg_backup.columns)
                                        ) + f'_{name}').to_list()
        # Categorical columns are temporarily converted to strings for the
        # assignment below and restored to category afterwards.
        both, cats = uncat(both)
        neg_backup, __ = uncat(neg_backup)

        undefined_adv = both.loc[
            both[f'f_{name}'].isna(), :].index.to_list()

        both.loc[undefined_adv,
                 neg_backup.columns] = neg_backup.filter(items=undefined_adv, axis=0)

        both[cats] = both[cats].astype('category')

    return both


def combine_top(df_1: pd.DataFrame,
                name_1: str,
                df_2: pd.DataFrame,
                name_2: str,
                env_filter: str = 'NEG',
                filter_items: list = FOCUS,
                k: int = 10) -> pd.DataFrame:
    """Join the top-ranked adverbs of two association tables side by side.

    Steps:
      1. rank each table with `get_top_vals` (`k` rows per metric) and list
         the selected adverbs with `print_iter`;
      2. take the union of both adverb lists (`df_1` adverbs first);
      3. narrow each table to those adverbs (`narrow_selection`) and
         outer-join them, suffixing columns with ``_{name}``;
      4. fill missing cells from backup tables, cast counts to int, then add
         the mean and ratio columns.

    Args:
        df_1, df_2: association tables indexed by environment~adverb key.
        name_1, name_2: short labels for the two tables (used as suffixes).
        env_filter: substring selecting the environment rows.
        filter_items: columns retained before joining.
        k: number of rows requested per metric and table.

    Returns:
        The combined frame indexed by adverb.
    """
    ranked = []
    for frame in (df_1, df_2):
        top_k = get_top_vals(frame, k=k,
                             index_like=env_filter,
                             metric_filter=['am_p1_given2',
                                            'conservative_log_ratio'])
        ranked.append(
            top_k.sort_values('conservative_log_ratio', ascending=False))

    for label, top_k in zip((name_1, name_2), ranked):
        print_iter(
            [f'_{w}_' for w in top_k.l2], bullet='1.',
            header=(f'`{label}`: union of top {k} adverbs ranked by '
                    r'$LRC$ & $\Delta P(\texttt{env}|\texttt{adv})$'))

    first_adv, second_adv = (top_k.l2.to_list() for top_k in ranked)
    top_adv = pd.Series(first_adv + second_adv).drop_duplicates()

    print_iter(
        [f'_{w}_' for w in top_adv], bullet='1.',
        header=f'Union of top adverbs for {name_1} and {name_2}. (Novel {name_2} adverbs listed last)')

    narrowed_1 = narrow_selection(df_1, top_adv, env_filter, filter_items)
    narrowed_2 = narrow_selection(df_2, top_adv, env_filter, filter_items)

    # Normalise the labels to a single leading underscore for use as suffixes.
    suffix_1 = f"_{name_1.strip('_')}"
    suffix_2 = f"_{name_2.strip('_')}"
    both = narrowed_1.join(narrowed_2, how="outer",
                           lsuffix=suffix_1, rsuffix=suffix_2)

    # ! Empty cells need to be filled _before_ calculating mean
    both = fill_empties(suffix_1, suffix_2, both, adv_am_paths)
    return (both
            .pipe(force_ints)
            .pipe(add_means)
            .pipe(add_f_ratio, suffix_2, suffix_1))


def add_f_ratio(df, subset_name, superset_name):
    """Add ``ratio_{count}{subset_name}`` columns: subset count / superset count.

    Count columns are those whose name starts with ``N`` or ``f`` (optionally
    followed by ``1`` or ``2``); the part before the first underscore is the
    count name. For every count name, the column ``{count}{subset_name}`` is
    divided by ``{count}{superset_name}``. After each ratio is added, the
    columns whose name contains the count name are printed.

    `df` is modified in place and returned.
    """
    count_columns = df.filter(regex=r'^[Nf][12]?').columns
    # dict.fromkeys keeps the first occurrence of each prefix, in column order
    prefixes = list(dict.fromkeys(col.split('_')[0] for col in count_columns))
    for prefix in prefixes:
        subset_counts = df[f'{prefix}{subset_name}']
        superset_counts = df[f'{prefix}{superset_name}']
        df[f'ratio_{prefix}{subset_name}'] = subset_counts / superset_counts
        print(df.filter(like=prefix))
    return df

def add_means(both):
    """Add a ``mean_<metric>`` column for every numeric metric in `both`.

    A metric is the name of a numeric column with a trailing ``_MIR`` or
    ``_SET`` removed. Its mean is taken row-wise over exactly the numeric
    columns named ``<metric>``, ``<metric>_MIR`` and ``<metric>_SET``.

    Matching is by exact name, not by prefix: a prefix match would let ``f``
    also pull in the ``f1_*`` and ``f2_*`` columns and average unrelated
    counts together.

    `both` is modified in place and returned. The new column name is built
    with the project helper `snake_to_camel`.
    """
    numeric_cols = both.select_dtypes(include='number').columns
    metrics = (numeric_cols.to_series()
               .str.replace(r'_(MIR|SET)$', '', regex=True).unique())
    for metric in metrics:
        wanted = {metric, f'{metric}_MIR', f'{metric}_SET'}
        members = [col for col in numeric_cols if col in wanted]
        both[f'mean_{snake_to_camel(metric)}'] = (
            both[members].mean(axis='columns'))
    return both


def narrow_selection(df: pd.DataFrame,
                     top_adv: list,
                     env_filter: str = 'NEG',
                     filter_items: list = FOCUS):
    """Reduce `df` to the selected adverbs of one environment and display it.

    Keeps the `filter_items` columns and the rows whose index label contains
    `env_filter`, re-indexes them by the `l2` value and keeps only the
    adverbs listed in `top_adv`. The result is passed through the project
    helper `adjust_assoc_columns`, its index is named ``adv``, and the table
    is printed with `nb_show_table` before being returned.
    """
    focus_columns = df.filter(items=filter_items)
    env_rows = focus_columns.filter(like=env_filter, axis=0)
    by_adverb = env_rows.reset_index().set_index('l2')
    selected = adjust_assoc_columns(by_adverb.filter(top_adv, axis=0))
    selected.index.name = 'adv'
    nb_show_table(selected)

    return selected

## Compile top NEG~adverb associations across both approximation methods

In [57]:
# Combined table of the top adverbs from both approximations (K rows requested
# per metric and table); copies are passed so the loaded tables stay untouched.
C = combine_top(setdiff_adv.copy(), 'SET',
                mirror_adv.copy(), 'MIR', k=K)


`SET`: union of top 5 adverbs ranked by $LRC$ & $\Delta P(\texttt{env}|\texttt{adv})$
1. _necessarily_
1. _exactly_
1. _that_
1. _immediately_
1. _yet_

`MIR`: union of top 5 adverbs ranked by $LRC$ & $\Delta P(\texttt{env}|\texttt{adv})$
1. _ever_
1. _before_
1. _exactly_
1. _any_
1. _remotely_

Union of top adverbs for SET and MIR. (Novel MIR adverbs listed last)
1. _necessarily_
1. _exactly_
1. _that_
1. _immediately_
1. _yet_
1. _ever_
1. _before_
1. _any_
1. _remotely_

|                 | `key`              |     `f` |   `dP1` |   `LRC` |       `G2` |        `N` |      `f1` |    `f2` |   `exp_f` |   `unexp_f` | `l1`    |
|:----------------|:-------------------|--------:|--------:|--------:|-----------:|-----------:|----------:|--------:|----------:|------------:|:--------|
| **necessarily** | NEGany~necessarily |  42,708 |    0.72 |    6.23 | 219,003.46 | 86,330,752 | 3,226,213 |  56,694 |  2,118.68 |   40,589.32 | NEGATED |
| **exactly**     | NEGany~exactly     |  43,635 |    

### Frequency Comparisons between Polarity Approximations: All Data vs. Mirror Subset
The following values indicate the percentage of the negated frequency (`f`) and the marginal frequency (`f2`) accounted for by the `mirror` subset for each adverb. 
That is, `ratio_f_MIR` indicates the percentage of negated tokens with the specific triggers covered by `NEGmirror`, 
and `ratio_f2_MIR` the percentage of all adverb tokens which were captured by _either_ mirror pattern, `POSmirror` or `NEGmirror`. 
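The third column then indicates the discrepancy between these percentages: 
For example, 79.1% of the negated tokens of _ever_ fall within the `mirror` subset, but only 4.2% of all _ever_ tokens do, a discrepancy of 74.9 percentage points. 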

- [ ] 🚩 **finish this discussion!**

Note that _before_ and _ever_ have much higher proportions of their negated tokens represented in the mirror subset. 
However, the discrepancy indicated by the `difference` column, which illuminates the 

#### Percentage Comparison

|                   |  joint % MIR |  adverb % MIR | % MIR $\Delta$ |
|:------------------|-------------:|--------------:|---------------:|
| **ever**          |         79.1 |           4.2 |           74.9 |
| **before**        |         93.2 |          39.3 |           53.9 |
| **inherently**    |         41.9 |          10.3 |           31.7 |
| **intrinsically** |         40.3 |           9.9 |           30.4 |
| **remotely**      |         32.5 |          12.2 |           20.3 |
| **particularly**  |         16.6 |           2.6 |           14.0 |
| **overtly**       |         18.1 |           5.9 |           12.2 |
| **any**           |          7.0 |           1.6 |            5.4 |
| **terribly**      |          8.7 |           7.4 |            1.3 |
| **exactly**       |          1.9 |           1.8 |            0.1 |
| **entirely**      |          3.8 |           3.9 |           -0.1 |
| **yet**           |          0.6 |           0.9 |           -0.3 |
| **that**          |          2.6 |           3.0 |           -0.4 |
| **necessarily**   |          2.3 |           3.0 |           -0.7 |
| **immediately**   |          0.7 |           1.4 |           -0.7 |
| **only**          |          0.2 |           1.1 |           -1.0 |
| **altogether**    |          2.4 |           8.8 |           -6.3 |

In [58]:
# Share of the joint (`f`) and adverb (`f2`) frequencies covered by the mirror
# subset, expressed in percent, plus their difference (`f_minus_f2`);
# rows are ordered by that difference, then by `ratio_f_MIR`, descending.
nb_show_table(C.filter(regex=r'^ratio_f2?_')
              .assign(f_minus_f2=C.ratio_f_MIR - C.ratio_f2_MIR)
              .multiply(100).round(1)
              .sort_values(['f_minus_f2', 'ratio_f_MIR'], ascending=False),
              n_dec=1, adjust_columns=False)



|                 |   `ratio_f_MIR` |   `ratio_f2_MIR` |   `f_minus_f2` |
|:----------------|----------------:|-----------------:|---------------:|
| **ever**        |            79.1 |              4.2 |           74.9 |
| **before**      |            93.2 |             39.3 |           53.9 |
| **remotely**    |            32.5 |             12.2 |           20.3 |
| **any**         |             7.0 |              1.6 |            5.4 |
| **exactly**     |             1.9 |              1.8 |            0.1 |
| **yet**         |             0.6 |              0.9 |           -0.3 |
| **that**        |             2.6 |              3.0 |           -0.4 |
| **necessarily** |             2.3 |              3.0 |           -0.7 |
| **immediately** |             0.7 |              1.4 |           -0.7 |



#### Joint (_Negated_) Frequency Comparison

|                   |   `total negations` |   `mirror subset negations` |   `negations not in mirror subset` |
|:------------------|--------------------:|----------------------------:|-----------------------------------:|
| **that**          |             165,411 |                       4,338 |                            161,073 |
| **only**          |             114,070 |                         173 |                            113,897 |
| **entirely**      |              63,708 |                       2,429 |                             61,279 |
| **immediately**   |              57,319 |                         407 |                             56,912 |
| **yet**           |              52,546 |                         320 |                             52,226 |
| **particularly**  |              55,799 |                       9,278 |                             46,521 |
| **exactly**       |              43,635 |                         813 |                             42,822 |
| **necessarily**   |              42,708 |                         971 |                             41,737 |
| **terribly**      |              18,054 |                       1,579 |                             16,475 |
| **any**           |              15,492 |                       1,082 |                             14,410 |
| **altogether**    |               4,575 |                         112 |                              4,463 |
| **inherently**    |               6,847 |                       2,872 |                              3,975 |
| **remotely**      |               5,679 |                       1,846 |                              3,833 |
| **overtly**       |               2,169 |                         392 |                              1,777 |
| **ever**          |               5,967 |                       4,718 |                              1,249 |
| **intrinsically** |               1,071 |                         432 |                                639 |
| **before**        |                 311 |                         290 |                                 21 |

In [59]:
# Joint frequencies per adverb: `f_SET`, `f_MIR` and their difference
# (`f_SET - f_MIR`), ordered by that difference (largest first) and shown
# under descriptive column headers.
nb_show_table(
    C
    # .assign(f_percent_MIR=C.ratio_f_MIR * 100)
    .filter(regex=r'^f_.*[MS]').sort_index(axis=1, ascending=False)
    .assign(
        f_diff=C.f_SET-C.f_MIR).sort_values('f_diff', ascending=False)
    .rename(columns={'f_SET':'total negations', 
                     'f_MIR':'mirror subset negations', 
                     'f_diff': 'negations not in mirror subset'}), n_dec=0)


|                 |   `total negations` |   `mirror subset negations` |   `negations not in mirror subset` |
|:----------------|--------------------:|----------------------------:|-----------------------------------:|
| **that**        |             165,411 |                       4,338 |                            161,073 |
| **immediately** |              57,319 |                         407 |                             56,912 |
| **yet**         |              52,546 |                         320 |                             52,226 |
| **exactly**     |              43,635 |                         813 |                             42,822 |
| **necessarily** |              42,708 |                         971 |                             41,737 |
| **any**         |              15,492 |                       1,082 |                             14,410 |
| **remotely**    |               5,679 |                       1,846 |                              3,833 |
| **ever**        

#### Marginal (_Adverb Total_) Frequency Comparison

|                   |   `total adverb tokens` |   `mirror subset adverb tokens` |   `adverb tokens not in mirror subset` |
|:------------------|------------------------:|--------------------------------:|---------------------------------------:|
| **particularly**  |                 575,960 |                          14,954 |                                561,006 |
| **only**          |                 464,168 |                           5,169 |                                458,999 |
| **entirely**      |                 303,833 |                          11,803 |                                292,030 |
| **that**          |                 250,392 |                           7,472 |                                242,920 |
| **ever**          |                 124,592 |                           5,179 |                                119,413 |
| **immediately**   |                 103,177 |                           1,442 |                                101,735 |
| **yet**           |                 101,707 |                             909 |                                100,798 |
| **any**           |                  94,152 |                           1,514 |                                 92,638 |
| **terribly**      |                  70,174 |                           5,218 |                                 64,956 |
| **exactly**       |                  61,599 |                           1,114 |                                 60,485 |
| **necessarily**   |                  56,694 |                           1,681 |                                 55,013 |
| **inherently**    |                  55,088 |                           5,649 |                                 49,439 |
| **remotely**      |                  22,194 |                           2,717 |                                 19,477 |
| **altogether**    |                  20,636 |                           1,808 |                                 18,828 |
| **overtly**       |                  15,219 |                             898 |                                 14,321 |
| **intrinsically** |                  10,001 |                             991 |                                  9,010 |
| **before**        |                     748 |                             294 |                                    454 |

In [60]:
# Marginal adverb frequency columns (`f2_SET`, `f2_MIR`) side by side, plus their
# difference, ordered so the largest gap outside the mirror subset comes first.
nb_show_table(
    (
        C.filter(regex=r'^f2_.*[MS]')
        .sort_index(axis=1, ascending=False)
        .assign(f2_diff=lambda freq: freq.f2_SET - freq.f2_MIR)
        .sort_values('f2_diff', ascending=False)
        .rename(columns={
            'f2_SET': 'total adverb tokens',
            'f2_MIR': 'mirror subset adverb tokens',
            'f2_diff': 'adverb tokens not in mirror subset',
        })
    ),
    n_dec=0,
)


|                 |   `total adverb tokens` |   `mirror subset adverb tokens` |   `adverb tokens not in mirror subset` |
|:----------------|------------------------:|--------------------------------:|---------------------------------------:|
| **that**        |                 250,392 |                           7,472 |                                242,920 |
| **ever**        |                 124,592 |                           5,179 |                                119,413 |
| **immediately** |                 103,177 |                           1,442 |                                101,735 |
| **yet**         |                 101,707 |                             909 |                                100,798 |
| **any**         |                  94,152 |                           1,514 |                                 92,638 |
| **exactly**     |                  61,599 |                           1,114 |                                 60,485 |
| **necessarily** |            

In [61]:
# Keep an untouched copy of the complete table.
full_C = C.copy()
# Column order for the condensed view: association metrics whose names contain
# `LRC`, `P1`, `G2`, followed by the columns prefixed `f_`, `f1_`, `f2_`.
main_cols_ordered = [
    *(col for tag in ('LRC', 'P1', 'G2')
      for col in C.filter(like=tag).columns),
    *(col for stem in ('f', 'f1', 'f2')
      for col in C.filter(regex=f'^{stem}_').columns),
]
print_iter(
    [f'`{c}`' for c in main_cols_ordered],
    bullet='1.',
    header='Main Columns',
)
main_C = C[[c for c in main_cols_ordered if c in C.columns]]
main_C


Main Columns
1. `LRC_SET`
1. `LRC_MIR`
1. `mean_LRC`
1. `dP1_SET`
1. `dP1_MIR`
1. `mean_dP1`
1. `G2_SET`
1. `G2_MIR`
1. `mean_G2`
1. `f_SET`
1. `f_MIR`
1. `f1_SET`
1. `f1_MIR`
1. `f2_SET`
1. `f2_MIR`


Unnamed: 0_level_0,LRC_SET,LRC_MIR,mean_LRC,dP1_SET,dP1_MIR,mean_dP1,G2_SET,G2_MIR,mean_G2,f_SET,f_MIR,f1_SET,f1_MIR,f2_SET,f2_MIR
adv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
any,2.28,3.48,2.88,0.13,0.57,0.35,23683.0,2511.26,13097.13,15492,1082,3226213,293963,94152,1514
before,3.65,5.11,4.38,0.38,0.84,0.61,1062.13,1080.52,1071.32,311,290,3226213,293963,748,294
ever,0.28,5.57,2.92,0.01,0.77,0.39,353.58,15340.34,7846.96,5967,4718,3226213,293963,124592,5179
exactly,5.9,3.51,4.71,0.67,0.59,0.63,214404.2,1939.47,108171.83,43635,813,3226213,293963,61599,1114
immediately,4.96,0.79,2.88,0.52,0.14,0.33,239462.58,181.2,119821.89,57319,407,3226213,293963,103177,1442
necessarily,6.23,2.66,4.44,0.72,0.43,0.57,219003.46,1688.91,110346.18,42708,971,3226213,293963,56694,1681
remotely,3.03,3.35,3.19,0.22,0.54,0.38,13354.33,4009.84,8682.08,5679,1846,3226213,293963,22194,2717
that,5.62,2.86,4.24,0.63,0.44,0.53,781016.11,7632.21,394324.16,165411,4338,3226213,293963,250392,7472
yet,4.74,1.18,2.96,0.48,0.21,0.34,209055.78,242.23,104649.01,52546,320,3226213,293963,101707,909


In [62]:
# Name the index so rendered and saved tables carry an `adv` header.
# NOTE: this sets the name on the existing index object in place.
C.index.name = 'adv'
# Order adverbs by `mean_LRC` (highest first), then cast the count columns
# (names matching `total|^[fN]`, see `force_ints`) to integers.
C=force_ints(C.sort_values('mean_LRC', ascending=False))
# Raise the pandas column display limit to 16 from this point on.
pd.set_option('display.max_columns', 16)
# Print the full combined table as markdown.
nb_show_table(C)


|                 | `key_SET`          |   `f_SET` |   `dP1_SET` |   `LRC_SET` |   `G2_SET` |    `N_SET` |   `f1_SET` |   `f2_SET` |   `exp_f_SET` |   `unexp_f_SET` | `l1_SET`   | `key_MIR`          |   `f_MIR` |   `dP1_MIR` |   `LRC_MIR` |   `G2_MIR` |   `N_MIR` |   `f1_MIR` |   `f2_MIR` |   `exp_f_MIR` |   `unexp_f_MIR` | `l1_MIR`   |   `mean_f` |   `mean_dP1` |   `mean_LRC` |   `mean_G2` |      `mean_N` |    `mean_f1` |   `mean_f2` |   `mean_expF` |   `mean_unexpF` |   `r_f_MIR` |   `r_N_MIR` |   `r_f1_MIR` |   `r_f2_MIR` |
|:----------------|:-------------------|----------:|------------:|------------:|-----------:|-----------:|-----------:|-----------:|--------------:|----------------:|:-----------|:-------------------|----------:|------------:|------------:|-----------:|----------:|-----------:|-----------:|--------------:|----------------:|:-----------|-----------:|-------------:|-------------:|------------:|--------------:|-------------:|------------:|--------------:|----------

Save full adverb selection as `.csv`

In [63]:
# Write the combined adverb table to a date-stamped CSV under TOP_AM_DIR.
C.to_csv(
    TOP_AM_DIR.joinpath(
        f'Top{K}_NEG-ADV_combined.35f-7c_{timestamp_today()}.csv')
)

Save `all-columns`, `means`, and `MAIN` as markdown formatted tables

In [64]:
# Date-stamped markdown exports under TOP_AM_DIR, all with the same number formatting.
# 1) every column of the combined table
C.to_markdown(
    buf=TOP_AM_DIR.joinpath(
        f'Top{K}_NEG-ADV_combined_all-columns.35f-7c_{timestamp_today()}.md'),
    floatfmt=',.2f',
    intfmt=',',
)
# 2) only the `mean_*` columns
C.filter(like='mean_').to_markdown(
    buf=TOP_AM_DIR.joinpath(
        f'Top{K}_NEG-ADV_combined_means.35f-7c_{timestamp_today()}.md'),
    floatfmt=',.2f',
    intfmt=',',
)
# 3) only the columns listed in `main_cols_ordered`
C[main_cols_ordered].to_markdown(
    buf=TOP_AM_DIR.joinpath(
        f'Top{K}_NEG-ADV_combined_MAIN.35f-7c_{timestamp_today()}.md'),
    floatfmt=',.2f',
    intfmt=',',
)

## Collect bigrams corresponding to top adverbs

In [65]:
# Example of a table path this cell looks for:
# results/assoc_df/polar/RBdirect/bigram/polarized-bigram_35f-7c_min1000x.pkl.gz
bigram_floor = 200
mirror_floor = 50
# One bigram association table per sub-directory of POLAR_DIR, keyed by directory name.
# The `NEGmirror` directory uses `mirror_floor` in the filename pattern; every other
# directory uses `bigram_floor`. The first glob match is loaded (IndexError if none).
bigram_dfs = {
    d.name: update_index(
        pd.read_pickle(
            list(
                (d / 'bigram' / 'extra').glob(
                    f'*35f-7c*min{mirror_floor if d.name == "NEGmirror" else bigram_floor}x*.pkl.gz'
                )
            )[0]
        )
    )
    for d in POLAR_DIR.iterdir()
}

In [73]:
def show_adv_bigrams(sample_size, C, 
                     bigram_dfs, 
                     selector:str='dP1',
                     column_list: list = None) -> tuple:
    """Print and collect the top bigrams for every adverb in `C.index`.

    For each adverb and each table in `bigram_dfs`, rows whose index label
    contains `~{adv}_` and whose `LRC` is at least 1 are ranked by `selector`
    (ties broken by the other of `LRC`/`dP1`), and the top `bigram_k` rows are
    printed as a markdown table via `nb_show_table`.

    Reads `K` and `FOCUS` from the notebook's module scope.

    Args:
        sample_size: base sample size; `bigram_k = max(sample_size + 2, 10)`.
        C: dataframe whose index holds the adverbs to look up, in rank order.
        bigram_dfs: mapping of pattern name -> bigram association dataframe.
        selector: primary column used to rank bigrams.
        column_list: columns to display for each table; when omitted (or
            empty), all columns of the prepared table are shown.

    Returns:
        `(bigram_samples, bigram_k)` where `bigram_samples` maps each adverb to
        a dict with one entry per pattern (its top rows), `'both'` (all
        patterns concatenated) and `'adj'` (set of adjectives); it also holds
        the pooled sets under the keys `'bigrams'` and `'adj'`.
    """
    def _force_ints(_df):
        # Downcast count-like columns (names ending in `total` or starting with f/N).
        count_cols = _df.filter(regex=r'total$|^[fN]').columns
        _df[count_cols] = _df[count_cols].apply(
            pd.to_numeric, downcast='unsigned')
        return _df

    bigram_k = max(sample_size + 2, 10)
    print(f'## Top {bigram_k} "most negative" bigrams corresponding to top {K} adverbs\n')
    print(timestamp_today())
    patterns = list(bigram_dfs.keys())
    top_adverbs = C.index

    # Ordered tie-break keys: a set difference would give an arbitrary order
    # whenever `selector` is neither `LRC` nor `dP1`.
    sort_keys = [selector] + [m for m in ('LRC', 'dP1') if m != selector]

    # Column selection, renaming and the LRC threshold do not depend on the
    # adverb, so each pattern's table is prepared once instead of once per adverb.
    prepared_dfs = {}
    for pat, bdf in bigram_dfs.items():
        # avoid KeyError while maintaining intended order
        bdf = adjust_assoc_columns(
            bdf[[c for c in FOCUS+['adj', 'adj_total', 'adv', 'adv_total']
                 if c in bdf.columns]
                ])
        #> Force significant & positive association according to LRC
        #  (threshold is inclusive: LRC >= 1)
        prepared_dfs[pat] = bdf.loc[bdf.LRC >= 1, :]

    bigram_samples = dict.fromkeys(top_adverbs)
    bigrams = []
    adj = []
    for rank,adv in enumerate(top_adverbs, start=1):
        print(f'\n### {rank}. _{adv}_\n')
        adv_top = None
        bigram_samples[adv] = dict.fromkeys(patterns + ['both', 'adj'])
        adj_for_adv = []
        for pat, bdf in prepared_dfs.items():
            # NOTE(review): `like` matches `~{adv}_` after ANY environment prefix,
            # so rows from non-negative environments can be selected as well
            # (the saved `l1` counts include `COMPLEMENT`) -- confirm this is intended.
            adv_pat_bigrams = _force_ints(
                bdf.filter(like=f'~{adv}_', axis=0)
                .nlargest(bigram_k, sort_keys)
                )

            if adv_pat_bigrams.empty:
                print(f'No bigrams found in loaded `{pat}` AM table.')
            else:
                print(
                    f'\n#### Top {bigram_k} `{pat}` "{adv}_*" bigrams (sorted by `{selector}`; `LRC > 1`)\n')
                # Resolve the display columns without rebinding `column_list`:
                # `column_list or bdf.columns` turned it into a pandas Index, and
                # taking the truth value of an Index raises ValueError next time.
                if column_list is None or len(column_list) == 0:
                    shown_columns = bdf.columns
                else:
                    shown_columns = column_list
                nb_show_table(adv_pat_bigrams[shown_columns], n_dec=2)

            adj_for_adv.extend(adv_pat_bigrams.adj.drop_duplicates().to_list())

            bigram_samples[adv][pat] = adv_pat_bigrams

            adv_top = adv_pat_bigrams if adv_top is None else pd.concat(
                [adv_top, adv_pat_bigrams])

        bigram_samples[adv]['adj'] = set(adj_for_adv)
        # `adv_top` stays None only when `bigram_dfs` is empty.
        if adv_top is not None:
            bigrams.extend(adv_top.l2.drop_duplicates().to_list())
        adj.extend(adj_for_adv)
        bigram_samples[adv]['both'] = adv_top
    bigram_samples['bigrams'] = set(bigrams)
    bigram_samples['adj'] = set(adj)
    return bigram_samples, bigram_k


# Show `adj`, `adj_total`, and the bare metric/frequency names obtained by stripping
# the `mean_` / `_SET` / `_MIR` affixes from `main_cols_ordered` (duplicates removed).
# (not shown: 't', 'MI')
samples_dict, bigram_k = show_adv_bigrams(
    sample_size=K,
    C=C,
    bigram_dfs=bigram_dfs,
    column_list=['adj', 'adj_total'] + (
        pd.Series(main_cols_ordered)
        .str.replace(r'mean_|_SET|_MIR', '', regex=True)
        .drop_duplicates()
        .to_list()
    ),
)

## Top 10 "most negative" bigrams corresponding to top 5 adverbs

2024-05-21

### 1. _exactly_


#### Top 10 `RBdirect` "exactly_*" bigrams (sorted by `dP1`; `LRC > 1`)


|                               | `adj`      |   `adj_total` |   `LRC` |   `dP1` |      `G2` |   `f` |      `f1` |   `f2` |
|:------------------------------|:-----------|--------------:|--------:|--------:|----------:|------:|----------:|-------:|
| **NEGany~exactly_surprising** | surprising |       150,067 |    7.34 |    0.96 |  2,863.35 |   441 | 3,226,213 |    444 |
| **NEGany~exactly_cheap**      | cheap      |        83,765 |    8.28 |    0.95 |  4,443.27 |   693 | 3,226,213 |    704 |
| **NEGany~exactly_subtle**     | subtle     |        56,845 |    6.92 |    0.94 |  1,671.02 |   264 | 3,226,213 |    271 |
| **NEGany~exactly_fun**        | fun        |       224,457 |    6.67 |    0.94 |  1,423.92 |   225 | 3,226,213 |    231 |
| **NEGany~exactly_conducive**  | conducive  |        16,405 |    6.56 |    0.93 |  1

In [74]:
# List the adjectives sampled for each adverb, followed by the pooled
# `bigrams` and `adj` sets stored in `samples_dict`.
for key, info in samples_dict.items():
    if key in ('bigrams', 'adj'):
        key = f'ALL {key.replace("adj", "adjectives")}'
    # The collections are sets, whose iteration order is not stable between
    # kernel sessions; sort them so a re-run reproduces the same listing.
    formatted_iter = [
        f'_{a.replace("_", " ")}_' for a
        in sorted(info['adj'] if isinstance(info, dict)
                  else info)]
    print_iter(formatted_iter,
               header=f'1. _{key}_ ({len(formatted_iter)} unique)',
               bullet='1.', indent=3)


1. _exactly_ (10 unique)
   1. _surprising_
   1. _new_
   1. _conducive_
   1. _subtle_
   1. _clear_
   1. _hard_
   1. _sure_
   1. _easy_
   1. _cheap_
   1. _fun_

1. _necessarily_ (13 unique)
   1. _surprising_
   1. _interested_
   1. _aware_
   1. _illegal_
   1. _available_
   1. _obvious_
   1. _bad_
   1. _wrong_
   1. _true_
   1. _sure_
   1. _indicative_
   1. _easy_
   1. _representative_

1. _before_ (1 unique)
   1. _available_

1. _that_ (20 unique)
   1. _surprising_
   1. _interested_
   1. _fond_
   1. _good_
   1. _hard_
   1. _uncommon_
   1. _impressed_
   1. _simple_
   1. _popular_
   1. _serious_
   1. _noticeable_
   1. _exciting_
   1. _dissimilar_
   1. _complicated_
   1. _difficult_
   1. _big_
   1. _unusual_
   1. _close_
   1. _common_
   1. _easy_

1. _remotely_ (5 unique)
   1. _interested_
   1. _interesting_
   1. _close_
   1. _true_
   1. _similar_

1. _yet_ (10 unique)
   1. _eligible_
   1. _public_
   1. _clear_
   1. _complete_
   1. _right

In [75]:
# Stack every adverb's combined (`both`) sample into one frame, highest `LRC` first.
# Non-dict entries of `samples_dict` (the pooled sets) are skipped.
NEG_bigrams_sample = pd.concat(
    [entry['both']
     for entry in samples_dict.values()
     if isinstance(entry, dict)]
).sort_values(by='LRC', ascending=False)

In [76]:
# Save the pooled bigram sample as a date-stamped CSV, then print it as markdown
# and write that markdown next to the CSV (same name, `.md` suffix).
top_NEGbigram_df_path = (
    TOP_AM_DIR
    / f'Top{K}_NEG-ADV_top-{bigram_k}-bigrams.{timestamp_today()}.csv'
)
print(top_NEGbigram_df_path)
NEG_bigrams_sample.to_csv(top_NEGbigram_df_path)
nb_show_table(
    NEG_bigrams_sample.sort_values(by='LRC', ascending=False),
    outpath=top_NEGbigram_df_path.with_suffix('.md'),
)

/share/compling/projects/sanpi/results/top_AM/Top5_NEG-ADV_top-10-bigrams.2024-05-21.csv

|                                       |    `f` |   `dP1` |   `LRC` |       `G2` |        `N` |       `f1` |   `f2` |   `exp_f` |   `unexp_f` | `l1`       | `l2`                       | `adj`          |   `adj_total` | `adv`       |   `adv_total` |
|:--------------------------------------|-------:|--------:|--------:|-----------:|-----------:|-----------:|-------:|----------:|------------:|:-----------|:---------------------------|:---------------|--------------:|:------------|--------------:|
| **NEGany~yet_clear**                  | 10,553 |    0.95 |   10.26 |  67,924.56 | 86,330,752 |  3,226,213 | 10,693 |    399.60 |   10,153.40 | NEGATED    | yet_clear                  | clear          |    491,108.00 | yet         |    101,707.00 |
| **NEGany~yet_ready**                  |  7,611 |    0.93 |    9.23 |  48,012.06 | 86,330,752 |  3,226,213 |  7,838 |    292.91 |    7,318.09 | NEGATED    | ye

In [77]:
# How many sampled bigrams come from each `l1` environment label.
NEG_bigrams_sample['l1'].value_counts()

l1
NEGATED       67
NEGMIR        36
COMPLEMENT     4
Name: count, dtype: Int64