In [137]:
import re
from pathlib import Path

import pandas as pd

from source.utils import POST_PROC_DIR, print_iter, print_md_table
from source.utils.associate import AM_DF_DIR, TOP_AM_DIR, adjust_assoc_columns
from source.utils.general import PKL_SUFF, confirm_dir, timestamp_today
from source.utils.sample import sample_pickle as sp

# --- Notebook configuration -------------------------------------------------
# When False, a previously saved pre-filtered hits table is reused if it exists.
REFILTER_NEG = False
# Number of example rows requested per bigram when samples are saved.
N_EX_PER_BIGRAM = 99
# Default for the `verbose` / `suppress_printing` arguments of the helpers below.
VERBOSE = True
# `K` is interpolated into the "Top{K}_NEG-ADV..." file names read and written below.
K = 20
# Bigrams per adverb: K + 2, but never fewer than 10.
BK = max(K+2, 10)
# Date stamp interpolated into the names of the association tables to load.
DATE = timestamp_today()
# Column selection passed through the project helper `adjust_assoc_columns`
# (NOTE(review): presumably maps long metric names to short labels -- confirm).
FOCUS = adjust_assoc_columns(['f', 'E11', 'unexpected_f',
                              'am_p1_given2', 'conservative_log_ratio',
                              'am_log_likelihood',
                              #   't_score', 'mutual_information', 'am_odds_ratio_disc',
                              'N', 'f1', 'f2', 'l1', 'l2'])
# Display floats with thousands separators and two decimals; cap text columns at 70 chars.
pd.set_option("display.float_format", '{:,.2f}'.format)
pd.set_option("display.max_colwidth", 70)

# Primary location of the full hits table.
NEG_HITS_PATH = POST_PROC_DIR.joinpath(
    'RBdirect/trigger-bigrams_frq-thrMIN-7.35f.pkl.gz')
# Cache path for the adverb-filtered subset. It is derived from the primary file
# name *before* the fallback check below, so it keeps that name even if
# NEG_HITS_PATH is swapped for the fallback file.
PRE_FILTERED_NEG_HITS = NEG_HITS_PATH.with_name(NEG_HITS_PATH.name.replace('trigger', f'onlyTop{K}_NEG-ADV'))
# Fall back to an alternative file name in the same directory if the primary is missing.
if not NEG_HITS_PATH.is_file():
    NEG_HITS_PATH = NEG_HITS_PATH.with_name(
        'trigger-bigrams_thr0-001p.35f.pkl.gz')
    

In [138]:
def nb_show_table(df, n_dec:int=2, 
                   adjust_columns:bool=True, 
                   outpath:Path | str | None=None,
                   suppress_printing:bool=not VERBOSE) -> None: 
    """Render `df` as a markdown table; optionally save and/or print it.

    The input frame is copied, so the caller's object is left untouched.
    Column labels are wrapped in backticks and index labels in `**...**`.

    Args:
        df: frame to render.
        n_dec: number of decimals shown for floats.
        adjust_columns: if True, pass the copy through the project helper
            `adjust_assoc_columns` before formatting.
        outpath: optional destination for the markdown text (`Path` or `str`).
        suppress_printing: if True, the table is not printed.
    """
    table_df = df.copy()
    if adjust_columns: 
        table_df = adjust_assoc_columns(table_df)

    table_df.columns = [f'`{c}`' for c in table_df.columns]
    table_df.index = [f'**{r}**' for r in table_df.index]
    table = table_df.convert_dtypes().to_markdown(floatfmt=f',.{n_dec}f', intfmt=',')
    if outpath:
        # Explicit encoding: the table may contain non-ASCII text, and the
        # platform default encoding is not guaranteed to be UTF-8.
        Path(outpath).write_text(table, encoding='utf8')
    if not suppress_printing:
        print(f'\n{table}\n')
    
def force_ints(_df):
    """Cast count-like columns of `_df` to integer dtype, in place.

    Count-like columns are those whose name contains "total" or starts with
    "f" or "N". The frame is modified in place and also returned.

    Args:
        _df: frame whose count columns should become integers.

    Returns:
        The same frame object that was passed in.
    """
    count_cols = _df.filter(regex=r'total|^[fN]').columns
    if len(count_cols) > 0:
        # Assign whole columns rather than using `.loc[:, cols] = ...`:
        # `.loc` assignment writes into the existing (float) arrays in recent
        # pandas, so the integer dtype would silently be lost.
        _df[count_cols] = _df[count_cols].astype('int')
    return _df


In [139]:
def embolden(series,
            bold_regex=None):
    """Mark regex matches in each string of `series` as bold markdown code.

    Each match of `bold_regex` (case-insensitive) is replaced by its first
    capture group wrapped as ``__`...`__`` with a single space on each side.
    When no pattern is given, a space-delimited "not" / "n't" is targeted.
    """
    pattern = r" (n[o']t) " if not bold_regex else bold_regex

    def _mark(text):
        return re.sub(pattern, r' __`\1`__ ', text, flags=re.I)

    return series.apply(_mark)

In [140]:
# Load the combined adverb table stamped with today's `DATE`. If it does not
# exist, fall back to the most recent dated version on disk and update `DATE`
# to match, because later cells reuse `DATE` in other file names.
# (Editing the last character of the stamp produced invalid dates such as
# "...-1" and never terminated when no file was present.)
_combined_prefix = f'Top{K}_NEG-ADV_combined.35f-7c_'
_combined_path = TOP_AM_DIR / f'{_combined_prefix}{DATE}.csv'
if not _combined_path.is_file():
    # NOTE(review): assumes the stamp sorts chronologically as text (true for
    # ISO "YYYY-MM-DD") and that TOP_AM_DIR is a pathlib.Path -- confirm.
    _dated_versions = sorted(TOP_AM_DIR.glob(f'{_combined_prefix}*.csv'))
    if not _dated_versions:
        raise FileNotFoundError(
            f'No file matching "{_combined_prefix}*.csv" found in {TOP_AM_DIR}')
    _combined_path = _dated_versions[-1]
    DATE = _combined_path.name[len(_combined_prefix):-len('.csv')]
adv_am = pd.read_csv(_combined_path).set_index('adv')
adv_am


Unnamed: 0_level_0,key_SET,f_SET,dP1_SET,LRC_SET,G2_SET,N_SET,f1_SET,f2_SET,exp_f_SET,unexp_f_SET,...,mean_G2,mean_N,mean_f1,mean_f2,mean_expF,mean_unexpF,ratio_f_MIR,ratio_N_MIR,ratio_f1_MIR,ratio_f2_MIR
adv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
exactly,NEGany~exactly,43635,0.67,5.9,214404.2,86330752,3226213,61599,2301.98,41333.02,...,108171.83,44181417.0,1760088.0,31356.5,1231.57,20992.43,0.02,0.02,0.09,0.02
before,NEG~before,311,0.38,3.65,1062.13,86330752,3226213,748,27.95,283.05,...,1071.32,44181417.0,1760088.0,521.0,35.24,265.26,0.93,0.02,0.09,0.39
necessarily,NEGany~necessarily,42708,0.72,6.23,219003.46,86330752,3226213,56694,2118.68,40589.32,...,110346.18,44181417.0,1760088.0,29187.5,1180.93,20658.57,0.02,0.02,0.09,0.03
that,NEGany~that,165411,0.63,5.62,781016.11,86330752,3226213,250392,9357.24,156053.76,...,394324.16,44181417.0,1760088.0,128932.0,5219.08,79655.42,0.03,0.02,0.09,0.03
ever,NEGany~ever,5967,0.01,0.28,353.58,86330752,3226213,124592,4656.05,1310.95,...,7846.96,44181417.0,1760088.0,64885.5,2702.62,2639.88,0.79,0.02,0.09,0.04
remotely,NEGany~remotely,5679,0.22,3.03,13354.33,86330752,3226213,22194,829.4,4849.6,...,8682.08,44181417.0,1760088.0,12455.5,611.22,3151.28,0.33,0.02,0.09,0.12
any,NEGany~any,15492,0.13,2.28,23683.0,86330752,3226213,94152,3518.5,11973.5,...,13097.13,44181417.0,1760088.0,47833.0,1868.76,6418.24,0.07,0.02,0.09,0.02
yet,NEGany~yet,52546,0.48,4.74,209055.78,86330752,3226213,101707,3800.83,48745.17,...,104649.01,44181417.0,1760088.0,51308.0,1966.16,24466.84,0.01,0.02,0.09,0.01
immediately,NEGany~immediately,57319,0.52,4.96,239462.58,86330752,3226213,103177,3855.76,53463.24,...,119821.89,44181417.0,1760088.0,52309.5,2032.18,26830.82,0.01,0.02,0.09,0.01


In [141]:
def compare_datasets(adv_am, met = 'dP1', k=5):
    """Print, for every column whose name contains `met`, the top `k` rows.

    Only the columns matching `met` are shown in each printed markdown table.
    Floats use three decimals when `met` is 'dP1' and two otherwise.
    """
    float_format = ',.3f' if met == 'dP1' else ',.2f'
    metric_columns = adv_am.filter(like=met)
    for name in metric_columns.columns:
        print(f'Top {k} by `{name}`')
        ranked = metric_columns.nlargest(k, name)
        print(ranked.to_markdown(floatfmt=float_format), '\n')

# Rank the adverbs on every `dP1` column, using the function defaults (met='dP1', k=5).
compare_datasets(adv_am)

Top 5 by `dP1_SET`
| adv         |   dP1_SET |   dP1_MIR |   mean_dP1 |
|:------------|----------:|----------:|-----------:|
| necessarily |     0.716 |     0.433 |      0.575 |
| exactly     |     0.671 |     0.585 |      0.628 |
| that        |     0.625 |     0.438 |      0.531 |
| immediately |     0.519 |     0.138 |      0.328 |
| yet         |     0.480 |     0.207 |      0.344 | 

Top 5 by `dP1_MIR`
| adv      |   dP1_SET |   dP1_MIR |   mean_dP1 |
|:---------|----------:|----------:|-----------:|
| before   |     0.378 |     0.842 |      0.610 |
| ever     |     0.011 |     0.768 |      0.389 |
| exactly  |     0.671 |     0.585 |      0.628 |
| any      |     0.127 |     0.570 |      0.349 |
| remotely |     0.219 |     0.535 |      0.377 | 

Top 5 by `mean_dP1`
| adv         |   dP1_SET |   dP1_MIR |   mean_dP1 |
|:------------|----------:|----------:|-----------:|
| exactly     |     0.671 |     0.585 |      0.628 |
| before      |     0.378 |     0.842 |      0.610 |
| nec

In [142]:
# Same comparison on every `LRC` column (two-decimal formatting, k=5).
compare_datasets(adv_am, 'LRC')

Top 5 by `LRC_SET`
| adv         |   LRC_SET |   LRC_MIR |   mean_LRC |
|:------------|----------:|----------:|-----------:|
| necessarily |      6.23 |      2.66 |       4.44 |
| exactly     |      5.90 |      3.51 |       4.71 |
| that        |      5.62 |      2.86 |       4.24 |
| immediately |      4.96 |      0.79 |       2.88 |
| yet         |      4.74 |      1.18 |       2.96 | 

Top 5 by `LRC_MIR`
| adv      |   LRC_SET |   LRC_MIR |   mean_LRC |
|:---------|----------:|----------:|-----------:|
| ever     |      0.28 |      5.57 |       2.92 |
| before   |      3.65 |      5.11 |       4.38 |
| exactly  |      5.90 |      3.51 |       4.71 |
| any      |      2.28 |      3.48 |       2.88 |
| remotely |      3.03 |      3.35 |       3.19 | 

Top 5 by `mean_LRC`
| adv         |   LRC_SET |   LRC_MIR |   mean_LRC |
|:------------|----------:|----------:|-----------:|
| exactly     |      5.90 |      3.51 |       4.71 |
| necessarily |      6.23 |      2.66 |       4.44 |
| bef

In [143]:
def pin_top_adv(adv_am, 
                column = 'mean_dP1', 
                verbose:bool = VERBOSE):
    """Order the adverb table by `column` (descending) and list its index.

    Args:
        adv_am: adverb table; its index supplies the returned labels.
        column: column to sort on.
        verbose: if True, print the ranking as a markdown table.

    Returns:
        (index labels in ranked order as a list, the sorted frame)
    """
    ranked = adv_am.sort_values(column, ascending=False)
    if verbose:
        heading = f'Top Adverb Selection, ranked by descending `{column}`'
        ranking_md = ranked[[column]].reset_index().to_markdown(floatfmt=',.3f')
        print(heading, ranking_md, sep='\n\n', end='\n\n')
    return ranked.index.to_list(), ranked

# Rebinds `adv_am` to its sorted version (default column: `mean_dP1`) and
# stores the ranked index labels in `TOP_ADV` for the filters in later cells.
TOP_ADV, adv_am = pin_top_adv(adv_am)

Top Adverb Selection, ranked by descending `mean_dP1`

|    | adv         |   mean_dP1 |
|---:|:------------|-----------:|
|  0 | exactly     |      0.628 |
|  1 | before      |      0.610 |
|  2 | necessarily |      0.575 |
|  3 | that        |      0.531 |
|  4 | ever        |      0.389 |
|  5 | remotely    |      0.377 |
|  6 | any         |      0.349 |
|  7 | yet         |      0.344 |
|  8 | immediately |      0.328 |



In [144]:
# Load the per-adverb bigram table stamped with the same `DATE` as the adverb
# table, indexed by `key`. The row filter keeps keys containing "~<adverb>_"
# for any adverb in TOP_ADV.
bigram_am = (pd.read_csv(TOP_AM_DIR / f'Top{K}_NEG-ADV_top-{BK}-bigrams.{DATE}.csv')
             .set_index('key')
             #> not strictly necessary (loaded table should already satisfy this) but just in case...
             .filter(regex=r'~'+r'_|~'.join(TOP_ADV)+'_', axis=0))

In [145]:
# Number of bigrams in the overall ranking: half of BK, times K.
overall_k = int(BK/2 * K)
# Top `overall_k` rows ordered by dP1, then LRC, then G2 (values rounded to two
# decimals before ranking); the markdown table is also written to TOP_AM_DIR.
nb_show_table(bigram_am.round(2).nlargest(overall_k, ['dP1','LRC','G2']), 
            outpath=TOP_AM_DIR / f'Top{K}_NEG-ADV_top{overall_k}bigrams-overall.md', suppress_printing=not VERBOSE)


|                                   |    `f` |   `dP1` |   `LRC` |      `G2` |        `N` |      `f1` |   `f2` |   `exp_f` |   `unexp_f` | `l1`    | `l2`                   | `adj`       |   `adj_total` | `adv`       |   `adv_total` |
|:----------------------------------|-------:|--------:|--------:|----------:|-----------:|----------:|-------:|----------:|------------:|:--------|:-----------------------|:------------|--------------:|:------------|--------------:|
| **NEGany~exactly_surprising**     |    441 |    0.96 |    7.34 |  2,863.35 | 86,330,752 | 3,226,213 |    444 |     16.59 |      424.41 | NEGATED | exactly_surprising     | surprising  |       150,067 | exactly     |        61,599 |
| **NEGany~exactly_shy**            |    124 |    0.96 |    1.53 |    815.15 | 86,330,752 | 3,226,213 |    124 |      4.63 |      119.37 | NEGATED | exactly_shy            | shy         |        50,956 | exactly     |        61,599 |
| **NEGany~exactly_forthcoming**    |    107 |    0.96 |    1.3

In [146]:
def clarify_categories(neg_hits, verbose=VERBOSE):
    """Normalize `neg_lemma` and (re)build categorical word columns.

    Works on `neg_hits` in place and returns the same frame:
      1. "aint" / "ain't" in `neg_lemma` are replaced by "not".
      2. Every column whose name contains "head", "lower" or "lemma" is
         converted to string and then to category, which also discards any
         categories that are no longer present.

    If `verbose`, the updated `neg_lemma` counts are printed after step 1.
    """
    # -- 1. fold the "ain(')t" lemma variants into "not"
    lemmas = neg_hits.neg_lemma.astype('string')
    lemmas = lemmas.str.replace('aint', "not")
    neg_hits['neg_lemma'] = lemmas.str.replace("ain't", 'not')
    if verbose:
        lemma_counts_md = neg_hits.neg_lemma.value_counts().to_markdown(floatfmt=',.0f', intfmt=',')
        print('Updated `neg_lemma` counts with "ain(\')t" replaced by "not"',
              lemma_counts_md,
              sep='\n\n')

    # -- 2. string -> category round trip on all word-level columns
    word_cols = neg_hits.filter(regex=r'head|lower|lemma').columns
    as_text = neg_hits[word_cols].astype('string')
    neg_hits[word_cols] = as_text.astype('category')
    return neg_hits

In [147]:
# Reuse the cached, adverb-filtered hits table when it exists and no refilter
# was requested; otherwise rebuild it from the full hits table.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
if PRE_FILTERED_NEG_HITS.is_file() and not REFILTER_NEG: 
    neg_hits = pd.read_pickle(PRE_FILTERED_NEG_HITS)
else:
    # (earlier versions of the selection below, kept for reference)
    # # neg_hits = pd.read_pickle(NEG_HITS_PATH).filter(regex=r'^[nab].*lower|text|str|head')
    # #> Added `neg_lemma` column to selection
    # neg_hits = pd.read_pickle(NEG_HITS_PATH).filter(regex=r'^[nab].*lower|text|str|head|neg_lemma')
    # neg_hits = neg_hits.drop_duplicates(['text_window', 'bigram_lower', 'neg_form_lower'])
    # word_cols = neg_hits.filter(regex=r'head|lower|neg_lemma').columns
    # neg_hits[word_cols] = neg_hits[word_cols].astype('category')
    # neg_hits = neg_hits.loc[neg_hits.adv_form_lower.isin(adv_am.index), :]
    # Keep columns whose name starts with n/a/b and contains "lower", or
    # contains "text", "str", "head", "adv_lemma" or "neg_lemma".
    neg_hits = pd.read_pickle(NEG_HITS_PATH).filter(
        regex=r'^[nab].*lower|text|str|head|(adv|neg)_lemma')
    # Rows sharing text window, bigram and negation form count as duplicates.
    neg_hits = neg_hits.drop_duplicates(['text_window', 'bigram_lower', 'neg_form_lower'])

    # Keep hits whose adverb lemma OR lowercased adverb form is in TOP_ADV.
    neg_hits = neg_hits.loc[(neg_hits.adv_lemma.isin(TOP_ADV))
                                        | (neg_hits.adv_form_lower.isin(TOP_ADV)), :]
    if VERBOSE:
        # Lemma counts before `clarify_categories` rewrites the "ain(')t" variants.
        print(neg_hits.neg_lemma.value_counts().to_markdown(floatfmt=',.0f', intfmt=','))
    neg_hits = clarify_categories(neg_hits)


In [148]:

# sourcery skip: use-fstring-for-concatenation
# Add a combined "<neg form>_<bigram>" key unless the loaded table already has it.
if 'all_forms_lower' not in neg_hits.columns: 
    neg_hits['all_forms_lower'] = (
        neg_hits.neg_form_lower.astype('string') 
        + '_' 
        + neg_hits.bigram_lower.astype('string')
        ).astype('category')
# Spot check: three random rows (unseeded, so they differ between runs).
nb_show_table(neg_hits.sample(3).filter(like='lower'), adjust_columns=False, suppress_printing=not VERBOSE)


|                                             | `neg_form_lower`   | `adv_form_lower`   | `adj_form_lower`   | `bigram_lower`   | `all_forms_lower`   |
|:--------------------------------------------|:-------------------|:-------------------|:-------------------|:-----------------|:--------------------|
| **pcc_eng_22_107.04960_x1721218_094:5-7-8** | n't                | any                | clearer            | any_clearer      | n't_any_clearer     |
| **pcc_eng_26_035.3548_x0555314_63:1-6-7**   | none               | any                | larger             | any_larger       | none_any_larger     |
| **nyt_eng_19941029_0058_44:2-7-8**          | none               | that               | brilliant          | that_brilliant   | none_that_brilliant |



In [149]:
# Write the filtered table to the cache path when it is missing or a refilter
# was requested; either way, report the path relative to POST_PROC_DIR's parent.
if not PRE_FILTERED_NEG_HITS.is_file() or REFILTER_NEG:
    neg_hits.to_pickle(PRE_FILTERED_NEG_HITS)
    print(f'Saved Limited "NEG" hit table as: `{PRE_FILTERED_NEG_HITS.relative_to(POST_PROC_DIR.parent)}`')
else:
    print(f'Limited "NEG" hit table already saved as: `{PRE_FILTERED_NEG_HITS.relative_to(POST_PROC_DIR.parent)}`')

Limited "NEG" hit table already saved as: `4_post-processed/RBdirect/onlyTop5_NEG-ADV-bigrams_frq-thrMIN-7.35f.pkl.gz`


In [150]:
# Show hits whose adverb lemma differs from its lowercased surface form
# (compared as strings), limited to columns with "adv" or "window" in the name.
neg_hits.loc[neg_hits.adv_lemma.astype('string') 
             != neg_hits.adv_form_lower.astype('string')
             ].filter(regex=r'adv|window')

Unnamed: 0_level_0,text_window,adv_lemma,adv_form_lower
hit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
pcc_eng_18_059.2123_x0942427_06:6-7-8,"i gez , it aint dat easy .",that,dat
pcc_eng_00_078.9695_x1260460_177:4-6-7,""" It ai n't really dat hard , to get fucked",that,dat
pcc_eng_24_046.7793_x0739923_37:2-3-4,Its not dat easy to let go specially,that,dat
pcc_eng_27_003.1965_x0035346_67:30-32-33,still dat Garuda shud n't be dat ashamed coz he still has,that,dat


In [151]:
# Number of retained hits per adverb lemma, as a markdown table.
if VERBOSE:
    print(neg_hits.adv_lemma.value_counts().to_markdown(floatfmt=',.0f', intfmt=','))

| adv_lemma   |   count |
|:------------|--------:|
| that        | 156,725 |
| yet         |  50,567 |
| immediately |  48,725 |
| exactly     |  42,058 |
| necessarily |  40,346 |
| any         |  14,912 |
| ever        |   5,774 |
| remotely    |   5,569 |
| before      |     306 |


In [152]:
# Inspect hits whose negation form is "fewer", with that word emphasized in
# `token_str`.
# NOTE(review): `sp` is the project's `sample_pickle`; the 'WITH::' column
# selector and the filter-string syntax are defined there -- confirm semantics.
if VERBOSE:
    fewer = sp(data=neg_hits, regex=True, print_sample=False,
           columns=['WITH::bigram|neg|str'], 
           filters=['neg_form_lower==fewer'])
    nb_show_table(fewer.assign(token_str=embolden(fewer.token_str, r' (fewer) ')), adjust_columns=False)


- *filtering rows...*
  - regex parsing = True


  - ✓ Applied filter: `neg_form_lower==fewer`

### All (2) row(s) matching filter(s) from `input frame`


|                                              | `token_str`                                                                                                      | `neg_lemma`   | `neg_form_lower`   | `bigram_lower`   |
|:---------------------------------------------|:-----------------------------------------------------------------------------------------------------------------|:--------------|:-------------------|:-----------------|
| **pcc_eng_04_001.4775_x0007747_3:4-6-7**     | In 2D far __`fewer`__ are exactly solvable , the simplest being a rectangle with Dirichlet boundary conditions . | few           | fewer              | exactly_solvable |
| **pcc_eng_19_031.1707_x0487155_18:16-18-19** | Few of us , and particularly the media , are that good , and even __`fewer`__ are that brave .                   | few           | fewer              | that_brave       |



In [153]:
# Show every hit belonging to the 8 least frequent negation forms, sorted by
# form, limited to columns with "bigram", "neg" or "text" in the name.
if VERBOSE:
    rare_forms = neg_hits.neg_form_lower.value_counts().nsmallest(8).index
    nb_show_table(neg_hits.loc[neg_hits.neg_form_lower.isin(rare_forms), :].sort_values('neg_form_lower').filter(regex=r'bigram|neg|text'))


|                                               | `text_window`                                                 | `neg_lemma`   | `neg_form_lower`   | `bigrlower`       |
|:----------------------------------------------|:--------------------------------------------------------------|:--------------|:-------------------|:------------------|
| **pcc_eng_26_040.5604_x0639652_30:29-31-32**  | \ " It can \'t be that bad , \ " before                       | not           | \'t                | that_bad          |
| **pcc_eng_04_001.4775_x0007747_3:4-6-7**      | In 2D far fewer are exactly solvable , the simplest being     | few           | fewer              | exactly_solvable  |
| **pcc_eng_19_031.1707_x0487155_18:16-18-19**  | good , and even fewer are that brave .                        | few           | fewer              | that_brave        |
| **pcc_eng_04_038.2619_x0602310_31:4-5-6**     | All- natural is n''t necessarily green .                      | not           | n''t          

In [154]:
# Random sample of 10 hits (unseeded) where the negation form is not "n't"
# and differs from the negation lemma.
if VERBOSE:
    nb_show_table(neg_hits.loc[(neg_hits.neg_form_lower!="n't") 
                   & (neg_hits.neg_lemma.astype('string') != neg_hits.neg_form_lower.astype('string')), 
                   ['neg_lemma', 'neg_form_lower', 'text_window']].sample(10))


|                                               | `neg_lemma`   | `neg_form_lower`   | `text_window`                                         |
|:----------------------------------------------|:--------------|:-------------------|:------------------------------------------------------|
| **pcc_eng_02_020.3671_x0313547_17:4-5-6**     | not           | ain't              | " So it ain't that bad for me . "                     |
| **pcc_eng_19_050.0752_x0792228_047:12-13-14** | not           | ain't              | Barlow , but I ain't that dumb .                      |
| **pcc_eng_01_054.0376_x0857380_39:17-18-19**  | not           | ain't              | days , but it ain't exactly cheap .                   |
| **pcc_eng_27_049.9943_x0791790_107:5-6-7**    | not           | ain't              | And what it is ain't exactly clear .                  |
| **pcc_eng_01_064.3697_x1024956_25:4-5-6**     | not           | ain't              | What it is ain't exactly clear                        

In [155]:
if VERBOSE:
    nb_show_table(neg_hits.loc[(neg_hits.neg_lemma!="not") 
                   & (neg_hits.neg_lemma.astype('string') != neg_hits.neg_form_lower.astype('string')), 
                   ['neg_lemma', 'neg_form_lower', 'text_window']])


|                                               | `neg_lemma`   | `neg_form_lower`   | `text_window`                                               |
|:----------------------------------------------|:--------------|:-------------------|:------------------------------------------------------------|
| **pcc_eng_04_001.4775_x0007747_3:4-6-7**      | few           | fewer              | In 2D far fewer are exactly solvable , the simplest being   |
| **pcc_eng_19_031.1707_x0487155_18:16-18-19**  | few           | fewer              | good , and even fewer are that brave .                      |
| **pcc_eng_20_080.0820_x1277728_116:20-23-24** | nobody        | nobodies           | away from the story nobodies dick is that big and if it was |
| **pcc_eng_15_058.2861_x0925810_13:3-4-5**     | nothing       | nothings           | Jeez , nothings ever simple , is it ?                       |



In [156]:
# Print the hit(s) with negation form "nobodies", transposed so that each
# field appears on its own row.
if VERBOSE:
    print('Weird example, but illustrates structural relationship which is probably not caught by the patterns for accurate parses: Possessive quantified pronoun in subject', 
      neg_hits.loc[neg_hits.neg_form_lower=='nobodies', ['neg_lemma', 'neg_form_lower', 'bigram_lower', 'text_window', 'token_str']].T.to_markdown(), 
      sep='\n\n')

Weird example, but illustrates structural relationship which is probably not caught by the patterns for accurate parses: Possessive quantified pronoun in subject

|                | pcc_eng_20_080.0820_x1277728_116:20-23-24                                                                                                                                                                                                                                                                                                                                                       |
|:---------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| neg_lemma      | nobody  

In [157]:
# Random sample of 10 hits (unseeded) whose adverb form is "exactly",
# showing only the "*lower*" columns.
if VERBOSE:
    nb_show_table(neg_hits.filter(like='lower').loc[neg_hits.adv_form_lower=='exactly',:].sample(10))


|                                               | `neg_form_lower`   | `adv_form_lower`   | `adj_form_lower`   | `bigrlower`         | `all_forms_lower`       |
|:----------------------------------------------|:-------------------|:-------------------|:-------------------|:--------------------|:------------------------|
| **pcc_eng_15_061.4342_x0976233_13:46-47-48**  | not                | exactly            | urbane             | exactly_urbane      | not_exactly_urbane      |
| **nyt_eng_20040830_0198_21:3-4-5**            | n't                | exactly            | difficult          | exactly_difficult   | n't_exactly_difficult   |
| **pcc_eng_08_045.6424_x0722557_11:12-13-14**  | not                | exactly            | new                | exactly_new         | not_exactly_new         |
| **pcc_eng_04_108.04322_x1736287_10:21-22-23** | not                | exactly            | favorable          | exactly_favorable   | not_exactly_favorable   |
| **pcc_eng_09_051.1336_x0811080_

In [158]:
def collect_adv_bigram_ex(amdf: pd.DataFrame,
                     hits_df: pd.DataFrame,
                     adv: str = 'exactly',
                     n_bigrams: int = BK,
                     n_examples: int = 50,
                     verbose:bool=False,
                     metric: str | list = ['dP1', 'LRC']) -> dict:
    """Collect example hits for each bigram listed in `amdf`.

    Args:
        amdf: association table with `adv` and `l2` columns. If it covers more
            than one adverb it is first narrowed to rows whose index contains
            "~{adv}_" and then to the `n_bigrams` largest by `metric`.
        hits_df: hits table handed to the project sampler `sp`.
        adv: adverb used for the narrowing step above.
        n_bigrams: number of bigrams kept by the narrowing step.
        n_examples: sample size requested from `sp` for each bigram.
        verbose: if True, print a numbered heading and a short excerpt table
            (up to 5 random rows) for each bigram.
        metric: column(s) passed to `nlargest` in the narrowing step.

    Returns:
        dict mapping each unique `l2` value to the frame returned by `sp`.
    """
    if amdf.adv.nunique() > 1: 
        amdf = amdf.filter(like=f'~{adv}_',
                            axis=0).nlargest(n_bigrams, metric)
    examples = {}
    for i, bigram in enumerate(amdf['l2'].unique(), start=1):
        bigram_text = bigram.replace("_", " ")
        if verbose: 
            print(f'\n{i}. _{bigram_text}_')
        ex_for_bigram = sp(
            data=hits_df, print_sample=False, quiet=True,
            sample_size=n_examples,  sort_by='all_forms_lower',
            filters=[f'bigram_lower=={bigram}'],
            columns=['END::lower', 'text_window', 'token_str'])
        # Escape the bigram before embedding it in a pattern: corpus tokens may
        # contain regex metacharacters (".", "+", "(", ...), which would
        # otherwise change or break the match.
        bigram_regex = f' ({re.escape(bigram_text)}) '
        excerpt = embolden(ex_for_bigram.sample(min(len(ex_for_bigram), 5))[
                        'token_str'], bigram_regex).to_frame()
        excerpt.index = '`'+excerpt.index.astype('string')+'`'
        nb_show_table(excerpt, suppress_printing=not verbose)
        examples[bigram] = ex_for_bigram
    return examples


def populate_adv_dir(adverb, bigram_am, neg_hits, n_ex:int=50,
                     rank_by: str | list = ['dP1', "LRC"], 
                     verbose:bool=False):
    """Write the score tables and example samples for one adverb.

    Files are written under `TOP_AM_DIR/neg_bigram_examples/<adverb>/`:
      * the adverb's rows of `bigram_am`, sorted by `rank_by`, as csv;
      * a markdown table of marginal frequencies (`N`, `f1`, `adv_total`);
      * a markdown table of the score columns;
      * one csv of sampled hits per bigram (`n_ex` requested for each).

    Args:
        adverb: adverb whose "~<adverb>_" rows are selected from `bigram_am`.
        bigram_am: bigram association table indexed by key.
        neg_hits: hits table the examples are drawn from.
        n_ex: number of examples requested per bigram.
        rank_by: column(s) used for descending sorts.
        verbose: if True, tables and the saved paths are printed as well.
    """
    adv_dir = TOP_AM_DIR / 'neg_bigram_examples' / adverb
    scores_csv = adv_dir / \
        f'{adverb}_{BK}mostNEG-bigrams_AMscores_{timestamp_today()}.csv'
    confirm_dir(adv_dir)

    adv_scores = bigram_am.filter(like=f'~{adverb}_', axis=0)
    adv_scores = adv_scores.sort_values(rank_by, ascending=False)
    adv_scores.to_csv(scores_csv)

    # marginal frequencies: one row per distinct "<l1>_<adverb>" combination
    marginals = adv_scores.filter(['N', 'f1', 'adv_total'])
    marginals = marginals.set_index(adv_scores.l1 + f'_{adverb}').drop_duplicates()
    nb_show_table(marginals,
                  n_dec=0,
                  outpath=adv_dir / f'{adverb}_MarginalFreqs_{timestamp_today()}.md',
                  suppress_printing=not verbose)

    # score columns only, written next to the csv with an `.md` suffix
    score_view = adv_scores.filter(regex=r'^([dLGeu]|f2?$|adj_total)').round(2)
    score_view = score_view.sort_values(rank_by, ascending=False)
    nb_show_table(score_view, n_dec=2,
                  outpath=scores_csv.with_suffix('.md'),
                  suppress_printing=not verbose)

    samples = collect_adv_bigram_ex(adv_scores, neg_hits, metric=rank_by, n_examples=n_ex, verbose=verbose)

    print(f'\nSaving Samples in {adv_dir}/...')

    saved = []
    for bigram_key, sample_df in samples.items():
        destination = adv_dir.joinpath(f'{bigram_key}_{n_ex}ex.csv')
        sample_df.to_csv(destination)
        saved.append(destination)

    if not verbose:
        return
    print_iter((f'`{p.relative_to(adv_dir.parent.parent)}`' for p in saved), header='\nSamples saved as...', bullet='1.')

# Report header, then one section per adverb. `adv_am` was re-sorted by the
# earlier `pin_top_adv` call, so `rank` follows that ordering.
# NOTE(review): the header prints K, not the number of adverbs actually in
# `adv_am` -- confirm these agree.
print(f'# {BK} Most Negative Bigrams for each of the {K} Most Negative Adverbs\n')

for rank, adverb in enumerate(adv_am.index, start=1):
    print(f'\n## {rank}. *{adverb}*')
    populate_adv_dir(adverb, bigram_am, neg_hits, rank_by=['dP1', 'LRC'], n_ex=N_EX_PER_BIGRAM, 
                     verbose=VERBOSE)

# 10 Most Negative Bigrams for each of the 5 Most Negative Adverbs


## 1. *exactly*

|                     |        `N` |      `f1` |   `adv_total` |
|:--------------------|-----------:|----------:|--------------:|
| **NEGATED_exactly** | 86,330,752 | 3,226,213 |        61,599 |
| **NEGMIR_exactly**  |  2,032,082 |   293,963 |         1,114 |


|                                |   `f` |   `dP1` |   `LRC` |      `G2` |   `f2` |   `exp_f` |   `unexp_f` |   `adj_total` |
|:-------------------------------|------:|--------:|--------:|----------:|-------:|----------:|------------:|--------------:|
| **NEGany~exactly_surprising**  |   441 |    0.96 |    7.34 |  2,863.35 |    444 |     16.59 |      424.41 |       150,067 |
| **NEGany~exactly_shy**         |   124 |    0.96 |    1.53 |    815.15 |    124 |      4.63 |      119.37 |        50,956 |
| **NEGany~exactly_forthcoming** |   107 |    0.96 |    1.32 |    703.40 |    107 |      4.00 |      103.00 |        11,270 |
| **NEGany~exactly_imp