# Identifying Adverbs with Strongest Negative Environment Associations

In [68]:
from pathlib import Path

import pandas as pd
from pprint import pprint

from source.utils import PKL_SUFF, SAMPLE_ADV, print_md_table
from source.utils.associate import AM_DF_DIR, TOP_AM_DIR, adjust_assoc_columns
from source.utils.general import print_iter, snake_to_camel, timestamp_today

SET_FLOOR = 2000
MIR_FLOOR = 200
K = 9

Set columns and diplay settings

In [69]:
FOCUS = ['f', 'E11', 'unexpected_f',
         'am_p1_given2', 'conservative_log_ratio',
         'am_log_likelihood', 't_score',
         'mutual_information', 'am_odds_ratio_disc',
         'N', 'f1', 'f2', 'l1', 'l2']
pd.set_option('display.max_colwidth', 20)
pd.set_option('display.max_columns', 12)
pd.set_option('display.width', 90)
pd.set_option("display.precision", 2)
pd.set_option("styler.format.precision", 2)
pd.set_option("styler.format.thousands", ",")
pd.set_option("display.float_format", '{:,.2f}'.format)
pd.set_option("styler.render.repr", "html")

## Set paths and load adverb association tables

In [70]:
POLAR_DIR = AM_DF_DIR.joinpath('polar')

polar_adv_dirs = []
# results/assoc_df/polar/RBdirect/adv/extra/polarized-adv_35f-7c_min5000x_extra.pkl.gz
adv_am_paths = {
    p.name: tuple(
        p.joinpath('adv/extra').glob(
            f'*35f-7c_min{SET_FLOOR if p.name == "RBdirect" else MIR_FLOOR}x*{PKL_SUFF}')
    )[0]
    for p in POLAR_DIR.iterdir()}

setdiff_adv = pd.read_pickle(adv_am_paths['RBdirect'])
mirror_adv = pd.read_pickle(adv_am_paths['NEGmirror'])

In [71]:
adjust_assoc_columns(setdiff_adv.sample(K).sort_values('conservative_log_ratio', ascending=False)[FOCUS])

Unnamed: 0_level_0,f,exp_f,unexp_f,dP1,LRC,G2,...,odds_r_disc,N,f1,f2,l1,l2
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
NEG~particularly,55799,21523.84,34275.16,0.06,1.43,40303.42,...,0.45,86330752,3226213,575960,NEGATED,particularly
COM~startlingly,7276,7063.56,212.44,0.03,1.25,246.75,...,0.66,86330752,83102035,7338,COMPLEMENT,startlingly
COM~thankfully,2153,2075.37,77.63,0.04,0.53,138.39,...,1.38,86330752,83102035,2156,COMPLEMENT,thankfully
NEG~mentally,2446,3061.08,-615.08,-0.01,-0.18,-137.73,...,-0.1,86330752,3226213,81912,NEGATED,mentally
COM~logistically,2012,2089.81,-77.81,-0.04,-0.42,-61.04,...,-0.31,86330752,83102035,2171,COMPLEMENT,logistically
COM~universally,13569,14370.67,-801.67,-0.05,-1.16,-864.02,...,-0.41,86330752,83102035,14929,COMPLEMENT,universally
COM~mutually,39541,43780.04,-4239.04,-0.09,-1.85,-6808.21,...,-0.59,86330752,83102035,45481,COMPLEMENT,mutually
COM~consciously,3506,4272.02,-766.02,-0.17,-2.5,-1830.82,...,-0.84,86330752,83102035,4438,COMPLEMENT,consciously
COM~entirely,240125,292469.83,-52344.83,-0.17,-2.74,-125839.79,...,-0.84,86330752,83102035,303833,COMPLEMENT,entirely


In [72]:
adjust_assoc_columns(mirror_adv.sample(K).sort_values('conservative_log_ratio', ascending=False)[FOCUS])

Unnamed: 0_level_0,f,exp_f,unexp_f,dP1,LRC,G2,...,odds_r_disc,N,f1,f2,l1,l2
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
POS~maybe,2998,2571.13,426.87,0.14,3.43,857.78,...,1.78,2032082,1738105,3006,POSMIR,maybe
POS~perhaps,4035,3474.36,560.64,0.14,3.26,1042.16,...,1.4,2032082,1738105,4062,POSMIR,perhaps
POS~deeply,5506,4781.31,724.69,0.13,2.68,1175.7,...,1.04,2032082,1738105,5590,POSMIR,deeply
POS~no,1995,1772.25,222.75,0.11,1.29,263.29,...,0.64,2032082,1738105,2072,POSMIR,no
POS~severely,607,536.29,70.71,0.11,0.72,89.89,...,0.7,2032082,1738105,627,POSMIR,severely
POS~significantly,1616,1490.84,125.16,0.07,0.44,86.39,...,0.33,2032082,1738105,1743,POSMIR,significantly
POS~notably,211,221.53,-10.53,-0.04,0.0,-3.23,...,-0.13,2032082,1738105,259,POSMIR,notably
POS~ethically,205,225.81,-20.81,-0.08,0.0,-11.68,...,-0.23,2032082,1738105,264,POSMIR,ethically
POS~as,96064,109071.95,-13007.95,-0.11,-0.91,-9900.48,...,-0.31,2032082,1738105,127520,POSMIR,as


## Calculate "Most Negative" Adverbs for each Polarity Approximation

In [73]:
def get_top_vals(df: pd.DataFrame,
                 index_like: str = 'NEG',
                 metric_filter: str | list = 'conservative_log_ratio',
                 k: int = 10,
                 val_col: str = None,
                 ignore_neg_adv: bool = True):
    env_df = df.copy().loc[df.conservative_log_ratio >=
                           1].filter(like=index_like, axis=0)
    if ignore_neg_adv:
        env_df = env_df.loc[~df.l2.isin(
            ("n't", 'not', 'barely', 'never', 'no', 'none')), :]
    if isinstance(metric_filter, str):
        metric_filter = [metric_filter]

    top = pd.concat([env_df.nlargest(k, m) for m in metric_filter]
                    ).drop_duplicates(keep='first')

    if val_col:
        top = top[[val_col] + metric_filter]

    return top.sort_values(metric_filter[0], ascending=False)


[setdiff_top15, mirror_top15] = [
    get_top_vals(
        adv_df, k=15,
        metric_filter=['am_p1_given2', 'conservative_log_ratio'])
    for adv_df in (setdiff_adv, mirror_adv)
]
adjust_assoc_columns(setdiff_top15.filter(items=FOCUS).reset_index())

Unnamed: 0,key,f,exp_f,unexp_f,dP1,LRC,...,odds_r_disc,N,f1,f2,l1,l2
0,NEG~necessarily,42708,2118.68,40589.32,0.72,6.23,...,1.9,86330752,3226213,56694,NEGATED,necessarily
1,NEG~exactly,43635,2301.98,41333.02,0.67,5.9,...,1.8,86330752,3226213,61599,NEGATED,exactly
2,NEG~that,165411,9357.24,156053.76,0.63,5.62,...,1.72,86330752,3226213,250392,NEGATED,that
3,NEG~immediately,57319,3855.76,53463.24,0.52,4.96,...,1.52,86330752,3226213,103177,NEGATED,immediately
4,NEG~yet,52546,3800.83,48745.17,0.48,4.74,...,1.45,86330752,3226213,101707,NEGATED,yet
5,NEG~terribly,18054,2622.43,15431.57,0.22,3.09,...,0.95,86330752,3226213,70174,NEGATED,terribly
6,NEG~remotely,5679,829.4,4849.6,0.22,3.03,...,0.95,86330752,3226213,22194,NEGATED,remotely
7,NEG~only,114070,17346.13,96723.87,0.21,3.04,...,0.94,86330752,3226213,464168,NEGATED,only
8,NEG~altogether,4575,771.17,3803.82,0.18,2.75,...,0.87,86330752,3226213,20636,NEGATED,altogether
9,NEG~entirely,63708,11354.35,52353.65,0.17,2.74,...,0.84,86330752,3226213,303833,NEGATED,entirely


In [74]:
adjust_assoc_columns(mirror_top15.filter(items=FOCUS).reset_index())

Unnamed: 0,key,f,exp_f,unexp_f,dP1,LRC,...,odds_r_disc,N,f1,f2,l1,l2
0,NEG~before,290,42.53,247.47,0.84,5.11,...,2.58,2032082,293963,294,NEGMIR,before
1,NEG~ever,4718,749.2,3968.8,0.77,5.57,...,1.79,2032082,293963,5179,NEGMIR,ever
2,NEG~exactly,813,161.15,651.85,0.59,3.51,...,1.2,2032082,293963,1114,NEGMIR,exactly
3,NEG~any,1082,219.02,862.98,0.57,3.48,...,1.17,2032082,293963,1514,NEGMIR,any
4,NEG~remotely,1846,393.04,1452.96,0.54,3.35,...,1.1,2032082,293963,2717,NEGMIR,remotely
5,NEG~particularly,9278,2163.26,7114.74,0.48,3.15,...,1.0,2032082,293963,14954,NEGMIR,particularly
6,NEG~that,4338,1080.91,3257.09,0.44,2.86,...,0.92,2032082,293963,7472,NEGMIR,that
7,NEG~necessarily,971,243.18,727.82,0.43,2.66,...,0.91,2032082,293963,1681,NEGMIR,necessarily
8,NEG~inherently,2872,817.19,2054.81,0.36,2.42,...,0.79,2032082,293963,5649,NEGMIR,inherently
9,NEG~overtly,392,129.91,262.09,0.29,1.71,...,0.66,2032082,293963,898,NEGMIR,overtly


In [75]:
def load_backup(lower_floor: int = 100,
                loaded_path: Path = adv_am_paths['RBdirect']) -> pd.DataFrame:
    located_paths = tuple(loaded_path.parent.glob(
        f'*35f-7c_min{lower_floor}x*{PKL_SUFF}'))
    if any(located_paths):
        backup_df = pd.read_pickle(located_paths[0])

        backup_df = backup_df.filter(like='NEG', axis=0).filter(
            items=FOCUS).reset_index().set_index('l2')
        backup_df.index.name = 'adv'
        return backup_df
    else:
        return []



def fill_empties(name_1, name_2, both, loaded_paths):
    for name in (name_1, name_2):
        name = name.strip('_')
        path = loaded_paths['RBdirect'] if name == 'SET' else loaded_paths['NEGmirror']
        floor = 100
        if any(both[f'f_{name}'].isna()):

            neg_backup = load_backup(floor, loaded_path=path)
            neg_backup.columns = (pd.Series(adjust_assoc_columns(neg_backup.columns)
                                           ) + f'_{name}').to_list()
            if any(neg_backup):
                cats = both.select_dtypes(include='category').columns
                both[cats] = both[cats].astype('string')
                backup_cats = neg_backup.select_dtypes(
                    include='category').columns
                neg_backup[backup_cats] = neg_backup[backup_cats].astype(
                    'string')

                undefined = both.index[both[f'f_{name}'].isna()].to_list()
                both.loc[undefined,
                         neg_backup.columns] = neg_backup.filter(items=undefined, axis=0)

                both[cats] = both[cats].astype('category')

    return both


In [76]:
def combine_top(df_1: pd.DataFrame,
                name_1: str,
                df_2: pd.DataFrame,
                name_2: str,
                env_filter: str = 'NEG',
                filter_items: list = FOCUS,
                k: int = 10) -> pd.DataFrame:

    top_dfs = [get_top_vals(adv_df, index_like=env_filter, k=k,
                            metric_filter=['am_p1_given2',
                                           'conservative_log_ratio']
                            )
               for adv_df in [df_1, df_2]]
    for i, name in enumerate([name_1, name_2]):

        print_iter(
            top_dfs[i].l2.to_list(), bullet='1.',
            header=f'{name}: union of top {k} adverbs ranked by deltaP(1|2) and LRC')

    top_adv = pd.concat((top_dfs[0].l2, top_dfs[1].l2)).drop_duplicates()

    print_iter(
        top_adv, bullet='1.',
        header=f'Union of top adverbs for {name_1} and {name_2}. (Novel {name_2} adverbs listed last)')

    df_1, df_2 = [d.filter(items=filter_items)
                  .filter(like=env_filter, axis=0)
                  .reset_index().set_index('l2')
                  for d in [df_1, df_2]]
    df_1 = adjust_assoc_columns(df_1)
    df_2 = adjust_assoc_columns(df_2)

    both = pd.DataFrame(index=top_adv)
    name_1, name_2 = [f"_{n.strip('_')}" for n in [name_1, name_2]]
    both = both.join(df_1).join(df_2, lsuffix=name_1,
                                rsuffix=name_2).sort_index(axis=1)
    # ! Empty cells need to be filled _before_ calculating mean
    both = fill_empties(name_1, name_2, both, adv_am_paths)

    for metric in (both.select_dtypes(include='number').columns.to_series()
                   .str.replace(r'_(MIR|SET)$', '', regex=True).unique()):

        both[f'mean_{snake_to_camel(metric)}']= both.filter(
            regex=f"^{metric}").agg('mean', axis='columns')

    return both

## Compile top NEG~adverb associations across both approximation methods

In [77]:
C = combine_top(setdiff_adv.copy(), 'SET',
                mirror_adv.copy(), 'MIR', k=K)


SET: union of top 9 adverbs ranked by deltaP(1|2) and LRC
1. necessarily
1. exactly
1. that
1. immediately
1. yet
1. terribly
1. remotely
1. only
1. altogether

MIR: union of top 9 adverbs ranked by deltaP(1|2) and LRC
1. before
1. ever
1. exactly
1. any
1. remotely
1. particularly
1. that
1. necessarily
1. inherently

Union of top adverbs for SET and MIR. (Novel MIR adverbs listed last)
1. necessarily
1. exactly
1. that
1. immediately
1. yet
1. terribly
1. remotely
1. only
1. altogether
1. before
1. ever
1. any
1. particularly
1. inherently


SET: union of top 5 adverbs ranked by $\Delta P(\texttt{env}|\texttt{adv})$ and LRC
1. necessarily
1. exactly
1. that
1. immediately
1. yet

MIR: union of top 5 adverbs ranked by $\Delta P(\texttt{env}|\texttt{adv})$ and LRC
1. before
1. ever
1. exactly
1. any
1. remotely

Union of top adverbs for SET and MIR. (Novel MIR adverbs listed last)
1. necessarily
1. exactly
1. that
1. immediately
1. yet
1. before
1. ever
1. any
1. remotely

In [78]:
print(C.columns.tolist())

['G2_MIR', 'G2_SET', 'LRC_MIR', 'LRC_SET', 'MI_MIR', 'MI_SET', 'N_MIR', 'N_SET', 'dP1_MIR', 'dP1_SET', 'exp_f_MIR', 'exp_f_SET', 'f1_MIR', 'f1_SET', 'f2_MIR', 'f2_SET', 'f_MIR', 'f_SET', 'key_MIR', 'key_SET', 'l1_MIR', 'l1_SET', 'odds_r_disc_MIR', 'odds_r_disc_SET', 't_MIR', 't_SET', 'unexp_f_MIR', 'unexp_f_SET', 'mean_G2', 'mean_LRC', 'mean_MI', 'mean_N', 'mean_dP1', 'mean_expF', 'mean_f1', 'mean_f2', 'mean_f', 'mean_oddsRDisc', 'mean_t', 'mean_unexpF']


In [79]:
main_cols_ordered = pd.concat((*[C.filter(like=m).columns.to_series() for m in ('LRC', 'P1', 'G2')],
                               *[C.filter(regex=f'^{f}_').columns.to_series() for f in ['f', 'f1', 'f2'] ]) 
                              ).to_list()
print(main_cols_ordered)

['LRC_MIR', 'LRC_SET', 'mean_LRC', 'dP1_MIR', 'dP1_SET', 'mean_dP1', 'G2_MIR', 'G2_SET', 'mean_G2', 'f_MIR', 'f_SET', 'f1_MIR', 'f1_SET', 'f2_MIR', 'f2_SET']


In [80]:
C.index.name = 'adv'
C=C.sort_values('mean_LRC', ascending=False)
pd.set_option('display.max_columns', 16)
C[main_cols_ordered]

Unnamed: 0_level_0,LRC_MIR,LRC_SET,mean_LRC,dP1_MIR,dP1_SET,mean_dP1,G2_MIR,G2_SET,mean_G2,f_MIR,f_SET,f1_MIR,f1_SET,f2_MIR,f2_SET
adv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
exactly,3.51,5.9,4.71,0.59,0.67,0.63,1939.47,214404.2,108171.83,813.0,43635.0,293963.0,3226213.0,1114.0,61599.0
necessarily,2.66,6.23,4.44,0.43,0.72,0.57,1688.91,219003.46,110346.18,971.0,42708.0,293963.0,3226213.0,1681.0,56694.0
before,5.11,3.65,4.38,0.84,0.38,0.61,1080.52,1062.13,1071.32,290.0,311.0,293963.0,3226213.0,294.0,748.0
that,2.86,5.62,4.24,0.44,0.63,0.53,7632.21,781016.11,394324.16,4338.0,165411.0,293963.0,3226213.0,7472.0,250392.0
remotely,3.35,3.03,3.19,0.54,0.22,0.38,4009.84,13354.33,8682.08,1846.0,5679.0,293963.0,3226213.0,2717.0,22194.0
yet,1.18,4.74,2.96,0.21,0.48,0.34,242.23,209055.78,104649.01,320.0,52546.0,293963.0,3226213.0,909.0,101707.0
ever,5.57,0.28,2.92,0.77,0.01,0.39,15340.34,353.58,7846.96,4718.0,5967.0,293963.0,3226213.0,5179.0,124592.0
immediately,0.79,4.96,2.88,0.14,0.52,0.33,181.2,239462.58,119821.89,407.0,57319.0,293963.0,3226213.0,1442.0,103177.0
any,3.48,2.28,2.88,0.57,0.13,0.35,2511.26,23683.0,13097.13,1082.0,15492.0,293963.0,3226213.0,1514.0,94152.0
particularly,3.15,1.43,2.29,0.48,0.06,0.27,17999.07,40303.42,29151.24,9278.0,55799.0,293963.0,3226213.0,14954.0,575960.0


Save full adverb selection as `.csv`

In [81]:
C.to_csv(TOP_AM_DIR / f'Top{K}_NEG-ADV_combined.35f-7c_{timestamp_today()}.csv')

Save `all-columns`, `means`, and `MAIN` as markdown formatted tables

In [82]:
C.to_markdown(floatfmt=',.2f', intfmt=',', buf=TOP_AM_DIR / f'Top{K}_NEG-ADV_combined_all-columns.35f-7c_{timestamp_today()}.md')
C.filter(like='mean_').to_markdown(floatfmt=',.2f', intfmt=',', buf=TOP_AM_DIR / f'Top{K}_NEG-ADV_combined_means.35f-7c_{timestamp_today()}.md')
C[main_cols_ordered].to_markdown(floatfmt=',.2f', intfmt=',', buf=TOP_AM_DIR / f'Top{K}_NEG-ADV_combined_MAIN.35f-7c_{timestamp_today()}.md')

## Collect bigrams corresponding to top adverbs

In [83]:
# results/assoc_df/polar/RBdirect/bigram/polarized-bigram_35f-7c_min1000x.pkl.gz
bigram_floor = 200
bigram_dfs = {d.name:
              pd.read_pickle(
                  tuple(d.joinpath('bigram/extra')
                        .glob(f'*35f-7c*min{bigram_floor//2 if d.name == "NEGmirror" else bigram_floor}x*.pkl.gz')
                        )[0])
              for d in POLAR_DIR.iterdir()}

In [84]:
def show_adv_bigrams(sample_size, C, bigram_dfs, column_list: list = None) -> dict:

    print('# Top bigrams corresponding to top adverbs\n')
    print(timestamp_today())
    patterns = list(bigram_dfs.keys())
    top_adverbs = C.mean_LRC.nlargest(sample_size).index
    bigram_samples = dict.fromkeys(top_adverbs)
    bigrams = []
    adj = []
    for adv in top_adverbs:
        print(f'\n## _{adv}_\n')
        adv_top = None
        bigram_samples[adv] = dict.fromkeys(patterns + ['both', 'adj'])
        adj_for_adv = []
        for pat, bdf in bigram_dfs.items():
            bdf = bdf[FOCUS+['adv', 'adj', 'adj_total']]
            bdf.columns = adjust_assoc_columns(bdf.columns)
            bdf = bdf.loc[bdf.LRC >= 1, :]

            adv_pat_bigrams = bdf.filter(
                like=adv, axis=0).nlargest(sample_size, 'LRC')
            # print(adv_top_bigrams)
            if adv_pat_bigrams.empty:
                print(f'No bigrams found in loaded `{pat}` AM table.')
            else:
                column_list = column_list or bdf.columns
                print_md_table(adv_pat_bigrams[column_list], n_dec=2,
                               title=f'### Top {sample_size} `{pat}` "{adv}_*" bigrams (sorted by `LRC`; `LRC > 1`)')

            adj_for_adv.extend(adv_pat_bigrams.adj.drop_duplicates().to_list())

            bigram_samples[adv][pat] = adv_pat_bigrams

            adv_top = adv_pat_bigrams if adv_top is None else pd.concat(
                [adv_top, adv_pat_bigrams])

        bigram_samples[adv]['adj'] = set(adj_for_adv)
        bigrams.extend(adv_top.l2.drop_duplicates().to_list())
        adj.extend(adj_for_adv)
        bigram_samples[adv]['both'] = adv_top
    bigram_samples['bigrams'] = set(bigrams)
    bigram_samples['adj'] = set(adj)
    return bigram_samples


samples_dict = show_adv_bigrams(
    K, C, bigram_dfs,
    column_list=[
        'adj',
        *pd.Series(main_cols_ordered).str.replace(
            r'mean_|_SET|_MIR', '', regex=True)
        .drop_duplicates().to_list(),
        't', 'MI'
    ]
)

# Top bigrams corresponding to top adverbs

2024-05-15

## _exactly_


### Top 9 `NEGmirror` "exactly_*" bigrams (sorted by `LRC`; `LRC > 1`)

| key              | adj   |   LRC |   dP1 |     G2 |   f |      f1 |   f2 |     t |   MI |
|:-----------------|:------|------:|------:|-------:|----:|--------:|-----:|------:|-----:|
| NEG~exactly_sure | sure  |  2.09 |  0.85 | 560.65 | 148 | 293,963 |  149 | 10.39 | 0.84 |


### Top 9 `RBdirect` "exactly_*" bigrams (sorted by `LRC`; `LRC > 1`)

| key                    | adj        |   LRC |   dP1 |        G2 |     f |        f1 |    f2 |     t |   MI |
|:-----------------------|:-----------|------:|------:|----------:|------:|----------:|------:|------:|-----:|
| NEG~exactly_sure       | sure       |  8.63 |  0.92 | 54,750.58 | 8,860 | 3,226,213 | 9,301 | 90.43 | 1.41 |
| NEG~exactly_new        | new        |  8.54 |  0.93 |  8,697.93 | 1,378 | 3,226,213 | 1,418 | 35.69 | 1.42 |
| NEG~exactly_easy       | easy       |  8.37 |  0.93 |  6,747.6


### Top 9 `RBdirect` "necessarily_*" bigrams (sorted by `LRC`; `LRC > 1`)

| key                            | adj            |   LRC |   dP1 |        G2 |     f |        f1 |    f2 |     t |   MI |
|:-------------------------------|:---------------|------:|------:|----------:|------:|----------:|------:|------:|-----:|
| NEG~necessarily_indicative     | indicative     |  8.37 |  0.93 |  8,811.69 | 1,406 | 3,226,213 | 1,456 | 36.05 | 1.41 |
| NEG~necessarily_representative | representative |  7.31 |  0.91 |  3,044.27 |   496 | 3,226,213 |   524 | 21.39 | 1.40 |
| NEG~necessarily_easy           | easy           |  7.26 |  0.88 |  5,448.34 |   914 | 3,226,213 |   996 | 29.00 | 1.39 |
| NEG~necessarily_surprising     | surprising     |  7.22 |  0.93 |  2,150.86 |   343 | 3,226,213 |   355 | 17.80 | 1.41 |
| NEG~necessarily_true           | true           |  6.89 |  0.82 | 18,199.76 | 3,238 | 3,226,213 | 3,786 | 54.42 | 1.36 |
| NEG~necessarily_interested     | interested     |  6.77 |  0.

  adv_top = adv_pat_bigrams if adv_top is None else pd.concat(
  adv_top = adv_pat_bigrams if adv_top is None else pd.concat(
  adv_top = adv_pat_bigrams if adv_top is None else pd.concat(
  adv_top = adv_pat_bigrams if adv_top is None else pd.concat(
  adv_top = adv_pat_bigrams if adv_top is None else pd.concat(
  adv_top = adv_pat_bigrams if adv_top is None else pd.concat(
  adv_top = adv_pat_bigrams if adv_top is None else pd.concat(


# Top bigrams corresponding to top adverbs

2024-05-15

## _exactly_


### Top 5 `RBdirect` "exactly_*" bigrams (sorted by `LRC`; `LRC > 1`)

| key               | adj   |   LRC |   dP1 |        G2 |     f |        f1 |    f2 |     t |   MI |
|:------------------|:------|------:|------:|----------:|------:|----------:|------:|------:|-----:|
| NEG~exactly_sure  | sure  |  8.63 |  0.92 | 54,750.58 | 8,860 | 3,226,213 | 9,301 | 90.43 | 1.41 |
| NEG~exactly_new   | new   |  8.54 |  0.93 |  8,697.93 | 1,378 | 3,226,213 | 1,418 | 35.69 | 1.42 |
| NEG~exactly_easy  | easy  |  8.37 |  0.93 |  6,747.64 | 1,069 | 3,226,213 | 1,100 | 31.44 | 1.42 |
| NEG~exactly_clear | clear |  8.30 |  0.92 | 10,937.16 | 1,759 | 3,226,213 | 1,835 | 40.31 | 1.41 |
| NEG~exactly_cheap | cheap |  8.28 |  0.95 |  4,443.27 |   693 | 3,226,213 |   704 | 25.33 | 1.42 |


### Top 5 `NEGmirror` "exactly_*" bigrams (sorted by `LRC`; `LRC > 1`)

| key              | adj   |   LRC |   dP1 |     G2 |   f |      f1 |   f2 |     t |   MI |
|:-----------------|:------|------:|------:|-------:|----:|--------:|-----:|------:|-----:|
| NEG~exactly_sure | sure  |  2.09 |  0.85 | 560.65 | 148 | 293,963 |  149 | 10.39 | 0.84 |


## _necessarily_


### Top 5 `RBdirect` "necessarily_*" bigrams (sorted by `LRC`; `LRC > 1`)

| key                            | adj            |   LRC |   dP1 |        G2 |     f |        f1 |    f2 |     t |   MI |
|:-------------------------------|:---------------|------:|------:|----------:|------:|----------:|------:|------:|-----:|
| NEG~necessarily_indicative     | indicative     |  8.37 |  0.93 |  8,811.69 | 1,406 | 3,226,213 | 1,456 | 36.05 | 1.41 |
| NEG~necessarily_representative | representative |  7.31 |  0.91 |  3,044.27 |   496 | 3,226,213 |   524 | 21.39 | 1.40 |
| NEG~necessarily_easy           | easy           |  7.26 |  0.88 |  5,448.34 |   914 | 3,226,213 |   996 | 29.00 | 1.39 |
| NEG~necessarily_surprising     | surprising     |  7.22 |  0.93 |  2,150.86 |   343 | 3,226,213 |   355 | 17.80 | 1.41 |
| NEG~necessarily_true           | true           |  6.89 |  0.82 | 18,199.76 | 3,238 | 3,226,213 | 3,786 | 54.42 | 1.36 |


### Top 5 `NEGmirror` "necessarily_*" bigrams (sorted by `LRC`; `LRC > 1`)

| key                   | adj   |   LRC |   dP1 |     G2 |   f |      f1 |   f2 |     t |   MI |
|:----------------------|:------|------:|------:|-------:|----:|--------:|-----:|------:|-----:|
| NEG~necessarily_wrong | wrong |  4.19 |  0.77 | 693.55 | 213 | 293,963 |  233 | 12.29 | 0.80 |


## _before_

No bigrams found in loaded `RBdirect` AM table.

### Top 5 `NEGmirror` "before_*" bigrams (sorted by `LRC`; `LRC > 1`)

| key                  | adj       |   LRC |   dP1 |     G2 |   f |      f1 |   f2 |     t |   MI |
|:---------------------|:----------|------:|------:|-------:|----:|--------:|-----:|------:|-----:|
| NEG~before_available | available |  3.99 |  0.84 | 654.92 | 177 | 293,963 |  180 | 11.35 | 0.83 |


## _that_


### Top 5 `RBdirect` "that_*" bigrams (sorted by `LRC`; `LRC > 1`)

| key                  | adj         |   LRC |   dP1 |        G2 |     f |        f1 |     f2 |     t |   MI |
|:---------------------|:------------|------:|------:|----------:|------:|----------:|-------:|------:|-----:|
| NEG~that_uncommon    | uncommon    |  8.39 |  0.94 |  5,136.91 |   804 | 3,226,213 |    819 | 27.28 | 1.42 |
| NEG~that_surprising  | surprising  |  8.14 |  0.92 |  7,115.30 | 1,141 | 3,226,213 |  1,187 | 32.47 | 1.41 |
| NEG~that_common      | common      |  8.12 |  0.92 |  7,564.08 | 1,216 | 3,226,213 |  1,268 | 33.51 | 1.41 |
| NEG~that_hard        | hard        |  7.96 |  0.88 | 59,642.82 | 9,966 | 3,226,213 | 10,818 | 95.78 | 1.39 |
| NEG~that_complicated | complicated |  7.95 |  0.91 |  7,450.89 | 1,208 | 3,226,213 |  1,270 | 33.39 | 1.41 |


### Top 5 `NEGmirror` "that_*" bigrams (sorted by `LRC`; `LRC > 1`)

| key             | adj    |   LRC |   dP1 |       G2 |   f |      f1 |   f2 |     t |   MI |
|:----------------|:-------|------:|------:|---------:|----:|--------:|-----:|------:|-----:|
| NEG~that_simple | simple |  4.48 |  0.74 | 1,483.32 | 478 | 293,963 |  540 | 18.29 | 0.79 |
| NEG~that_easy   | easy   |  3.91 |  0.68 | 1,278.04 | 458 | 293,963 |  558 | 17.63 | 0.75 |
| NEG~that_big    | big    |  2.99 |  0.66 |   308.12 | 113 | 293,963 |  140 |  8.72 | 0.75 |
| NEG~that_good   | good   |  2.65 |  0.47 |   848.28 | 449 | 293,963 |  732 | 16.19 | 0.63 |
| NEG~that_great  | great  |  1.93 |  0.36 |   406.36 | 288 | 293,963 |  575 | 12.07 | 0.54 |


## _remotely_


### Top 5 `RBdirect` "remotely_*" bigrams (sorted by `LRC`; `LRC > 1`)

| key                     | adj        |   LRC |   dP1 |       G2 |   f |        f1 |    f2 |     t |   MI |
|:------------------------|:-----------|------:|------:|---------:|----:|----------:|------:|------:|-----:|
| NEG~remotely_true       | true       |  4.46 |  0.56 | 1,089.49 | 250 | 3,226,213 |   420 | 14.82 | 1.20 |
| NEG~remotely_close      | close      |  2.92 |  0.23 | 1,722.76 | 696 | 3,226,213 | 2,558 | 22.76 | 0.86 |
| NEG~remotely_interested | interested |  2.72 |  0.23 |   808.74 | 333 | 3,226,213 | 1,252 | 15.68 | 0.85 |


### Top 5 `NEGmirror` "remotely_*" bigrams (sorted by `LRC`; `LRC > 1`)

| key                | adj   |   LRC |   dP1 |     G2 |   f |      f1 |   f2 |     t |   MI |
|:-------------------|:------|------:|------:|-------:|----:|--------:|-----:|------:|-----:|
| NEG~remotely_close | close |  3.02 |  0.59 | 524.61 | 219 | 293,963 |  299 | 11.88 | 0.70 |

In [85]:
pprint({key: adv_dict['adj'] for key, adv_dict in samples_dict.items() if key not in {'bigrams', 'adj'}})

{'any': {'better',
         'bigger',
         'clearer',
         'different',
         'happier',
         'many',
         'simpler',
         'worse',
         'younger'},
 'before': {'available'},
 'ever': {'able',
          'clever',
          'easy',
          'good',
          'likely',
          'perfect',
          'severe',
          'simple',
          'wrong'},
 'exactly': {'cheap',
             'clear',
             'easy',
             'happy',
             'ideal',
             'new',
             'subtle',
             'sure',
             'surprising'},
 'immediately': {'able',
                 'apparent',
                 'available',
                 'clear',
                 'evident',
                 'obvious',
                 'possible',
                 'successful',
                 'visible'},
 'necessarily': {'easy',
                 'illegal',
                 'indicative',
                 'interested',
                 'new',
                 'related',


In [86]:
for key, info in samples_dict.items():
    if key in ('bigrams', 'adj'):
        key = f'ALL {key.replace("adj", "adjectives")}'
    formatted_iter = [
        f'_{a.replace("_", " ")}_' for a
        in (info['adj'] if isinstance(info, dict)
            else info)]
    print_iter(formatted_iter,
               header=f'+ _{key}_ ({len(formatted_iter)} unique)',
               bullet='+', indent=2)


+ _exactly_ (9 unique)
  + _ideal_
  + _easy_
  + _new_
  + _clear_
  + _subtle_
  + _cheap_
  + _happy_
  + _surprising_
  + _sure_

+ _necessarily_ (10 unique)
  + _indicative_
  + _illegal_
  + _easy_
  + _new_
  + _wrong_
  + _true_
  + _interested_
  + _representative_
  + _related_
  + _surprising_

+ _before_ (1 unique)
  + _available_

+ _that_ (16 unique)
  + _common_
  + _great_
  + _hard_
  + _big_
  + _complicated_
  + _easy_
  + _expensive_
  + _important_
  + _unusual_
  + _bad_
  + _good_
  + _simple_
  + _impressed_
  + _exciting_
  + _surprising_
  + _uncommon_

+ _remotely_ (3 unique)
  + _close_
  + _true_
  + _interested_

+ _yet_ (9 unique)
  + _certain_
  + _final_
  + _eligible_
  + _public_
  + _ready_
  + _complete_
  + _available_
  + _clear_
  + _sure_

+ _ever_ (9 unique)
  + _able_
  + _easy_
  + _wrong_
  + _clever_
  + _perfect_
  + _good_
  + _simple_
  + _likely_
  + _severe_

+ _immediately_ (9 unique)
  + _apparent_
  + _able_
  + _available_
  + _po


+ _exactly_ (5 unique)
  + _clear_
  + _sure_
  + _new_
  + _easy_
  + _cheap_

+ _necessarily_ (6 unique)
  + _surprising_
  + _true_
  + _representative_
  + _indicative_
  + _easy_
  + _wrong_

+ _before_ (1 unique)
  + _available_

+ _that_ (10 unique)
  + _complicated_
  + _surprising_
  + _easy_
  + _simple_
  + _common_
  + _uncommon_
  + _big_
  + _good_
  + _great_
  + _hard_

+ _remotely_ (3 unique)
  + _close_
  + _interested_
  + _true_

+ _ALL bigrams_ (25 unique)
  + _remotely true_
  + _necessarily indicative_
  + _that hard_
  + _necessarily easy_
  + _exactly easy_
  + _exactly clear_
  + _that big_
  + _that great_
  + _necessarily representative_
  + _exactly cheap_
  + _that surprising_
  + _necessarily surprising_
  + _exactly sure_
  + _that easy_
  + _that common_
  + _that simple_
  + _that complicated_
  + _before available_
  + _necessarily true_
  + _exactly new_
  + _that uncommon_
  + _necessarily wrong_
  + _remotely close_
  + _remotely interested_
  + _that good_

+ _ALL adjectives_ (21 unique)
  + _sure_
  + _new_
  + _indicative_
  + _simple_
  + _common_
  + _interested_
  + _available_
  + _true_
  + _cheap_
  + _close_
  + _complicated_
  + _representative_
  + _easy_
  + _wrong_
  + _clear_
  + _surprising_
  + _uncommon_
  + _big_
  + _good_
  + _great_
  + _hard_


In [87]:
all_top_adv_dfs = [ad['both']
                   for ad in samples_dict.values() if isinstance(ad, dict)]
NEG_bigrams_sample = pd.concat(
    all_top_adv_dfs).sort_values('LRC', ascending=False)
top_NEGbigram_df_path = TOP_AM_DIR.joinpath(
    f'Top{K}_NEG-ADV_top-bigrams.{timestamp_today()}.csv')
print(top_NEGbigram_df_path)
NEG_bigrams_sample.to_csv(
    top_NEGbigram_df_path)
NEG_bigrams_sample

/share/compling/projects/sanpi/results/top_AM/Top9_NEG-ADV_top-bigrams.2024-05-15.csv


Unnamed: 0_level_0,f,exp_f,unexp_f,dP1,LRC,G2,t,MI,...,N,f1,f2,l1,l2,adv,adj,adj_total
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
NEG~yet_clear,10553,399.60,10153.40,0.95,10.26,67924.56,98.84,1.42,...,86330752,3226213,10693,NEGATED,yet_clear,yet,clear,491108.00
NEG~yet_ready,7611,292.91,7318.09,0.93,9.23,48012.06,83.88,1.41,...,86330752,3226213,7838,NEGATED,yet_ready,yet,ready,240297.00
NEG~exactly_sure,8860,347.58,8512.42,0.92,8.63,54750.58,90.43,1.41,...,86330752,3226213,9301,NEGATED,exactly_sure,exactly,sure,844981.00
NEG~exactly_new,1378,52.99,1325.01,0.93,8.54,8697.93,35.69,1.42,...,86330752,3226213,1418,NEGATED,exactly_new,exactly,new,321311.00
NEG~yet_complete,2220,86.48,2133.52,0.92,8.42,13815.99,45.28,1.41,...,86330752,3226213,2314,NEGATED,yet_complete,yet,complete,107018.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NEG~that_great,288,83.18,204.82,0.36,1.93,406.36,12.07,0.54,...,2032082,293963,575,NEGMIR,that_great,that,great,6821.00
NEG~immediately_available,164,43.98,120.02,0.39,1.91,258.42,9.37,0.57,...,2032082,293963,304,NEGMIR,immediately_avai...,immediately,available,14919.00
NEG~that_important,115,34.43,80.57,0.34,1.47,153.47,7.51,0.52,...,2032082,293963,238,NEGMIR,that_important,that,important,48905.00
POS~as_many,1689,1498.54,190.46,0.11,1.20,228.90,4.63,0.05,...,2032082,1738105,1752,POSMIR,as_many,as,many,4087.00


In [88]:
NEG_bigrams_sample.l1.value_counts()

l1
NEGATED       62
NEGMIR        19
COMPLEMENT     4
POSMIR         1
Name: count, dtype: Int64