In [None]:
# %%

# coding=utf-8
from source.utils.general import print_iter
from pathlib import Path

import pandas as pd

from source.utils import PKL_SUFF, SAMPLE_ADV, print_iter, print_md_table, timestamp_today
from source.utils.associate import AM_DF_DIR, adjust_assoc_columns
from source.utils.sample import sample_pickle as sampkl



# %% [markdown]


  Set the columns of interest (`FOCUS`) and the display precision (number of decimal places shown) for viewing tables


In [None]:
# %%

# Columns kept when an association table is narrowed down for viewing.
FOCUS = [
    'f',
    'E11',
    'unexpected_f',
    'unexpected_abs_sqrt',
    'am_p1_given2',
    'conservative_log_ratio',
    'am_log_likelihood',
    't_score',
    'mutual_information',
    'am_odds_ratio_disc',
    'N',
    'f1',
    'f2',
    'l1',
    'l2',
]

# pandas display / Styler settings, applied in the order listed.
for _option_name, _option_value in (
    ('display.max_colwidth', 40),
    ('display.max_columns', 12),
    ('display.width', 250),
    ("display.precision", 3),
    ("styler.format.precision", 3),
    ("styler.format.thousands", ","),
    ("display.float_format", '{:,.3f}'.format),
    ("styler.render.repr", "html"),
):
    pd.set_option(_option_name, _option_value)

# %% [markdown]

 Set paths and load dataframes

In [None]:
# %%

POLAR_DIR = AM_DF_DIR.joinpath('polar')
# Minimum-frequency floors that select which pickled table is loaded for each approximation.
SET_FLOOR = 2000
MIR_FLOOR = 100
# Address each approximation's directory by name. `Path.iterdir()` yields entries in
# arbitrary order, so unpacking its result could silently swap the two tables.
setdiff_adv_dir, mirror_adv_dir = [POLAR_DIR.joinpath(approx_name, 'adv/extra')
                                   for approx_name in ('RBdirect', 'NEGmirror')]
# example path (the floor in the file name varies):
# results/assoc_df/polar/RBdirect/adv/extra/polarized-adv_35f-7c_min5000x_extra.pkl.gz
setdiff_adv = pd.read_pickle(
    tuple(setdiff_adv_dir.glob(f'*35f-7c_min{SET_FLOOR}x*{PKL_SUFF}'))[0])
# example path (the floor in the file name varies):
# results/assoc_df/polar/NEGmirror/adv/extra/polarized-adv_MIRROR_polarized.35f-7c_min5000x_extra.pkl.gz
mirror_adv = pd.read_pickle(
    tuple(mirror_adv_dir.glob(f'*35f-7c_min{MIR_FLOOR}x*{PKL_SUFF}'))[0])

# Peek at three random rows (unseeded, so the rows differ between runs).
setdiff_adv.sample(3)

Unnamed: 0_level_0,l1,l2,f,E11,am_log_likelihood,am_odds_ratio_disc,...,conservative_log_ratio_05,conservative_log_ratio_nc,conservative_log_ratio_dv,f_sqrt,f1_sqrt,f2_sqrt
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
COM~ecologically,COMPLEMENT,ecologically,9889,9647.183,214.064,0.459,...,0.943,1.177,0.962,99.443,9116.032,100.11
COM~formally,COMPLEMENT,formally,2855,2911.867,-25.844,-0.187,...,-0.084,-0.296,-0.102,53.432,9116.032,55.0
COM~vastly,COMPLEMENT,vastly,41236,39934.449,1732.839,0.806,...,2.253,2.423,2.267,203.066,9116.032,203.681


In [None]:
# %%

# Peek at three random rows of the mirror table (unseeded, so the rows differ between runs).
mirror_adv.sample(3)

Unnamed: 0_level_0,l1,l2,f,E11,am_log_likelihood,am_odds_ratio_disc,...,conservative_log_ratio_05,conservative_log_ratio_nc,conservative_log_ratio_dv,f_sqrt,f1_sqrt,f2_sqrt
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
NEG~medically,NEGMIR,medically,173,105.458,43.793,0.266,...,0.309,0.527,0.321,13.153,542.184,27.0
POS~second,POSMIR,second,182,173.632,3.047,0.157,...,0.0,0.0,0.0,13.491,1318.372,14.248
POS~unbearably,POSMIR,unbearably,200,180.475,18.637,0.47,...,0.0,0.367,0.0,14.142,1318.372,14.526


In [None]:
# %%

def get_top_vals(df: pd.DataFrame,
                 index_like: str = 'NEG',
                 metric_filter: str | list = 'conservative_log_ratio',
                 k: int = 10,
                 val_col: str = None,
                 ignore_neg_adv: bool = True):
    env_df = df.copy().loc[df.conservative_log_ratio >=
                           1].filter(like=index_like, axis=0)
    if ignore_neg_adv:
        env_df = env_df.loc[~df.l2.isin(
            ("n't", 'not', 'barely', 'never', 'no', 'none')), :]
    if isinstance(metric_filter, str):
        metric_filter = [metric_filter]

    top = pd.concat([env_df.nlargest(k, m) for m in metric_filter]
                    ).drop_duplicates(keep='first')

    if val_col:
        top = top[[val_col] + metric_filter]

    return top.sort_values(metric_filter[0], ascending=False)



# Top-20 rows per metric for each approximation's adverb table.
top20_sd, top20_mr = (
    get_top_vals(adv_df,
                 metric_filter=['am_p1_given2', 'conservative_log_ratio'],
                 k=20)
    for adv_df in (setdiff_adv, mirror_adv)
)

# Plain-text view of the FOCUS columns, set-difference table first.
print(top20_sd.filter(items=FOCUS).reset_index())
print(top20_mr.filter(items=FOCUS).reset_index())

                 key       f         E11  unexpected_f  unexpected_abs_sqrt  am_p1_given2  ...  am_odds_ratio_disc         N       f1       f2       l1            l2
0    NEG~necessarily   42708   2,118.676    40,589.324              201.468         0.716  ...               1.901  86330752  3226213    56694  NEGATED   necessarily
1        NEG~exactly   43635   2,301.978    41,333.022              203.305         0.671  ...               1.802  86330752  3226213    61599  NEGATED       exactly
2           NEG~that  165411   9,357.244   156,053.756              395.036         0.625  ...               1.723  86330752  3226213   250392  NEGATED          that
3    NEG~immediately   57319   3,855.764    53,463.236              231.221         0.519  ...               1.515  86330752  3226213   103177  NEGATED   immediately
4            NEG~yet   52546   3,800.829    48,745.171              220.783         0.480  ...               1.447  86330752  3226213   101707  NEGATED           yet
5   

In [None]:
# %%

def combine_top(df_1: pd.DataFrame,
                name_1: str,
                df_2: pd.DataFrame,
                name_2: str,
                env_filter: str = 'NEG',
                filter_items: list = FOCUS,
                k: int = 10) -> pd.DataFrame:
    """Place the top adverbs of two association tables side by side.

    The top-`k` rows of each table are chosen with `get_top_vals` (ranked by
    `am_p1_given2` and `conservative_log_ratio`). The `l2` values of both
    selections are printed and merged into one ordered list: first table's
    values first, then any values only found in the second. Both tables are
    then reduced to `filter_items`, restricted to index labels containing
    `env_filter`, re-indexed by `l2`, passed through `adjust_assoc_columns`,
    and joined onto that list.

    Args:
        df_1, df_2: the two association tables.
        name_1, name_2: labels used (with a leading underscore) as column
            suffixes for overlapping columns of the respective table.
        env_filter: substring an index label must contain to be kept.
        filter_items: columns to keep from each table.
        k: number of rows taken per metric from each table.

    Returns:
        pd.DataFrame: one row per selected `l2` value, columns sorted by label.
    """
    ranking_metrics = ('am_p1_given2', 'conservative_log_ratio')
    top_1 = get_top_vals(df_1, index_like=env_filter, k=k,
                         metric_filter=list(ranking_metrics))
    top_2 = get_top_vals(df_2, index_like=env_filter, k=k,
                         metric_filter=list(ranking_metrics))
    print(top_1.l2)
    print(top_2.l2)

    # Ordered union: everything from the first selection, then the values that
    # only the second selection contributes (each taken once).
    top_adv = top_1.l2.to_list()
    top_adv.extend(dict.fromkeys(
        adv for adv in top_2.l2 if adv not in top_adv))

    def _index_by_adverb(table: pd.DataFrame) -> pd.DataFrame:
        # keep requested columns and environment rows, then key the rows by `l2`
        narrowed = table.filter(items=filter_items)
        narrowed = narrowed.filter(like=env_filter, axis=0)
        return narrowed.reset_index().set_index('l2')

    by_adv_1 = _index_by_adverb(df_1)
    by_adv_2 = _index_by_adverb(df_2)
    by_adv_1 = adjust_assoc_columns(by_adv_1)
    by_adv_2 = adjust_assoc_columns(by_adv_2)

    suffix_1 = f"_{name_1.strip('_')}"
    suffix_2 = f"_{name_2.strip('_')}"
    both = pd.DataFrame(index=top_adv).join(by_adv_1)
    both = both.join(by_adv_2, lsuffix=suffix_1, rsuffix=suffix_2)
    return both.sort_index(axis=1)

# %% [markdown]

 Compile top NEG~adverb associations across both approximation methods

In [None]:
# %%

# Number of top rows taken per metric from each table.
sample_size = 8
# Combined view of the top adverbs from both approximations.
C = combine_top(df_1=setdiff_adv.copy(), name_1='SET',
                df_2=mirror_adv.copy(), name_2='MIR',
                k=sample_size)


def load_backup(lower_floor: int = 100):
    """Load a set-difference adverb table saved with a lower frequency floor.

    Reads the first pickle in `setdiff_adv_dir` whose name matches the given
    floor, keeps rows whose index label contains 'NEG' and the `FOCUS`
    columns, keys the rows by `l2`, and renames the columns via
    `adjust_assoc_columns` with a `_SET` suffix appended. The first rows of
    the result are printed.

    Args:
        lower_floor: floor value that appears in the pickle's file name.

    Returns:
        pd.DataFrame: the 'NEG' rows indexed by `l2`, columns suffixed `_SET`.
    """
    candidates = tuple(setdiff_adv_dir.glob(
        f'*35f-7c_min{lower_floor}x*{PKL_SUFF}'))
    backup_set_df = pd.read_pickle(candidates[0])

    neg_rows = backup_set_df.filter(like='NEG', axis=0)
    neg_set_backup = (neg_rows.filter(items=FOCUS)
                      .reset_index()
                      .set_index('l2'))

    adjusted_names = pd.Series(adjust_assoc_columns(neg_set_backup.columns))
    neg_set_backup.columns = adjusted_names + '_SET'

    print(neg_set_backup.head())
    return neg_set_backup


# Some selected adverbs have no SET values in `C` (missing `f_SET`); fill those
# rows from the table returned by `load_backup()`.
if C.f_SET.isna().any():
    neg_set_backup = load_backup()
    undefined = C.index[C.f_SET.isna()].to_list()
    print_iter(
        undefined, header=f'Adverbs with negated SET_DIFF tokens < {SET_FLOOR:,}', bullet='-')
    C.loc[undefined, neg_set_backup.columns] = neg_set_backup.loc[undefined, :]
    # (A bare `C.loc[undefined, :]` used to follow here; inside an `if` block it is
    # never displayed, so it was dropped. The table printed below includes these rows.)

print_md_table(C.filter(regex=r'G2|P1|^f|LRC|unexpected_f'), n_dec=2)

# Save the combined table with today's date in the file name.
C.to_csv(
    AM_DF_DIR / f'Top{sample_size}NEG-ADV_combined.35f-7c_{timestamp_today()}.csv')

key
NEG~necessarily    necessarily
NEG~exactly            exactly
NEG~that                  that
NEG~immediately    immediately
NEG~yet                    yet
NEG~terribly          terribly
NEG~remotely          remotely
NEG~only                  only
Name: l2, dtype: string
key
NEG~before                before
NEG~ever                    ever
NEG~exactly              exactly
NEG~any                      any
NEG~remotely            remotely
NEG~particularly    particularly
NEG~that                    that
NEG~necessarily      necessarily
Name: l2, dtype: string
                       key_SET   f_SET  exp_f_SET  unexp_f_SET  unexp_abs_sqrt_SET  dP1_SET  ...  MI_SET  odds_r_disc_SET     N_SET   f1_SET  f2_SET   l1_SET
l2                                                                                           ...                                                             
ornamentally  NEG~ornamentally     109      5.381      103.619              10.179    0.720  ...   1.307            

In [None]:
# %%

# example path (the floor in the file name varies):
# results/assoc_df/polar/RBdirect/bigram/polarized-bigram_35f-7c_min1000x.pkl.gz
bigram_floor = 200
# One bigram table per directory under POLAR_DIR, keyed by directory name.
# The "NEGmirror" table is looked up with half the floor used for the others.
bigram_dfs = {
    env_dir.name: pd.read_pickle(
        list(
            (env_dir / 'bigram/extra').glob(
                '*35f-7c*min{}x*.pkl.gz'.format(
                    bigram_floor // 2 if env_dir.name == "NEGmirror"
                    else bigram_floor)
            )
        )[0]
    )
    for env_dir in POLAR_DIR.iterdir()
}

In [None]:
# %%

def show_adv_bigrams(sample_size, C, bigram_dfs) -> dict:
    """Print and collect the top bigrams for the highest-LRC adverbs.

    The `sample_size` adverbs with the largest `LRC_SET` in `C` are taken.
    For each of them, and for each table in `bigram_dfs`, the `sample_size`
    bigrams with the largest `LRC` (among rows with `LRC >= 1`) are printed
    as a markdown table and stored.

    Args:
        sample_size: number of adverbs, and of bigrams per adverb and table.
        C: combined adverb table, indexed by adverb, with an `LRC_SET` column.
        bigram_dfs: mapping of pattern name to bigram association table; each
            table needs the `FOCUS` columns plus `adv`, `adj` and `adj_total`.

    Returns:
        dict: one entry per adverb, itself a dict holding the top bigrams per
        pattern, under 'both' their concatenation, and under 'adj' the set of
        adjectives seen. Two extra top-level keys, 'bigrams' and 'adj', hold
        the sets of all collected bigrams (`l2` values) and adjectives.
    """
    print('# Top bigrams corresponding to top adverbs\n')
    print(timestamp_today())
    patterns = list(bigram_dfs.keys())
    top_adverbs = C.LRC_SET.nlargest(sample_size).index

    # Column selection, renaming and the LRC floor do not depend on the adverb,
    # so prepare every table once instead of once per adverb.
    # NOTE(review): assumes `adjust_assoc_columns` has no side effects — confirm.
    prepared = {}
    for pat, bdf in bigram_dfs.items():
        bdf = bdf[FOCUS+['adv', 'adj', 'adj_total']]
        bdf.columns = adjust_assoc_columns(bdf.columns)
        prepared[pat] = bdf.loc[bdf.LRC >= 1,
                                ~bdf.columns.str.endswith('sqrt')]

    bigram_samples = dict.fromkeys(top_adverbs)
    bigrams = []
    adj = []
    for adv in top_adverbs:
        print(f'\n## _{adv}_\n')
        bigram_samples[adv] = dict.fromkeys(patterns + ['both', 'adj'])
        adj_for_adv = []
        pattern_tops = []
        for pat, bdf in prepared.items():
            adv_pat_bigrams = bdf.loc[bdf.adv ==
                                      adv, :].nlargest(sample_size, 'LRC')
            if adv_pat_bigrams.empty:
                print(f'No bigrams found in loaded `{pat}` AM table.')
            else:
                print_md_table(adv_pat_bigrams, n_dec=2,
                               title=f'### Top `{pat}` "{adv}_*" bigrams (sorted by LRC)')

            adj_for_adv.extend(adv_pat_bigrams.adj.drop_duplicates().to_list())

            bigram_samples[adv][pat] = adv_pat_bigrams
            pattern_tops.append(adv_pat_bigrams)

        # Concatenate once per adverb rather than growing a frame that starts
        # out empty; with no tables at all, fall back to an empty frame.
        adv_top = pd.concat(pattern_tops) if pattern_tops else pd.DataFrame()

        bigram_samples[adv]['adj'] = set(adj_for_adv)
        if 'l2' in adv_top:
            bigrams.extend(adv_top.l2.drop_duplicates().to_list())
        adj.extend(adj_for_adv)
        bigram_samples[adv]['both'] = adv_top
    bigram_samples['bigrams'] = set(bigrams)
    bigram_samples['adj'] = set(adj)
    return bigram_samples


# Print the per-adverb bigram tables and keep the collected samples for the cells below.
samples_dict = show_adv_bigrams(sample_size, C, bigram_dfs)

# Top bigrams corresponding to top adverbs

2024-05-11

## _necessarily_


### Top `RBdirect` "necessarily_*" bigrams (sorted by LRC)

| key                            |     f |   exp_f |   unexp_f |   dP1 |   LRC |        G2 |     t |   MI |   odds_r_disc |          N |        f1 |    f2 | l1      | l2                         | adv         | adj            |   adj_total |
|:-------------------------------|------:|--------:|----------:|------:|------:|----------:|------:|-----:|--------------:|-----------:|----------:|------:|:--------|:---------------------------|:------------|:---------------|------------:|
| NEG~necessarily_indicative     | 1,406 |   54.41 |  1,351.59 |  0.93 |  8.37 |  8,811.69 | 36.05 | 1.41 |          2.86 | 86,330,752 | 3,226,213 | 1,456 | NEGATED | necessarily_indicative     | necessarily | indicative     |   12,760.00 |
| NEG~necessarily_representative |   496 |   19.58 |    476.42 |  0.91 |  7.31 |  3,044.27 | 21.39 | 1.40 |          2.65 | 86,330,752 | 3,226

In [None]:
# %%

# Set of all adjectives collected across every top adverb's bigram sample.
samples_dict['adj']

{'able',
 'apparent',
 'available',
 'bad',
 'beautiful',
 'big',
 'certain',
 'cheap',
 'clear',
 'common',
 'complete',
 'complicated',
 'concerned',
 'convenient',
 'delicious',
 'different',
 'easy',
 'eligible',
 'exciting',
 'final',
 'good',
 'great',
 'happy',
 'hard',
 'ideal',
 'illegal',
 'important',
 'impressed',
 'indicative',
 'ineffective',
 'interested',
 'interesting',
 'new',
 'obvious',
 'possible',
 'ready',
 'related',
 'representative',
 'simple',
 'stylish',
 'successful',
 'sure',
 'surprised',
 'surprising',
 'true',
 'uncommon',
 'unfair',
 'unnecessary',
 'unusual',
 'useful',
 'visible',
 'wrong'}

In [None]:
# %%

# Adjectives collected for the adverb "exactly" only.
samples_dict['exactly']['adj']

{'cheap', 'clear', 'easy', 'happy', 'ideal', 'new', 'sure', 'surprising'}

In [None]:
# %%

# Adjectives collected for the adverb "necessarily" only.
samples_dict['necessarily']['adj']

{'easy',
 'illegal',
 'indicative',
 'interested',
 'related',
 'representative',
 'surprising',
 'true',
 'wrong'}

In [None]:
# %%

# print_iter(samples_dict['exactly']['adj'], header='exactly...')

In [None]:
# %%

# Print one bulleted list per entry of `samples_dict`: the adjectives of each
# adverb, then the two overall sets stored under 'bigrams' and 'adj'.
for key, info in samples_dict.items():
    if key in ('bigrams', 'adj'):
        key = f'ALL {key.replace("adj", "adjectives")}'
    # The values are sets, whose iteration order is not stable between kernel
    # runs; sort them so the printed lists are reproducible.
    formatted_iter = [
        f'_{a.replace("_", " ")}_' for a
        in sorted(info['adj'] if isinstance(info, dict)
                  else info)]
    print_iter(formatted_iter,
               header=f'+ _{key}_ ({len(formatted_iter)} unique)',
               bullet='+', indent=2)


+ _necessarily_ (9 unique)
  + _true_
  + _wrong_
  + _representative_
  + _illegal_
  + _surprising_
  + _related_
  + _indicative_
  + _interested_
  + _easy_

+ _exactly_ (8 unique)
  + _ideal_
  + _sure_
  + _clear_
  + _cheap_
  + _surprising_
  + _happy_
  + _new_
  + _easy_

+ _that_ (15 unique)
  + _unusual_
  + _simple_
  + _complicated_
  + _important_
  + _surprising_
  + _good_
  + _bad_
  + _big_
  + _uncommon_
  + _great_
  + _hard_
  + _impressed_
  + _common_
  + _exciting_
  + _easy_

+ _immediately_ (8 unique)
  + _obvious_
  + _clear_
  + _successful_
  + _possible_
  + _visible_
  + _able_
  + _apparent_
  + _available_

+ _yet_ (8 unique)
  + _sure_
  + _ready_
  + _clear_
  + _final_
  + _complete_
  + _eligible_
  + _certain_
  + _available_

+ _before_ (1 unique)
  + _available_

+ _terribly_ (9 unique)
  + _concerned_
  + _wrong_
  + _different_
  + _surprising_
  + _useful_
  + _impressed_
  + _interesting_
  + _interested_
  + _surprised_

+ _only_ (9 unique

In [None]:
# %%

# Gather the combined ('both') bigram table of every adverb entry; the
# top-level 'bigrams' and 'adj' entries are sets, not dicts, and are skipped.
all_top_adv_dfs = [
    entry['both']
    for entry in samples_dict.values()
    if isinstance(entry, dict)
]
# Stack them, order by LRC (highest first) and save with today's date in the name.
(
    pd.concat(all_top_adv_dfs)
    .sort_values('LRC', ascending=False)
    .to_csv(AM_DF_DIR / f'top{sample_size}_NEG-ADV_top-bigrams.{timestamp_today()}.csv')
)

In [None]:
# %%

# topDPb_neg_adv = setdiff_adv.filter(
#     like='NEG', axis=0).nlargest(10, 'am_p1_given2').l2.to_list()
# topDPb_com_adv = setdiff_adv.filter(
#     like='COM', axis=0).nlargest(10, 'am_p1_given2').l2.to_list()

# # %%
# topLRC_neg_adv = setdiff_adv.filter(like='NEG', axis=0).nlargest(
#     10, 'conservative_log_ratio').l2.to_list()
# topLRC_com_adv = setdiff_adv.filter(like='COM', axis=0).nlargest(
#     10, 'conservative_log_ratio').l2.to_list()

# setdiff_adv.unexpected_f / setdiff_adv.E11
# setdiff_adv.unexpected_f / setdiff_adv.f
# setdiff_adv.unexpected_f / setdiff_adv.f2


# setdiff_adv.loc[setdiff_adv.l2.isin(topDPb_com_adv+)]

# setdiff_adv.filter(like='COM', axis=0).nlargest(10, 'am_p1_given2')