In [10]:
import re
from pathlib import Path

import pandas as pd

from source.utils import POST_PROC_DIR, RESULT_DIR, corners
from source.utils.associate import POLAR_DIR, TOP_AM_DIR, adjust_assoc_columns
from source.utils.sample import sample_pickle

pd.set_option('display.float_format', '{:,.2f}'.format)
pd.set_option("styler.format.thousands", ',')
pd.set_option('display.max_colwidth', 80)
pd.set_option('display.max_columns', 10)
pd.set_option('display.width', 300)
HIT_EX_COLS = ['WITH::^[bt].*lower', 'WITH::text', 'token_str']
FOCUS = ['f',
         'am_p1_given2', 'conservative_log_ratio',
         'am_p1_given2_simple',
         'am_log_likelihood',
        #  'N', 'f1', 
        #  'mutual_information', 'am_odds_ratio_disc', 't_score',
          'f2', 'E11', 'unexpected_f', 
         'l1', 'l2', 'adj', 'adj_total']
abbr_FOCUS = adjust_assoc_columns(FOCUS)

## Define helper functions

In [11]:

def update_index(df, pat_name: str = None):
    neg_env_name = df.filter(like='NEG', axis=0).l1[0]
    # > will be either `NEGATED` or `NEGMIR`
    #   both are shortened to just `NEG` for the keys in their separate dataframes
    # > replace to avoid ambiguity in `key` values when combined
    #! some filtering relies on 'NEG', so have to keep that prefix
    index_update = pat_name or (
        'NEGmir' if neg_env_name.endswith('MIR') else 'NEGany')
    df.index = df.index.str.replace('NEG', index_update)
    return df


def set_col_widths(df):
    cols = df.copy().reset_index().columns
    width_dict = (
        {c: None for c in cols}
        | {c: 22 for c in cols[cols.str.contains('_id')]}
        | {c: 35 for c in cols[cols.str.contains('text')]}
        | {c: 30 for c in cols[cols.str.contains('forms')]}
        | {c: 55 for c in cols[cols.str.contains('_str')]})
    return list(width_dict.values())


def embolden(strings: pd.Series,
             bold_regex: str = None,
             mono: bool = True) -> pd.Series:
    bold_regex = re.compile(bold_regex, flags=re.I) if bold_regex else REGNOT
    if mono:
        return strings.apply(lambda x: bold_regex.sub(r' __`\1`__ ', x))
    else:
        return strings.apply(lambda x: bold_regex.sub(r' __\1__ ', x))


def show_sample(df: pd.DataFrame,
                format: str = 'grid',
                n_dec: int = 0, 
                limit_cols: bool = True, 
                assoc: bool = False):
    _df = df.copy().convert_dtypes()
    if limit_cols and format != 'pipe' and not assoc:
        print(_df.to_markdown(
        floatfmt=f',.{n_dec}f', intfmt=',',
        maxcolwidths=set_col_widths(_df),
        tablefmt=format
    ))
    else:
        if assoc: 
            if not bool(n_dec): 
                n_dec = 2
            _df = adjust_assoc_columns(_df)
            
        
        print(_df.to_markdown(
            floatfmt=f',.{n_dec}f', intfmt=',',
            tablefmt=format
        ))

## Load and Evaluate _Exactly_ Hits by Subset

In [12]:
def update_mir_paths(paths:dict) -> dict:
    for mir in {'POS', 'NEG'}:
        key = f'{mir}mirror'
        mir_path = paths[key]
        updated_mir_path = mir_path.with_name(f'Limited{mir}-'+mir_path.name)
        if updated_mir_path.is_file():
            paths[key] = updated_mir_path
    return paths


In [13]:
pkl_name = 'trigger-bigrams_frq-thrMIN-7.35f.pkl.gz'
path_dict = {p: POST_PROC_DIR / p / pkl_name for  p in ('RBdirect', 'POSmirror','NEGmirror')}
    
path_dict = update_mir_paths(path_dict)
show_sample(pd.Series(path_dict).to_frame('hits path'))
pd.Series({d:p.relative_to(POST_PROC_DIR.parent) for d,p in path_dict.items()}).to_frame('hits path')

+-----------+----------------------------------------------------------------------------------------------------------+
|           | hits path                                                                                                |
| RBdirect  | /share/compling/data/sanpi/4_post-processed/RBdirect/trigger-bigrams_frq-thrMIN-7.35f.pkl.gz             |
+-----------+----------------------------------------------------------------------------------------------------------+
| POSmirror | /share/compling/data/sanpi/4_post-processed/POSmirror/LimitedPOS-trigger-bigrams_frq-thrMIN-7.35f.pkl.gz |
+-----------+----------------------------------------------------------------------------------------------------------+
| NEGmirror | /share/compling/data/sanpi/4_post-processed/NEGmirror/LimitedNEG-trigger-bigrams_frq-thrMIN-7.35f.pkl.gz |
+-----------+----------------------------------------------------------------------------------------------------------+


Unnamed: 0,hits path
RBdirect,4_post-processed/RBdirect/trigger-bigrams_frq-thrMIN-7.35f.pkl.gz
POSmirror,4_post-processed/POSmirror/LimitedPOS-trigger-bigrams_frq-thrMIN-7.35f.pkl.gz
NEGmirror,4_post-processed/NEGmirror/LimitedNEG-trigger-bigrams_frq-thrMIN-7.35f.pkl.gz


In [14]:
def load_exactly_hits(pkl_path):
    _df = pd.read_pickle(pkl_path).convert_dtypes()
    _df = _df.loc[_df.adv_form_lower == 'exactly', :]
    return _df

nmir_exactly = load_exactly_hits(pkl_path=path_dict['NEGmirror'])
pmir_exactly = load_exactly_hits(pkl_path=path_dict['POSmirror'])
full_not_exactly = load_exactly_hits(pkl_path=path_dict['RBdirect'])


### `NEGmirror` *exactly* hits

In [15]:
xnmir = (nmir_exactly.sample(8)
            .sort_values(['bigram_lower','adj_form'])
            .filter(regex=r'trig[ger]*_[dl]e|text|bigram_lower'))
show_sample(xnmir)
xnmir

+------------------------+-------------------------------------+--------------------+
| hit_id                 | text_window                         | bigram_lower       |
| pcc_eng_20_084.0354_x1 | are all different , none of us are  | exactly_alike      |
| 341744_059:06-10-11    | exactly alike and variety is a      |                    |
+------------------------+-------------------------------------+--------------------+
| pcc_eng_07_052.8916_x0 | None of those are exactly           | exactly_convenient |
| 838851_26:1-5-6        | convenient and production and work  |                    |
+------------------------+-------------------------------------+--------------------+
| pcc_eng_02_040.4261_x0 | begin the scenes , none of the      | exactly_horrifying |
| 637920_34:14-19-20     | scenes are exactly horrifying .     |                    |
+------------------------+-------------------------------------+--------------------+
| pcc_eng_21_027.4394_x0 | women and minorities -- non

Unnamed: 0_level_0,text_window,bigram_lower
hit_id,Unnamed: 1_level_1,Unnamed: 2_level_1
pcc_eng_20_084.0354_x1341744_059:06-10-11,"are all different , none of us are exactly alike and variety is a",exactly_alike
pcc_eng_07_052.8916_x0838851_26:1-5-6,None of those are exactly convenient and production and work,exactly_convenient
pcc_eng_02_040.4261_x0637920_34:14-19-20,"begin the scenes , none of the scenes are exactly horrifying .",exactly_horrifying
pcc_eng_21_027.4394_x0427443_31:26-30-31,women and minorities -- none of it is exactly new .,exactly_new
pcc_eng_02_105.3699_x1687756_36:09-11-12,by manga etc -- nothing is exactly square .,exactly_square
pcc_eng_07_001.7565_x0012177_15:20-21-22,"way , you 're never exactly sure how the recording process",exactly_sure
pcc_eng_25_044.9147_x0711011_22:10-12-13,because at this point nobody is exactly sure how to get there,exactly_sure
nyt_eng_20100722_0112_18:30-31-32,"it 's predictable and never exactly sweeping , it 's certainly",exactly_sweeping


In [16]:
xpm = (pmir_exactly.sample(6)
            .sort_values(['bigram_lower','adj_form'])
            .filter(regex=r'trig[ger]*_[dl]e|text|bigram_lower'))
show_sample(xpm)
xpm

+------------------------+------------------------------------+-------------------+
| hit_id                 | text_window                        | bigram_lower      |
| pcc_eng_24_088.2047_x1 | , if we were all exactly alike we  | exactly_alike     |
| 410351_42:16-17-18     | would never exchange               |                   |
+------------------------+------------------------------------+-------------------+
| pcc_eng_24_012.7792_x0 | along if we were all exactly alike | exactly_alike     |
| 189980_12:10-11-12     | ?                                  |                   |
+------------------------+------------------------------------+-------------------+
| pcc_eng_11_103.3304_x1 | either virtually the same or       | exactly_identical |
| 656213_46:39-40-41     | exactly identical to the bullets   |                   |
|                        | used                               |                   |
+------------------------+------------------------------------+-------------

Unnamed: 0_level_0,text_window,bigram_lower
hit_id,Unnamed: 1_level_1,Unnamed: 2_level_1
pcc_eng_24_088.2047_x1410351_42:16-17-18,", if we were all exactly alike we would never exchange",exactly_alike
pcc_eng_24_012.7792_x0189980_12:10-11-12,along if we were all exactly alike ?,exactly_alike
pcc_eng_11_103.3304_x1656213_46:39-40-41,either virtually the same or exactly identical to the bullets used,exactly_identical
pcc_eng_06_042.8439_x0676759_37:16-18-19,seemed genuinely concerned that everything was exactly right .,exactly_right
pcc_eng_04_061.6183_x0979284_12:24-28-29,", and headliner , all of which are exactly right and installed by experts",exactly_right
pcc_eng_21_049.1505_x0778237_23:10-12-13,", inscrutable expression , everything is exactly right and so exactly the",exactly_right


In [17]:
xn_all =(full_not_exactly.sample(6)
            .sort_values(['bigram_lower','adj_form'])
            .filter(regex=r'trig[ger]*_[dl]e|text|bigram_lower'))
show_sample(xn_all)
xn_all

+------------------------+-------------------------------------+----------------------+
| hit_id                 | text_window                         | bigram_lower         |
| pcc_eng_13_087.7563_x1 | and decaying carcasses -- not       | exactly_appetizing   |
| 402076_07:23-24-25     | exactly appetizing to the human     |                      |
|                        | palate                              |                      |
+------------------------+-------------------------------------+----------------------+
| pcc_eng_13_101.9941_x1 | defendant , you were n't exactly    | exactly_certain      |
| 631945_125:22-23-24    | certain when , Friday ,             |                      |
+------------------------+-------------------------------------+----------------------+
| pcc_eng_06_044.1854_x0 | - Saxon world is not exactly chock  | exactly_chock        |
| 698472_052:11-12-13    | -a- block with Tiger                |                      |
+------------------------+------

Unnamed: 0_level_0,text_window,bigram_lower
hit_id,Unnamed: 1_level_1,Unnamed: 2_level_1
pcc_eng_13_087.7563_x1402076_07:23-24-25,and decaying carcasses -- not exactly appetizing to the human palate,exactly_appetizing
pcc_eng_13_101.9941_x1631945_125:22-23-24,"defendant , you were n't exactly certain when , Friday ,",exactly_certain
pcc_eng_06_044.1854_x0698472_052:11-12-13,- Saxon world is not exactly chock -a- block with Tiger,exactly_chock
pcc_eng_11_050.4455_x0799859_20:13-14-15,"which means he was n't exactly fond of pinstripes , but",exactly_fond
pcc_eng_00_103.7144_x1661316_3:41-42-43,", cable management is n't exactly optimal either ) .",exactly_optimal
pcc_eng_28_031.0975_x0486453_7:27-28-29,through glasses that are n't exactly rose-colored .,exactly_rose-colored


In [18]:
def adjust_hit_table(_df): 
    _df.columns = _df.columns.str.replace(
        r'mir|neg', 'trig', regex=True)
    _df = _df.assign(
        all_forms_lower=(_df.trig_form_lower.astype('string')
                         + '_' + _df.bigram_lower.astype('string')
                         ).astype('category'))
    _df = _df.drop_duplicates(['all_forms_lower', 'text_window'])

    if any(_df.category.str.contains('mirror')):
        _df['trigger_head'] = (
            _df.pattern.astype('string')
            .str.split('mirror-').str.get(1)
        ).astype('category')
    else:
        _df['trigger_head'] = (
            _df.pattern.astype('string')
            .str.split('-').str.get(1)
            .str.replace('adj', 'R')
            .str.replace('neg', 'L')
        ).astype('category')
    if 'trigger_lower' in _df.columns: 
        _df = _df.loc[:, _df.columns != 'trig_form_lower']
    else: 
        _df = _df.rename(columns={'trig_form_lower': 'trigger_lower'})
    return _df.rename(columns={'trig_lemma': 'trigger_lemma'})
    

In [19]:
nmir_exactly = adjust_hit_table(nmir_exactly)
pmir_exactly = adjust_hit_table(pmir_exactly)
full_not_exactly = adjust_hit_table(full_not_exactly)

filtering `not_exactly` hits to ignore duplicates with NEGmirror hits...

🤔 but the question is how much of the remainder is due a trigger not included in the NEGmirror patterns 
as opposed to being a duplicate that was not discarded? 

The forms `"not"` and `"n't"` were not included in the mirror hits, but are any other trigger hits cases of duplication already removed from the NEGmirror hits? 

❓ Can proper redundancy elimination only be done with the NEGmirror hits before additional duplicate removal?

In [20]:
not_exactly = full_not_exactly.copy()
not_exactly = not_exactly.loc[~not_exactly.bigram_id.isin(nmir_exactly.bigram_id), :]
xnot_exactly = (pd.Series({'all negated  hits': len(full_not_exactly), 
                       'NEGmirror complement': len(not_exactly), 
                       'NEGmirror hits': len(nmir_exactly), 
                       'POSmirror hits': len(pmir_exactly)}).to_frame('"exactly" hits in subset'))
show_sample(xnot_exactly, format='pipe')
xnot_exactly

|                      |   "exactly" hits in subset |
|:---------------------|---------------------------:|
| all negated  hits    |                     42,058 |
| NEGmirror complement |                     41,260 |
| NEGmirror hits       |                        802 |
| POSmirror hits       |                        219 |


Unnamed: 0,"""exactly"" hits in subset"
all negated hits,42058
NEGmirror complement,41260
NEGmirror hits,802
POSmirror hits,219


Ok, so not so bad, all things considered. 
- _never_, _neither_, _none_, and _nor_ should be included in the `NEGmirror` hits, but these numbers are reasonable
- **but** keep in mind that the _not_ hits shown here have not been filtered for additional duplicates

In [21]:
show_sample(not_exactly.trigger_lemma.value_counts().to_frame(), format='pipe')
not_exactly.trigger_lemma.value_counts().to_frame()

| trigger_lemma   |   count |
|:----------------|--------:|
| not             |  41,149 |
| ain't           |      78 |
| without         |      16 |
| aint            |       8 |
| few             |       3 |
| never           |       2 |
| neither         |       2 |
| none            |       1 |
| nor             |       1 |


Unnamed: 0_level_0,count
trigger_lemma,Unnamed: 1_level_1
not,41149
ain't,78
without,16
aint,8
few,3
never,2
neither,2
none,1
nor,1


Just going to drop the hits that should be represented by `NEGmirror`

In [23]:
neg_overlap = not_exactly.loc[not_exactly.trigger_lemma.isin(['never', 'neither', 'none', 'nor'])].filter(regex=r'all|text|token|deprel|head')
show_sample(neg_overlap)
neg_overlap

+------------------------+---------------------------------------------------------+-------------------------------------+---------------+-------------------------+----------------+
| hit_id                 | token_str                                               | text_window                         | trig_deprel   | all_forms_lower         | trigger_head   |
| pcc_eng_09_006.3776_x0 | It was never exactly clear , however , what he intended | It was never exactly clear ,        | advmod        | never_exactly_clear     | R              |
| 087199_04:3-4-5        | to do about it : Appearing at the American Israel       | however , what                      |               |                         |                |
|                        | Public Affairs Committee 's policy conference in March  |                                     |               |                         |                |
|                        | 2016 , Trump said in the same speech that he planned to |      

Unnamed: 0_level_0,token_str,text_window,trig_deprel,all_forms_lower,trigger_head
hit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
pcc_eng_09_006.3776_x0087199_04:3-4-5,"It was never exactly clear , however , what he intended to do about it : App...","It was never exactly clear , however , what",advmod,never_exactly_clear,R
pcc_eng_17_025.1287_x0390487_29:3-4-5,Life is never exactly easy for Jamie and Claire .,Life is never exactly easy for Jamie and Claire,advmod,never_exactly_easy,R
pcc_eng_01_013.5821_x0203017_13:08-15-16,The problem with the system is that none of King Abdullah 's brothers are ex...,the system is that none of King Abdullah 's brothers are exactly young and f...,nsubj,none_exactly_young,R
nyt_eng_20060406_0182_2:28-31-32,"it would be a stretch to say that at that moment her character , a vaguely d...","actually happy , but neither is she exactly unhappy .",dep,neither_exactly_unhappy,R
nyt_eng_20101010_0083_15:01-11-12,"neither Dara , Xavier nor , apparently , Leonard is exactly sure what opport...","neither Dara , Xavier nor , apparently , Leonard is exactly sure what opport...",dep,neither_exactly_sure,R
pcc_eng_02_069.5519_x1108750_03:1-5-6,Nor were the procedures exactly similar .,Nor were the procedures exactly similar .,cc,nor_exactly_similar,R


In [24]:
nmir_overlap = nmir_exactly.loc[nmir_exactly.text_window.isin(neg_overlap.text_window)].filter(regex=r'all|text|token|deprel|head')
show_sample(nmir_overlap)
nmir_overlap

+------------------------+---------------------------------------------------------+----------------------------------+---------------+---------------------+----------------+
| hit_id                 | token_str                                               | text_window                      | trig_deprel   | all_forms_lower     | trigger_head   |
| pcc_eng_08_036.2797_x0 | Nor were the procedures exactly similar .               | Nor were the procedures exactly  | cc            | nor_exactly_similar | R              |
| 571321_24:1-5-6        |                                                         | similar .                        |               |                     |                |
+------------------------+---------------------------------------------------------+----------------------------------+---------------+---------------------+----------------+
| pcc_eng_08_065.4444_x1 | Life is never exactly easy for Jamie and Claire .       | Life is never exactly easy for   | advmo

Unnamed: 0_level_0,token_str,text_window,trig_deprel,all_forms_lower,trigger_head
hit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
pcc_eng_08_036.2797_x0571321_24:1-5-6,Nor were the procedures exactly similar .,Nor were the procedures exactly similar .,cc,nor_exactly_similar,R
pcc_eng_08_065.4444_x1043970_32:3-4-5,Life is never exactly easy for Jamie and Claire .,Life is never exactly easy for Jamie and Claire,advmod,never_exactly_easy,R
pcc_eng_29_009.8148_x0142508_17:3-4-5,"It was never exactly clear , however , what he intended to ...","It was never exactly clear , however , what",advmod,never_exactly_clear,R
pcc_eng_15_026.9576_x0419493_10:08-15-16,The problem with the system is that none of King Abdullah 's brothers are ex...,the system is that none of King Abdullah 's brothers are exactly young and f...,nsubj,none_exactly_young,R


💡 hmm maybe the `dep` dependency relations were excluded outright for mirror patterns? (since they were developed later?)

In [25]:
not_exactly = not_exactly.loc[~not_exactly.index.isin(neg_overlap.index),:]

In [26]:
show_sample(not_exactly.trigger_lemma.value_counts(), format='pipe')

| trigger_lemma   |   count |
|:----------------|--------:|
| not             |  41,149 |
| ain't           |      78 |
| without         |      16 |
| aint            |       8 |
| few             |       3 |


It appears there are no duplicates in the `RBdirect` remainder however 🤷‍♀️, so I guess there just really are **that** many hits with sentential negation 🤯

In [27]:
not_exactly.text_window.duplicated().value_counts().to_frame()

Unnamed: 0_level_0,count
text_window,Unnamed: 1_level_1
False,41254


In [31]:
print(f'{not_exactly.text_window.nunique():,} unique `text_window` strings')
print(f'{len(not_exactly):,} total rows')

41,254 unique `text_window` strings
41,254 total rows


In [30]:
updated_afl = not_exactly.groupby('trigger_lower').value_counts(['adj_form_lower'])
updated_afl.nlargest(20).to_frame()


Unnamed: 0_level_0,Unnamed: 1_level_0,count
trigger_lower,adj_form_lower,Unnamed: 2_level_1
not,sure,6169
n't,sure,1969
not,clear,1097
not,true,764
n't,new,762
not,new,548
n't,easy,521
not,easy,503
n't,clear,486
not,cheap,345


In [32]:
updated_afl.nsmallest(10).to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
trigger_lower,adj_form_lower,Unnamed: 2_level_1
ain't,accurate,1
ain't,short,1
ain't,unfounded,1
ain't,unprecedented,1
ain't,unbiased,1
ain't,shabby,1
ain't,sensible,1
ain't,catchy,1
ain't,different,1
ain't,eager,1


In [33]:
not_exactly[['trigger_head', 'trigger_lemma',  'adj_form_lower']].astype('string').value_counts(['trigger_lemma', 'adj_form_lower']).nlargest(20)

trigger_lemma  adj_form_lower
not            sure              8138
               clear             1584
               new               1310
               true              1089
               easy              1024
               cheap              672
               right              536
               happy              434
               surprising         426
               ideal              401
               accurate           309
               great              304
               perfect            276
               good               252
               fair               251
               subtle             246
               correct            230
               fun                215
               conducive          206
               hard               201
Name: count, dtype: int64

In [34]:
# full_not_exactly.loc[full_not_exactly.trigger_lemma!='not', ['trigger_lemma', 'adj_form_lower', 'trigger_head']].groupby('trigger_lemma').value_counts(['trigger_head','adj_form_lower', ])
trigger_counts = full_not_exactly[['trigger_lemma' ,'trigger_head', 'trigger_lower']].value_counts().to_frame().reset_index()
# trigger_counts = full_not_exactly[['trigger_lemma', 'trigger_lower' ,'trigger_head', ]].value_counts().to_frame().reset_index()
trigger_counts.index = trigger_counts.index + 1
show_sample(trigger_counts.sort_values(['trigger_head', 'count', 'trigger_lemma'], ascending = False), format='pipe')


|    | trigger_lemma   | trigger_head   | trigger_lower   |   count |
|---:|:----------------|:---------------|:----------------|--------:|
|  1 | not             | R              | not             |  24,362 |
|  2 | not             | R              | n't             |  16,572 |
|  3 | never           | R              | never           |     323 |
|  5 | none            | R              | none            |     185 |
|  6 | nobody          | R              | nobody          |      83 |
|  7 | ain't           | R              | ain't           |      74 |
|  8 | neither         | R              | neither         |      71 |
|  9 | nor             | R              | nor             |      43 |
| 11 | nothing         | R              | nothing         |      32 |
| 12 | no              | R              | no              |      16 |
| 13 | without         | R              | without         |      15 |
| 14 | not             | R              | nit             |      13 |
| 15 | aint         

In [42]:
def flatten_by_head(trigger_counts):
    by_head = {h:hc.filter(['trigger_lemma', 'count']).set_index('trigger_lemma') for h, hc in trigger_counts.groupby('trigger_head')}
    by_head = by_head['R'].join(by_head['L'], lsuffix='_R_headed', rsuffix='_L_headed').fillna(0).convert_dtypes().sort_index(axis=1)
    by_head.columns = by_head.columns.str.replace('count_','')
    show_sample(by_head.sort_values('R_headed', ascending=False), format='pipe')
    return by_head.sort_values('R_headed', ascending=False)
flatten_by_head(trigger_counts)

| trigger_lemma   |   L_headed |   R_headed |
|:----------------|-----------:|-----------:|
| not             |        206 |     24,362 |
| not             |        206 |     16,572 |
| never           |          0 |        323 |
| none            |          7 |        185 |
| nobody          |          1 |         83 |
| ain't           |          4 |         74 |
| neither         |          1 |         71 |
| nor             |          0 |         43 |
| nothing         |         33 |         32 |
| no              |          0 |         16 |
| without         |          1 |         15 |
| not             |        206 |         13 |
| aint            |          0 |          8 |
| rarely          |          0 |          4 |
| few             |          0 |          2 |
| few             |          0 |          1 |
| seldom          |          0 |          1 |


Unnamed: 0_level_0,L_headed,R_headed
trigger_lemma,Unnamed: 1_level_1,Unnamed: 2_level_1
not,206,24362
not,206,16572
never,0,323
none,7,185
nobody,1,83
ain't,4,74
neither,1,71
nor,0,43
nothing,33,32
no,0,16


In [43]:
# ref_trigger_count =  pd.read_pickle(path_dict['RBdirect'])[['neg_lemma', 'pattern']].value_counts().to_frame().reset_index()
# ref_trigger_count.columns = ['trigger_lemma', 'trigger_head', 'total count']
# ref_trigger_count.index = ref_trigger_count.index + 1
# ref_trigger_count.loc[:,'trigger_head'] = ref_trigger_count.trigger_head.apply(lambda n: 'R' if 'adj' in n else 'L')

ref_trigger_count = pd.read_csv('/home/arh234/projects/sanpi/info/md_import_tables/allMIN-7_neg_trigger_counts.csv').set_index('rank')
show_sample(ref_trigger_count.sort_values(['trigger_head', 'count', 'trigger_lemma'], ascending = False), format='pipe')
flatten_by_head(ref_trigger_count)
# show_sample(ref_trigger_count.sort_values(['trigger_lemma', 'trigger_head', 'total count'], ascending = False), format='pipe')

|   rank | trigger_lemma   | trigger_head   |     count |
|-------:|:----------------|:---------------|----------:|
|      1 | not             | R              | 2,894,561 |
|      2 | never           | R              |   111,085 |
|      4 | nothing         | R              |    34,234 |
|      5 | none            | R              |    21,621 |
|      6 | nor             | R              |    15,922 |
|      7 | without         | R              |    13,320 |
|      8 | no              | R              |    12,721 |
|     10 | few             | R              |     8,077 |
|     12 | neither         | R              |     6,617 |
|     13 | nobody          | R              |     5,859 |
|     14 | hardly          | R              |     5,513 |
|     15 | rarely          | R              |     4,461 |
|     17 | ain't           | R              |     1,361 |
|     18 | barely          | R              |     1,228 |
|     19 | seldom          | R              |     1,037 |
|     20 | sca

Unnamed: 0_level_0,L_headed,R_headed
trigger_lemma,Unnamed: 1_level_1,Unnamed: 2_level_1
not,7480,2894561
never,64,111085
nothing,66536,34234
none,9978,21621
nor,0,15922
without,1495,13320
no,101,12721
few,546,8077
neither,91,6617
nobody,539,5859


`NEGmirror` *exactly* examples

In [79]:
show_sample(nmir_exactly.sample(8)
            .sort_values(['trigger_lemma', 'trigger_lower', 'adj_form'])
            .filter(regex=r'trig[ger]*_[dl]e|text|bigram_lower'))


+------------------------+------------------------------------+---------------+-----------------+------------------+
| hit_id                 | text_window                        | trig_deprel   | trigger_lemma   | bigram_lower     |
| pcc_eng_04_010.3961_x0 | is that it is never exactly boring | advmod        | never           | exactly_boring   |
| 151951_64:7-8-9        | , only dulled .                    |               |                 |                  |
+------------------------+------------------------------------+---------------+-----------------+------------------+
| pcc_eng_15_040.9047_x0 | , it 's still never exactly fun to | advmod        | never           | exactly_fun      |
| 645253_57:7-8-9        | have to worry                      |               |                 |                  |
+------------------------+------------------------------------+---------------+-----------------+------------------+
| pcc_eng_08_081.8258_x1 | No not exactly unknown like the    | 

`POSmirror` *exactly* examples

In [80]:
show_sample(pmir_exactly.sample(8)
            .sort_values(['trigger_lemma', 'trigger_lower', 'adj_form'])
            .filter(regex=r'trig[ger]*_[dl]e|text|bigram_lower'))


+------------------------+-------------------------------------+---------------+-----------------+--------------------+
| hit_id                 | text_window                         | trig_deprel   | trigger_lemma   | bigram_lower       |
| pcc_eng_17_097.1162_x1 | ground ) wind is always exactly     | advmod        | always          | exactly_zero       |
| 553549_29:16-17-18     | zero .                              |               |                 |                    |
+------------------------+-------------------------------------+---------------+-----------------+--------------------+
| pcc_eng_00_034.6056_x0 | with this statement that everyone   | nsubj         | everyone        | exactly_right      |
| 542944_056:16-19-20    | new was exactly right !             |               |                 |                    |
+------------------------+-------------------------------------+---------------+-----------------+--------------------+
| pcc_eng_07_074.2182_x1 | came across t

Sententially negated *exactly* examples

In [81]:
show_sample(not_exactly.sample(8)
            .sort_values(['trigger_lemma', 'trigger_lower', 'adj_form'])
            .filter(regex=r'trig[ger]*_[dl]e|text|bigram_lower'))

+------------------------+-------------------------------------+---------------+-----------------+----------------------+
| hit_id                 | text_window                         | trig_deprel   | trigger_lemma   | bigram_lower         |
| pcc_eng_28_041.2299_x0 | , my coworkers were n't exactly     | advmod        | not             | exactly_impressed    |
| 650717_29:6-7-8        | impressed ... most of them          |               |                 |                      |
+------------------------+-------------------------------------+---------------+-----------------+----------------------+
| pcc_eng_02_094.5840_x1 | Moves you is n't exactly right .    | advmod        | not             | exactly_right        |
| 513186_02:4-5-6        |                                     |               |                 |                      |
+------------------------+-------------------------------------+---------------+-----------------+----------------------+
| pcc_eng_09_023.1785_x0

In [86]:
all_pat_counts = pd.concat((mdf.pattern.value_counts() for mdf in [pmir_exactly, nmir_exactly, not_exactly])
                           ).to_frame().sort_values('count', ascending=False)
show_sample(all_pat_counts, format='pipe')
all_pat_counts

| pattern         |   count |
|:----------------|--------:|
| direct-adj-head |  41,043 |
| neg-mirror-R    |     760 |
| pos-mirror-R    |     211 |
| direct-neg-head |     211 |
| neg-mirror-L    |      42 |
| pos-mirror-L    |       8 |


Unnamed: 0_level_0,count
pattern,Unnamed: 1_level_1
direct-adj-head,41043
neg-mirror-R,760
pos-mirror-R,211
direct-neg-head,211
neg-mirror-L,42
pos-mirror-L,8


In [85]:
all_trig_counts = pd.concat((mdf.trigger_head for mdf in [pmir_exactly, nmir_exactly, not_exactly])).value_counts().to_frame()
show_sample(all_trig_counts, format='pipe')
all_trig_counts

| trigger_head   |   count |
|:---------------|--------:|
| R              |  42,014 |
| L              |     261 |


Unnamed: 0_level_0,count
trigger_head,Unnamed: 1_level_1
R,42014
L,261


In [87]:
exactly = pd.concat((d[['adj_form_lower', 'trigger_head', 'trigger_lemma',  'category']].astype('string') 
           for d in [not_exactly, nmir_exactly, pmir_exactly])
          )
exactly

Unnamed: 0_level_0,adj_form_lower,trigger_head,trigger_lemma,category
hit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
pcc_eng_09_001.0244_x0000400_05:08-09-10,full,R,not,RBdirect
pcc_eng_09_001.2416_x0003918_19:5-6-7,consistent,R,not,RBdirect
pcc_eng_09_001.2755_x0004474_37:4-5-6,celebratory,R,not,RBdirect
pcc_eng_09_001.2765_x0004492_04:10-11-12,friendly,R,not,RBdirect
pcc_eng_09_001.3782_x0006136_05:4-5-6,wrong,R,not,RBdirect
...,...,...,...,...
pcc_eng_24_068.2411_x1087534_2:39-40-41,impossible,R,or,POSmirror
pcc_eng_24_088.2047_x1410351_42:16-17-18,alike,R,all,POSmirror
pcc_eng_26_012.4795_x0185440_4029:18-20-21,alike,R,all,POSmirror
pcc_eng_26_073.3344_x1169114_092:4-6-7,alike,R,all,POSmirror


In [48]:
for trigger, tdf in exactly.groupby('trigger_lemma'):
    print(f'\n>> {trigger} <<')
    show_sample(tdf.groupby('trigger_head').adj_form_lower.value_counts().nlargest(10).reset_index(), format='pipe')


>> ain't <<
|    | trigger_head   | adj_form_lower   |   count |
|---:|:---------------|:-----------------|--------:|
|  0 | R              | clear            |      40 |
|  1 | R              | cheap            |       4 |
|  2 | R              | easy             |       3 |
|  3 | R              | real             |       2 |
|  4 | L              | different        |       1 |
|  5 | L              | fun              |       1 |
|  6 | L              | rich             |       1 |
|  7 | L              | ready            |       1 |
|  8 | R              | normal           |       1 |
|  9 | R              | perfect          |       1 |

>> aint <<
|    | trigger_head   | adj_form_lower   |   count |
|---:|:---------------|:-----------------|--------:|
|  0 | R              | sure             |       3 |
|  1 | R              | clear            |       1 |
|  2 | R              | easy             |       1 |
|  3 | R              | new              |       1 |
|  4 | R             

---
## *Exactly* Associations
### Work in Progress 🚧 

_Prior_ Table Output for Negation Marginal frequencies

|                     |        `N` |      `f1` |   `adv_total` |
|:--------------------|-----------:|----------:|--------------:|
| **NEGATED_exactly** | 86,330,752 | 3,226,213 |        61,599 |
| **NEGMIR_exactly**  |  2,032,082 |   293,963 |         1,114 |

### Top bigrams for _exactly_

In [49]:
most_neg = pd.read_csv('/share/compling/projects/sanpi/results/top_AM/neg_bigram_examples/exactly/exactly_10mostNEG-bigrams_AMscores_2024-05-22.csv').set_index('key')
show_sample(most_neg.sort_values(['LRC', 'dP1'], ascending=False).loc[:, :'f2'], assoc=True, format='pipe')

| key                        |     f |   dP1 |   LRC |        G2 |          N |        f1 |    f2 |
|:---------------------------|------:|------:|------:|----------:|-----------:|----------:|------:|
| NEGany~exactly_sure        | 8,860 |  0.92 |  8.63 | 54,750.58 | 86,330,752 | 3,226,213 | 9,301 |
| NEGany~exactly_new         | 1,378 |  0.93 |  8.54 |  8,697.93 | 86,330,752 | 3,226,213 | 1,418 |
| NEGany~exactly_easy        | 1,069 |  0.93 |  8.37 |  6,747.64 | 86,330,752 | 3,226,213 | 1,100 |
| NEGany~exactly_clear       | 1,759 |  0.92 |  8.30 | 10,937.16 | 86,330,752 | 3,226,213 | 1,835 |
| NEGany~exactly_cheap       |   693 |  0.95 |  8.28 |  4,443.27 | 86,330,752 | 3,226,213 |   704 |
| NEGany~exactly_surprising  |   441 |  0.96 |  7.34 |  2,863.35 | 86,330,752 | 3,226,213 |   444 |
| NEGany~exactly_practical   |   105 |  0.95 |  3.52 |    679.01 | 86,330,752 | 3,226,213 |   106 |
| NEGmir~exactly_clear       |    52 |  0.80 |  2.13 |    178.73 |  2,032,082 |   293,963 |    55 |


In [50]:
top_overall = pd.read_csv('/share/compling/projects/sanpi/results/top_AM/any_bigram_examples/exactly/exactly_top11-bigrams_AMscores_2024-05-18.csv').set_index('key')
show_sample(top_overall.sort_values('LRC', ascending=False))

+-------------------+-------+---------+-----------+-------+-------+--------+------------+--------+---------+---------+-----------+
| key               |     f |   exp_f |   unexp_f |   dP1 |   LRC |     G2 |          N |     f1 |      f2 | l1      | l2        |
| exactly~alike     | 3,040 |       9 |     3,031 |     0 |     9 | 29,939 | 86,330,753 | 61,599 |  13,261 | exactly | alike     |
+-------------------+-------+---------+-----------+-------+-------+--------+------------+--------+---------+---------+-----------+
| exactly~opposite  |   498 |       7 |       491 |     0 |     6 |  3,337 | 86,330,753 | 61,599 |   9,404 | exactly | opposite  |
+-------------------+-------+---------+-----------+-------+-------+--------+------------+--------+---------+---------+-----------+
| exactly~right     | 6,948 |     146 |     6,802 |     0 |     6 | 41,086 | 86,330,753 | 61,599 | 204,572 | exactly | right     |
+-------------------+-------+---------+-----------+-------+-------+--------+-------

In [51]:
top_overall = pd.read_csv('/share/compling/projects/sanpi/results/top_AM/any_bigram_examples/exactly/exactly_top11-bigrams_AMscores_2024-05-18.csv').set_index('key')
top_overall.sort_values('LRC', ascending=False)

Unnamed: 0_level_0,f,exp_f,unexp_f,dP1,LRC,...,N,f1,f2,l1,l2
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
exactly~alike,3040,9.46,3030.54,0.23,8.55,...,86330753,61599,13261,exactly,alike
exactly~opposite,498,6.71,491.29,0.05,5.94,...,86330753,61599,9404,exactly,opposite
exactly~right,6948,145.97,6802.03,0.03,5.53,...,86330753,61599,204572,exactly,right
exactly~zero,344,8.19,335.81,0.03,5.02,...,86330753,61599,11472,exactly,zero
exactly~parallel,224,5.41,218.59,0.03,4.9,...,86330753,61599,7577,exactly,parallel
exactly~sure,9301,602.91,8698.09,0.01,3.89,...,86330753,61599,844981,exactly,sure
exactly~equal,560,33.61,526.39,0.01,3.75,...,86330753,61599,47099,exactly,equal
exactly~conducive,214,11.71,202.29,0.01,3.68,...,86330753,61599,16405,exactly,conducive
exactly~correct,788,55.83,732.17,0.01,3.56,...,86330753,61599,78240,exactly,correct
exactly~ideal,445,30.47,414.53,0.01,3.52,...,86330753,61599,42701,exactly,ideal


### `ENV~ADJ` associations for top bigrams

In [52]:
setdiff_floor = 200
mirror_floor = 120
adj_dfs = {
    d.name:
    update_index(pd.read_pickle(
        tuple(d.joinpath('adj/extra')
              .glob(f'*35f-7c*min{mirror_floor if d.name == "NEGmirror" else setdiff_floor}x*.pkl.gz')
              )[0]
    )
    )
    for d in POLAR_DIR.iterdir()}
for pat_dir, amdf in adj_dfs.items():
    print(f'>> {pat_dir} <<')
    show_sample(amdf.loc[amdf.conservative_log_ratio.abs().round() > 1, :].sample(
        8).filter(FOCUS).filter(regex=r'^[^l]'), assoc=True)
    print('.............')

>> RBdirect <<
+---------------------+---------+-------+-------+--------------+-----------+---------+------------+-----------+
| key                 |       f |   dP1 |   LRC |   dP1_simple |        G2 |      f2 |      exp_f |   unexp_f |
| NEGany~clear-cut    |   1,368 |  0.27 |  3.29 |         0.31 |  3,770.52 |   4,399 |     164.39 |  1,203.61 |
+---------------------+---------+-------+-------+--------------+-----------+---------+------------+-----------+
| COM~basic           | 104,613 |  0.03 |  2.45 |         0.99 |  4,544.02 | 105,208 | 101,273.29 |  3,339.71 |
+---------------------+---------+-------+-------+--------------+-----------+---------+------------+-----------+
| NEGany~prestigious  |     265 | -0.03 | -2.22 |         0.01 | -1,868.25 |  44,499 |   1,662.94 | -1,397.94 |
+---------------------+---------+-------+-------+--------------+-----------+---------+------------+-----------+
| COM~picky           |  10,311 | -0.09 | -1.68 |         0.87 | -1,621.43 |  11,798 |  1

In [53]:
adj_amdf = pd.concat(adj_dfs.values())
adj_amdf = adjust_assoc_columns(adj_amdf)
adj_amdf = adj_amdf.sort_values(
    ['LRC', 'dP1', 'dP1_simple'],
    ascending=False)
adj_amdf.filter(abbr_FOCUS).describe().round(2).T.iloc[:,1:].sort_index()


Unnamed: 0,mean,std,min,25%,50%,75%,max
G2,24.05,4604.0,-188432.85,-0.65,6.44,33.16,188571.6
LRC,0.06,0.55,-5.66,0.0,0.0,0.0,5.66
dP1,0.01,0.05,-0.76,-0.0,0.01,0.03,0.76
dP1_simple,0.82,0.33,0.0,0.9,0.97,0.98,1.0
exp_f,8460.63,55383.96,35.2,333.06,775.83,2676.51,2130224.57
f,8481.82,55421.21,120.0,331.0,758.0,2629.5,2198836.0
f2,15931.39,80623.04,122.0,418.0,1243.5,6206.0,2212989.0
unexp_f,21.18,2833.95,-97222.23,-4.69,10.2,45.89,97246.74


In [54]:
for c in adj_amdf.select_dtypes(include='number').columns:
    print(f'\nWeakest/Minimum `{c}`')
    show_sample(
        adj_amdf.loc[adj_amdf[c] == adj_amdf[c].abs().min()]
                .filter(set(abbr_FOCUS + [c])).head(10),
        assoc=True, format='pipe')


Weakest/Minimum `f`
| key               |   LRC | l1     |   exp_f |   unexp_f |   dP1 | l2            |   f |   f2 |    G2 |   dP1_simple |
|:------------------|------:|:-------|--------:|----------:|------:|:--------------|----:|-----:|------:|-------------:|
| POS~regional      |     0 | POSMIR |  104.44 |     15.56 |  0.12 | regional      | 120 |  125 | 19.20 |         0.96 |
| POS~psychological |     0 | POSMIR |  105.27 |     14.73 |  0.12 | psychological | 120 |  126 | 16.55 |         0.95 |
| POS~sexist        |     0 | POSMIR |  107.78 |     12.22 |  0.09 | sexist        | 120 |  129 | 10.34 |         0.93 |
| POS~unsuitable    |     0 | POSMIR |  108.62 |     11.38 |  0.09 | unsuitable    | 120 |  130 |  8.72 |         0.92 |
| POS~disingenuous  |     0 | POSMIR |  111.96 |      8.04 |  0.06 | disingenuous  | 120 |  134 |  3.94 |         0.90 |
| POS~sticky        |     0 | POSMIR |  114.46 |      5.54 |  0.04 | sticky        | 120 |  137 |  1.75 |         0.88 |
| POS~unnat

In [55]:
for c in adj_amdf.select_dtypes(include='number').columns:
    print(f'\nStrongest/Maximum `{c}`')
    show_sample(
        adj_amdf.loc[adj_amdf[c] == adj_amdf[c].max()]
                .filter(set(FOCUS + [c])).head(10),
        assoc=True, format='pipe')


Strongest/Maximum `f`
| key      | l1         | l2   |         f |        f2 |
|:---------|:-----------|:-----|----------:|----------:|
| COM~many | COMPLEMENT | many | 2,198,836 | 2,212,989 |

Strongest/Maximum `exp_f`
| key      | l1         |        exp_f | l2   |         f |        f2 |
|:---------|:-----------|-------------:|:-----|----------:|----------:|
| COM~many | COMPLEMENT | 2,130,224.57 | many | 2,198,836 | 2,212,989 |

Strongest/Maximum `G2`
| key        | l1      |         G2 | l2   |       f |      f2 |
|:-----------|:--------|-----------:|:-----|--------:|--------:|
| NEGany~bad | NEGATED | 188,571.60 | bad  | 105,275 | 557,528 |

Strongest/Maximum `odds_r_disc`
| key          | l1         | l2       |     f |   odds_r_disc |    f2 |
|:-------------|:-----------|:---------|------:|--------------:|------:|
| COM~evolving | COMPLEMENT | evolving | 3,174 |          2.39 | 3,174 |

Strongest/Maximum `dP1`
| key          | l1     |   dP1 | l2    |     f |    f2 |
|:-------

### significant environment LRCs for context-blind *exactly* associated adjectives

In [56]:
show_sample(adj_amdf
            .sort_values(['f1','LRC'], ascending=False)
            .filter(abbr_FOCUS).filter(regex=r'NEGany|COM', axis=0)
            .loc[(adj_amdf.l2.isin(top_overall.l2)) & (adj_amdf.LRC>1), :],
            format='fancy_outline',
            assoc=True)

╒══════════════════╤═════════╤═══════╤═══════╤══════════════╤════════════╤═════════╤═══════════╤═══════════╤════════════╤═══════════╕
│ key              │       f │   dP1 │   LRC │   dP1_simple │         G2 │      f2 │     exp_f │   unexp_f │ l1         │ l2        │
╞══════════════════╪═════════╪═══════╪═══════╪══════════════╪════════════╪═════════╪═══════════╪═══════════╪════════════╪═══════════╡
│ COM~same         │  55,867 │  0.03 │  2.31 │         0.99 │   2,406.02 │  56,190 │ 54,088.53 │  1,778.47 │ COMPLEMENT │ same      │
│ COM~zero         │  11,377 │  0.03 │  1.42 │         0.99 │     391.67 │  11,472 │ 11,042.95 │    334.05 │ COMPLEMENT │ zero      │
│ NEGany~sure      │ 128,824 │  0.12 │  2.19 │         0.15 │ 182,987.33 │ 844,981 │ 31,577.26 │ 97,246.74 │ NEGATED    │ sure      │
│ NEGany~conducive │   1,618 │  0.06 │  1.29 │         0.10 │   1,196.28 │  16,405 │    613.06 │  1,004.94 │ NEGATED    │ conducive │
│ NEGany~correct   │   6,864 │  0.05 │  1.21 │         0.09 │ 

In [57]:
show_sample(adj_amdf
            .sort_values(['f1','LRC'], ascending=False)
            .filter(abbr_FOCUS).filter(regex=r'NEGmir|POS', axis=0)
            .loc[(adj_amdf.l2.isin(top_overall.l2)) & (adj_amdf.LRC>1), :],
            format='fancy_outline',
            assoc=True)

╒═════════════╤═══════╤═══════╤═══════╤══════════════╤══════════╤═══════╤══════════╤═══════════╤════════╤══════╕
│ key         │     f │   dP1 │   LRC │   dP1_simple │       G2 │    f2 │    exp_f │   unexp_f │ l1     │ l2   │
╞═════════════╪═══════╪═══════╪═══════╪══════════════╪══════════╪═══════╪══════════╪═══════════╪════════╪══════╡
│ NEGmir~sure │ 5,753 │  0.43 │  2.72 │         0.59 │ 9,087.14 │ 9,744 │ 1,602.58 │  4,150.42 │ NEGMIR │ sure │
╘═════════════╧═══════╧═══════╧═══════╧══════════════╧══════════╧═══════╧══════════╧═══════════╧════════╧══════╛


In [58]:
def interpret_polar_lrc(amdf: pd.DataFrame) -> pd.DataFrame:
    if "LRC" not in amdf.columns: 
        amdf = adjust_assoc_columns(amdf)
    amdf = amdf.assign(
        polarity=amdf.l1.apply(
            lambda env: 'Negative' if env.startswith('NEG') else 'Positive'),
        significant=amdf.LRC.abs() > 1,
        attract=amdf.LRC.round() > 0)
    amdf = amdf.assign(promote=amdf.significant & amdf.attract,
                       prohibit=amdf.significant & ~amdf.attract)
    
    return amdf


In [59]:

adj_amdf_i = interpret_polar_lrc(adj_amdf
                               .filter(abbr_FOCUS)
                               .loc[adj_amdf.l2.isin(top_overall.l2), :]
                               ).sort_values('LRC',
                                            ascending=False)


In [60]:
pmir_conducive = sample_pickle(data=pmir_exactly, sample_size=4,
                          columns=['all_forms_lower', 'token_str'], filters=['adj_form_lower==conducive'], 
                          print_sample=False)
show_sample(pmir_conducive.assign(token_str = embolden(pmir_conducive.token_str, 
                                                          bold_regex=r' (exactly.conducive) ')))


- *filtering rows...*
  - regex parsing = False
  - Filter expression `adj_form_lower==conducive` matched zero rows. Filter not applied.

### 4 random rows from `input frame`

+------------------------+---------------------------+---------------------------------------------------------+
| hit_id                 | all_forms_lower           | token_str                                               |
| pcc_eng_01_055.9820_x0 | everything_exactly_right  | When someone is looking for a new job , it 's vital     |
| 889134_08:27-29-30     |                           | that they look over their various documents a number of |
|                        |                           | times to make sure everything is exactly right ,        |
|                        |                           | according to Job Monkey .                               |
+------------------------+---------------------------+---------------------------------------------------------+
| pcc_eng_13_020.1821_x0 | every

In [61]:
nmir_conducive = sample_pickle(data=nmir_exactly, sample_size=4,
                          columns=['all_forms_lower', 'token_str'], filters=['adj_form_lower==conducive'], 
                          print_sample=False)
show_sample(nmir_conducive.assign(token_str = embolden(nmir_conducive.token_str, 
                                                          bold_regex=r' (exactly.conducive) ')), format='pipe')


- *filtering rows...*
  - regex parsing = False
  - ✓ Applied filter: `adj_form_lower==conducive`

### All (2) row(s) matching filter(s) from `input frame`

| hit_id                                | all_forms_lower        | token_str                                                                                                              |
|:--------------------------------------|:-----------------------|:-----------------------------------------------------------------------------------------------------------------------|
| pcc_eng_00_042.0217_x0662848_14:1-8-9 | none_exactly_conducive | None of these places or circumstances were __`exactly conducive`__ for a dog , so she " stayed " with my mom and dad . |
| pcc_eng_11_078.1769_x1249405_45:1-5-6 | none_exactly_conducive | None of which were __`exactly conducive`__ to climbing nice , cold mountains .                                         |


### Context-blind *exactly* associated adjectives Polarity Preferences


In [62]:
pd.crosstab(adj_amdf_i.l1, adj_amdf_i.promote)

promote,False,True
l1,Unnamed: 1_level_1,Unnamed: 2_level_1
COMPLEMENT,9,2
NEGATED,6,4
NEGMIR,2,1
POSMIR,9,0



Of the 11 adjectives most strongly associated with _exactly_ when context is ignored: 
- only 6 show any significant polar sensitivity in the superset data,
    - 2 with a Positive lean
    - 4 with a Negative lean
- only 1 shows significant polarity sensitivity in the `mirror` subset
    - with a tendency toward negative environments

Thus, despite the overall negative propensity of *exactly*, half of the most commonly associated adjectives overall are completely neutral on their own. 
Of those that skew toward negative environments, the effect size is relatively weak, with _sure_ garnering an LRC of 2.1-2.7 and _conducive_, _correct_, and _right_ only barely surpassing 1. 

| key              |       f |   dP1 |   LRC |   dP1_simple |         G2 |      f2 |
|:-----------------|--------:|------:|------:|-------------:|-----------:|--------:|
| NEGmir~sure      |   5,753 |  0.43 |  2.72 |         0.59 |   9,087.14 |   9,744 |
| NEGany~sure      | 128,824 |  0.12 |  2.19 |         0.15 | 182,987.33 | 844,981 |
| NEGany~conducive |   1,618 |  0.06 |  1.29 |         0.10 |   1,196.28 |  16,405 |
| NEGany~correct   |   6,864 |  0.05 |  1.21 |         0.09 |   4,049.83 |  78,240 |
| NEGany~right     |  15,740 |  0.04 |  1.04 |         0.08 |   6,902.12 | 204,572 |


Even more strikingly, 2 of these adjectives even demonstrate a bias for _positive_ environments when considered independently of adverb modifier,
although these effect sizes are also quite small, and the accuracy of the parses for these particular adjectives are questionable.

| key      |      f |   dP1 |   LRC |   dP1_simple |       G2 |     f2 |
|:---------|-------:|------:|------:|-------------:|---------:|-------:|
| COM~same | 55,867 |  0.03 |  2.31 |         0.99 | 2,406.02 | 56,190 |
| COM~zero | 11,377 |  0.03 |  1.42 |         0.99 |   391.67 | 11,472 |



In [63]:
show_sample(adj_amdf_i.filter(like='O', axis=0).loc[adj_amdf_i.promote, :'f2'], assoc=True, format='pipe')

| key      |      f |   dP1 |   LRC |   dP1_simple |       G2 |     f2 |
|:---------|-------:|------:|------:|-------------:|---------:|-------:|
| COM~same | 55,867 |  0.03 |  2.31 |         0.99 | 2,406.02 | 56,190 |
| COM~zero | 11,377 |  0.03 |  1.42 |         0.99 |   391.67 | 11,472 |


#### REMEMBER: _The `env~adj` AM are for_ ALL _adverbs, not just_ "exactly"

In [64]:

adj_amdf_i.loc[adj_amdf_i.promote, :].groupby('polarity').value_counts(['l1'])


polarity  l1        
Negative  NEGATED       4
          NEGMIR        1
Positive  COMPLEMENT    2
Name: count, dtype: int64

In [65]:
# attract_only = adj_amdf_i.loc[adj_amdf_i.attract,:]
pd.crosstab(adj_amdf_i.polarity, adj_amdf_i.LRC.apply(lambda x: 'strong_attract' if x > 2.5 else ('weak_attract' if x > 1 else ('neutral' if x > -1 else 'repel'))))


LRC,neutral,repel,strong_attract,weak_attract
polarity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Negative,7,1,1,4
Positive,13,5,0,2


In [66]:
for adj, adj_only_am in adj_amdf_i.groupby('l2'): 
    print(f'\n#### {adj}\n')
    show_sample(adj_only_am.filter(regex=r'^f$|^[dLGupsa]'), assoc=True, format='pipe')
    if any(adj_only_am['promote']): 
        polar_promoted = adj_only_am.loc[adj_only_am.promote, :]
        for adj_in_env in polar_promoted.index:
            lrc_val = polar_promoted.loc[adj_in_env, 'LRC'].round(3).squeeze()
            caveat = "(but weak) " if abs(lrc_val) < 2 else ''
            print(f'\n👀🧲 significant {caveat} LRC found for _{adj}_ adj & `{adj_in_env.split("~")[0]}` env: {lrc_val:.3f}\n')
        show_sample(polar_promoted[['polarity', 'f', 'LRC', 'dP1','dP1_simple']], assoc=True, format='pipe')


#### alike

| key          |      f |   dP1 |   LRC |   dP1_simple |     G2 |   unexp_f | polarity   | significant   | attract   | promote   | prohibit   |
|:-------------|-------:|------:|------:|-------------:|-------:|----------:|:-----------|:--------------|:----------|:----------|:-----------|
| NEGany~alike |    698 |  0.02 |  0.21 |         0.05 |  76.53 |    202.43 | Negative   | False         | False     | False     | False      |
| POS~alike    |    351 | -0.02 |  0.00 |         0.81 |  -1.35 |     -9.10 | Positive   | False         | False     | False     | False      |
| COM~alike    | 12,563 | -0.02 | -0.21 |         0.95 | -76.20 |   -202.05 | Positive   | False         | False     | False     | False      |

#### conducive

| key              |      f |   dP1 |   LRC |   dP1_simple |        G2 |   unexp_f | polarity   | significant   | attract   | promote   | prohibit   |
|:-----------------|-------:|------:|------:|-------------:|----------:|----------:|:-----------|:-

### Adjectives that create the most negatively skewed bigrams when modified by *exactly*

Of the adjectives that most strongly negative bigrams when modified by _exactly_, 
- 0 are independently prone to positive polarity environments
- 5/10 are independently prone to negative environments when data superset is considered 
- 2/9[^1] are independently predisposed to negative environments when only the negative mirrors are considered

| polarity | l1         | prone | count |
|:---------|:-----------|:--------|------:|
| Negative | NEGATED    | True    |     5 |
| Negative | NEGATED    | False   |     5 |
| Negative | NEGMIR     | True    |     2 |
| Negative | NEGMIR     | False   |     7 |
| Positive | COMPLEMENT | True    |     0 |
| Positive | COMPLEMENT | False   |    10 |
| Positive | POSMIR     | True    |     0 |
| Positive | POSMIR     | False   |    10 |

[^1]: only 9 of 10 adjectives identified in the strongly negative bigram selection have an entry in the mirror subset AMs

In [67]:
show_sample(interpret_polar_lrc(adj_amdf.loc[adj_amdf.l2.isin(most_neg.adj),:].filter(abbr_FOCUS))
            .groupby(['polarity','l1']).value_counts(['promote']).reset_index(), format='pipe')

|    | polarity   | l1         | promote   |   count |
|---:|:-----------|:-----------|:----------|--------:|
|  0 | Negative   | NEGATED    | False     |       5 |
|  1 | Negative   | NEGATED    | True      |       5 |
|  2 | Negative   | NEGMIR     | False     |       7 |
|  3 | Negative   | NEGMIR     | True      |       2 |
|  4 | Positive   | COMPLEMENT | False     |      10 |
|  5 | Positive   | POSMIR     | False     |      10 |


In [68]:
neg_bigram_adj = interpret_polar_lrc(
    adj_amdf.loc[adj_amdf.l2.isin(most_neg.adj), :].filter(abbr_FOCUS))
show_sample(neg_bigram_adj.loc[(neg_bigram_adj.polarity == 'Negative') & (neg_bigram_adj.promote), :].value_counts(['l2'])
            .to_frame('# datasets where negatively associated').reset_index(), format='pipe')


|    | l2          |   # datasets where negatively associated |
|---:|:------------|-----------------------------------------:|
|  0 | clear       |                                        2 |
|  1 | sure        |                                        2 |
|  2 | easy        |                                        1 |
|  3 | forthcoming |                                        1 |
|  4 | surprising  |                                        1 |


In [69]:
show_sample(neg_bigram_adj.loc[(neg_bigram_adj.polarity == 'Negative') & (~neg_bigram_adj.promote), :].value_counts(['l2'])
            .to_frame('# datasets where NOT negatively associated').reset_index(), format='pipe')

|    | l2         |   # datasets where NOT negatively associated |
|---:|:-----------|---------------------------------------------:|
|  0 | cheap      |                                            2 |
|  1 | impressive |                                            2 |
|  2 | new        |                                            2 |
|  3 | practical  |                                            2 |
|  4 | shy        |                                            2 |
|  5 | easy       |                                            1 |
|  6 | surprising |                                            1 |


In [70]:
show_sample(neg_bigram_adj.sort_values(['l2','l1']).loc[(neg_bigram_adj.polarity=='Negative') & ~neg_bigram_adj.promote,:'f2'], format='pipe', assoc=True)

| key               |      f |   dP1 |   LRC |   dP1_simple |       G2 |      f2 |
|:------------------|-------:|------:|------:|-------------:|---------:|--------:|
| NEGany~cheap      |  4,121 |  0.01 |  0.29 |         0.05 |   297.40 |  83,765 |
| NEGmir~cheap      |    229 | -0.04 |  0.00 |         0.13 |   -22.04 |   1,829 |
| NEGmir~easy       |  5,055 |  0.09 |  0.65 |         0.25 | 1,008.11 |  20,050 |
| NEGany~impressive |  8,912 |  0.01 |  0.21 |         0.05 |   340.20 | 195,739 |
| NEGmir~impressive |    937 |  0.02 |  0.00 |         0.19 |    16.73 |   5,033 |
| NEGany~new        | 10,471 | -0.00 | -0.13 |         0.03 |  -213.97 | 321,311 |
| NEGmir~new        |  2,136 | -0.00 |  0.00 |         0.16 |    -0.38 |  13,145 |
| NEGany~practical  |  4,133 |  0.02 |  0.63 |         0.06 |   913.41 |  67,263 |
| NEGmir~practical  |    193 | -0.04 |  0.00 |         0.13 |   -17.35 |   1,528 |
| NEGany~shy        |  1,581 | -0.01 | -0.08 |         0.03 |   -60.43 |  50,956 |
| NE

LRC values indicate that, while _clear_ and _sure_ show independent negative association in both datasets, _easy_ and _surprising_ show significant negative association for the superset, but not for the subset.
The adjectives _cheap_, _impressive_, _new_, _practical_, and _shy_ do not show significant negative lean neither superset nor subset.
Finally, _forthcoming_ only appears in the superset evaluation, where it demonstrates a preference for negative polarity environments.

The only adjective in this set with a $\Delta P(\texttt{env}|\texttt{adj})$ value over 0.16, regardless of LRC value, is _sure_ evaluated on the superset at 0.43. 
That is, the probability of an utterance showing evidence of negative polarity when it contains _sure_ is 0.59---59% of the _sure_ tokens occur with negative triggers---and 
    0.43 when the probability when the adjective is anything else is factored in (16% of tokens for all other adjectives are found with negation.)

_Adjectives in negatively associated *exactly* bigrams which show significant independent negative association_

| key                |       f |   dP1 |   LRC |   dP1_simple |         G2 |      f2 |
|:-------------------|--------:|------:|------:|-------------:|-----------:|--------:|
| NEGany~clear       |  72,905 |  0.11 |  2.14 |         0.15 |  99,542.75 | 491,108 |
| NEGmir~clear       |   2,438 |  0.15 |  1.01 |         0.31 |   1,031.26 |   7,833 |
| NEGany~easy        |  87,578 |  0.08 |  1.69 |         0.11 |  83,051.73 | 771,307 |
| NEGany~forthcoming |   2,019 |  0.14 |  2.30 |         0.18 |   3,381.87 |  11,270 |
| NEGany~sure        | 128,824 |  0.12 |  2.19 |         0.15 | 182,987.33 | 844,981 |
| NEGmir~sure        |   5,753 |  0.43 |  2.72 |         0.59 |   9,087.14 |   9,744 |
| NEGany~surprising  |  16,440 |  0.07 |  1.60 |         0.11 |  14,570.16 | 150,067 |

_Adjectives in negatively associated *exactly* bigrams which **do NOT** show significant independent negative association_

| key               |      f |   dP1 |   LRC |   dP1_simple |       G2 |      f2 |
|:------------------|-------:|------:|------:|-------------:|---------:|--------:|
| NEGany~cheap      |  4,121 |  0.01 |  0.29 |         0.05 |   297.40 |  83,765 |
| NEGmir~cheap      |    229 | -0.04 |  0.00 |         0.13 |   -22.04 |   1,829 |
| NEGmir~easy       |  5,055 |  0.09 |  0.65 |         0.25 | 1,008.11 |  20,050 |
| NEGany~impressive |  8,912 |  0.01 |  0.21 |         0.05 |   340.20 | 195,739 |
| NEGmir~impressive |    937 |  0.02 |  0.00 |         0.19 |    16.73 |   5,033 |
| NEGany~new        | 10,471 | -0.00 | -0.13 |         0.03 |  -213.97 | 321,311 |
| NEGmir~new        |  2,136 | -0.00 |  0.00 |         0.16 |    -0.38 |  13,145 |
| NEGany~practical  |  4,133 |  0.02 |  0.63 |         0.06 |   913.41 |  67,263 |
| NEGmir~practical  |    193 | -0.04 |  0.00 |         0.13 |   -17.35 |   1,528 |
| NEGany~shy        |  1,581 | -0.01 | -0.08 |         0.03 |   -60.43 |  50,956 |
| NEGmir~shy        |    152 | -0.04 |  0.00 |         0.13 |   -14.44 |   1,212 |
| NEGmir~surprising |    907 |  0.16 |  0.96 |         0.32 |   416.44 |   2,829 |


In [71]:
show_sample(pd.concat([x.filter(like='new',axis=0).loc[:, ['f','dP1','LRC','G2']] for x in (most_neg, neg_bigram_adj)]), assoc=True, format='pipe')

| key                |       f |   dP1 |   LRC |       G2 |
|:-------------------|--------:|------:|------:|---------:|
| NEGany~exactly_new |   1,378 |  0.93 |  8.54 | 8,697.93 |
| COM~new            | 310,840 |  0.00 |  0.13 |   216.46 |
| POS~new            |  11,009 |  0.00 |  0.00 |     0.39 |
| NEGmir~new         |   2,136 | -0.00 |  0.00 |    -0.38 |
| NEGany~new         |  10,471 | -0.00 | -0.13 |  -213.97 |


In [72]:
show_sample(pd.concat([x.filter(like='shy',axis=0).loc[:, ['f','dP1','LRC','G2']] for x in (most_neg, neg_bigram_adj)]), assoc=True, format='pipe')

| key                |      f |   dP1 |   LRC |     G2 |
|:-------------------|-------:|------:|------:|-------:|
| NEGany~exactly_shy |    124 |  0.96 |  1.53 | 815.15 |
| COM~shy            | 49,375 |  0.01 |  0.08 |  60.95 |
| POS~shy            |  1,060 |  0.04 |  0.00 |  14.46 |
| NEGmir~shy         |    152 | -0.04 |  0.00 | -14.44 |
| NEGany~shy         |  1,581 | -0.01 | -0.08 | -60.43 |


The absence of independent negative bias for the adjectives found in the most negatively associated *exactly* bigrams makes it unlikely that *exactly*'s pattern of negative association overall is unlikely to be inherited from specific adjectives it modifies.
For example, despite _exactly new_ showing a very strong preference for negative polarity environments, the adjective _new_ shows no significant polarity sensitivity when evaluated for all possible adverbs modifiers.

| key                |       f |   dP1 |   LRC |       G2 |
|:-------------------|--------:|------:|------:|---------:|
| NEGany~exactly_new |   1,378 |  0.93 |  8.54 | 8,697.93 |
| COM~new            | 310,840 |  0.00 |  0.13 |   216.46 |
| POS~new            |  11,009 |  0.00 |  0.00 |     0.39 |
| NEGmir~new         |   2,136 | -0.00 |  0.00 |    -0.38 |
| NEGany~new         |  10,471 | -0.00 | -0.13 |  -213.97 |

The same can be said for _(exactly) shy_. 

| key                |      f |   dP1 |   LRC |     G2 |
|:-------------------|-------:|------:|------:|-------:|
| NEGany~exactly_shy |    124 |  0.96 |  1.53 | 815.15 |
| COM~shy            | 49,375 |  0.01 |  0.08 |  60.95 |
| POS~shy            |  1,060 |  0.04 |  0.00 |  14.46 |
| NEGmir~shy         |    152 | -0.04 |  0.00 | -14.44 |
| NEGany~shy         |  1,581 | -0.01 | -0.08 | -60.43 |

If anything, these adjectives appear slightly **dis**inclined toward negative polarity environments on the whole.

### `ENV~bigram` associations for top bigrams

In [73]:
# for loading `polar/*/bigram/*` tables
bigram_floor = 50
mirror_floor = 10

bigram_dfs = {d.name:
              update_index(pd.read_pickle(
                  tuple(d.joinpath('bigram/extra')
                        .glob(f'*35f-7c*min{mirror_floor if d.name == "NEGmirror" else bigram_floor}x*.pkl.gz')
                        )[0]
                  )
              ).filter(like='~exactly_', axis=0)
              for d in POLAR_DIR.iterdir()}
for pat_dir, amdf in bigram_dfs.items():
    print(f'>> {pat_dir} <<')
    show_sample(amdf.sample(min(len(amdf), 8)).filter(FOCUS).filter(regex=r'^([^la]|am)'), assoc=True)
    print('.............')

>> RBdirect <<
+---------------------------+-------+-------+-------+--------------+----------+-------+---------+-----------+
| key                       |     f |   dP1 |   LRC |   dP1_simple |       G2 |    f2 |   exp_f |   unexp_f |
| NEGany~exactly_related    |    63 |  0.76 |  4.47 |         0.80 |   335.75 |    79 |    2.95 |     60.05 |
+---------------------------+-------+-------+-------+--------------+----------+-------+---------+-----------+
| NEGany~exactly_comforting |   100 |  0.93 |  5.15 |         0.97 |   630.48 |   103 |    3.85 |     96.15 |
+---------------------------+-------+-------+-------+--------------+----------+-------+---------+-----------+
| NEGany~exactly_subtle     |   264 |  0.94 |  6.92 |         0.97 | 1,671.02 |   271 |   10.13 |    253.87 |
+---------------------------+-------+-------+-------+--------------+----------+-------+---------+-----------+
| NEGany~exactly_quiet      |    64 |  0.93 |  4.06 |         0.97 |   402.95 |    66 |    2.47 |     61.

#### Most Polarity Sensitive Bigrams (i.e. Negative Leaning because it's _exactly_)

In [74]:
exactly_bigram_amdf = adjust_assoc_columns(pd.concat(bigram_dfs.values()))
exactly_bigram_amdf['abs_LRC'] = exactly_bigram_amdf.LRC.abs()
exactly_bigram_amdf.nlargest(10, 'abs_LRC').filter(adjust_assoc_columns(FOCUS) + ['abs_LRC'])

Unnamed: 0_level_0,f,dP1,LRC,dP1_simple,G2,...,l1,l2,adj,adj_total,abs_LRC
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
NEGany~exactly_sure,8860,1,9,1,54751,...,NEGATED,exactly_sure,sure,844981,9
COM~exactly_sure,441,-1,-9,0,-54737,...,COMPLEMENT,exactly_sure,sure,844981,9
NEGany~exactly_new,1378,1,9,1,8698,...,NEGATED,exactly_new,new,321311,9
NEGany~exactly_easy,1069,1,8,1,6748,...,NEGATED,exactly_easy,easy,771307,8
NEGany~exactly_clear,1759,1,8,1,10937,...,NEGATED,exactly_clear,clear,491108,8
COM~exactly_clear,76,-1,-8,0,-10934,...,COMPLEMENT,exactly_clear,clear,491108,8
NEGany~exactly_cheap,693,1,8,1,4443,...,NEGATED,exactly_cheap,cheap,83765,8
NEGany~exactly_surprising,441,1,7,1,2863,...,NEGATED,exactly_surprising,surprising,150067,7
NEGany~exactly_happy,441,1,7,1,2695,...,NEGATED,exactly_happy,happy,528511,7
NEGany~exactly_ideal,418,1,7,1,2546,...,NEGATED,exactly_ideal,ideal,42701,7


### _Least_ "Negative Leaning" _exactly_ bigrams

In [75]:
exactly_bigram_amdf.loc[exactly_bigram_amdf.dP1<0.6, :].filter(like='NEG', axis=0).nsmallest(10, 'abs_LRC').filter(adjust_assoc_columns(FOCUS) + ['abs_LRC'])

Unnamed: 0_level_0,f,dP1,LRC,dP1_simple,G2,...,l1,l2,adj,adj_total,abs_LRC
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
NEGany~exactly_alike,134,0,0,0,4,...,NEGATED,exactly_alike,alike,13261,0
NEGmir~exactly_true,12,1,0,1,23,...,NEGMIR,exactly_true,true,7402,0
NEGmir~exactly_right,52,0,0,0,38,...,NEGMIR,exactly_right,right,13473,0
NEGmir~exactly_alike,40,0,0,0,42,...,NEGMIR,exactly_alike,alike,431,0
NEGmir~exactly_wrong,21,1,1,1,47,...,NEGMIR,exactly_wrong,wrong,20866,1
NEGany~exactly_equal,72,0,1,0,81,...,NEGATED,exactly_equal,equal,47099,1
NEGany~exactly_right,638,0,1,0,412,...,NEGATED,exactly_right,right,204572,1
NEGany~exactly_identical,81,0,2,0,172,...,NEGATED,exactly_identical,identical,52155,2
NEGany~exactly_wrong,178,0,3,0,457,...,NEGATED,exactly_wrong,wrong,187720,3
NEGany~exactly_correct,259,0,3,0,745,...,NEGATED,exactly_correct,correct,78240,3


In [76]:
am_for_blind = exactly_bigram_amdf.filter(regex='_'+r'|'.join(top_overall.l2), axis=0).filter(items=adjust_assoc_columns(FOCUS)).sort_values('LRC', ascending=False)
am_for_blind = interpret_polar_lrc(am_for_blind)
show_sample(am_for_blind, assoc=True, format='pipe')
print(am_for_blind[['polarity', 'significant', 'attract', 'promote', 'prohibit']].to_markdown(floatfmt=',.2f', intfmt=','))

for pol, pol_df in am_for_blind.groupby('polarity'): 
    print(f'\n#### {pol} Association')
    print(pol_df.filter(['f','dP1', 'LRC','promote', 'prohibit']).to_markdown(floatfmt=',.2f', intfmt=','))


| key                      |     f |   dP1 |   LRC |   dP1_simple |         G2 |    f2 |    exp_f |   unexp_f | l1         | l2                | adj       |   adj_total | polarity   | significant   | attract   | promote   | prohibit   |
|:-------------------------|------:|------:|------:|-------------:|-----------:|------:|---------:|----------:|:-----------|:------------------|:----------|------------:|:-----------|:--------------|:----------|:----------|:-----------|
| NEGany~exactly_sure      | 8,860 |  0.92 |  8.63 |         0.95 |  54,750.58 | 9,301 |   347.58 |  8,512.42 | NEGATED    | exactly_sure      | sure      |     844,981 | Negative   | True          | True      | True      | False      |
| NEGany~exactly_ideal     |   418 |  0.90 |  7.08 |         0.94 |   2,546.29 |   445 |    16.63 |    401.37 | NEGATED    | exactly_ideal     | ideal     |      42,701 | Negative   | True          | True      | True      | False      |
| NEGany~exactly_conducive |   208 |  0.93 |  6.56 |

In [77]:
pd.set_option('display.float_format', '{:,.1f}'.format)
print(exactly_bigram_amdf.filter(like='O', axis=0)[['l1','adj','f', 'dP1','dP1_simple','LRC','G2']].round(1).sort_values([
    'LRC', 
    'dP1_simple',
    'f'
    ], ascending=False).to_markdown(floatfmt=',.1f'))

| key                    | l1         | adj        |    f |   dP1 |   dP1_simple |   LRC |        G2 |
|:-----------------------|:-----------|:-----------|-----:|------:|-------------:|------:|----------:|
| COM~exactly_alike      | COMPLEMENT | alike      | 2906 |  -0.0 |          1.0 |   0.0 |      -3.6 |
| COM~exactly_opposite   | COMPLEMENT | opposite   |  485 |   0.0 |          1.0 |   0.0 |       2.0 |
| COM~exactly_zero       | COMPLEMENT | zero       |  330 |  -0.0 |          1.0 |   0.0 |      -0.1 |
| COM~exactly_many       | COMPLEMENT | many       |   76 |  -0.0 |          1.0 |   0.0 |      -0.0 |
| COM~exactly_contrary   | COMPLEMENT | contrary   |   67 |   0.0 |          1.0 |   0.0 |       5.1 |
| COM~exactly_same       | COMPLEMENT | same       |  457 |  -0.0 |          0.9 |   0.0 |     -13.7 |
| COM~exactly_average    | COMPLEMENT | average    |   73 |  -0.0 |          0.9 |   0.0 |      -2.5 |
| COM~exactly_double     | COMPLEMENT | double     |   53 |  -0.0 |      

In [88]:
exactly_bigram_amdf.filter(like='NEG', axis=0)[['l1','adj','f', 'dP1','dP1_simple','LRC','G2']].round(1).sort_values([
    'LRC', 
    'dP1_simple',
    'f'
    ], ascending=False)

Unnamed: 0_level_0,l1,adj,f,dP1,dP1_simple,LRC,G2
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
NEGany~exactly_sure,NEGATED,sure,8860,0.9,1.0,8.6,54750.6
NEGany~exactly_new,NEGATED,new,1378,0.9,1.0,8.5,8697.9
NEGany~exactly_easy,NEGATED,easy,1069,0.9,1.0,8.4,6747.6
NEGany~exactly_clear,NEGATED,clear,1759,0.9,1.0,8.3,10937.2
NEGany~exactly_cheap,NEGATED,cheap,693,0.9,1.0,8.3,4443.3
...,...,...,...,...,...,...,...
NEGany~exactly_obvious,NEGATED,obvious,50,1.0,1.0,0.2,328.7
NEGmir~exactly_new,NEGMIR,new,29,0.8,1.0,0.0,96.3
NEGmir~exactly_easy,NEGMIR,easy,20,0.8,1.0,0.0,72.2
NEGmir~exactly_true,NEGMIR,true,12,0.5,0.7,0.0,22.6


## Text examples of top compositions (context-blind)

In [89]:

def print_exactly_sample(example_df):    
    if example_df.index.name == 'hit_id': 
        example_df = example_df.reset_index() 
    example_df['hit_id'] = '`' + example_df.hit_id + '`'
    example_df = example_df.set_index('hit_id').filter(regex=r'token|bigram|trigger_lower|all_forms_lower|head')
    example_df = example_df.assign(
        token_str='*' + embolden(example_df.token_str, r'\b([Ee]xactly \w+)\b', mono=False).str.replace('``', '"') + '*',
        )
    example_df = example_df.sort_values(['trigger_lower', 'bigram_lower'] 
                      if 'trigger_lower' in example_df.columns else 'bigram_lower'
                      )[
                          [c for c in ['trigger_lower', 'bigram_lower', 'token_str', 'trigger_head'] if c in example_df.columns]]

    show_sample(example_df, format='pipe')

In [90]:
neg_bigram_ids = full_not_exactly.bigram_id

In [91]:
baseline_sample = pd.read_pickle(TOP_AM_DIR / 'top9adv_sample-9-hit-tables_2024-05-15.pkl.gz').filter(regex=r'bigram|token|text|lower|lemma')
baseline_sample = baseline_sample.loc[baseline_sample.adv_form_lower =='exactly', :]
positive_sample = baseline_sample.loc[~baseline_sample.index.isin(neg_bigram_ids),:]

show_sample(sample_pickle(data=positive_sample, regex=True, sample_size=8, quiet=True, print_sample=False))

+------------------------+-------------------------------------+---------------------------------------------------------+------------------+------------------+-------------------+
| hit_id                 | text_window                         | token_str                                               | adv_form_lower   | adj_form_lower   | bigram_lower      |
| nyt_eng_19950519_0410_ | that are not exactly Biosafety      | the researchers in Zaire `` are seeing it face to face  | exactly          | biosafety        | exactly_biosafety |
| 10:27-28               | Level 4 ,                           | , and they 're seeing a whole lot of it in conditions   |                  |                  |                   |
|                        |                                     | that are not exactly Biosafety Level 4 , '' he said .   |                  |                  |                   |
+------------------------+-------------------------------------+-------------------------------

⚠️ There are a _lot_ of "no (two/NOUN)... exactly alike" cases that were not caught by `RBdirect` 😬
I think this may have influenced the association measures 🤔

In [92]:
positive_sample = positive_sample.loc[
    ~positive_sample.token_str.str.lower()
    .str.contains(r"\bn[o'e]([tr]?|ver|thing|body|where|ne|ither)\b|\bain.?t\b|\b(without|seldom|(scarce|hard|bare|rare)ly)\b", 
                  regex=True),:].drop_duplicates('text_window')
show_sample(sample_pickle(data=positive_sample, regex=True, sample_size=8, quiet=True, print_sample=False))

+------------------------+-------------------------------------+---------------------------------------------------------+------------------+------------------+-----------------------+
| hit_id                 | text_window                         | token_str                                               | adv_form_lower   | adj_form_lower   | bigram_lower          |
| pcc_eng_06_036.8752_x0 | That 's exactly right , and what    | That 's exactly right , and what 's interesting is when | exactly          | right            | exactly_right         |
| 580132_199:3-4         |                                     | you pronounce this , let 's assume that this is         |                  |                  |                       |
|                        |                                     | verbalized in an environment in which people are        |                  |                  |                       |
|                        |                                     | sensitive 

  ~positive_sample.token_str.str.lower()


In [93]:
positive_sample['token_str'] = positive_sample.token_str.str.strip()

In [94]:
len(positive_sample)

2141

In [95]:
samples=[]
top_overall_bigram_lowers = top_overall.index.str.replace('~', '_').to_list()
for bigram in top_overall_bigram_lowers:
    bigram_examples = sample_pickle(data=positive_sample, filters=[f'bigram_lower=={bigram}'], sample_size=10, quiet=True, print_sample=False).reset_index()
    
    # bigram_examples = pd.read_csv(csv)
    # print(bigram)
    bigram_examples['hit_id'] = '`' + bigram_examples.hit_id + '`'

    bigram_sample = bigram_examples.set_index('hit_id').filter(regex=r'token|bigram')
    bigram_sample = bigram_sample.assign(
        token_str=embolden(bigram_sample.token_str, r'\b([Ee]xactly \w+)\b', mono=False),
        # text_window=embolden(bigram_examples.text_window, bigram_regex)
        )
    # show_sample(bigram_sample.head(1))
    samples.append(bigram_sample)
show_sample(pd.concat(samples), format='pipe')

| hit_id                                    | token_str                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            | bigram_lower        |
|:------------------------------------------|:-------------------------------------------------------------------------------

In [96]:
other_positives = positive_sample.loc[~positive_sample.bigram_lower.isin(top_overall_bigram_lowers),:]


In [97]:
other_examples = sample_pickle(data=other_positives, sample_size=60, quiet=True, print_sample=False, regex=True, 
                               filters=['token_str!=.*(re|[ai]s)nt ']).reset_index()
print_exactly_sample(other_examples)

| hit_id                                   | bigram_lower           | token_str                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
|:-----------------------------------------|:-----------------------|:--------------------------------------------------------------------------------------------------------------------------------------

In [98]:
adj_limited_pmir = pmir_exactly.loc[~pmir_exactly.adj_form_lower.isin(['right','same','enough','identical','alike','comfortable','correct']),:].astype('string')
adj_limited_pmir.groupby('trigger_lower').adj_form_lower.value_counts().nlargest(10)

trigger_lower  adj_form_lower
all            opposite          3
always         equal             3
everything     true              3
               perfect           3
or             wrong             3
all            equal             2
everyone       equal             2
everything     different         2
or             equal             2
all            parallel          1
Name: count, dtype: int64

In [99]:
for op in ('every', 'all', 'everyone', 'everybody', 'always'): 
    print(f'\n## *{op}*\n')
    if op in adj_limited_pmir.trigger_lower:
        univ_ex = sample_pickle(data=adj_limited_pmir, sample_size=30, 
                            quiet=True, print_sample=False, regex=True, 
                            filters=[f'trigger_lower==^{op}$', '']).reset_index()
        print_exactly_sample(univ_ex)


## *every*


## *all*


## *everyone*


## *everybody*


## *always*



In [100]:
perpendicular = sample_pickle(data=other_positives, sample_size=60, quiet=True, print_sample=False, regex=True, 
                               filters=['token_str!=.*(re|[ai]s)nt ', 'bigram_lower==exactly_perpendicular']).reset_index()
    
perpendicular['hit_id'] = '`' + perpendicular.hit_id + '`'
perpendicular = perpendicular.set_index('hit_id').filter(regex=r'token|bigram')
perpendicular = perpendicular.assign(
    token_str='*' + embolden(perpendicular.token_str, r'\b([Ee]xactly \w+)\b', mono=False).str.replace('``', '"') + '*',
    ).sort_values('bigram_lower').reset_index()[['bigram_lower', 'token_str', 'hit_id']]

show_sample(perpendicular, format='pipe')

|    | bigram_lower          | token_str                                                                                                                                                                                                                                    | hit_id                                   |
|---:|:----------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------------------------------|
|  0 | exactly_perpendicular | *This line is  __exactly perpendicular__  to the cycloid , and if you drew a circle around this point*                                                                                                                                       | `pcc_eng_06_058.9136_x0937134_099:4-5`   |
|  1 | exactly_perpendicular | *In this context , the rotatio

In [101]:
pmir_exactly.head(3)

Unnamed: 0_level_0,bigram_id,token_str,pattern,category,adv_form,adj_form,text_window,trig_deprel,...,adv_index,adj_index,adv_form_lower,adj_form_lower,bigram_lower,all_forms_lower,trigger_lower,trigger_head
hit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
pcc_eng_13_045.7674_x0723903_18:08-0...,pcc_eng_13_045.7674_x0723903_18:09-10,"And you know what , they 're often e...",pos-mirror-R,POSmirror,exactly,right,"what , they 're often exactly right .",advmod,...,8,9,exactly,right,exactly_right,often_exactly_right,often,R
pcc_eng_13_079.2956_x1265427_12:10-1...,pcc_eng_13_079.2956_x1265427_12:12-13,This is MLS it is suposed to be fun ...,pos-mirror-R,POSmirror,exactly,funny,suposed to be fun or more exactly fu...,cc,...,11,12,exactly,funny,exactly_funny,or_exactly_funny,or,R
pcc_eng_13_102.5595_x1641011_50:3-4-5,pcc_eng_13_102.5595_x1641011_50:4-5,I was both exactly right and exactly...,pos-mirror-R,POSmirror,exactly,right,I was both exactly right and exactly...,advmod,...,3,4,exactly,right,exactly_right,both_exactly_right,both,R


In [102]:
pmir_ex = sample_pickle(data=pmir_exactly, sample_size=40, quiet=True, print_sample=False, regex=True, 
                               filters=['token_str!=.*(re|[ai]s)nt ']).reset_index()


print_exactly_sample(pmir_ex)

| hit_id                                      | trigger_lower   | bigram_lower          | token_str                                                                                                                                                                                                                                                                                                                                                                                                | trigger_head   |
|:--------------------------------------------|:----------------|:----------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-------------

In [103]:
some_ex = pmir_exactly.loc[pmir_exactly.trigger_lower.str.startswith('some'), :].sample(10).reset_index()
print_exactly_sample(some_ex)

| hit_id                                      | trigger_lower   | bigram_lower       | token_str                                                                                                                                                                                                                                                                                                                                      | trigger_head   |
|:--------------------------------------------|:----------------|:-------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------|
| `pcc_eng_17_076.5027_x1220260_178:02-12-13` | someone         | exactly_equivalent | *So someone traveling to Alpha 

In [116]:
FOCUS = ['f',
         'am_p1_given2', 'am_p2_given1', 
         'deltaP_max', 'deltaP_mean',
         'am_p1_given2_simple', 'am_p2_given1_simple',
         'conservative_log_ratio', 'am_log_likelihood',
        #  'mutual_information', 'am_odds_ratio_disc', 't_score',
         'N', 'f1', 'f2', 'E11', 'unexpected_f', 
         'l1', 'l2']
adv_adj_AM = pd.read_pickle(RESULT_DIR.joinpath('assoc_df/adv_adj/RBXadj/extra/AdvAdj_frq-thrMIN-7.35f_min50x_extra.pkl.gz')).filter(FOCUS)
adv_adj_AM.info()

<class 'pandas.core.frame.DataFrame'>
Index: 91093 entries, hip~flexor to very~more
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   f                       91093 non-null  int32   
 1   am_p1_given2            91093 non-null  float32 
 2   am_p2_given1            91093 non-null  float32 
 3   deltaP_max              91093 non-null  float32 
 4   am_p1_given2_simple     91093 non-null  float32 
 5   am_p2_given1_simple     91093 non-null  float32 
 6   conservative_log_ratio  91093 non-null  float32 
 7   am_log_likelihood       91093 non-null  float64 
 8   N                       91093 non-null  int32   
 9   f1                      91093 non-null  int32   
 10  f2                      91093 non-null  int32   
 11  E11                     91093 non-null  float64 
 12  unexpected_f            91093 non-null  float64 
 13  l1                      91093 non-null  category
 14  l2            

In [120]:
pd.set_option('display.float_format', '{:,.2f}'.format)
exactly_adj_AM = adv_adj_AM.loc[adv_adj_AM.l1=='exactly', :].set_index('l2').loc[:, :'unexpected_f']
exactly_adj_AM.index.name = 'adj'


In [121]:
exactly_adj_AM.sort_values('conservative_log_ratio', ascending=False)

Unnamed: 0_level_0,f,am_p1_given2,am_p2_given1,deltaP_max,am_p1_given2_simple,am_p2_given1_simple,conservative_log_ratio,am_log_likelihood,N,f1,f2,E11,unexpected_f
adj,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
alike,3040,0.23,0.05,0.23,0.23,0.05,8.55,29939.31,86330753,61599,13261,9.46,3030.54
opposite,498,0.05,0.01,0.05,0.05,0.01,5.94,3337.27,86330753,61599,9404,6.71,491.29
right,6948,0.03,0.11,0.11,0.03,0.11,5.53,41085.55,86330753,61599,204572,145.97,6802.03
zero,344,0.03,0.01,0.03,0.03,0.01,5.02,1912.07,86330753,61599,11472,8.19,335.81
parallel,224,0.03,0.00,0.03,0.03,0.00,4.90,1238.35,86330753,61599,7577,5.41,218.59
...,...,...,...,...,...,...,...,...,...,...,...,...,...
possible,51,-0.00,-0.00,-0.00,0.00,0.00,-1.28,-252.55,86330753,61599,364265,259.91,-208.91
difficult,129,-0.00,-0.01,-0.00,0.00,0.00,-1.53,-542.68,86330753,61599,835024,595.81,-466.81
good,309,-0.00,-0.02,-0.00,0.00,0.01,-1.80,-1354.64,86330753,61599,2037285,1453.65,-1144.65
much,72,-0.00,-0.02,-0.00,0.00,0.00,-3.24,-2005.57,86330753,61599,1778739,1269.17,-1197.17


In [123]:
exactly_adj_AM.nlargest(20,'deltaP_max')

Unnamed: 0_level_0,f,am_p1_given2,am_p2_given1,deltaP_max,am_p1_given2_simple,am_p2_given1_simple,conservative_log_ratio,am_log_likelihood,N,f1,f2,E11,unexpected_f
adj,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
alike,3040,0.23,0.05,0.23,0.23,0.05,8.55,29939.31,86330753,61599,13261,9.46,3030.54
sure,9301,0.01,0.14,0.14,0.01,0.15,3.89,34895.53,86330753,61599,844981,602.91,8698.09
right,6948,0.03,0.11,0.11,0.03,0.11,5.53,41085.55,86330753,61599,204572,145.97,6802.03
opposite,498,0.05,0.01,0.05,0.05,0.01,5.94,3337.27,86330753,61599,9404,6.71,491.29
perpendicular,52,0.04,0.0,0.04,0.04,0.0,4.63,307.75,86330753,61599,1444,1.03,50.97
analogous,118,0.03,0.0,0.03,0.03,0.0,4.81,669.53,86330753,61599,3706,2.64,115.36
zero,344,0.03,0.01,0.03,0.03,0.01,5.02,1912.07,86330753,61599,11472,8.19,335.81
parallel,224,0.03,0.0,0.03,0.03,0.0,4.9,1238.35,86330753,61599,7577,5.41,218.59
stellar,177,0.02,0.0,0.02,0.03,0.0,4.61,925.08,86330753,61599,6973,4.98,172.02
true,1740,0.0,0.02,0.02,0.0,0.03,2.63,3826.53,86330753,61599,348994,249.02,1490.98


In [118]:
mir_adv_adj_AM = pd.read_pickle(RESULT_DIR.joinpath('assoc_df/adv_adj/ANYmirror/extra/AdvAdj_frq-thrMIN-7.35f_min5x_extra.pkl.gz')
                                    ).filter(FOCUS)
mir_exactly_adj_AM = mir_adv_adj_AM.loc[mir_adv_adj_AM.l1=='exactly', :'unexpected_f']
mir_exactly_adj_AM.sort_values('deltaP_max')

Unnamed: 0_level_0,f,am_p1_given2,am_p2_given1,deltaP_max,deltaP_mean,am_p1_given2_simple,am_p2_given1_simple,conservative_log_ratio,am_log_likelihood,N,f1,f2,E11,unexpected_f
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
exactly~right,135,0.01,0.12,0.12,0.07,0.01,0.13,3.51,529.59,1761853,1034,13473,7.91,127.09
exactly~sure,148,0.01,0.14,0.14,0.08,0.02,0.14,4.14,701.24,1761853,1034,9744,5.72,142.28
