In [110]:
import re
from pathlib import Path

import pandas as pd

from source.utils import POST_PROC_DIR, print_iter
from source.utils.sample import sample_pickle

HIT_EX_COLS = ['WITH::^.[il].*lower', 'WITH::text', 'token_str']

pkl_name = 'trigger-bigrams_thr0-001p.35f.pkl.gz'
path_dict = {p: POST_PROC_DIR / p / pkl_name for  p in ('POSmirror','NEGmirror')}
path_dict

{'POSmirror': PosixPath('/share/compling/data/sanpi/4_post-processed/POSmirror/trigger-bigrams_thr0-001p.35f.pkl.gz'),
 'NEGmirror': PosixPath('/share/compling/data/sanpi/4_post-processed/NEGmirror/trigger-bigrams_thr0-001p.35f.pkl.gz')}

In [111]:
nmir = pd.read_pickle(path_dict['NEGmirror'])

In [2]:
pmir = pd.read_pickle(path_dict['POSmirror'])


In [3]:
def str_to_cat(df):
    cat_cols = df.filter(regex=r'form|bigram|lemma|deprel|head').columns
    df[cat_cols] = df[cat_cols].astype('category')
    # df.info()
    return df

In [4]:
pmir = str_to_cat(pmir)

In [79]:
nmir = str_to_cat(nmir)

In [58]:
def set_col_widths(df):
    cols = df.copy().reset_index().columns
    width_dict = (
        {c: None for c in cols}
        | {c: 22 for c in cols[cols.str.contains('_id')]}
        | {c: 45 for c in cols[cols.str.contains('text')]}
        | {c: 30 for c in cols[cols.str.contains('forms')]}
        | {c: 60 for c in cols[cols.str.contains('_str')]})
    return list(width_dict.values())

In [7]:
print_iter(header = 'POSmirror columns:', iter_obj= pmir.columns.to_list())
print_iter(header = 'NEGmirror columns:', iter_obj= nmir.columns.to_list())


POSmirror columns:
▸ adv_form
▸ adj_form
▸ text_window
▸ bigram_id
▸ token_str
▸ mir_deprel
▸ mir_head
▸ mir_lemma
▸ adv_lemma
▸ adj_lemma
▸ mir_form
▸ mir_form_lower
▸ adv_form_lower
▸ adj_form_lower
▸ bigram_lower
▸ all_forms_lower
▸ pattern
▸ category
▸ prev_form_lower

NEGmirror columns:
▸ neg_form
▸ adv_form
▸ adj_form
▸ text_window
▸ bigram_id
▸ token_str
▸ neg_deprel
▸ neg_head
▸ neg_lemma
▸ adv_lemma
▸ adj_lemma
▸ neg_form_lower
▸ adv_form_lower
▸ adj_form_lower
▸ bigram_lower
▸ all_forms_lower
▸ pattern
▸ category
▸ prev_form_lower


In [59]:
def show_sample(df: pd.DataFrame,
                format: str = 'grid',
                limit_cols: bool = True):
    if limit_cols and format != 'pipe':
        col_widths_list = set_col_widths(df)
    else:
        col_widths_list = [None] * len(df.columns)
    print(df.to_markdown(
        floatfmt=',.0f', intfmt=',',
        maxcolwidths=col_widths_list, 
        tablefmt=format
        ))

In [9]:
show_sample(pmir.pattern.value_counts().to_frame(), limit_cols=False, format='pipe')

| pattern      |     count |
|:-------------|----------:|
| pos-mirror-R | 1,313,154 |
| pos-mirror-L |   362,347 |


In [10]:
show_sample(nmir.pattern.value_counts().to_frame(), limit_cols=False, format='pipe')

| pattern      |   count |
|:-------------|--------:|
| neg-mirror-R | 210,404 |
| neg-mirror-L |  75,031 |


In [38]:
REGNOT=r" (n[o']t) "
def embolden(series,
            bold_regex=None):
    bold_regex = re.compile(bold_regex) if bold_regex else REGNOT
    return series.apply(
        lambda x: bold_regex.sub(r' __`\1`__ ', x))
    


## Problem Sentences

The following examples are all from the `POSmirror` data set which should not include any negative triggers. 
I believe the issue may be due to unexpected parses or cases where the negative trigger dependency is indirect or scopes over the identified positive trigger. 

In [12]:
for adv in ['exactly', 'ever', 'necessarily', 'yet']:
    for pat_suff in ['L', 'R']:
        problems = sample_pickle(
            data=pmir, sample_size=6, regex=True, print_sample=False,
            filters=[f'token_str== {REGNOT} .* {adv} ',
                    f'adv_form_lower==^{adv}$', 
                    f'pattern==.*{pat_suff}$'],
            columns=['mir_form_lower', 'bigram_lower', 'text_window', 'token_str'],
            sort_by='all_forms_lower')

        show_sample(
            problems.loc[problems.token_str.str.contains(f'{REGNOT}.*{adv}')].assign(
                token_str=embolden(problems.token_str, f' ({REGNOT}|{adv}) '),
                text_window=embolden(problems.text_window, f' ({REGNOT}|{adv}) ')
            ),
            format='pipe', limit_cols=False)


- *filtering rows...*
  - regex parsing = True
  - ✓ Applied filter: `token_str== n[o']t .* exactly `
  - ✓ Applied filter: `adv_form_lower==^exactly$`
  - ✓ Applied filter: `pattern==.*L$`

### All (5) row(s) matching filter(s) from `input frame`

| hit_id                                    | mir_form_lower   | bigram_lower     | text_window                                                  | token_str                                                                                                                                                                                                                                                                                                                                                                                            |
|:------------------------------------------|:-----------------|:-----------------|:-------------------------------------------------------------|:-----------------------------------------------------------------

- This could be dealt with by modifying the patterns (i.e. the `WITHOUT` clauses specifically) and rerunning everything, but
  1. There's no telling how long that would take 
  2. verifying its accuracy is difficult
  3. even with 100% accurate patterns for *correct* parses, there is no way to prevent or really even predict all possible *mis*parses
- So there is a better way: 
  
  The preponderance of positive data provides a large margin for additional data exclusions without unbalancing the samples---in fact, 
  it actually brings `[POSMIR,f1]` _closer_ to the negative sample size, `[NEGMIR, f1]`.

  Therefore, it is possible to simply drop anything with a likely negation preceding the bigram, 
  regardless of the polarity environment the particular syntactic configuration creates, and call it a day.


In [13]:
pmir['adv_index'] = pd.to_numeric(pmir.index.to_series().str.split(':').str.get(-1).apply(lambda i: re.search(r'-(\d+)-', i).group().strip('-')), downcast='unsigned')
pmir['preceding_text'] = pmir.apply(lambda x: ' '.join(x.token_str.split()[:x.adv_index - 1]), axis='columns').astype('string')

In [14]:
show_sample(pmir[['preceding_text', 'bigram_lower', 'token_str']].sample(5))

+------------------------+------------------------------------------+----------------------+---------------------------------------------------------+
| hit_id                 | preceding_text                           | bigram_lower         | token_str                                               |
| pcc_eng_06_009.0943_x0 | There 's just something                  | so_solid             | There 's just something so solid , so comforting about  |
| 130776_7:4-5-6         |                                          |                      | a piping hot flaky disc which conceals a tender and     |
|                        |                                          |                      | buttery crumb .                                         |
+------------------------+------------------------------------------+----------------------+---------------------------------------------------------+
| nyt_eng_20100804_0062_ | in the Harlem store , popular brands     | conspicuously_absent | i

In [15]:
show_sample(pmir[['preceding_text', 'bigram_lower', 'token_str']].sample(5), format='pipe')

| hit_id                                    | preceding_text                                                                                                                             | bigram_lower     | token_str                                                                                                                                                                     |
|:------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------|:-----------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| pcc_eng_15_088.6009_x1415991_30:4-5-6     | Has she done something                                                                                                                     | so_egregious     | Has she done something so egre

In [28]:
pmir['after_neg'] = pmir.preceding_text.str.lower().str.contains(r"\b(no|n[o']t|no(body| one|thing|where)|(rare|scarce|bare|hard)ly|without|never)\b", regex=True)
show_sample(pmir.loc[pmir.after_neg, ['preceding_text', 'bigram_lower', 'token_str']].sample(10))

  pmir['after_neg'] = pmir.preceding_text.str.lower().str.contains(r"\b(no|n[o']t|no(body| one|thing|where)|(rare|scarce|bare|hard)ly|without|never)\b", regex=True)


+------------------------+------------------------------------------+------------------------+---------------------------------------------------------+
| hit_id                 | preceding_text                           | bigram_lower           | token_str                                               |
| pcc_eng_02_104.3362_x1 | I found that the mood on this album was  | too_mournful           | I found that the mood on this album was consistent      |
| 671023_332:17-18-19    | consistent throughout , never becoming   |                        | throughout , never becoming too uplifting or too        |
|                        | too uplifting or                         |                        | mournful , but steady and very revealing , almost       |
|                        |                                          |                        | meditative .                                            |
+------------------------+------------------------------------------+-------------

In [29]:
some_neg_ex = pmir.loc[pmir.after_neg, ['preceding_text', 'bigram_lower', 'token_str']].sample(6)
show_sample(some_neg_ex.assign(
    preceding_text=embolden(some_neg_ex.preceding_text, 
                            f' ({REGNOT}|nobody|nothing|never|none|no) ')
    ), format='pipe')

| hit_id                                    | preceding_text                                                                                                                                                                                                                                                                                  | bigram_lower        | token_str                                                                                                                                                                                                                                                                                                                                                                   |
|:------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [31]:
print(f'* ${pmir.after_neg.value_counts()[False]:,}$ tokens in `POSmirror` hits not preceded by negation')
print('  > - I.e. what would remain if _all_ potential contaminants were excluded')
print(f'  > - _{pmir.after_neg.value_counts()[True]:,}_ potential exclusions')
print(f'* ${len(nmir):,}$ tokens in `NEGmirror` hits')
print(f'* Remaining Sample Size Discrepancy: ${pmir.after_neg.value_counts()[False] - len(nmir):,}$')

* $1,434,420$ tokens in `POSmirror` hits not preceded by negation
  > - I.e. what would remain if _all_ potential contaminants were excluded
  > - _241,081_ potential exclusions
* $285,435$ tokens in `NEGmirror` hits
* Remaining Sample Size Discrepancy: $1,148,985$


_Without considering any upper case_
* ~~__1,457,913__ tokens in `POSmirror` hits not preceded by negation~~
    * ~~I.e. what would remain if _all_ potential contaminants were excluded~~
    * ~~_217,588_ potential exclusions~~
---
_Without considering fully upper case triggers_
* ~~__1,460,126__ tokens in `POSmirror` hits not preceded by negation~~
  * ~~I.e. what would remain if _all_ potential contaminants were excluded~~
  * ~~_215,375_ potential exclusions~~
---
_Normalized for case first, but not catching negation at very end of preceding text (no whitespace following)_
* ~~**1,459,568** tokens in `POSmirror` hits not preceded by negation~~
  > - ~~I.e. what would remain if _all_ potential contaminants were excluded~~
  > - ~~_215,933_ potential exclusions~~
* ~~Updated difference in hit subtotals: **1,174,133**~~
* $285,435$ tokens in `NEGmirror` hits
---
**_Fixed to catch even `preceding_text` final negative triggers_**
* ~~**1,455,547** tokens in `POSmirror` hits not preceded by negation~~
  > - ~~I.e. what would remain if _all_ potential contaminants were excluded~~
  > - ~~_219,954_ potential exclusions~~
* $285,435$ tokens in `NEGmirror` hits
* ~~Remaining Sample Size Discrepancy: **1,170,112**~~
---
**Strengthened even furthre to catch negative adverbs and "without" and triggers at the _beginning_ of the `preceding_text`**
* $1,434,420$ tokens in `POSmirror` hits not preceded by negation
  > - I.e. what would remain if _all_ potential contaminants were excluded
  > - _241,081_ potential exclusions
* $285,435$ tokens in `NEGmirror` hits
* Remaining Sample Size Discrepancy: $1,148,985$


In [85]:
enforced_pos= pmir.loc[~pmir.after_neg, :'preceding_text']
enforced_pos.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1434420 entries, apw_eng_19941111_0004_1:14-15-16 to pcc_eng_val_3.11253_x52703_07:08-10-11
Data columns (total 21 columns):
 #   Column           Non-Null Count    Dtype   
---  ------           --------------    -----   
 0   adv_form         1434420 non-null  category
 1   adj_form         1434420 non-null  category
 2   text_window      1434420 non-null  string  
 3   bigram_id        1434420 non-null  category
 4   token_str        1434420 non-null  string  
 5   mir_deprel       1434420 non-null  category
 6   mir_head         1434420 non-null  category
 7   mir_lemma        1434420 non-null  category
 8   adv_lemma        1434420 non-null  category
 9   adj_lemma        1434420 non-null  category
 10  mir_form         1434420 non-null  category
 11  mir_form_lower   1434420 non-null  category
 12  adv_form_lower   1434420 non-null  category
 13  adj_form_lower   1434420 non-null  category
 14  bigram_lower     1434420 non-null  catego

In [89]:
adv = 'exactly'
new_exactly_ex = sample_pickle(
    data=enforced_pos,
    print_sample=False, sample_size=10,
    columns=['all_forms_lower', 'token_str'],
    filters=[f'adv_form_lower=={adv}'],
)

show_sample(new_exactly_ex.assign(token_str=embolden(new_exactly_ex.token_str, r' (exactly) ')))


- *filtering rows...*
  - regex parsing = False
  - ✓ Applied filter: `adv_form_lower==exactly`

### 10 random rows matching filter(s) from `input frame`

+------------------------+--------------------------+--------------------------------------------------------------+
| hit_id                 | all_forms_lower          | token_str                                                    |
| pcc_eng_09_044.5386_x0 | all_exactly_right        | All that is __`exactly`__ right , because deciding on a      |
| 704486_15:1-4-5        |                          | university is often the first really major decision of a     |
|                        |                          | young adult 's life .                                        |
+------------------------+--------------------------+--------------------------------------------------------------+
| pcc_eng_10_096.0405_x1 | all_exactly_identical    | The doors were evenly spaced from one another , and they     |
| 536500_662:13-14-15    

In [90]:
for pat_suff in ['R', 'L']:
    new_exactly_ex = sample_pickle(
        data=enforced_pos, sample_size=8,
        print_sample=False, sort_by='adj_form_lower',
        columns=['all_forms_lower', 'text_window', 'token_str'],
        filters=[f'adv_form_lower=={adv}', 
                f'pattern==pos-mirror-{pat_suff}'],
    )

    show_sample(new_exactly_ex.assign(
        text_window=embolden(new_exactly_ex.text_window, f' ({adv}) '),
        token_str=embolden(new_exactly_ex.token_str, f' ({adv}) ')
    ), format='pipe')


- *filtering rows...*
  - regex parsing = False
  - ✓ Applied filter: `adv_form_lower==exactly`
  - ✓ Applied filter: `pattern==pos-mirror-R`

### 8 random rows matching filter(s) from `input frame`

| hit_id                                    | all_forms_lower             | text_window                                                                                                                 | token_str                                                                                                                                                                                                                                                                                                                     |
|:------------------------------------------|:----------------------------|:----------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------

# FIXME 
  👇 🪲

In [91]:
enforced_pos['utt_len']= pd.to_numeric(enforced_pos.token_str.apply(lambda x: int(len(x.split()))), downcast='integer')

In [63]:
dups = enforced_pos.loc[enforced_pos.duplicated(subset=['token_str', 'all_forms_lower'], keep=False), ['all_forms_lower', 'token_str', 'utt_len']]
show_sample(dups.loc[dups.utt_len>=80, :].sort_values(['utt_len', 'token_str']).head(6))

+------------------------+-----------------------+--------------------------------------------------------------+-----------+
| hit_id                 | all_forms_lower       | token_str                                                    |   utt_len |
| pcc_eng_14_107.06555_x | or_too_little         | All of the types of depression may be accompanied by sighs , |        80 |
| 1724020_021:25-26-27   |                       | tears , outbursts , impulsivity , disturbances of sleep (    |           |
|                        |                       | too much or too little ) , eating ( too much or too little ) |           |
|                        |                       | , stomach problems , random aches and pains , changes in     |           |
|                        |                       | libido ( too much or too little ) , and often a reluctance   |           |
|                        |                       | to do anything from taking care of major responsibilities to |     

In [92]:
new_pmir = pd.concat((enforced_pos.loc[enforced_pos.utt_len < 80, :],
                      enforced_pos.loc[enforced_pos.utt_len >= 80, :].drop_duplicates(
                          subset=['token_str', 'all_forms_lower'], keep='first'))
                     )

In [97]:
print(f'{len(new_pmir):,} hits remaining in `POSmirror` set after additional filtering')

1,434,380 hits remaining in `POSmirror` set after additional filtering


In [96]:
show_sample(new_pmir.sample(6)[['all_forms_lower', 'text_window']])

+------------------------+-------------------------+-----------------------------------------------+
| hit_id                 | all_forms_lower         | text_window                                   |
| pcc_eng_09_069.4170_x1 | all_more_human          | this by making him all the more human         |
| 106792_109:17-19-20    |                         | emotionally , and filling                     |
+------------------------+-------------------------+-----------------------------------------------+
| pcc_eng_19_049.7594_x0 | something_very_wrong    | , I could tell something was very wrong today |
| 787089_49:6-8-9        |                         | at Mc Donalds                                 |
+------------------------+-------------------------+-----------------------------------------------+
| pcc_eng_03_003.4198_x0 | or_not_safe             | lock is unsecure ' or ' your lock is not safe |
| 038990_3:35-40-41      |                         | '                                     

In [103]:
new_pmir = new_pmir.loc[:, :'category']
new_pmir['trigger_lower'] = new_pmir['mir_form_lower'].astype('category')
new_pmir.columns

Index(['adv_form', 'adj_form', 'text_window', 'bigram_id', 'token_str',
       'mir_deprel', 'mir_head', 'mir_lemma', 'adv_lemma', 'adj_lemma',
       'mir_form', 'mir_form_lower', 'adv_form_lower', 'adj_form_lower',
       'bigram_lower', 'all_forms_lower', 'pattern', 'category',
       'trigger_lower'],
      dtype='object')

In [112]:
new_path = path_dict['POSmirror'].with_name('LimitedPOS-'+path_dict['POSmirror'].name)

if not new_path.is_file():
    
    new_pmir.loc[:, ].to_pickle(new_path)
    print(f'Updated `POSmirror` hits dataframe saved as:\ \n  `{new_path}`')
else: 
    print(f'Updated `POSmirror` hits dataframe already exists:\ \n  `{new_path}`')
    print('\n```shell')
    !ls -ho {new_path}
    print('```')
    

Updated `POSmirror` hits dataframe saved as:\ 
  `/share/compling/data/sanpi/4_post-processed/POSmirror/LimitedPOS-trigger-bigrams_thr0-001p.35f.pkl.gz`


In [117]:
nmir['utt_len']= nmir.token_str.apply(lambda x: len(x.split()))
new_nmir = pd.concat((nmir.loc[nmir.utt_len < 80, :], nmir.loc[nmir.utt_len >= 80, :].drop_duplicates(subset=['token_str', 'all_forms_lower'], keep='first')))
new_nmir = new_nmir.loc[:, :'category']
new_nmir['trigger_lower'] = new_nmir.neg_form_lower.astype('category')
show_sample(new_nmir.sample(6)[['trigger_lower', 'all_forms_lower', 'text_window']])


+------------------------+-----------------+------------------------+----------------------------------------------+
| hit_id                 | trigger_lower   | all_forms_lower        | text_window                                  |
| pcc_eng_08_055.1052_x0 | nothing         | nothing_more_exciting  | " There 's nothing more exciting than that . |
| 876088_33:4-5-6        |                 |                        | "                                            |
+------------------------+-----------------+------------------------+----------------------------------------------+
| pcc_eng_24_070.1460_x1 | nor             | nor_more_different     | Nor as writers could they have been more     |
| 118408_21:1-8-9        |                 |                        | different in their talents .                 |
+------------------------+-----------------+------------------------+----------------------------------------------+
| pcc_eng_06_057.2967_x0 | never           | never_too_late     

In [118]:

print(f'\n* {len(nmir):,} original hits in `NEGmirror` (`{path_dict["NEGmirror"].relative_to(POST_PROC_DIR)}`)')
print(f'\n* {len(new_nmir):,} hits remaining in `NEGmirror` set after additional filtering of duplicate hits')



* 285,435 original hits in `NEGmirror` (`NEGmirror/trigger-bigrams_thr0-001p.35f.pkl.gz`)

* 285,430 hits remaining in `NEGmirror` set after additional filtering of duplicate hits


In [119]:

new_path = path_dict['NEGmirror'].with_name('LimitedNEG-'+path_dict['NEGmirror'].name)
if not new_path.is_file():
    
    new_nmir.to_pickle(new_path)
    print(f'Updated `NEGmirror` hits dataframe saved as:\ \n  `{new_path}`')
else: 
    print(f'Updated `NEGmirror` hits dataframe already exists:\ \n  `{new_path}`')
    print('\n```shell')
    !ls -ho {new_path}
    print('```')

Updated `NEGmirror` hits dataframe saved as:\ 
  `/share/compling/data/sanpi/4_post-processed/NEGmirror/LimitedNEG-trigger-bigrams_thr0-001p.35f.pkl.gz`
