In [1]:
import re
from pathlib import Path

import pandas as pd

from source.utils import POST_PROC_DIR, print_iter
from source.utils.sample import sample_pickle

# Column selectors for example-hit displays.
# NOTE(review): the `WITH::` prefix is presumably a selector syntax understood
# by the project helpers -- confirm against `source.utils`.
HIT_EX_COLS = [
    'WITH::^.[il].*lower',
    'WITH::text',
    'token_str',
]

# Both mirror data sets use the same pickle file name, each under its own
# subdirectory of the post-processing directory.
pkl_name = 'trigger-bigrams_thr0-001p.35f.pkl.gz'
path_dict = {
    mirror_dir: POST_PROC_DIR / mirror_dir / pkl_name
    for mirror_dir in ('POSmirror', 'NEGmirror')
}
path_dict



{'POSmirror': PosixPath('/share/compling/data/sanpi/4_post-processed/POSmirror/trigger-bigrams_thr0-001p.35f.pkl.gz'),
 'NEGmirror': PosixPath('/share/compling/data/sanpi/4_post-processed/NEGmirror/trigger-bigrams_thr0-001p.35f.pkl.gz')}

In [2]:
# Load the positive- and negative-mirror hit tables from their pickles.
pmir, nmir = (
    pd.read_pickle(path_dict[mirror_dir])
    for mirror_dir in ('POSmirror', 'NEGmirror')
)

In [3]:
def str_to_cat(df):
    """Cast the annotation columns of `df` to `category` dtype.

    Every column whose name contains `form`, `bigram`, `lemma`, `deprel`, or
    `head` is converted. The frame is modified in place and also returned, so
    the call can be used as `df = str_to_cat(df)`.
    """
    name_pattern = r'form|bigram|lemma|deprel|head'
    selected = df.filter(regex=name_pattern).columns
    as_categories = df[selected].astype('category')
    df[selected] = as_categories
    return df

In [4]:
# Convert the POSmirror annotation columns to `category` dtype.
pmir = str_to_cat(pmir)

In [5]:
# Convert the NEGmirror annotation columns to `category` dtype.
nmir = str_to_cat(nmir)

In [6]:
def set_col_widths(df):
    """Return one maximum display width per table column of `df`.

    The list covers the index level(s) first and then the data columns, i.e.
    the labels of `df.reset_index()`. A width is chosen by substring match on
    the label; when several substrings match, the last rule listed below wins:

        '_id' -> 22, 'text' -> 40, 'forms' -> 30, '_str' -> 55

    Labels that match no rule get `None` (no width limit).

    Returns:
        list: widths (int or None), one entry per label, duplicates included.
    """
    # Ordered from lowest to highest precedence.
    width_rules = (('_id', 22), ('text', 40), ('forms', 30), ('_str', 55))

    # Only the labels are needed, so reset the index on an empty slice
    # instead of copying every row of a potentially very large frame.
    labels = df.head(0).reset_index().columns

    widths = []
    for label in labels:
        # `str()` keeps non-string labels (e.g. integer column names) from failing.
        label_text = str(label)
        width = None
        for fragment, limit in width_rules:
            if fragment in label_text:
                width = limit
        # Appending per label (rather than keying a dict by label) keeps the
        # result aligned with the table even when column names repeat.
        widths.append(width)
    return widths

In [8]:
# List the column names of each mirror data set for side-by-side comparison.
for label, frame in (('POSmirror', pmir), ('NEGmirror', nmir)):
    print_iter(header=f'{label} columns:', iter_obj=frame.columns.to_list())


POSmirror columns:
▸ adv_form
▸ adj_form
▸ text_window
▸ bigram_id
▸ token_str
▸ mir_deprel
▸ mir_head
▸ mir_lemma
▸ adv_lemma
▸ adj_lemma
▸ mir_form
▸ mir_form_lower
▸ adv_form_lower
▸ adj_form_lower
▸ bigram_lower
▸ all_forms_lower
▸ pattern
▸ category
▸ prev_form_lower

NEGmirror columns:
▸ neg_form
▸ adv_form
▸ adj_form
▸ text_window
▸ bigram_id
▸ token_str
▸ neg_deprel
▸ neg_head
▸ neg_lemma
▸ adv_lemma
▸ adj_lemma
▸ neg_form_lower
▸ adv_form_lower
▸ adj_form_lower
▸ bigram_lower
▸ all_forms_lower
▸ pattern
▸ category
▸ prev_form_lower


In [24]:
def show_sample(df: pd.DataFrame,
                format: str = 'grid',
                limit_cols: bool = True):
    """Print `df` as a text table built by `DataFrame.to_markdown`.

    Args:
        df: frame to display.
        format: table format, passed to `to_markdown` as `tablefmt`.
        limit_cols: when true and `format` is not 'pipe', cap column widths
            using `set_col_widths`; otherwise no column is width-limited.
    """
    wrap_columns = limit_cols and format != 'pipe'
    col_widths_list = (
        set_col_widths(df) if wrap_columns else [None] * len(df.columns)
    )
    table = df.to_markdown(
        floatfmt=',.0f',
        intfmt=',',
        maxcolwidths=col_widths_list,
        tablefmt=format,
    )
    print(table)

In [29]:
# Hit counts per pattern in POSmirror, printed as a pipe table with no column-width caps.
show_sample(pmir.pattern.value_counts().to_frame(), limit_cols=False, format='pipe')

| pattern      |     count |
|:-------------|----------:|
| pos-mirror-R | 1,313,154 |
| pos-mirror-L |   362,347 |


In [28]:
# Hit counts per pattern in NEGmirror, printed as a pipe table with no column-width caps.
show_sample(nmir.pattern.value_counts().to_frame(), limit_cols=False, format='pipe')

| pattern      |   count |
|:-------------|--------:|
| neg-mirror-R | 210,404 |
| neg-mirror-L |  75,031 |


In [32]:
# Regex for the negator "not" / "n't" as a single token.
REGNOT = r"n[o']t"


def embolden(series,
            bold_regex=None):
    """Wrap regex matches in markdown bold + code markup (``__`match`__``).

    Args:
        series: pandas Series of strings. Missing values are passed through
            unchanged instead of raising.
        bold_regex: pattern with exactly one capture group for the text to
            embolden, surrounded by the literal spaces that delimit it.
            Defaults to the space-delimited `REGNOT` token.

    Returns:
        Series with every case-insensitive match replaced by
        `` __`<group 1>`__ `` (the surrounding spaces are re-emitted).

    Note:
        The pattern consumes its delimiting spaces, so of two adjacent
        targets that share a single space only the first is emboldened, and
        a target at the very start or end of a string is never matched.
    """
    bold_regex = bold_regex or f" ({REGNOT}) "
    # Vectorized and NA-safe, unlike calling `re.sub` on each element.
    return series.str.replace(
        bold_regex, r' __`\1`__ ', regex=True, flags=re.I)
    


## Problem Sentences

The following examples are all from the `POSmirror` data set, which should not include any negative triggers.
I believe the issue is due either to unexpected parses or to cases where the negative trigger's dependency is indirect or the negative trigger scopes over the identified positive trigger.

In [35]:
for adv in ['exactly', 'ever', 'necessarily', 'yet']:
    # Pattern used to highlight both the negator and the adverb in the output.
    highlight = f' ({REGNOT}|{adv}) '
    # A negator anywhere before the adverb within the sentence.
    neg_then_adv = f'{REGNOT}.*{adv}'

    for pat_suff in ['L', 'R']:
        row_filters = [
            f'token_str== {REGNOT} .* {adv} ',
            f'adv_form_lower==^{adv}$',
            f'pattern==.*{pat_suff}$',
        ]
        problems = sample_pickle(
            data=pmir,
            sample_size=6,
            regex=True,
            print_sample=False,
            filters=row_filters,
            columns=['mir_form_lower', 'bigram_lower', 'text_window', 'token_str'],
            sort_by='all_forms_lower',
        )

        matching = problems.loc[problems.token_str.str.contains(neg_then_adv)]
        show_sample(
            matching.assign(
                token_str=embolden(problems.token_str, highlight),
                text_window=embolden(problems.text_window, highlight),
            ),
            format='pipe',
            limit_cols=False,
        )


- *filtering rows...*
  - regex parsing = True
  - ✓ Applied filter: `token_str== n[o']t .* exactly `
  - ✓ Applied filter: `adv_form_lower==^exactly$`
  - ✓ Applied filter: `pattern==.*L$`

### All (5) row(s) matching filter(s) from `input frame`

| hit_id                                    | mir_form_lower   | bigram_lower     | text_window                                                  | token_str                                                                                                                                                                                                                                                                                                                                                                                            |
|:------------------------------------------|:-----------------|:-----------------|:-------------------------------------------------------------|:-----------------------------------------------------------------

- This could be dealt with by modifying the patterns (i.e. the `WITHOUT` clauses specifically) and rerunning everything, but
  1. there's no telling how long that would take,
  2. verifying its accuracy is difficult, and
  3. even with 100% accurate patterns for *correct* parses, there is no way to prevent or really even predict all possible *mis*parses.
- So there is a better way: 
  
  The preponderance of positive data provides a large margin for additional data exclusions without unbalancing the samples---in fact, 
  it actually brings `[POSMIR,f1]` _closer_ to the negative sample size, `[NEGMIR, f1]`.

  Therefore, it is possible to simply drop anything with a likely negation preceding the bigram, 
  regardless of the polarity environment the particular syntactic configuration creates, and call it a day.


In [36]:
# The hit ID ends in `:<n>-<n>-<n>`; the middle number is stored as `adv_index`.
# `str.extract` pulls that number out directly instead of a per-element
# `re.search(...).group().strip('-')`.
pmir['adv_index'] = pd.to_numeric(
    pmir.index.to_series()
    .str.split(':').str.get(-1)
    .str.extract(r'-(\d+)-', expand=False),
    downcast='unsigned')

# Everything in the sentence before the adverb: the first `adv_index - 1`
# whitespace-separated tokens (the IDs appear to be 1-based, judging by the
# sample output). Zipping the two columns avoids a row-wise
# `DataFrame.apply`, which builds a Series for every row of a frame with well
# over a million rows.
pmir['preceding_text'] = pd.Series(
    [' '.join(tokens.split()[:adv_ix - 1])
     for tokens, adv_ix in zip(pmir['token_str'], pmir['adv_index'])],
    index=pmir.index, dtype='string')

In [37]:
# Spot-check five random rows in the default grid format (unseeded, so the rows differ between runs).
show_sample(pmir[['preceding_text', 'bigram_lower', 'token_str']].sample(5))

+------------------------+-----------------------------------------+------------------+---------------------------------------------------------+
| hit_id                 | preceding_text                          | bigram_lower     | token_str                                               |
| pcc_eng_20_107.03184_x | There 's something                      | very_comfortable | There 's something very comfortable to that in me . "   |
| 1718038_28:3-4-5       |                                         |                  |                                                         |
+------------------------+-----------------------------------------+------------------+---------------------------------------------------------+
| pcc_eng_19_014.4502_x0 | " Close to Perpignan the wind is always | quite_strong     | " Close to Perpignan the wind is always quite strong    |
| 217288_7:08-09-10      |                                         |                  | and it will be a really stressful da

In [44]:
# Same spot-check as a pipe table, which is not width-limited (unseeded, so the rows differ between runs).
show_sample(pmir[['preceding_text', 'bigram_lower', 'token_str']].sample(5), format='pipe')

| hit_id                                    | preceding_text                                                                                                                                       | bigram_lower   | token_str                                                                                                                                                                           |
|:------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------|:---------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| pcc_eng_17_051.8568_x0821672_01:12-13-14  | We 've all said it when a road or building seems all                                                                                                 | too_familia

In [45]:
# Flag hits whose preceding text contains a likely negation token
# (no, not, n't, nobody, nothing, nowhere, never, none, no one), case-insensitively.
# The group is non-capturing so `str.contains` does not emit its
# "pattern ... has match groups" UserWarning; the matches are unchanged.
# NOTE(review): the pattern requires a space on both sides, and `preceding_text`
# is a plain space-join of tokens, so a negation that is the first token of the
# sentence or the token immediately before the adverb is NOT flagged. The
# character class also lets through non-negations such as " net ". Confirm
# whether either should be handled before regenerating the saved data.
pmir['after_neg'] = pmir.preceding_text.str.lower().str.contains(r" n[o'e](?:t?|body|thing|where|ver|ne| one) ", regex=True)
show_sample(pmir.loc[pmir.after_neg, ['preceding_text', 'bigram_lower', 'token_str']].sample(10))

  pmir['after_neg'] = pmir.preceding_text.str.lower().str.contains(r" n[o'e](t?|body|thing|where|ver|ne| one) ", regex=True)


+------------------------+------------------------------------------+-------------------------+---------------------------------------------------------+
| hit_id                 | preceding_text                           | bigram_lower            | token_str                                               |
| pcc_eng_23_080.3263_x1 | The so-called dismal science has never   | more_controversial      | The so-called dismal science has never been more        |
| 281816_3:11-25-26      | been more popular - or , given its       |                         | popular - or , given its failure to predict or prevent  |
|                        | failure to predict or prevent the recent |                         | the recent financial crisis , more controversial .      |
|                        | financial crisis ,                       |                         |                                                         |
+------------------------+------------------------------------------+-------

In [56]:
# Draw six flagged hits and embolden the negation words in their preceding text.
neg_highlight = f' ({REGNOT}|nobody|nothing|never|none|no) '
some_neg_ex = pmir.loc[pmir.after_neg, ['preceding_text', 'bigram_lower', 'token_str']].sample(6)
some_neg_bold = some_neg_ex.assign(
    preceding_text=embolden(some_neg_ex.preceding_text, neg_highlight))
show_sample(some_neg_bold, format='pipe')

| hit_id                                    | preceding_text                                                                                                  | bigram_lower        | token_str                                                                                                                                               |
|:------------------------------------------|:----------------------------------------------------------------------------------------------------------------|:--------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------|
| pcc_eng_06_050.0638_x0793919_173:24-25-26 | There are many " girl on top " variations that are easy to try and you do __`n't`__ have to be the strongest or | most_flexible       | There are many " girl on top " variations that are easy to try and you do n't have to be the strongest or most flexible couple for them to feel gr

In [59]:
# Count each group once. Summing the boolean flags also yields 0 (rather than
# a KeyError from `value_counts()[...]`) when one of the two groups is empty.
n_after_neg = int(pmir.after_neg.sum())
n_not_after_neg = int((~pmir.after_neg).sum())

print(f'* ${n_not_after_neg:,}$ tokens in `POSmirror` hits not preceded by negation')
print('  > - I.e. what would remain if _all_ potential contaminants were excluded')
print(f'  > - _{n_after_neg:,}_ potential exclusions')
print(f'* ${len(nmir):,}$ tokens in `NEGmirror` hits')
print(f'* Updated difference in hit subtotals: ${n_not_after_neg - len(nmir):,}$')

* $1,459,568$ tokens in `POSmirror` hits not preceded by negation
  > - I.e. what would remain if _all_ potential contaminants were excluded
  > - _215,933_ potential exclusions
* $285,435$ tokens in `NEGmirror` hits
* Updated difference in hit subtotals: $1,174,133$


_Without considering any upper case_
* ~~__1,457,913__ tokens in `POSmirror` hits not preceded by negation~~
    * ~~I.e. what would remain if _all_ potential contaminants were excluded~~
    * ~~_217,588_ potential exclusions~~
---
_Without considering fully upper case triggers_
* ~~__1,460,126__ tokens in `POSmirror` hits not preceded by negation~~
  * ~~I.e. what would remain if _all_ potential contaminants were excluded~~
  * ~~_215,375_ potential exclusions~~
---
_Normalized for case first_
* $1,459,568$ tokens in `POSmirror` hits not preceded by negation
  > - I.e. what would remain if _all_ potential contaminants were excluded
  > - _215,933_ potential exclusions
* $285,435$ tokens in `NEGmirror` hits
* Updated difference in hit subtotals: $1,174,133$

In [60]:
# Keep only hits not flagged as following a negation. The label slice keeps
# every column up to and including `preceding_text`, so it depends on the
# current column order of `pmir`.
not_after_neg = ~pmir.after_neg
enforced_pos = pmir.loc[not_after_neg, :'preceding_text']
enforced_pos.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1459568 entries, apw_eng_19941111_0004_1:14-15-16 to pcc_eng_val_3.11253_x52703_07:08-10-11
Data columns (total 21 columns):
 #   Column           Non-Null Count    Dtype   
---  ------           --------------    -----   
 0   adv_form         1459568 non-null  category
 1   adj_form         1459568 non-null  category
 2   text_window      1459568 non-null  string  
 3   bigram_id        1459568 non-null  category
 4   token_str        1459568 non-null  string  
 5   mir_deprel       1459568 non-null  category
 6   mir_head         1459568 non-null  category
 7   mir_lemma        1459568 non-null  category
 8   adv_lemma        1459568 non-null  category
 9   adj_lemma        1459568 non-null  category
 10  mir_form         1459568 non-null  category
 11  mir_form_lower   1459568 non-null  category
 12  adv_form_lower   1459568 non-null  category
 13  adj_form_lower   1459568 non-null  category
 14  bigram_lower     1459568 non-null  catego

In [61]:
# Re-inspect hits for one adverb after the negation-based exclusion.
# `adv` is reused by the following cell.
adv = 'exactly'
# Ten rows from the filtered frame where the lower-cased adverb form equals `adv`.
new_exactly_ex = sample_pickle(
    data=enforced_pos,
    print_sample=False, sample_size=10,
    columns=['all_forms_lower', 'text_window', 'preceding_text', 'token_str'],
    filters=[f'adv_form_lower=={adv}'],
)

show_sample(new_exactly_ex)


- *filtering rows...*
  - regex parsing = False
  - ✓ Applied filter: `adv_form_lower==exactly`

### 10 random rows matching filter(s) from `input frame`

+------------------------+----------------------------+------------------------------------------+------------------------------------------+---------------------------------------------------------+
| hit_id                 | all_forms_lower            | text_window                              | preceding_text                           | token_str                                               |
| nyt_eng_19990225_0004_ | everybody_exactly_right    | `` Everybody '' is exactly right with    | `` Everybody '' is                       | `` Everybody '' is exactly right with this team .       |
| 56:2-5-6               |                            | this team .                              |                                          |                                                         |
+------------------------+------------------

In [64]:
# Highlight pattern for the adverb chosen in the previous cell (`adv`).
adv_highlight = f' ({adv}) '

for pat_suff in ['R', 'L']:
    # Eight hits per pattern direction, sorted by adjective.
    new_exactly_ex = sample_pickle(
        data=enforced_pos,
        sample_size=8,
        print_sample=False,
        sort_by='adj_form_lower',
        columns=['all_forms_lower', 'text_window', 'token_str'],
        filters=[
            f'adv_form_lower=={adv}',
            f'pattern==pos-mirror-{pat_suff}',
        ],
    )

    emboldened_ex = new_exactly_ex.assign(
        text_window=embolden(new_exactly_ex.text_window, adv_highlight),
        token_str=embolden(new_exactly_ex.token_str, adv_highlight),
    )
    show_sample(emboldened_ex, format='pipe')


- *filtering rows...*
  - regex parsing = False
  - ✓ Applied filter: `adv_form_lower==exactly`
  - ✓ Applied filter: `pattern==pos-mirror-R`

### 8 random rows matching filter(s) from `input frame`

| hit_id                                     | all_forms_lower         | text_window                                                        | token_str                                                                                                                                                                                                                                                       |
|:-------------------------------------------|:------------------------|:-------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|

In [70]:
enforced_path = path_dict['POSmirror'].with_name('enforced-POS-'+path_dict['POSmirror'].name)
if not enforced_path.is_file():
    
    enforced_pos.to_pickle(enforced_path)
    print(f'Updated `POSmirror` hits dataframe saved as:\ \n  `{enforced_path}')
else: 
    print(f'Updated `POSmirror` hits dataframe already exists:\ \n  `{enforced_path}')
    print('\n```shell')
    !ls -ho {enforced_path}
    print('```')
    

Updated `POSmirror` hits dataframe already exists:\ 
  `/share/compling/data/sanpi/4_post-processed/POSmirror/enforced-POS-trigger-bigrams_thr0-001p.35f.pkl.gz

```shell
-rw-r--r-- 1 arh234 207M May 16 23:30 /share/compling/data/sanpi/4_post-processed/POSmirror/enforced-POS-trigger-bigrams_thr0-001p.35f.pkl.gz
```
