In [1]:
import re
from pathlib import Path

import pandas as pd

from source.utils import POST_PROC_DIR, print_iter
from source.utils.sample import sample_pickle

HIT_EX_COLS = ['WITH::^.[il].*lower', 'WITH::text', 'token_str']
# sanpi/4_post-processed/POSmirror/trigger-bigrams_frq-thrMIN-7.35f.pkl.gz
pkl_name = 'trigger-bigrams_frq-thrMIN-7.35f.pkl.gz'
path_dict = {p: POST_PROC_DIR / p / pkl_name for  p in ('POSmirror','NEGmirror')}
path_dict



{'POSmirror': PosixPath('/share/compling/data/sanpi/4_post-processed/POSmirror/trigger-bigrams_frq-thrMIN-7.35f.pkl.gz'),
 'NEGmirror': PosixPath('/share/compling/data/sanpi/4_post-processed/NEGmirror/trigger-bigrams_frq-thrMIN-7.35f.pkl.gz')}

In [2]:
nmir = pd.read_pickle(path_dict['NEGmirror'])

In [3]:
pmir = pd.read_pickle(path_dict['POSmirror'])

In [4]:
pmir[pmir.bigram_lower.str.contains(r"[^\w'-]", regex=True)].bigram_lower.value_counts().nlargest(15)

bigram_lower
v._good           2
not_o.k           2
even_1\/4-inch    1
v._expensive      1
fun._amoral       1
v._religious      1
too*_ironic       1
is*_remarkable    1
v._profound       1
def._wrong        1
just_o.k.         1
so*_expensive     1
probably_no.      1
c._ecstatic       1
b)_impossible     1
Name: count, dtype: Int64

In [5]:
def update_form_combos(df): 
    
    return df.assign(
        all_forms_lower = df.filter(regex=r'^[nma][^l]\w+lower$').apply(lambda x: '_'.join(x), axis=1),
        bigram_lower = df.filter(['adv_form_lower', 'adj_form_lower']).apply(lambda x: '_'.join(x), axis=1)
                     )


def remove_odd_orth_forms(df):

    df.loc[:, ['adv_form_lower', 'adj_form_lower', 'adj_lemma', 'adv_lemma']
           ] = df.loc[:, ['adv_form_lower', 'adj_form_lower', 'adj_lemma', 'adv_lemma']
                      ].astype('string')

    def adv_is_very(df):
        return df.adv_form_lower.str.contains(r'^v\.?$|^ve+r+y+$', regex=True)

    def adv_is_def(df):
        return df.adv_form_lower.str.contains(r'^def\.?$', regex=True)

    def adj_is_ok(df):
        return df.adj_form_lower.str.contains(r'^o*\.?k+\.?a*y*$', regex=True)

    print('Dropping most bizarre...')
    print(df.loc[df.bigram_lower.str.contains(r'[\[\\\/)]', regex=True),
          ['adv_form_lower', 'adj_form_lower']].astype('string').value_counts()
          .nlargest(10).to_frame().reset_index()
          .to_markdown(floatfmt=',.0f', intfmt=','))
    df = df.loc[~df.bigram_lower.str.contains(r'[\[\\\/)]', regex=True), :]

    print('Dropping plain numerals as adjectives')
    print(df.loc[df.adj_form_lower.astype('string').str.contains(r'^\d+$'), ['adv_form_lower', 'adj_form_lower', 'text_window']]
          .astype('string').value_counts().nlargest(10).to_frame().reset_index().to_markdown(floatfmt=',.0f')
          )
    df = df.loc[~df.adj_form_lower.astype('string').str.contains(r'^\d+$'), :]

    print('Translating some known orthographic quirks...')
    # > variations on "very"
    print('\n==== very ====')
    print(df.loc[adv_is_very(df), 'adv_form_lower']
          .astype('string').value_counts().nlargest(10).to_frame()
          .to_markdown(floatfmt=',.0f', intfmt=','))
    df.loc[adv_is_very(df), :] = df.loc[adv_is_very(df), :].assign(
        adv_lemma='very',
        adv_form_lower='very')

    # > variations on "ok"
    print('\n==== ok ====')
    print(df.loc[adj_is_ok(df), 'adj_form_lower']
          .astype('string').value_counts().nlargest(10).to_frame().reset_index()
          .to_markdown(floatfmt=',.0f', intfmt=','))
    df.loc[adj_is_ok(df), :] = df.loc[adj_is_ok(df), :].assign(
        adj_form_lower='ok',
        adj_lemma='ok')

    # > variations on "definitely"
    print('\n==== definitely ====')
    print(df.loc[adv_is_def(df), 'adv_form_lower']
          .astype('string').value_counts().nlargest(10).to_frame().reset_index()
          .to_markdown(floatfmt=',.0f', intfmt=','))
    df.loc[adv_is_def(df), :] = df.loc[adv_is_def(df), :].assign(adv_form_lower='definitely',
                                                                 adv_lemma='definitely')

    # > drop any single character "words"
    print(df.loc[df.adv_form_lower.str.contains(
        r'^\w\W*$'), ['adv_form_lower', 'adj_form_lower']]
        .astype('string').value_counts().nlargest(10).to_frame().reset_index()
        .to_markdown(floatfmt=',.0f', intfmt=','))
    print(df.loc[df.adj_form_lower.str.contains(
        r'^\w\W*$'), ['adv_form_lower', 'adj_form_lower']]
        .astype('string').value_counts().nlargest(10).to_frame().reset_index()
        .to_markdown())
    df = df.loc[~((df.adv_form_lower.str.contains(r'^\w\W*$'))
                  | (df.adj_form_lower.str.contains(r'^\w\W*$'))), :]

    # > delete remaining non-word characters (esp. `.` & `*`)
    df = df.assign(
        adv_form_lower=df.adv_form_lower.str.strip(
            '-').str.replace(r'[^a-z0-9&-]', '', regex=True),
        adv_lemma=df.adv_lemma.str.strip(
            '-').str.replace(r'[^a-zA-Z0-9&-]', '', regex=True),
        adj_form_lower=df.adj_form_lower.str.strip(
            '-').str.replace(r'[^a-z0-9&-]', '', regex=True),
        adj_lemma=df.adj_lemma.str.strip(
            '-').str.replace(r'[^a-zA-Z0-9&-]', '', regex=True)
    )
    df = df.loc[~df.adv_form_lower.isin({'is', 'ie'}), :]
    print('**** **** ****')

    print(df.loc[(df.adv_form_lower.str.contains(r"[^\w'-]", regex=True))
                 | (df.adj_form_lower.str.contains(r"[^\w'-]", regex=True)),
                 ['adv_form_lower', 'adj_form_lower']]
          .astype('string').value_counts()
          .nlargest(10).to_frame().reset_index()
          .to_markdown(floatfmt=',.0f', intfmt=',')
          )
    # print(df[df.adv_form_lower.str.contains(r"[^\w'-]", regex=True)].value_counts(['adv_lemma', 'adv_form_lower','adj_form_lower']))
    # print()
    # print(df[df.adj_form_lower.str.contains(r"[^\w'-]", regex=True)].value_counts(['adj_lemma', 'adj_form_lower','adv_form_lower']))
    df.loc[:, ['adv_form_lower', 'adj_form_lower', 'adj_lemma', 'adv_lemma']
           ] = df.loc[:, ['adv_form_lower', 'adj_form_lower', 'adj_lemma', 'adv_lemma']
                      ].astype('category')
    return df.convert_dtypes()


In [6]:
pmir = remove_odd_orth_forms(pmir)

Dropping most bizarre...
|    | adv_form_lower   | adj_form_lower   |   count |
|---:|:-----------------|:-----------------|--------:|
|  0 | 1/3rd            | smaller          |       1 |
|  1 | a)               | sick             |       1 |
|  2 | b)               | better           |       1 |
|  3 | b)               | impossible       |       1 |
|  4 | even             | 1\/4-inch        |       1 |
Dropping plain numerals as adjectives
|    | adv_form_lower   |   adj_form_lower | text_window                                              |   count |
|---:|:-----------------|-----------------:|:---------------------------------------------------------|--------:|
|  0 | more             |              401 | be popping up in many more 401 -LRB- k -RRB- plans       |       1 |
|  1 | mostly           |              401 | workers retire on all or mostly 401 -LRB- k -RRB- assets |       1 |
Translating some known orthographic quirks...

==== very ====
| adv_form_lower   |   count |
|:-

In [7]:
nmir = remove_odd_orth_forms(nmir)

Dropping most bizarre...
|    | adv_form_lower   | adj_form_lower   |   count |
|---:|:-----------------|:-----------------|--------:|
|  0 | more             | [...]            |       1 |
Dropping plain numerals as adjectives
|    | adv_form_lower   |   adj_form_lower | text_window                       |   count |
|---:|:-----------------|-----------------:|:----------------------------------|--------:|
|  0 | out              |                0 | Shanthakumaran Sreesanth no out 0 |       1 |
Translating some known orthographic quirks...

==== very ====
| adv_form_lower   |   count |
|:-----------------|--------:|
| very             |   8,956 |

==== ok ====
|    | adj_form_lower   |   count |
|---:|:-----------------|--------:|
|  0 | ok               |      58 |
|  1 | okay             |      51 |

==== definitely ====
| adv_form_lower   | count   |
|------------------|---------|
|    | adv_form_lower   | adj_form_lower   |   count |
|---:|:-----------------|:-----------------|---

In [8]:
pmir.head()

Unnamed: 0_level_0,bigram_id,token_str,pattern,category,adv_form,adj_form,text_window,mir_deprel,mir_lemma,adv_lemma,...,mir_form,mir_index,adv_index,adj_index,mir_form_lower,adv_form_lower,adj_form_lower,bigram_lower,all_forms_lower,prev_form_lower
hit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
apw_eng_19941111_0004_1:14-15-16,apw_eng_19941111_0004_1:15-16,"after being locked out for 41 days , National ...",pos-mirror-R,POSmirror,too,happy,Hockey League players were all too happy Thurs...,advmod,all,too,...,all,13,14,15,all,too,happy,too_happy,all_too_happy,all
apw_eng_19941111_0011_5:27-29-30,apw_eng_19941111_0011_5:29-30,Volpe and Poland were the only two of the 13 w...,pos-mirror-R,POSmirror,too,dangerous,government 's claim that both were too dangero...,nsubj,both,too,...,both,26,28,29,both,too,dangerous,too_dangerous,both_too_dangerous,were
apw_eng_19941111_0090_14:09-12-13,apw_eng_19941111_0090_14:12-13,`` It seems like the next meeting could always...,pos-mirror-R,POSmirror,most,important,the next meeting could always be the most impo...,advmod,always,most,...,always,8,11,12,always,most,important,most_important,always_most_important,the
apw_eng_19941112_0323_28:08-16-17,apw_eng_19941112_0323_28:16-17,"that was a big concern , for all of us -- that...",pos-mirror-L,POSmirror,so,appalling,"big concern , for all of us -- that he would b...",dep,all,so,...,all,7,15,16,all,so,appalling,so_appalling,all_so_appalling,be
apw_eng_19941113_0019_21:2-8-9,apw_eng_19941113_0019_21:8-9,Handling all of the egos involved was surprisi...,pos-mirror-R,POSmirror,surprisingly,easy,Handling all of the egos involved was surprisi...,advmod,all,surprisingly,...,all,1,7,8,all,surprisingly,easy,surprisingly_easy,all_surprisingly_easy,was


In [9]:
nmir.info()

<class 'pandas.core.frame.DataFrame'>
Index: 293947 entries, pcc_eng_11_001.0326_x0000513_088:14-15-16 to pcc_eng_10_108.10108_x1747378_18:7-8-9
Data columns (total 21 columns):
 #   Column           Non-Null Count   Dtype   
---  ------           --------------   -----   
 0   bigram_id        293947 non-null  string  
 1   token_str        293947 non-null  string  
 2   pattern          293947 non-null  category
 3   category         293947 non-null  category
 4   neg_form         293947 non-null  string  
 5   adv_form         293947 non-null  string  
 6   adj_form         293947 non-null  string  
 7   text_window      293947 non-null  string  
 8   neg_deprel       293947 non-null  string  
 9   neg_lemma        293947 non-null  string  
 10  adv_lemma        293947 non-null  category
 11  adj_lemma        293947 non-null  category
 12  neg_index        293947 non-null  UInt16  
 13  adv_index        293947 non-null  UInt16  
 14  adj_index        293947 non-null  UInt16  
 15  n

In [11]:
pmir.loc[:, ['bigram_lower','all_forms_lower']] = update_form_combos(pmir.filter(like='lower')).loc[:, ['bigram_lower','all_forms_lower']]


In [12]:
nmir.loc[:, ['bigram_lower','all_forms_lower']] = update_form_combos(nmir.filter(like='lower')).loc[:, ['bigram_lower','all_forms_lower']]

In [13]:
def str_to_cat(df):
    cat_cols = df.filter(regex=r'form|bigram|lemma|deprel|head').columns
    df[cat_cols] = df[cat_cols].astype('category')
    # df.info()
    return df

In [14]:
pmir = str_to_cat(pmir)

In [15]:
nmir = str_to_cat(nmir)

In [16]:
def set_col_widths(df):
    cols = df.copy().reset_index().columns
    width_dict = (
        {c: None for c in cols}
        | {c: 22 for c in cols[cols.str.contains('_id')]}
        | {c: 45 for c in cols[cols.str.contains('text')]}
        | {c: 30 for c in cols[cols.str.contains('forms')]}
        | {c: 60 for c in cols[cols.str.contains('_str')]})
    return list(width_dict.values())

In [17]:
print_iter(header = 'POSmirror columns:', iter_obj= pmir.columns.to_list())
print_iter(header = 'NEGmirror columns:', iter_obj= nmir.columns.to_list())


POSmirror columns:
▸ bigram_id
▸ token_str
▸ pattern
▸ category
▸ adv_form
▸ adj_form
▸ text_window
▸ mir_deprel
▸ mir_lemma
▸ adv_lemma
▸ adj_lemma
▸ mir_form
▸ mir_index
▸ adv_index
▸ adj_index
▸ mir_form_lower
▸ adv_form_lower
▸ adj_form_lower
▸ bigram_lower
▸ all_forms_lower
▸ prev_form_lower

NEGmirror columns:
▸ bigram_id
▸ token_str
▸ pattern
▸ category
▸ neg_form
▸ adv_form
▸ adj_form
▸ text_window
▸ neg_deprel
▸ neg_lemma
▸ adv_lemma
▸ adj_lemma
▸ neg_index
▸ adv_index
▸ adj_index
▸ neg_form_lower
▸ adv_form_lower
▸ adj_form_lower
▸ bigram_lower
▸ all_forms_lower
▸ prev_form_lower


In [18]:
def show_sample(df: pd.DataFrame,
                format: str = 'grid',
                limit_cols: bool = True):
    if limit_cols and format != 'pipe':
        col_widths_list = set_col_widths(df)
    else:
        col_widths_list = [None] * len(df.columns)
    print(df.to_markdown(
        floatfmt=',.0f', intfmt=',',
        maxcolwidths=col_widths_list, 
        tablefmt=format
        ))

In [19]:
show_sample(pmir.pattern.value_counts().to_frame(), limit_cols=False, format='pipe')

| pattern      |     count |
|:-------------|----------:|
| pos-mirror-R | 1,364,547 |
| pos-mirror-L |   373,490 |


In [20]:
show_sample(nmir.pattern.value_counts().to_frame(), limit_cols=False, format='pipe')

| pattern      |   count |
|:-------------|--------:|
| neg-mirror-R | 216,886 |
| neg-mirror-L |  77,061 |


In [21]:
REGNOT=r" (n[o']t) "
def embolden(series,
            bold_regex=None):
    bold_regex = re.compile(bold_regex) if bold_regex else REGNOT
    return series.apply(
        lambda x: bold_regex.sub(r' __`\1`__ ', x))

## Problem Sentences

The following examples are all from the `POSmirror` data set which should not include any negative triggers. 
I believe the issue may be due to unexpected parses or cases where the negative trigger dependency is indirect or scopes over the identified positive trigger. 

In [22]:
for adv in ['exactly', 'ever', 'necessarily', 'yet']:
    for pat_suff in ['L', 'R']:
        problems = sample_pickle(
            data=pmir, sample_size=6, regex=True, print_sample=False,
            filters=[f'token_str== {REGNOT} .* {adv} ',
                    f'adv_form_lower==^{adv}$', 
                    f'pattern==.*{pat_suff}$'],
            columns=['mir_form_lower', 'bigram_lower', 'text_window', 'token_str'],
            sort_by='all_forms_lower')

        show_sample(
            problems.loc[problems.token_str.str.contains(f'{REGNOT}.*{adv}')].assign(
                token_str=embolden(problems.token_str, f' ({REGNOT}|{adv}) '),
                text_window=embolden(problems.text_window, f' ({REGNOT}|{adv}) ')
            ),
            format='pipe', limit_cols=False)


- *filtering rows...*
  - regex parsing = True
  - Filter expression `token_str==  (n[o']t)  .* exactly ` matched zero rows. Filter not applied.
  - ✓ Applied filter: `adv_form_lower==^exactly$`
  - ✓ Applied filter: `pattern==.*L$`

### 6 random rows matching filter(s) from `input frame`

| hit_id                                    | mir_form_lower   | bigram_lower      | text_window                                                             | token_str                                                                                                                                                                                                                                                                                                                                                                                      |
|:------------------------------------------|:-----------------|:------------------|:------------------------------------------------------------------------|:-----

  problems.loc[problems.token_str.str.contains(f'{REGNOT}.*{adv}')].assign(



- *filtering rows...*
  - regex parsing = True
  - Filter expression `token_str==  (n[o']t)  .* exactly ` matched zero rows. Filter not applied.
  - ✓ Applied filter: `adv_form_lower==^exactly$`
  - ✓ Applied filter: `pattern==.*R$`

### 6 random rows matching filter(s) from `input frame`

| hit_id                                   | mir_form_lower   | bigram_lower   | text_window                                             | token_str                                               |
|:-----------------------------------------|:-----------------|:---------------|:--------------------------------------------------------|:--------------------------------------------------------|
| pcc_eng_22_098.7323_x1579436_44:04-09-10 | all              | exactly_true   | Well , not all of these things are __`exactly`__ true . | Well , not all of these things are __`exactly`__ true . |


  problems.loc[problems.token_str.str.contains(f'{REGNOT}.*{adv}')].assign(



- *filtering rows...*
  - regex parsing = True
  - Filter expression `token_str==  (n[o']t)  .* ever ` matched zero rows. Filter not applied.
  - ✓ Applied filter: `adv_form_lower==^ever$`
  - ✓ Applied filter: `pattern==.*L$`

### 6 random rows matching filter(s) from `input frame`

| hit_id                                  | mir_form_lower   | bigram_lower   | text_window                                                             | token_str                                                                                                                                                                                                                                                  |
|:----------------------------------------|:-----------------|:---------------|:------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------

  problems.loc[problems.token_str.str.contains(f'{REGNOT}.*{adv}')].assign(



- *filtering rows...*
  - regex parsing = True
  - Filter expression `token_str==  (n[o']t)  .* ever ` matched zero rows. Filter not applied.
  - ✓ Applied filter: `adv_form_lower==^ever$`
  - ✓ Applied filter: `pattern==.*R$`

### 6 random rows matching filter(s) from `input frame`



  problems.loc[problems.token_str.str.contains(f'{REGNOT}.*{adv}')].assign(


TypeError: boolean value of NA is ambiguous

- This could be dealt with by modifying the patterns (i.e. the `WITHOUT` clauses specifically) and rerunning everything, but
  1. There's no telling how long that would take 
  2. verifying its accuracy is difficult
  3. even with 100% accurate patterns for *correct* parses, there is no way to prevent or really even predict all possible *mis*parses
- So there is a better way: 
  
  The preponderance of positive data provides a large margin for additional data exclusions without unbalancing the samples---in fact, 
  it actually brings `[POSMIR,f1]` _closer_ to the negative sample size, `[NEGMIR, f1]`.

  Therefore, it is possible to simply drop anything with a likely negation preceding the bigram, 
  regardless of the polarity environment the particular syntactic configuration creates, and call it a day.


In [23]:
# pmir['adv_index'] = pd.to_numeric(pmir.index.to_series().str.split(':').str.get(-1).apply(lambda i: re.search(r'-(\d+)-', i).group().strip('-')), downcast='unsigned')
pmir['preceding_text'] = pmir.apply(lambda x: x.token_str.split()[:x.adv_index - 1], axis='columns').astype('string').str.join(' ')

In [24]:
show_sample(pmir[['preceding_text', 'bigram_lower', 'token_str']].sample(5))

+------------------------+-----------------------------------------------+----------------+--------------------------------------------------------------+
| hit_id                 | preceding_text                                | bigram_lower   | token_str                                                    |
| pcc_eng_23_038.1062_x0 | As can be seen , Lian and Brian 's episode    | too_familiar   | As can be seen , Lian and Brian 's episode will focus on the |
| 599468_38:24-25-26     | will focus on the topic of autism , a subject |                | topic of autism , a subject Ler himself is all too familiar  |
|                        | Ler himself is                                |                | with as he has family members afflicted with autism and he   |
|                        |                                               |                | explains that it is partly because of this that he feels a   |
|                        |                                            

In [25]:
show_sample(pmir[['preceding_text', 'bigram_lower', 'token_str']].sample(5), format='pipe')

| hit_id                                   | preceding_text                                                                                                                                                                                                              | bigram_lower             | token_str                                                                                                                                                                                                                                                 |
|:-----------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------

In [26]:
pmir['after_neg'] = pmir.preceding_text.str.lower().str.contains(r"\b(no|n[o']t|no(body| one|thing|where)|(rare|scarce|bare|hard)ly|seldom|without|never)\b", regex=True)
show_sample(pmir.loc[pmir.after_neg, ['preceding_text', 'bigram_lower', 'token_str']].sample(10))

  pmir['after_neg'] = pmir.preceding_text.str.lower().str.contains(r"\b(no|n[o']t|no(body| one|thing|where)|(rare|scarce|bare|hard)ly|seldom|without|never)\b", regex=True)


+------------------------+-----------------------------------------------+----------------------+--------------------------------------------------------------+
| hit_id                 | preceding_text                                | bigram_lower         | token_str                                                    |
| pcc_eng_01_026.0198_x0 | Sitting around in 95 degree weather just      | necessarily_fun      | Sitting around in 95 degree weather just waiting on some     |
| 404751_13:22-23-24     | waiting on some sweaty dudes to show up all   |                      | sweaty dudes to show up all at once is not easy or           |
|                        | at once is not easy                           |                      | necessarily fun .                                            |
+------------------------+-----------------------------------------------+----------------------+--------------------------------------------------------------+
| pcc_eng_25_047.4015_x0 | I may n

In [27]:
some_neg_ex = pmir.loc[pmir.after_neg, ['preceding_text', 'bigram_lower', 'token_str']].sample(6)
show_sample(some_neg_ex.assign(
    preceding_text=embolden(some_neg_ex.preceding_text, 
                            f' ({REGNOT}|nobody|nothing|never|none|no) ')
    ), format='pipe')

| hit_id                                    | preceding_text                                                                                                                                   | bigram_lower       | token_str                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            |
|:------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------|:----------

In [28]:
print(f'* ${pmir.after_neg.value_counts()[False]:,}$ tokens in `POSmirror` hits not preceded by negation')
print('  > - I.e. what would remain if _all_ potential contaminants were excluded')
print(f'  > - _{pmir.after_neg.value_counts()[True]:,}_ potential exclusions')
print(f'* ${len(nmir):,}$ tokens in `NEGmirror` hits')
print(f'* Remaining Sample Size Discrepancy: ${pmir.after_neg.value_counts()[False] - len(nmir):,}$')

* $1,490,579$ tokens in `POSmirror` hits not preceded by negation
  > - I.e. what would remain if _all_ potential contaminants were excluded


  > - _247,458_ potential exclusions
* $293,947$ tokens in `NEGmirror` hits
* Remaining Sample Size Discrepancy: $1,196,632$


## Effect of Negation Removals



### For 868+ frequency filtered ad* forms 
  
  _Without considering any upper case_
  * ~~__1,457,913__ tokens in `POSmirror` hits not preceded by negation~~
      * ~~I.e. what would remain if _all_ potential contaminants were excluded~~
      * ~~_217,588_ potential exclusions~~
  ---
  _Without considering fully upper case triggers_
  * ~~__1,460,126__ tokens in `POSmirror` hits not preceded by negation~~
  * ~~I.e. what would remain if _all_ potential contaminants were excluded~~
  * ~~_215,375_ potential exclusions~~
  ---
  _Normalized for case first, but not catching negation at very end of preceding text (no whitespace following)_
  * ~~**1,459,568** tokens in `POSmirror` hits not preceded by negation~~
  > - ~~I.e. what would remain if _all_ potential contaminants were excluded~~
  > - ~~_215,933_ potential exclusions~~
  * ~~Updated difference in hit subtotals: **1,174,133**~~
  * $285,435$ tokens in `NEGmirror` hits
  ---
  **_Fixed to catch even `preceding_text` final negative triggers_**
  * ~~**1,455,547** tokens in `POSmirror` hits not preceded by negation~~
  > - ~~I.e. what would remain if _all_ potential contaminants were excluded~~
  > - ~~_219,954_ potential exclusions~~
  * $285,435$ tokens in `NEGmirror` hits
  * ~~Remaining Sample Size Discrepancy: **1,170,112**~~
  ---
  **Strengthened even furthre to catch negative adverbs and "without" and triggers at the _beginning_ of the `preceding_text`**
  * $1,434,420$ tokens in `POSmirror` hits not preceded by negation
  > - I.e. what would remain if _all_ potential contaminants were excluded
  > - _241,081_ potential exclusions
  * $285,435$ tokens in `NEGmirror` hits
  * Remaining Sample Size Discrepancy: $1,148,985$


### For 7+ frequency filtered ad\* forms

_Without orthography adjustments_
* ~~**1,487,458** tokens in `POSmirror` hits not preceded by negation~~
    > - ~~I.e. what would remain if _all_ potential contaminants were excluded~~
    > - ~~_250,661_ potential exclusions~~
* ~~**293,964** tokens in `NEGmirror` hits~~
* ~~Remaining Sample Size Discrepancy: **1,193,494**~~

_**With** orthography adjustments/filtering_
* $1,490,579$ tokens in `POSmirror` hits not preceded by negation
  > - I.e. what would remain if _all_ potential contaminants were excluded
  > - _247,458_ potential exclusions
* $293,947$ tokens in `NEGmirror` hits
* Remaining Sample Size Discrepancy: $1,196,632$




In [29]:
enforced_pos= pmir.loc[~pmir.after_neg, :'preceding_text']
enforced_pos.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1490579 entries, apw_eng_19941111_0004_1:14-15-16 to pcc_eng_26_108.10246_x1747457_06:6-8-9
Data columns (total 22 columns):
 #   Column           Non-Null Count    Dtype   
---  ------           --------------    -----   
 0   bigram_id        1490579 non-null  category
 1   token_str        1490579 non-null  string  
 2   pattern          1490579 non-null  category
 3   category         1490579 non-null  category
 4   adv_form         1490579 non-null  category
 5   adj_form         1490579 non-null  category
 6   text_window      1490579 non-null  string  
 7   mir_deprel       1490579 non-null  category
 8   mir_lemma        1490579 non-null  category
 9   adv_lemma        1490579 non-null  category
 10  adj_lemma        1490579 non-null  category
 11  mir_form         1490579 non-null  category
 12  mir_index        1490579 non-null  UInt16  
 13  adv_index        1490579 non-null  UInt16  
 14  adj_index        1490579 non-null  UInt16

In [30]:
adv = 'exactly'
new_exactly_ex = sample_pickle(
    data=enforced_pos,
    print_sample=False, sample_size=10,
    columns=['all_forms_lower', 'token_str'],
    filters=[f'adv_form_lower=={adv}'],
)

show_sample(new_exactly_ex.assign(token_str=embolden(new_exactly_ex.token_str, r' (exactly) ')))


- *filtering rows...*
  - regex parsing = False
  - ✓ Applied filter: `adv_form_lower==exactly`

### 10 random rows matching filter(s) from `input frame`

+------------------------+----------------------------+--------------------------------------------------------------+
| hit_id                 | all_forms_lower            | token_str                                                    |
| pcc_eng_04_104.1453_x1 | something_exactly_opposite | misrepresentation is the kind of thing you and Goerzen do    |
| 666254_08:20-21-22     |                            | where you outright lie and say that somebody said something  |
|                        |                            | __`exactly`__ opposite to what they actually said . >        |
+------------------------+----------------------------+--------------------------------------------------------------+
| pcc_eng_07_101.4447_x1 | all_exactly_alike          | The Man Whose Teeth Were All Exactly Alike                   |
| 623411_11

In [31]:
for pat_suff in ['R', 'L']:
    new_exactly_ex = sample_pickle(
        data=enforced_pos, sample_size=8,
        print_sample=False, sort_by='adj_form_lower',
        columns=['all_forms_lower', 'text_window', 'token_str'],
        filters=[f'adv_form_lower=={adv}', 
                f'pattern==pos-mirror-{pat_suff}'],
    )

    show_sample(new_exactly_ex.assign(
        text_window=embolden(new_exactly_ex.text_window, f' ({adv}) '),
        token_str=embolden(new_exactly_ex.token_str, f' ({adv}) ')
    ), format='pipe')


- *filtering rows...*
  - regex parsing = False
  - ✓ Applied filter: `adv_form_lower==exactly`
  - ✓ Applied filter: `pattern==pos-mirror-R`

### 8 random rows matching filter(s) from `input frame`

| hit_id                                    | all_forms_lower          | text_window                                                                 | token_str                                                                                                                                                                         |
|:------------------------------------------|:-------------------------|:----------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| pcc_eng_03_074.5040_x1190619_26:1-6-7     | or_exactly_correct       | Or maybe the result wasnt __`exactly`__ correct because he was n

## Remove Duplicated `text_window`+`all_forms_lower`

In [32]:
enforced_pos['utt_len']= pd.to_numeric(enforced_pos.token_str.apply(lambda x: int(len(x.split()))), downcast='integer')

In [33]:
dups_token_str = enforced_pos.loc[enforced_pos.duplicated(subset=['token_str', 'all_forms_lower']), ['all_forms_lower', 'token_str','text_window', 'utt_len']]
dups_text_window = enforced_pos.loc[enforced_pos.duplicated(subset=['text_window', 'all_forms_lower']), ['all_forms_lower', 'token_str', 'text_window','utt_len']]
dups_both = enforced_pos.loc[enforced_pos.duplicated(subset=['text_window', 'token_str', 'all_forms_lower']), ['all_forms_lower', 'token_str','text_window', 'utt_len']]
# show_sample(dups.loc[dups.utt_len>=80, :].sort_values(['utt_len', 'token_str']).head(6))
print_iter([f'token_str:   {len(dups_token_str):,}', 
            f'text_window: {len(dups_text_window):,}',
            f'both:        {len(dups_both):,}'], header='Potental Removals (no `utt_len` filter applied)')


Potental Removals (no `utt_len` filter applied)
▸ token_str:   15,394
▸ text_window: 36,885
▸ both:        14,861


In [34]:
print(f'`text_window` duplicates when restricted to 20+ tokens in `token_str`: {len(dups_text_window.loc[dups_text_window.utt_len>=20, :]):,}')

`text_window` duplicates when restricted to 20+ tokens in `token_str`: 16,570


These duplicates are retained since it's too messy to separate the clearly carbon copy utterances from plausible genuine production. 

Highly Suspicious

|     | text_window                                               | \#tokens in sentence | \#duplications |
|----:|:----------------------------------------------------------|---------------------:|---------------:|
|   7 | _Everything you see here is absolutely FREE to watch ._   |                   10 |             60 |
|  17 | _All of Swedenborg 's works are well worth reading ._     |                   10 |             43 |
|  60 | _2 fig Something quintessentially Canadian ._             |                    6 |             20 |
|  65 | _Everybody is Super Heady in our sandbox ._               |                    8 |             19 |
| 119 | _Because sometimes , 140 characters just is n't enough ._ |                   10 |             13 |
| 143 | _" Or maybe stupid , " Ebenezar countered_                |                    9 |             11 |
| 188 | _Because Sometimes 140 Characters Just Is n't Enough_     |                    8 |              9 |
| 283 | _There 's something very wrong with our pterosaurs ._     |                    9 |              7 |
| 290 | _The plaid decoration is all very good ._                 |                    8 |              7 |
| 482 | _wrap up something as special as she is [...]_            |                    9 |              5 |

Plausible Production

|    | text_window                                    | \#tokens in sentence | \#duplications |
|---:|:-----------------------------------------------|---------------------:|---------------:|
|  2 | _Something was n't right ._                    |                    5 |            118 |
|  3 | _But I 'm sure everything will be just fine ._ |                   10 |            102 |
|  8 | _It 's all very confusing ._                   |                    6 |             56 |
|  6 | _We should all be so lucky ._                  |                    7 |             69 |
| 10 | _It 's all very exciting ._                    |                    6 |             54 |
| 19 | _Something is definitely wrong ._              |                    5 |             41 |
| 20 | _Looking for something more specific ?_        |                    6 |             40 |
| 25 | _Both are equally important ._                 |                    5 |             33 |
| 31 | _My cup is always half full ._                 |                    7 |             30 |
| 39 | _If only it were all so simple !_              |                    9 |             24 |
| 48 | _Everything is not awesome ._                  |                    5 |             23 |



In [35]:
dups_text_window.loc[(dups_text_window.utt_len < 20), :].value_counts(['text_window', 'utt_len']).to_frame().reset_index().sort_values(['count','text_window', ], ascending=False)

Unnamed: 0,text_window,utt_len,count
0,Some of your changes are now live .,8,384
1,And now for something completely different .,7,122
2,Something was n't right .,5,118
3,But I 'm sure everything will be just fine .,10,102
4,And now for something completely different ...,7,75
...,...,...,...
7968,""" And now for something completely different . """,17,1
7965,""" All of us at Whataburger are so happy to get...",16,1
7964,""" All along I have been completely consistent ...",18,1
8972,""" Abusers are often very adept at identifying ...",10,1


In [36]:
def weed_windows(df):
    return pd.concat(
        [df.loc[df.utt_len < 20, :],
         df.loc[df.utt_len >= 20, :].drop_duplicates(
            subset=['text_window', 'all_forms_lower'])]
    )

In [37]:
enforced_pos.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1490579 entries, apw_eng_19941111_0004_1:14-15-16 to pcc_eng_26_108.10246_x1747457_06:6-8-9
Data columns (total 23 columns):
 #   Column           Non-Null Count    Dtype   
---  ------           --------------    -----   
 0   bigram_id        1490579 non-null  category
 1   token_str        1490579 non-null  string  
 2   pattern          1490579 non-null  category
 3   category         1490579 non-null  category
 4   adv_form         1490579 non-null  category
 5   adj_form         1490579 non-null  category
 6   text_window      1490579 non-null  string  
 7   mir_deprel       1490579 non-null  category
 8   mir_lemma        1490579 non-null  category
 9   adv_lemma        1490579 non-null  category
 10  adj_lemma        1490579 non-null  category
 11  mir_form         1490579 non-null  category
 12  mir_index        1490579 non-null  UInt16  
 13  adv_index        1490579 non-null  UInt16  
 14  adj_index        1490579 non-null  UInt16

In [38]:
new_pmir = weed_windows(enforced_pos)
print(f'{len(new_pmir):,} hits remaining in `POSmirror` set after additional filtering', 
      f'({len(enforced_pos) - len(new_pmir):,} hits removed as duplicates.)', 
      sep='\n')
# show_sample(new_pmir.sample(6)[['all_forms_lower', 'text_window']])

1,475,109 hits remaining in `POSmirror` set after additional filtering
(15,470 hits removed as duplicates.)


In [39]:
new_pmir.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1475109 entries, apw_eng_19941111_0090_14:09-12-13 to pcc_eng_26_108.10246_x1747457_06:6-8-9
Data columns (total 23 columns):
 #   Column           Non-Null Count    Dtype   
---  ------           --------------    -----   
 0   bigram_id        1475109 non-null  category
 1   token_str        1475109 non-null  string  
 2   pattern          1475109 non-null  category
 3   category         1475109 non-null  category
 4   adv_form         1475109 non-null  category
 5   adj_form         1475109 non-null  category
 6   text_window      1475109 non-null  string  
 7   mir_deprel       1475109 non-null  category
 8   mir_lemma        1475109 non-null  category
 9   adv_lemma        1475109 non-null  category
 10  adj_lemma        1475109 non-null  category
 11  mir_form         1475109 non-null  category
 12  mir_index        1475109 non-null  UInt16  
 13  adv_index        1475109 non-null  UInt16  
 14  adj_index        1475109 non-null  UInt1

### Add `trigger_lower` column to `POSmirror` table

In [40]:
new_pmir = new_pmir.loc[:, :'all_forms_lower']
new_pmir['trigger_lower'] = new_pmir['mir_form_lower'].astype('category')
new_pmir.columns

Index(['bigram_id', 'token_str', 'pattern', 'category', 'adv_form', 'adj_form',
       'text_window', 'mir_deprel', 'mir_lemma', 'adv_lemma', 'adj_lemma',
       'mir_form', 'mir_index', 'adv_index', 'adj_index', 'mir_form_lower',
       'adv_form_lower', 'adj_form_lower', 'bigram_lower', 'all_forms_lower',
       'trigger_lower'],
      dtype='object')

In [41]:
new_path = path_dict['POSmirror'].with_name('LimitedPOS-'+path_dict['POSmirror'].name)

if not new_path.is_file():
    
    new_pmir.loc[:, ].to_pickle(new_path)
    print(f'Updated `POSmirror` hits dataframe saved as:\ \n  `{new_path}`')
else: 
    print(f'Updated `POSmirror` hits dataframe already exists:\ \n  `{new_path}`')
    print('\n```shell')
    !ls -ho {new_path}
    print('```')
    

Updated `POSmirror` hits dataframe saved as:\ 
  `/share/compling/data/sanpi/4_post-processed/POSmirror/LimitedPOS-trigger-bigrams_frq-thrMIN-7.35f.pkl.gz`


## Remove Duplication from `NEGmirror` as well

In [42]:
nmir.info()

<class 'pandas.core.frame.DataFrame'>
Index: 293947 entries, pcc_eng_11_001.0326_x0000513_088:14-15-16 to pcc_eng_10_108.10108_x1747378_18:7-8-9
Data columns (total 21 columns):
 #   Column           Non-Null Count   Dtype   
---  ------           --------------   -----   
 0   bigram_id        293947 non-null  category
 1   token_str        293947 non-null  string  
 2   pattern          293947 non-null  category
 3   category         293947 non-null  category
 4   neg_form         293947 non-null  category
 5   adv_form         293947 non-null  category
 6   adj_form         293947 non-null  category
 7   text_window      293947 non-null  string  
 8   neg_deprel       293947 non-null  category
 9   neg_lemma        293947 non-null  category
 10  adv_lemma        293947 non-null  category
 11  adj_lemma        293947 non-null  category
 12  neg_index        293947 non-null  UInt16  
 13  adv_index        293947 non-null  UInt16  
 14  adj_index        293947 non-null  UInt16  
 15  n

In [43]:
nmir['utt_len']= nmir.token_str.apply(lambda x: len(x.split()))
new_nmir = weed_windows(nmir)
new_nmir = new_nmir.loc[:, :'all_forms_lower']
new_nmir['trigger_lower'] = new_nmir.neg_form_lower.astype('category')
show_sample(new_nmir.sample(4)[['trigger_lower', 'all_forms_lower', 'text_window']])

+------------------------+-----------------+-------------------------------+-----------------------------------------------+
| hit_id                 | trigger_lower   | all_forms_lower               | text_window                                   |
| pcc_eng_15_002.5425_x0 | never           | never_entirely_separate       | the two worlds can never be entirely separate |
| 024650_37:6-8-9        |                 |                               | , the overlap in                              |
+------------------------+-----------------+-------------------------------+-----------------------------------------------+
| pcc_eng_10_038.8129_x0 | nor             | nor_epistemically_transparent | constrained in its options nor epistemically  |
| 611668_40:13-14-15     |                 |                               | transparent , as the examples                 |
+------------------------+-----------------+-------------------------------+-----------------------------------------------+


In [44]:
print(f'\n* {len(nmir):,} original hits in `NEGmirror` (`{path_dict["NEGmirror"].relative_to(POST_PROC_DIR)}`)')
print(f'* {len(new_nmir):,} hits remaining in `NEGmirror` set after additional filtering of duplicate hits')
print(f'  ({len(nmir) - len(new_nmir):,} hits removed as duplicates.)', 
      sep='\n')


* 293,947 original hits in `NEGmirror` (`NEGmirror/trigger-bigrams_frq-thrMIN-7.35f.pkl.gz`)
* 289,759 hits remaining in `NEGmirror` set after additional filtering of duplicate hits
  (4,188 hits removed as duplicates.)


In [45]:
new_path = path_dict['NEGmirror'].with_name('LimitedNEG-'+path_dict['NEGmirror'].name)
if not new_path.is_file():
    
    new_nmir.to_pickle(new_path)
    print(f'Updated `NEGmirror` hits dataframe saved as:\ \n  `{new_path}`')
else: 
    print(f'Updated `NEGmirror` hits dataframe already exists:\ \n  `{new_path}`')
    print('\n```shell')
    !ls -ho {new_path}
    print('```')

Updated `NEGmirror` hits dataframe saved as:\ 
  `/share/compling/data/sanpi/4_post-processed/NEGmirror/LimitedNEG-trigger-bigrams_frq-thrMIN-7.35f.pkl.gz`


## After Removing Additional Duplication

* $1,472,077$ hits remaining in `POSmirror` set after additional filtering\
  (15,381 hits removed as duplicates.)

* $289,776$ hits remaining in `NEGmirror` set after additional filtering of duplicate hits\
  (4,188 hits removed as duplicates.)
