In [1]:
import re
from pathlib import Path

import pandas as pd

from source.utils import POST_PROC_DIR, print_iter
from source.utils.sample import sample_pickle

HIT_EX_COLS = ['WITH::^.[il].*lower', 'WITH::text', 'token_str']
# sanpi/4_post-processed/POSmirror/trigger-bigrams_frq-thrMIN-7.35f.pkl.gz
pkl_name = 'trigger-bigrams_frq-thrMIN-7.35f.pkl.gz'
path_dict = {p: POST_PROC_DIR / p / pkl_name for  p in ('POSmirror','NEGmirror')}
path_dict



{'POSmirror': PosixPath('/share/compling/data/sanpi/4_post-processed/POSmirror/trigger-bigrams_frq-thrMIN-7.35f.pkl.gz'),
 'NEGmirror': PosixPath('/share/compling/data/sanpi/4_post-processed/NEGmirror/trigger-bigrams_frq-thrMIN-7.35f.pkl.gz')}

In [2]:
nmir = pd.read_pickle(path_dict['NEGmirror'])

In [3]:
pmir = pd.read_pickle(path_dict['POSmirror'])


In [4]:
def str_to_cat(df):
    cat_cols = df.filter(regex=r'form|bigram|lemma|deprel|head').columns
    df[cat_cols] = df[cat_cols].astype('category')
    # df.info()
    return df

In [5]:
pmir = str_to_cat(pmir)

In [6]:
nmir = str_to_cat(nmir)

In [7]:
def set_col_widths(df):
    cols = df.copy().reset_index().columns
    width_dict = (
        {c: None for c in cols}
        | {c: 22 for c in cols[cols.str.contains('_id')]}
        | {c: 45 for c in cols[cols.str.contains('text')]}
        | {c: 30 for c in cols[cols.str.contains('forms')]}
        | {c: 60 for c in cols[cols.str.contains('_str')]})
    return list(width_dict.values())

In [8]:
print_iter(header = 'POSmirror columns:', iter_obj= pmir.columns.to_list())
print_iter(header = 'NEGmirror columns:', iter_obj= nmir.columns.to_list())


POSmirror columns:
▸ bigram_id
▸ token_str
▸ pattern
▸ category
▸ adv_form
▸ adj_form
▸ text_window
▸ mir_deprel
▸ mir_lemma
▸ adv_lemma
▸ adj_lemma
▸ mir_form
▸ mir_index
▸ adv_index
▸ adj_index
▸ mir_form_lower
▸ adv_form_lower
▸ adj_form_lower
▸ bigram_lower
▸ all_forms_lower
▸ prev_form_lower

NEGmirror columns:
▸ bigram_id
▸ token_str
▸ pattern
▸ category
▸ neg_form
▸ adv_form
▸ adj_form
▸ text_window
▸ neg_deprel
▸ neg_lemma
▸ adv_lemma
▸ adj_lemma
▸ neg_index
▸ adv_index
▸ adj_index
▸ neg_form_lower
▸ adv_form_lower
▸ adj_form_lower
▸ bigram_lower
▸ all_forms_lower
▸ prev_form_lower


In [9]:
def show_sample(df: pd.DataFrame,
                format: str = 'grid',
                limit_cols: bool = True):
    if limit_cols and format != 'pipe':
        col_widths_list = set_col_widths(df)
    else:
        col_widths_list = [None] * len(df.columns)
    print(df.to_markdown(
        floatfmt=',.0f', intfmt=',',
        maxcolwidths=col_widths_list, 
        tablefmt=format
        ))

In [10]:
show_sample(pmir.pattern.value_counts().to_frame(), limit_cols=False, format='pipe')

| pattern      |     count |
|:-------------|----------:|
| pos-mirror-R | 1,364,620 |
| pos-mirror-L |   373,499 |


In [11]:
show_sample(nmir.pattern.value_counts().to_frame(), limit_cols=False, format='pipe')

| pattern      |   count |
|:-------------|--------:|
| neg-mirror-R | 216,900 |
| neg-mirror-L |  77,064 |


In [12]:
REGNOT=r" (n[o']t) "
def embolden(series,
            bold_regex=None):
    bold_regex = re.compile(bold_regex) if bold_regex else REGNOT
    return series.apply(
        lambda x: bold_regex.sub(r' __`\1`__ ', x))
    


## Problem Sentences

The following examples are all from the `POSmirror` data set which should not include any negative triggers. 
I believe the issue may be due to unexpected parses or cases where the negative trigger dependency is indirect or scopes over the identified positive trigger. 

In [13]:
for adv in ['exactly', 'ever', 'necessarily', 'yet']:
    for pat_suff in ['L', 'R']:
        problems = sample_pickle(
            data=pmir, sample_size=6, regex=True, print_sample=False,
            filters=[f'token_str== {REGNOT} .* {adv} ',
                    f'adv_form_lower==^{adv}$', 
                    f'pattern==.*{pat_suff}$'],
            columns=['mir_form_lower', 'bigram_lower', 'text_window', 'token_str'],
            sort_by='all_forms_lower')

        show_sample(
            problems.loc[problems.token_str.str.contains(f'{REGNOT}.*{adv}')].assign(
                token_str=embolden(problems.token_str, f' ({REGNOT}|{adv}) '),
                text_window=embolden(problems.text_window, f' ({REGNOT}|{adv}) ')
            ),
            format='pipe', limit_cols=False)


- *filtering rows...*
  - regex parsing = True
  - Filter expression `token_str==  (n[o']t)  .* exactly ` matched zero rows. Filter not applied.
  - ✓ Applied filter: `adv_form_lower==^exactly$`
  - ✓ Applied filter: `pattern==.*L$`

### 6 random rows matching filter(s) from `input frame`

| hit_id                                   | mir_form_lower   | bigram_lower      | text_window                                                             | token_str                                                                                                                                     |
|:-----------------------------------------|:-----------------|:------------------|:------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------|
| nyt_eng_19990108_0419_27:5-8-9           | someone          | exactly_simpatico | and he is not someon

  problems.loc[problems.token_str.str.contains(f'{REGNOT}.*{adv}')].assign(



- *filtering rows...*
  - regex parsing = True
  - Filter expression `token_str==  (n[o']t)  .* exactly ` matched zero rows. Filter not applied.
  - ✓ Applied filter: `adv_form_lower==^exactly$`
  - ✓ Applied filter: `pattern==.*R$`

### 6 random rows matching filter(s) from `input frame`

| hit_id                                   | mir_form_lower   | bigram_lower   | text_window                                               | token_str                                                                  |
|:-----------------------------------------|:-----------------|:---------------|:----------------------------------------------------------|:---------------------------------------------------------------------------|
| pcc_eng_23_043.3902_x0684965_07:10-14-15 | or               | exactly_right  | this is contradictory , or if it 's __`exactly`__ right . | I do n't know if this is contradictory , or if it 's __`exactly`__ right . |


  problems.loc[problems.token_str.str.contains(f'{REGNOT}.*{adv}')].assign(



- *filtering rows...*
  - regex parsing = True
  - Filter expression `token_str==  (n[o']t)  .* ever ` matched zero rows. Filter not applied.
  - ✓ Applied filter: `adv_form_lower==^ever$`
  - ✓ Applied filter: `pattern==.*L$`

### 6 random rows matching filter(s) from `input frame`

| hit_id                                | mir_form_lower   | bigram_lower   | text_window                                                     | token_str                                                                                                                                                                                                                                         |
|:--------------------------------------|:-----------------|:---------------|:----------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

  problems.loc[problems.token_str.str.contains(f'{REGNOT}.*{adv}')].assign(



- *filtering rows...*
  - regex parsing = True
  - Filter expression `token_str==  (n[o']t)  .* ever ` matched zero rows. Filter not applied.
  - ✓ Applied filter: `adv_form_lower==^ever$`
  - ✓ Applied filter: `pattern==.*R$`

### 6 random rows matching filter(s) from `input frame`

| hit_id                                   | mir_form_lower   | bigram_lower   | text_window                                                          | token_str                                                                                                                                                                                             |
|:-----------------------------------------|:-----------------|:---------------|:---------------------------------------------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| pcc_eng_

  problems.loc[problems.token_str.str.contains(f'{REGNOT}.*{adv}')].assign(



- *filtering rows...*
  - regex parsing = True
  - Filter expression `token_str==  (n[o']t)  .* necessarily ` matched zero rows. Filter not applied.
  - ✓ Applied filter: `adv_form_lower==^necessarily$`
  - ✓ Applied filter: `pattern==.*L$`

### 6 random rows matching filter(s) from `input frame`

| hit_id                                   | mir_form_lower   | bigram_lower          | text_window                                                             | token_str                                                                                                                                                                                                                                                                                                                                                         |
|:-----------------------------------------|:-----------------|:----------------------|:------------------------------------------------------------------------|:--------------------

  problems.loc[problems.token_str.str.contains(f'{REGNOT}.*{adv}')].assign(



- *filtering rows...*
  - regex parsing = True
  - Filter expression `token_str==  (n[o']t)  .* necessarily ` matched zero rows. Filter not applied.
  - ✓ Applied filter: `adv_form_lower==^necessarily$`
  - ✓ Applied filter: `pattern==.*R$`

### 6 random rows matching filter(s) from `input frame`

| hit_id                                   | mir_form_lower   | bigram_lower         | text_window                                                         | token_str                                                                                                                                                                                                                                                                      |
|:-----------------------------------------|:-----------------|:---------------------|:--------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------

  problems.loc[problems.token_str.str.contains(f'{REGNOT}.*{adv}')].assign(



- *filtering rows...*
  - regex parsing = True
  - Filter expression `token_str==  (n[o']t)  .* yet ` matched zero rows. Filter not applied.
  - ✓ Applied filter: `adv_form_lower==^yet$`
  - ✓ Applied filter: `pattern==.*L$`

### 6 random rows matching filter(s) from `input frame`

| hit_id                                   | mir_form_lower   | bigram_lower   | text_window                                                              | token_str                                                                                      |
|:-----------------------------------------|:-----------------|:---------------|:-------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------|
| pcc_eng_03_097.9630_x1569987_08:07-10-11 | something        | yet_unknown    | not even begin until something ( still __`yet`__ unknown to us ) happens | Evolution can not even begin until something ( still _

  problems.loc[problems.token_str.str.contains(f'{REGNOT}.*{adv}')].assign(



- *filtering rows...*
  - regex parsing = True
  - Filter expression `token_str==  (n[o']t)  .* yet ` matched zero rows. Filter not applied.
  - ✓ Applied filter: `adv_form_lower==^yet$`
  - ✓ Applied filter: `pattern==.*R$`

### 6 random rows matching filter(s) from `input frame`

| hit_id                                   | mir_form_lower   | bigram_lower     | text_window                                                           | token_str                                                                                                                                                                                      |
|:-----------------------------------------|:-----------------|:-----------------|:----------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| pcc_eng_14_089.645

  problems.loc[problems.token_str.str.contains(f'{REGNOT}.*{adv}')].assign(


- This could be dealt with by modifying the patterns (i.e. the `WITHOUT` clauses specifically) and rerunning everything, but
  1. There's no telling how long that would take 
  2. verifying its accuracy is difficult
  3. even with 100% accurate patterns for *correct* parses, there is no way to prevent or really even predict all possible *mis*parses
- So there is a better way: 
  
  The preponderance of positive data provides a large margin for additional data exclusions without unbalancing the samples---in fact, 
  it actually brings `[POSMIR,f1]` _closer_ to the negative sample size, `[NEGMIR, f1]`.

  Therefore, it is possible to simply drop anything with a likely negation preceding the bigram, 
  regardless of the polarity environment the particular syntactic configuration creates, and call it a day.


In [14]:
pmir['adv_index'] = pd.to_numeric(pmir.index.to_series().str.split(':').str.get(-1).apply(lambda i: re.search(r'-(\d+)-', i).group().strip('-')), downcast='unsigned')
pmir['preceding_text'] = pmir.apply(lambda x: ' '.join(x.token_str.split()[:x.adv_index - 1]), axis='columns').astype('string')

In [15]:
show_sample(pmir[['preceding_text', 'bigram_lower', 'token_str']].sample(5))

+------------------------+-----------------------------------------------+----------------+-------------------------------------------------------------+
| hit_id                 | preceding_text                                | bigram_lower   | token_str                                                   |
| pcc_eng_24_020.1972_x0 | NG : Borrisokane wanted to release a new EP , | more_special   | NG : Borrisokane wanted to release a new EP , but rather    |
| 310014_16:27-28-29     | but rather than just record something and put |                | than just record something and put it out , they wanted to  |
|                        | it out , they wanted to make it something     |                | make it something more special and more involved with the   |
|                        |                                               |                | Austin music community .                                    |
+------------------------+-----------------------------------------------+--

In [16]:
show_sample(pmir[['preceding_text', 'bigram_lower', 'token_str']].sample(5), format='pipe')

| hit_id                                   | preceding_text                                                                                                                                                           | bigram_lower    | token_str                                                                                                                                                                                                                                                                                                                                                   |
|:-----------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:----------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [22]:
pmir['after_neg'] = pmir.preceding_text.str.lower().str.contains(r"\b(no|n[o']t|no(body| one|thing|where)|(rare|scarce|bare|hard)ly|seldom|without|never)\b", regex=True)
show_sample(pmir.loc[pmir.after_neg, ['preceding_text', 'bigram_lower', 'token_str']].sample(10))

  pmir['after_neg'] = pmir.preceding_text.str.lower().str.contains(r"\b(no|n[o']t|no(body| one|thing|where)|(rare|scarce|bare|hard)ly|seldom|without|never)\b", regex=True)


+------------------------+-----------------------------------------------+-----------------+--------------------------------------------------------------+
| hit_id                 | preceding_text                                | bigram_lower    | token_str                                                    |
| pcc_eng_19_018.1666_x0 | Naturalness , to physicists , means no ( or   | very_little     | Naturalness , to physicists , means no ( or very little )    |
| 276969_22:09-10-11     |                                               |                 | fine tuning .                                                |
+------------------------+-----------------------------------------------+-----------------+--------------------------------------------------------------+
| pcc_eng_14_075.7211_x1 | I still love the genre , but I ca n't read or | very_domestic   | I still love the genre , but I ca n't read or watch anything |
| 208178_056:20-21-22    | watch anything that involves kids or 

In [23]:
some_neg_ex = pmir.loc[pmir.after_neg, ['preceding_text', 'bigram_lower', 'token_str']].sample(6)
show_sample(some_neg_ex.assign(
    preceding_text=embolden(some_neg_ex.preceding_text, 
                            f' ({REGNOT}|nobody|nothing|never|none|no) ')
    ), format='pipe')

| hit_id                                    | preceding_text                                                                                                                                                                                          | bigram_lower    | token_str                                                                                                                                                                                                                                                     |
|:------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:----------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [25]:
print(f'* ${pmir.after_neg.value_counts()[False]:,}$ tokens in `POSmirror` hits not preceded by negation')
print('  > - I.e. what would remain if _all_ potential contaminants were excluded')
print(f'  > - _{pmir.after_neg.value_counts()[True]:,}_ potential exclusions')
print(f'* ${len(nmir):,}$ tokens in `NEGmirror` hits')
print(f'* Remaining Sample Size Discrepancy: ${pmir.after_neg.value_counts()[False] - len(nmir):,}$')

* $1,487,458$ tokens in `POSmirror` hits not preceded by negation
  > - I.e. what would remain if _all_ potential contaminants were excluded
  > - _250,661_ potential exclusions
* $293,964$ tokens in `NEGmirror` hits
* Remaining Sample Size Discrepancy: $1,193,494$


## Effect of Negation Removals



### For 868+ frequency filtered ad* forms 
  
  _Without considering any upper case_
  * ~~__1,457,913__ tokens in `POSmirror` hits not preceded by negation~~
      * ~~I.e. what would remain if _all_ potential contaminants were excluded~~
      * ~~_217,588_ potential exclusions~~
  ---
  _Without considering fully upper case triggers_
  * ~~__1,460,126__ tokens in `POSmirror` hits not preceded by negation~~
  * ~~I.e. what would remain if _all_ potential contaminants were excluded~~
  * ~~_215,375_ potential exclusions~~
  ---
  _Normalized for case first, but not catching negation at very end of preceding text (no whitespace following)_
  * ~~**1,459,568** tokens in `POSmirror` hits not preceded by negation~~
  > - ~~I.e. what would remain if _all_ potential contaminants were excluded~~
  > - ~~_215,933_ potential exclusions~~
  * ~~Updated difference in hit subtotals: **1,174,133**~~
  * $285,435$ tokens in `NEGmirror` hits
  ---
  **_Fixed to catch even `preceding_text` final negative triggers_**
  * ~~**1,455,547** tokens in `POSmirror` hits not preceded by negation~~
  > - ~~I.e. what would remain if _all_ potential contaminants were excluded~~
  > - ~~_219,954_ potential exclusions~~
  * $285,435$ tokens in `NEGmirror` hits
  * ~~Remaining Sample Size Discrepancy: **1,170,112**~~
  ---
  **Strengthened even furthre to catch negative adverbs and "without" and triggers at the _beginning_ of the `preceding_text`**
  * $1,434,420$ tokens in `POSmirror` hits not preceded by negation
  > - I.e. what would remain if _all_ potential contaminants were excluded
  > - _241,081_ potential exclusions
  * $285,435$ tokens in `NEGmirror` hits
  * Remaining Sample Size Discrepancy: $1,148,985$



### For 7+ frequency filtered ad\* forms

  * $1,487,458$ tokens in `POSmirror` hits not preceded by negation
    > - I.e. what would remain if _all_ potential contaminants were excluded
    > - _250,661_ potential exclusions
  * $293,964$ tokens in `NEGmirror` hits
  * Remaining Sample Size Discrepancy: $1,193,494$


In [33]:
enforced_pos= pmir.loc[~pmir.after_neg, :'preceding_text']
enforced_pos.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1487458 entries, apw_eng_19941111_0004_1:14-15-16 to pcc_eng_26_108.10246_x1747457_06:6-8-9
Data columns (total 22 columns):
 #   Column           Non-Null Count    Dtype   
---  ------           --------------    -----   
 0   bigram_id        1487458 non-null  category
 1   token_str        1487458 non-null  string  
 2   pattern          1487458 non-null  category
 3   category         1487458 non-null  category
 4   adv_form         1487458 non-null  category
 5   adj_form         1487458 non-null  category
 6   text_window      1487458 non-null  string  
 7   mir_deprel       1487458 non-null  category
 8   mir_lemma        1487458 non-null  category
 9   adv_lemma        1487458 non-null  category
 10  adj_lemma        1487458 non-null  category
 11  mir_form         1487458 non-null  category
 12  mir_index        1487458 non-null  UInt16  
 13  adv_index        1487458 non-null  uint8   
 14  adj_index        1487458 non-null  UInt16

In [27]:
adv = 'exactly'
new_exactly_ex = sample_pickle(
    data=enforced_pos,
    print_sample=False, sample_size=10,
    columns=['all_forms_lower', 'token_str'],
    filters=[f'adv_form_lower=={adv}'],
)

show_sample(new_exactly_ex.assign(token_str=embolden(new_exactly_ex.token_str, r' (exactly) ')))


- *filtering rows...*
  - regex parsing = False
  - ✓ Applied filter: `adv_form_lower==exactly`

### 10 random rows matching filter(s) from `input frame`

+------------------------+------------------------------+--------------------------------------------------------------+
| hit_id                 | all_forms_lower              | token_str                                                    |
| pcc_eng_01_007.5031_x0 | everything_exactly_right     | Everything from the frieze boards on the exterior to the     |
| 105039_16:01-17-18     |                              | window trim on the interior is __`exactly`__ right for the   |
|                        |                              | space .                                                      |
+------------------------+------------------------------+--------------------------------------------------------------+
| pcc_eng_06_077.3579_x1 | everyone_exactly_alike       | I grew up in an environment where there were very few people

In [28]:
for pat_suff in ['R', 'L']:
    new_exactly_ex = sample_pickle(
        data=enforced_pos, sample_size=8,
        print_sample=False, sort_by='adj_form_lower',
        columns=['all_forms_lower', 'text_window', 'token_str'],
        filters=[f'adv_form_lower=={adv}', 
                f'pattern==pos-mirror-{pat_suff}'],
    )

    show_sample(new_exactly_ex.assign(
        text_window=embolden(new_exactly_ex.text_window, f' ({adv}) '),
        token_str=embolden(new_exactly_ex.token_str, f' ({adv}) ')
    ), format='pipe')


- *filtering rows...*
  - regex parsing = False
  - ✓ Applied filter: `adv_form_lower==exactly`
  - ✓ Applied filter: `pattern==pos-mirror-R`

### 8 random rows matching filter(s) from `input frame`

| hit_id                                    | all_forms_lower             | text_window                                                                                                                 | token_str                                                                                                                                                                                                                                                                                                  |
|:------------------------------------------|:----------------------------|:----------------------------------------------------------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------

## Remove Duplicated `text_window`+`all_forms_lower`

In [34]:
enforced_pos['utt_len']= pd.to_numeric(enforced_pos.token_str.apply(lambda x: int(len(x.split()))), downcast='integer')

In [71]:
dups_token_str = enforced_pos.loc[enforced_pos.duplicated(subset=['token_str', 'all_forms_lower']), ['all_forms_lower', 'token_str','text_window', 'utt_len']]
dups_text_window = enforced_pos.loc[enforced_pos.duplicated(subset=['text_window', 'all_forms_lower']), ['all_forms_lower', 'token_str', 'text_window','utt_len']]
dups_both = enforced_pos.loc[enforced_pos.duplicated(subset=['text_window', 'token_str', 'all_forms_lower']), ['all_forms_lower', 'token_str','text_window', 'utt_len']]
# show_sample(dups.loc[dups.utt_len>=80, :].sort_values(['utt_len', 'token_str']).head(6))
print_iter([f'token_str:   {len(dups_token_str):,}', 
            f'text_window: {len(dups_text_window):,}',
            f'both:        {len(dups_both):,}'], header='Potental Removals (no `utt_len` filter applied)')


Potental Removals (no `utt_len` filter applied)
▸ token_str:   15,359
▸ text_window: 36,745
▸ both:        14,827


In [81]:
print(f'`text_window` duplicates when restricted to 20+ tokens in `token_str`: {len(dups_text_window.loc[dups_text_window.utt_len>=20, :]):,}')

`text_window` duplicates when restricted to 20+ tokens in `token_str`: 16,479


These duplicates are retained since it's too messy to separate the clearly carbon copy utterances from plausible genuine production. 

Highly Suspicious

|     | text_window                                               | \#tokens in sentence | \#duplications |
|----:|:----------------------------------------------------------|---------------------:|---------------:|
|   7 | _Everything you see here is absolutely FREE to watch ._   |                   10 |             60 |
|  17 | _All of Swedenborg 's works are well worth reading ._     |                   10 |             43 |
|  60 | _2 fig Something quintessentially Canadian ._             |                    6 |             20 |
|  65 | _Everybody is Super Heady in our sandbox ._               |                    8 |             19 |
| 119 | _Because sometimes , 140 characters just is n't enough ._ |                   10 |             13 |
| 143 | _" Or maybe stupid , " Ebenezar countered_                |                    9 |             11 |
| 188 | _Because Sometimes 140 Characters Just Is n't Enough_     |                    8 |              9 |
| 283 | _There 's something very wrong with our pterosaurs ._     |                    9 |              7 |
| 290 | _The plaid decoration is all very good ._                 |                    8 |              7 |
| 482 | _wrap up something as special as she is [...]_            |                    9 |              5 |

Plausible Production

|    | text_window                                    | \#tokens in sentence | \#duplications |
|---:|:-----------------------------------------------|---------------------:|---------------:|
|  2 | _Something was n't right ._                    |                    5 |            118 |
|  3 | _But I 'm sure everything will be just fine ._ |                   10 |            102 |
|  8 | _It 's all very confusing ._                   |                    6 |             56 |
|  6 | _We should all be so lucky ._                  |                    7 |             69 |
| 10 | _It 's all very exciting ._                    |                    6 |             54 |
| 19 | _Something is definitely wrong ._              |                    5 |             41 |
| 20 | _Looking for something more specific ?_        |                    6 |             40 |
| 25 | _Both are equally important ._                 |                    5 |             33 |
| 31 | _My cup is always half full ._                 |                    7 |             30 |
| 39 | _If only it were all so simple !_              |                    9 |             24 |
| 48 | _Everything is not awesome ._                  |                    5 |             23 |



In [91]:
dups_text_window.loc[(dups_text_window.utt_len < 20), :].value_counts(['text_window', 'utt_len']).to_frame().reset_index().sort_values(['count','text_window', ], ascending=False)

Unnamed: 0,text_window,utt_len,count
0,Some of your changes are now live .,8,384
1,And now for something completely different .,7,122
2,Something was n't right .,5,118
3,But I 'm sure everything will be just fine .,10,102
4,And now for something completely different ...,7,75
...,...,...,...
7953,""" And now for something completely different . """,17,1
7950,""" All of us at Whataburger are so happy to get...",16,1
7949,""" All along I have been completely consistent ...",18,1
8954,""" Abusers are often very adept at identifying ...",10,1


In [93]:
def weed_windows(df):
    return pd.concat(
        [df.loc[df.utt_len < 20, :],
         df.loc[df.utt_len >= 20, :].drop_duplicates(
            subset=['text_window', 'all_forms_lower'])]
    )

In [101]:
enforced_pos.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1487458 entries, apw_eng_19941111_0004_1:14-15-16 to pcc_eng_26_108.10246_x1747457_06:6-8-9
Data columns (total 23 columns):
 #   Column           Non-Null Count    Dtype   
---  ------           --------------    -----   
 0   bigram_id        1487458 non-null  category
 1   token_str        1487458 non-null  string  
 2   pattern          1487458 non-null  category
 3   category         1487458 non-null  category
 4   adv_form         1487458 non-null  category
 5   adj_form         1487458 non-null  category
 6   text_window      1487458 non-null  string  
 7   mir_deprel       1487458 non-null  category
 8   mir_lemma        1487458 non-null  category
 9   adv_lemma        1487458 non-null  category
 10  adj_lemma        1487458 non-null  category
 11  mir_form         1487458 non-null  category
 12  mir_index        1487458 non-null  UInt16  
 13  adv_index        1487458 non-null  uint8   
 14  adj_index        1487458 non-null  UInt16

In [102]:
new_pmir = weed_windows(enforced_pos)
print(f'{len(new_pmir):,} hits remaining in `POSmirror` set after additional filtering', 
      f'({len(enforced_pos) - len(new_pmir):,} hits removed as duplicates.)', 
      sep='\n')
# show_sample(new_pmir.sample(6)[['all_forms_lower', 'text_window']])

1,472,077 hits remaining in `POSmirror` set after additional filtering
(15,381 hits removed as duplicates.)


In [103]:
new_pmir.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1472077 entries, apw_eng_19941111_0090_14:09-12-13 to pcc_eng_26_108.10246_x1747457_06:6-8-9
Data columns (total 23 columns):
 #   Column           Non-Null Count    Dtype   
---  ------           --------------    -----   
 0   bigram_id        1472077 non-null  category
 1   token_str        1472077 non-null  string  
 2   pattern          1472077 non-null  category
 3   category         1472077 non-null  category
 4   adv_form         1472077 non-null  category
 5   adj_form         1472077 non-null  category
 6   text_window      1472077 non-null  string  
 7   mir_deprel       1472077 non-null  category
 8   mir_lemma        1472077 non-null  category
 9   adv_lemma        1472077 non-null  category
 10  adj_lemma        1472077 non-null  category
 11  mir_form         1472077 non-null  category
 12  mir_index        1472077 non-null  UInt16  
 13  adv_index        1472077 non-null  uint8   
 14  adj_index        1472077 non-null  UInt1

### Add `trigger_lower` column to `POSmirror` table

In [104]:
new_pmir = new_pmir.loc[:, :'all_forms_lower']
new_pmir['trigger_lower'] = new_pmir['mir_form_lower'].astype('category')
new_pmir.columns

Index(['bigram_id', 'token_str', 'pattern', 'category', 'adv_form', 'adj_form',
       'text_window', 'mir_deprel', 'mir_lemma', 'adv_lemma', 'adj_lemma',
       'mir_form', 'mir_index', 'adv_index', 'adj_index', 'mir_form_lower',
       'adv_form_lower', 'adj_form_lower', 'bigram_lower', 'all_forms_lower',
       'trigger_lower'],
      dtype='object')

In [105]:
new_path = path_dict['POSmirror'].with_name('LimitedPOS-'+path_dict['POSmirror'].name)

if not new_path.is_file():
    
    new_pmir.loc[:, ].to_pickle(new_path)
    print(f'Updated `POSmirror` hits dataframe saved as:\ \n  `{new_path}`')
else: 
    print(f'Updated `POSmirror` hits dataframe already exists:\ \n  `{new_path}`')
    print('\n```shell')
    !ls -ho {new_path}
    print('```')
    

Updated `POSmirror` hits dataframe saved as:\ 
  `/share/compling/data/sanpi/4_post-processed/POSmirror/LimitedPOS-trigger-bigrams_frq-thrMIN-7.35f.pkl.gz`


## Remove Duplication from `NEGmirror` as well

In [106]:
nmir.info()

<class 'pandas.core.frame.DataFrame'>
Index: 293964 entries, pcc_eng_11_001.0326_x0000513_088:14-15-16 to pcc_eng_10_108.10108_x1747378_18:7-8-9
Data columns (total 21 columns):
 #   Column           Non-Null Count   Dtype   
---  ------           --------------   -----   
 0   bigram_id        293964 non-null  category
 1   token_str        293964 non-null  string  
 2   pattern          293964 non-null  category
 3   category         293964 non-null  category
 4   neg_form         293964 non-null  category
 5   adv_form         293964 non-null  category
 6   adj_form         293964 non-null  category
 7   text_window      293964 non-null  string  
 8   neg_deprel       293964 non-null  category
 9   neg_lemma        293964 non-null  category
 10  adv_lemma        293964 non-null  category
 11  adj_lemma        293964 non-null  category
 12  neg_index        293964 non-null  UInt16  
 13  adv_index        293964 non-null  UInt16  
 14  adj_index        293964 non-null  UInt16  
 15  n

In [108]:
nmir['utt_len']= nmir.token_str.apply(lambda x: len(x.split()))
new_nmir = weed_windows(nmir)
new_nmir = new_nmir.loc[:, :'all_forms_lower']
new_nmir['trigger_lower'] = new_nmir.neg_form_lower.astype('category')
show_sample(new_nmir.sample(4)[['trigger_lower', 'all_forms_lower', 'text_window']])

+------------------------+-----------------+-----------------------------+---------------------------------------------+
| hit_id                 | trigger_lower   | all_forms_lower             | text_window                                 |
| pcc_eng_24_084.7054_x1 | never           | never_more_robust           | all American cities have never been more    |
| 353971_6:15-17-18      |                 |                             | robust , more vibrant and                   |
+------------------------+-----------------+-----------------------------+---------------------------------------------+
| pcc_eng_28_004.8344_x0 | none            | none_particularly_patriotic | None of us have been particularly patriotic |
| 061988_30:1-6-7        |                 |                             | up until now ,                              |
+------------------------+-----------------+-----------------------------+---------------------------------------------+
| pcc_eng_03_033.4509_x0 | never

In [111]:
print(f'\n* {len(nmir):,} original hits in `NEGmirror` (`{path_dict["NEGmirror"].relative_to(POST_PROC_DIR)}`)')
print(f'* {len(new_nmir):,} hits remaining in `NEGmirror` set after additional filtering of duplicate hits')
print(f'  ({len(nmir) - len(new_nmir):,} hits removed as duplicates.)', 
      sep='\n')


* 293,964 original hits in `NEGmirror` (`NEGmirror/trigger-bigrams_frq-thrMIN-7.35f.pkl.gz`)
* 289,776 hits remaining in `NEGmirror` set after additional filtering of duplicate hits
  (4,188 hits removed as duplicates.)


In [112]:
new_path = path_dict['NEGmirror'].with_name('LimitedNEG-'+path_dict['NEGmirror'].name)
if not new_path.is_file():
    
    new_nmir.to_pickle(new_path)
    print(f'Updated `NEGmirror` hits dataframe saved as:\ \n  `{new_path}`')
else: 
    print(f'Updated `NEGmirror` hits dataframe already exists:\ \n  `{new_path}`')
    print('\n```shell')
    !ls -ho {new_path}
    print('```')

Updated `NEGmirror` hits dataframe saved as:\ 
  `/share/compling/data/sanpi/4_post-processed/NEGmirror/LimitedNEG-trigger-bigrams_frq-thrMIN-7.35f.pkl.gz`


## After Removing Additional Duplication

* $1,472,077$ hits remaining in `POSmirror` set after additional filtering\
  (15,381 hits removed as duplicates.)

* $289,776$ hits remaining in `NEGmirror` set after additional filtering of duplicate hits\
  (4,188 hits removed as duplicates.)
