In [110]:
from source.utils.general import HIT_TABLES_DIR
from source.utils.dataframes import NEG_REGEX
from source.utils.am_notebooks import nb_show_table, embolden
import pandas as pd


In [129]:
neq_dirs = tuple(HIT_TABLES_DIR.glob('*/NEQ*sample*.parq'))


def show_hit_sample(parq_path, n=1, part='PccVa', transpose=True, show_following_neg=False, head=None):
    _sample = pd.read_parquet(parq_path,
                              engine='pyarrow', filters=[('part', '==', part)])
    if show_following_neg:
        _sample = _sample.loc[_sample.token_str.str.contains(NEG_REGEX), :]
    if head: 
        _sample = _sample.loc[(_sample.filter(regex=r'[rg]_head').iloc[:, 0]) == head, :]
    _sample = _sample.sample(n)
    for text_col in _sample.filter(['text_window', 'hit_text', 'token_str', 'sent_text']).columns:
        _formatted = '*' + embolden(_sample[text_col], mono=False,
                              bold_regex=f"({_sample.filter(like='adv_form').iat[0,0]} {_sample.filter(like='adj_form').iat[0,0]})") + '*'
        if 'trigger_lower' in _sample.columns:
            trigger_lower = _sample.trigger_lower.squeeze()
            trigger_regex = r'|'.join(
                [trigger_lower, trigger_lower.capitalize(), trigger_lower.upper()])
            _formatted = _formatted.replace(
                r'\b('+trigger_regex+r')\b', r'`\1`', regex=True)
        _sample[text_col] = _formatted
    if n == 1 and transpose:
        # print(_sample.filter(['all_forms_lower', 'bigram_lower','bigram']).iat[0,0])
        hit_label = f": *{_sample.filter(['all_forms_lower', 'bigram_lower','bigram']).iat[0,0]}*".replace(
            '_', ' ')
    else:
        hit_label = ''
    nb_show_table(
        _sample, transpose=transpose, italics=False,
        title=f'\nSample Hit from `{parq_path.relative_to(HIT_TABLES_DIR)}`{hit_label}\n')
    return _sample

In [112]:
samples = [show_hit_sample(neq_dirs[0])]


Sample Hit from `POSmirror/NEQ-POSmirror_final_sample.24072400.parq`: *something less expensive*

|                           | `pcc_eng_val_3.05286_x43129_13:4-7-8`                                                                                                                                                                                                                      |
|:--------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| **`bigram`**              | less_expensive                                                                                                                                                                                                                                             |
| **`sent_text`**           | *If you want `somethin

Sample Hit from `POSmirror/NEQ-POSmirror_final_sample.24072400.parq`: *something so crucial*

|                       | `pcc_eng_val_3.11224_x52656_18:25-26-27`                                                                                                                                   |
|:----------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| **`bigram`**          | so_crucial                                                                                                                                                                 |
| **`sent_text`**       | Even though I had found it in the "household kitchen faucet" section, it was incredibly ignorant to proceed blindly into purchasing `something` __so crucial__.            |
| **`adv_form`**        | so                                                                                                                                                                         |
| **`adj_form`**        | crucial                                                                                                                                                                    |
| **`hit_text`**        | `something` __so crucial__ .                                                                                                                                               |
| **`text_window`**     | ignorant to proceed blindly into purchasing `something` __so crucial__ .                                                                                                   |
| **`sent_id`**         | pcc_eng_val_3.11224_x52656_18                                                                                                                                              |
| **`match_id`**        | 25-26-27                                                                                                                                                                   |
| **`bigram_id`**       | pcc_eng_val_3.11224_x52656_18:26-27                                                                                                                                        |
| **`token_str`**       | Even though I had found it in the " household kitchen faucet " section , it was incredibly ignorant to proceed blindly into purchasing `something` __so crucial__ .        |
| **`lemma_str`**       | *even though I have find it in the " household kitchen faucet " section , it be incredibly ignorant to proceed blindly into purchase something so crucial .*               |
| **`mod_deprel`**      | advmod                                                                                                                                                                     |
| **`mod_head`**        | ADJ                                                                                                                                                                        |
| **`mir_deprel`**      | amod                                                                                                                                                                       |
| **`mir_head`**        | MIR                                                                                                                                                                        |
| **`mir_lemma`**       | something                                                                                                                                                                  |
| **`adv_lemma`**       | so                                                                                                                                                                         |
| **`adj_lemma`**       | crucial                                                                                                                                                                    |
| **`mir_form`**        | something                                                                                                                                                                  |
| **`mir_index`**       | 24                                                                                                                                                                         |
| **`adv_index`**       | 25                                                                                                                                                                         |
| **`adj_index`**       | 26                                                                                                                                                                         |
| **`dep_mod`**         | *{'node': 'mod', 'contiguous': True, 'relation': 'advmod', 'head': {'lemma': 'crucial', 'ix': 26, 'xpos': 'JJ'}, 'target': {'lemma': 'so', 'ix': 25, 'xpos': 'RB'}}*       |
| **`dep_mir`**         | *{'node': 'mir', 'contiguous': False, 'relation': 'amod', 'head': {'lemma': 'something', 'ix': 24, 'xpos': 'NN'}, 'target': {'lemma': 'crucial', 'ix': 26, 'xpos': 'JJ'}}* |
| **`mir_form_lower`**  | something                                                                                                                                                                  |
| **`adv_form_lower`**  | so                                                                                                                                                                         |
| **`adj_form_lower`**  | crucial                                                                                                                                                                    |
| **`utt_len`**         | 28                                                                                                                                                                         |
| **`json_source`**     | /share/compling/data/sanpi/1_json_grew-matches/POSmirror/bigram-PccVa.pos-mirror-L/BIGRAM.pcc_eng_val-03.json                                                              |
| **`bigram_lower`**    | so_crucial                                                                                                                                                                 |
| **`all_forms_lower`** | something_so_crucial                                                                                                                                                       |
| **`pattern`**         | pos-mirror-L                                                                                                                                                               |
| **`category`**        | POSmirror                                                                                                                                                                  |
| **`trigger_lower`**   | something                                                                                                                                                                  |
| **`trigger_lemma`**   | something                                                                                                                                                                  |
| **`dep_distance`**    | 1                                                                                                                                                                          |
| **`window_len`**      | 10                                                                                                                                                                         |
| **`quarantine`**      | False                                                                                                                                                                      |
| **`id_prefix`**       | pcc_eng_val_3                                                                                                                                                              |
| **`part`**            | PccVa                                                                                                                                                                      |




Sample Hit from `POSmirror/NEQ-POSmirror_final_sample.24072400.parq`: *sometimes really enough*

|                       | `pcc_eng_val_3.03145_x39665_10:1-5-6`                                                                                                                                     |
|:----------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| **`bigram`**          | really_enough                                                                                                                                                             |
| **`sent_text`**       | *`Sometimes`, it is __really enough__ to not say much about your reasons and let your decision speak for itself.*                                                         |
| **`adv_form`**        | really                                                                                                                                                                    |
| **`adj_form`**        | enough                                                                                                                                                                    |
| **`hit_text`**        | *`Sometimes` , it is __really enough__ to not say much*                                                                                                                   |
| **`text_window`**     | *`sometimes` , it is __really enough__ to not say much about your*                                                                                                        |
| **`sent_id`**         | pcc_eng_val_3.03145_x39665_10                                                                                                                                             |
| **`match_id`**        | 1-5-6                                                                                                                                                                     |
| **`bigram_id`**       | pcc_eng_val_3.03145_x39665_10:5-6                                                                                                                                         |
| **`token_str`**       | *`Sometimes` , it is __really enough__ to not say much about your reasons and let your decision speak for itself .*                                                       |
| **`lemma_str`**       | *sometimes , it be really enough to not say much about you reason and let you decision speak for itself .*                                                                |
| **`mod_deprel`**      | advmod                                                                                                                                                                    |
| **`mod_head`**        | ADJ                                                                                                                                                                       |
| **`mir_deprel`**      | advmod                                                                                                                                                                    |
| **`mir_head`**        | ADJ                                                                                                                                                                       |
| **`mir_lemma`**       | sometimes                                                                                                                                                                 |
| **`adv_lemma`**       | really                                                                                                                                                                    |
| **`adj_lemma`**       | enough                                                                                                                                                                    |
| **`mir_form`**        | Sometimes                                                                                                                                                                 |
| **`mir_index`**       | 0                                                                                                                                                                         |
| **`adv_index`**       | 4                                                                                                                                                                         |
| **`adj_index`**       | 5                                                                                                                                                                         |
| **`dep_mod`**         | *{'node': 'mod', 'contiguous': True, 'relation': 'advmod', 'head': {'lemma': 'enough', 'ix': 5, 'xpos': 'JJ'}, 'target': {'lemma': 'really', 'ix': 4, 'xpos': 'RB'}}*     |
| **`dep_mir`**         | *{'node': 'mir', 'contiguous': False, 'relation': 'advmod', 'head': {'lemma': 'enough', 'ix': 5, 'xpos': 'JJ'}, 'target': {'lemma': 'sometimes', 'ix': 0, 'xpos': 'RB'}}* |
| **`mir_form_lower`**  | sometimes                                                                                                                                                                 |
| **`adv_form_lower`**  | really                                                                                                                                                                    |
| **`adj_form_lower`**  | enough                                                                                                                                                                    |
| **`utt_len`**         | 21                                                                                                                                                                        |
| **`json_source`**     | /share/compling/data/sanpi/1_json_grew-matches/POSmirror/bigram-PccVa.pos-mirror-R/BIGRAM.pcc_eng_val-03.json                                                             |
| **`bigram_lower`**    | really_enough                                                                                                                                                             |
| **`all_forms_lower`** | sometimes_really_enough                                                                                                                                                   |
| **`pattern`**         | pos-mirror-R                                                                                                                                                              |
| **`category`**        | POSmirror                                                                                                                                                                 |
| **`trigger_lower`**   | sometimes                                                                                                                                                                 |
| **`trigger_lemma`**   | sometimes                                                                                                                                                                 |
| **`dep_distance`**    | 4                                                                                                                                                                         |
| **`window_len`**      | 12                                                                                                                                                                        |
| **`quarantine`**      | False                                                                                                                                                                     |
| **`id_prefix`**       | pcc_eng_val_3                                                                                                                                                             |
| **`part`**            | PccVa                                                                                                                                                                     |


In [113]:
samples.append(show_hit_sample(neq_dirs[-1], show_following_neg=True))


Sample Hit from `not-RBdirect/NEQ_not-RBdirect_sample.24080200.parq`: *increasingly hip*

|                      | `pcc_eng_val_3.01634_x37205_09:4-5`                                                                                                                                                    |
|:---------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| **`adv_form`**       | increasingly                                                                                                                                                                           |
| **`adj_form`**       | hip                                                                                                                                                                                    |
| **`text_window`**    | *people are becoming __increasingly hip__ ( 

  _sample = _sample.loc[_sample.token_str.str.contains(NEG_REGEX), :]



Sample Hit from `not-RBdirect/NEQ_not-RBdirect_sample.24080200.parq`

|                      | `pcc_eng_val_2.01745_x19038_13:24-25`                                                                                                                         |
|:---------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------|
| **`adv_form`**       | pretty                                                                                                                                                        |
| **`adj_form`**       | rare                                                                                                                                                          |
| **`text_window`**    | , female- focused surf films are still __pretty rare__ .                                                                                                      |
| **`token_str`**      | Despite the huge population of women surfers that grace virtually every break on the planet nowadays , female- focused surf films are still __pretty rare__ . |
| **`adv_lemma`**      | pretty                                                                                                                                                        |
| **`adj_lemma`**      | rare                                                                                                                                                          |
| **`adv_index`**      | 23                                                                                                                                                            |
| **`utt_len`**        | 26                                                                                                                                                            |
| **`adv_form_lower`** | pretty                                                                                                                                                        |
| **`adj_form_lower`** | rare                                                                                                                                                          |
| **`bigram_lower`**   | pretty_rare                                                                                                                                                   |
| **`window_len`**     | 10                                                                                                                                                            |
| **`chunk`**          | 1                                                                                                                                                             |
| **`id_prefix`**      | pcc_eng_val_2                                                                                                                                                 |
| **`part`**           | PccVa                                                                                                                                                         |




Sample Hit from `not-RBdirect/NEQ_not-RBdirect_sample.24080200.parq`: *so many*

|                      | `pcc_eng_val_1.4949_x08088_60:09-10`                                                                                                                                      |
|:---------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| **`adv_form`**       | so                                                                                                                                                                        |
| **`adj_form`**       | many                                                                                                                                                                      |
| **`text_window`**    | *teammates do tease brissett a bit about __so many__ expectations being placed at his feet*                                                                               |
| **`token_str`**      | *While teammates do tease Brissett a bit about __so many__ expectations being placed at his feet , Hines said that Brissett does n't seem to feel any sort of pressure .* |
| **`adv_lemma`**      | so                                                                                                                                                                        |
| **`adj_lemma`**      | many                                                                                                                                                                      |
| **`adv_index`**      | 8                                                                                                                                                                         |
| **`utt_len`**        | 31                                                                                                                                                                        |
| **`adv_form_lower`** | so                                                                                                                                                                        |
| **`adj_form_lower`** | many                                                                                                                                                                      |
| **`bigram_lower`**   | so_many                                                                                                                                                                   |
| **`window_len`**     | 15                                                                                                                                                                        |
| **`chunk`**          | 1                                                                                                                                                                         |
| **`id_prefix`**      | pcc_eng_val_1                                                                                                                                                             |
| **`part`**           | PccVa                                                                                                                                                                     |

In [124]:
show_hit_sample(HIT_TABLES_DIR.joinpath('RBdirect/ALL-RBdirect_final.parq'))


Sample Hit from `RBdirect/ALL-RBdirect_final.parq`: *n't especially glamorous*

|                          | `pcc_eng_val_3.02703_x38921_190:7-8-9`                                                                                                                                                                                                                                                                                                   |
|:-------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| **`bigram`**             | especially_glamorous                                                                                                                                                      

Unnamed: 0_level_0,bigram,sent_text,neg_form,adv_form,adj_form,hit_text,text_window,sent_id,match_id,bigram_id,...,bigram_lower,neg_form_lower,trigger_lower,trigger_lemma,all_forms_lower,dep_distance,window_len,quarantine,id_prefix,part
hit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
pcc_eng_val_3.02703_x38921_190:7-8-9,especially_glamorous,*This is another job that isn't __especially g...,n't,especially,glamorous,"*`n't` __especially glamorous__ , but it 's*",*this is another job that is `n't` __especiall...,pcc_eng_val_3.02703_x38921_190,7-8-9,pcc_eng_val_3.02703_x38921_190:8-9,...,especially_glamorous,n't,n't,not,n't_especially_glamorous,1,15,False,pcc_eng_val_3,PccVa


In [126]:
show_hit_sample(HIT_TABLES_DIR.joinpath('NEGmirror/ALL-NEGmirror_final.parq'))


Sample Hit from `NEGmirror/ALL-NEGmirror_final.parq`: *never quite comfortable*

|                           | `pcc_eng_val_3.04925_x42562_13:08-09-10`                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 |
|:--------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Unnamed: 0_level_0,bigram,sent_text,neg_form,adv_form,adj_form,hit_text,text_window,sent_id,match_id,bigram_id,...,all_forms_lower,pattern,category,trigger_lower,trigger_lemma,dep_distance,window_len,quarantine,id_prefix,part
hit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
pcc_eng_val_3.04925_x42562_13:08-09-10,quite_comfortable,"*For whatever reason, little Ted was `never` _...",never,quite,comfortable,*`never` __quite comfortable__ about being bor...,"*whatever reason , little ted was `never` __qu...",pcc_eng_val_3.04925_x42562_13,08-09-10,pcc_eng_val_3.04925_x42562_13:09-10,...,never_quite_comfortable,neg-mirror-R,NEGmirror,never,never,1,13,False,pcc_eng_val_3,PccVa


In [135]:
show_hit_sample(HIT_TABLES_DIR.joinpath('POSmirror/ALL-POSmirror_final.parq'), head='MIR').T


Sample Hit from `POSmirror/ALL-POSmirror_final.parq`: *something fairly bad*

|                           | `pcc_eng_val_1.9883_x16019_07:10-11-12`                                                                                                                               |
|:--------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| **`bigram`**              | fairly_bad                                                                                                                                                            |
| **`sent_text`**           | *It seemed clear to me that Santi had done `something` __fairly bad__ to his knee.*                                                                                   |
| **`adv_form`**            | fairly                                                                                             

hit_id,pcc_eng_val_1.9883_x16019_07:10-11-12
bigram,fairly_bad
sent_text,*It seemed clear to me that Santi had done `so...
adv_form,fairly
adj_form,bad
hit_text,*`something` __fairly bad__ to his knee .*
text_window,*to me that santi had done `something` __fairl...
sent_id,pcc_eng_val_1.9883_x16019_07
match_id,10-11-12
bigram_id,pcc_eng_val_1.9883_x16019_07:11-12
token_str,*It seemed clear to me that Santi had done `so...
