In [26]:
# Install `transformers` from master
# ! 👇 apparently this isn't really python code. It only works on jupyter kernal
%pip install transformers
%pip list | grep -E 'transformers|tokenizers'
# transformers version at notebook update --- 4.32.1 #//2.11.0
# tokenizers version at notebook update ---  0.13.3 #//0.8.0rc1

import csv
from pathlib import Path
from pprint import pprint

import pandas as pd
import torch
from transformers import (AutoModel, AutoModelForMaskedLM, AutoTokenizer,
                          pipeline)
from utils.LexicalCategories import ADJ_BY_SCALE as ADJ_SETS
from utils.LexicalCategories import ADV_OF_INTEREST as ADV_SETS

pd.set_option('display.max_colwidth', 70)
pd.set_option("display.precision", 5)
MASK_POS = 'adv'
MODEL = 'distilbert-base-uncased'
UNMASK = pipeline('fill-mask', model=MODEL, tokenizer=MODEL, framework='pt')
HITS_PATH = Path(
    '/share/compling/projects/sanpi/demo/data/2_hit_tables/RBXadj/bigram_X2puddin_all-RB-JJs_hits.csv')
FRQ_FILTER = Path(
    '/share/compling/projects/sanpi/demo/data/4_post-processed/RBxpos/hit-index_thr0-001p.1f.txt')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
tokenizers               0.13.3
transformers             4.32.1
Note: you may need to restart the kernel to use updated packages.


 ## Load sample hit table as data

In [27]:
def _load_hits(dpath=HITS_PATH):

    if '.csv' in dpath.suffixes:
        data = pd.read_csv(dpath)
        data = data.set_index('hit_id').convert_dtypes()

    elif dpath.endswith('pkl.gz'):
        data = pd.read_pickle(dpath)

    print('data loaded')
    pprint(data.columns)
    print(data.sample(3))

    return data

data = _load_hits()

data loaded
Index(['colloc', 'sent_text', 'adv_form', 'adj_form', 'hit_text',
       'text_window', 'sent_id', 'match_id', 'colloc_id', 'token_str',
       'lemma_str', 'context_prev_id', 'context_prev_sent', 'context_next_id',
       'context_next_sent', 'adv_lemma', 'adj_lemma', 'adv_index', 'adj_index',
       'dep_mod', 'json_source', 'utt_len', 'category'],
      dtype='object')
                                               colloc  \
hit_id                                                  
pcc_eng_29_108.2028_x1732482_04:28-29   very_eventful   
pcc_eng_29_109.3185_x1745367_07:09-10  most_important   
pcc_eng_29_108.4211_x1736061_05:4-5       very_public   

                                                                                                   sent_text  \
hit_id                                                                                                         
pcc_eng_29_108.2028_x1732482_04:28-29  Set in a sleepy cul-de-sac somewhere in beachside suburbia, Swin

 Apply filters to loaded data, based on frequency as well as manual criteria

In [28]:
def _filter_hits(hits_data):
    # trim data
    filter_hi = pd.read_csv(FRQ_FILTER).squeeze()
    df = hits_data.loc[filter_hi,
                       ['adv_form', 'adj_form', 'adv_lemma', 'adj_lemma',
                        'adv_index', 'adj_index', 'token_str', 'text_window']]
    df = df.assign(
        prev_prev_word=df.index.to_series().apply(
            lambda ix: df.token_str[ix].split()[max(-1,df.adv_index[ix] - 2)]),
        prev_word=df.index.to_series().apply(
            lambda ix: df.token_str[ix].split()[max(-1,df.adv_index[ix] - 1)]),
        next_word=df.index.to_series().apply(
            lambda ix: df.token_str[ix].split()[min(len(df.token_str[ix].split())-1, df.adj_index[ix] + 1)]),
        next_next_word=df.index.to_series().apply(
            lambda ix: df.token_str[ix].split()[min(len(df.token_str[ix].split())-1, df.adj_index[ix] + 2)])
        )
    df.loc[df.prev_word == df.prev_prev_word, 'prev_prev_word'] = ''
    df.loc[df.prev_word == df.adv_form, 'prev_word'] = ''
    df.loc[df.next_word == df.next_next_word, 'next_next_word'] = ''
    df.loc[df.next_word == df.adj_form, 'next_word'] = ''
    manual_criteria = (
        # limit to `token_str` values with no more than 14 spaces ≈ max 15 words
        (df.token_str.str.count(' ') <= 14)
        # NOTE: 'many', 'much' & 'enough' have abnormal syntactic patterns
        & (~df.adv_lemma.str.lower().isin(('enough', 'how', 'much', 'most')))
        & (~df.adj_lemma.str.lower().isin(('many', 'more', 'much')))
        # have issues filling position following "much" and "more"--they don't wind up being adverbs but comparative adjectives
        & (~df.prev_word.isin(['more', 'much', 'as']))
        & (~df.next_word.str.lower().isin(['as']))
    )
    df = df.loc[manual_criteria, :]

    return df

data = _filter_hits(data)

In [29]:
data.sample(5).reset_index(drop=True).loc[:, data.columns.str.endswith(('word', 'form', 'str', 'window'))].set_index('text_window')

Unnamed: 0_level_0,adv_form,adj_form,token_str,prev_prev_word,prev_word,next_word,next_next_word
text_window,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Very amazed with the standard,Very,amazed,Very amazed with the standard of his provider and his professional...,,.,with,the
I 'm also curious to see how,also,curious,I 'm also curious to see how players take to the leaderboards .,I,'m,to,see
thing that is definitely better about Portland is,definitely,better,Austin -- the one thing that is definitely better about Portland i...,that,is,about,Portland
pain is a surprisingly mysterious thing .,surprisingly,mysterious,Chronic pain is a surprisingly mysterious thing .,is,a,thing,.
Being so Normal was released on,so,Normal,Being so Normal was released on September 18th via Kingfisher Bluez .,.,Being,was,released


 ## Mask tokens
 In this version, the data is not pre-masked.
 It consists of sentence strings and target information, including the index of token nodes in the `token_str` values.
 Should be able to use the `adv_index` and/or `adj_index` to create the corresponding masked version of `token_str`

 _Note: for some reason, the initial filter series, `filter_hi`, gets reshaped as the index of `data`, so it cannot be used in place of `data.index.to_series()`_

In [30]:
def pre_mask(df, mask_pos='adv'):
    # df.info()
    mask_ix = df.loc[:, f'{mask_pos}_index']
    # print(mask_ix)
    # #! end index is not included in the returned value! [:mask_ix[h]-1] cuts out the word preceding the adv as well

    return df.index.to_series().apply(
        lambda h: ' '.join(df.token_str[h].split()[:mask_ix[h]]))


def post_mask(df, mask_pos='adv'):
    mask_ix = df.loc[:, f'{mask_pos}_index']

    return mask_ix.index.to_series().apply(
        lambda h: ' '.join(df.token_str[h].split()[mask_ix[h]+1:]))


def mask(df: pd.DataFrame, pos='adv'):
    # sourcery skip: use-fstring-for-concatenation
    pre_col = pre_mask(df, pos)
    post_col = post_mask(df, pos)
    return df.assign(masked=pre_col + ' [MASK] ' + post_col)


data = mask(data, MASK_POS)

In [31]:
data.sample(9).rename(columns={'adv_form':'original token'}).set_index('original token')[['masked']]

Unnamed: 0_level_0,masked
original token,Unnamed: 1_level_1
very,It 's [MASK] clear about their mission .
that,But I can I admit I have n't always been [MASK] welcoming to him .
more,White - winged Crossbills were the [MASK] abundant of the two .
economically,Digital thermometers are [MASK] cheaper than the other forms of
not,"You might feel like you have plenty of choice , but simply [MASK] ..."
rather,"Slicing and dicing into small pieces - yes , that did sound [MASK]..."
super,They are [MASK] soft and extremely absorbent .
not,"For publishers , the news may be dispiriting , but it 's [MASK] su..."
too,Life is [MASK] short !


 ## *Unmask* Tokens
 Data is loaded and now _masked_ sentences are a column in the dataframe.
 Can use `UNMASK` pipeline to get probabilities of tokens at masked position.

 ## For just 1 input

 Select sample sentence and run pipeline

In [32]:
selected_data = data.loc[:, ['adv_form', 'adj_form', 'text_window', 'masked', 'prev_word', 'next_word']]
sent_row = selected_data.sample(1)
sent = sent_row.masked.squeeze()
sent_row.transpose()

hit_id,pcc_eng_29_109.3186_x1745368_049:5-6
adv_form,very
adj_form,ambiguous
text_window,it 's a very ambiguous paragraph .
masked,And it 's a [MASK] ambiguous paragraph .
prev_word,a
next_word,paragraph


 get top 10 predictions for masked position

In [33]:
results = UNMASK(sent, top_k=10)
rdf = pd.DataFrame(results).sort_values('score', ascending=False)
rdf.set_index('token_str')

Unnamed: 0_level_0,score,token,sequence
token_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
very,0.12981,2200,and it's a very ambiguous paragraph.
rather,0.10789,2738,and it's a rather ambiguous paragraph.
highly,0.06455,3811,and it's a highly ambiguous paragraph.
pretty,0.04952,3492,and it's a pretty ambiguous paragraph.
somewhat,0.04842,5399,and it's a somewhat ambiguous paragraph.
fairly,0.04229,7199,and it's a fairly ambiguous paragraph.
perfectly,0.03731,6669,and it's a perfectly ambiguous paragraph.
politically,0.03101,10317,and it's a politically ambiguous paragraph.
slightly,0.02751,3621,and it's a slightly ambiguous paragraph.
morally,0.02369,28980,and it's a morally ambiguous paragraph.


 load lexical categories and compare

In [34]:
def add_lexcats(results:pd.DataFrame, 
                lex_dict:dict,
                original_forms:pd.Series) -> pd.DataFrame:

    cat_start_index = len(results.columns)
    for lex_type, lex_set in lex_dict.items():
        results.loc[:, lex_type] = results.token_str.isin(lex_set)
    return (results
            .assign(lexcat_defined=rdf.iloc[:, cat_start_index:].apply(any, axis=1),
                    match_masked_POS=results.token_str.isin(original_forms))
            )


if MASK_POS == 'adj':
    lexcat_dict = ADJ_SETS
elif MASK_POS == 'adv':
    # for adv_type, adv_set in ADV_SETS.items():
    #     rdf.loc[:, adv_type] = rdf.token_str.isin(adv_set)
    # rdf = rdf.assign(match_masked_POS=rdf.token_str.isin(data.adv_form))
    lexcat_dict = ADV_SETS

rdf = add_lexcats(rdf, lexcat_dict, data[f'{MASK_POS}_form'])
rdf = rdf.set_index('token_str')
print(f'## input:\n   > {sent}')

## input:
   > And it 's a [MASK] ambiguous paragraph .


In [35]:
rdf.loc[:, ['score']].join(rdf.iloc[:, 3:])

Unnamed: 0_level_0,score,sufficient,min_threshold,max_threshold,moderate,precise,compare,weak_intense,meas_intense,mod_intense,...,max_intense,EDM_Morzycki,Mmod_SoltWilson,negative,positive,NPS,PS_rescuer,PPS,lexcat_defined,match_masked_POS
token_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
very,0.12981,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,True,True
rather,0.10789,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,True,True,True
highly,0.06455,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,True,True
pretty,0.04952,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,True,True,True
somewhat,0.04842,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,True,True,True
fairly,0.04229,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,True,True,True
perfectly,0.03731,False,False,True,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,True,True
politically,0.03101,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
slightly,0.02751,False,True,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,True
morally,0.02369,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


 ## Across multiple inputs

 Apply across all masked sentences and collect results

In [36]:
pprint([{'hit_id': i,
         f'original_{MASK_POS}': selected_data.loc[i, f'{MASK_POS}_form'],
         'text_window': selected_data.text_window[i],
         'masked_str': selected_data.masked[i],
         'scores': UNMASK(selected_data.masked[i], top_k=3)}
        for i in selected_data.sample(3).index])


[{'hit_id': 'pcc_eng_29_108.5107_x1737524_64:12-13',
  'masked_str': 'Men are very safe with us , let them be ever [MASK] rich .',
  'original_adv': 'so',
  'scores': [{'score': 0.478022038936615,
              'sequence': 'men are very safe with us, let them be ever so '
                          'rich.',
              'token': 2061,
              'token_str': 'so'},
             {'score': 0.14283417165279388,
              'sequence': 'men are very safe with us, let them be ever too '
                          'rich.',
              'token': 2205,
              'token_str': 'too'},
             {'score': 0.08018776029348373,
              'sequence': 'men are very safe with us, let them be ever more '
                          'rich.',
              'token': 2062,
              'token_str': 'more'}],
  'text_window': 'them be ever so rich .'},
 {'hit_id': 'pcc_eng_29_109.0735_x1741513_02:3-4',
  'masked_str': "Pope ' [MASK] afraid of schism ' within Catholic Church under "
          

In [37]:
selected_data = selected_data.sample(300) # HACK: #! temporary. REMOVE


def get_scores(masked, top_k=10):
    return {word_fill['token_str']: word_fill['score'] for word_fill in UNMASK(masked, top_k=top_k)}


def sum_lexcats(scores: dict, lex_dict: dict):
    results = pd.Series(scores).to_frame('score')
    # results.index.name = 'filler_str'
    # results = results.reset_index()
    lexcat_sums = pd.Series(dtype='float')
    for lex_type, lex_set in lex_dict.items():
        # return results.loc[results.index.isin(lex_set), 'score'].sum()
        lexcat_sums.loc[f'{lex_type}_total'] = results.loc[results.index.isin(
            lex_set), 'score'].sum().round(5)
    return lexcat_sums

In [38]:
score_df = selected_data.assign(scores=selected_data.masked.apply(get_scores, top_k=30))
score_df = score_df.join(score_df.scores.apply(sum_lexcats, lex_dict=ADV_SETS))
score_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 300 entries, pcc_eng_29_108.5498_x1738154_20:11-12 to pcc_eng_29_109.1591_x1742842_37:09-10
Data columns (total 35 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   adv_form                 300 non-null    string 
 1   adj_form                 300 non-null    string 
 2   text_window              300 non-null    string 
 3   masked                   300 non-null    object 
 4   prev_word                300 non-null    object 
 5   next_word                300 non-null    object 
 6   scores                   300 non-null    object 
 7   sufficient_total         300 non-null    float64
 8   min_threshold_total      300 non-null    float64
 9   max_threshold_total      300 non-null    float64
 10  moderate_total           300 non-null    float64
 11  precise_total            300 non-null    float64
 12  compare_total            300 non-null    float64
 13  weak_intense_to

In [39]:
stats = score_df.describe().round(4).T.sort_values('mean', ascending=False)
stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
weak_intense_total,300.0,0.1072,0.1435,0.0,0.0051,0.0531,0.1542,0.9829
sufficient_total,300.0,0.0723,0.178,0.0,0.0,0.0031,0.0473,0.9852
compare_total,300.0,0.0528,0.1588,0.0,0.0,0.0037,0.0239,0.9922
PPS_total,300.0,0.0522,0.105,0.0,0.0,0.0142,0.0516,0.9005
Mmod_SoltWilson_total,300.0,0.05,0.1051,0.0,0.0,0.0113,0.0437,0.9005
PS_rescuer_total,300.0,0.049,0.1387,0.0,0.0,0.0067,0.0249,0.9786
mod_intense_total,300.0,0.0424,0.0884,0.0,0.0,0.0088,0.0421,0.7837
negative_total,300.0,0.0254,0.0858,0.0,0.0,0.0,0.0133,0.8354
max_intense_total,300.0,0.0196,0.0577,0.0,0.0,0.0,0.0153,0.5966
meas_intense_total,300.0,0.0185,0.0594,0.0,0.0,0.0,0.0138,0.6993


In [40]:
adv_sets = pd.Series(dtype='string')
for adv_cat in stats.index:
    adv_cat = adv_cat.replace('_total', '')
    adv_sets[adv_cat] = ', '.join(ADV_SETS[adv_cat])
adv_sets = adv_sets.to_frame('members')
adv_sets.index.name = 'adv category'
pd.set_option('display.max_colwidth', 100)
adv_sets

Unnamed: 0_level_0,members
adv category,Unnamed: 1_level_1
weak_intense,"quite, very, really"
sufficient,"enough, too, so"
compare,"comparatively, less, as, most, more, least"
PPS,"kinda, pretty, sorta, utterly, somewhat, rather, fairly"
Mmod_SoltWilson,"kinda, pretty, sorta, somewhat, rather, fairly"
PS_rescuer,"too, only, enough, even"
mod_intense,"entirely, notably, particularly, extremely, especially"
negative,"never, rarely, scarcely, not, barely, hardly"
max_intense,"perfectly, totally, maximally, completely, utterly"
meas_intense,"strongly, super, deeply, hugely, widely, greatly, highly, largely"


In [41]:
score_df.sample(15)

Unnamed: 0_level_0,adv_form,adj_form,text_window,masked,prev_word,next_word,scores,sufficient_total,min_threshold_total,max_threshold_total,...,unease_intense_total,excess_intense_total,max_intense_total,EDM_Morzycki_total,Mmod_SoltWilson_total,negative_total,positive_total,NPS_total,PS_rescuer_total,PPS_total
hit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
pcc_eng_29_108.2595_x1733448_13:3-4,not,necessary,This is not necessary when using the,This is [MASK] necessary when using the dynamic framework .,is,when,"{'usually': 0.11686263233423233, 'not': 0.09931281954050064, 'therefore': 0.07445567101240158, '...",0.0,0.0,0.0,...,0.0,0.0,0.0,0.01801,0.0,0.11472,0.27614,0.0,0.04485,0.0
pcc_eng_29_108.3249_x1734511_51:11-12,not,green,battery is orange not green .,The sunshine on the plug in to battery is orange [MASK] green .,orange,.,"{'or': 0.3573743999004364, 'and': 0.1321895718574524, '-': 0.1184810921549797, '/': 0.1012193858...",0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pcc_eng_29_109.5621_x1749274_36:10-11,more,pertinent,debate and elicits more pertinent information .,Some observers believe this format stimulates debate and elicits [MASK] pertinent information .,elicits,information,"{'more': 0.26272404193878174, 'much': 0.04430467635393143, 'increasingly': 0.026463249698281288,...",0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.01554,0.0,0.0,0.0
pcc_eng_29_109.3318_x1745597_08:7-8,too,simple,be putting it too simple .,But that would be putting it [MASK] simple .,it,.,"{'too': 0.2638441026210785, 'so': 0.13080938160419464, 'as': 0.09529045224189758, 'pretty': 0.06...",0.39465,0.0,0.00614,...,0.0,0.0,0.00614,0.0,0.12183,0.0,0.0,0.0,0.26384,0.12183
pcc_eng_29_109.2591_x1744427_01:3-4,not,Fair,Its Just not Fair !,Its Just [MASK] Fair !,Just,!,"{'not': 0.07358408719301224, 'too': 0.0585193894803524, 'a': 0.04666743054986, 'plain': 0.044469...",0.0847,0.0,0.01083,...,0.0,0.0,0.01083,0.0,0.05283,0.10379,0.0,0.0,0.05852,0.05283
pcc_eng_29_108.1997_x1732436_04:3-4,Just,delightful,Edit : Just delightful .,Edit : [MASK] delightful .,:,.,"{'roger': 0.00421407213434577, 'jonathan': 0.0041914028115570545, '#': 0.0035584899596869946, 't...",0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pcc_eng_29_109.6613_x1750903_10:1-2,Also,good,Also good for flanking sunny,[MASK] good for flanking sunny doorways or arbors .,.,for,"{'especially': 0.11291126906871796, 'particularly': 0.06442534178495407, 'not': 0.05678001046180...",0.05756,0.0,0.00614,...,0.0,0.0,0.00614,0.0,0.04522,0.05678,0.01918,0.0,0.0382,0.04522
pcc_eng_29_109.3677_x1746172_17:1-2,Very,amiss,Very amiss .,[MASK] amiss .,.,.,"{'bitter': 0.04257819801568985, 'never': 0.03674129769206047, 'no': 0.029797472059726715, 'not':...",0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.07157,0.00804,0.0,0.0,0.0
pcc_eng_29_109.0705_x1741471_024:3-4,perfectly,valid,"This is perfectly valid JSON , per","This is [MASK] valid JSON , per the JSON spec :",is,JSON,"{'a': 0.6661784052848816, 'the': 0.1116970106959343, 'another': 0.0297965370118618, 'one': 0.010...",0.0,0.0,0.01289,...,0.0,0.0,0.00422,0.0,0.0,0.0042,0.0,0.0,0.0073,0.0
pcc_eng_29_109.4172_x1746961_03:2-3,ridiculously,over-priced,From ridiculously over-priced watches to high,From [MASK] over-priced watches to high class four wheeled ...,From,watches,"{'inexpensive': 0.1144171804189682, 'cheap': 0.10992474108934402, 'luxury': 0.08209829032421112,...",0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
