 # Calculating Trigger ~ `UNIT` frequencies and AMs directly from final parquets
 initiated: _January 16, 2025_
 > The previously authored notebook `calc_trigger_AM.*` relies on `UCS` command line tools.
 > This notebook does not, which simplifies the pipeline and the files it outputs.

 Loading imports

In [None]:
from am_notebooks import *
from source.utils.LexicalCategories import SPECIAL_ADV
from association_measures import frequencies as amfq, measures as amms

# Column names of the two units whose co-occurrence is analysed; both are
# included in the column selection made by `load_trigger_info`.
L1 = 'trigger_lemma'
L2 = 'adv_form_lower'
# Keep printed frames compact: truncate long cell values, cap the number of
# columns shown, and fix the console width.
pd.set_option('display.max_colwidth', 15)
pd.set_option('display.max_columns', 6)
pd.set_option('display.width', 120)
# Locations of the "final" hit-table parquets, one per pattern category.
HIT_DATA_DIR = Path('/share/compling/data/sanpi/2_hit_tables')
NEG_SUPER_PARQ = HIT_DATA_DIR.joinpath('RBdirect/ALL-RBdirect_final.parq')
# NOTE(review): this stem ('ALL_not-RBdirect_final') does not contain 'POS',
# so `load_trigger_info` would treat it as a negative source — confirm before
# passing this path to that loader.
POS_SUPER_PARQ = HIT_DATA_DIR.joinpath(
    'not-RBdirect/ALL_not-RBdirect_final.parq')
NEG_MIRROR_PARQ = HIT_DATA_DIR.joinpath('NEGmirror/ALL-NEGmirror_final.parq')
POS_MIRROR_PARQ = HIT_DATA_DIR.joinpath('POSmirror/ALL-POSmirror_final.parq')
# Record the run date in the cell output (`timestamp_today` is presumably
# provided by the `am_notebooks` star import — TODO confirm).
print(timestamp_today())


def rename_trigger_dep_info(df):
    """Add polarity-neutral ``trigger_head`` and ``trigger_deprel`` columns.

    The dependency info of the trigger is stored under a ``neg_`` or ``mir_``
    prefix. This copies whichever head/deprel column is found first (``neg_``
    is preferred over ``mir_``) into the unprefixed ``trigger_*`` names.

    Args:
        df: DataFrame that may contain ``neg_head``/``mir_head`` and
            ``neg_deprel``/``mir_deprel`` columns.

    Returns:
        ``df`` itself when selecting those four columns yields an empty frame
        (none of them present, or no rows); otherwise a new DataFrame with the
        two ``trigger_*`` columns added.
    """
    head_names = ['neg_head', 'mir_head']
    deprel_names = ['neg_deprel', 'mir_deprel']
    dep_info = df.filter(['neg_head', 'mir_head', 'neg_deprel', 'mir_deprel'])
    if dep_info.empty:
        return df
    # `filter` keeps only the names that exist, so position 0 is the first
    # available candidate.
    head_col = df.filter(head_names).iloc[:, 0]
    deprel_col = df.filter(deprel_names).iloc[:, 0]
    return df.assign(trigger_head=head_col, trigger_deprel=deprel_col)


def fix_word_null(df):
    """Restore the literal word "null" where it was parsed as a missing value.

    In the word-form columns a missing value means the token was the actual
    string "null" and was improperly interpreted as ``<NA>``. The missing
    entries of ``adj_form_lower``, ``adv_form_lower`` and ``trigger_lower``
    are therefore filled with the string ``'null'``.

    Args:
        df: Pandas DataFrame containing the ``adj_form_lower``,
            ``adv_form_lower`` and ``trigger_lower`` columns.

    Returns:
        A new Pandas DataFrame in which missing values of those three columns
        are replaced with ``'null'``; the input frame is left untouched.
    """
    adj_filled = df.adj_form_lower.fillna('null')
    adv_filled = df.adv_form_lower.fillna('null')
    trigger_filled = df.trigger_lower.fillna('null')
    return df.assign(
        adj_form_lower=adj_filled,
        adv_form_lower=adv_filled,
        trigger_lower=trigger_filled,
    )


def load_trigger_info(parq_paths):
    """Load the trigger-related columns of one or more hit-table parquets.

    For every path the polarity is decided from the file stem: a stem
    containing ``'POS'`` is read as positive (``mir_head``/``mir_deprel``
    columns, ``polarity='pos'``); anything else is read as negative
    (``neg_head``/``neg_deprel`` columns, ``polarity='neg'``). The decision is
    made independently per path, so the order of ``parq_paths`` does not
    matter.

    After loading, the prefixed dependency columns are copied to
    ``trigger_head``/``trigger_deprel`` and dropped, missing word forms are
    restored to the literal ``'null'``, and the lemmas ``"ain't"``/``"aint"``
    are normalised to ``'not'``. A summary is printed via ``DataFrame.info()``.

    Args:
        parq_paths: Non-empty sequence of ``pathlib.Path`` objects pointing to
            parquet files.

    Returns:
        A single DataFrame with the rows of all sources.

    Raises:
        IndexError: If ``parq_paths`` is empty.
    """
    sources = []
    for path in parq_paths:
        # Resolve the node prefix and polarity for THIS file; carrying the
        # values over from a previous iteration would mislabel a negative
        # file that follows a positive one.
        if 'POS' in path.stem:
            trig_node, polarity = 'mir', 'pos'
        else:
            trig_node, polarity = 'neg', 'neg'
        # `dict.fromkeys` removes repeated names while keeping first-seen order.
        columns = list(dict.fromkeys(
            [L1, L2,
             'trigger_lemma', 'trigger_lower', 'bigram_lower',
             f'{trig_node}_head', f'{trig_node}_deprel',
             'adv_form_lower', 'adj_form_lower', 'bigram_id']))
        _df = pd.read_parquet(path, engine='pyarrow', columns=columns)
        _df = rename_trigger_dep_info(_df)
        _df = _df.assign(polarity=polarity).convert_dtypes()
        sources.append(_df)
    trigger_df = pd.concat(sources) if len(sources) > 1 else sources[0]
    # The prefixed columns have been copied to `trigger_*`; drop the originals.
    is_prefixed = trigger_df.columns.str.startswith(('neg_', 'mir_'))
    trigger_df = trigger_df.loc[:, ~is_prefixed]
    trigger_df = fix_word_null(trigger_df)
    # `isin` yields a plain True/False mask (missing lemmas count as False),
    # so it is safe to reduce with `.any()` and to use as a row selector.
    aint_triggered = trigger_df.trigger_lemma.isin(["ain't", "aint"])
    if aint_triggered.any():
        trigger_df.loc[aint_triggered, 'trigger_lemma'] = 'not'
    trigger_df.info()
    return trigger_df

2025-03-10


In [None]:
# %%

# Load the negative superset triggers, then print a per-column summary.
# `.iloc[:, 1:]` drops the first `describe()` statistic so that only
# unique / top / freq are shown.
df_super_neg = load_trigger_info([NEG_SUPER_PARQ])
print(df_super_neg.describe().T.iloc[:, 1:].convert_dtypes())

<class 'pandas.core.frame.DataFrame'>
Index: 3188297 entries, apw_eng_19971117_0849_61:4-6-7 to pcc_eng_val_3.10643_x51716_28:5-7-8
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   trigger_lemma   string
 1   adv_form_lower  string
 2   trigger_lower   string
 3   bigram_lower    string
 4   adj_form_lower  string
 5   bigram_id       string
 6   trigger_head    string
 7   trigger_deprel  string
 8   polarity        string
dtypes: string(9)
memory usage: 243.2+ MB
                 unique             top     freq
trigger_lemma        15             not  2871807
adv_form_lower     5579              as   533878
trigger_lower        54             not  2005270
bigram_lower     187832         as_good    44635
adj_form_lower    28060            good   131430
bigram_id       3188297  apw_eng_199...        1
trigger_head          2             ADJ  3101239
trigger_deprel       33          advmod  2701472
polarity              1             neg  3188297

In [None]:
# %%

def display_trigger_totals(_df, dataset='superset',
                           polar='negative',
                           cross='lower',
                           cmap='YlGnBu'):
    """Tabulate trigger lemma frequencies against another trigger attribute.

    Counts the values of ``trigger_{cross}`` within each trigger lemma (and
    within each polarity when more than one is present), adds each lemma's
    total and its share of all hits, and hands the styled table to
    ``save_latex_table``. For ``cross='head'`` the overall head counts are
    additionally printed as a LaTeX tabular.

    The input frame is not modified; all derived columns live on an internal
    working copy restricted to the columns that are needed.

    Args:
        _df: DataFrame with ``polarity``, ``trigger_lemma`` and
            ``trigger_{cross}`` columns.
        dataset: Dataset name used in the caption, label and file stem.
        polar: Polarity name used in the caption, label and file stem.
        cross: Suffix of the ``trigger_*`` column to cross with the lemma.
            ``'head'`` produces a wide table (one column per head value);
            any other value produces a long table.
        cmap: Colormap name for the background gradients.

    Returns:
        None.
    """
    x = f'trigger_{cross}'
    cross_count_label = f"{x} total".replace('_', ' ')
    total_name = 'lemma total'

    lemma_counts = _df.trigger_lemma.value_counts()
    # Denominator for the percentage column: every hit counted exactly once.
    n_hits = lemma_counts.sum()
    # `filter` + `assign` give an independent frame, so neither the added
    # total column nor the head relabelling below leaks back to the caller.
    needed = list(dict.fromkeys(['polarity', 'trigger_lemma', x]))
    work = _df.filter(needed).assign(
        **{total_name: _df.trigger_lemma.map(lemma_counts)})

    if work.polarity.nunique() > 1:
        indexers = ['polarity', 'trigger_lemma', total_name]
    else:
        indexers = ['trigger_lemma', total_name]
    for_sty = work.groupby(indexers).value_counts(
        [x]).to_frame(cross_count_label)

    if cross == 'head':
        # With more than two head labels, collapse them to BIGRAM vs TRIGGER
        # for the printed overall summary (the per-lemma table above was
        # already counted with the original labels).
        if work[x].nunique() > 2:
            work[x] = work[x].map(
                {'ADJ': 'BIGRAM', 'MIR': 'TRIGGER', 'NEG': 'TRIGGER'})
        cross_name = 'dependency head'
        for_sty = for_sty.unstack().fillna(0).droplevel(
            0, axis=1).reset_index(level=total_name).sort_index(axis=1)
        print(work.value_counts(x)
              .to_frame(cross_count_label).T
              .style.background_gradient(cmap, axis=1)
              .to_latex(siunitx=True, convert_css=True).replace('_', ' '))
        for_sty = (
            for_sty
            .sort_values(total_name, ascending=False)
            .convert_dtypes())
    else:
        cross_name = 'lowercase form'
        for_sty = (
            for_sty.reset_index(level=total_name)
            .sort_values([total_name, cross_count_label], ascending=False)
            .filter([total_name, cross_count_label])
        )

    # Divide by the number of hits, not by the column sum: in the long layout
    # each lemma total is repeated once per form row, which would inflate the
    # denominator and deflate every percentage.
    for_sty['lemma % N'] = (for_sty[total_name] / n_hits) * 100
    _sty = (for_sty.sort_index(axis=1).style
            .background_gradient(
                axis=0, cmap=cmap))
    save_latex_table(
        sty=_sty,
        longtable=cross == 'lower',
        caption=(
            f'{dataset} {polar} Trigger Lemma Frequencies by {cross_name}'.title()),
        label=f'trig-lemma-{cross}-{dataset[:3]}-{polar[:3]}',
        verbose=True,
        latex_subdir='ch2/triggers',
        latex_stem=f'trigger_lemma-{cross}_counts_{dataset[:3]}-{polar[:3]}')


# Negative superset: lemma totals broken down by lowercase trigger form
# (the function's default `cross='lower'`).
display_trigger_totals(df_super_neg)

Caption: Superset Negative Trigger Lemma Frequencies By Lowercase Form


Unnamed: 0_level_0,Unnamed: 1_level_0,lemma \% N,lemma total,trigger lower total
Trigger Lemma,Trigger Lower,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
not,not,3.31,2871807,2005270
not,n't,3.31,2871807,863289
not,ain't,3.31,2871807,2052
not,nit,3.31,2871807,608
not,aint,3.31,2871807,270
not,nt,3.31,2871807,155
not,doe,3.31,2871807,32
not,n''t,3.31,2871807,30
not,na,3.31,2871807,16
not,oud,3.31,2871807,15


Stylized latex table saved as:
  OverleafDissertex/assets/tables/ch2/triggers/trigger_lemma-lower_counts_sup-neg.2025-03-10.tex



In [None]:
# %%

# Negative superset: lemma totals broken down by the trigger's dependency head.
display_trigger_totals(df_super_neg, cross='head', cmap='YlOrRd')

\begin{tabular}{lSS}
{trigger head} & {ADJ} & {NEG} \\
trigger head total & {\cellcolor[HTML]{800026}} \color[HTML]{F1F1F1} 3101239 & {\cellcolor[HTML]{FFFFCC}} \color[HTML]{000000} 87058 \\
\end{tabular}

Caption: Superset Negative Trigger Lemma Frequencies By Dependency Head


trigger_head,ADJ,NEG,lemma \% N,lemma total
Trigger Lemma,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
not,2863597,8210,90.07,2871807
never,111026,63,3.48,111089
nothing,34067,66446,3.15,100513
none,21502,9951,0.99,31453
nor,15925,0,0.5,15925
without,13339,1486,0.46,14825
no,12343,95,0.39,12438
neither,6436,92,0.2,6528
nobody,5827,538,0.2,6365
hardly,5517,8,0.17,5525


Stylized latex table saved as:
  OverleafDissertex/assets/tables/ch2/triggers/trigger_lemma-head_counts_sup-neg.2025-03-10.tex



In [None]:
#%%

# NOTE(review): not referenced anywhere in the visible part of this file;
# presumably the overall hit totals (N) per dataset — confirm source and use.
N_dict = {'super':  71961373,
          'mirror':  1680633}
def describe_triggers(_df):
    """Summarise how trigger frequencies are distributed.

    Counts the occurrences of every trigger lemma and of every lowercase
    trigger form, then reports the descriptive statistics of those two count
    distributions side by side.

    Args:
        _df: DataFrame with ``trigger_lemma`` and ``trigger_lower`` columns.

    Returns:
        DataFrame with columns ``lemma`` and ``lowercase form`` (columns axis
        named ``Trigger``). Rows are the ``describe()`` statistics, with
        ``count`` relabelled ``unique`` (it is the number of distinct
        values), plus a ``CV%`` row holding the coefficient of variation
        (std / mean * 100).
    """
    counts_by_label = {
        'lemma': _df.value_counts('trigger_lemma'),
        'lowercase form': _df.value_counts('trigger_lower'),
    }
    lemma_summary, form_summary = (
        counts.describe().to_frame(label)
        for label, counts in counts_by_label.items()
    )
    trig_stats = lemma_summary.join(form_summary).convert_dtypes()
    trig_stats.columns.name = 'Trigger'
    trig_stats = trig_stats.rename(index={'count': 'unique'})
    trig_stats.loc['CV%'] = [
        (counts.std() / counts.mean()) * 100
        for counts in counts_by_label.values()
    ]
    return trig_stats


# nb_display(describe_triggers(df_super_neg))
# Write the descriptive statistics of the negative superset trigger counts
# to a LaTeX table.
save_latex_table(
    describe_triggers(df_super_neg), default_SI=7.0,
    caption='Descriptive Statistics for Negative Superset Trigger Frequencies', 
    latex_subdir= 'ch2/freq/neg-super/', position='ht', verbose=True,
    latex_stem='neg-super-trigger-freq-descrip')

Caption: Descriptive Statistics for Negative Superset Trigger Frequencies


Trigger,lemma,lowercase form
,,
unique,15.0,54.0
mean,212553.13,59042.54
std,736488.25,294680.56
min,796.0,1.0
25\%,4381.5,1.0
50\%,6528.0,13.0
75\%,23689.0,1846.75
max,2871807.0,2005270.0
CV\%,346.5,499.1


Stylized latex table saved as:
  OverleafDissertex/assets/tables/ch2/freq/neg-super/neg-super-trigger-freq-descrip.2025-03-10.tex



PosixPath('/share/compling/projects/arh234/OverleafDissertex/assets/tables/ch2/freq/neg-super/neg-super-trigger-freq-descrip.2025-03-10.tex')

In [None]:
# %%

# Load the negative mirror subset triggers and show the per-column
# unique / top / freq summary as the cell result.
df_mirror_neg = load_trigger_info([NEG_MIRROR_PARQ])
df_mirror_neg.describe().T.iloc[:, 1:].convert_dtypes()

# Disabled: earlier lemma-vs-form table export.
# save_latex_table(
#     sty=(df_mirror_neg.groupby(['polarity', 'trigger_lemma']).value_counts(
#         ['trigger_lower']).to_frame()
#         .style
#         .background_gradient(axis=0, cmap='purple_rain')
#         .format(precision=0, thousands=',', escape='latex')),
#     caption=(r'Negative Mirror Subset Trigger Lemma Composition: attested forms'),
#     label='negmir-trig-lemma-vs-form',
#     longtable=True,
#     latex_subdir='triggers',
#     latex_stem='mirror-neg-trigger_lemma-form_counts')

<class 'pandas.core.frame.DataFrame'>
Index: 293456 entries, apw_eng_19941117_0297_31:3-4-5 to pcc_eng_val_3.09952_x50620_12:13-15-16
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   trigger_lemma   293456 non-null  string
 1   adv_form_lower  293456 non-null  string
 2   trigger_lower   293456 non-null  string
 3   bigram_lower    293456 non-null  string
 4   adj_form_lower  293456 non-null  string
 5   bigram_id       293456 non-null  string
 6   trigger_head    293456 non-null  string
 7   trigger_deprel  293456 non-null  string
 8   polarity        293456 non-null  string
dtypes: string(9)
memory usage: 22.4+ MB


Unnamed: 0,unique,top,freq
trigger_lemma,12,never,111102
adv_form_lower,2013,more,79045
trigger_lower,20,never,111102
bigram_lower,41019,more_important,12347
adj_form_lower,8070,important,14913
bigram_id,293456,apw_eng_199...,1
trigger_head,2,ADJ,216433
trigger_deprel,18,advmod,120867
polarity,1,neg,293456


In [None]:
# %%

# Negative mirror subset: export the descriptive statistics of the trigger
# counts, then the lemma-by-form and lemma-by-head frequency tables.
save_latex_table(
    describe_triggers(df_mirror_neg), default_SI=6.0,
    caption='Descriptive Statistics for Negative Mirror Subset Trigger Frequencies', 
    latex_subdir= 'ch2/freq/neg-mirror/', position='ht', verbose=True,
    latex_stem='neg-mirror-trigger-freq-descrip')

display_trigger_totals(df_mirror_neg, dataset='mirror')
display_trigger_totals(df_mirror_neg, dataset='mirror', cross='head', cmap='YlOrRd')

Caption: Descriptive Statistics for Negative Mirror Subset Trigger Frequencies


Trigger,lemma,lowercase form
,,
unique,12.0,20.0
mean,24454.67,14672.8
std,38879.07,32027.43
min,797.0,1.0
25\%,3671.25,4.75
50\%,6426.0,1143.5
75\%,19770.0,7161.75
max,111102.0,111102.0
CV\%,158.98,218.28


Stylized latex table saved as:
  OverleafDissertex/assets/tables/ch2/freq/neg-mirror/neg-mirror-trigger-freq-descrip.2025-03-10.tex

Caption: Mirror Negative Trigger Lemma Frequencies By Lowercase Form


Unnamed: 0_level_0,Unnamed: 1_level_0,lemma \% N,lemma total,trigger lower total
Trigger Lemma,Trigger Lower,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
never,never,19.01,111102,111102
nothing,nothing,17.12,100044,100012
nothing,nothings,17.12,100044,31
nothing,nothingis,17.12,100044,1
none,none,5.35,31248,31215
none,nones,5.35,31248,32
none,non-exalts,5.35,31248,1
nor,nor,2.73,15944,15944
no,no,1.57,9175,9174
no,nys,1.57,9175,1


Stylized latex table saved as:
  OverleafDissertex/assets/tables/ch2/triggers/trigger_lemma-lower_counts_mir-neg.2025-03-10.tex

\begin{tabular}{lSS}
{trigger head} & {ADJ} & {NEG} \\
trigger head total & {\cellcolor[HTML]{800026}} \color[HTML]{F1F1F1} 216433 & {\cellcolor[HTML]{FFFFCC}} \color[HTML]{000000} 77023 \\
\end{tabular}

Caption: Mirror Negative Trigger Lemma Frequencies By Dependency Head


trigger_head,ADJ,NEG,lemma \% N,lemma total
Trigger Lemma,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
never,111039,63,37.86,111102
nothing,33760,66284,34.09,100044
none,21319,9929,10.65,31248
nor,15944,0,5.43,15944
no,9081,94,3.13,9175
neither,6400,91,2.21,6491
nobody,5823,538,2.17,6361
hardly,5519,8,1.88,5527
rarely,4470,10,1.53,4480
barely,1245,0,0.42,1245


Stylized latex table saved as:
  OverleafDissertex/assets/tables/ch2/triggers/trigger_lemma-head_counts_mir-neg.2025-03-10.tex



In [None]:
# %%

# Load the positive mirror subset triggers (the 'POS' in the file stem makes
# the loader read the `mir_*` columns and label rows as 'pos') and show the
# per-column unique / top / freq summary as the cell result.
df_mirror_pos = load_trigger_info([POS_MIRROR_PARQ])
df_mirror_pos.describe().T.iloc[:, 1:].convert_dtypes()

<class 'pandas.core.frame.DataFrame'>
Index: 1396324 entries, apw_eng_19941121_0258_30:1-4-5 to pcc_eng_val_3.11049_x52389_100:30-31-32
Data columns (total 9 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   trigger_lemma   1396324 non-null  string
 1   adv_form_lower  1396324 non-null  string
 2   trigger_lower   1396324 non-null  string
 3   bigram_lower    1396324 non-null  string
 4   adj_form_lower  1396324 non-null  string
 5   bigram_id       1396324 non-null  string
 6   trigger_head    1396324 non-null  string
 7   trigger_deprel  1396324 non-null  string
 8   polarity        1396324 non-null  string
dtypes: string(9)
memory usage: 106.5+ MB


Unnamed: 0,unique,top,freq
trigger_lemma,16,something,309632
adv_form_lower,5322,more,208524
trigger_lower,35,something,309483
bigram_lower,178916,as_simple,14992
adj_form_lower,21484,different,34454
bigram_id,1396324,apw_eng_199...,1
trigger_head,2,ADJ,1076623
trigger_deprel,20,advmod,451671
polarity,1,pos,1396324


In [None]:
#%%

# Positive mirror subset: export the descriptive statistics of the trigger
# counts, then the lemma-by-form and lemma-by-head frequency tables.
save_latex_table(
    describe_triggers(df_mirror_pos), default_SI=6.0,
    caption='Descriptive Statistics for Positive Mirror Subset Trigger Frequencies', 
    latex_subdir= 'ch2/freq/pos-mirror/', position='ht', verbose=True,
    latex_stem='pos-mirror-trigger-freq-descrip')

display_trigger_totals(df_mirror_pos, dataset='mirror', polar='positive', cmap='PuBu')
display_trigger_totals(df_mirror_pos, dataset='mirror', polar='positive',  cross='head', cmap='PuBuGn')

# Disabled: earlier lemma-vs-form table export (caption corrected to
# "Positive" to match the data and label, should this be re-enabled).
# save_latex_table(
#     sty=(df_mirror_pos.groupby(['polarity', 'trigger_lemma']).value_counts(
#         ['trigger_lower']).to_frame()
#         .style
#         .background_gradient(axis=0, cmap='purple_rain')
#         .format(precision=0, thousands=',', escape='latex')),
#     caption=(r'Positive Mirror Subset Trigger Lemma Composition: attested forms'),
#     label='posmir-trig-lemma-vs-form',
#     longtable=True,
#     latex_subdir='triggers',
#     latex_stem='mirror-pos-trigger_lemma-form_counts')

Caption: Descriptive Statistics for Positive Mirror Subset Trigger Frequencies


Trigger,lemma,lowercase form
,,
unique,16.0,35.0
mean,87270.25,39894.97
std,104293.06,82105.48
min,334.0,1.0
25\%,22694.5,1.5
50\%,44761.0,18.0
75\%,88812.75,39017.0
max,309632.0,309483.0
CV\%,119.51,205.8


Stylized latex table saved as:
  OverleafDissertex/assets/tables/ch2/freq/pos-mirror/pos-mirror-trigger-freq-descrip.2025-03-10.tex

Caption: Mirror Positive Trigger Lemma Frequencies By Lowercase Form


Unnamed: 0_level_0,Unnamed: 1_level_0,lemma \% N,lemma total,trigger lower total
Trigger Lemma,Trigger Lower,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
something,something,9.07,309632,309483
something,somethings,9.07,309632,131
something,somethin,9.07,309632,18
or,or,8.93,304971,304970
or,ors,8.93,304971,1
all,all,7.15,243962,243951
all,alls,7.15,243962,10
all,alleg,7.15,243962,1
some,some,3.15,107634,107631
some,somes,3.15,107634,2


Stylized latex table saved as:
  OverleafDissertex/assets/tables/ch2/triggers/trigger_lemma-lower_counts_mir-pos.2025-03-10.tex

\begin{tabular}{lSS}
{trigger head} & {ADJ} & {MIR} \\
trigger head total & {\cellcolor[HTML]{014636}} \color[HTML]{F1F1F1} 1076623 & {\cellcolor[HTML]{FFF7FB}} \color[HTML]{000000} 319701 \\
\end{tabular}

Caption: Mirror Positive Trigger Lemma Frequencies By Dependency Head


trigger_head,ADJ,MIR,lemma \% N,lemma total
Trigger Lemma,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
something,19715,289917,22.17,309632
or,304920,51,21.84,304971
all,242683,1279,17.47,243962
some,105476,2158,7.71,107634
often,82474,65,5.91,82539
always,82213,25,5.89,82238
sometimes,64092,58,4.59,64150
both,49170,148,3.53,49318
many,39151,1053,2.88,40204
everything,35500,2348,2.71,37848


Stylized latex table saved as:
  OverleafDissertex/assets/tables/ch2/triggers/trigger_lemma-head_counts_mir-pos.2025-03-10.tex

