# Inspecting pickled `exactly` hits

In [1]:
import pandas as pd
from pathlib import Path

## Define helper functions
These are copied from `./source/analyze/utils/{dataframes, general}.py`, but jupyter won't import them.

In [2]:
def find_files(data_dir: Path, fname_glob: str, verbose: bool = False):
    path_iter = data_dir.rglob(fname_glob)
    if verbose:
        path_iter = tuple(path_iter)
        print_iter(
            [f'../{p.relative_to(data_dir)}' for p in path_iter], bullet='-',
            header=f'### {len(path_iter)} paths matching {fname_glob} found in {data_dir}')
    return path_iter


def print_iter(iter_obj,
               bullet: str = '▸',
            #//    logger: logging.Logger = None,
            #//    level: int = 20,
               header: str = ''):

    bullet_str = f'\n{bullet} '

    iter_str = bullet_str.join(f'{i}' for i in iter_obj)

    msg_str = f'\n{header}{bullet_str}{iter_str}'
    msg_str = msg_str.replace('\n\n', '\n').strip(f'{bullet} ')

    print(msg_str)
    

def balance_sample(full_df: pd.DataFrame,
                   column_name: str = 'category',
                   sample_per_value: int = 5,
                   verbose: bool = False) -> tuple:
    '''
    create sample with no more than n rows satisfying each unique value
    of the given column. A value of -1 for `sample_per_value` will limit
    all values' results to the minimum count per value.
    '''
    info_message = ''
    subsamples = []
    for __, col_val_df in full_df.groupby(column_name):
        # take sample if 1+ and less than length of full dataframe
        if len(col_val_df) > sample_per_value > 0:
            subsample_df = col_val_df.sample(sample_per_value)
            subsamples.append(subsample_df)
        else: 
            subsamples.append(col_val_df)

    # > trim all "by column" sub dfs to length of shortest if -1 given
    if sample_per_value == -1:
        trim_len = int(min(len(sdf) for sdf in subsamples))
        subsamples = [sdf.sample(trim_len)
                       for sdf in subsamples]

    b_sample = pd.concat(subsamples)

    if verbose:
        subset_info_table = (
            b_sample
            .value_counts(subset=column_name)
            .to_frame(name='count')
            .assign(percentage=b_sample
                    .value_counts(column_name, normalize=True)
                    .round(2) * 100)
            .to_markdown())
        label = (full_df.hits_df_pkl[0].stem + ' '
                 if 'hits_df_pkl' in full_df.columns
                 else '')
        info_message = (f'\n## {column_name} representation in {label}sample\n'
                        + subset_info_table)

    return b_sample, info_message


def concat_pkls(data_dir: Path = Path('/share/compling/data/sanpi/2_hit_tables'),
                fname_glob: str = '*.pkl.gz',
                pickles=None,
                convert_dtypes=False,
                verbose: bool = True) -> pd.DataFrame:
    if not pickles:
        pickles = find_files(Path(data_dir), fname_glob, verbose)

    # tested and found that it is faster to assign `corpus` intermittently
    df = pd.concat((pd.read_pickle(p).assign(corpus=p.stem.rsplit('_', 2)[0])
                    for p in pickles))

    dup_check_cols = cols_by_str(df, end_str=('text', 'id', 'sent'))
    df = (df.loc[~df.duplicated(subset = dup_check_cols), :])
    df = df.convert_dtypes()
    df = make_cats(df, (['corpus'] + cols_by_str(df, start_str=('nr', 'neg', 'adv'),
                                                 end_str=('lemma', 'form'))))
    
    return df


def cols_by_str(df: pd.DataFrame, start_str=None, end_str=None) -> list:
    if end_str:
        cols = df.columns[df.columns.str.endswith(end_str)]
        if start_str:
            cols = cols[cols.str.startswith(start_str)]
    elif start_str:
        cols = df.columns[df.columns.str.startswith(start_str)]
    else:
        cols = df.columns

    return cols.to_list()


def make_cats(orig_df:pd.DataFrame, columns: list = None) -> pd.DataFrame:
    df = orig_df.copy()
    if columns is None:
        cat_suff = ("code", "name", "path", "stem")
        columns = df.columns.str.endswith(cat_suff)

    df.loc[:, columns] = df.loc[:, columns].astype(
        'string').fillna('_').astype('category')

    return df


This one is copied from `./source/analyze_deps.py`:

In [3]:
def _optimize_df(df:pd.DataFrame) -> pd.DataFrame: 
    
    print('Original Dataframe:')
    df.info(memory_usage='deep')
    
    # * clean up dataframe a bit
    # drop unneeded string columns
    # was:
    #   for c in udf.cols_by_str(df, start_str=('context', 'text', 'sent_text', 'token')):
    for c in cols_by_str(df, start_str=('context', 'sent_text', 'token')):
        df.pop(c)
    # select only non-`object` dtype columns
    relevant_cols = df.columns[~df.dtypes.astype(
        'string').str.endswith(('object'))]
    # limit df to `relevant_cols`
    df = df[relevant_cols]
    
    # create empty dataframe with `relevant_cols` as index/rows
    df_info = pd.DataFrame(index=relevant_cols)

    df_info = df_info.assign(
        mem0=df.memory_usage(deep=True),
        dtype0=df.dtypes.astype('string'),
        defined_values=df.count(),
        unique_values=df.apply(pd.unique, axis=0).apply(len))
    df_info = df_info.assign(
        ratio_unique = (df_info.unique_values/df_info.defined_values).round(2))

    cat_candidates = df_info.loc[df_info.ratio_unique < 0.8, :].loc[df_info.dtype0!='category'].index.to_list()
    # catted_df = udf.make_cats(df.copy(), cat_candidates)
    catted_df = make_cats(df.copy(), cat_candidates)
    
    df_info = df_info.assign(dtype1=catted_df.dtypes, mem1=catted_df.memory_usage(deep=True))
    df_info = df_info.assign(mem_change= df_info.mem1-df_info.mem0)
    print(df_info.sort_values(['mem_change', 'ratio_unique', 'dtype0']).to_markdown())
    mem_improved = df_info.loc[df_info.mem_change < 0, :].index.to_list()
    for c in df.columns[~df.columns.isin(mem_improved)]: 
        print(c, '\t', df.loc[:, c].dtype)
    df.loc[:, mem_improved] = catted_df.loc[:, mem_improved]
    print('Category Converted dataframe:')
    df.info(memory_usage='deep')
    
    return df

In [4]:
ddf = concat_pkls(data_dir=Path('/share/compling/data/sanpi/3_dep_info'), 
                  fname_glob='exactly*hits+deps.pkl.gz', 
                  convert_dtypes=True)


### 12 paths matching exactly*hits+deps.pkl.gz found in /share/compling/data/sanpi/3_dep_info
- ../raised/exactly_apw_neg-raised_hits+deps.pkl.gz
- ../raised/exactly_nyt_neg-raised_hits+deps.pkl.gz
- ../raised/exactly_puddin_neg-raised_hits+deps.pkl.gz
- ../scoped/exactly_apw_with-relay_hits+deps.pkl.gz
- ../scoped/exactly_nyt_with-relay_hits+deps.pkl.gz
- ../scoped/exactly_puddin_with-relay_hits+deps.pkl.gz
- ../contig/exactly_apw_sans-relay_hits+deps.pkl.gz
- ../contig/exactly_nyt_sans-relay_hits+deps.pkl.gz
- ../contig/exactly_puddin_sans-relay_hits+deps.pkl.gz
- ../advadj/exactly_apw_all-RB-JJs_hits+deps.pkl.gz
- ../advadj/exactly_nyt_all-RB-JJs_hits+deps.pkl.gz
- ../advadj/exactly_puddin_all-RB-JJs_hits+deps.pkl.gz


In [5]:
odf = _optimize_df(ddf)

Original Dataframe:
<class 'pandas.core.frame.DataFrame'>
Index: 150437 entries, apw_eng_20030918_0697_20:4-5-8-9 to pcc_eng_09_047.0803_x0745587_19:3-4
Data columns (total 43 columns):
 #   Column             Non-Null Count   Dtype   
---  ------             --------------   -----   
 0   colloc             150437 non-null  string  
 1   sent_text          150437 non-null  string  
 2   nr_form            150437 non-null  category
 3   neg_form           150437 non-null  category
 4   adv_form           150437 non-null  category
 5   adj_form           150437 non-null  string  
 6   hit_text           150437 non-null  string  
 7   text_window        150437 non-null  string  
 8   sent_id            150437 non-null  string  
 9   match_id           150437 non-null  string  
 10  colloc_id          150437 non-null  string  
 11  token_str          150437 non-null  string  
 12  lemma_str          150437 non-null  string  
 13  context_prev_id    150437 non-null  string  
 14  context_p

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, mem_improved] = catted_df.loc[:, mem_improved]


Category Converted dataframe:
<class 'pandas.core.frame.DataFrame'>
Index: 150437 entries, apw_eng_20030918_0697_20:4-5-8-9 to pcc_eng_09_047.0803_x0745587_19:3-4
Data columns (total 31 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   colloc            150437 non-null  category
 1   nr_form           150437 non-null  category
 2   neg_form          150437 non-null  category
 3   adv_form          150437 non-null  category
 4   adj_form          150437 non-null  category
 5   hit_text          150437 non-null  category
 6   text_window       150437 non-null  string  
 7   sent_id           150437 non-null  category
 8   match_id          150437 non-null  category
 9   colloc_id         150437 non-null  category
 10  lemma_str         150437 non-null  category
 11  nr_lemma          150437 non-null  category
 12  neg_lemma         150437 non-null  category
 13  adv_lemma         150437 non-null  category
 14  adj_lemma      

In [6]:
columns = odf.columns[~odf.columns.isin(cols_by_str(odf, start_str=('dep_m', 'dep_n', 'dep_r', 'context')))].to_list()
columns.sort()
odf = odf.loc[:, columns]

In [7]:
len(odf) == len(ddf)

True

In [8]:
# vdf = odf.loc[odf.category != 'advadj', ['category', 'neg_lemma', 'colloc', 'hit_text', 'text_window']]
# vdf, info = balance_sample(vdf, sample_per_value=50, verbose=True)
# print(info)
# vdf

In [9]:
def show_counts(df, columns): 
    return df.value_counts(columns).to_frame().rename(columns={0:'count'})

In [10]:
odf = odf.loc[odf.adv_lemma=='exactly', :]
odf.loc[odf.corpus.str.endswith('puddin'), 'corpus_group'] = 'puddin'
odf.loc[odf.corpus.str.endswith(('nyt', 'apw')), 'corpus_group'] = 'news'
general_counts = show_counts(odf, ['category', 'corpus_group']).unstack().sort_values(('count', 'puddin'), ascending=False) # type: ignore
general_counts

Unnamed: 0_level_0,count,count
corpus_group,news,puddin
category,Unnamed: 1_level_2,Unnamed: 2_level_2
advadj,4917,60971
contig,3500,42908
scoped,78,2750
raised,13,327


In [11]:
len(odf) < len(ddf)

True

*NOTE: ☝️ `odf` is shorter (fewer rows) than original loaded hits because adverbs other than `'exactly'` have been dropped.*

In [12]:
def summarize_text_cols(tdf:pd.DataFrame): 
    
    summary = tdf.describe().transpose()
    summary = summary.assign(top_percent = (((pd.to_numeric(summary.freq) / len(tdf)))*100).round(2))
    summary = summary.rename(columns={'top': 'top_value', 'freq':'top_freq'})
    
    return summary.convert_dtypes().sort_values('unique')

In [13]:
odf = odf.assign(conllu_id=odf.sent_id.str.rsplit('_', 2).str.get(0).str.split('.').str.get(0).astype('string').astype('category')) # type: ignore
tdf = odf[cols_by_str(odf, end_str=('lemma','id', 'text', 'window', 'category', 'Pol')) + ['lemma_str']]
print(f'Total "exactly" hits for all patterns: {len(tdf)}')
summary_tdf = summarize_text_cols(tdf)
summary_tdf

Total "exactly" hits for all patterns: 115464


Unnamed: 0,count,unique,top_value,top_freq,top_percent
adv_lemma,115464,1,exactly,115464,100.0
category,115464,4,advadj,65888,57.06
neg_lemma,115464,14,_,65888,57.06
nr_lemma,115464,17,_,115124,99.71
relay_lemma,115464,622,_,112636,97.55
match_id,115464,978,4-5,9361,8.11
adj_lemma,115464,4240,sure,19181,16.61
conllu_id,115464,6611,pcc_eng_22_037,65,0.06
hit_text,115464,12602,exactly sure,9620,8.33
lemma_str,115464,59565,because we understand that not every purchase ...,1942,1.68


## $PosPol$ context dataset

### Option A
bare collocation tokens (`advadj.all-RB-JJs` pattern match) which do not appear as matches for any other pattern match (i.e. $NegPol$ contexts).

*That is, the `colloc_id` (unique `ADV` & `ADJ` nodes in unique sentence tokens) is not duplicated.*

In [14]:
tdfp_a = tdf.loc[(tdf.category=='advadj') & (~tdf.duplicated(subset='colloc_id', keep=False)), :]

### Option B
categorize $NegPol$ set first (`tdfn`), then compute complement of that (i.e. $ALL - NegPol$)

In [15]:

tdfn = tdf.loc[tdf.neg_lemma!='_', :]
tdfp_b = tdf.loc[~tdf.colloc_id.isin(tdfn.colloc_id), :]
summarize_text_cols(tdfn)


Unnamed: 0,count,unique,top_value,top_freq,top_percent
adv_lemma,49576,1,exactly,49576,100.0
category,49576,3,contig,46408,93.61
neg_lemma,49576,13,not,45771,92.32
nr_lemma,49576,17,_,49236,99.31
relay_lemma,49576,622,_,46748,94.3
match_id,49576,833,3-4-5,7889,15.91
adj_lemma,49576,3743,sure,9548,19.26
conllu_id,49576,5908,pcc_eng_01_001,29,0.06
hit_text,49576,8174,not exactly sure,6420,12.95
text_window,49576,43673,we understand that not every purchase is exact...,971,1.96


In [16]:
all(tdfp_a.index == tdfp_b.index)

True

### Options A and B are identical
so since $NegPol$ is more directly defined, and has to be separated out anyway, it's simpler to just get the "complement", (`tdfp_b` method)

In [17]:
tdfp = tdf.loc[~tdf.colloc_id.isin(tdfn.colloc_id)]
summarize_text_cols(tdfp)

Unnamed: 0,count,unique,top_value,top_freq,top_percent
adv_lemma,16610,1,exactly,16610,100.0
category,16610,1,advadj,16610,100.0
neg_lemma,16610,1,_,16610,100.0
nr_lemma,16610,1,_,16610,100.0
relay_lemma,16610,1,_,16610,100.0
match_id,16610,122,3-4,2074,12.49
adj_lemma,16610,1346,right,6517,39.24
hit_text,16610,1446,exactly right,6450,38.83
conllu_id,16610,4373,pcc_eng_24_085,18,0.11
text_window,16610,13345,That 's exactly right .,454,2.73


In [18]:
print(f'Total `exactly` collocations identified: {odf.category.value_counts()["advadj"]}')
print(f'PosPol: {round(100*len(tdfp)/len(tdf))}% : {len(tdfp)} hits')
print(f'NegPol: {str(round(100*len(tdfn)/len(tdf))).zfill(2)}% : {len(tdfn)} hits')

Total `exactly` collocations identified: 65888
PosPol: 14% : 16610 hits
NegPol: 43% : 49576 hits


In [19]:
tdfp = tdfp.assign(polarity='positive')
select_cols = ['adj_lemma', 'text_window', 'lemma_str'] + cols_by_str(tdfp, end_str=('_id', 'corpus'))
pos_text_info = tdfp.loc[:, select_cols]

In [20]:
tdfn = tdfn.assign(polarity='negative')
tdf_with_overlap = tdf
tdf = pd.concat([tdfp, tdfn]).sort_values('colloc_id')
tdf.to_pickle('/share/compling/projects/sanpi/notebooks/exactly_out/all-exactly-hits_text+polarity.pkl.gz')

In [21]:
freq_dist = pd.crosstab(tdf.adj_lemma, tdf.polarity,
                        margins=True, margins_name='SUM')
freq_dist = freq_dist.assign(neg_ratio=(freq_dist.negative/freq_dist.SUM).round(3),
                             pos_ratio=(freq_dist.positive/freq_dist.SUM).round(3))
freq_dist = freq_dist.assign(neg_bin=freq_dist.neg_ratio.round(1),
                             pos_bin=freq_dist.pos_ratio.round(1))

freq_dist.sort_values('SUM', ascending=False)


polarity,negative,positive,SUM,neg_ratio,pos_ratio,neg_bin,pos_bin
adj_lemma,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
SUM,49576,16610,66186,0.749,0.251,0.7,0.3
sure,9548,118,9666,0.988,0.012,1.0,0.0
right,1769,6517,8286,0.213,0.787,0.2,0.8
alike,2410,1107,3517,0.685,0.315,0.7,0.3
clear,1755,146,1901,0.923,0.077,0.9,0.1
...,...,...,...,...,...,...,...
doughy,1,0,1,1.000,0.000,1.0,0.0
picture-perfect,1,0,1,1.000,0.000,1.0,0.0
dour,0,1,1,0.000,1.000,0.0,1.0
dovish,1,0,1,1.000,0.000,1.0,0.0


In [25]:
tdfp.adj_lemma[~tdfp.adj_lemma.isin(freq_dist.index)].value_counts()
#TODO?? why does it say there is a mismatch between the crosstab and pospol adj set?

-and               0
post-dubstep       0
poster-size        0
poster-children    0
postcard           0
                  ..
first-choice       0
first              0
firmer             0
firm               0
~healthy           0
Name: adj_lemma, Length: 6134, dtype: int64

In [28]:
pos_text_info = pos_text_info.assign(
    neg_ratio = pos_text_info.adj_lemma.apply(lambda a: freq_dist.loc[a, 'neg_ratio'] if a in freq_dist.index else None)) # type: ignore
pos_text_info.sort_values(['neg_ratio', 'conllu_id'], ascending=False)
pos_text_info.to_csv('/share/compling/projects/sanpi/notebooks/exactly_out/pos_sentences.csv')

In [29]:
freq_thresh5 = freq_dist.loc[freq_dist.SUM >= 5, :]
freq_thresh5.sort_values(['neg_bin', 'SUM', 'neg_ratio'], ascending=False).to_csv('/share/compling/projects/sanpi/notebooks/exactly_out/freq_thresh5.csv')

In [30]:
freq_thresh100 = freq_dist.loc[freq_dist.SUM >= 100, :]
freq_thresh100.sort_values(['neg_bin', 'SUM', 'neg_ratio' ], ascending=False).to_csv('/share/compling/projects/sanpi/notebooks/exactly_out/freq_thresh100.csv')

In [31]:
freq_thresh200 = freq_dist.loc[freq_dist.SUM >= 200, :]
freq_thresh200.sort_values(['neg_bin', 'SUM', 'neg_ratio' ], ascending=False).to_csv('/share/compling/projects/sanpi/notebooks/exactly_out/freq_thresh200.csv')

In [32]:
# bsamp, __ = balance_sample(tdf, column_name='polarity', sample_per_value=3)
# bsamp

In [33]:
show_counts(tdf, ['polarity', 'adj_lemma'])

Unnamed: 0_level_0,Unnamed: 1_level_0,count
polarity,adj_lemma,Unnamed: 2_level_1
negative,sure,9548
positive,right,6517
negative,alike,2410
negative,right,1769
negative,clear,1755
negative,...,...
negative,outdoorsy,1
negative,outrageous,1
negative,outre,1
negative,outsize,1


In [34]:
show_counts(tdf, ['neg_lemma', 'nr_lemma', 'relay_lemma'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
neg_lemma,nr_lemma,relay_lemma,Unnamed: 3_level_1
not,_,_,45459
_,_,_,16610
no,_,two,700
no,_,one,355
never,_,_,349
...,...,...,...
no,_,development,1
no,_,designer,1
no,_,designation,1
no,_,description,1


In [35]:
print(show_counts(tdf, ['relay_lemma', 'nr_lemma']).reset_index().to_markdown())

|     | relay_lemma    | nr_lemma   |   count |
|----:|:---------------|:-----------|--------:|
|   0 | _              | _          |   63018 |
|   1 | two            | _          |     701 |
|   2 | one            | _          |     358 |
|   3 | piece          | _          |     190 |
|   4 | _              | think      |     126 |
|   5 | arrangement    | _          |     100 |
|   6 | people         | _          |     100 |
|   7 | _              | look       |      81 |
|   8 | _              | seem       |      58 |
|   9 | case           | _          |      38 |
|  10 | item           | _          |      38 |
|  11 | situation      | _          |      36 |
|  12 | _              | want       |      31 |
|  13 | business       | _          |      30 |
|  14 | patient        | _          |      22 |
|  15 | pair           | _          |      17 |
|  16 | project        | _          |      16 |
|  17 | _              | believe    |      16 |
|  18 | individual     | _          |   

🚩 **zero overlap of defined relay and defined negraiser** so the patterns probably do not allow for this 🤔:


In [36]:
any((tdf.relay_lemma!='_') & (tdf.nr_lemma!='_'))

False

In [37]:
# odf.loc[odf.hit_text=='not every purchase is exactly right', :].sample(1).squeeze()

In [38]:
tdf.loc[tdf.neg_lemma == 'no', ['category', 'hit_text', 'text_window']]

Unnamed: 0_level_0,category,hit_text,text_window
hit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
apw_eng_19950325_0192_28:3-4-6-7,scoped,no one is exactly sure,"actually , no one is exactly sure who will take"
apw_eng_19950809_0936_18:08-13-14,contig,"no , that is n't exactly true","finding out , no , that is n't exactly true . ''"
apw_eng_19971119_0418_15:11-16-17,contig,"No , I 'm not exactly sure","said , `` No , I 'm not exactly sure I 've made"
apw_eng_19990223_1352_7:07-08-10-11,scoped,no one was exactly sure,so far that no one was exactly sure where it l...
apw_eng_20010328_0581_8:02-15-16,contig,"no , the debut film from Mexican director Alej...","so no , the debut film from Mexican director A..."
...,...,...,...
pcc_eng_29_102.3735_x1638023_32:09-10-12-13,scoped,no two are exactly alike,from photos as no two are exactly alike .
pcc_eng_29_105.7814_x1693270_8:1-3-5-6,scoped,No two collars are exactly alike,No two collars are exactly alike !
pcc_eng_test_2.10082_x32416_01:19-21-23-24,scoped,no two lockouts are exactly alike,to remember that no two lockouts are exactly a...
pcc_eng_val_2.05937_x25809_09:09-11-13-14,scoped,no 2 pieces are exactly alike,"characteristics , and no 2 pieces are exactly ..."
