# Inspecting pickled `exactly` hits

## Imports

In [1]:
import pandas as pd
from pathlib import Path

## Define helper functions

### Copied from `./source/analyze/utils/`
jupyter won't import them 🤷‍♀️

In [2]:
def find_files(data_dir: Path, fname_glob: str, verbose: bool = False):
    path_iter = data_dir.rglob(fname_glob)
    if verbose:
        path_iter = tuple(path_iter)
        print_iter(
            [f'../{p.relative_to(data_dir)}' for p in path_iter], bullet='-',
            header=f'### {len(path_iter)} paths matching {fname_glob} found in {data_dir}')
    return path_iter

In [3]:
def print_iter(iter_obj,
               bullet: str = '▸',
            #//    logger: logging.Logger = None,
            #//    level: int = 20,
               header: str = ''):

    bullet_str = f'\n{bullet} '

    iter_str = bullet_str.join(f'{i}' for i in iter_obj)

    msg_str = f'\n{header}{bullet_str}{iter_str}'
    msg_str = msg_str.replace('\n\n', '\n').strip(f'{bullet} ')

    print(msg_str)

In [4]:
def cols_by_str(df: pd.DataFrame, start_str=None, end_str=None) -> list:
    if end_str:
        cols = df.columns[df.columns.str.endswith(end_str)]
        if start_str:
            cols = cols[cols.str.startswith(start_str)]
    elif start_str:
        cols = df.columns[df.columns.str.startswith(start_str)]
    else:
        cols = df.columns

    return cols.to_list()

In [5]:
def make_cats(orig_df:pd.DataFrame, columns: list = None) -> pd.DataFrame:
    df = orig_df.copy()
    if columns is None:
        cat_suff = ("code", "name", "path", "stem")
        columns = df.columns.str.endswith(cat_suff) # type: ignore
    df.loc[:, columns] = df.loc[:, columns].astype(
        'string').fillna('_').astype('category')

    return df

In [6]:
def balance_sample(full_df: pd.DataFrame,
                   column_name: str = 'category',
                   sample_per_value: int = 5,
                   verbose: bool = False) -> tuple:
    '''
    create sample with no more than n rows satisfying each unique value
    of the given column. A value of -1 for `sample_per_value` will limit
    all values' results to the minimum count per value.
    '''
    info_message = ''
    subsamples = []
    for __, col_val_df in full_df.groupby(column_name):
        # take sample if 1+ and less than length of full dataframe
        if len(col_val_df) > sample_per_value > 0:
            subsample_df = col_val_df.sample(sample_per_value)
            subsamples.append(subsample_df)
        else: 
            subsamples.append(col_val_df)

    # > trim all "by column" sub dfs to length of shortest if -1 given
    if sample_per_value == -1:
        trim_len = int(min(len(sdf) for sdf in subsamples))
        subsamples = [sdf.sample(trim_len)
                       for sdf in subsamples]

    b_sample = pd.concat(subsamples)

    if verbose:
        subset_info_table = (
            b_sample
            .value_counts(subset=column_name)
            .to_frame(name='count')
            .assign(percentage=b_sample
                    .value_counts(column_name, normalize=True)
                    .round(2) * 100)
            .to_markdown())
        label = (full_df.hits_df_pkl[0].stem + ' '
                 if 'hits_df_pkl' in full_df.columns
                 else '')
        info_message = (f'\n## {column_name} representation in {label}sample\n'
                        + subset_info_table)

    return b_sample, info_message

In [7]:
def concat_pkls(data_dir: Path = Path('/share/compling/data/sanpi/2_hit_tables'),
                fname_glob: str = '*.pkl.gz',
                pickles=None,
                convert_dtypes=False,
                verbose: bool = True) -> pd.DataFrame:
    if not pickles:
        pickles = find_files(Path(data_dir), fname_glob, verbose)

    # tested and found that it is faster to assign `corpus` intermittently
    df = pd.concat((pd.read_pickle(p).assign(corpus=p.stem.rsplit('_', 2)[0])
                    for p in pickles))

    dup_check_cols = cols_by_str(df, end_str=('text', 'id', 'sent'))
    df = (df.loc[~df.duplicated(subset = dup_check_cols), :])
    df = df.convert_dtypes()
    df = make_cats(df, (['corpus'] + cols_by_str(df, start_str=('nr', 'neg', 'adv'),
                                                 end_str=('lemma', 'form'))))

    return df

### copied from `./source/analyze_deps.py`:

In [8]:
def _optimize_df(df:pd.DataFrame) -> pd.DataFrame: 
    
    # print('Original Dataframe:')
    # df.info(memory_usage='deep')
    
    #> drop unneeded string columns
    # was:
    #   for c in udf.cols_by_str(df, start_str=('context', 'text', 'sent_text', 'token')):
    for c in cols_by_str(df, start_str=('context', 'token', 'utt')):
        df.pop(c)
        
    #> select only non-`object` dtype columns
    relevant_cols = df.columns[~df.dtypes.astype(
        'string').str.endswith(('object'))]
    # limit df to `relevant_cols`
    df = df[relevant_cols]
    
    #> create empty dataframe with `relevant_cols` as index/rows
    df_info = pd.DataFrame(index=relevant_cols)

    df_info = df_info.assign(
        mem0=df.memory_usage(deep=True),
        dtype0=df.dtypes.astype('string'),
        defined_values=df.count(),
        unique_values=df.apply(pd.unique, axis=0).apply(len))
    df_info = df_info.assign(
        ratio_unique = (df_info.unique_values/df_info.defined_values).round(2))

    cat_candidates = df_info.loc[df_info.ratio_unique < 0.8, :].loc[df_info.dtype0!='category'].index.to_list()
    #was: catted_df = udf.make_cats(df.copy(), cat_candidates)
    catted_df = make_cats(df.copy(), cat_candidates)
    
    df_info = df_info.assign(dtype1=catted_df.dtypes, mem1=catted_df.memory_usage(deep=True))
    df_info = df_info.assign(mem_change= df_info.mem1-df_info.mem0)
    print(df_info.sort_values(['mem_change', 'ratio_unique', 'dtype0']).to_markdown())
    mem_improved = df_info.loc[df_info.mem_change < 0, :].index.to_list()
    for c in df.columns[~df.columns.isin(mem_improved)]: 
        print(c, '\t', df.loc[:, c].dtype)
    df.loc[:, mem_improved] = catted_df.loc[:, mem_improved]
    print('Category Converted dataframe:')
    df.info(memory_usage='deep')
    
    return df

### Newly created

In [9]:
def show_counts(df, columns): 
    return df.value_counts(columns).to_frame().rename(columns={0:'count'})

In [10]:
def summarize_text_cols(tdf:pd.DataFrame): 
    
    summary = tdf.describe().transpose()
    summary = summary.assign(top_percent = (((pd.to_numeric(summary.freq) / len(tdf)))*100).round(2))
    summary = summary.rename(columns={'top': 'top_value', 'freq':'top_freq'})
    
    return summary.convert_dtypes().sort_values('unique')

## Load Data

In [11]:
ddf = concat_pkls(
    data_dir=Path('/share/compling/data/sanpi/3_dep_info'),
    fname_glob='exactly*hits+deps.pkl.gz',
    convert_dtypes=True)


### 12 paths matching exactly*hits+deps.pkl.gz found in /share/compling/data/sanpi/3_dep_info
- ../raised/exactly_apw_neg-raised_hits+deps.pkl.gz
- ../raised/exactly_nyt_neg-raised_hits+deps.pkl.gz
- ../raised/exactly_puddin_neg-raised_hits+deps.pkl.gz
- ../scoped/exactly_apw_with-relay_hits+deps.pkl.gz
- ../scoped/exactly_nyt_with-relay_hits+deps.pkl.gz
- ../scoped/exactly_puddin_with-relay_hits+deps.pkl.gz
- ../contig/exactly_apw_sans-relay_hits+deps.pkl.gz
- ../contig/exactly_nyt_sans-relay_hits+deps.pkl.gz
- ../contig/exactly_puddin_sans-relay_hits+deps.pkl.gz
- ../advadj/exactly_apw_all-RB-JJs_hits+deps.pkl.gz
- ../advadj/exactly_nyt_all-RB-JJs_hits+deps.pkl.gz
- ../advadj/exactly_puddin_all-RB-JJs_hits+deps.pkl.gz


### Optimize DataFrame

In [12]:
odf = _optimize_df(ddf)

|                  |     mem0 | dtype0   |   defined_values |   unique_values |   ratio_unique | dtype1   |     mem1 |   mem_change |
|:-----------------|---------:|:---------|-----------------:|----------------:|---------------:|:---------|---------:|-------------:|
| sent_text        | 29901545 | string   |           150437 |           78376 |           0.52 | category | 17628415 |    -12273130 |
| lemma_str        | 29759457 | string   |           150437 |           78190 |           0.52 | category | 17540971 |    -12218486 |
| dep_str_mask_rel | 11599471 | string   |           150437 |             291 |           0    | category |   337911 |    -11261560 |
| dep_str_rel      | 14388422 | string   |           150437 |           26315 |           0.17 | category |  3891252 |    -10497170 |
| dep_str_mask     |  9774606 | string   |           150437 |             190 |           0    | category |   319579 |     -9455027 |
| category         |  9477531 | string   |           150437 | 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, mem_improved] = catted_df.loc[:, mem_improved]


Category Converted dataframe:
<class 'pandas.core.frame.DataFrame'>
Index: 150437 entries, apw_eng_20030918_0697_20:4-5-8-9 to pcc_eng_09_047.0803_x0745587_19:3-4
Data columns (total 31 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   colloc            150437 non-null  category
 1   sent_text         150437 non-null  category
 2   nr_form           150437 non-null  category
 3   neg_form          150437 non-null  category
 4   adv_form          150437 non-null  category
 5   adj_form          150437 non-null  category
 6   hit_text          150437 non-null  category
 7   text_window       150437 non-null  string  
 8   sent_id           150437 non-null  category
 9   match_id          150437 non-null  category
 10  colloc_id         150437 non-null  category
 11  lemma_str         150437 non-null  category
 12  nr_lemma          150437 non-null  category
 13  neg_lemma         150437 non-null  category
 14  adv_lemma      

In [13]:
columns = odf.columns[~odf.columns.isin(cols_by_str(odf, start_str=('dep_m', 'dep_n', 'dep_r', 'context')))].to_list()
columns.sort()
odf = odf.loc[:, columns]
len(odf) == len(ddf)

True

### Limit to `exactly` adverbs *only*

In [14]:
odf = odf.loc[odf.adv_lemma=='exactly', :]
odf.loc[odf.corpus.str.endswith('puddin'), 'corpus_group'] = 'puddin'
odf.loc[odf.corpus.str.endswith(('nyt', 'apw')), 'corpus_group'] = 'news'
general_counts = show_counts(odf, ['category', 'corpus_group']).unstack().sort_values(('count', 'puddin'), ascending=False) # type: ignore
general_counts

Unnamed: 0_level_0,count,count
corpus_group,news,puddin
category,Unnamed: 1_level_2,Unnamed: 2_level_2
advadj,4917,60971
contig,3500,42908
scoped,78,2750
raised,13,327


*NOTE:* 👇 *`odf` is shorter (fewer rows) than original loaded hits because adverbs other than `'exactly'` have been dropped.*

In [15]:
len(odf) < len(ddf)

True

### Add `conllu_id` and drop unused columns

In [16]:
odf = odf.assign(conllu_id=odf.sent_id.str.rsplit('_', 2).str.get(0).str.split('.').str.get(0).astype('string').astype('category')) # type: ignore
tdf = odf[cols_by_str(odf, end_str=('lemma','id', 'text', 'window', 'category', 'Pol')) + cols_by_str(odf, start_str=('corpus', 'lemma'))]
print(f'Total "exactly" hits for all patterns: {len(tdf)}')
summary_tdf = summarize_text_cols(tdf)
summary_tdf

Total "exactly" hits for all patterns: 115464


Unnamed: 0,count,unique,top_value,top_freq,top_percent
adv_lemma,115464,1,exactly,115464,100.0
corpus_group,115464,2,puddin,106956,92.63
corpus,115464,3,exactly_puddin,106956,92.63
category,115464,4,advadj,65888,57.06
neg_lemma,115464,14,_,65888,57.06
nr_lemma,115464,17,_,115124,99.71
relay_lemma,115464,622,_,112636,97.55
match_id,115464,978,4-5,9361,8.11
adj_lemma,115464,4240,sure,19181,16.61
conllu_id,115464,6611,pcc_eng_22_037,65,0.06


## Identify $PosPol$ and $NegPol$ contexts

### Option A
bare collocation tokens (`advadj.all-RB-JJs` pattern match) which do not appear as matches for any other pattern match (i.e. $NegPol$ contexts).

*That is, the `colloc_id` (unique `ADV` & `ADJ` nodes in unique sentence tokens) is not duplicated.*

```{python}
tdfp_a = tdf.loc[(tdf.category=='advadj') & (~tdf.duplicated(subset='colloc_id', keep=False)), :]
```

### Option B
categorize $NegPol$ set first (`tdfn`), then compute complement of that (i.e. $ALL - NegPol$)

```{python}
tdfn = tdf.loc[tdf.neg_lemma!='_', :]
tdfp = tdf.loc[~tdf.colloc_id.isin(tdfn.colloc_id), :]
```

### Options A and B are identical

`all(tdfp_a.index == tdfp_b.index)` evaluates as true

So, since $NegPol$ is more directly defined, and has to be separated out anyway, it's simpler to just get the "complement", (`tdfp_b` method)

In [17]:
tdfn = tdf.loc[tdf.neg_lemma!='_', :]
summarize_text_cols(tdfn)

Unnamed: 0,count,unique,top_value,top_freq,top_percent
adv_lemma,49576,1,exactly,49576,100.0
corpus_group,49576,2,puddin,45985,92.76
category,49576,3,contig,46408,93.61
corpus,49576,3,exactly_puddin,45985,92.76
neg_lemma,49576,13,not,45771,92.32
nr_lemma,49576,17,_,49236,99.31
relay_lemma,49576,622,_,46748,94.3
match_id,49576,833,3-4-5,7889,15.91
adj_lemma,49576,3743,sure,9548,19.26
conllu_id,49576,5908,pcc_eng_01_001,29,0.06


In [18]:
tdfp = tdf.loc[~tdf.colloc_id.isin(tdfn.colloc_id), :]
summarize_text_cols(tdfp)

Unnamed: 0,count,unique,top_value,top_freq,top_percent
adv_lemma,16610,1,exactly,16610,100.0
category,16610,1,advadj,16610,100.0
neg_lemma,16610,1,_,16610,100.0
nr_lemma,16610,1,_,16610,100.0
relay_lemma,16610,1,_,16610,100.0
corpus_group,16610,2,puddin,15249,91.81
corpus,16610,3,exactly_puddin,15249,91.81
match_id,16610,122,3-4,2074,12.49
adj_lemma,16610,1346,right,6517,39.24
hit_text,16610,1446,exactly right,6450,38.83


## Assign `polarity` and recombine

In [19]:
tdfp = tdfp.assign(polarity='positive')
tdfn = tdfn.assign(polarity='negative')
tdf_with_overlap = tdf
pol_union_df = pd.concat([tdfp, tdfn]).sort_values('colloc_id')

print(f'Total bare `exactly ADJ` collocations: {odf.category.value_counts()["advadj"]}')
print(f'Total `exactly` pattern hits (+ NegPol pattern overlap): {len(pol_union_df)}')
print(f'PosPol: {round(100*len(tdfp)/len(pol_union_df))}% : {len(tdfp)} hits')
print(f'NegPol: {str(round(100*len(tdfn)/len(pol_union_df))).zfill(2)}% : {len(tdfn)} hits')

Total bare `exactly ADJ` collocations: 65888
Total `exactly` pattern hits (+ NegPol pattern overlap): 66186
PosPol: 25% : 16610 hits
NegPol: 75% : 49576 hits


### Pickle Pos+Neg Dataframe

In [20]:
pol_union_df.to_pickle('/share/compling/projects/sanpi/notebooks/exactly_out/all-exactly-hits_text+polarity.pkl.gz')

## Frequency by Polarity

### Simple Frequency Comparison

In [21]:
top_200 = show_counts(pol_union_df, ['polarity', 'adj_lemma']).head(200)
top_200

Unnamed: 0_level_0,Unnamed: 1_level_0,count
polarity,adj_lemma,Unnamed: 2_level_1
negative,sure,9548
positive,right,6517
negative,alike,2410
negative,right,1769
negative,clear,1755
negative,...,...
negative,fast,37
negative,prolific,37
negative,fashionable,37
negative,analogous,37


In [22]:
spread_top_200 = top_200.unstack(level='polarity', fill_value=0)
spread_top_200.sort_values(('count', 'negative'), ascending=False).head(30) # type: ignore

Unnamed: 0_level_0,count,count
polarity,negative,positive
adj_lemma,Unnamed: 1_level_2,Unnamed: 2_level_2
sure,9548,118
alike,2410,1107
right,1769,6517
clear,1755,146
true,1538,279
new,1439,53
easy,1142,38
cheap,717,0
happy,468,0
surprising,454,0


In [23]:
spread_top_200.sort_values(('count', 'positive'), ascending=False).head(30) # type: ignore

Unnamed: 0_level_0,count,count
polarity,negative,positive
adj_lemma,Unnamed: 1_level_2,Unnamed: 2_level_2
right,1769,6517
alike,2410,1107
correct,280,539
equal,77,501
opposite,0,498
same,50,487
wrong,180,472
zero,0,336
similar,55,335
true,1538,279


### Crosstabulate adjective by context polarity

In [24]:
freq_dist = pd.crosstab(pol_union_df.adj_lemma,
                        pol_union_df.polarity,
                        margins=True, margins_name='TOTAL')

freq_dist = freq_dist.assign(
    ratio_neg=(freq_dist.negative/freq_dist.TOTAL).round(3),
    ratio_pos=(freq_dist.positive/freq_dist.TOTAL).round(3))

freq_dist = freq_dist.assign(
    bin_neg=freq_dist.ratio_neg.round(1),
    bin_pos=freq_dist.ratio_pos.round(1))

cols = freq_dist.columns.to_list()
cols.pop(cols.index('TOTAL'))
freq_dist = freq_dist[['TOTAL'] + cols]

freq_dist.sort_values('TOTAL', ascending=False)

polarity,TOTAL,negative,positive,ratio_neg,ratio_pos,bin_neg,bin_pos
adj_lemma,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
TOTAL,66186,49576,16610,0.749,0.251,0.7,0.3
sure,9666,9548,118,0.988,0.012,1.0,0.0
right,8286,1769,6517,0.213,0.787,0.2,0.8
alike,3517,2410,1107,0.685,0.315,0.7,0.3
clear,1901,1755,146,0.923,0.077,0.9,0.1
...,...,...,...,...,...,...,...
doughy,1,1,0,1.000,0.000,1.0,0.0
picture-perfect,1,1,0,1.000,0.000,1.0,0.0
dour,1,0,1,0.000,1.000,0.0,1.0
dovish,1,1,0,1.000,0.000,1.0,0.0


In [25]:
freq_thresh5 = freq_dist.loc[freq_dist.TOTAL >= 5, :]
freq_thresh5 = freq_thresh5.sort_values(['bin_neg', 'TOTAL', 'ratio_neg'], ascending=False)
freq_thresh5.to_csv('/share/compling/projects/sanpi/notebooks/exactly_out/freq_thresh5.csv')
freq_thresh5

polarity,TOTAL,negative,positive,ratio_neg,ratio_pos,bin_neg,bin_pos
adj_lemma,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
sure,9666,9548,118,0.988,0.012,1.0,0.0
new,1492,1439,53,0.964,0.036,1.0,0.0
easy,1180,1142,38,0.968,0.032,1.0,0.0
cheap,733,717,16,0.978,0.022,1.0,0.0
happy,490,468,22,0.955,0.045,1.0,0.0
...,...,...,...,...,...,...,...
unchanged,6,0,6,0.000,1.000,0.0,1.0
contradictory,5,0,5,0.000,1.000,0.0,1.0
one-,5,0,5,0.000,1.000,0.0,1.0
select,5,0,5,0.000,1.000,0.0,1.0


In [26]:
freq_thresh100 = freq_dist.loc[freq_dist.TOTAL >= 100, :]
freq_thresh100 = freq_thresh100.sort_values(['bin_neg', 'TOTAL', 'ratio_neg'], ascending=False)
freq_thresh100.to_csv('/share/compling/projects/sanpi/notebooks/exactly_out/freq_thresh100.csv')
freq_thresh100

polarity,TOTAL,negative,positive,ratio_neg,ratio_pos,bin_neg,bin_pos
adj_lemma,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
sure,9666,9548,118,0.988,0.012,1.0,0.0
new,1492,1439,53,0.964,0.036,1.0,0.0
easy,1180,1142,38,0.968,0.032,1.0,0.0
cheap,733,717,16,0.978,0.022,1.0,0.0
happy,490,468,22,0.955,0.045,1.0,0.0
...,...,...,...,...,...,...,...
same,537,50,487,0.093,0.907,0.1,0.9
similar,390,55,335,0.141,0.859,0.1,0.9
zero,354,18,336,0.051,0.949,0.1,0.9
enough,307,43,264,0.140,0.860,0.1,0.9


In [27]:
freq_thresh200 = freq_dist.loc[freq_dist.TOTAL >= 200, :]
freq_thresh200 = freq_thresh200.sort_values(['bin_neg', 'TOTAL', 'ratio_neg'], ascending=False)
freq_thresh200.to_csv('/share/compling/projects/sanpi/notebooks/exactly_out/freq_thresh200.csv')
freq_thresh200

polarity,TOTAL,negative,positive,ratio_neg,ratio_pos,bin_neg,bin_pos
adj_lemma,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
sure,9666,9548,118,0.988,0.012,1.0,0.0
new,1492,1439,53,0.964,0.036,1.0,0.0
easy,1180,1142,38,0.968,0.032,1.0,0.0
cheap,733,717,16,0.978,0.022,1.0,0.0
happy,490,468,22,0.955,0.045,1.0,0.0
surprising,457,454,3,0.993,0.007,1.0,0.0
subtle,284,274,10,0.965,0.035,1.0,0.0
fun,243,238,5,0.979,0.021,1.0,0.0
thrilled,233,223,10,0.957,0.043,1.0,0.0
conducive,219,213,6,0.973,0.027,1.0,0.0


## Save $PosPol$ text data

Since $PosPol$ is defined as the complement of $NegPol$, accuracy relies on $NegPol$ catching all relevant cases.

To ensure pattern specifications are sufficiently inclusive, all sentences with supposedly positive polarity
should be manually inspected for any errant (uncaught) negative lemmas, as identified in the $NegPol$ pattern specifications.

```{ocaml}
NEG [lemma="hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"];  
```

- [x] create simplified output of $PosPol$/`tdfp` sentence text data, with necessary identifiers
- [x] save as csv
- [ ] grep `pos_sentences.csv` for each neg lemma:\
  *There should not be any negative lemmas an `exactly JJ` collocation in its scope.*


In [None]:
select_cols = ['adj_lemma', 'text_window', 'sent_text',
               'sent_id', 'conllu_id', 'corpus']
pos_text_info = tdfp.loc[:, select_cols]
pos_text_info = pos_text_info.assign(
    adj_neg_ratio = pos_text_info.adj_lemma.apply(lambda a: freq_dist.loc[a, 'ratio_neg'] if a in freq_dist.index else None)) # type: ignore
pos_text_info.sort_values(['adj_neg_ratio', 'conllu_id'], ascending=False)
pos_text_info.to_csv('/share/compling/projects/sanpi/notebooks/exactly_out/pos_sentences.csv')

# tdfp.adj_lemma[~tdfp.adj_lemma.isin(freq_dist.index)].value_counts()
#TODO?? why does it say there is a mismatch between the crosstab and pospol adj set?
#^ 🤔 probably something to do with categorical dtype