In [1]:
import re
from pathlib import Path

import pandas as pd

from source.utils.dataframes import corners, print_md_table, square_sample, enhance_descrip, transform_counts
from source.utils.general import (FREQ_DIR, PKL_SUFF, SANPI_HOME, RESULT_DIR, confirm_dir,
                                  file_size_round)



In [2]:

# Frequency-filter tag that appears in the names of the frequency-table files.
FRQ_FLAG = 'thr0-001p.35f'
# Table stored under `RBXadj`; registered below under the key `ALL`.
ALL_path = FREQ_DIR / 'RBXadj' / f'all_adj-x-adv_frq-{FRQ_FLAG}=868+.pkl.gz'
# Name -> path registry; the per-category paths are merged into it in a later cell.
data_paths = dict(ALL=ALL_path)


def seek_env_data(dirname='RBdirect',
                  component_1: str = 'adj',
                  component_2: str = 'adv',
                  frq_thresh=FRQ_FLAG,
                  complement: bool = False,
                  ucs_format: bool = False):
    """Locate a pickled joint-frequency table for one pattern category.

    The search directory is ``FREQ_DIR/<dirname>``, optionally extended with
    ``complement/`` and/or ``ucs_format/``. Files are matched with the glob
    ``<prefix>*<c1>-x-<c2>*<frq_thresh>*<PKL_SUFF>``, where ``<prefix>`` is
    ``'diff'`` for complements and ``'ALL'`` otherwise.

    Both component orders are tried: ``component_1-x-component_2`` first, then
    the reverse. The first order that yields a match wins, and among several
    matching files the alphabetically first one is returned, so the result is
    deterministic.

    Args:
        dirname: sub-directory of ``FREQ_DIR`` to search.
        component_1: first component label used in the file name.
        component_2: second component label used in the file name.
        frq_thresh: frequency-filter tag expected in the file name.
        complement: search the ``complement`` sub-directory (``diff*`` files).
        ucs_format: search the ``ucs_format`` sub-directory.

    Returns:
        The matching path, or ``None`` when the directory is missing or no
        file matches. A status line is printed in either case.
    """
    _dir = FREQ_DIR.joinpath(dirname)
    if complement:
        _dir = _dir / 'complement'
    if ucs_format:
        _dir = _dir / 'ucs_format'

    path = None
    if _dir.is_dir():
        prefix = 'diff' if complement else 'ALL'
        for cross_flag in (f'{component_1}-x-{component_2}',
                           f'{component_2}-x-{component_1}'):
            # `glob` yields entries in arbitrary order; sort for a stable choice
            matches = sorted(
                _dir.glob(f'{prefix}*{cross_flag}*{frq_thresh}*{PKL_SUFF}'))
            if matches:
                path = matches[0]
                # keep the requested orientation if it exists
                break
    else:
        print(f'Search directory does not exist: {_dir}')

    if not path:
        print(
            f'No bigram context frequency data found in {_dir.relative_to(RESULT_DIR)}!')
    else:
        env_label = str(_dir.relative_to(FREQ_DIR)).replace('/', '-')
        print(f"+ Path to {env_label} {component_1.capitalize()} & {component_2.capitalize()} joint frequencies: \\\n  `{path.relative_to(RESULT_DIR)}`")
    return path

In [3]:
# Pattern categories that each have their own frequency directory.
pat_cats = ('RBdirect', 'NEGmirror', 'POSmirror', 'ANYmirror')
# Category name -> path of its table (one status line is printed per lookup).
pat_rslt_paths = dict(zip(pat_cats, map(seek_env_data, pat_cats)))
# Same lookup inside each category's `complement` directory, keyed `<category>_complement`.
complement_paths = {
    f'{category}_complement': seek_env_data(category, complement=True)
    for category in pat_cats
}

+ Path to RBdirect Adj & Adv joint frequencies: \
  `freq_out/RBdirect/ALL-WORDS_adj-x-adv_thr0-001p.35f.pkl.gz`
+ Path to NEGmirror Adj & Adv joint frequencies: \
  `freq_out/NEGmirror/ALL-WORDS_adj-x-adv_thr0-001p.35f.pkl.gz`
+ Path to POSmirror Adj & Adv joint frequencies: \
  `freq_out/POSmirror/ALL-WORDS_adj-x-adv_thr0-001p.35f.pkl.gz`
+ Path to ANYmirror Adj & Adv joint frequencies: \
  `freq_out/ANYmirror/ALL-WORDS_adj-x-adv_thr0-001p.35f.pkl.gz`
+ Path to RBdirect-complement Adj & Adv joint frequencies: \
  `freq_out/RBdirect/complement/diff_all-RBdirect_adj-x-adv_frq-thr0-001p.35f=868+.pkl.gz`
+ Path to NEGmirror-complement Adj & Adv joint frequencies: \
  `freq_out/NEGmirror/complement/diff_all-NEGmirror_adj-x-adv_frq-thr0-001p.35f=868+.pkl.gz`
+ Path to POSmirror-complement Adj & Adv joint frequencies: \
  `freq_out/POSmirror/complement/diff_all-POSmirror_adj-x-adv_frq-thr0-001p.35f=868+.pkl.gz`
+ Path to ANYmirror-complement Adj & Adv joint frequencies: \
  `freq_out/ANYmir

+ Path to RBdirect Adj & Adv joint frequencies: \
  `freq_out/RBdirect/ALL-WORDS_adj-x-adv_thr0-001p.35f.pkl.gz`
+ Path to NEGmirror Adj & Adv joint frequencies: \
  `freq_out/NEGmirror/ALL-WORDS_adj-x-adv_thr0-001p.35f.pkl.gz`
+ Path to POSmirror Adj & Adv joint frequencies: \
  `freq_out/POSmirror/ALL-WORDS_adj-x-adv_thr0-001p.35f.pkl.gz`
+ Path to ANYmirror Adj & Adv joint frequencies: \
  `freq_out/ANYmirror/ALL-WORDS_adj-x-adv_thr0-001p.35f.pkl.gz`
+ Path to RBdirect-complement Adj & Adv joint frequencies: \
  `freq_out/RBdirect/complement/diff_all-RBdirect_adj-x-adv_frq-thr0-001p.35f=868+.pkl.gz`
+ Path to NEGmirror-complement Adj & Adv joint frequencies: \
  `freq_out/NEGmirror/complement/diff_all-NEGmirror_adj-x-adv_frq-thr0-001p.35f=868+.pkl.gz`
+ Path to POSmirror-complement Adj & Adv joint frequencies: \
  `freq_out/POSmirror/complement/diff_all-POSmirror_adj-x-adv_frq-thr0-001p.35f=868+.pkl.gz`
+ Path to ANYmirror-complement Adj & Adv joint frequencies: \
  `freq_out/ANYmirror/complement/diff_all-ANYmirror_adj-x-adv_frq-thr0-001p.35f=868+.pkl.gz`


In [4]:
# Merge every located path into the registry, then index it by dataset name.
for found_paths in (pat_rslt_paths, complement_paths):
    data_paths.update(found_paths)
data_paths = pd.Series(data_paths)

# One row per dataset: back-ticked path relative to the results root + rounded file size.
relative_paths = data_paths.apply(lambda p: f'`{p.relative_to(RESULT_DIR)}`')
file_sizes = data_paths.apply(lambda x: file_size_round(x.stat().st_size))
path_table = (relative_paths
              .to_frame(f'path (_relative to `{RESULT_DIR}/`_)')
              .assign(file_size=file_sizes))
table_str = print_md_table(path_table, suppress=True)

# Rewrite the final column's `|:---|` alignment marker as `|---:|` before printing.
print(re.sub(r'\|\:(\-+)\|\n', r'|\1:|\n', table_str))

|                      | path (_relative to `/share/compling/projects/sanpi/results/`_)                             | file_size   |
|:---------------------|:-------------------------------------------------------------------------------------------|------------:|
| ALL                  | `freq_out/RBXadj/all_adj-x-adv_frq-thr0-001p.35f=868+.pkl.gz`                              | 1.3 MB      |
| RBdirect             | `freq_out/RBdirect/ALL-WORDS_adj-x-adv_thr0-001p.35f.pkl.gz`                               | 224.2 KB    |
| NEGmirror            | `freq_out/NEGmirror/ALL-WORDS_adj-x-adv_thr0-001p.35f.pkl.gz`                              | 88.6 KB     |
| POSmirror            | `freq_out/POSmirror/ALL-WORDS_adj-x-adv_thr0-001p.35f.pkl.gz`                              | 245.1 KB    |
| ANYmirror            | `freq_out/ANYmirror/ALL-WORDS_adj-x-adv_thr0-001p.35f.pkl.gz`                              | 256.7 KB    |
| RBdirect_complement  | `freq_out/RBdirect/complement/diff_all-RBdirect_adj

|                      | path (_relative to `/share/compling/projects/sanpi/results/`_)                             | file_size |
|:---------------------|:-------------------------------------------------------------------------------------------|----------:|
| ALL                  | `freq_out/RBXadj/all_adj-x-adv_frq-thr0-001p.35f=868+.pkl.gz`                              |    1.3 MB |
| RBdirect             | `freq_out/RBdirect/ALL-WORDS_adj-x-adv_thr0-001p.35f.pkl.gz`                               |  224.2 KB |
| NEGmirror            | `freq_out/NEGmirror/ALL-WORDS_adj-x-adv_thr0-001p.35f.pkl.gz`                              |   88.6 KB |
| POSmirror            | `freq_out/POSmirror/ALL-WORDS_adj-x-adv_thr0-001p.35f.pkl.gz`                              |  245.1 KB |
| ANYmirror            | `freq_out/ANYmirror/ALL-WORDS_adj-x-adv_thr0-001p.35f.pkl.gz`                              |  256.7 KB |
| RBdirect_complement  | `freq_out/RBdirect/complement/diff_all-RBdirect_adj-x-adv_frq-thr0-001p.35f=868+.pkl.gz`   |    1.2 MB |
| NEGmirror_complement | `freq_out/NEGmirror/complement/diff_all-NEGmirror_adj-x-adv_frq-thr0-001p.35f=868+.pkl.gz` |    1.2 MB |
| POSmirror_complement | `freq_out/POSmirror/complement/diff_all-POSmirror_adj-x-adv_frq-thr0-001p.35f=868+.pkl.gz` |    1.2 MB |
| ANYmirror_complement | `freq_out/ANYmirror/complement/diff_all-ANYmirror_adj-x-adv_frq-thr0-001p.35f=868+.pkl.gz` |    1.2 MB |



In [5]:
# Collect everything known about each dataset in one table indexed by dataset name:
#   path       - location of the pickled table
#   frame      - the unpickled object itself
#   complement - True when the dataset name ends with `_complement`
#   mirror     - True when the dataset name contains `mirror`
#   category   - dataset name with `_complement` removed
# NOTE: unpickling can execute arbitrary code; only load files from a trusted location.
data = (
    data_paths
    .to_frame('path')
    .assign(
        frame=lambda d: d['path'].apply(pd.read_pickle),
        complement=lambda d: d.index.str.endswith('_complement').astype('bool'),
        mirror=lambda d: d.index.str.contains('mirror').astype('bool'),
        category=lambda d: d.index.str.replace('_complement', ''),
    )
)

In [6]:
# Show the assembled table: one row per dataset (path, loaded frame, flags, category).
data

Unnamed: 0,path,frame,complement,mirror,category
ALL,/share/compling/projects/sanpi/results/freq_ou...,adv_form_lower SUM very mo...,False,False,ALL
RBdirect,/share/compling/projects/sanpi/results/freq_ou...,adv_form_lower SUM very mo...,False,False,RBdirect
NEGmirror,/share/compling/projects/sanpi/results/freq_ou...,adv_form_lower SUM very mo...,False,True,NEGmirror
POSmirror,/share/compling/projects/sanpi/results/freq_ou...,adv_form_lower SUM very mo...,False,True,POSmirror
ANYmirror,/share/compling/projects/sanpi/results/freq_ou...,adv_form_lower SUM very mo...,False,True,ANYmirror
RBdirect_complement,/share/compling/projects/sanpi/results/freq_ou...,adv_form_lower SUM very mo...,True,False,RBdirect
NEGmirror_complement,/share/compling/projects/sanpi/results/freq_ou...,adv_form_lower SUM very mo...,True,True,NEGmirror
POSmirror_complement,/share/compling/projects/sanpi/results/freq_ou...,adv_form_lower SUM very mo...,True,True,POSmirror
ANYmirror_complement,/share/compling/projects/sanpi/results/freq_ou...,adv_form_lower SUM very mo...,True,True,ANYmirror


In [7]:
def print_sq_sample(name: str,
                    df: pd.DataFrame,
                    n: int = 4,
                    upper_limit: int = 10):
    """Print a small markdown sample drawn from the top-left part of `df`.

    Only the first ``1/upper_limit`` of the rows and of the columns is passed
    to `square_sample`, together with the requested size `n`. The resulting
    table is rendered with `print_md_table` and printed under a markdown
    heading.

    Args:
        name: label shown in the heading.
        df: table to sample from.
        n: sample size handed to `square_sample`.
        upper_limit: divisor applied to both dimensions of `df` to decide how
            many leading rows/columns are eligible.

    Returns:
        None; the heading and table are written to stdout.
    """
    upper_range = [int(df.shape[ax]/upper_limit) for ax in (0, 1)]
    _df = square_sample(df.copy().iloc[:upper_range[0], :upper_range[1]], n)
    _df.index.name = ''
    # `\\%` emits a literal backslash + percent sign; the previous `\%` relied
    # on an invalid escape sequence, which newer Python versions warn about.
    title = (f'\n## `{name}` {n} × {n} Sample\n\n_semi-random selection '
             f'(from top ${(1/upper_limit*100)}\\%$) + `SUM`_\n')
    sample_str = print_md_table(_df, suppress=True)
    # Wrap space-delimited lowercase words in `_..._` (markdown italics):
    # the inner substitution adds the leading underscore, the outer one the
    # trailing underscore. ` SUM ` is then put in back-ticks.
    print(title,
          re.sub(r'([a-z]) ', r'\1_ ',
                 re.sub(r' ([a-z])', r' _\1', sample_str)
                 ).replace(' SUM ', ' `SUM` '),
          sep='\n')


# Print one sample per dataset, in alphabetical order of dataset name.
# A plain loop is used because `print_sq_sample` returns nothing: calling it
# through `Series.apply` only produced a Series of `None` as extra cell output.
for dataset_name in sorted(data.index):
    print_sq_sample(name=dataset_name, df=data.frame[dataset_name])


## `ALL` 4 × 4 Sample

_semi-random selection (from top $10.0\%$) + `SUM`_

|           |        `SUM` |      _less_ |   _definitely_ |   _remarkably_ |   _socially_ |
|:----------|-----------:|----------:|-------------:|-------------:|-----------:|
| `SUM`       | 83,284,343 | 1,256,870 |      105,862 |       71,432 |     68,090 |
| _creative_  |     88,015 |       866 |           46 |          132 |         14 |
| _rewarding_ |     66,861 |       500 |          120 |           37 |         44 |
| _direct_    |     46,025 |     1,843 |            2 |           37 |          0 |
| _illegal_   |     43,799 |       150 |          101 |            2 |          1 |


## `ANYmirror` 4 × 4 Sample

_semi-random selection (from top $10.0\%$) + `SUM`_

|           |       `SUM` |   _completely_ |   _otherwise_ |   _clearly_ |   _surprisingly_ |
|:----------|----------:|-------------:|------------:|----------:|---------------:|
| `SUM`       | 1,960,936 |       29,469 |       8,614 |     2,233 

ALL                     None
ANYmirror               None
ANYmirror_complement    None
NEGmirror               None
NEGmirror_complement    None
POSmirror               None
POSmirror_complement    None
RBdirect                None
RBdirect_complement     None
dtype: object


## `ALL` 4 × 4 Sample

_semi-random selection (from top $10.0\%$) + `SUM`_

|              |        `SUM` |        _so_ |     _quite_ |   _rather_ |   _entirely_ |
|:-------------|-----------:|----------:|----------:|---------:|-----------:|
| `SUM`          | 83,284,343 | 5,735,964 | 1,065,022 |  382,971 |    291,839 |
| _cold_         |    103,167 |    10,404 |     1,815 |      915 |         32 |
| _professional_ |     74,579 |     1,754 |       286 |       51 |        120 |
| _same_         |     56,073 |        56 |        73 |        4 |          9 |
| _thick_        |     49,433 |     7,505 |     1,468 |      550 |          0 |


## `ANYmirror` 4 × 4 Sample

_semi-random selection (from top $10.0\%$) + `SUM`_

|           |       `SUM` |   _less_ |   _that_ |   _actually_ |   _about_ |
|:----------|----------:|-------:|-------:|-----------:|--------:|
| `SUM`       | 1,960,936 | 29,030 |  7,441 |      5,069 |   1,569 |
| _cool_      |     5,634 |     28 |     11 |          5 |       9 |
| _wonderful_ |     2,833 |     18 |      7 |          3 |       2 |
| _quiet_     |     1,934 |      8 |      5 |          6 |       1 |
| _narrow_    |       927 |      3 |      0 |          0 |       0 |


## `ANYmirror_complement` 4 × 4 Sample

_semi-random selection (from top $10.0\%$) + `SUM`_

|         |        `SUM` |      _much_ |   _relatively_ |   _significantly_ |   _virtually_ |
|:--------|-----------:|----------:|-------------:|----------------:|------------:|
| `SUM`     | 81,323,407 | 1,959,836 |      606,431 |         136,021 |      85,046 |
| _big_     |    347,331 |       286 |          837 |              19 |           1 |
| _busy_    |    187,745 |       206 |          223 |               8 |           0 |
| _worried_ |    115,459 |       695 |            4 |              10 |           0 |
| _cheap_   |     80,902 |       257 |        7,905 |              12 |           0 |


## `NEGmirror` 4 × 4 Sample

_semi-random selection (from top $10.0\%$) + `SUM`_

|             |     `SUM` |   _quite_ |   _probably_ |   _somewhat_ |   _surprisingly_ |
|:------------|--------:|--------:|-----------:|-----------:|---------------:|
| `SUM`         | 285,435 |   6,203 |         53 |         16 |             11 |
| _complex_     |     611 |       1 |          0 |          0 |              0 |
| _terrible_    |     362 |       1 |          0 |          0 |              0 |
| _competitive_ |     245 |       1 |          0 |          0 |              0 |
| _extensive_   |      86 |       0 |          0 |          0 |              0 |


## `NEGmirror_complement` 4 × 4 Sample

_semi-random selection (from top $10.0\%$) + `SUM`_

|       |        `SUM` |      _more_ |   _extremely_ |    _yet_ |   _mentally_ |
|:------|-----------:|----------:|------------:|-------:|-----------:|
| `SUM`   | 82,998,908 | 9,243,662 |     975,860 | 96,680 |     80,905 |
| _aware_ |    318,445 |    34,259 |         433 |    928 |         78 |
| _fine_  |    158,763 |     1,174 |       1,636 |     14 |         13 |
| _cheap_ |     82,984 |       888 |       1,486 |     23 |          0 |
| _huge_  |     39,430 |       581 |         325 |     12 |          0 |


## `POSmirror` 4 × 4 Sample

_semi-random selection (from top $10.0\%$) + `SUM`_

|             |       `SUM` |    _very_ |   _now_ |   _always_ |   _longer_ |
|:------------|----------:|--------:|------:|---------:|---------:|
| `SUM`         | 1,675,501 | 189,186 | 5,482 |    4,347 |    3,087 |
| _complicated_ |     5,449 |     510 |     0 |        0 |        0 |
| _bigger_      |     4,319 |       0 |     6 |        8 |        0 |
| _negative_    |     1,933 |     224 |     0 |       14 |        0 |
| _upset_       |     1,545 |     341 |     3 |        7 |        0 |


## `POSmirror_complement` 4 × 4 Sample

_semi-random selection (from top $10.0\%$) + `SUM`_

|            |        `SUM` |    _that_ |   _easily_ |   _otherwise_ |   _terribly_ |
|:-----------|-----------:|--------:|---------:|------------:|-----------:|
| `SUM`        | 81,608,842 | 246,398 |  117,353 |      89,673 |     64,578 |
| _small_      |    389,446 |     524 |       17 |          85 |         89 |
| _amazing_    |    170,520 |     582 |        2 |          89 |         13 |
| _rare_       |    129,085 |     495 |        0 |          80 |         51 |
| _affordable_ |    124,276 |      39 |      828 |          16 |         10 |


## `RBdirect` 4 × 4 Sample

_semi-random selection (from top $10.0\%$) + `SUM`_

|                 |       `SUM` |   _generally_ |   _about_ |   _both_ |   _partly_ |
|:----------------|----------:|------------:|--------:|-------:|---------:|
| `SUM`             | 3,148,010 |       2,748 |   1,314 |    349 |       71 |
| _straightforward_ |     4,743 |           6 |       0 |      0 |        0 |
| _honest_          |     2,563 |           1 |       0 |      1 |        0 |
| _predictable_     |     1,988 |           3 |       1 |      0 |        0 |
| _extensive_       |     1,494 |           0 |       0 |      0 |        0 |


## `RBdirect_complement` 4 × 4 Sample

_semi-random selection (from top $10.0\%$) + `SUM`_

|              |        `SUM` |      _very_ |        _so_ |      _also_ |   _perfectly_ |
|:-------------|-----------:|----------:|----------:|----------:|------------:|
| `SUM`          | 80,136,333 | 9,722,389 | 5,394,914 | 1,106,341 |     186,318 |
| _recent_       |    401,264 |     5,556 |       605 |        37 |           1 |
| _conservative_ |    104,229 |     9,986 |       650 |       122 |          12 |
| _normal_       |     87,321 |     4,159 |     1,838 |       596 |       8,578 |
| _subject_      |     51,278 |       204 |       219 |     4,283 |           2 |



In [8]:
def collect_margin_stats(df: pd.DataFrame, 
                         transpose: bool=False) -> pd.DataFrame:
    """Summarize the `SUM` margin of a frequency table with `enhance_descrip`.

    Args:
        df: table with a `SUM` column and, when `transpose` is used, a `SUM`
            row. The axis that ends up as the index must be named, because
            that name supplies the label of the summarized column
            (`form_lower` is replaced with `stats`).
        transpose: if True, summarize the `SUM` row (indexed by the column
            labels of `df`) instead of the `SUM` column.

    Returns:
        The transposed result of `enhance_descrip` for the selected margin.

    Raises:
        KeyError: if the required `SUM` label is missing.
        AttributeError: if the resulting index has no name.
    """
    # Select the margin first and transpose only that single row; this gives
    # the same one-column frame as transposing the whole table, without
    # copying the full table.
    if transpose:
        sv = df.loc[['SUM']].T
    else:
        sv = df[['SUM']]
    # `rename` returns a new object, so the caller's table is never modified.
    return enhance_descrip(sv.rename(columns={'SUM': sv.index.name.replace('form_lower', 'stats')})).T

# for ms in [collect_margin_stats(data.sample(1).frame.squeeze(), transpose=b) for b in (True, False)]:
#     print_md_table(ms)


In [9]:

# Add one column of margin summaries per orientation (transposed first, then as-is).
# Each new column is named after the single column of its first summary table.
for use_transpose in (True, False):
    margin = data.frame.apply(collect_margin_stats, transpose=use_transpose)
    data[margin.iat[0].columns[0]] = margin


In [10]:
# Store a transformed version of every frequency table next to the raw one.
# NOTE(review): the `sqrt_` prefix suggests a square-root transform; confirm in `transform_counts`.
data['sqrt_frame'] = data['frame'].apply(transform_counts)

In [11]:
# Same margin summaries as for the raw tables, computed on the transformed
# tables and stored under `*_sqrt` column names so the raw ones are kept.
for use_transpose in (True, False):
    margin = data.sqrt_frame.apply(collect_margin_stats, transpose=use_transpose)
    margin = margin.apply(
        lambda m: m.rename(columns={'adv_stats': 'adv_stats_sqrt',
                                    'adj_stats': 'adj_stats_sqrt'}))
    data[margin.iat[0].columns[0]] = margin


In [12]:
# Check that the frame, flag, and margin-summary columns are all present and non-null.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9 entries, ALL to ANYmirror_complement
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   path            9 non-null      object
 1   frame           9 non-null      object
 2   complement      9 non-null      bool  
 3   mirror          9 non-null      bool  
 4   category        9 non-null      object
 5   adv_stats       9 non-null      object
 6   adj_stats       9 non-null      object
 7   sqrt_frame      9 non-null      object
 8   adv_stats_sqrt  9 non-null      object
 9   adj_stats_sqrt  9 non-null      object
dtypes: bool(2), object(8)
memory usage: 966.0+ bytes


In [13]:
# Margin summaries (every column whose name contains `stats`) of the `ALL`
# dataset; used as the baseline in the per-category comparison.
stats_columns = data.copy().filter(like='stats')
all_stats = stats_columns.loc['ALL', :]

In [14]:
# For every category except `ALL`, print side-by-side margin summaries:
# the `ALL` baseline followed by each dataset that belongs to the category.
for category, cd in data.copy().groupby('category'):
    if category != 'ALL':
        print(f'\n## `{category}` statistical comparison of evaluation')
        # first the columns ending in `stats`, then those ending in `sqrt`
        for suffix_pattern in (r'stats$', r'sqrt$'):
            cat_stats = cd.filter(regex=suffix_pattern)
            for col in cat_stats.columns:
                # relabel each summary column with the dataset it describes
                table = all_stats[col].rename(columns={col: 'ALL'})
                for ix in cat_stats.index:
                    member_stats = cat_stats.loc[ix, col].rename(columns={col: ix})
                    table = table.join(member_stats)
                print_md_table(table, n_dec=2, title=f"\n### `{category}`: `{col.replace('_stats', '')}`\n")



## `ANYmirror` statistical comparison of evaluation

### `ANYmirror`: `adv`

|              |            ALL |    ANYmirror |   ANYmirror_complement |
|:-------------|---------------:|-------------:|-----------------------:|
| unique_forms |       1,006.00 |     1,006.00 |               1,006.00 |
| mean         |     165,575.20 |     3,898.50 |             161,676.80 |
| std          |   2,686,912.10 |    63,582.50 |           2,623,756.10 |
| min          |         868.00 |         1.00 |                 783.00 |
| 25%          |       1,705.00 |        38.00 |               1,665.20 |
| median       |       4,053.00 |        85.50 |               3,950.00 |
| 75%          |      15,773.80 |       322.50 |              15,542.50 |
| max          |  83,284,343.00 | 1,960,936.00 |          81,323,407.00 |
| total        | 166,568,686.00 | 3,921,872.00 |         162,646,814.00 |
| var_coeff    |          16.20 |        16.30 |                  16.20 |
| range        |  83,283,475.00 | 

In [18]:
# all_stats.index.to_series().apply(
#     lambda ix: print_md_table(
#         all_stats.iloc[:, -2][ix].join(_stats.iloc[:, -1][ix]), 
#         title=f'\n`{ix}` margin stats\n', 
#         n_dec=2))
