In [16]:
import pandas as pd

from source.utils import FREQ_DIR
from source.utils.associate import (BINARY_ASSOC_ARGS, add_extra_am,
                                    associate_ucs, confirm_basic_ucs)
from source.utils.associate import convert_ucs_to_csv as ucs2csv
from source.utils.associate import get_associations_csv as init_am
from source.utils.associate import manipulate_ucs, seek_readable_ucs

Info for `init_am()`:

The `BINARY_ASSOC_ARGS` tuple follows the same requirments as the arguments for `./script/polar_assoc.py`: 

options:
  | -short             | --long                          | description                                                                                                                                                                                                                                                                                                                        | default value for script                                                                                                     |
  |--------------------|---------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------|
  | -h,                | --help                          | show this help message and exit                                                                                                                                                                                                                                                                                                    |                                                                                                                              |
  | -m MIN_FREQ,       | --min_freq MIN_FREQ             | Minimum frequency of co-occurrences included as rows (everything is still included in the marginal frequencies) in association tables                                                                                                                                                                                              | `500`                                                                                                                          |
  | -a ALL_COUNTS,     | --all_counts ALL_COUNTS         | path to ucs formatted .tsv of all bigram combinations, regardless of polarity                                                                                                                                                                                                                                                      | `/share/compling/projects/sanpi/results/freq_out/RBXadj/ucs_format/AdvAdj_frq-thrMIN-7.35f.tsv`                                |
  | -c COMPARE_COUNTS, | --compare_counts COMPARE_COUNTS | Path to ucs-formatted .tsv of COMPARISON bigram frequencies; e.g. counts for bigram tokens with **no** *identified* negation dependencies. (An approximation of bigrams occurring in 'positive polarity' environments.) The transformed frequency data will be saved as `polarity_prepped_tsv/[COMP_LABEL]_bigram_counts[DATA_SUFFIX]` | `/share/compling/projects/sanpi/results/freq_out/RBdirect/complement/ucs_format/diff_all-RBdirect_AdvAdj_frq-thrMIN-7.35f.tsv` |
  | -C COMP_LABEL,     | --comp_label COMP_LABEL         | Option to set the label for comparison (set difference, not negated, 'positive', etc.) counts. Used for output path generation.                                                                                                                                                                                                    | `complement`                                                                                                                   |
  | -n TARGET_COUNTS,  | --target_counts TARGET_COUNTS   | Path to ucs-formatted .tsv of NEGATED bigram frequencies; i.e. counts for bigram tokens with *identified* negation dependencies. (An approximation of bigrams occurring in 'negative polarity' environments.) The transformed frequency data will be saved as `polarity_prepped_tsv/[NEG_LABEL]_bigram_counts[DATA_SUFFIX]`        | `/share/compling/projects/sanpi/results/freq_out/RBdirect/ucs_format/AdvAdj_frq-thrMIN-7.35f.tsv`                              |
  | -N TARG_LABEL,     | --targ_label TARG_LABEL         | Option to set the label for target counts; Used to generate output path(s) and set the `l1` values for contained `l2` values.                                                                                                                                                                                                      | `negated`                                                                                                                      |
  | -s DATA_SUFFIX,    | --data_suffix DATA_SUFFIX       | Option to indicate specific starting data set as restricted by number of corpus parts/files and frequency threshold                                                                                                                                                                                                                | `.35f-7c.tsv`                                                                                                                  |
  | -S,                | --skew                          | Option to collect skewed lexemes for selected association metrics.                                                                                                                                                                                                                                                                 | `False`                                                                                                                        |
  | -v,                | --verbose                       | Option to print more processing info to stdout                                                                                                                                                                                                                                                                                     | `False`                                                                                                                        |



1. Run `seek_readable_ucs()` to generate consistent output path

    ```python
    def seek_readable_ucs(min_freq: int,
                        target_counts_dir: Path = None,
                        data_suffix: str = '.35f-7c.tsv',
                        unit: str = '',
                        is_polar: bool = True, 
                        contained_counts_path: Path = None,
                        ucs_subdir:str=None) -> Path:
    ```  
    Provide values for:
    - `ucs_subdir`: e.g. `'trigger_eval'`\
      ↪️ this will become a subdir of `./results/ucs/` and will be repeated in `./results/assoc_df/`
    - `contained_counts_path`
    - `min_freq`                


In [10]:
MIR_TRIG_ADV_TSV = FREQ_DIR.joinpath('ANYmirror/ucs_format/TrigAdv_frq-thrMIN-7.35f.tsv')
FRQ_FLOOR = 50

In [21]:
readable = seek_readable_ucs(min_freq=FRQ_FLOOR, ucs_subdir='trigger_eval', 
                                   contained_counts_path=MIR_TRIG_ADV_TSV)
print(readable)
! ls -ho {readable}

    > seeking `trigger_eval/ANYmirror/readable/TrigAdv_frq-thrMIN-7.35f_min50x*` frequency data and initial associations...
/share/compling/projects/sanpi/results/ucs/trigger_eval/ANYmirror/readable/TrigAdv_frq-thrMIN-7.35f_min50x.rsort-view_am-only.csv
-rw-r--r-- 1 arh234 356K May 23 20:54 /share/compling/projects/sanpi/results/ucs/trigger_eval/ANYmirror/readable/TrigAdv_frq-thrMIN-7.35f_min50x.rsort-view_am-only.csv





2. Run `confirm_basic_ucs()`

    ```python
    def confirm_basic_ucs(basic_ucs_path: Path,
                          freq_floor:int=100,
                          contained_counts_path:Path = None,
                          args: BINARY_ASSOC_ARGS = None,
                          unit: str = None):
        if args: 
            contained_counts_path = args.all_counts
        if basic_ucs_path.is_file():
            print('+ existing UCS table found ✓')
        elif unit:
            basic_ucs_path = confirm_polarized_ucs(basic_ucs_path, args, unit)
        elif contained_counts_path and contained_counts_path.is_file():
            build_ucs_table(min_count=freq_floor,
                            ucs_save_path=basic_ucs_path,
                            cat_tsv_str=f'cat {contained_counts_path}')
        else:
            raise FileNotFoundError
        return basic_ucs_path
    ```


In [12]:
basic_ucs_path = confirm_basic_ucs(basic_ucs_path, freq_floor=FRQ_FLOOR, contained_counts_path=MIR_TRIG_ADV_TSV)

+ existing UCS table found ✓


3. Run `associate_ucs()`


In [13]:
associate_ucs(basic_ucs_path)


Calculating UCS associations...

```
bash /share/compling/projects/sanpi/script/transform_ucs.sh /share/compling/projects/sanpi/results/ucs/trigger_eval/ANYmirror/readable/TrigAdv_frq-thrMIN-7.35f_min50x.rsort-view_am-only.txt
> log will be saved to: /share/compling/projects/sanpi/logs/associate/ucs//ucs-readable_TrigAdv_frq-thrMIN-7-35f_min50x.2024-05-23_2051.log
...
```

+ time elapsed → 00:00:13.212


4. Run `ucs_to_csv`

In [14]:
csv_path = ucs2csv(basic_ucs_path)
print(csv_path)

UCS table text converted & saved as /share/compling/projects/sanpi/results/ucs/trigger_eval/ANYmirror/readable/TrigAdv_frq-thrMIN-7.35f_min50x.rsort-view_am-only.csv
/share/compling/projects/sanpi/results/ucs/trigger_eval/ANYmirror/readable/TrigAdv_frq-thrMIN-7.35f_min50x.rsort-view_am-only.csv


5. Load ucs csv as dataframe


In [20]:
! head {csv_path} | column -t -s ','

#     Frequency  signatures  computed   by      the      ucs-make-tables  tool               for  relational  cooccurrences.
#     Sample     size:       N          =       1761853  tokens           V                  =    23125       pair            types.
#     A          frequency   threshold  of      f        >=               50                 was  applied     leaving         V        =    1797     pair    types.
##::  size       =           1797
##::  threshold  =           50
id    l1         l2          f          f1      f2       N                E11                O11  O12         O21             O22      C1   C2       R1      R2       am_log_likelihood  am_odds_ratio_disc  am_p1_given2       am_p2_given1         r_log_likelihood  r_odds_ratio_disc  r_p1_given2  r_p2_given1
829   never      before      284        109723  288      1761853          17.9357891946717   284  109439      4               1652126  288  1761565  109723  1652130  1535.90576785475   2.97973895406291    

In [17]:
trig_adv_amdf = pd.read_csv(csv_path)
trig_adv_amdf

ParserError: Error tokenizing data. C error: Expected 13 fields in line 3, saw 17


6. Save to `./results/assoc_df/`
7. Add additional AM via `add_extra_am()`