In [1]:
# coding=utf-8
from pathlib import Path

import pandas as pd

from source.utils import (PKL_SUFF, SAMPLE_ADV, print_iter, print_md_table,
                          timestamp_today)
from source.utils.associate import AM_DF_DIR, adjust_assoc_columns
from source.utils.general import print_iter

# Notebook-wide pandas display options: cap the number of columns shown,
# widen the text output, and render floats with thousands separators and
# two decimal places.
pd.set_option('display.max_columns', 12)
pd.set_option('display.width', 250)
pd.set_option("display.float_format", '{:,.2f}'.format)

# Columns selected from the association-measure table when it is shown with
# `nb_show_table` (the last entry, 'l2', is sliced off at the call site).
FOCUS = ['f', 'E11', 'unexpected_f',
         'am_p1_given2', 'conservative_log_ratio',
         'am_log_likelihood',
        #  'mutual_information', 'am_odds_ratio_disc', 't_score',
         'N', 'f1', 'f2', 'l1', 'l2']



In [18]:
def nb_show_table(df: pd.DataFrame, n_dec: int = 2,
                  adjust_columns: bool = True) -> pd.DataFrame:
    """Print `df` as a markdown table and return the formatted copy.

    All formatting is applied to a copy; `df` itself is left untouched.

    Args:
        df: table to render.
        n_dec: number of decimal places used for float cells.
        adjust_columns: when True, the copy is first passed through
            `adjust_assoc_columns`.

    Returns:
        The formatted copy: every column label wrapped in backticks and every
        index label wrapped in `**`. Because a frame is returned, a notebook
        cell that ends with this call also renders it; end the line with `;`
        to show only the printed markdown.
    """
    _df = df.copy()
    if adjust_columns:
        _df = adjust_assoc_columns(_df)
    _df.columns = [f'`{c}`' for c in _df.columns]
    _df.index = [f'**{r}**' for r in _df.index]
    print('\n' + _df.to_markdown(floatfmt=f',.{n_dec}f', intfmt=',') + '\n')
    return _df

Set global parameters

In [2]:
# Adverb whose rows are filtered out of each association table below.
ADV = 'exactly'
# Frequency floors embedded in the table file names (`..._min{floor}x...`):
# SET_FLOOR is used for the `RBdirect` directory, MIR_FLOOR for any other.
SET_FLOOR = 5000
MIR_FLOOR = 200

In [3]:
POLAR_DIR = AM_DF_DIR.joinpath('polar')
# NOTE(review): not referenced by any cell visible here; confirm before removing.
polar_adv_dirs = []
#> example of intended: `AM_DF_DIR`/polar/RBdirect/adv/extra/polarized-adv_35f-7c_min2000x_extra.pkl.gz
#> example of intended: `AM_DF_DIR`/polar/NEGmirror/adv/extra/polarized-adv_MIRROR_polarized.35f-7c_min1000x_extra.pkl.gz


def _locate_adv_am_table(polar_subdir):
    """Return the first adverb table found in `polar_subdir`/adv/extra.

    The frequency floor expected in the file name is `SET_FLOOR` for the
    `RBdirect` directory and `MIR_FLOOR` for every other directory.

    Raises:
        FileNotFoundError: if no file matches; the message names the pattern
            and the directory that was searched.
    """
    floor = SET_FLOOR if polar_subdir.name == "RBdirect" else MIR_FLOOR
    pattern = f'*35f-7c_min{floor}x*{PKL_SUFF}'
    search_dir = polar_subdir.joinpath('adv/extra')
    # Only the first match (in glob order) is used, so stop after one hit.
    found = next(search_dir.glob(pattern), None)
    if found is None:
        raise FileNotFoundError(
            f'No file matching "{pattern}" found in {search_dir}')
    return found


# One table path per polarity directory; plain files sitting next to those
# directories are skipped instead of aborting the lookup.
adv_am_paths = {
    p.name: _locate_adv_am_table(p)
    for p in POLAR_DIR.iterdir() if p.is_dir()}

# diff = pd.read_pickle(adv_am_paths['RBdirect'])
# mirr = pd.read_pickle(adv_am_paths['NEGmirror'])
def update_index(df: pd.DataFrame) -> pd.DataFrame:
    """Relabel the `NEG` part of the row keys to name the comparison type.

    The `l1` value of the first row whose key contains `NEG` decides the new
    label: `NEGmirror` if that value ends with `MIR`, otherwise `NEGdiff`.
    Every occurrence of the literal text `NEG` in the row keys is replaced.

    The index of `df` is reassigned in place, and the same object is returned.

    Raises:
        IndexError: if no row key contains `NEG`.
    """
    # `.iloc[0]` picks the first matching row by position. The keys are
    # strings, so a bare `[0]` would be a label lookup that only works through
    # pandas' deprecated positional fallback.
    neg = df.filter(like='NEG', axis=0).l1.iloc[0]
    index_update = 'NEGmirror' if neg.endswith('MIR') else 'NEGdiff'
    # Literal replacement: do not depend on the pandas version's regex default.
    df.index = df.index.str.replace('NEG', index_update, regex=False)
    return df

# Load both polarity tables and relabel their row keys right away.
adv_amdf_dict = {
    table_name: update_index(pd.read_pickle(adv_am_paths[table_name]))
    for table_name in ('RBdirect', 'NEGmirror')
}
# pd.concat((d.sample(5) for d in adv_amdf_dict.values()))

In [4]:
# Rows of the mirror table whose key contains ADV.
adv_amdf_dict['NEGmirror'].filter(like=ADV, axis=0)

Unnamed: 0_level_0,l1,l2,f,E11,am_log_likelihood,am_odds_ratio_disc,...,conservative_log_ratio_05,conservative_log_ratio_nc,conservative_log_ratio_dv,f_sqrt,f1_sqrt,f2_sqrt
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
NEGmirror~exactly,NEGMIR,exactly,813,161.15,1939.47,1.2,...,3.56,3.72,3.57,28.51,542.18,33.38
POS~exactly,POSMIR,exactly,301,952.84,-1939.4,-1.2,...,-3.56,-3.72,-3.57,17.35,1318.37,33.38


In [22]:
# Stack the ADV rows of every loaded table into a single frame.
exactly_am = pd.concat([df.filter(like=ADV, axis=0) for df in adv_amdf_dict.values()])
# FOCUS[:-1] leaves out the last FOCUS entry ('l2'). nb_show_table prints the
# markdown and also returns the formatted frame, so the cell displays both.
nb_show_table(exactly_am[FOCUS[:-1]])


|                       |    `f` |   `exp_f` |   `unexp_f` |   `dP1` |   `LRC` |        `G2` |        `N` |       `f1` |   `f2` | `l1`       |
|:----------------------|-------:|----------:|------------:|--------:|--------:|------------:|-----------:|-----------:|-------:|:-----------|
| **RBdirect~exactly**  | 43,635 |  2,301.98 |   41,333.02 |    0.67 |    5.90 |  214,404.20 | 86,330,752 |  3,226,213 | 61,599 | NEGATED    |
| **COM~exactly**       | 17,964 | 59,295.24 |  -41,331.24 |   -0.67 |   -5.90 | -214,337.11 | 86,330,752 | 83,102,035 | 61,599 | COMPLEMENT |
| **NEGmirror~exactly** |    813 |    161.15 |      651.85 |    0.59 |    3.51 |    1,939.47 |  2,032,082 |    293,963 |  1,114 | NEGMIR     |
| **POS~exactly**       |    301 |    952.84 |     -651.84 |   -0.59 |   -3.51 |   -1,939.40 |  2,032,082 |  1,738,105 |  1,114 | POSMIR     |



Unnamed: 0,`f`,`exp_f`,`unexp_f`,`dP1`,`LRC`,`G2`,`N`,`f1`,`f2`,`l1`
**RBdirect~exactly**,43635,2301.98,41333.02,0.67,5.9,214404.2,86330752,3226213,61599,NEGATED
**COM~exactly**,17964,59295.24,-41331.24,-0.67,-5.9,-214337.11,86330752,83102035,61599,COMPLEMENT
**NEGmirror~exactly**,813,161.15,651.85,0.59,3.51,1939.47,2032082,293963,1114,NEGMIR
**POS~exactly**,301,952.84,-651.84,-0.59,-3.51,-1939.4,2032082,1738105,1114,POSMIR


In [6]:
# Columns whose name contains 'given' (the precomputed conditional measures).
exactly_am.filter(like='given')

Unnamed: 0_level_0,am_p1_given2,am_p2_given1,am_p1_given2_simple,am_p2_given1_simple
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RBdirect~exactly,0.67,0.01,0.71,0.01
COM~exactly,-0.67,-0.01,0.29,0.0
NEGmirror~exactly,0.59,0.0,0.73,0.0
POS~exactly,-0.59,-0.0,0.27,0.0


$\Delta P(l_2|l_1) = a/(a+b) - c/(c+d)$

- `deltaP2_given1 = (f / f1) - ((f2 - f) / (N - f1))`

$\Delta P(l_1|l_2) = a/(a+c) - b/(b+d)$

- `deltaP1_given2 = (f / f2) - ((f1 - f) / (N - f2))`

- a=w1 & w2: joint frequency 
  - `a = f`
- b=w1 & NOT w2 
  - `b = f1 - f`
- c=NOT w1 & w2 
  - `c = f2 - f`
- d=NEITHER w{1,2} 
  - `d = O22`
  - `d = N - a - b - c`
  - `d = N - f1 - (f2 - f)` 
  - `d = N - f1 - f2 + f`
- a+b=all with w1: w1 marginal frequency 
  - `a + b = f1`    
- c+d=all WITHOUT w1: N - w1 marginal frequency 
  - `c + d = N - f1`
- a+c=all with w2 : w2 marginal freq 
  - `a + c = f2`
- b+d=all WITHOUT w2 
  - `b + d = N - f2`
- a+b+c+d=N (all tokens considered)

In [7]:
def deltaP(row: pd.Series, given_margin: str = 'f2', ratio: bool = False,
           verbose: bool = True) -> float:
    """Compute delta-P for one association row, optionally printing each step.

    With `a = f`, the two directions are:

        deltaP1_given2 = (f / f2) - ((f1 - f) / (N - f2))
        deltaP2_given1 = (f / f1) - ((f2 - f) / (N - f1))

    Args:
        row: Series with the fields `f`, `f1`, `f2` and `N`. Its `name` is
            used only for the printed labels and is read as `<l1>~<l2>`.
        given_margin: margin that is conditioned on. `'f2'` gives the l1-given-l2
            direction; any other value is treated as the l2-given-l1 direction
            and is used as the field name for the conditioning margin.
        ratio: when True, return `P(focus|given) / P(focus|~given)` instead of
            the difference.
        verbose: when True (default), print the input frequencies and every
            intermediate value. Pass False to compute silently.

    Returns:
        The difference (or ratio) of the two conditional probabilities.
    """
    focus_margin = 'f1' if given_margin == 'f2' else 'f2'

    # Split the key once and pick the two labels by position. Deriving the
    # second label by deleting the first one from the key corrupts it whenever
    # one label is contained in the other (e.g. `not~nothing`).
    l1_label, _, l2_label = str(row.name).partition('~')
    if given_margin.endswith('2'):
        focus, given = l1_label, l2_label
    else:
        focus, given = l2_label, l1_label

    if verbose:
        print(row.filter(items=['f', 'f1', 'f2', 'N']).to_frame().T.to_markdown(intfmt=','))

    f = row['f']  #> a
    fp = row[focus_margin]
    fg = row[given_margin]

    # P(focus | given)
    cond_p = f / fg
    if verbose:
        print(f'\nP({focus}|{given}) => {f:,} / {fg:,} = {round(cond_p, 3):.3f}')

    # P(focus | NOT given): focus tokens outside the joint cell (b or c)
    # over all tokens outside the given margin.
    adjust_num = fp - f
    adjust_denom = row['N'] - fg
    adjust = adjust_num / adjust_denom
    if verbose:
        print(f'  {"/" if ratio else "-"} P({focus}|~{given}) => {adjust_num:,} / {adjust_denom:,} = {round(adjust, 3):.3f}')

    result = cond_p / adjust if ratio else cond_p - adjust
    if verbose:
        print(f'= {round(result, 3):,.3f}\n')
    return result


## $\Delta P$ as defined ($with - without$ difference)

### $\Delta P(1|2) = P(\texttt{env}|\texttt{adv}) - P(\texttt{env}|\neg\texttt{adv})$

In [8]:
# deltaP defaults (given_margin='f2', ratio=False): difference form,
# conditioned on the f2 margin.
env_given_exactly = exactly_am.apply(deltaP, axis=1)
env_given_exactly

|                  |      f |        f1 |     f2 |          N |
|:-----------------|-------:|----------:|-------:|-----------:|
| RBdirect~exactly | 43,635 | 3,226,213 | 61,599 | 86,330,752 |

P(RBdirect|exactly) => 43,635 / 61,599 = 0.708
  - P(RBdirect|~exactly) => 3,182,578 / 86,269,153 = 0.037
= 0.671

|             |      f |         f1 |     f2 |          N |
|:------------|-------:|-----------:|-------:|-----------:|
| COM~exactly | 17,964 | 83,102,035 | 61,599 | 86,330,752 |

P(COM|exactly) => 17,964 / 61,599 = 0.292
  - P(COM|~exactly) => 83,084,071 / 86,269,153 = 0.963
= -0.671

|                   |   f |      f1 |    f2 |         N |
|:------------------|----:|--------:|------:|----------:|
| NEGmirror~exactly | 813 | 293,963 | 1,114 | 2,032,082 |

P(NEGmirror|exactly) => 813 / 1,114 = 0.730
  - P(NEGmirror|~exactly) => 293,150 / 2,030,968 = 0.144
= 0.585

|             |   f |        f1 |    f2 |         N |
|:------------|----:|----------:|------:|----------:|
| POS~exact

key
RBdirect~exactly     0.67
COM~exactly         -0.67
NEGmirror~exactly    0.59
POS~exactly         -0.59
dtype: float64

### $\Delta P(2|1) = P(\texttt{adv}|\texttt{env}) - P(\texttt{adv}|\neg\texttt{env})$

In [9]:
# Same difference form, but conditioned on the f1 margin.
exactly_given_env = exactly_am.apply(deltaP, given_margin='f1', axis=1)
exactly_given_env

|                  |      f |        f1 |     f2 |          N |
|:-----------------|-------:|----------:|-------:|-----------:|
| RBdirect~exactly | 43,635 | 3,226,213 | 61,599 | 86,330,752 |

P(exactly|RBdirect) => 43,635 / 3,226,213 = 0.014
  - P(exactly|~RBdirect) => 17,964 / 83,104,539 = 0.000
= 0.013

|             |      f |         f1 |     f2 |          N |
|:------------|-------:|-----------:|-------:|-----------:|
| COM~exactly | 17,964 | 83,102,035 | 61,599 | 86,330,752 |

P(exactly|COM) => 17,964 / 83,102,035 = 0.000
  - P(exactly|~COM) => 43,635 / 3,228,717 = 0.014
= -0.013

|                   |   f |      f1 |    f2 |         N |
|:------------------|----:|--------:|------:|----------:|
| NEGmirror~exactly | 813 | 293,963 | 1,114 | 2,032,082 |

P(exactly|NEGmirror) => 813 / 293,963 = 0.003
  - P(exactly|~NEGmirror) => 301 / 1,738,119 = 0.000
= 0.003

|             |   f |        f1 |    f2 |         N |
|:------------|----:|----------:|------:|----------:|
| POS~exactly 

key
RBdirect~exactly     0.01
COM~exactly         -0.01
NEGmirror~exactly    0.00
POS~exactly         -0.00
dtype: float64

## ✨ Alternate $\Delta P$ ($with / without$ ratio)

### Ratio for $(1|2)$: $P(\texttt{env}|\texttt{adv}) \,/\, P(\texttt{env}|\neg\texttt{adv})$

In [10]:
# ratio=True divides the two conditional probabilities instead of subtracting them.
env_exactly_ratio = exactly_am.apply(deltaP, ratio=True, axis=1)
env_exactly_ratio.to_frame('deltaP1_ratio')

|                  |      f |        f1 |     f2 |          N |
|:-----------------|-------:|----------:|-------:|-----------:|
| RBdirect~exactly | 43,635 | 3,226,213 | 61,599 | 86,330,752 |

P(RBdirect|exactly) => 43,635 / 61,599 = 0.708
  / P(RBdirect|~exactly) => 3,182,578 / 86,269,153 = 0.037
= 19.202

|             |      f |         f1 |     f2 |          N |
|:------------|-------:|-----------:|-------:|-----------:|
| COM~exactly | 17,964 | 83,102,035 | 61,599 | 86,330,752 |

P(COM|exactly) => 17,964 / 61,599 = 0.292
  / P(COM|~exactly) => 83,084,071 / 86,269,153 = 0.963
= 0.303

|                   |   f |      f1 |    f2 |         N |
|:------------------|----:|--------:|------:|----------:|
| NEGmirror~exactly | 813 | 293,963 | 1,114 | 2,032,082 |

P(NEGmirror|exactly) => 813 / 1,114 = 0.730
  / P(NEGmirror|~exactly) => 293,150 / 2,030,968 = 0.144
= 5.056

|             |   f |        f1 |    f2 |         N |
|:------------|----:|----------:|------:|----------:|
| POS~exact

Unnamed: 0_level_0,deltaP1_ratio
key,Unnamed: 1_level_1
RBdirect~exactly,19.2
COM~exactly,0.3
NEGmirror~exactly,5.06
POS~exactly,0.32


### Ratio for $(2|1)$: $P(\texttt{adv}|\texttt{env}) \,/\, P(\texttt{adv}|\neg\texttt{env})$

In [11]:
# Ratio form, conditioned on the f1 margin.
exactly_env_ratio = exactly_am.apply(deltaP, ratio=True, given_margin='f1', axis=1)
exactly_env_ratio.to_frame('exactly_given_env_ratio')

|                  |      f |        f1 |     f2 |          N |
|:-----------------|-------:|----------:|-------:|-----------:|
| RBdirect~exactly | 43,635 | 3,226,213 | 61,599 | 86,330,752 |

P(exactly|RBdirect) => 43,635 / 3,226,213 = 0.014
  / P(exactly|~RBdirect) => 17,964 / 83,104,539 = 0.000
= 62.570

|             |      f |         f1 |     f2 |          N |
|:------------|-------:|-----------:|-------:|-----------:|
| COM~exactly | 17,964 | 83,102,035 | 61,599 | 86,330,752 |

P(exactly|COM) => 17,964 / 83,102,035 = 0.000
  / P(exactly|~COM) => 43,635 / 3,228,717 = 0.014
= 0.016

|                   |   f |      f1 |    f2 |         N |
|:------------------|----:|--------:|------:|----------:|
| NEGmirror~exactly | 813 | 293,963 | 1,114 | 2,032,082 |

P(exactly|NEGmirror) => 813 / 293,963 = 0.003
  / P(exactly|~NEGmirror) => 301 / 1,738,119 = 0.000
= 15.970

|             |   f |        f1 |    f2 |         N |
|:------------|----:|----------:|------:|----------:|
| POS~exactly

Unnamed: 0_level_0,exactly_given_env_ratio
key,Unnamed: 1_level_1
RBdirect~exactly,62.57
COM~exactly,0.02
NEGmirror~exactly,15.97
POS~exactly,0.06


In [12]:
# Side-by-side summary: difference forms first, then the two ratio forms.
dp_differences = pd.DataFrame(
    [env_given_exactly, exactly_given_env],
    index=['deltaP1', 'deltaP2'],
).T
dp1_ratio_col = env_exactly_ratio.to_frame('deltaP1_ratio')
dp2_ratio_col = exactly_env_ratio.to_frame('deltaP2_ratio')
dp_differences.join(dp1_ratio_col).join(dp2_ratio_col)

Unnamed: 0_level_0,deltaP1,deltaP2,deltaP1_ratio,deltaP2_ratio
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RBdirect~exactly,0.67,0.01,19.2,62.57
COM~exactly,-0.67,-0.01,0.3,0.02
NEGmirror~exactly,0.59,0.0,5.06,15.97
POS~exactly,-0.59,-0.0,0.32,0.06


In [13]:
def add_dp_columns(amdf: pd.DataFrame):
    """Attach the four delta-P variants to `amdf` and return it.

    Columns are assigned onto `amdf` itself, in the order `dP2_ratio`,
    `dP1_ratio`, `dP2_diff`, `dP1_diff`. Each column comes from applying
    `deltaP` row by row.
    """
    for use_ratio in (True, False):
        kind = "ratio" if use_ratio else "diff"
        for margin in ('f1', 'f2'):
            # Conditioning on f2 is direction 1; conditioning on f1 is direction 2.
            direction = 1 if margin == "f2" else 2
            column = f'dP{direction}_{kind}'
            amdf[column] = amdf.apply(
                deltaP, ratio=use_ratio, given_margin=margin, axis=1)
    return amdf
# add_dp_columns assigns the new columns onto the frame it receives and
# returns that same frame, so this rebinding keeps the name pointing at it.
exactly_am = add_dp_columns(exactly_am)

|                  |      f |        f1 |     f2 |          N |
|:-----------------|-------:|----------:|-------:|-----------:|
| RBdirect~exactly | 43,635 | 3,226,213 | 61,599 | 86,330,752 |

P(exactly|RBdirect) => 43,635 / 3,226,213 = 0.014
  / P(exactly|~RBdirect) => 17,964 / 83,104,539 = 0.000
= 62.570

|             |      f |         f1 |     f2 |          N |
|:------------|-------:|-----------:|-------:|-----------:|
| COM~exactly | 17,964 | 83,102,035 | 61,599 | 86,330,752 |

P(exactly|COM) => 17,964 / 83,102,035 = 0.000
  / P(exactly|~COM) => 43,635 / 3,228,717 = 0.014
= 0.016

|                   |   f |      f1 |    f2 |         N |
|:------------------|----:|--------:|------:|----------:|
| NEGmirror~exactly | 813 | 293,963 | 1,114 | 2,032,082 |

P(exactly|NEGmirror) => 813 / 293,963 = 0.003
  / P(exactly|~NEGmirror) => 301 / 1,738,119 = 0.000
= 15.970

|             |   f |        f1 |    f2 |         N |
|:------------|----:|----------:|------:|----------:|
| POS~exactly

In [14]:
# Compare the precomputed `am_p*` columns with the `dP*` columns added above.
exactly_am.filter(regex=r'am_p|dP')

Unnamed: 0_level_0,am_p1_given2,am_p2_given1,am_p1_given2_simple,am_p2_given1_simple,dP2_ratio,dP1_ratio,dP2_diff,dP1_diff
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
RBdirect~exactly,0.67,0.01,0.71,0.01,62.57,19.2,0.01,0.67
COM~exactly,-0.67,-0.01,0.29,0.0,0.02,0.3,-0.01,-0.67
NEGmirror~exactly,0.59,0.0,0.73,0.0,15.97,5.06,0.0,0.59
POS~exactly,-0.59,-0.0,0.27,0.0,0.06,0.32,-0.0,-0.59


## Confirm Calculations by duplicating Gries (2013) example (p. 8)

In [15]:
# Frequencies for the pair "of course", using the same f / f1 / f2 / N fields
# that deltaP reads; the Series name supplies the `<l1>~<l2>` key.
of_course = pd.Series({'f': 5610, 'f1': 174548, 'f2': 7867, 'N': 10409898}, name = 'of~course', dtype='int64')
of_course.to_frame().T

Unnamed: 0,f,f1,f2,N
of~course,5610,174548,7867,10409898


In [16]:
# Difference form conditioned on the f2 margin ("course").
of_course.to_frame().T.apply(deltaP, axis=1)

|           |    f |     f1 |   f2 |           N |
|:----------|-----:|-------:|-----:|------------:|
| of~course | 5610 | 174548 | 7867 | 1.04099e+07 |

P(of|course) => 5,610 / 7,867 = 0.713
  - P(of|~course) => 168,938 / 10,402,031 = 0.016
= 0.697



of~course   0.70
dtype: float64

In [17]:
# Difference form conditioned on the f1 margin ("of").
of_course.to_frame().T.apply(deltaP, given_margin='f1', axis=1)

|           |    f |     f1 |   f2 |           N |
|:----------|-----:|-------:|-----:|------------:|
| of~course | 5610 | 174548 | 7867 | 1.04099e+07 |

P(course|of) => 5,610 / 174,548 = 0.032
  - P(course|~of) => 2,257 / 10,235,350 = 0.000
= 0.032



of~course   0.03
dtype: float64