 # Comparing bigram associations with polarity for different methods

 1. **mirror** comparison associations
 1. (original) "set **complement**" associations

In [1]:
from pathlib import Path

import pandas as pd

UCS_DIR = Path('/share/compling/projects/sanpi/results/ucs')
pd.set_option('display.max_colwidth', 20)
pd.set_option('display.max_columns', 8)
pd.set_option('display.width', 200)

 - "skewed" cutoff set to adjusted conditional probability of polar context `> 0.8`
 - loaded data contains rows for only pairings that occur `>= 15` times

In [2]:
COUNT_FLOOR = 15
THRESH = 0.8


def select_data(df):
    return (df.loc[df.am_p1_given2 > THRESH, :]
            # > remove the "rank" columns
            .filter(regex=r'^[^r][^_]'))

 ## Set paths for current parameters

In [3]:
def set_paths():
    return {
        'mirr': {
            'csv': UCS_DIR.joinpath(
                f'readable/MIRROR_polar_bigram/polarized-bigram_MIRROR.35f-868thresh_min{COUNT_FLOOR}x.rsort-view.csv'),
            'pickle': UCS_DIR.joinpath(
                f'dataframes/polarized-bigram_MIRROR.35f-868thresh_min{COUNT_FLOOR}x.rsort-view_extra.pkl.gz')},
        'comp': {
            'csv': UCS_DIR.joinpath(
                f'readable/polarized_bigram/polarized-bigram_min{COUNT_FLOOR}x.rsort-view.csv'),
            'pickle': UCS_DIR.joinpath(
                f'dataframes/polarized-bigram_min{COUNT_FLOOR}x.rsort-view_extra.pkl.gz')}
    }


PATHS = set_paths()

 ## Identify all skewed bigram-polarity pairings

In [4]:
def get_skewed(method, load_format):
    if load_format == 'csv':
        adf = pd.read_csv(PATHS[method]['csv'])
    else:
        adf = pd.read_pickle(PATHS[method]['pickle'])
    print(f'\n### {method} {load_format}\n')
    print(adf.head(3))
    print()
    print(adf.copy()[['l1','f1', 'N']].drop_duplicates('l1').set_index('l1').to_markdown(floatfmt=',.0f'))
    
    skew_df = select_data(adf)
    print()
    print(skew_df.l1.value_counts().to_frame('polarity totals in "skewed"').to_markdown(floatfmt=',.0f'))
    print(f'\n### Top 10 skewed {method} (from {load_format})')
    print(skew_df.head(10).to_markdown(floatfmt='.2f'))
    
    return skew_df

def show_adv_counts(skew_df): 
    try:
        adverbs = skew_df.adv
    except AttributeError: 
        adverbs = skew_df.l2.str.split('_').str.get(0)
    counts = adverbs.value_counts()
        
    print(counts.loc[counts>0].to_frame('# skewed bigrams').to_markdown(floatfmt=',.0f'))

## Loading from `*.pkl.gz` tables
### mirror associations (from `*.pkl.gz`)

In [5]:
pmirr = get_skewed('mirr', 'pickle')

FileNotFoundError: [Errno 2] No such file or directory: '/share/compling/projects/sanpi/results/assoc_df/polarized-bigram_MIRROR.35f-868thresh_min15x.rsort-view_extra.pkl.gz'

In [None]:
pmirr

Unnamed: 0_level_0,index,l1,l2,E11,...,adv,adj,adv_total,adj_total
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
NEG-ever_simple,0,NEGATIVE,ever_simple,30.131042,...,ever,simple,5050.0,27767.0
NEG-ever_enough,1,NEGATIVE,ever_enough,21.397407,...,ever,enough,5050.0,10176.0
NEG-ever_certain,2,NEGATIVE,ever_certain,20.815164,...,ever,certain,5050.0,2116.0
NEG-entirely_certain,3,NEGATIVE,entirely_certain,9.024757,...,entirely,certain,11491.0,2116.0
NEG-ever_black,4,NEGATIVE,ever_black,8.151393,...,ever,black,5050.0,1704.0
...,...,...,...,...,...,...,...,...,...
NEG-too_surprising,88,NEGATIVE,too_surprising,31.441087,...,too,surprising,186765.0,3040.0
NEG-too_careful,89,NEGATIVE,too_careful,103.930261,...,too,careful,186765.0,2162.0
NEG-particularly_groundbreaking,90,NEGATIVE,particularly_gro...,2.765651,...,particularly,groundbreaking,14612.0,134.0
NEG-inherently_improper,91,NEGATIVE,inherently_improper,2.765651,...,inherently,improper,5467.0,176.0


In [None]:
show_adv_counts(pmirr)

|              |   # skewed bigrams |
|:-------------|-------------------:|
| ever         |                 36 |
| particularly |                  9 |
| really       |                  7 |
| too          |                  6 |
| more         |                  5 |
| terribly     |                  4 |
| yet          |                  3 |
| fully        |                  3 |
| truly        |                  3 |
| entirely     |                  3 |
| quite        |                  2 |
| especially   |                  2 |
| inherently   |                  2 |
| exactly      |                  2 |
| that         |                  1 |
| there        |                  1 |
| anywhere     |                  1 |
| any          |                  1 |
| overtly      |                  1 |
| overly       |                  1 |


### complement associations (from `*.pkl.gz`)

In [None]:
pcomp = get_skewed('comp', 'pickle')


### comp pickle

                      index       l1                   l2    f  ...  ipm_reference  ipm_expected      adv          adj
key                                                             ...                                                   
NEG-exactly_shy           0  NEGATED          exactly_shy  124  ...            0.0      1.488875  exactly          shy
NEG-exactly_forth...      1  NEGATED  exactly_forthcoming  107  ...            0.0      1.284755  exactly  forthcoming
NEG-exactly_pract...      2  NEGATED    exactly_practical  106  ...            0.0      1.272748  exactly    practical

[3 rows x 59 columns]

| l1         |         f1 |          N |
|:-----------|-----------:|-----------:|
| NEGATED    |  3,151,804 | 83,284,343 |
| COMPLEMENT | 80,132,539 | 83,284,343 |

|            |   polarity totals in "skewed" |
|:-----------|------------------------------:|
| NEGATED    |                         1,022 |
| COMPLEMENT |                             0 |

### Top 1

In [None]:
pcomp

Unnamed: 0_level_0,index,l1,l2,E11,...,ipm_reference,ipm_expected,adv,adj
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
NEG-exactly_shy,0,NEGATED,exactly_shy,4.692643,...,0.000000,1.488875,exactly,shy
NEG-exactly_forthcoming,1,NEGATED,exactly_forthcoming,4.049297,...,0.000000,1.284755,exactly,forthcoming
NEG-exactly_practical,2,NEGATED,exactly_practical,4.011453,...,0.000000,1.272748,exactly,practical
NEG-exactly_impressive,3,NEGATED,exactly_impressive,3.784389,...,0.000000,1.200706,exactly,impressive
NEG-exactly_straightforward,4,NEGATED,exactly_straight...,3.141043,...,0.000000,0.996586,exactly,straightforward
...,...,...,...,...,...,...,...,...,...
NEG-exactly_amazing,1017,NEGATED,exactly_amazing,1.173161,...,0.062397,0.372219,exactly,amazing
NEG-only_excessive,1018,NEGATED,only_excessive,1.173161,...,0.062397,0.372219,only,excessive
NEG-exactly_normal,1019,NEGATED,exactly_normal,6.092867,...,0.324462,1.933136,exactly,normal
NEG-only_passionate,1020,NEGATED,only_passionate,9.120378,...,0.486694,2.893701,only,passionate


In [None]:
show_adv_counts(pcomp)

|              |   # skewed bigrams |
|:-------------|-------------------:|
| exactly      |                318 |
| only         |                228 |
| necessarily  |                168 |
| that         |                135 |
| yet          |                 67 |
| always       |                 21 |
| entirely     |                 12 |
| even         |                  8 |
| terribly     |                  7 |
| nearly       |                  7 |
| just         |                  6 |
| altogether   |                  6 |
| too          |                  5 |
| immediately  |                  4 |
| quite        |                  3 |
| all          |                  3 |
| really       |                  3 |
| constantly   |                  2 |
| real         |                  2 |
| remotely     |                  2 |
| mutually     |                  2 |
| actually     |                  1 |
| overly       |                  1 |
| usually      |                  1 |
| individual

## Loading from `*.csv` tables
### mirror associations (from `*.csv`)

In [None]:
cmirr = get_skewed('mirr', 'csv')


### mirr csv

         l1            l2    f        E11  ...  am_expect_diff      f1   f2        N
0  NEGATIVE   ever_simple  207  30.131042  ...      176.868958  285435  207  1960936
1  NEGATIVE   ever_enough  147  21.397407  ...      125.602593  285435  147  1960936
2  NEGATIVE  ever_certain  143  20.815164  ...      122.184836  285435  143  1960936

[3 rows x 31 columns]

| l1       |        f1 |         N |
|:---------|----------:|----------:|
| NEGATIVE |   285,435 | 1,960,936 |
| POSITIVE | 1,675,501 | 1,960,936 |

|          |   polarity totals in "skewed" |
|:---------|------------------------------:|
| NEGATIVE |                            93 |

### Top 10 skewed mirr (from csv)
|    | l1       | l2               |   E11 |   am_log_likelihood |   am_log_likelihood_tt |   am_odds_ratio_disc |   am_Dice |   am_t_score |   am_p1_given2 |   am_p2_given1 |   am_p1_given2_simple |   am_p2_given1_simple |   am_p1_given2_margin |   am_p2_given1_margin |   am_expect_diff |     f1 |   

In [None]:
cmirr

Unnamed: 0,l1,l2,E11,am_log_likelihood,...,am_p2_given1_margin,am_expect_diff,f1,f2
0,NEGATIVE,ever_simple,30.131042,797.973725,...,0.000620,176.868958,285435,207
1,NEGATIVE,ever_enough,21.397407,566.650578,...,0.000440,125.602593,285435,147
2,NEGATIVE,ever_certain,20.815164,551.229802,...,0.000428,122.184836,285435,143
3,NEGATIVE,entirely_certain,9.024757,238.979702,...,0.000186,52.975243,285435,62
4,NEGATIVE,ever_black,8.151393,215.851628,...,0.000168,47.848607,285435,56
...,...,...,...,...,...,...,...,...,...
88,NEGATIVE,too_surprising,31.441087,706.788645,...,0.000608,173.558913,285435,216
89,NEGATIVE,too_careful,103.930261,2331.281185,...,0.002008,573.069739,285435,714
90,NEGATIVE,particularly_gro...,2.765651,61.858136,...,0.000053,15.234349,285435,19
91,NEGATIVE,inherently_improper,2.765651,61.858136,...,0.000053,15.234349,285435,19


In [None]:
show_adv_counts(cmirr)

|              |   # skewed bigrams |
|:-------------|-------------------:|
| ever         |                 36 |
| particularly |                  9 |
| really       |                  7 |
| too          |                  6 |
| more         |                  5 |
| terribly     |                  4 |
| truly        |                  3 |
| yet          |                  3 |
| fully        |                  3 |
| entirely     |                  3 |
| exactly      |                  2 |
| quite        |                  2 |
| especially   |                  2 |
| inherently   |                  2 |
| overtly      |                  1 |
| anywhere     |                  1 |
| there        |                  1 |
| overly       |                  1 |
| that         |                  1 |
| any          |                  1 |


### complement associations (from `*.csv`)

In [None]:
ccomp = get_skewed('comp', 'csv')


### comp csv

        l1                   l2    f       E11  ...  am_expect_diff       f1   f2         N
0  NEGATED          exactly_shy  124  4.692643  ...      119.307357  3151804  124  83284343
1  NEGATED  exactly_forthcoming  107  4.049297  ...      102.950703  3151804  107  83284343
2  NEGATED    exactly_practical  106  4.011453  ...      101.988547  3151804  106  83284343

[3 rows x 31 columns]

| l1         |         f1 |          N |
|:-----------|-----------:|-----------:|
| NEGATED    |  3,151,804 | 83,284,343 |
| COMPLEMENT | 80,132,539 | 83,284,343 |

|         |   polarity totals in "skewed" |
|:--------|------------------------------:|
| NEGATED |                         1,022 |

### Top 10 skewed comp (from csv)
|    | l1      | l2                      |   E11 |   am_log_likelihood |   am_log_likelihood_tt |   am_odds_ratio_disc |   am_Dice |   am_t_score |   am_p1_given2 |   am_p2_given1 |   am_p1_given2_simple |   am_p2_given1_simple |   am_p1_given2_margin |   am_p2

In [None]:
ccomp

Unnamed: 0,l1,l2,E11,am_log_likelihood,...,am_p2_given1_margin,am_expect_diff,f1,f2
0,NEGATED,exactly_shy,4.692643,812.02752,...,0.000038,119.307357,3151804,124
1,NEGATED,exactly_forthcoming,4.049297,700.70061,...,0.000033,102.950703,3151804,107
2,NEGATED,exactly_practical,4.011453,694.15197,...,0.000032,101.988547,3151804,106
3,NEGATED,exactly_impressive,3.784390,654.86017,...,0.000031,96.215610,3151804,100
4,NEGATED,exactly_straight...,3.141043,543.53351,...,0.000025,79.858957,3151804,83
...,...,...,...,...,...,...,...,...,...
1017,NEGATED,exactly_amazing,1.173161,143.25703,...,0.000008,24.826839,3151804,31
1018,NEGATED,only_excessive,1.173161,143.25703,...,0.000008,24.826839,3151804,31
1019,NEGATED,exactly_normal,6.092867,743.70168,...,0.000041,128.907133,3151804,161
1020,NEGATED,only_passionate,9.120379,1112.45860,...,0.000061,192.879621,3151804,241


In [None]:
show_adv_counts(ccomp)

|              |   # skewed bigrams |
|:-------------|-------------------:|
| exactly      |                318 |
| only         |                228 |
| necessarily  |                168 |
| that         |                135 |
| yet          |                 67 |
| always       |                 21 |
| entirely     |                 12 |
| even         |                  8 |
| nearly       |                  7 |
| terribly     |                  7 |
| just         |                  6 |
| altogether   |                  6 |
| too          |                  5 |
| immediately  |                  4 |
| all          |                  3 |
| quite        |                  3 |
| really       |                  3 |
| constantly   |                  2 |
| remotely     |                  2 |
| real         |                  2 |
| mutually     |                  2 |
| about        |                  1 |
| therefore    |                  1 |
| individually |                  1 |
| honestly  