# FrameAxis analysis for r/self and r/nosleep

In [1]:
import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
print("Num GPUs Available: ", len(physical_devices))

Num GPUs Available:  1


In [2]:
import pickle
with open('./Download/data_cleaned.pickle', 'rb') as handle:
    data_cleaned = pickle.load(handle)

# 1. Building a Set of Microframes

In [3]:
import nltk
from nltk.corpus import wordnet as wn
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /home/anthony/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/anthony/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Show all antonyms in WordNet:

In [4]:
from nltk.corpus import wordnet as wn

def antonyms_for(word):
    antonyms = set()
    for ss in wn.synsets(word):
        for lemma in ss.lemmas():
            any_pos_antonyms = [ antonym.name() for antonym in lemma.antonyms() ]
            for antonym in any_pos_antonyms:
                antonym_synsets = wn.synsets(antonym)
                if wn.ADJ not in [ ss.pos() for ss in antonym_synsets ]:
                    continue
                antonyms.add(antonym)
    return antonyms

In [5]:
from nltk.corpus import wordnet as wn

wn_all_antonyms = set()

for i in wn.all_synsets():
    if i.pos() in ['a', 's']: # If synset is adj or satelite-adj.
        for j in i.lemmas(): # Iterating through lemmas for each synset.
            if j.antonyms(): # If adj has antonym.
                # Prints the adj-antonym pair.
                wn_all_antonyms.add((j.name(), j.antonyms()[0].name()))

In [6]:
len(wn_all_antonyms)

3531

### Import pretrained embeddings

### **Options**

`CLASStorchtext.vocab.GloVe(name='840B', dim=300, **kwargs)`

`CLASStorchtext.vocab.FastText(language='en', **kwargs)`

`CLASStorchtext.vocab.CharNGram(**kwargs)`

In [7]:
import torch
import torchtext
import numpy as np

In [8]:
glove = torchtext.vocab.GloVe(name="840B",dim=300)

In [9]:
import spacy
nlp = spacy.load('en_core_web_lg')

In [10]:
from nltk.corpus import wordnet as wn

wn_all_antonyms_words = set()

for ant_pair in wn_all_antonyms:
    wn_all_antonyms_words = set(tuple(wn_all_antonyms_words) + ant_pair)

In [11]:
no_emb_words = set()
for adj in wn_all_antonyms_words:
    if torch.all(glove[adj] == torch.zeros(300)):
        no_emb_words.add(adj)

In [12]:
len(wn_all_antonyms)

3531

In [13]:
copy = wn_all_antonyms.copy()
for pair in wn_all_antonyms:
    pair1, pair2 = pair
    if pair1 in no_emb_words or pair2 in no_emb_words:
        copy.remove(pair)
        
wn_all_antonyms = copy

In [14]:
len(wn_all_antonyms)

3131

In [15]:
3531-3131

400

In [16]:
wn_all_antonyms

{('orienting', 'disorienting'),
 ('stingy', 'generous'),
 ('preprandial', 'postprandial'),
 ('flexible', 'inflexible'),
 ('consumptive', 'generative'),
 ('senior', 'junior'),
 ('landless', 'landed'),
 ('apocrine', 'eccrine'),
 ('intractable', 'tractable'),
 ('unfair', 'fair'),
 ('ingenuous', 'disingenuous'),
 ('tasteful', 'tasteless'),
 ('overhead', 'surface'),
 ('cursorial', 'fossorial'),
 ('cenobitic', 'eremitic'),
 ('commissioned', 'noncommissioned'),
 ('distant', 'close'),
 ('cool', 'warm'),
 ('future', 'past'),
 ('bowed', 'plucked'),
 ('confined', 'invasive'),
 ('atomistic', 'holistic'),
 ('illegal', 'legal'),
 ('ashamed', 'unashamed'),
 ('honorable', 'dishonorable'),
 ('unawed', 'awed'),
 ('offensive', 'defensive'),
 ('unparented', 'parented'),
 ('supportive', 'unsupportive'),
 ('actinomorphic', 'zygomorphic'),
 ('wrinkled', 'unwrinkled'),
 ('heuristic', 'algorithmic'),
 ('unenforced', 'enforced'),
 ('diploid', 'polyploid'),
 ('unreasonable', 'reasonable'),
 ('improper', 'proper'

### Add Custom Antonym Pairs

In [17]:
from nltk.corpus import wordnet as wn

# add words here:
WORDS = ['man', 'human']

for word in WORDS:
    for syn in wn.synsets(word):
        for j in syn.lemmas(): # Iterating through lemmas for each synset.
            if j.antonyms():
                pair1 = j.name()
                pair2 = j.antonyms()[0].name()
                # don't add pair to the list if at least one of them has no glove embeddings
                if torch.all(glove[pair1] == torch.zeros(300)) or torch.all(glove[pair2] == torch.zeros(300)):
                    print(f"Pair {(j.name(), j.antonyms()[0].name())} cannot be added because one of them does not have GloVe embeddings.")
                else:
                    print(f"Added {(pair1, pair2)}")
                    wn_all_antonyms.add((pair1, pair2))

Added ('man', 'woman')
Added ('serviceman', 'civilian')
Added ('man', 'woman')
Added ('human', 'nonhuman')


In [18]:
microframes = wn_all_antonyms

In [19]:
len(microframes)

3133

### Semantic Axis Vector

In [20]:
semantic_axis_vectors = dict()
for mframe in microframes:
    semantic_axis_vectors['(' + ', '.join(mframe) + ')'] = np.expand_dims(glove[mframe[1]] - glove[mframe[0]], axis= 0)

In [21]:
semantic_axis_vectors['(man, woman)']

array([[ 1.98666990e-01,  7.22199827e-02, -1.86462998e-01,
         5.83739996e-01, -7.46250004e-02, -9.17997956e-03,
         3.19599956e-02,  3.92699987e-02,  1.38819993e-01,
         6.78999424e-02, -2.03620002e-01,  2.38368988e-01,
        -2.72006691e-01, -3.08981687e-01, -1.61559999e-01,
        -1.46412000e-01, -4.35483992e-01,  1.89160019e-01,
        -2.95740008e-01, -3.50700021e-02,  9.05001163e-03,
         2.55119979e-01, -1.56845003e-01, -3.59719992e-01,
        -2.88099945e-02,  4.00590003e-01,  1.07859999e-01,
        -2.12710008e-01, -3.14889997e-01, -1.76756993e-01,
         1.34200007e-01, -5.70900142e-02,  2.26850003e-01,
        -2.34786004e-01,  4.04179990e-01,  2.45397985e-01,
         1.38130009e-01, -3.17710042e-01, -6.44200146e-02,
         5.04499972e-02,  5.87419987e-01,  1.61250010e-01,
        -1.27790004e-01, -5.09299934e-02,  5.82489967e-02,
        -1.54311001e-01, -2.74599999e-01, -4.05699909e-02,
         4.65460002e-01, -2.69311011e-01, -6.96425974e-0

## Conclusion

- Total of 3131 adjective antonym pairs plus 1 or more custom pairs
- Each pair must have GloVe embeddings (i.e. embedding not [0,0,0,0,0, ..., 0])

# 2. Contribution of a Word to Microframes

For calculating cosine similarity, see [documentaion](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.cosine_similarity.html) from sklearn

In [22]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# example:
x = np.array([[1,2,2]])
y = np.array([[3,4,1]])
cosine_similarity(x,y).item()

0.8498365855987975

In [23]:
np.expand_dims(glove['man'], axis=0).shape
cosine_similarity(np.expand_dims(glove['man'], axis=0), np.expand_dims(glove['woman'], axis=0)).item()

0.740174412727356

### Wrapper for cosine similarity
Given two words, find similarity:

In [24]:
def my_cos_similarity(func):
    def wrapper(word, axis_vector):
        return func(np.expand_dims(glove[word], axis=0), axis_vector)
    return wrapper

@my_cos_similarity
def cos_similarity(x, y):
    return cosine_similarity(x, y).item()

cos_similarity('woman',semantic_axis_vectors['(man, woman)'])

0.4572717547416687

The **absolute value** of the similarity between a word vector and
a microframe vector captures the relevance of the word to the
microframe, while the **sign** of the similarity captures a bias toward
one of the poles in the microframe.

# 3. Framing Bias and Intensity

## Bias and word frequency

In [25]:
import nltk
from multiprocessing import Pool
from tqdm.notebook import tqdm

In [26]:
# f
# word is a single word string
# doc_tokens is a list of all document tokens

def word_freq(word, doc_tokens):
    return doc_tokens.count(word)

### Regular function

In [27]:
# B
def bias(doc, frame=None):
    doc_tokens = nltk.word_tokenize(doc)
    
    numerator, denominator = 0, 0
    for word in doc_tokens:
        numerator += word_freq(word, doc_tokens) * cos_similarity(word, semantic_axis_vectors[frame])
        denominator += word_freq(word, doc_tokens)
        
    return numerator / denominator

In [28]:
%%timeit
sample = data_cleaned['RS_2020_nosleep'].iloc[0:1].selftext.item()
bias(sample, frame='(man, woman)')

98.7 ms ± 724 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


### Same function, but with performance enhancement

In [29]:
%%timeit
sum1, sum2 = 0, 0
for i in range(1_000_000):
    sum1 += 1
    sum2 += 2
sum1 / sum2

59.4 ms ± 4.34 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [30]:
%%timeit
sum1, sum2 = sum([1 for i in range(1_000_000)]), sum([2 for i in range(1_000_000)])
sum1 / sum2

55.2 ms ± 642 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [31]:
# B
def bias(doc, frame=None):
    doc_tokens = nltk.word_tokenize(doc)
    li = [ sum([word_freq(word, doc_tokens) * cos_similarity(word, semantic_axis_vectors[f"({ant1}, {ant2})"]) for word in doc_tokens]) / sum([word_freq(word, doc_tokens) for word in doc_tokens]) for ant1, ant2 in tqdm(frame)]
    return np.array(li)

In [32]:
%%timeit
sample = data_cleaned['RS_2020_nosleep'].iloc[0:1].selftext.item()
bias(sample, frame={('man', 'woman')})

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

122 ms ± 1.92 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


# $B^T$ (failed for now)

In [33]:
import pandas as pd
concat_df = pd.concat([data_cleaned['RS_2020_self'], data_cleaned['RS_2020_nosleep']])

In [34]:
np.savetxt("Download/concat_texts.txt", concat_df['selftext'].astype('string').values, fmt='%s')

In [35]:
with open('Download/concat_texts.txt', 'r') as file:
    concat_texts = file.read()

In [36]:
# bias(concat_texts, frame = 'man - woman')

## Intensity (failed for now)

### Create the entire corpus *T*

In [37]:
# I
def intensity(doc, frame=None, corpus = concat_texts):
    doc_tokens = nltk.word_tokenize(doc)
    
    print('Calculating B^T')
    B_T = bias(corpus, frame = frame)
    print('Done')
    numerator, denominator = sum([word_freq(word, doc_tokens) * (cos_similarity(word, semantic_axis_vectors[frame]) - B_T)**2 for word in doc_tokens]), sum([word_freq(word, doc_tokens) for word in doc_tokens])
        
    return numerator / denominator

In [38]:
sample = data_cleaned['RS_2020_nosleep'].iloc[0:1].selftext.item()
# intensity(sample, frame='man - woman', corpus = concat_texts)

# 4. Statistical Significance of Microframes

In [39]:
import pandas as pd
import numpy as np

def df_mframes(df, microframes=microframes, topn=None):
    # if not specify topn
    if not topn:
        iterables = [df['id'].to_list(), list(microframes)]
        index = pd.MultiIndex.from_product(iterables, names=['id', 'microframe'])
        df_new = pd.DataFrame(np.repeat(df[['title', 'selftext', 'score']].values, 
                                    len(microframes), 
                                    axis = 0),
                          index=index,
                          columns=['title', 'selftext', 'score']).copy()
        # map semantic axis vector to index
        # how it works: index -> get 2nd level index -> to series -> convert tuple to string -> map values
        a = df_new.index.get_level_values(1).to_series().apply(lambda row: f"({row[0]}, {row[1]})").map(semantic_axis_vectors)
        df_new['Semantic Axis Vector'] = a.to_list()
        return df_new
    else:
        iterables = [df['id'].head(topn).to_list(), list(microframes)]
        index = pd.MultiIndex.from_product(iterables, names=['id', 'microframe'])
        df_new = pd.DataFrame(np.repeat(df[['title', 'selftext', 'score']].head(topn).values, 
                                    len(microframes), 
                                    axis = 0),
                          index=index,
                          columns=['title', 'selftext', 'score']).copy()
        # map semantic axis vector to index
        # how it works: index -> get 2nd level index -> to series -> convert tuple to string -> map values
        a = df_new.index.get_level_values(1).to_series().apply(lambda row: f"({row[0]}, {row[1]})").map(semantic_axis_vectors)
        df_new['Semantic Axis Vector'] = a.to_list()
        return df_new

In [40]:
sample_doc = data_cleaned['RS_2020_nosleep'].iloc[0:1].selftext.item()

In [41]:
sample_doc



In [42]:
with open('Download/concat_texts.txt', 'r') as file:
    concat_texts = file.read()
    T = nltk.word_tokenize(concat_texts)

In [43]:
len(nltk.word_tokenize(sample_doc))

575

In [40]:
from sklearn.utils import resample
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

N = 140
t = len(nltk.word_tokenize(sample_doc))
df = pd.DataFrame(columns=microframes)

for i in tqdm(range(N)):
    s = ' '.join(resample(T, replace=True, n_samples=t))
    biases = bias(s, frame=microframes)
        
    df.loc[i] = biases

import pickle
pickle.dump( df, open( "Download/sample_biases_df.pickle", "wb" ) )

  0%|          | 0/140 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

  0%|          | 0/3133 [00:00<?, ?it/s]

In [44]:
sample_biases_df = pickle.load(open('Download/sample_biases_df.pickle', 'rb'))
sample_biases_df

Unnamed: 0_level_0,discontented,immediate,hipped,necessary,autumnal,declarative,maxi,humorous,monovalent,indiscreet,...,alcoholic,assertive,unlivable,original,individual,principled,immoderate,unknown,freelance,forceless
Unnamed: 0_level_1,contented,mediate,gabled,unnecessary,wintry,interrogative,mini,humorless,polyvalent,discreet,...,nonalcoholic,unassertive,livable,unoriginal,common,unprincipled,moderate,known,salaried,forceful
0,0.165546,-0.154739,-0.002083,-0.115420,0.065003,-0.088813,0.146881,-0.159049,0.008047,0.162146,...,-0.224326,-0.242138,0.158984,-0.189906,0.007315,-0.125275,0.244697,0.071690,-0.100881,0.282711
1,0.177506,-0.150237,-0.007614,-0.107309,0.070593,-0.081920,0.137711,-0.165901,0.014199,0.156433,...,-0.225266,-0.246804,0.155883,-0.176094,0.007535,-0.130717,0.247655,0.056833,-0.096931,0.284652
2,0.169375,-0.156033,-0.005874,-0.110519,0.065521,-0.085905,0.146263,-0.163001,0.009533,0.160838,...,-0.226490,-0.245076,0.156482,-0.188997,0.008848,-0.127639,0.245994,0.069606,-0.101717,0.283759
3,0.170841,-0.157961,-0.001664,-0.104724,0.062080,-0.086970,0.141440,-0.161939,0.013173,0.160702,...,-0.221755,-0.237164,0.149789,-0.182591,0.019616,-0.123959,0.245925,0.064065,-0.103672,0.279415
4,0.175307,-0.148986,-0.011530,-0.108341,0.065765,-0.083990,0.138884,-0.166447,0.018695,0.158907,...,-0.227087,-0.245679,0.154710,-0.174680,0.017252,-0.131417,0.248683,0.057177,-0.108773,0.285297
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,0.174030,-0.157935,-0.006736,-0.104627,0.063156,-0.084405,0.142071,-0.166416,0.015494,0.160062,...,-0.224311,-0.244643,0.150649,-0.181650,0.013336,-0.126066,0.245580,0.062228,-0.103226,0.284359
136,0.172434,-0.146922,-0.031942,-0.102224,0.066450,-0.086326,0.138581,-0.167259,0.019127,0.151235,...,-0.238764,-0.249297,0.151840,-0.169481,0.014482,-0.135287,0.251480,0.065976,-0.112789,0.286763
137,0.174234,-0.153681,-0.011042,-0.104586,0.065546,-0.082413,0.137912,-0.166008,0.012966,0.158591,...,-0.227856,-0.244326,0.155517,-0.176637,0.012130,-0.129457,0.250048,0.063810,-0.103120,0.281563
138,0.174427,-0.156169,-0.005981,-0.089861,0.051854,-0.090725,0.140425,-0.165273,0.014739,0.158256,...,-0.216647,-0.235032,0.145497,-0.156429,0.023597,-0.125767,0.233690,0.042400,-0.113347,0.269686


In [45]:
sample_bias = bias(sample_doc, frame=microframes)

  0%|          | 0/3133 [00:00<?, ?it/s]

In [46]:
sample_boot_bias = sample_biases_df.mean(axis=0).to_numpy()

In [53]:
from operator import itemgetter
rank = itemgetter(*np.argsort(- sample_bias + sample_boot_bias))(list(microframes))

In [54]:
rank

(('nondisposable', 'disposable'),
 ('unwooded', 'wooded'),
 ('unamended', 'amended'),
 ('unmoved', 'moved'),
 ('inessential', 'essential'),
 ('uncomplaining', 'complaining'),
 ('nonexplosive', 'explosive'),
 ('unguided', 'guided'),
 ('inefficacious', 'efficacious'),
 ('onymous', 'anonymous'),
 ('irreligious', 'religious'),
 ('deniable', 'undeniable'),
 ('nonbearing', 'bearing'),
 ('evitable', 'inevitable'),
 ('nonspeaking', 'speaking'),
 ('nonrigid', 'rigid'),
 ('nonlinguistic', 'linguistic'),
 ('ordinal', 'cardinal'),
 ('unhappy', 'happy'),
 ('uncontaminated', 'contaminated'),
 ('uncommitted', 'committed'),
 ('inexact', 'exact'),
 ('uninvolved', 'involved'),
 ('maladjusted', 'adjusted'),
 ('styleless', 'stylish'),
 ('handless', 'handed'),
 ('unquiet', 'quiet'),
 ('unacknowledged', 'acknowledged'),
 ('nonpolitical', 'political'),
 ('unsurprised', 'surprised'),
 ('unabused', 'abused'),
 ('ungeared', 'geared'),
 ('insufficient', 'sufficient'),
 ('purposeless', 'purposeful'),
 ('phlegmy',

In [46]:
tuple(sorted(('eed','abc')))

('abc', 'eed')

In [55]:
new_rank = []
for ant1, ant2 in rank:
    new_rank.append(tuple(sorted((ant1,ant2))))

In [56]:
len(new_rank)

3133

In [57]:
len(rank)

3133

In [58]:
new_rank = list(dict.fromkeys(new_rank))

In [59]:
new_rank[:10]

[('disposable', 'nondisposable'),
 ('unwooded', 'wooded'),
 ('amended', 'unamended'),
 ('moved', 'unmoved'),
 ('essential', 'inessential'),
 ('complaining', 'uncomplaining'),
 ('explosive', 'nonexplosive'),
 ('guided', 'unguided'),
 ('efficacious', 'inefficacious'),
 ('anonymous', 'onymous')]

In [60]:
new_rank[-10:]

[('colourful', 'colourless'),
 ('natural', 'sharp'),
 ('one-piece', 'two-piece'),
 ('hypotensive', 'normotensive'),
 ('carnivorous', 'herbivorous'),
 ('alkaline', 'amphoteric'),
 ('anadromous', 'catadromous'),
 ('sonic', 'subsonic'),
 ('subsurface', 'surface'),
 ('feminine', 'neuter')]