# FrameAxis analysis for r/self and r/nosleep

In [1]:
import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
print("Num GPUs Available: ", len(physical_devices))

Num GPUs Available:  1


In [2]:
import pickle
with open('./Download/data_cleaned.pickle', 'rb') as handle:
    data_cleaned = pickle.load(handle)

# 1. Building a Set of Microframes

In [3]:
import nltk
from nltk.corpus import wordnet as wn
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /home/anthony/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/anthony/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Show all antonyms in WordNet:

In [4]:
from nltk.corpus import wordnet as wn

def antonyms_for(word):
    antonyms = set()
    for ss in wn.synsets(word):
        for lemma in ss.lemmas():
            any_pos_antonyms = [ antonym.name() for antonym in lemma.antonyms() ]
            for antonym in any_pos_antonyms:
                antonym_synsets = wn.synsets(antonym)
                if wn.ADJ not in [ ss.pos() for ss in antonym_synsets ]:
                    continue
                antonyms.add(antonym)
    return antonyms

In [7]:
antonyms_for('scary')

set()

In [8]:
from nltk.corpus import wordnet as wn

wn_all_antonyms = set()

for i in wn.all_synsets():
    if i.pos() in ['a', 's']: # If synset is adj or satelite-adj.
        for j in i.lemmas(): # Iterating through lemmas for each synset.
            if j.antonyms(): # If adj has antonym.
                # Prints the adj-antonym pair.
                wn_all_antonyms.add((j.name(), j.antonyms()[0].name()))

In [9]:
len(wn_all_antonyms)

3531

### Import pretrained embeddings

### **Options**

`CLASStorchtext.vocab.GloVe(name='840B', dim=300, **kwargs)`

`CLASStorchtext.vocab.FastText(language='en', **kwargs)`

`CLASStorchtext.vocab.CharNGram(**kwargs)`

In [10]:
import torch
import torchtext
import numpy as np

In [11]:
glove = torchtext.vocab.GloVe(name="840B",dim=300)

In [12]:
import spacy
nlp = spacy.load('en_core_web_lg')

In [13]:
from nltk.corpus import wordnet as wn

wn_all_antonyms_words = set()

for ant_pair in wn_all_antonyms:
    wn_all_antonyms_words = set(tuple(wn_all_antonyms_words) + ant_pair)

In [14]:
no_emb_words = set()
for adj in wn_all_antonyms_words:
    if torch.all(glove[adj] == torch.zeros(300)):
        no_emb_words.add(adj)

In [15]:
len(wn_all_antonyms)

3531

In [16]:
copy = wn_all_antonyms.copy()
for pair in wn_all_antonyms:
    pair1, pair2 = pair
    if pair1 in no_emb_words or pair2 in no_emb_words:
        copy.remove(pair)
        
wn_all_antonyms = copy

In [17]:
len(wn_all_antonyms)

3131

In [18]:
3531-3131

400

In [19]:
wn_all_antonyms

{('unprofessional', 'professional'),
 ('nonporous', 'porous'),
 ('amphibious', 'aquatic'),
 ('sighted', 'blind'),
 ('inexact', 'exact'),
 ('nonsurgical', 'surgical'),
 ('contemptible', 'estimable'),
 ('credible', 'incredible'),
 ('fretted', 'unfretted'),
 ('normal', 'abnormal'),
 ('certified', 'uncertified'),
 ('mutable', 'immutable'),
 ('parallel', 'perpendicular'),
 ('unreproducible', 'reproducible'),
 ('compromising', 'uncompromising'),
 ('untalented', 'talented'),
 ('offstage', 'onstage'),
 ('detachable', 'attachable'),
 ('seedy', 'seedless'),
 ('local', 'general'),
 ('acidic', 'alkaline'),
 ('vocal', 'instrumental'),
 ('receptive', 'unreceptive'),
 ('disinclined', 'inclined'),
 ('wary', 'unwary'),
 ('topless', 'topped'),
 ('vocalic', 'consonantal'),
 ('attractive', 'unattractive'),
 ('noncrystalline', 'crystalline'),
 ('short', 'long'),
 ('unowned', 'owned'),
 ('lowercase', 'uppercase'),
 ('unmoving', 'moving'),
 ('supersonic', 'sonic'),
 ('argillaceous', 'arenaceous'),
 ('unshape

### Add Custom Antonym Pairs

In [20]:
from nltk.corpus import wordnet as wn

# add words here:
WORDS = ['man', 'human']

for word in WORDS:
    for syn in wn.synsets(word):
        for j in syn.lemmas(): # Iterating through lemmas for each synset.
            if j.antonyms():
                pair1 = j.name()
                pair2 = j.antonyms()[0].name()
                # don't add pair to the list if at least one of them has no glove embeddings
                if torch.all(glove[pair1] == torch.zeros(300)) or torch.all(glove[pair2] == torch.zeros(300)):
                    print(f"Pair {(j.name(), j.antonyms()[0].name())} cannot be added because one of them does not have GloVe embeddings.")
                else:
                    print(f"Added {(pair1, pair2)}")
                    wn_all_antonyms.add((pair1, pair2))

Added ('man', 'woman')
Added ('serviceman', 'civilian')
Added ('man', 'woman')
Added ('human', 'nonhuman')


In [55]:
microframes = {('man', 'woman'), ('human', 'nonhuman'), ('creepy', 'pleasant')}

In [56]:
len(microframes)

3

### Semantic Axis Vector

In [61]:
semantic_axis_vectors = dict()
for mframe in microframes:
    semantic_axis_vectors['(' + ', '.join(mframe) + ')'] = np.expand_dims(glove[mframe[1]] - glove[mframe[0]], axis= 0)

In [62]:
semantic_axis_vectors['(man, woman)']

array([[ 1.98666990e-01,  7.22199827e-02, -1.86462998e-01,
         5.83739996e-01, -7.46250004e-02, -9.17997956e-03,
         3.19599956e-02,  3.92699987e-02,  1.38819993e-01,
         6.78999424e-02, -2.03620002e-01,  2.38368988e-01,
        -2.72006691e-01, -3.08981687e-01, -1.61559999e-01,
        -1.46412000e-01, -4.35483992e-01,  1.89160019e-01,
        -2.95740008e-01, -3.50700021e-02,  9.05001163e-03,
         2.55119979e-01, -1.56845003e-01, -3.59719992e-01,
        -2.88099945e-02,  4.00590003e-01,  1.07859999e-01,
        -2.12710008e-01, -3.14889997e-01, -1.76756993e-01,
         1.34200007e-01, -5.70900142e-02,  2.26850003e-01,
        -2.34786004e-01,  4.04179990e-01,  2.45397985e-01,
         1.38130009e-01, -3.17710042e-01, -6.44200146e-02,
         5.04499972e-02,  5.87419987e-01,  1.61250010e-01,
        -1.27790004e-01, -5.09299934e-02,  5.82489967e-02,
        -1.54311001e-01, -2.74599999e-01, -4.05699909e-02,
         4.65460002e-01, -2.69311011e-01, -6.96425974e-0

## Conclusion

- Total of 3131 adjective antonym pairs plus 1 or more custom pairs
- Each pair must have GloVe embeddings (i.e. embedding not [0,0,0,0,0, ..., 0])

# 2. Contribution of a Word to Microframes

For calculating cosine similarity, see [documentaion](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.cosine_similarity.html) from sklearn

In [25]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# example:
x = np.array([[1,2,2]])
y = np.array([[3,4,1]])
cosine_similarity(x,y).item()

0.8498365855987975

In [26]:
np.expand_dims(glove['man'], axis=0).shape
cosine_similarity(np.expand_dims(glove['man'], axis=0), np.expand_dims(glove['woman'], axis=0)).item()

0.740174412727356

### Wrapper for cosine similarity
Given two words, find similarity:

In [27]:
def my_cos_similarity(func):
    def wrapper(word, axis_vector):
        return func(np.expand_dims(glove[word], axis=0), axis_vector)
    return wrapper

@my_cos_similarity
def cos_similarity(x, y):
    return cosine_similarity(x, y).item()

cos_similarity('woman',semantic_axis_vectors['(man, woman)'])

0.4572717547416687

The **absolute value** of the similarity between a word vector and
a microframe vector captures the relevance of the word to the
microframe, while the **sign** of the similarity captures a bias toward
one of the poles in the microframe.

# 3. Framing Bias and Intensity

## Bias and word frequency

In [28]:
import nltk
from multiprocessing import Pool
from tqdm.notebook import tqdm

In [29]:
# f
# word is a single word string
# doc_tokens is a list of all document tokens

def word_freq(word, doc_tokens):
    return doc_tokens.count(word)

### Regular function

In [30]:
# B
def bias(doc, frame=None):
    doc_tokens = nltk.word_tokenize(doc)
    
    numerator, denominator = 0, 0
    for word in doc_tokens:
        numerator += word_freq(word, doc_tokens) * cos_similarity(word, semantic_axis_vectors[frame])
        denominator += word_freq(word, doc_tokens)
        
    return numerator / denominator

In [31]:
%%timeit
sample = data_cleaned['RS_2020_nosleep'].iloc[0:1].selftext.item()
bias(sample, frame='(man, woman)')

101 ms ± 718 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


### Same function, but with performance enhancement

In [32]:
%%timeit
sum1, sum2 = 0, 0
for i in range(1_000_000):
    sum1 += 1
    sum2 += 2
sum1 / sum2

76.1 ms ± 5.58 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [33]:
%%timeit
sum1, sum2 = sum([1 for i in range(1_000_000)]), sum([2 for i in range(1_000_000)])
sum1 / sum2

57.3 ms ± 976 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [34]:
# B
def bias(doc, frame=None):
    doc_tokens = nltk.word_tokenize(doc)
    li = [ sum([word_freq(word, doc_tokens) * cos_similarity(word, semantic_axis_vectors[f"({ant1}, {ant2})"]) for word in doc_tokens]) / sum([word_freq(word, doc_tokens) for word in doc_tokens]) for ant1, ant2 in tqdm(frame)]
    return np.array(li)

In [35]:
%%timeit
sample = data_cleaned['RS_2020_nosleep'].iloc[0:1].selftext.item()
bias(sample, frame={('man', 'woman')})

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

121 ms ± 1.55 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


# $B^T$ (failed for now)

In [36]:
import pandas as pd
concat_df = pd.concat([data_cleaned['RS_2020_self'], data_cleaned['RS_2020_nosleep']])

In [37]:
np.savetxt("Download/concat_texts.txt", concat_df['selftext'].astype('string').values, fmt='%s')

In [38]:
with open('Download/concat_texts.txt', 'r') as file:
    concat_texts = file.read()

In [39]:
# bias(concat_texts, frame = 'man - woman')

## Intensity (failed for now)

### Create the entire corpus *T*

In [40]:
# I
def intensity(doc, frame=None, corpus = concat_texts):
    doc_tokens = nltk.word_tokenize(doc)
    
    print('Calculating B^T')
    B_T = bias(corpus, frame = frame)
    print('Done')
    numerator, denominator = sum([word_freq(word, doc_tokens) * (cos_similarity(word, semantic_axis_vectors[frame]) - B_T)**2 for word in doc_tokens]), sum([word_freq(word, doc_tokens) for word in doc_tokens])
        
    return numerator / denominator

In [41]:
sample = data_cleaned['RS_2020_nosleep'].iloc[0:1].selftext.item()
# intensity(sample, frame='man - woman', corpus = concat_texts)

# 4. Statistical Significance of Microframes

In [42]:
import pandas as pd
import numpy as np

def df_mframes(df, microframes=microframes, topn=None):
    # if not specify topn
    if not topn:
        iterables = [df['id'].to_list(), list(microframes)]
        index = pd.MultiIndex.from_product(iterables, names=['id', 'microframe'])
        df_new = pd.DataFrame(np.repeat(df[['title', 'selftext', 'score']].values, 
                                    len(microframes), 
                                    axis = 0),
                          index=index,
                          columns=['title', 'selftext', 'score']).copy()
        # map semantic axis vector to index
        # how it works: index -> get 2nd level index -> to series -> convert tuple to string -> map values
        a = df_new.index.get_level_values(1).to_series().apply(lambda row: f"({row[0]}, {row[1]})").map(semantic_axis_vectors)
        df_new['Semantic Axis Vector'] = a.to_list()
        return df_new
    else:
        iterables = [df['id'].head(topn).to_list(), list(microframes)]
        index = pd.MultiIndex.from_product(iterables, names=['id', 'microframe'])
        df_new = pd.DataFrame(np.repeat(df[['title', 'selftext', 'score']].head(topn).values, 
                                    len(microframes), 
                                    axis = 0),
                          index=index,
                          columns=['title', 'selftext', 'score']).copy()
        # map semantic axis vector to index
        # how it works: index -> get 2nd level index -> to series -> convert tuple to string -> map values
        a = df_new.index.get_level_values(1).to_series().apply(lambda row: f"({row[0]}, {row[1]})").map(semantic_axis_vectors)
        df_new['Semantic Axis Vector'] = a.to_list()
        return df_new

In [78]:
sample_doc = data_cleaned['RS_2020_nosleep'].iloc[0:1].selftext.item()

In [79]:
sample_doc



In [68]:
with open('Download/concat_texts.txt', 'r') as file:
    concat_texts = file.read()
    T = nltk.word_tokenize(concat_texts)

KeyboardInterrupt: 

In [69]:
len(nltk.word_tokenize(sample_doc))

575

In [77]:
from sklearn.utils import resample
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

N = 140
t = len(nltk.word_tokenize(sample_doc))
df = pd.DataFrame(columns=microframes)

for i in tqdm(range(N)):
    s = ' '.join(resample(T, replace=True, n_samples=t))
    biases = bias(s, frame=microframes)
        
    df.loc[i] = biases

# import pickle
# pickle.dump( df, open( "Download/sample_biases_df.pickle", "wb" ) )

  0%|          | 0/140 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

In [71]:
sample_biases_df = df
sample_biases_df

Unnamed: 0_level_0,man,creepy,human
Unnamed: 0_level_1,woman,pleasant,nonhuman
0,-0.036779,0.045565,-0.357121
1,-0.037995,0.049996,-0.353584
2,-0.026426,0.044231,-0.347726
3,-0.028199,0.050562,-0.330185
4,-0.031033,0.042708,-0.344572
...,...,...,...
135,-0.025431,0.048873,-0.349170
136,-0.031482,0.051112,-0.359157
137,-0.024772,0.045595,-0.343701
138,-0.014855,0.063317,-0.345282


In [72]:
sample_bias = bias(sample_doc, frame=microframes)
sample_bias

  0%|          | 0/3 [00:00<?, ?it/s]

array([-0.02326027,  0.05281185, -0.35184012])

In [73]:
sample_boot_bias = sample_biases_df.mean(axis=0).to_numpy()

In [74]:
from operator import itemgetter
rank = itemgetter(*np.argsort(- sample_bias + sample_boot_bias))(list(microframes))

In [66]:
sample_bias-sample_boot_bias

array([-0.00309378,  0.00473373])

In [49]:
tuple(sorted(('eed','abc')))

('abc', 'eed')

In [50]:
new_rank = []
for ant1, ant2 in rank:
    new_rank.append(tuple(sorted((ant1,ant2))))

In [51]:
len(new_rank)

2

In [52]:
len(rank)

2

In [53]:
new_rank = list(dict.fromkeys(new_rank))

In [54]:
new_rank[:10]

[('man', 'woman'), ('human', 'nonhuman')]

In [55]:
new_rank[-10:]

[('man', 'woman'), ('human', 'nonhuman')]