In [1]:
import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU')
# Enable on-demand memory growth on every visible GPU. Looping, rather than
# indexing physical_devices[0], keeps this cell from raising IndexError on a
# CPU-only machine and gives all GPUs the same setting.
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, enable=True)
print("Num GPUs Available: ", len(physical_devices))

Num GPUs Available:  1


In [2]:
import pickle
# Load the pre-cleaned dataset from a local pickle file into `data_cleaned`.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('./Download/data_cleaned.pickle', 'rb') as handle:
    data_cleaned = pickle.load(handle)

# 1. Building a Set of Microframes

In [3]:
import nltk
from nltk.corpus import wordnet as wn
# Fetch the NLTK resources used below: the WordNet corpus (antonym lookup)
# and the Punkt models (used by nltk.sent_tokenize).
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /home/anthony/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/anthony/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Show all antonyms in WordNet:

In [4]:
from nltk.corpus import wordnet as wn

def antonyms_for(word):
    """Collect the adjective antonyms that WordNet records for ``word``.

    Every lemma of every synset of ``word`` is inspected. An antonym name is
    kept only when at least one of its own synsets has part of speech
    ``wn.ADJ``.

    Returns:
        set: the antonym lemma names that passed the adjective filter.
    """
    found = set()
    for synset in wn.synsets(word):
        for lemma in synset.lemmas():
            for opposite in lemma.antonyms():
                name = opposite.name()
                parts_of_speech = {candidate.pos() for candidate in wn.synsets(name)}
                if wn.ADJ in parts_of_speech:
                    found.add(name)
    return found

In [5]:
from nltk.corpus import wordnet as wn

# Every (adjective lemma, first-listed antonym) pair found in WordNet.
# Only adjective ('a') and satellite-adjective ('s') synsets are scanned, and
# for a lemma with several antonyms only the first one is kept.
wn_all_antonyms = {
    (lemma.name(), lemma.antonyms()[0].name())
    for synset in wn.all_synsets()
    if synset.pos() in ('a', 's')
    for lemma in synset.lemmas()
    if lemma.antonyms()
}

In [6]:
# Number of distinct (adjective, antonym) pairs collected from WordNet.
len(wn_all_antonyms)

3531

### Import pretrained embeddings

### **Options**

`class torchtext.vocab.GloVe(name='840B', dim=300, **kwargs)`

`class torchtext.vocab.FastText(language='en', **kwargs)`

`class torchtext.vocab.CharNGram(**kwargs)`

In [7]:
from sentence_transformers import SentenceTransformer
import numpy as np

In [8]:
# Load the pretrained Sentence-BERT model; `sbert.encode` is used below to
# embed both the microframe pole words and the document sentences.
sbert = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

In [9]:
import spacy
# Load spaCy's large English pipeline.
# NOTE(review): `nlp` is not referenced in the cells visible here — confirm it is still needed.
nlp = spacy.load('en_core_web_lg')

In [10]:
from nltk.corpus import wordnet as wn

# Flatten the antonym pairs into the set of distinct words they contain.
# set.update adds both members of a pair in place, instead of rebuilding the
# whole set from a tuple on every iteration (quadratic in the number of pairs).
wn_all_antonyms_words = set()

for ant_pair in wn_all_antonyms:
    wn_all_antonyms_words.update(ant_pair)

In [11]:
# Number of antonym pairs.
# NOTE(review): the previous cell builds `wn_all_antonyms_words`; this may have
# been meant to show len(wn_all_antonyms_words) instead — confirm.
len(wn_all_antonyms)

3531

In [12]:
# Display the collected pairs.
# NOTE(review): this renders the entire set; a small sample would keep the output readable.
wn_all_antonyms

{('uninteresting', 'interesting'),
 ('immortal', 'mortal'),
 ('noisy', 'quiet'),
 ('staged', 'unstaged'),
 ('adducent', 'abducent'),
 ('wheelless', 'wheeled'),
 ('atonic', 'tonic'),
 ('unarmed', 'armed'),
 ('unconvincing', 'convincing'),
 ('intensive', 'extensive'),
 ('emotional', 'cerebral'),
 ('unconvinced', 'convinced'),
 ('irreligious', 'religious'),
 ('downstage', 'upstage'),
 ('ahistorical', 'historical'),
 ('greater', 'lesser'),
 ('listed', 'unlisted'),
 ('leading', 'following'),
 ('anterior', 'posterior'),
 ('disingenuous', 'ingenuous'),
 ('unrhymed', 'rhymed'),
 ('bright', 'dull'),
 ('awkward', 'graceful'),
 ('native', 'foreign'),
 ('written', 'spoken'),
 ('irreconcilable', 'reconcilable'),
 ('nondeductible', 'deductible'),
 ('relative', 'absolute'),
 ('piano', 'forte'),
 ('exclusive', 'inclusive'),
 ('untalented', 'talented'),
 ('germy', 'germfree'),
 ('unoriginal', 'original'),
 ('monolingual', 'multilingual'),
 ('nonarbitrable', 'arbitrable'),
 ('patterned', 'plain'),
 ('mo

### Add Custom Antonym Pairs

In [13]:
from nltk.corpus import wordnet as wn

# add words here:
WORDS = ['man', 'human']

# For each seed word, walk all of its WordNet synsets and lemmas; whenever a
# lemma has an antonym, register (lemma, first antonym) as an extra pair in
# wn_all_antonyms. A pair is reported even if the set already contained it.
for seed in WORDS:
    for synset in wn.synsets(seed):
        for lemma in synset.lemmas():
            opposites = lemma.antonyms()
            if not opposites:
                continue
            custom_pair = (lemma.name(), opposites[0].name())
            print(f"Added {custom_pair}")
            wn_all_antonyms.add(custom_pair)

Added ('man', 'woman')
Added ('serviceman', 'civilian')
Added ('man', 'woman')
Added ('human', 'nonhuman')


In [14]:
# `microframes` is a second name for the same set object as `wn_all_antonyms`
# (no copy is made), so changes to either are visible through both.
microframes = wn_all_antonyms

In [15]:
# Number of microframes: the WordNet pairs plus the custom pairs added above.
len(microframes)

3533

### Semantic Axis Vector

In [16]:
# Build one semantic axis per microframe: the SBERT embedding of the second
# pole minus that of the first, with a leading axis added, stored under the
# string key "(first, second)".
semantic_axis_vectors = {}
for first_pole, second_pole in microframes:
    axis_key = '(' + ', '.join((first_pole, second_pole)) + ')'
    direction = sbert.encode(second_pole) - sbert.encode(first_pole)
    semantic_axis_vectors[axis_key] = np.expand_dims(direction, axis=0)

In [17]:
# Inspect the axis vector stored for the custom (man, woman) microframe.
semantic_axis_vectors['(man, woman)']

array([[ 4.61445272e-01, -1.63598835e-01, -9.09718946e-02,
        -6.64489985e-01, -3.24462116e-01,  1.22423761e-01,
         5.13927639e-01,  1.83106333e-01, -5.12893736e-01,
         2.12878203e+00, -3.37066770e-01, -4.27194834e-01,
         3.05582732e-01,  3.47756773e-01,  2.67127156e-03,
        -1.40166426e+00,  6.70769095e-01,  7.77182937e-01,
         1.20007157e-01,  9.96577740e-03,  1.44999897e+00,
        -5.07247865e-01,  1.13095617e+00, -1.22853422e+00,
         6.89658999e-01, -3.51170421e-01, -8.64769220e-02,
         2.46869028e-01, -3.37070525e-01, -2.12183863e-01,
        -5.65851212e-01, -7.07897902e-01,  6.56521320e-03,
         1.21668887e+00,  1.38573337e+00,  1.08423412e+00,
         4.30539221e-01,  8.87471914e-01, -9.77675021e-01,
         6.12992123e-02, -5.24116695e-01, -8.19561481e-01,
         1.60409808e-02,  8.54127645e-01,  1.14410114e+00,
         2.92331517e-01, -7.48084784e-01, -5.29809296e-02,
        -1.42695546e-01, -7.13742599e-02,  2.95895368e-0

## Conclusion

- Total of 3531 adjective antonym pairs from WordNet plus 2 custom pairs (3533 microframes in total)
- Each pair must have an embedding (i.e. embedding not [0,0,0,0,0, ..., 0]); note that the semantic axis vectors above are built from SBERT embeddings rather than GloVe

# 2. Contribution of a Word to Microframes

### Wrapper for cosine similarity
Given a sentence and a semantic axis vector, find their similarity:

In [20]:
from sklearn.metrics.pairwise import cosine_similarity
def my_cos_similarity(func):
    """Decorator that lets a vector-vs-vector similarity accept a raw sentence.

    The returned wrapper takes ``(sent, axis_vector)``, embeds ``sent`` with
    the module-level ``sbert`` model, adds a leading axis to the embedding and
    passes it to ``func`` together with ``axis_vector`` (left untouched).

    Args:
        func: callable taking ``(x, y)``.

    Returns:
        The wrapping callable; it keeps ``func``'s name and docstring.
    """
    import functools  # local import keeps this cell self-contained

    @functools.wraps(func)  # preserve __name__/__doc__ of the decorated function
    def wrapper(sent, axis_vector):
        sentence_vector = np.expand_dims(sbert.encode(sent), axis=0)
        return func(sentence_vector, axis_vector)
    return wrapper

@my_cos_similarity
def cos_similarity(x, y):
    """Return the cosine similarity of ``x`` and ``y`` as a Python scalar.

    Because of the decorator, callers pass a sentence string as the first
    argument; it reaches this body already embedded.
    """
    similarity = cosine_similarity(x, y)
    return similarity.item()

# Example: similarity of one sentence to the (man, woman) axis.
cos_similarity('I am in love with a woman.',semantic_axis_vectors['(man, woman)'])

0.4079359769821167

The **absolute value** of the similarity between a word vector and
a microframe vector captures the relevance of the word to the
microframe, while the **sign** of the similarity captures a bias toward
one of the poles in the microframe.

# 3. Framing Bias and Intensity

## Bias and word frequency

In [21]:
import nltk
from multiprocessing import Pool
from tqdm.notebook import tqdm

In [22]:
# f
# sent is a single sentence string
# doc_sents is a list of all sentences in the document

def sent_freq(sent, doc_sents):
    """Return how many times ``sent`` occurs in ``doc_sents``."""
    occurrences = doc_sents.count(sent)
    return occurrences

### Regular function

In [23]:
# B
def bias(doc, frame=None):
    """Framing bias of ``doc`` along a single microframe.

    The document is split into sentences with ``nltk.sent_tokenize``. The
    result is the average of each sentence's cosine similarity to the
    microframe's semantic axis, weighted by how often that sentence occurs in
    the document.

    Args:
        doc: document text.
        frame: key into ``semantic_axis_vectors``, e.g. ``'(man, woman)'``.
            Must be supplied; the default ``None`` is not a key of that dict.

    Returns:
        float: the weighted mean similarity, or ``0.0`` when ``doc`` contains
        no sentences.
    """
    from collections import Counter

    doc_sents = nltk.sent_tokenize(doc)
    if not doc_sents:
        # Nothing to average; the unguarded division raised ZeroDivisionError.
        return 0.0

    axis_vector = semantic_axis_vectors[frame]  # loop-invariant lookup
    # Count every sentence once instead of rescanning the list per sentence.
    sent_counts = Counter(doc_sents)

    numerator, denominator = 0, 0
    # As before, this walks every occurrence (not every distinct sentence),
    # so a repeated sentence contributes once per occurrence.
    for sent in doc_sents:
        weight = sent_counts[sent]
        numerator += weight * cos_similarity(sent, axis_vector)
        denominator += weight

    return numerator / denominator

In [24]:
%%timeit
# Time the single-frame bias on the `selftext` of the first row.
sample = data_cleaned['RS_2020_nosleep'].iloc[0:1].selftext.item()
bias(sample, frame='(man, woman)')

150 ms ± 960 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


### Same function, but with performance enhancement

In [25]:
# B
def bias(doc, frame=None):
    """Framing bias of ``doc`` for each microframe in ``frame``.

    This redefinition replaces the single-frame ``bias`` above: ``frame`` is
    an iterable of ``(pole_a, pole_b)`` pairs and one value is returned per
    pair, in iteration order. Each value is the average cosine similarity
    between the document's sentences and that pair's semantic axis, weighted
    by how often each sentence occurs in the document.

    Sentence embeddings and frequency weights do not depend on the microframe,
    so they are computed once up front rather than once per microframe.

    Args:
        doc: document text; split with ``nltk.sent_tokenize``.
        frame: iterable of 2-tuples; the key ``"(pole_a, pole_b)"`` is looked
            up in ``semantic_axis_vectors`` for each. Must be supplied — the
            default ``None`` cannot be iterated.

    Returns:
        list of float: one bias per microframe (all ``0.0`` when ``doc``
        contains no sentences).
    """
    from collections import Counter

    doc_sents = nltk.sent_tokenize(doc)
    if not doc_sents:
        # Nothing to average; the unguarded division raised ZeroDivisionError.
        return [0.0 for _ in frame]

    sent_counts = Counter(doc_sents)
    # One weight per occurrence, in document order (same weighting as before).
    weights = [sent_counts[sent] for sent in doc_sents]
    total_weight = sum(weights)
    # Encode each distinct sentence exactly once.
    embedded = {
        sent: np.expand_dims(sbert.encode(sent), axis=0) for sent in sent_counts
    }

    li = []
    for ant1, ant2 in tqdm(frame):
        axis_vector = semantic_axis_vectors[f"({ant1}, {ant2})"]
        weighted_sum = sum(
            weight * cosine_similarity(embedded[sent], axis_vector).item()
            for sent, weight in zip(doc_sents, weights)
        )
        li.append(weighted_sum / total_weight)
    return li

In [26]:
# %%timeit
# Run the multi-frame bias on the first row's `selftext` with a single microframe.
sample = data_cleaned['RS_2020_nosleep'].iloc[0:1].selftext.item()
bias(sample, frame={('man', 'woman')})

  0%|          | 0/1 [00:00<?, ?it/s]

[0.06302168016770372]

In [27]:
import pandas as pd
import numpy as np

def df_mframes(df, microframes=microframes, topn=None):
    """Cross every post in ``df`` with every microframe.

    Args:
        df: DataFrame with columns ``id``, ``title``, ``selftext``, ``score``.
        microframes: sized collection of ``(pole_a, pole_b)`` pairs. The
            default is the module-level ``microframes`` object as bound when
            this function was defined.
        topn: when truthy, only the first ``topn`` rows of ``df`` are used;
            ``None`` (or ``0``) uses the whole frame.

    Returns:
        DataFrame indexed by ``(id, microframe)`` holding the post's
        ``title``, ``selftext`` and ``score`` repeated for each microframe,
        plus a ``'Semantic Axis Vector'`` column looked up in
        ``semantic_axis_vectors`` under the key ``"(pole_a, pole_b)"``.
    """
    # The two original branches differed only in the ``.head(topn)`` call, so
    # select the rows once and share everything else.
    posts = df.head(topn) if topn else df
    frames = list(microframes)
    value_columns = ['title', 'selftext', 'score']

    index = pd.MultiIndex.from_product([posts['id'].to_list(), frames],
                                       names=['id', 'microframe'])
    # np.repeat emits each post len(frames) times in a row, which matches the
    # id-major order produced by from_product.
    df_new = pd.DataFrame(np.repeat(posts[value_columns].values,
                                    len(frames),
                                    axis=0),
                          index=index,
                          columns=value_columns).copy()
    # map semantic axis vector to index
    # how it works: index -> get 2nd level index -> to series -> convert tuple to string -> map values
    axis_vectors = (df_new.index.get_level_values(1)
                    .to_series()
                    .apply(lambda pair: f"({pair[0]}, {pair[1]})")
                    .map(semantic_axis_vectors))
    df_new['Semantic Axis Vector'] = axis_vectors.to_list()
    return df_new

In [28]:
# Reference document for the analysis below: the `selftext` of the first row.
sample_doc = data_cleaned['RS_2020_nosleep'].iloc[0:1].selftext.item()

In [29]:
# Show the raw text of the reference document.
print(sample_doc)

This is the only rule of our household. If you feel a presence standing over you while you sleep, do **NOT** open your eyes. Ignore it and try to fall asleep. This may sound a bit like the show Bird Box from Netflix but this is real, this is serious. You don't want to know what happens if you open your eyes. Let me start from the beginning, from where it all began.

------------------------

My family had just moved to a new house in a new city in an eerily quiet neighborhood. We barely ever saw the neighbors and there were almost no animals around. It threw me off a little bit but i quickly got accustomed to it. My school was pretty close to my house so it was not difficult to make friends that I could regularly bring home. So, a few months in the new house, when i was sleeping one night, i felt a figure standing over me. Now, I have always been a bit of a scaredy cat so I slammed my eyes shut and waited. A few moments later, I felt it go away. This happened to me every night and I co

In [30]:
# Read the concatenated corpus text, then split it into sentences; T is the
# pool of sentences that the bootstrap below resamples from.
with open('Download/concat_texts.txt', 'r') as corpus_file:
    concat_texts = corpus_file.read()
T = nltk.sent_tokenize(concat_texts)

In [31]:
# Number of sentences in the reference document.
len(nltk.sent_tokenize(sample_doc))

27

In [53]:
from sklearn.utils import resample
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

N = 140  # number of bootstrap replicates
t = len(nltk.sent_tokenize(sample_doc))  # sentences per replicate = length of the sample document
BOOTSTRAP_SEED = 0  # fixed so the resampling is repeatable across runs

# Freeze the set into a list so the column order and the order of the values
# returned by bias() are guaranteed to line up.
frames = list(microframes)
rng = np.random.RandomState(BOOTSTRAP_SEED)

# Collect one row of biases per replicate and build the frame once at the end,
# rather than growing it row by row with df.loc[i] = ...
rows = []
for i in tqdm(range(N)):
    # Draw t sentences with replacement from the corpus pool and glue them
    # into a pseudo-document.
    s = ' '.join(resample(T, replace=True, n_samples=t, random_state=rng))
    rows.append(bias(s, frame=frames))

df = pd.DataFrame(rows, columns=frames)

import pickle
# The context manager closes the file even if pickling fails.
with open("Download/sample_biases_df2.pickle", "wb") as out_file:
    pickle.dump(df, out_file)

  0%|          | 0/140 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

  0%|          | 0/3533 [00:00<?, ?it/s]

In [32]:
# Reload the bootstrap biases written by the cell above. The context manager
# closes the file handle that a bare open(...) call would leave dangling.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('Download/sample_biases_df2.pickle', 'rb') as in_file:
    sample_biases_df = pickle.load(in_file)

In [33]:
# Bias of the reference document for every microframe, in the iteration order of `microframes`.
sample_bias = bias(sample_doc, frame=microframes)

  0%|          | 0/3533 [00:00<?, ?it/s]

In [34]:
# Mean bias per microframe over the bootstrap replicates (one value per column).
# NOTE(review): this is later compared position-by-position with `sample_bias`,
# which follows the current iteration order of the `microframes` set. If the
# pickle was written in a different kernel session, the column order may not
# match — confirm, or align by column label.
sample_boot_bias = sample_biases_df.mean(axis=0).to_numpy()

In [47]:
from operator import itemgetter
# Order the microframes by (bootstrap mean bias - sample bias), ascending.
# Plain indexing always yields a tuple of pairs; itemgetter(*order) returned a
# bare pair for a single index and raised TypeError for none.
order = np.argsort(sample_boot_bias - np.array(sample_bias))
microframe_list = list(microframes)
rank = tuple(microframe_list[position] for position in order)

In [48]:
# Scratch check: sorting a pair and re-wrapping it gives a tuple in alphabetical order.
tuple(sorted(('eed','abc')))

('abc', 'eed')

In [49]:
# Put the two poles of every ranked pair in alphabetical order, so that
# (a, b) and (b, a) become the same tuple; the ranking order is kept.
new_rank = [tuple(sorted((first, second))) for first, second in rank]

In [50]:
# Sorting within pairs does not drop entries, so this should equal len(rank).
len(new_rank)

3533

In [51]:
# Number of ranked microframes, for comparison with len(new_rank).
len(rank)

3533

In [52]:
# Drop duplicate pairs while keeping first-occurrence order
# (dict keys are unique and preserve insertion order).
new_rank = list(dict.fromkeys(new_rank))

In [53]:
# First ten pairs of the ranking, i.e. the smallest (bootstrap mean - sample bias) values.
new_rank[:10]

[('passing', 'running'),
 ('perinatal', 'postnatal'),
 ('nonreturnable', 'returnable'),
 ('antemortem', 'postmortem'),
 ('immoderate', 'moderate'),
 ('ambidextrous', 'left-handed'),
 ('manly', 'unmanly'),
 ('lost', 'won'),
 ('ethical', 'unethical'),
 ('multilateral', 'unilateral')]

In [54]:
# Last ten pairs of the ranking, i.e. the largest (bootstrap mean - sample bias) values.
new_rank[-10:]

[('annual', 'biennial'),
 ('perinatal', 'prenatal'),
 ('acatalectic', 'hypercatalectic'),
 ('amphibious', 'aquatic'),
 ('endomorphic', 'mesomorphic'),
 ('oviparous', 'viviparous'),
 ('air-to-air', 'surface-to-air'),
 ('early', 'middle'),
 ('analogous', 'heterologous'),
 ('ambidextrous', 'right-handed')]