# FrameAxis analysis for r/self and r/nosleep

In [1]:
import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
print("Num GPUs Available: ", len(physical_devices))

Num GPUs Available:  1


In [2]:
import pickle
with open('./Download/data_cleaned.pickle', 'rb') as handle:
    data_cleaned = pickle.load(handle)

# 1. Building a Set of Microframes

In [3]:
import nltk
from nltk.corpus import wordnet as wn
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /home/anthony/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/anthony/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Show all antonyms in WordNet:

In [4]:
from nltk.corpus import wordnet as wn

def antonyms_for(word):
    antonyms = set()
    for ss in wn.synsets(word):
        for lemma in ss.lemmas():
            any_pos_antonyms = [ antonym.name() for antonym in lemma.antonyms() ]
            for antonym in any_pos_antonyms:
                antonym_synsets = wn.synsets(antonym)
                if wn.ADJ not in [ ss.pos() for ss in antonym_synsets ]:
                    continue
                antonyms.add(antonym)
    return antonyms

In [5]:
from nltk.corpus import wordnet as wn

wn_all_antonyms = set()

for i in wn.all_synsets():
    if i.pos() in ['a', 's']: # If synset is adj or satelite-adj.
        for j in i.lemmas(): # Iterating through lemmas for each synset.
            if j.antonyms(): # If adj has antonym.
                # Prints the adj-antonym pair.
                wn_all_antonyms.add((j.name(), j.antonyms()[0].name()))

In [6]:
len(wn_all_antonyms)

3531

### Import pretrained embeddings

### **Options**

`CLASStorchtext.vocab.GloVe(name='840B', dim=300, **kwargs)`

`CLASStorchtext.vocab.FastText(language='en', **kwargs)`

`CLASStorchtext.vocab.CharNGram(**kwargs)`

In [7]:
import torch
import torchtext
import numpy as np

In [8]:
glove = torchtext.vocab.GloVe(name="840B",dim=300)

In [9]:
import spacy
nlp = spacy.load('en_core_web_lg')

In [10]:
from nltk.corpus import wordnet as wn

wn_all_antonyms_words = set()

for ant_pair in wn_all_antonyms:
    wn_all_antonyms_words = set(tuple(wn_all_antonyms_words) + ant_pair)

In [11]:
no_emb_words = set()
for adj in wn_all_antonyms_words:
    if torch.all(glove[adj] == torch.zeros(300)):
        no_emb_words.add(adj)

In [12]:
len(wn_all_antonyms)

3531

In [13]:
copy = wn_all_antonyms.copy()
for pair in wn_all_antonyms:
    pair1, pair2 = pair
    if pair1 in no_emb_words or pair2 in no_emb_words:
        copy.remove(pair)
        
wn_all_antonyms = copy

In [14]:
len(wn_all_antonyms)

3131

In [15]:
3531-3131

400

In [16]:
wn_all_antonyms

{('natural', 'supernatural'),
 ('discourteous', 'courteous'),
 ('prenuptial', 'postnuptial'),
 ('provident', 'improvident'),
 ('loose', 'compact'),
 ('unrepeatable', 'repeatable'),
 ('curtained', 'curtainless'),
 ('melted', 'unmelted'),
 ('idle', 'busy'),
 ('uncrowned', 'crowned'),
 ('unrequested', 'requested'),
 ('unaccompanied', 'accompanied'),
 ('hostile', 'amicable'),
 ('laced', 'unlaced'),
 ('uncompromising', 'compromising'),
 ('unscientific', 'scientific'),
 ('nontechnical', 'technical'),
 ('altricial', 'precocial'),
 ('unappealing', 'appealing'),
 ('consumptive', 'generative'),
 ('returnable', 'nonreturnable'),
 ('fresh', 'preserved'),
 ('asexual', 'sexual'),
 ('handled', 'handleless'),
 ('diadromous', 'anadromous'),
 ('unbound', 'bound'),
 ('careless', 'careful'),
 ('keyed', 'keyless'),
 ('negligent', 'diligent'),
 ('facultative', 'obligate'),
 ('retractile', 'nonretractile'),
 ('separate', 'joint'),
 ('parallel', 'perpendicular'),
 ('anestrous', 'estrous'),
 ('unprincipled', '

### Add Custom Antonym Pairs

In [17]:
from nltk.corpus import wordnet as wn

# add words here:
WORDS = ['man', 'human']

for word in WORDS:
    for syn in wn.synsets(word):
        for j in syn.lemmas(): # Iterating through lemmas for each synset.
            if j.antonyms():
                pair1 = j.name()
                pair2 = j.antonyms()[0].name()
                # don't add pair to the list if at least one of them has no glove embeddings
                if torch.all(glove[pair1] == torch.zeros(300)) or torch.all(glove[pair2] == torch.zeros(300)):
                    print(f"Pair {(j.name(), j.antonyms()[0].name())} cannot be added because one of them does not have GloVe embeddings.")
                else:
                    print(f"Added {(pair1, pair2)}")
                    wn_all_antonyms.add((pair1, pair2))

Added ('man', 'woman')
Added ('serviceman', 'civilian')
Added ('man', 'woman')
Added ('human', 'nonhuman')


In [18]:
microframes = wn_all_antonyms

In [19]:
len(microframes)

3133

### Semantic Axis Vector

In [20]:
semantic_axis_vectors = dict()
for mframe in microframes:
    semantic_axis_vectors[' - '.join(mframe)] = np.expand_dims(glove[mframe[1]] - glove[mframe[0]], axis= 0)

In [21]:
semantic_axis_vectors['man - woman']

array([[ 1.98666990e-01,  7.22199827e-02, -1.86462998e-01,
         5.83739996e-01, -7.46250004e-02, -9.17997956e-03,
         3.19599956e-02,  3.92699987e-02,  1.38819993e-01,
         6.78999424e-02, -2.03620002e-01,  2.38368988e-01,
        -2.72006691e-01, -3.08981687e-01, -1.61559999e-01,
        -1.46412000e-01, -4.35483992e-01,  1.89160019e-01,
        -2.95740008e-01, -3.50700021e-02,  9.05001163e-03,
         2.55119979e-01, -1.56845003e-01, -3.59719992e-01,
        -2.88099945e-02,  4.00590003e-01,  1.07859999e-01,
        -2.12710008e-01, -3.14889997e-01, -1.76756993e-01,
         1.34200007e-01, -5.70900142e-02,  2.26850003e-01,
        -2.34786004e-01,  4.04179990e-01,  2.45397985e-01,
         1.38130009e-01, -3.17710042e-01, -6.44200146e-02,
         5.04499972e-02,  5.87419987e-01,  1.61250010e-01,
        -1.27790004e-01, -5.09299934e-02,  5.82489967e-02,
        -1.54311001e-01, -2.74599999e-01, -4.05699909e-02,
         4.65460002e-01, -2.69311011e-01, -6.96425974e-0

## Conclusion

- Total of 3131 adjective antonym pairs plus 1 or more custom pairs
- Each pair must have GloVe embeddings (i.e. embedding not [0,0,0,0,0, ..., 0])

# 2. Contribution of a Word to Microframes

For calculating cosine similarity, see [documentaion](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.cosine_similarity.html) from sklearn

In [22]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# example:
x = np.array([[1,2,2]])
y = np.array([[3,4,1]])
cosine_similarity(x,y).item()

0.8498365855987975

In [23]:
np.expand_dims(glove['man'], axis=0).shape
cosine_similarity(np.expand_dims(glove['man'], axis=0), np.expand_dims(glove['woman'], axis=0)).item()

0.740174412727356

### Wrapper for cosine similarity
Given two words, find similarity:

In [24]:
def my_cos_similarity(func):
    def wrapper(word, axis_vector):
        return func(np.expand_dims(glove[word], axis=0), axis_vector)
    return wrapper

@my_cos_similarity
def cos_similarity(x, y):
    return cosine_similarity(x, y).item()

cos_similarity('woman',semantic_axis_vectors['man - woman'])

0.4572717547416687

The **absolute value** of the similarity between a word vector and
a microframe vector captures the relevance of the word to the
microframe, while the **sign** of the similarity captures a bias toward
one of the poles in the microframe.

# 3. Framing Bias and Intensity

## Bias and word frequency

In [25]:
import nltk
from multiprocessing import Pool
from tqdm.notebook import tqdm

In [26]:
# f
# word is a single word string
# doc_tokens is a list of all document tokens

def word_freq(word, doc_tokens):
    return doc_tokens.count(word)

### Same function, but using multiprocessing

In [168]:
def calc_bias(word, doc_tokens, frame):
    numerator = word_freq(word, doc_tokens) * cos_similarity(word, semantic_axis_vectors[frame])
    denominator = word_freq(word, doc_tokens)
    
    return (numerator, denominator)
# B
def bias(doc, frame=None):
    doc_tokens = nltk.word_tokenize(doc)
    
    with Pool() as pool:
        b = pool.starmap(calc_bias, [(word, doc_tokens, frame) for word in doc_tokens])
    
    numerator = 0
    denominator = 0
    for i in b:
        num,denom = i
        numerator += num
        denominator += denom
        
    return numerator / denominator

In [169]:
%%timeit
sample = data_cleaned['RS_2020_nosleep'].iloc[0:1].selftext.item()
bias(sample, frame='man - woman')

811 ms ± 12.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Regular function

In [27]:
# B
def bias(doc, frame=None):
    doc_tokens = nltk.word_tokenize(doc)
    
    numerator = 0
    denominator = 0
    for word in tqdm(doc_tokens):
        numerator += word_freq(word, doc_tokens) * cos_similarity(word, semantic_axis_vectors[frame])
        denominator += word_freq(word, doc_tokens)
        
    return numerator / denominator

In [28]:
sample = data_cleaned['RS_2020_nosleep'].iloc[0:1].selftext.item()
bias(sample, frame='man - woman')

  0%|          | 0/575 [00:00<?, ?it/s]

-0.023260273511452257

## Intensity

### Create the entire corpus *T*

In [29]:
import pandas as pd
concat_df = pd.concat([data_cleaned['RS_2020_self'], data_cleaned['RS_2020_nosleep']])

In [30]:
np.savetxt("Download/concat_texts.txt", concat_df['selftext'].astype('string').values, fmt='%s')

In [31]:
with open('Download/concat_texts.txt', 'r') as file:
    concat_texts = file.read()

# I
def intensity(doc, frame=None, corpus = concat_texts):        
    import nltk
    doc_tokens = nltk.word_tokenize(doc)
    
    numerator = 0
    denominator = 0
    B_T = bias(corpus, frame = frame)
    for word in doc_tokens:
        numerator += word_freq(word, doc_tokens) * (cos_similarity(word, semantic_axis_vectors[frame]) - B_T)**2
        denominator += word_freq(word, doc_tokens)
        
    return numerator / denominator

In [32]:
intensity(sample, frame='man - woman', corpus = concat_texts)

  0%|          | 0/34244595 [00:00<?, ?it/s]

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/anthony/.local/share/virtualenvs/Research-Mapping-Uncanny-Valley-DSc8QBrC/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3418, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-32-12af64e11311>", line 1, in <module>
    intensity(sample, frame='man - woman', corpus = concat_texts)
  File "<ipython-input-31-0307f44d0bc8>", line 11, in intensity
    B_T = bias(corpus, frame = frame)
  File "<ipython-input-27-c655ea6c54f4>", line 9, in bias
    denominator += word_freq(word, doc_tokens)
  File "<ipython-input-26-a66d003e4caa>", line 6, in word_freq
    return doc_tokens.count(word)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/anthony/.local/share/virtualenvs/Research-Mapping-Uncanny-Valley-DSc8QBrC/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2045, in showtraceback
   

TypeError: object of type 'NoneType' has no len()

In [117]:
import multiprocessing.dummy as mp 

def do_print(s):
    print(s, flush= True)

if __name__=="__main__":
    p=mp.Pool(4)
    p.map(do_print,range(0,100)) # range(0,1000) if you want to replicate your example
    p.close()
    p.join()

0
714
21
1

15
228
2

323169



4
10
24
17
25
18
115

26
6
1219

27
28
20
13
35
2942
49

36
30
43
50
37
51
44
31
38
45
52
32
39
33
5346

40
34
54
47
41
5648
55

63
57
70
64
77
58
71
65
78
59
72
66
79
60
73
67
80
61
74
68
81
62
75
82
69
7684

9183

85
92
98
86
93
87
94
99
88
9589

9690

97
