# Word Embedding Evaluations
## Import Statements

In [42]:
from utils import *
from cluster import *
import codecs

## Load Embeddings

In [44]:
# load double hard debiased glove
# load_embedding returns three values, unpacked here as the vector matrix (wv),
# a word -> row-index lookup (w2i) and the vocabulary; later cells index
# dbl_glove_wv with dbl_glove_w2i[word] and iterate over dbl_glove_vocab.
dbl_glove_wv, dbl_glove_w2i, dbl_glove_vocab = load_embedding('data/double_hard_glove.txt')

In [45]:
# Sanity check on the loaded matrix; presumably (number of words, embedding dimension) — confirm against load_embedding.
print(dbl_glove_wv.shape)

(322636, 300)


## WEAT test

In [46]:
# Pull the WEAT word lists out of WEAT_words.
# A / B: the male and female name lists, lower-cased
# (presumably to match a lower-cased embedding vocabulary — TODO confirm).
A, B = ([name.lower() for name in WEAT_words[key]] for key in ('A', 'B'))
# C / D: career and family
C, D = WEAT_words['C'], WEAT_words['D']
# E / F: math and arts
E, F = WEAT_words['E'], WEAT_words['F']
# G / H: science and arts
G, H = WEAT_words['G'], WEAT_words['H']

### Double Hard Debias Glove

In [47]:
# For each pair of word lists, print a heading, then the WEAT effect size,
# then the p-value, always against the name lists A and B on the
# double-hard-debiased GloVe embedding.
# (p_value_test itself emits the "num of samples" line seen in the output.)
weat_word_pairs = [
    ('Career and Family', C, D),
    ('Math and Arts', E, F),
    ('Science and Arts', G, H),
]
dbl_glove_embedding = (dbl_glove_wv, dbl_glove_w2i, dbl_glove_vocab)

for weat_label, first_words, second_words in weat_word_pairs:
    print(weat_label)
    print(effect_size(A, B, first_words, second_words, *dbl_glove_embedding))
    print(p_value_test(A, B, first_words, second_words, *dbl_glove_embedding))

Career and Family
1.456237842884396
num of samples 12870
0.0003885003885003885
Math and Arts
0.7520731402456332
num of samples 12870
0.06752136752136752
Science and Arts
0.13660239768806068
num of samples 12870
0.39533799533799535


## Clustering

In [48]:
# limit vocab by excluding words that 'should' have gender bias
# Build the exclusion list from two plain-text word files (one word per line)
# plus a JSON list. All files are opened as UTF-8 explicitly so the result does
# not depend on the platform's default locale encoding.
gender_specific = []

for word_file in ('./data/male_words.txt', './data/female_words.txt'):
    with open(word_file, encoding='utf-8') as f:
        gender_specific.extend(line.strip() for line in f)

with open('./data/gender_specific_full.json', encoding='utf-8') as f:
    gender_specific.extend(json.load(f))

# Original GloVe: load, then apply decentralize and normalize before limiting.
# NOTE(review): the double-hard-debiased vectors are passed to limit_vocab as
# loaded, without these two steps — confirm that this asymmetry is intended.
glove_wv, glove_w2i, glove_vocab = load_embedding('./data/glove.txt')
glove_wv = normalize(decentralize(glove_wv))

# limit_vocab returns (vocab, vectors, word->index) in that order and prints
# the "size of vocabulary" line once per call.
glove_vocab_limit, glove_wv_limit, glove_w2i_limit = limit_vocab(
    glove_wv, glove_w2i, glove_vocab, exclude=gender_specific)
dbl_glove_vocab_limit, dbl_glove_wv_limit, dbl_glove_w2i_limit = limit_vocab(
    dbl_glove_wv, dbl_glove_w2i, dbl_glove_vocab, exclude=gender_specific)

size of vocabulary: 47628
size of vocabulary: 47628


### Double Hard Debias Glove

In [49]:
# get most biased words
# Bias is computed on the ORIGINAL GloVe vectors (glove_wv after decentralize/normalize),
# restricted to the limited vocabulary, by passing the 'he' and 'she' vectors to
# compute_word_bias. The resulting biased_words is what the next cell hands to my_cluster.
he_vector = glove_wv[glove_w2i['he'], :]
she_vector = glove_wv[glove_w2i['she'], :]
biased_words = compute_word_bias(glove_wv_limit, glove_w2i_limit, glove_vocab_limit, he_vector, she_vector)

In [50]:

# Run my_cluster on the limited double-hard-debiased embedding once per cutoff;
# each call prints one "precision" line.
# NOTE(review): the meaning of the positional argument 1 is not visible here — confirm in cluster.py.
biased_word_cutoffs = (100, 500, 1000)
for n in biased_word_cutoffs:
    my_cluster(
        dbl_glove_wv_limit,
        dbl_glove_w2i_limit,
        1,
        dbl_glove_vocab_limit,
        biased_words,
        num_biased_words=n,
    )


precision 0.6950000000000001
precision 0.8049999999999999
precision 0.795


## Word Analogy

In [51]:
from web.evaluate import evaluate_on_semeval_2012_2
from web.datasets.analogy import fetch_msr_analogy
from web.evaluate import evaluate_analogy

### Double Hard Debias Glove

In [52]:
# build word dictionary
dbl_glove_dict = {}
for word in dbl_glove_vocab:
    dbl_glove_dict[word] = dbl_glove_wv[dbl_glove_w2i[word], :]

In [53]:
# Evaluate the word -> vector dict with web.evaluate's SemEval-2012 Task 2 helper.
# The result is indexed by key; the 'all' entry is printed in the next cell.
dbl_glove_results = evaluate_on_semeval_2012_2(dbl_glove_dict)

  prot_left = np.mean(np.vstack(w.get(word, mean_vector) for word in prototypes[:, 0]), axis=0)
  prot_right = np.mean(np.vstack(w.get(word, mean_vector) for word in prototypes[:, 1]), axis=0)
  question_left, question_right = np.vstack(w.get(word, mean_vector) for word in questions[:, 0]), \
  np.vstack(w.get(word, mean_vector) for word in questions[:, 1])


In [54]:
# 'all' entry of the SemEval result — presumably the aggregate score; confirm against web.evaluate.
print(dbl_glove_results['all'])

0.18006947757814007


In [55]:
# Fetch the MSR analogy dataset; its 'X' and 'y' fields are passed to evaluate_analogy below.
msr_data = fetch_msr_analogy()

In [56]:
# Score the embedding on the MSR analogy questions. Per the output below, words
# missing from the embedding are replaced with the mean vector by the evaluator.
dbl_glove_analogy_results = evaluate_analogy(dbl_glove_dict, msr_data['X'], msr_data['y'])

Missing 410 words. Will replace them with mean vector
  A, B, C = np.vstack(w.get(word, mean_vector) for word in X_b[:, 0]), \
  np.vstack(w.get(word, mean_vector) for word in X_b[:, 1]), \
  np.vstack(w.get(word, mean_vector) for word in X_b[:, 2])


In [57]:
# Analogy score returned by evaluate_analogy — presumably accuracy; confirm against web.evaluate.
print(dbl_glove_analogy_results)

0.512125


## Categorization

In [58]:
from web.evaluate import evaluate_categorization
from web.datasets.categorization import fetch_BLESS

### Double Hard Debias Glove

In [59]:
# Fetch the BLESS categorization dataset; its 'X' and 'y' fields are passed to evaluate_categorization below.
bless_data = fetch_BLESS()

In [60]:
# Score the embedding on the BLESS categorization task using the word -> vector dict.
dbl_glove_cat_results = evaluate_categorization(dbl_glove_dict, bless_data['X'], bless_data['y'])

  words = np.vstack(w.get(word, mean_vector) for word in X.flatten())


In [61]:
# Categorization score returned by evaluate_categorization — presumably cluster purity; confirm against web.evaluate.
print(dbl_glove_cat_results)

0.84
