# Word Embedding Evaluations
## Import Statements

In [1]:
import codecs
import json

from utils import *
from cluster import *

## Load Embeddings

In [8]:
# Load the original (non-debiased) GloVe embedding from disk.
# load_embedding's three return values are unpacked as wv / w2i / vocab;
# presumably the vector matrix, a word -> row-index mapping and the word list
# (a later cell indexes glove_wv[glove_w2i['he'], :]) -- TODO confirm in utils.
glove_wv, glove_w2i, glove_vocab = load_embedding('data/glove.txt')

# Load the hard-debiased GloVe embedding, unpacked into the same three parts.
hd_glove_wv, hd_glove_w2i, hd_glove_vocab = load_embedding('data/hard_debias.txt')

## WEAT test

In [3]:
# Word sets for the WEAT tests, all taken from WEAT_words.
# A / B: male and female first names, lower-cased
# (presumably so they match the embedding vocabulary -- TODO confirm).
A, B = ([word.lower() for word in WEAT_words[key]] for key in ('A', 'B'))

# Remaining sets are used exactly as stored, one pair per test.
C, D = WEAT_words['C'], WEAT_words['D']  # career and family
E, F = WEAT_words['E'], WEAT_words['F']  # math and arts
G, H = WEAT_words['G'], WEAT_words['H']  # science and arts

### GloVe

In [12]:
# WEAT on the original GloVe vectors: the name sets A / B are tested against
# each pair of word sets in turn. For every test the title is printed first,
# then the effect size, then the p value.
for test_name, words_1, words_2 in (
    ('Career and Family', C, D),
    ('Math and Arts', E, F),
    ('Science and Arts', G, H),
):
    print(test_name)
    print(effect_size(A, B, words_1, words_2, glove_wv, glove_w2i, glove_vocab))
    print(p_value_test(A, B, words_1, words_2, glove_wv, glove_w2i, glove_vocab))

Career and Family
1.8440745886996364
num of samples 12870
0.0
Math and Arts
0.7598566596555253
num of samples 12870
0.06767676767676768
Science and Arts
1.0550881121697777
num of samples 12870
0.014063714063714063


### Hard-Debiased GloVe

In [13]:
# WEAT on the hard-debiased GloVe vectors: the name sets A / B are tested
# against each pair of word sets in turn. For every test the title is printed
# first, then the effect size, then the p value.
for test_name, words_1, words_2 in (
    ('Career and Family', C, D),
    ('Math and Arts', E, F),
    ('Science and Arts', G, H),
):
    print(test_name)
    print(effect_size(A, B, words_1, words_2, hd_glove_wv, hd_glove_w2i, hd_glove_vocab))
    print(p_value_test(A, B, words_1, words_2, hd_glove_wv, hd_glove_w2i, hd_glove_vocab))

Career and Family
1.6177007410709874
num of samples 12870
7.77000777000777e-05
Math and Arts
0.1489070026968469
num of samples 12870
0.38578088578088576
Science and Arts
0.08429122526252406
num of samples 12870
0.43496503496503497


## Clustering

In [15]:
# Restrict both vocabularies by excluding words that 'should' carry gender
# (explicitly gendered words), so only the remaining words are examined in the
# cells that follow.
gender_specific = []

# Plain-text word lists: every line, stripped of surrounding whitespace,
# becomes one entry. The encoding is given explicitly so decoding does not
# depend on the platform locale.
for word_list_path in ('./data/male_words.txt', './data/female_words.txt'):
    with open(word_list_path, encoding='utf-8') as f:
        gender_specific.extend(line.strip() for line in f)

# Additional gender-specific words stored as JSON (presumably a flat array of
# strings -- TODO confirm). JSON text is UTF-8; the built-in open() replaces
# codecs.open(), which without an encoding argument is the same call.
with open('./data/gender_specific_full.json', encoding='utf-8') as f:
    gender_specific.extend(json.load(f))

# NOTE: the results are unpacked as (vocab, wv, w2i), which is a different
# order from the (wv, w2i, vocab) argument order.
glove_vocab_limit, glove_wv_limit, glove_w2i_limit = limit_vocab(
    glove_wv, glove_w2i, glove_vocab, exclude=gender_specific
)
hd_glove_vocab_limit, hd_glove_wv_limit, hd_glove_w2i_limit = limit_vocab(
    hd_glove_wv, hd_glove_w2i, hd_glove_vocab, exclude=gender_specific
)

size of vocabulary: 47628
size of vocabulary: 47628


In [6]:
# Find the most biased words in the limited GloVe vocabulary.
# The 'he' and 'she' vectors are looked up in the full GloVe matrix, while the
# words being scored come from the limited vocabulary built in the
# vocabulary-limiting cell (which therefore has to run before this one).
he_vector, she_vector = (
    glove_wv[glove_w2i[pronoun], :] for pronoun in ('he', 'she')
)
biased_words = compute_word_bias(
    glove_wv_limit,
    glove_w2i_limit,
    glove_vocab_limit,
    he_vector,
    she_vector,
)

### GloVe

In [7]:
# Run the clustering experiment on the limited GloVe vocabulary for three
# sizes of the biased-word set.
# NOTE(review): the meaning of the positional argument 1 is not visible here
# (possibly a random seed) -- confirm against my_cluster.
for num_words in (100, 500, 1000):
    my_cluster(
        glove_wv_limit,
        glove_w2i_limit,
        1,
        glove_vocab_limit,
        biased_words,
        num_biased_words=num_words,
    )

precision 1.0
precision 1.0
precision 0.9995


### Hard-Debiased GloVe

In [17]:
# Run the clustering experiment on the limited hard-debiased vocabulary for
# three sizes of the biased-word set.
# biased_words is the ranking computed from the original GloVe vectors, so the
# same words are used for both embeddings.
# NOTE(review): the meaning of the positional argument 1 is not visible here
# (possibly a random seed) -- confirm against my_cluster.
for num_words in (100, 500, 1000):
    my_cluster(
        hd_glove_wv_limit,
        hd_glove_w2i_limit,
        1,
        hd_glove_vocab_limit,
        biased_words,
        num_biased_words=num_words,
    )

precision 0.76
precision 0.8
precision 0.8
