# Word Embedding Evaluations
## Import Statements

In [1]:
import codecs
import json

# Star imports keep their original relative order (utils, then cluster) so
# that any name defined by both still resolves to the cluster version.
from utils import *
from cluster import *

## Load Embeddings

In [2]:
# load regular glove
# load_embedding returns three values per file, unpacked here as
# (vectors, word-to-index map, vocabulary); later cells look a word up as
# wv[w2i[word], :] and iterate over the vocabulary.
glove_wv, glove_w2i, glove_vocab = load_embedding('data/glove.txt')

# load hard debiased glove
hd_glove_wv, hd_glove_w2i, hd_glove_vocab = load_embedding('data/hard_debias.txt')

# load double hard debiased glove
# (disabled: every later use of the dbl_* names in this notebook is
# commented out or wrapped in a string literal as well)
#dbl_glove_wv, dbl_glove_w2i, dbl_glove_vocab = load_embedding('data/double_hard_glove.txt')

In [3]:
# Sanity check: the baseline and hard-debiased embeddings should report the
# same length. One value is printed per line.
print(len(glove_wv), len(hd_glove_wv), sep='\n')
#print(len(dbl_glove_wv))

322636
322636


## WEAT test

In [4]:
# load word lists for tests
# A / B: the male and female name lists, lower-cased entry by entry
A, B = ([entry.lower() for entry in WEAT_words[key]] for key in ('A', 'B'))
# C / D: career and family
C, D = WEAT_words['C'], WEAT_words['D']
# E / F: math and arts
E, F = WEAT_words['E'], WEAT_words['F']
# G / H: science and arts
G, H = WEAT_words['G'], WEAT_words['H']

### Glove

In [5]:
# WEAT on the baseline GloVe vectors. For each pair of word sets, print its
# title, then the effect size, then the p value, always measured against the
# name lists A and B.
for label, first_set, second_set in (
    ('Career and Family', C, D),
    ('Math and Arts', E, F),
    ('Science and Arts', G, H),
):
    print(label)
    print(effect_size(A, B, first_set, second_set, glove_wv, glove_w2i, glove_vocab))
    print(p_value_test(A, B, first_set, second_set, glove_wv, glove_w2i, glove_vocab))

Career and Family
1.8440745886996364
num of samples 12870
0.0
Math and Arts
0.7598566596555253
num of samples 12870
0.06775446775446775
Science and Arts
1.0550881121697777
num of samples 12870
0.014063714063714063


### Hard Debias Glove

In [6]:
# WEAT on the hard-debiased GloVe vectors. For each pair of word sets, print
# its title, then the effect size, then the p value, always measured against
# the name lists A and B.
for label, first_set, second_set in (
    ('Career and Family', C, D),
    ('Math and Arts', E, F),
    ('Science and Arts', G, H),
):
    print(label)
    print(effect_size(A, B, first_set, second_set, hd_glove_wv, hd_glove_w2i, hd_glove_vocab))
    print(p_value_test(A, B, first_set, second_set, hd_glove_wv, hd_glove_w2i, hd_glove_vocab))

Career and Family
1.6177007410709874
num of samples 12870
7.77000777000777e-05
Math and Arts
0.1489070026968469
num of samples 12870
0.3857031857031857
Science and Arts
0.08429122526252406
num of samples 12870
0.43496503496503497


### Double Hard Debias

In [7]:
# NOTE(review): the double-hard-debias WEAT run is disabled by wrapping it in
# a string literal rather than `#` comments. Because that literal is the
# cell's last expression, the notebook echoes its repr as the cell output.
# The dbl_glove_* names it refers to are never loaded (the load_embedding
# call for them is commented out), so re-enable both together.
'''
# calculate effect size and p value for career and family
print('Career and Family')
print(effect_size(A, B, C, D, dbl_glove_wv, dbl_glove_w2i, dbl_glove_vocab))
print(p_value_test(A, B, C, D, dbl_glove_wv, dbl_glove_w2i, dbl_glove_vocab))

# calculate effect size and p value for math and arts
print('Math and Arts')
print(effect_size(A, B, E, F, dbl_glove_wv, dbl_glove_w2i, dbl_glove_vocab))
print(p_value_test(A, B, E, F, dbl_glove_wv, dbl_glove_w2i, dbl_glove_vocab))

# calculate effect size and p value for science and arts
print('Science and Arts')
print(effect_size(A, B, G, H, dbl_glove_wv, dbl_glove_w2i, dbl_glove_vocab))
print(p_value_test(A, B, G, H, dbl_glove_wv, dbl_glove_w2i, dbl_glove_vocab))
'''

"\n# calculate effect size and p value for career and family\nprint('Career and Family')\nprint(effect_size(A, B, C, D, dbl_glove_wv, dbl_glove_w2i, dbl_glove_vocab))\nprint(p_value_test(A, B, C, D, dbl_glove_wv, dbl_glove_w2i, dbl_glove_vocab))\n\n# calculate effect size and p value for math and arts\nprint('Math and Arts')\nprint(effect_size(A, B, E, F, dbl_glove_wv, dbl_glove_w2i, dbl_glove_vocab))\nprint(p_value_test(A, B, E, F, dbl_glove_wv, dbl_glove_w2i, dbl_glove_vocab))\n\n# calculate effect size and p value for science and arts\nprint('Science and Arts')\nprint(effect_size(A, B, G, H, dbl_glove_wv, dbl_glove_w2i, dbl_glove_vocab))\nprint(p_value_test(A, B, G, H, dbl_glove_wv, dbl_glove_w2i, dbl_glove_vocab))\n"

## Clustering

In [8]:
# limit vocab by excluding words that 'should' have gender bias
gender_specific = []

# Plain-text word lists: one entry per line, surrounding whitespace stripped.
# The encoding is pinned to UTF-8 so the lists decode the same way on every
# platform instead of following the locale default.
for word_list_path in ('./data/male_words.txt', './data/female_words.txt'):
    with open(word_list_path, encoding='utf-8') as f:
        gender_specific.extend(l.strip() for l in f)

# JSON text is UTF-8 by specification; the built-in open() with an explicit
# encoding replaces codecs.open(), which without an encoding argument added
# nothing over open().
with open('./data/gender_specific_full.json', encoding='utf-8') as f:
    gender_specific.extend(json.load(f))

# limit_vocab returns its results as (vocab, wv, w2i); note this differs from
# the (wv, w2i, vocab) order that load_embedding results are unpacked in.
glove_vocab_limit, glove_wv_limit, glove_w2i_limit = limit_vocab(
    glove_wv, glove_w2i, glove_vocab, exclude=gender_specific)
hd_glove_vocab_limit, hd_glove_wv_limit, hd_glove_w2i_limit = limit_vocab(
    hd_glove_wv, hd_glove_w2i, hd_glove_vocab, exclude=gender_specific)
#dbl_glove_vocab_limit, dbl_glove_wv_limit, dbl_glove_w2i_limit = limit_vocab(dbl_glove_wv, dbl_glove_w2i, dbl_glove_vocab, exclude=gender_specific)

size of vocabulary: 47628
size of vocabulary: 47628


### Glove

In [13]:
# get most biased words
# The 'he' / 'she' rows are looked up in the full GloVe embedding ...
he_index, she_index = glove_w2i['he'], glove_w2i['she']
he_vector = glove_wv[he_index, :]
she_vector = glove_wv[she_index, :]
# ... while the bias scores are computed over the limited vocabulary only.
biased_words = compute_word_bias(
    glove_wv_limit, glove_w2i_limit, glove_vocab_limit,
    he_vector, she_vector,
)

In [14]:
# cluster using limited vocabulary
# One my_cluster run per num_biased_words setting; the saved output shows one
# "precision" line per run.
# NOTE(review): the meaning of the positional argument 1 is not visible in
# this notebook - check my_cluster's signature.
for n in (100, 500, 1000):
    my_cluster(
        glove_wv_limit, glove_w2i_limit, 1, glove_vocab_limit,
        biased_words, num_biased_words=n,
    )

precision 1.0
precision 1.0
precision 0.9995


### Hard Debias Glove

In [15]:
# cluster using limited vocabulary
# biased_words is the list computed from the baseline GloVe vectors in the
# "Glove" clustering section, so that cell must have been run first.
# NOTE(review): the meaning of the positional argument 1 is not visible in
# this notebook - check my_cluster's signature.
for n in (100, 500, 1000):
    my_cluster(
        hd_glove_wv_limit, hd_glove_w2i_limit, 1, hd_glove_vocab_limit,
        biased_words, num_biased_words=n,
    )

precision 0.76
precision 0.8
precision 0.8


### Double Hard Debias Glove

In [11]:
# NOTE(review): disabled via a string literal. The dbl_glove_*_limit names it
# uses are never created, because the matching limit_vocab call is commented
# out. The saved "precision ..." output under this cell cannot come from the
# cell as written (it only evaluates a string) and is identical to the
# baseline GloVe clustering output, so treat it as stale.
'''
# cluster using limited vocabulary
for n in [100, 500, 1000]:
    my_cluster(dbl_glove_wv_limit, dbl_glove_w2i_limit, 1, dbl_glove_vocab_limit, biased_words, num_biased_words=n)
'''

precision 1.0
precision 1.0
precision 0.9995


## Word Analogy

In [33]:
from web.evaluate import evaluate_on_semeval_2012_2
from web.datasets.analogy import fetch_msr_analogy
from web.evaluate import evaluate_analogy

### Glove

In [23]:
# build word dictionary
# Maps every vocabulary word to its row in the baseline GloVe embedding; this
# is the form passed to the web.evaluate functions below.
glove_dict = {
    token: glove_wv[glove_w2i[token], :]
    for token in glove_vocab
}

In [25]:
# Run evaluate_on_semeval_2012_2 on the baseline GloVe dictionary; the result
# is kept so its 'all' entry can be reported in the next cell.
glove_results = evaluate_on_semeval_2012_2(glove_dict)

In [26]:
# Report the 'all' entry of the SemEval-2012 Task 2 result for baseline GloVe.
print(glove_results['all'])

0.17252551510397837


In [31]:
# Fetch the MSR analogy data; the analogy cells read its 'X' and 'y' entries
# and share it between both embeddings.
# NOTE: per the saved output, the fetch created the dataset under the user's
# home directory (web_data/analogy/EN-MSR).
msr_data = fetch_msr_analogy()


Dataset created in /Users/ericaustin/web_data/analogy/EN-MSR



In [34]:
# Run evaluate_analogy on the baseline GloVe dictionary against the MSR data.
# The saved output reports 410 words missing from the dictionary, which the
# library replaces with a mean vector.
glove_analogy_results = evaluate_analogy(glove_dict, msr_data['X'], msr_data['y'])

Missing 410 words. Will replace them with mean vector
  A, B, C = np.vstack(w.get(word, mean_vector) for word in X_b[:, 0]), \
  np.vstack(w.get(word, mean_vector) for word in X_b[:, 1]), \
  np.vstack(w.get(word, mean_vector) for word in X_b[:, 2])


In [35]:
# Report the MSR analogy result for baseline GloVe.
print(glove_analogy_results)

0.446375


### Hard Debias Glove

In [27]:
# build word dictionary
# Maps every vocabulary word to its row in the hard-debiased embedding; this
# is the form passed to the web.evaluate functions below.
hd_glove_dict = {
    token: hd_glove_wv[hd_glove_w2i[token], :]
    for token in hd_glove_vocab
}

In [28]:
# Run evaluate_on_semeval_2012_2 on the hard-debiased dictionary; the result
# is kept so its 'all' entry can be reported in the next cell.
hd_glove_results = evaluate_on_semeval_2012_2(hd_glove_dict)

In [29]:
# Report the 'all' entry of the SemEval-2012 Task 2 result for hard-debiased GloVe.
print(hd_glove_results['all'])

0.17629261126785614


In [36]:
# Run evaluate_analogy on the hard-debiased dictionary, reusing the msr_data
# fetched in the "Glove" analogy section (that cell must have been run first).
# The saved output reports 410 missing words replaced with a mean vector.
hd_glove_analogy_results = evaluate_analogy(hd_glove_dict, msr_data['X'], msr_data['y'])

Missing 410 words. Will replace them with mean vector


In [37]:
# Report the MSR analogy result for hard-debiased GloVe.
print(hd_glove_analogy_results)

0.511625


## Categorization

In [39]:
from web.evaluate import evaluate_categorization
from web.datasets.categorization import fetch_BLESS

### Glove

In [40]:
# Fetch the BLESS data; the categorization cells read its 'X' and 'y' entries
# and share it between both embeddings.
bless_data = fetch_BLESS()

In [42]:
# Run evaluate_categorization on the baseline GloVe dictionary against the
# BLESS data's 'X' and 'y' entries.
glove_cat_results = evaluate_categorization(glove_dict, bless_data['X'], bless_data['y'])

  words = np.vstack(w.get(word, mean_vector) for word in X.flatten())


In [43]:
# Report the BLESS categorization result for baseline GloVe.
print(glove_cat_results)

0.81


In [44]:
# Run evaluate_categorization on the hard-debiased dictionary against the same
# bless_data used for baseline GloVe.
hd_glove_cat_results = evaluate_categorization(hd_glove_dict, bless_data['X'], bless_data['y'])

  words = np.vstack(w.get(word, mean_vector) for word in X.flatten())


In [45]:
# Report the BLESS categorization result for hard-debiased GloVe.
print(hd_glove_cat_results)

0.84
