# Word Embedding Evaluations
## Import Statements

In [None]:
from utils import *
from cluster import *
import codecs


/content/drive/.shortcut-targets-by-id/1Aj4y870cQktsbomfGNEXbJwEQXSsQ6T5/CMPUT 654 Project/Evaluation
/content/drive/.shortcut-targets-by-id/1Aj4y870cQktsbomfGNEXbJwEQXSsQ6T5/CMPUT 654 Project


## Load Embeddings

In [None]:
# Load the three embedding files being compared. Each load_embedding result is
# unpacked as (wv, w2i, vocab); later cells index rows as wv[w2i[word], :] and
# iterate vocab word by word.

# load regular glove
glove_wv, glove_w2i, glove_vocab = load_embedding('data/glove.txt')

# load hard debiased glove
hd_glove_wv, hd_glove_w2i, hd_glove_vocab = load_embedding('data/hard_debias.txt')

# load double hard debiased glove
dbl_glove_wv, dbl_glove_w2i, dbl_glove_vocab = load_embedding('data/double_hard_glove.txt')

In [None]:
# Sanity check: print the length of each loaded embedding, one per line, in the
# order GloVe, Hard Debias, Double Hard Debias.
print(len(glove_wv), len(hd_glove_wv), len(dbl_glove_wv), sep='\n')

322636
322636
322636


## WEAT test

In [None]:
# Word lists for the WEAT tests, all read from WEAT_words.
# A, B: male and female names, lower-cased
A, B = ([entry.lower() for entry in WEAT_words[key]] for key in ('A', 'B'))
# C, D: career and family
C, D = WEAT_words['C'], WEAT_words['D']
# E, F: math and arts
E, F = WEAT_words['E'], WEAT_words['F']
# G, H: science and arts
G, H = WEAT_words['G'], WEAT_words['H']

### Glove

In [None]:
# Run the WEAT effect-size and p-value tests on plain GloVe for each pair of
# word sets, printing a heading followed by the two results.
weat_pairs = (
    ('Career and Family', C, D),
    ('Math and Arts', E, F),
    ('Science and Arts', G, H),
)
for weat_heading, weat_set_1, weat_set_2 in weat_pairs:
    print(weat_heading)
    print(effect_size(A, B, weat_set_1, weat_set_2, glove_wv, glove_w2i, glove_vocab))
    print(p_value_test(A, B, weat_set_1, weat_set_2, glove_wv, glove_w2i, glove_vocab))

Career and Family
1.8440745886996364
num of samples 12870
7.77000777000777e-05
Math and Arts
0.7598566596555253
num of samples 12870
0.06767676767676768
Science and Arts
1.0550881121697777
num of samples 12870
0.014063714063714063


### Hard Debias Glove

In [None]:
# Run the WEAT effect-size and p-value tests on Hard Debias GloVe for each pair
# of word sets, printing a heading followed by the two results.
weat_pairs = (
    ('Career and Family', C, D),
    ('Math and Arts', E, F),
    ('Science and Arts', G, H),
)
for weat_heading, weat_set_1, weat_set_2 in weat_pairs:
    print(weat_heading)
    print(effect_size(A, B, weat_set_1, weat_set_2, hd_glove_wv, hd_glove_w2i, hd_glove_vocab))
    print(p_value_test(A, B, weat_set_1, weat_set_2, hd_glove_wv, hd_glove_w2i, hd_glove_vocab))

Career and Family
1.6177007410709874
num of samples 12870
7.77000777000777e-05
Math and Arts
0.14890700269684617
num of samples 12870
0.38578088578088576
Science and Arts
0.08429122526252365
num of samples 12870
0.43504273504273505


### Double Hard Debias Glove

In [None]:
# Run the WEAT effect-size and p-value tests on Double Hard Debias GloVe for
# each pair of word sets, printing a heading followed by the two results.
weat_pairs = (
    ('Career and Family', C, D),
    ('Math and Arts', E, F),
    ('Science and Arts', G, H),
)
for weat_heading, weat_set_1, weat_set_2 in weat_pairs:
    print(weat_heading)
    print(effect_size(A, B, weat_set_1, weat_set_2, dbl_glove_wv, dbl_glove_w2i, dbl_glove_vocab))
    print(p_value_test(A, B, weat_set_1, weat_set_2, dbl_glove_wv, dbl_glove_w2i, dbl_glove_vocab))

Career and Family
1.4562378428843936
num of samples 12870
0.0004662004662004662
Math and Arts
0.7520731402456355
num of samples 12870
0.06752136752136752
Science and Arts
0.13660239768806012
num of samples 12870
0.39533799533799535


## Clustering

In [None]:
# limit vocab by excluding words that 'should' have gender bias
import json  # imported explicitly; previously only reachable via a star import

# Gather the words to exclude: every stripped line of the two plain-text lists,
# followed by the entries of the JSON list.
gender_specific = []
for word_list_path in ('./data/male_words.txt', './data/female_words.txt'):
    # Explicit encoding so the result does not depend on the platform default.
    with open(word_list_path, encoding='utf-8') as f:
        gender_specific.extend(line.strip() for line in f)

with open('./data/gender_specific_full.json', encoding='utf-8') as f:
    gender_specific.extend(json.load(f))

# Apply the same exclusion list to all three embeddings. Note the unpacking
# order here is (vocab, wv, w2i).
glove_vocab_limit, glove_wv_limit, glove_w2i_limit = limit_vocab(glove_wv, glove_w2i, glove_vocab, exclude=gender_specific)
hd_glove_vocab_limit, hd_glove_wv_limit, hd_glove_w2i_limit = limit_vocab(hd_glove_wv, hd_glove_w2i, hd_glove_vocab, exclude=gender_specific)
dbl_glove_vocab_limit, dbl_glove_wv_limit, dbl_glove_w2i_limit = limit_vocab(dbl_glove_wv, dbl_glove_w2i, dbl_glove_vocab, exclude=gender_specific)

size of vocabulary: 47628
size of vocabulary: 47628
size of vocabulary: 47628


### Glove

In [None]:
# get most biased words
# The 'he' and 'she' rows are taken from the full (unrestricted) GloVe matrix.
he_vector = glove_wv[glove_w2i['he'], :]
she_vector = glove_wv[glove_w2i['she'], :]
# Bias is computed over the limited plain-GloVe vocabulary. The resulting
# biased_words is also passed to the Hard Debias and Double Hard Debias
# clustering calls later in the notebook.
biased_words = compute_word_bias(glove_wv_limit, glove_w2i_limit, glove_vocab_limit, he_vector, she_vector)

In [None]:
# cluster the limited plain-GloVe vocabulary, once per num_biased_words setting
for top_k in (100, 500, 1000):
    my_cluster(
        glove_wv_limit,
        glove_w2i_limit,
        1,
        glove_vocab_limit,
        biased_words,
        num_biased_words=top_k,
    )

precision 1.0
precision 1.0
precision 0.9995


### Hard Debias Glove

In [None]:
# cluster the limited Hard Debias vocabulary, once per num_biased_words setting
for top_k in (100, 500, 1000):
    my_cluster(
        hd_glove_wv_limit,
        hd_glove_w2i_limit,
        1,
        hd_glove_vocab_limit,
        biased_words,
        num_biased_words=top_k,
    )

precision 0.76
precision 0.8
precision 0.8


### Double Hard Debias Glove

In [None]:
# cluster the limited Double Hard Debias vocabulary, once per num_biased_words setting
for top_k in (100, 500, 1000):
    my_cluster(
        dbl_glove_wv_limit,
        dbl_glove_w2i_limit,
        1,
        dbl_glove_vocab_limit,
        biased_words,
        num_biased_words=top_k,
    )

precision 0.76
precision 0.81
precision 0.8085


## Word Analogy
### Google, MSR

In [None]:
cd benchmarks/

/content/drive/.shortcut-targets-by-id/1Aj4y870cQktsbomfGNEXbJwEQXSsQ6T5/CMPUT 654 Project/benchmarks


In [None]:
from web.evaluate import evaluate_on_semeval_2012_2
from web.datasets.analogy import fetch_msr_analogy, fetch_google_analogy
from web.evaluate import evaluate_analogy



### Glove Analogy

In [None]:
# Map every GloVe vocabulary word to its row of the embedding matrix; this
# word -> vector dict is what the benchmark evaluators below are given.
glove_dict = {token: glove_wv[glove_w2i[token], :] for token in glove_vocab}

In [None]:
# glove_results = evaluate_on_semeval_2012_2(glove_dict)


Dataset created in /root/web_data/analogy



  prot_left = np.mean(np.vstack(w.get(word, mean_vector) for word in prototypes[:, 0]), axis=0)
  prot_right = np.mean(np.vstack(w.get(word, mean_vector) for word in prototypes[:, 1]), axis=0)
  question_left, question_right = np.vstack(w.get(word, mean_vector) for word in questions[:, 0]), \
  np.vstack(w.get(word, mean_vector) for word in questions[:, 1])


In [None]:
# print(glove_results['all'])

0.17252551510397837


In [None]:
# Fetch the MSR analogy dataset; its 'X' and 'y' entries are passed to
# evaluate_analogy for each embedding.
msr_data = fetch_msr_analogy()


Dataset created in /root/web_data/analogy/EN-MSR



In [None]:
# Evaluate plain GloVe on the MSR analogy data.
glove_analogy_MSR_results = evaluate_analogy(glove_dict, msr_data['X'], msr_data['y'])

Missing 410 words. Will replace them with mean vector
  A, B, C = np.vstack(w.get(word, mean_vector) for word in X_b[:, 0]), \
  np.vstack(w.get(word, mean_vector) for word in X_b[:, 1]), \
  np.vstack(w.get(word, mean_vector) for word in X_b[:, 2])


In [None]:
# Show the MSR analogy result for plain GloVe.
print(glove_analogy_MSR_results)

0.446375


In [None]:
# Fetch the Google analogy dataset; its 'X' and 'y' entries are passed to
# evaluate_analogy for each embedding.
google_data = fetch_google_analogy()


Dataset created in /root/web_data/analogy/EN-GOOGLE



In [None]:
# Evaluate plain GloVe on the Google analogy data.
glove_analogy_google_results = evaluate_analogy(glove_dict, google_data['X'], google_data['y'])


Dataset created in /root/web_data/analogy/EN-GOOGLE



  A, B, C = np.vstack(w.get(word, mean_vector) for word in X_b[:, 0]), \
  np.vstack(w.get(word, mean_vector) for word in X_b[:, 1]), \
  np.vstack(w.get(word, mean_vector) for word in X_b[:, 2])


In [None]:
# Show the Google analogy result for plain GloVe.
print(glove_analogy_google_results)

0.6440851412198118


### Hard Debias Glove Analogy

In [None]:
# Map every Hard Debias vocabulary word to its row of the Hard Debias matrix;
# this word -> vector dict is what the benchmark evaluators below are given.
hd_glove_dict = {token: hd_glove_wv[hd_glove_w2i[token], :] for token in hd_glove_vocab}

In [None]:
# hd_glove_results = evaluate_on_semeval_2012_2(hd_glove_dict)

  prot_left = np.mean(np.vstack(w.get(word, mean_vector) for word in prototypes[:, 0]), axis=0)
  prot_right = np.mean(np.vstack(w.get(word, mean_vector) for word in prototypes[:, 1]), axis=0)
  question_left, question_right = np.vstack(w.get(word, mean_vector) for word in questions[:, 0]), \
  np.vstack(w.get(word, mean_vector) for word in questions[:, 1])


In [None]:
# Disabled to match the SemEval evaluation cell above, which is commented out:
# hd_glove_results is never assigned on a fresh run, so printing it would raise
# NameError. Re-enable together with that evaluation cell.
# print(hd_glove_results['all'])

0.17629261126785617


In [None]:
# Evaluate Hard Debias GloVe on the MSR analogy data.
hd_analogy_MSR_results = evaluate_analogy(hd_glove_dict, msr_data['X'], msr_data['y'])

Missing 410 words. Will replace them with mean vector
  A, B, C = np.vstack(w.get(word, mean_vector) for word in X_b[:, 0]), \
  np.vstack(w.get(word, mean_vector) for word in X_b[:, 1]), \
  np.vstack(w.get(word, mean_vector) for word in X_b[:, 2])


In [None]:
# Show the MSR analogy result for Hard Debias GloVe.
print(hd_analogy_MSR_results)

0.511625


In [None]:
# Evaluate Hard Debias GloVe on the Google analogy data and show the result.
hd_analogy_google_results = evaluate_analogy(hd_glove_dict, google_data['X'], google_data['y'])
# The printed name must match the assignment above (single underscore after
# "hd"); the previous spelling was undefined and raised NameError.
print(hd_analogy_google_results)

  A, B, C = np.vstack(w.get(word, mean_vector) for word in X_b[:, 0]), \
  np.vstack(w.get(word, mean_vector) for word in X_b[:, 1]), \
  np.vstack(w.get(word, mean_vector) for word in X_b[:, 2])


0.7056385591485878


### Double Hard Debias Glove Analogy

In [None]:
# build word dictionary
# Iterate the Double Hard Debias vocabulary itself (not hd_glove_vocab), so that
# every key looked up in dbl_glove_w2i comes from the matching embedding.
dbl_glove_dict = {}
for word in dbl_glove_vocab:
    dbl_glove_dict[word] = dbl_glove_wv[dbl_glove_w2i[word], :]

In [None]:
# Evaluate Double Hard Debias GloVe on the MSR analogy data.
dbl_analogy_MSR_results = evaluate_analogy(dbl_glove_dict, msr_data['X'], msr_data['y'])

  A, B, C = np.vstack(w.get(word, mean_vector) for word in X_b[:, 0]), \
  np.vstack(w.get(word, mean_vector) for word in X_b[:, 1]), \
  np.vstack(w.get(word, mean_vector) for word in X_b[:, 2])


In [None]:
# Show the MSR analogy result for Double Hard Debias GloVe.
print(dbl_analogy_MSR_results)

0.512125


In [None]:
# Evaluate Double Hard Debias GloVe on the Google analogy data.
dbl_analogy_google_results = evaluate_analogy(dbl_glove_dict, google_data['X'], google_data['y'])

In [None]:
# Show the Google analogy result for Double Hard Debias GloVe.
print(dbl_analogy_google_results)

## Categorization

In [None]:
from web.evaluate import evaluate_categorization
from web.datasets.categorization import fetch_BLESS, fetch_battig, fetch_AP, fetch_ESSLI_1a, fetch_ESSLI_2b, fetch_ESSLI_2c

### Glove

In [None]:
# Fetch the six categorization datasets once; the same objects are reused for
# all three embeddings. Each is later accessed through its 'X' and 'y' entries.
bless_data = fetch_BLESS()
battig_data = fetch_battig()
ap_data = fetch_AP()
ESSLI_1a_data  = fetch_ESSLI_1a()
ESSLI_2b_data  = fetch_ESSLI_2b()
ESSLI_2c_data  = fetch_ESSLI_2c()

In [None]:
# Run evaluate_categorization for plain GloVe on each dataset
# (BLESS, Battig, AP, ESSLI 1a / 2b / 2c), one result variable per dataset.
glove_cat_results_bless = evaluate_categorization(glove_dict, bless_data['X'], bless_data['y'])
glove_cat_results_battig = evaluate_categorization(glove_dict, battig_data['X'], battig_data['y'])
glove_cat_results_ap = evaluate_categorization(glove_dict, ap_data['X'], ap_data['y'])
glove_cat_results_ESSLI_1a = evaluate_categorization(glove_dict, ESSLI_1a_data['X'], ESSLI_1a_data['y'])
glove_cat_results_ESSLI_2b = evaluate_categorization(glove_dict, ESSLI_2b_data['X'], ESSLI_2b_data['y'])
glove_cat_results_ESSLI_2c = evaluate_categorization(glove_dict, ESSLI_2c_data['X'], ESSLI_2c_data['y'])

In [None]:
# Report the plain-GloVe categorization results: labelled lines for BLESS,
# Battig and AP, the three raw ESSLI values, then the mean of those three.
print("BLESS: " + str(glove_cat_results_bless))
print("Battig: " + str(glove_cat_results_battig))
# "AP: " now carries the same trailing space as every other label.
print("AP: " + str(glove_cat_results_ap))
print(glove_cat_results_ESSLI_1a)
print(glove_cat_results_ESSLI_2b)
print(glove_cat_results_ESSLI_2c)
print("ESSLI_avg: " + str((glove_cat_results_ESSLI_1a + glove_cat_results_ESSLI_2b + glove_cat_results_ESSLI_2c)/3))

### Hard Debias Glove

In [None]:
# Run evaluate_categorization for Hard Debias GloVe on each dataset
# (BLESS, Battig, AP, ESSLI 1a / 2b / 2c), one result variable per dataset.
hd_glove_cat_results_bless = evaluate_categorization(hd_glove_dict, bless_data['X'], bless_data['y'])
hd_glove_cat_results_battig = evaluate_categorization(hd_glove_dict, battig_data['X'], battig_data['y'])
hd_glove_cat_results_ap = evaluate_categorization(hd_glove_dict, ap_data['X'], ap_data['y'])
hd_glove_cat_results_ESSLI_1a = evaluate_categorization(hd_glove_dict, ESSLI_1a_data['X'], ESSLI_1a_data['y'])
hd_glove_cat_results_ESSLI_2b = evaluate_categorization(hd_glove_dict, ESSLI_2b_data['X'], ESSLI_2b_data['y'])
hd_glove_cat_results_ESSLI_2c = evaluate_categorization(hd_glove_dict, ESSLI_2c_data['X'], ESSLI_2c_data['y'])

In [None]:
# Report the Hard Debias GloVe categorization results: labelled lines for
# BLESS, Battig and AP, the three raw ESSLI values, then the mean of those three.
print("BLESS:", hd_glove_cat_results_bless)
print("Battig:", hd_glove_cat_results_battig)
print("AP:", hd_glove_cat_results_ap)
print(
    hd_glove_cat_results_ESSLI_1a,
    hd_glove_cat_results_ESSLI_2b,
    hd_glove_cat_results_ESSLI_2c,
    sep="\n",
)
print(
    "ESSLI_avg:",
    (hd_glove_cat_results_ESSLI_1a + hd_glove_cat_results_ESSLI_2b + hd_glove_cat_results_ESSLI_2c) / 3,
)

### Double Hard Debias Glove

In [None]:
# Run evaluate_categorization for Double Hard Debias GloVe on each dataset
# (BLESS, Battig, AP, ESSLI 1a / 2b / 2c), one result variable per dataset.
dbl_glove_cat_results_bless = evaluate_categorization(dbl_glove_dict, bless_data['X'], bless_data['y'])
dbl_glove_cat_results_battig = evaluate_categorization(dbl_glove_dict, battig_data['X'], battig_data['y'])
dbl_glove_cat_results_ap = evaluate_categorization(dbl_glove_dict, ap_data['X'], ap_data['y'])
dbl_glove_cat_results_ESSLI_1a = evaluate_categorization(dbl_glove_dict, ESSLI_1a_data['X'], ESSLI_1a_data['y'])
dbl_glove_cat_results_ESSLI_2b = evaluate_categorization(dbl_glove_dict, ESSLI_2b_data['X'], ESSLI_2b_data['y'])
dbl_glove_cat_results_ESSLI_2c = evaluate_categorization(dbl_glove_dict, ESSLI_2c_data['X'], ESSLI_2c_data['y'])

In [None]:
# Report the Double Hard Debias GloVe categorization results: labelled lines
# for BLESS, Battig and AP, the three raw ESSLI values, then their mean.
print("BLESS:", dbl_glove_cat_results_bless)
print("Battig:", dbl_glove_cat_results_battig)
print("AP:", dbl_glove_cat_results_ap)
print(
    dbl_glove_cat_results_ESSLI_1a,
    dbl_glove_cat_results_ESSLI_2b,
    dbl_glove_cat_results_ESSLI_2c,
    sep="\n",
)
print(
    "ESSLI_avg:",
    (dbl_glove_cat_results_ESSLI_1a + dbl_glove_cat_results_ESSLI_2b + dbl_glove_cat_results_ESSLI_2c) / 3,
)

In [None]:
# Report the Double Hard Debias GloVe categorization results: labelled lines
# for BLESS, Battig and AP, the three raw ESSLI values, then their mean.
print("BLESS:", dbl_glove_cat_results_bless)
print("Battig:", dbl_glove_cat_results_battig)
print("AP:", dbl_glove_cat_results_ap)
print(
    dbl_glove_cat_results_ESSLI_1a,
    dbl_glove_cat_results_ESSLI_2b,
    dbl_glove_cat_results_ESSLI_2c,
    sep="\n",
)
print(
    "ESSLI_avg:",
    (dbl_glove_cat_results_ESSLI_1a + dbl_glove_cat_results_ESSLI_2b + dbl_glove_cat_results_ESSLI_2c) / 3,
)

  A, B, C = np.vstack(w.get(word, mean_vector) for word in X_b[:, 0]), \
  np.vstack(w.get(word, mean_vector) for word in X_b[:, 1]), \
  np.vstack(w.get(word, mean_vector) for word in X_b[:, 2])


In [None]:
# Show the Google analogy result for Double Hard Debias GloVe.
print(dbl_analogy_google_results)

0.7082480556692591


## Categorization

In [None]:
from web.evaluate import evaluate_categorization
from web.datasets.categorization import fetch_BLESS, fetch_battig, fetch_AP, fetch_ESSLI_1a, fetch_ESSLI_2b, fetch_ESSLI_2c

### Glove

In [None]:
# Fetch the six categorization datasets once; the same objects are reused for
# all three embeddings. Each is later accessed through its 'X' and 'y' entries.
bless_data = fetch_BLESS()
battig_data = fetch_battig()
ap_data = fetch_AP()
ESSLI_1a_data  = fetch_ESSLI_1a()
ESSLI_2b_data  = fetch_ESSLI_2b()
ESSLI_2c_data  = fetch_ESSLI_2c()

In [None]:
# Run evaluate_categorization for plain GloVe on each dataset
# (BLESS, Battig, AP, ESSLI 1a / 2b / 2c), one result variable per dataset.
glove_cat_results_bless = evaluate_categorization(glove_dict, bless_data['X'], bless_data['y'])
glove_cat_results_battig = evaluate_categorization(glove_dict, battig_data['X'], battig_data['y'])
glove_cat_results_ap = evaluate_categorization(glove_dict, ap_data['X'], ap_data['y'])
glove_cat_results_ESSLI_1a = evaluate_categorization(glove_dict, ESSLI_1a_data['X'], ESSLI_1a_data['y'])
glove_cat_results_ESSLI_2b = evaluate_categorization(glove_dict, ESSLI_2b_data['X'], ESSLI_2b_data['y'])
glove_cat_results_ESSLI_2c = evaluate_categorization(glove_dict, ESSLI_2c_data['X'], ESSLI_2c_data['y'])

  words = np.vstack(w.get(word, mean_vector) for word in X.flatten())
  words = np.vstack(w.get(word, mean_vector) for word in X.flatten())
  words = np.vstack(w.get(word, mean_vector) for word in X.flatten())
  words = np.vstack(w.get(word, mean_vector) for word in X.flatten())
  words = np.vstack(w.get(word, mean_vector) for word in X.flatten())
  words = np.vstack(w.get(word, mean_vector) for word in X.flatten())


In [None]:
# Report the plain-GloVe categorization results: labelled lines for BLESS,
# Battig and AP, the three raw ESSLI values, then the mean of those three.
print("BLESS: " + str(glove_cat_results_bless))
print("Battig: " + str(glove_cat_results_battig))
# "AP: " now carries the same trailing space as every other label.
print("AP: " + str(glove_cat_results_ap))
print(glove_cat_results_ESSLI_1a)
print(glove_cat_results_ESSLI_2b)
print(glove_cat_results_ESSLI_2c)
print("ESSLI_avg: " + str((glove_cat_results_ESSLI_1a + glove_cat_results_ESSLI_2b + glove_cat_results_ESSLI_2c)/3))

BLESS: 0.81
Battig: 0.40126170904224817
AP:0.5522388059701493
0.7727272727272727
0.775
0.5777777777777778
ESSLI_avg: 0.7085016835016836


### Hard Debias Glove

In [None]:
# Run evaluate_categorization for Hard Debias GloVe on each dataset
# (BLESS, Battig, AP, ESSLI 1a / 2b / 2c), one result variable per dataset.
hd_glove_cat_results_bless = evaluate_categorization(hd_glove_dict, bless_data['X'], bless_data['y'])
hd_glove_cat_results_battig = evaluate_categorization(hd_glove_dict, battig_data['X'], battig_data['y'])
hd_glove_cat_results_ap = evaluate_categorization(hd_glove_dict, ap_data['X'], ap_data['y'])
hd_glove_cat_results_ESSLI_1a = evaluate_categorization(hd_glove_dict, ESSLI_1a_data['X'], ESSLI_1a_data['y'])
hd_glove_cat_results_ESSLI_2b = evaluate_categorization(hd_glove_dict, ESSLI_2b_data['X'], ESSLI_2b_data['y'])
hd_glove_cat_results_ESSLI_2c = evaluate_categorization(hd_glove_dict, ESSLI_2c_data['X'], ESSLI_2c_data['y'])

  words = np.vstack(w.get(word, mean_vector) for word in X.flatten())
  words = np.vstack(w.get(word, mean_vector) for word in X.flatten())
  words = np.vstack(w.get(word, mean_vector) for word in X.flatten())
  words = np.vstack(w.get(word, mean_vector) for word in X.flatten())
  words = np.vstack(w.get(word, mean_vector) for word in X.flatten())
  words = np.vstack(w.get(word, mean_vector) for word in X.flatten())


In [None]:
# Report the Hard Debias GloVe categorization results: labelled lines for
# BLESS, Battig and AP, the three raw ESSLI values, then the mean of those three.
print("BLESS:", hd_glove_cat_results_bless)
print("Battig:", hd_glove_cat_results_battig)
print("AP:", hd_glove_cat_results_ap)
print(
    hd_glove_cat_results_ESSLI_1a,
    hd_glove_cat_results_ESSLI_2b,
    hd_glove_cat_results_ESSLI_2c,
    sep="\n",
)
print(
    "ESSLI_avg:",
    (hd_glove_cat_results_ESSLI_1a + hd_glove_cat_results_ESSLI_2b + hd_glove_cat_results_ESSLI_2c) / 3,
)

BLESS: 0.84
Battig: 0.39915886063850126
AP:0.6194029850746269
0.7954545454545455
0.8
0.6444444444444445
ESSLI_avg: 0.7466329966329966


### Double Hard Debias Glove

In [None]:
# Run evaluate_categorization for Double Hard Debias GloVe on each dataset
# (BLESS, Battig, AP, ESSLI 1a / 2b / 2c), one result variable per dataset.
dbl_glove_cat_results_bless = evaluate_categorization(dbl_glove_dict, bless_data['X'], bless_data['y'])
dbl_glove_cat_results_battig = evaluate_categorization(dbl_glove_dict, battig_data['X'], battig_data['y'])
dbl_glove_cat_results_ap = evaluate_categorization(dbl_glove_dict, ap_data['X'], ap_data['y'])
dbl_glove_cat_results_ESSLI_1a = evaluate_categorization(dbl_glove_dict, ESSLI_1a_data['X'], ESSLI_1a_data['y'])
dbl_glove_cat_results_ESSLI_2b = evaluate_categorization(dbl_glove_dict, ESSLI_2b_data['X'], ESSLI_2b_data['y'])
dbl_glove_cat_results_ESSLI_2c = evaluate_categorization(dbl_glove_dict, ESSLI_2c_data['X'], ESSLI_2c_data['y'])

  words = np.vstack(w.get(word, mean_vector) for word in X.flatten())
  words = np.vstack(w.get(word, mean_vector) for word in X.flatten())
  words = np.vstack(w.get(word, mean_vector) for word in X.flatten())
  words = np.vstack(w.get(word, mean_vector) for word in X.flatten())
  words = np.vstack(w.get(word, mean_vector) for word in X.flatten())
  words = np.vstack(w.get(word, mean_vector) for word in X.flatten())


In [None]:
# Report the Double Hard Debias GloVe categorization results: labelled lines
# for BLESS, Battig and AP, the three raw ESSLI values, then their mean.
print("BLESS:", dbl_glove_cat_results_bless)
print("Battig:", dbl_glove_cat_results_battig)
print("AP:", dbl_glove_cat_results_ap)
print(
    dbl_glove_cat_results_ESSLI_1a,
    dbl_glove_cat_results_ESSLI_2b,
    dbl_glove_cat_results_ESSLI_2c,
    sep="\n",
)
print(
    "ESSLI_avg:",
    (dbl_glove_cat_results_ESSLI_1a + dbl_glove_cat_results_ESSLI_2b + dbl_glove_cat_results_ESSLI_2c) / 3,
)

BLESS: 0.84
Battig: 0.40011470082202255
AP: 0.6293532338308457
0.7954545454545455
0.75
0.5777777777777778
ESSLI_avg: 0.7077441077441078
