From Double-Hard Debias: [GloVe_Debias.ipynb](https://github.com/uvavision/Double-Hard-Debias/blob/master/GloVe_Debias.ipynb)

In [1]:
import gensim
import numpy as np
from numpy import linalg as LA
import scipy
import codecs, os, json
import operator
import pickle
from random import shuffle

%load_ext autoreload
%autoreload 2

In [2]:
#loading Embeddings: Double Hard Debias

def normalize(wv):
    """Scale every row of ``wv`` to unit L2 norm.

    Parameters
    ----------
    wv : array-like of shape (n_rows, dim)
        One vector per row.

    Returns
    -------
    np.ndarray
        A new array of the same shape where each row has been divided by
        its own Euclidean norm. The input is not modified.

    Notes
    -----
    An all-zero row has norm 0, so the division yields NaN for that row
    (with a NumPy RuntimeWarning); this is not masked here.
    """
    wv = np.asarray(wv)
    # One vectorised call instead of invoking LA.norm once per row from Python;
    # unlike np.apply_along_axis this also works for an empty (0, dim) matrix.
    norms = LA.norm(wv, axis=1, keepdims=True)
    return wv / norms

def load_w2v(file_path):
    """Load embeddings stored in the binary word2vec format via gensim.

    Parameters
    ----------
    file_path : str
        Path passed to ``KeyedVectors.load_word2vec_format(..., binary=True)``.

    Returns
    -------
    wv : np.ndarray
        One row per word, ordered by the model's vocabulary index.
    w2i : dict
        Maps each word to its row index in ``wv``.
    vocab : list
        Words ordered by their vocabulary index.
    """
    model = gensim.models.KeyedVectors.load_word2vec_format(file_path, binary=True)

    # gensim >= 4 removed ``KeyedVectors.vocab``; the index-ordered word list
    # is exposed as ``index_to_key`` instead. Fall back to the legacy
    # ``vocab`` mapping (entries carry an ``.index``) for gensim < 4.
    index_to_key = getattr(model, 'index_to_key', None)
    if index_to_key is not None:
        vocab = list(index_to_key)
    else:
        legacy_vocab = model.vocab
        vocab = sorted(legacy_vocab, key=lambda w: legacy_vocab[w].index)

    w2i = {w: i for i, w in enumerate(vocab)}
    wv = np.array([model[w] for w in vocab])
    print(len(vocab), wv.shape, len(w2i))

    return wv, w2i, vocab

def load_embeddings_p(path):
    """Load embeddings from a pickle file (double hard debias format).

    The pickle is expected to contain a mapping from word to vector; it is
    iterated for its keys and indexed by each key.

    Parameters
    ----------
    path : str
        Path of the pickle file.

    Returns
    -------
    wv : np.ndarray
        Float array with one row per word, in the mapping's iteration order.
    w2i : dict
        Maps ``str(word)`` to its row index in ``wv``.
    vocab : list of str
        ``str(word)`` for every key, in iteration order.
    """
    # SECURITY: unpickling can execute arbitrary code -- only load files from
    # a trusted source.
    # Use a context manager so the file handle is closed deterministically
    # (the previous ``pickle.load(open(...))`` never closed it).
    with open(path, 'rb') as f_in:
        debiased_embeds = pickle.load(f_in)

    vocab = [str(w) for w in debiased_embeds]
    wv = np.array([np.array(debiased_embeds[w]) for w in debiased_embeds]).astype(float)
    w2i = {w: i for i, w in enumerate(vocab)}
    print(len(vocab), wv.shape, len(w2i))

    return wv, w2i, vocab

def load_embeddings_from_np(filename):
    """Load a vocabulary / embedding-matrix pair saved under a common prefix.

    Reads the UTF-8 file ``filename + '.vocab'`` (one word per line, each line
    stripped of surrounding whitespace) and the NumPy array stored in
    ``filename + '.wv.npy'``.

    Returns
    -------
    vocab : list of str
        Words in file order.
    wv : np.ndarray
        The array loaded from the ``.wv.npy`` file.
    w2i : dict
        Maps each word to its line index in the vocabulary file.

    Note the return order ``(vocab, wv, w2i)``: it differs from
    ``load_w2v`` and ``load_embeddings_p``, which return ``(wv, w2i, vocab)``.
    """
    vocab = []
    with codecs.open(filename + '.vocab', 'r', 'utf-8') as vocab_file:
        for raw_line in vocab_file:
            vocab.append(raw_line.strip())

    w2i = dict(zip(vocab, range(len(vocab))))
    wv = np.load(filename + '.wv.npy')

    print(len(vocab), wv.shape, len(w2i))
    return vocab, wv, w2i

def save_embeds(wv, vocab, filename):
    """Persist an embedding matrix and its vocabulary under a common prefix.

    The matrix is written with ``np.save`` to ``filename + '.wv'`` (NumPy adds
    the ``.npy`` suffix), and the vocabulary is written as UTF-8 text to
    ``filename + '.vocab'`` with one word per line.

    Parameters
    ----------
    wv : array-like
        Embedding matrix to store.
    vocab : iterable of str
        Words, written in iteration order.
    filename : str
        Path prefix shared by both output files.
    """
    # Matrix first, then the vocabulary.
    np.save(filename + '.wv', wv)

    with codecs.open(filename + '.vocab', 'w', 'utf-8') as vocab_file:
        for token in vocab:
            vocab_file.write(token + '\n')

In [3]:
# load Double Hard Debiased Embeddings (GloVe)
# GloVe
# load_embeddings_p returns (wv, w2i, vocab): the float matrix, the word -> row
# index dict and the word list, in that order.
dhd_glove, dhd_glove_w2i, dhd_vocab = load_embeddings_p("../data/Wang/glove_dhd.p")
# save pickle embeds as npy arrays
# (save_embeds writes the matrix with np.save to '<prefix>.wv' and the words,
# one per line, to '<prefix>.vocab')
save_embeds(dhd_glove, dhd_vocab, '../data/Wang/dhd_glove')

322636 (322636, 300) 322636


In [3]:
# load Double Hard Debiased Embeddings (W2V)
# Word2Vec
# load_embeddings_p returns (wv, w2i, vocab) in that order.
# NOTE(review): this rebinds `dhd_vocab`, which the GloVe cell also assigns, so
# the GloVe vocabulary is no longer reachable under that name after this cell
# runs -- confirm later cells expect the W2V vocabulary here.
dhd_wv, dhd_w2i, dhd_vocab = load_embeddings_p("../data/Wang/dhd_wv.p")
# save pickle embeds as npy arrays
# (save_embeds writes the matrix with np.save to '<prefix>.wv' and the words,
# one per line, to '<prefix>.vocab')
save_embeds(dhd_wv, dhd_vocab, '../data/Wang/dhd_w2v')

3000000 (3000000, 300) 3000000


In [9]:
# load gender neutral GloVe Embeddings
# load_embeddings_from_np returns (vocab, wv, w2i) -- unpack in that order so
# gn_glove is the embedding matrix, gn_glove_w2i the word -> index dict and
# gn_vocab the word list.
gn_vocab, gn_glove, gn_glove_w2i = load_embeddings_from_np("../data/Zhao/gn_glove")

322636 (322636, 300) 322636


In [17]:
# load original GloVe Embeddings
# load_embeddings_from_np returns (vocab, wv, w2i) -- unpack in that order so
# glove is the embedding matrix, glove_w2i the word -> index dict and
# glove_vocab the word list.
glove_vocab, glove, glove_w2i = load_embeddings_from_np("../data/Zhao/orig_glove")

322636 (322636, 300) 322636


In [21]:
# load original w2v Embeddings
# load_embeddings_from_np returns (vocab, wv, w2i) -- unpack in that order so
# w2v is the embedding matrix, w2v_w2i the word -> index dict and
# w2v_vocab the word list.
w2v_vocab, w2v, w2v_w2i = load_embeddings_from_np("../data/Bolubaski/orig_w2v")

2999996 (2999996, 300) 2999996


In [22]:
# load Hard Debiased w2v Embeddings
# load_embeddings_from_np returns (vocab, wv, w2i) -- unpack in that order so
# hd_w2v is the embedding matrix, hd_w2v_w2i the word -> index dict and
# hd_vocab the word list.
hd_vocab, hd_w2v, hd_w2v_w2i = load_embeddings_from_np("../data/Bolubaski/hard_debiased_w2v")

2999996 (2999996, 300) 2999996


In [None]:
from sklearn.decomposition import PCA

# get main PCA components
def my_pca(wv):
    """Fit a PCA (default settings) on the mean-centred embedding matrix.

    Parameters
    ----------
    wv : array-like of shape (n_words, dim)
        Embedding matrix, one vector per row.

    Returns
    -------
    PCA
        The fitted ``PCA`` object; its ``components_`` hold the principal
        directions of the centred embeddings.
    """
    wv = np.asarray(wv)
    wv_mean = np.mean(wv, axis=0)

    # Centre every row in a single broadcast subtraction instead of a Python
    # loop over (potentially millions of) rows; the values are identical.
    # copy=False avoids a second full-size copy when the result is already
    # float64.
    wv_hat = (wv - wv_mean).astype(float, copy=False)

    main_pca = PCA()
    main_pca.fit(wv_hat)

    return main_pca

# NOTE(review): `wv` is not assigned at top level in any cell visible above (it
# only appears as a function parameter / local), so this fails on a fresh
# kernel unless it is bound elsewhere -- presumably it should be one of the
# loaded embedding matrices; confirm which one.
# Both names below are read as module-level globals inside hard_debias.
main_pca = my_pca(wv)
# per-dimension mean of the embedding rows
wv_mean = np.mean(np.array(wv), axis=0)

# Bolukbasi projection-based debiasing method
def hard_debias(wv, w2i, w2i_partial, vocab_partial, component_ids):
    """Remove selected principal components, then hard-debias a sub-vocabulary.

    Relies on module-level names that are not passed in: ``main_pca``,
    ``wv_mean``, ``definitional_pairs``, ``doPCA`` and ``drop``.

    Steps
    -----
    1. Take the rows of ``main_pca.components_`` listed in ``component_ids``.
    2. For every word in ``vocab_partial``, subtract from its vector the sum
       of its projections onto those components and then ``wv_mean``
       ("get rid of frequency features").
    3. Take the first component of
       ``doPCA(definitional_pairs, wv_f, w2i_partial)`` as the gender direction.
    4. Pass every cleaned vector through ``drop(u, gender_direction)``
       (defined elsewhere; presumably removes the component along that
       direction -- confirm against its definition).

    Parameters
    ----------
    wv : np.ndarray
        Full embedding matrix, indexed by ``w2i``.
    w2i : dict
        Word -> row index into ``wv``.
    w2i_partial : dict
        Word -> row index into the returned matrix.
    vocab_partial : sequence of str
        Words to process.
    component_ids : iterable of int
        Indices into ``main_pca.components_`` to project out.

    Returns
    -------
    np.ndarray
        Float array with ``len(vocab_partial)`` rows and ``wv.shape[1]``
        columns holding the debiased vectors.
    """
    frequency_axes = [main_pca.components_[idx] for idx in component_ids]
    n_words = len(vocab_partial)

    # get rid of frequency features
    wv_f = np.zeros((n_words, wv.shape[1])).astype(float)
    for word in vocab_partial:
        vec = wv[w2i[word], :]
        projection = np.zeros(vec.shape).astype(float)
        for axis in frequency_axes:
            # (axis . vec) * axis -- projection of vec onto this component
            projection += np.dot(np.dot(np.transpose(axis), vec), axis)
        wv_f[w2i_partial[word], :] = vec - projection - wv_mean

    # debias
    gender_directions = [
        doPCA(word_pairs, wv_f, w2i_partial).components_[0]
        for word_pairs in [definitional_pairs]
    ]

    wv_debiased = np.zeros((n_words, len(wv_f[0, :]))).astype(float)
    for word in vocab_partial:
        row = w2i_partial[word]
        vec = wv_f[row, :]
        for direction in gender_directions:
            vec = drop(vec, direction)
            # stored after each direction; the last write is the final value
            wv_debiased[row, :] = vec

    return wv_debiased