In [1]:
#Import relevant packages
import numpy as np
import pickle
import json
import scipy
import spacy
import gensim
import danlp
import sys
import pandas as pd
from numpy import dot
from numpy.linalg import norm

In [2]:
def cosine_sim(v1, v2, embedding):
    """
    Cosine similarity between the embedding vectors of two words.

    v1 and v2 are keys (words) that are first looked up in `embedding`.
    The similarity is then computed as
        cos(x, y) = (x . y) / (||x|| * ||y||)
    where np.dot gives the dot product (x . y) and np.linalg.norm gives the
    vector lengths (||x|| and ||y||). Dividing by both lengths normalizes
    for vector length.

    Returns the cosine of the angle between the two vectors.
    """
    vec_a, vec_b = embedding[v1], embedding[v2]
    dot_product = np.dot(vec_a, vec_b)
    length_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    return dot_product / length_product

In [3]:
def weat_association(W, A, B, embedding):
    """
    Association of a single target word W with the attribute sets A and B.

    The cosine similarity (computed by cosine_sim) between W and every
    attribute word in A is collected in one list, and between W and every
    attribute word in B in another list.

    Returns the difference between the mean similarity to A and the mean
    similarity to B, i.e. what corresponds to s(w, A, B) in the equations
    defined in the thesis.
    """
    sims_to_A = [cosine_sim(W, attribute, embedding) for attribute in A]
    sims_to_B = [cosine_sim(W, attribute, embedding) for attribute in B]
    return np.mean(sims_to_A) - np.mean(sims_to_B)

In [4]:
def weat_differential_association(X, Y, A, B, embedding):
    """
    Differential association of the target sets X and Y with the attribute sets A and B.

    For every word in X and every word in Y, weat_association gives that
    word's difference in association with A versus B. These per-word values
    are summed within each target set.

    Returns the sum over X minus the sum over Y, i.e. what corresponds to
    s(X, Y, A, B) in the equations.
    """
    total_X = np.sum([weat_association(word, A, B, embedding) for word in X])
    total_Y = np.sum([weat_association(word, A, B, embedding) for word in Y])
    return total_X - total_Y

In [24]:
def weat_effect_size(X, Y, A, B,embedding):
    """
    Effect size of the difference in association between the target sets X and Y.

    s(w, A, B) is retrieved with weat_association for every word w in X and
    in Y. The effect size is the difference between the mean of these values
    over X and the mean over Y, divided by the sample standard deviation
    (ddof=1) of the values of all words in X and Y taken together.
    """
    scores_X = [weat_association(word, A, B, embedding) for word in X]
    scores_Y = [weat_association(word, A, B, embedding) for word in Y]

    mean_gap = np.mean(scores_X) - np.mean(scores_Y)
    # Standard deviation over the combined target words (X followed by Y)
    pooled_std = np.std(scores_X + scores_Y, ddof=1)

    return mean_gap / pooled_std

In [25]:
#Permutation test p-value - has to be adjusted + code has to run more smoothly
from scipy import stats

def weat_p_value(X, Y, A, B, embedding, p, seed=None):
    """
    Returns the one-sided p-value of the permutation test, with summary statistics.

    The target words of X and Y are pooled and randomly re-partitioned p times
    into two sets with the same sizes as X and Y. For every partition the
    differential association with the attributes A and B is calculated. The
    p-value is the proportion of partitions whose differential association is
    strictly higher than that of the original (non-permuted) sets.

    Parameters
    ----------
    X, Y : sequences of target words
    A, B : sequences of attribute words
    embedding : mapping from word to vector, passed on to weat_differential_association
    p : number of random permutations, must be positive
    seed : optional seed for a private random generator. If None (default) the
        global NumPy random state is used, so np.random.seed() still applies.

    Returns
    -------
    (diff_association, mean, stdev, pvalue) where diff_association is the value
    for the original sets, mean and stdev describe the differential
    associations of the permuted sets, and pvalue lies between 0 and 1.

    Raises
    ------
    ValueError if p is not positive (no permutations means no p-value).
    """
    if p <= 0:
        raise ValueError('p (number of permutations) must be a positive integer')

    # np.random and RandomState both offer shuffle/permutation, so the global
    # state is only bypassed when the caller explicitly asks for a seed.
    rng = np.random if seed is None else np.random.RandomState(seed)

    diff_association = weat_differential_association(X, Y, A, B, embedding)
    target_words = np.concatenate((X, Y), axis=0)
    # Not needed statistically (every iteration permutes again); kept so the
    # sequence of random draws stays the same as in earlier versions.
    rng.shuffle(target_words)

    #Test if target words can be divided into two sets of equal size - otherwise print warning
    if target_words.shape[0] % 2 != 0:
        print('WARNING - target word set can not be divided into two sets of equal size')

    # Split every permutation at len(X) so the permuted sets keep the sizes of
    # the original X and Y. The statistic is a difference of sums, so changing
    # the set sizes would bias it. For equally sized X and Y this is the midpoint.
    size_X = len(X)

    partition_diff_association = [] #Filled with one differential association per permutation
    for _ in range(p): #Iterate p times (number of permutations)
        seq = rng.permutation(target_words) #Permute target words
        partition_X = seq[:size_X] #Permuted stand-in for X
        partition_Y = seq[size_X:] #Permuted stand-in for Y
        partition_diff_association.append(weat_differential_association(partition_X, partition_Y, A, B, embedding))

    partition_diff_association = np.array(partition_diff_association)

    mean = np.mean(partition_diff_association) #Mean differential association for permutations
    stdev = np.std(partition_diff_association) #Standard deviation of differential association for permutations
    # Proportion of permutations with a higher differential association than
    # the original sets. The count can never exceed the number of permutations,
    # so the result is always between 0 and 1, as a p-value should be.
    pvalue = np.sum(partition_diff_association > diff_association) / len(partition_diff_association)
    return diff_association, mean, stdev, pvalue

In [26]:
#First we make one function that returns all relevant values for one embedding and one kind of gender bias

def results_weat(X, Y, A, B, embedding, p):
    """
    Collects all WEAT results for one embedding and one set of target/attribute words.

    Parameters
    ----------
    X, Y : sequences of target words
    A, B : sequences of attribute words
    embedding : mapping from word to vector
    p : number of permutations used for the p-value

    Returns
    -------
    A one-row pandas DataFrame with the columns 'diff_association',
    'effect_size' and 'pvalue'.
    """
    effect_size = weat_effect_size(X, Y, A, B, embedding)
    # weat_p_value returns (diff_association, mean, stdev, pvalue); its first
    # element is the differential association of the original sets, so it is
    # reused here instead of being computed a second time.
    diff_association, _, _, pvalue = weat_p_value(X, Y, A, B, embedding, p)

    #Combine all values in dataframe (one single-element Series per column)
    results = pd.DataFrame({
        'diff_association': pd.Series(diff_association),
        'effect_size': pd.Series(effect_size),
        'pvalue': pd.Series(pvalue),
    })

    #Return dataframe with all results
    return results


In [27]:
# Replication of Caliskan et al.: same algorithm and same word lists
# The p-value can differ between runs because it is estimated with a random permutation test, but the rest of the results should be the same

In [9]:
from gensim.scripts.glove2word2vec import glove2word2vec
# Convert the GloVe vector file to word2vec text format and write it to
# gensim_glove_vectors.txt, which is the file loaded in the next cell.
glove2word2vec(glove_input_file="glove.840B.300d.txt", word2vec_output_file="gensim_glove_vectors.txt")

(2196017, 300)

In [10]:
from gensim.models.keyedvectors import KeyedVectors
# Load the converted GloVe vectors as gensim KeyedVectors.
# binary=False because the converted file is a text (.txt) file.
glove_model = KeyedVectors.load_word2vec_format("gensim_glove_vectors.txt", binary=False)

In [28]:
# Number of permutations used for the p-value
p = 1000

# Attribute words: A holds male terms, B holds female terms
A = [
    'brother', 'father', 'uncle', 'grandfather',
    'son', 'he', 'his', 'him',
]
B = [
    'sister', 'mother', 'aunt', 'grandmother',
    'daughter', 'she', 'hers', 'her',
]

# Target words - Science (X) vs. Arts (Y)
X = [
    'science', 'technology', 'physics', 'chemistry',
    'Einstein', 'NASA', 'experiment', 'astronomy',
]
Y = [
    'poetry', 'art', 'Shakespeare', 'dance',
    'literature', 'novel', 'symphony', 'drama',
]

# WEAT results for the GloVe embedding
results_weat(X, Y, A, B, glove_model, p)

Unnamed: 0,diff_association,effect_size,pvalue
0,0.345604,1.237453,0.004


In [None]:
#Google News replication

In [34]:
# Load Google's pre-trained Word2Vec model
# binary=True because the vectors are stored in a binary (.bin) file.
# KeyedVectors is imported in the cell that loads the GloVe vectors.
word2vec_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)  

INFO:gensim.models.utils_any2vec:loading projection weights from GoogleNews-vectors-negative300.bin
DEBUG:smart_open.smart_open_lib:{'uri': 'GoogleNews-vectors-negative300.bin', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'transport_params': None}
INFO:gensim.models.utils_any2vec:loaded (3000000, 300) matrix from GoogleNews-vectors-negative300.bin


In [35]:
# Number of permutations used for the p-value
p = 1000

# Attribute words: A holds male terms, B holds female terms
A = [
    'brother', 'father', 'uncle', 'grandfather',
    'son', 'he', 'his', 'him',
]
B = [
    'sister', 'mother', 'aunt', 'grandmother',
    'daughter', 'she', 'hers', 'her',
]

# Target words - Science (X) vs. Arts (Y)
X = [
    'science', 'technology', 'physics', 'chemistry',
    'Einstein', 'NASA', 'experiment', 'astronomy',
]
Y = [
    'poetry', 'art', 'Shakespeare', 'dance',
    'literature', 'novel', 'symphony', 'drama',
]

# WEAT results for the Google News word2vec embedding
results_weat(X, Y, A, B, word2vec_model, p)

Unnamed: 0,diff_association,effect_size,pvalue
0,0.357187,1.243855,0.003
