In [2]:
#Import  packages
import numpy as np
import gensim
import danlp
import pandas as pd
from numpy import dot
from numpy.linalg import norm

In [None]:
# First, all functions needed for the Word Embedding Association Test (WEAT) are defined.

In [3]:
def cosine_sim(v1, v2, embedding):
    """
    Cosine similarity between the embedding vectors of two words.

    v1, v2    : keys (words) that are looked up as embedding[v1] and embedding[v2]
    embedding : any object supporting embedding[word] lookup

    The value is cos(x, y) = (x . y) / (|x| * |y|), where
    np.dot gives the dot product (x . y) and
    np.linalg.norm gives the vector lengths (|x| and |y|).
    """
    vec_a, vec_b = embedding[v1], embedding[v2]
    length_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    return np.dot(vec_a, vec_b) / length_product

In [5]:
def weat_association(W, A, B, embedding):
    """
    Association of one target word W with the attribute sets A and B.

    The cosine similarity (see cosine_sim) between W and every word in A is collected in one list,
    and the cosine similarity between W and every word in B in another.
    The return value is the mean of the A list minus the mean of the B list,
    which corresponds to s(w, A, B) in the equations defined in the thesis.
    """
    sims_to_A = [cosine_sim(W, attribute, embedding) for attribute in A]  # W against each word in A
    sims_to_B = [cosine_sim(W, attribute, embedding) for attribute in B]  # W against each word in B
    return np.mean(sims_to_A) - np.mean(sims_to_B)

In [4]:
def weat_differential_association(X, Y, A, B, embedding):
    """
    Differential association of the two target sets X and Y with the attribute sets A and B.

    weat_association is evaluated for every word in X and for every word in Y;
    the result is the sum over X minus the sum over Y.
    Returns what corresponds to s(X, Y, A, B) in the equations.
    """
    total_X = np.sum([weat_association(word, A, B, embedding) for word in X])
    total_Y = np.sum([weat_association(word, A, B, embedding) for word in Y])
    return total_X - total_Y

In [6]:
def weat_effect_size(X, Y, A, B, embedding):
    """
    WEAT effect size as described in the thesis.

    Numerator  : mean association (weat_association) over X minus mean association over Y.
    Denominator: sample standard deviation (ddof=1) of the associations of all words in X and Y together.
    """
    scores_X = [weat_association(word, A, B, embedding) for word in X]
    scores_Y = [weat_association(word, A, B, embedding) for word in Y]

    mean_gap = np.mean(scores_X) - np.mean(scores_Y)
    pooled_std = np.std(scores_X + scores_Y, ddof=1)  # list concatenation: X scores followed by Y scores

    return mean_gap / pooled_std

In [7]:
#Permutation test p-value (original author's note: the calculation may still need adjusting)
def weat_p_value(X, Y, A, B, embedding, p, seed=None):
    """
    Returns the one-sided p-value of the permutation test.

    The target words of X and Y are pooled, and p random re-partitions of the pool are drawn.
    Each re-partition has the same sizes as the original sets (len(X) and len(Y)).
    For every re-partition the differential association with the attributes A and B is computed.
    The p-value is the proportion of re-partitions whose differential association is strictly
    higher than that of the non-permuted sets, with 1 added to both numerator and denominator.

    Parameters
    X, Y      : sequences of target words
    A, B      : sequences of attribute words
    embedding : object supporting embedding[word] lookup
    p         : number of permutations
    seed      : optional seed for reproducible results. None (default) draws from NumPy's
                global random state; any other value is passed to np.random.RandomState,
                leaving the global state untouched.

    Returns the tuple (diff_association, mean, stdev, pvalue):
    diff_association : differential association of the non-permuted sets
    mean, stdev      : mean and standard deviation of the permuted differential associations
    pvalue           : permutation test p-value
    """
    # The np.random module and a RandomState instance both provide shuffle() and permutation()
    rng = np.random if seed is None else np.random.RandomState(seed)

    diff_association = weat_differential_association(X, Y, A, B, embedding)
    target_words = np.concatenate((X, Y), axis=0)
    # Statistically redundant with the per-iteration permutation below, but kept so that the
    # default (seed=None) path makes the same sequence of random calls as before
    rng.shuffle(target_words)

    #Test if target words can be divided into two sets of equal size - otherwise print warning
    if target_words.shape[0] % 2 != 0:
        print('WARNING - target word set can not be divided into two sets of equal size')

    # Split every permutation at len(X) so the permuted sets keep the original sizes.
    # For equally sized X and Y this is the same split point as the midpoint.
    size_X = len(X)

    partition_diff_association = [] #Filled with one differential association per permutation
    for _ in range(p): #Iterate p times (number of permutations)
        seq = rng.permutation(target_words) #Permute target words
        partition_X = seq[:size_X] #Permuted stand-in for X
        partition_Y = seq[size_X:] #Permuted stand-in for Y
        partition_diff_association.append(
            weat_differential_association(partition_X, partition_Y, A, B, embedding)
        )

    partition_diff_association = np.array(partition_diff_association)

    mean = np.mean(partition_diff_association) #Mean differential association for permutations
    stdev = np.std(partition_diff_association) #Standard deviation of differential association for permutations
    #Proportion of permuted differential associations that are higher than the non-permuted value (+1 in numerator and denominator)
    pvalue = (np.sum(partition_diff_association > diff_association) + 1) / (len(partition_diff_association) + 1)

    return diff_association, mean, stdev, pvalue

In [8]:
# The next section computes the results for each model individually, as running all models together is too computationally expensive.

In [9]:
# First, we define one function that returns all relevant values for one embedding and one kind of gender bias.

def results_weat(X, Y, A, B, embedding, p):
    """
    Collects the WEAT results for one embedding and one pair of target sets in a dataframe.

    X, Y      : target word sets
    A, B      : attribute word sets
    embedding : object supporting embedding[word] lookup
    p         : number of permutations passed on to weat_p_value

    Returns a pandas DataFrame with the columns 'diff_association', 'effect_size' and 'pvalue'.
    """
    #Retrieve values from the already defined functions, paired with their column names
    labelled_values = [
        ('diff_association', weat_differential_association(X, Y, A, B, embedding)),
        ('effect_size', weat_effect_size(X, Y, A, B, embedding)),
        ('pvalue', weat_p_value(X, Y, A, B, embedding, p)[3]),  # element 3 of the returned tuple is the p-value
    ]

    #Turn each value into a named series and place the series side by side
    named_series = [pd.Series(value, name=label) for label, value in labelled_values]
    return pd.concat(named_series, axis=1)


In [None]:
# Now run the tests for all types of gender bias (science vs. arts, math vs. arts, career vs. family) and all pre-trained models (separately, as the computational load is too high otherwise).

In [None]:
from danlp.models.embeddings import load_wv_with_gensim

# fastText Facebook wiki embeddings
embeddings1 = load_wv_with_gensim('wiki.da.wv')

# Number of permutations for the permutation test
p = 10000

# Science vs. Arts
# Attribute words: A = male terms ("mand"), B = female terms ("kvinde")
A = [
    'mandlig', 'mand', 'dreng', 'bror',
    'han', 'ham', 'hans', 'søn',
]
B = [
    'kvindelig', 'kvinde', 'pige', 'søster',
    'hun', 'hende', 'hendes', 'datter',
]
# Target words: X = Science, Y = Arts
X = [
    'videnskab', 'teknologi', 'fysik', 'kemi',
    'rumfart', 'eksperiment', 'astronomi', 'biologi',
]
Y = [
    'poesi', 'kunst', 'dans', 'litteratur',
    'roman', 'symfoni', 'drama', 'skulptur',
]

results_weat(X=X, Y=Y, A=A, B=B, embedding=embeddings1, p=p)

In [None]:
# Math vs. Arts
# Attribute words: A = male terms ("mand"), B = female terms ("kvinde")
A = [
    'mandlig', 'mand', 'dreng', 'bror',
    'han', 'ham', 'hans', 'søn',
]
B = [
    'kvindelig', 'kvinde', 'pige', 'søster',
    'hun', 'hende', 'hendes', 'datter',
]
# Target words: X = Math, Y = Arts
X = [
    'matematik', 'algebra', 'geometri', 'regning',
    'ligninger', 'beregning', 'tal', 'addition',
]
Y = [
    'poesi', 'kunst', 'dans', 'litteratur',
    'roman', 'symfoni', 'drama', 'skulptur',
]

results_weat(X=X, Y=Y, A=A, B=B, embedding=embeddings1, p=p)

In [None]:
# Career vs. Family
# Attribute words: A = male terms ("mand"), B = female terms ("kvinde")
A = [
    'mandlig', 'mand', 'dreng', 'bror',
    'han', 'ham', 'hans', 'søn',
]
B = [
    'kvindelig', 'kvinde', 'pige', 'søster',
    'hun', 'hende', 'hendes', 'datter',
]
# Target words: X = Career, Y = Family
X = [
    'leder', 'ledelse', 'professionel', 'virksomhed',
    'løn', 'kontor', 'forretning', 'karriere',
]
Y = [
    'hjem', 'forældre', 'børn', 'familie',
    'bedsteforældre', 'ægteskab', 'bryllup', 'pårørende',
]

results_weat(X=X, Y=Y, A=A, B=B, embedding=embeddings1, p=p)

In [None]:
from danlp.models.embeddings import load_wv_with_gensim

# fastText Facebook wiki + CommonCrawl embeddings
embeddings2 = load_wv_with_gensim('cc.da.wv')

# Number of permutations for the permutation test
p = 10000

# Science vs. Arts
# Attribute words: A = male terms ("mand"), B = female terms ("kvinde")
A = [
    'mandlig', 'mand', 'dreng', 'bror',
    'han', 'ham', 'hans', 'søn',
]
B = [
    'kvindelig', 'kvinde', 'pige', 'søster',
    'hun', 'hende', 'hendes', 'datter',
]
# Target words: X = Science, Y = Arts
X = [
    'videnskab', 'teknologi', 'fysik', 'kemi',
    'rumfart', 'eksperiment', 'astronomi', 'biologi',
]
Y = [
    'poesi', 'kunst', 'dans', 'litteratur',
    'roman', 'symfoni', 'drama', 'skulptur',
]

results_weat(X=X, Y=Y, A=A, B=B, embedding=embeddings2, p=p)

In [None]:
# Math vs. Arts
# Attribute words: A = male terms ("mand"), B = female terms ("kvinde")
A = [
    'mandlig', 'mand', 'dreng', 'bror',
    'han', 'ham', 'hans', 'søn',
]
B = [
    'kvindelig', 'kvinde', 'pige', 'søster',
    'hun', 'hende', 'hendes', 'datter',
]
# Target words: X = Math, Y = Arts
X = [
    'matematik', 'algebra', 'geometri', 'regning',
    'ligninger', 'beregning', 'tal', 'addition',
]
Y = [
    'poesi', 'kunst', 'dans', 'litteratur',
    'roman', 'symfoni', 'drama', 'skulptur',
]

results_weat(X=X, Y=Y, A=A, B=B, embedding=embeddings2, p=p)

In [None]:
# Career vs. Family
# Attribute words: A = male terms ("mand"), B = female terms ("kvinde")
A = [
    'mandlig', 'mand', 'dreng', 'bror',
    'han', 'ham', 'hans', 'søn',
]
B = [
    'kvindelig', 'kvinde', 'pige', 'søster',
    'hun', 'hende', 'hendes', 'datter',
]
# Target words: X = Career, Y = Family
X = [
    'leder', 'ledelse', 'professionel', 'virksomhed',
    'løn', 'kontor', 'forretning', 'karriere',
]
Y = [
    'hjem', 'forældre', 'børn', 'familie',
    'bedsteforældre', 'ægteskab', 'bryllup', 'pårørende',
]

results_weat(X=X, Y=Y, A=A, B=B, embedding=embeddings2, p=p)

In [None]:
from danlp.models.embeddings import load_wv_with_gensim

# word2vec Skipgram CoNLL2017 embeddings
embeddings3 = load_wv_with_gensim('conll17.da.wv')

# Number of permutations for the permutation test
p = 10000

# Science vs. Arts
# Attribute words: A = male terms ("mand"), B = female terms ("kvinde")
A = [
    'mandlig', 'mand', 'dreng', 'bror',
    'han', 'ham', 'hans', 'søn',
]
B = [
    'kvindelig', 'kvinde', 'pige', 'søster',
    'hun', 'hende', 'hendes', 'datter',
]
# Target words: X = Science, Y = Arts
X = [
    'videnskab', 'teknologi', 'fysik', 'kemi',
    'rumfart', 'eksperiment', 'astronomi', 'biologi',
]
Y = [
    'poesi', 'kunst', 'dans', 'litteratur',
    'roman', 'symfoni', 'drama', 'skulptur',
]

results_weat(X=X, Y=Y, A=A, B=B, embedding=embeddings3, p=p)

In [None]:
# Math vs. Arts
# Attribute words: A = male terms ("mand"), B = female terms ("kvinde")
A = [
    'mandlig', 'mand', 'dreng', 'bror',
    'han', 'ham', 'hans', 'søn',
]
B = [
    'kvindelig', 'kvinde', 'pige', 'søster',
    'hun', 'hende', 'hendes', 'datter',
]
# Target words: X = Math, Y = Arts
X = [
    'matematik', 'algebra', 'geometri', 'regning',
    'ligninger', 'beregning', 'tal', 'addition',
]
Y = [
    'poesi', 'kunst', 'dans', 'litteratur',
    'roman', 'symfoni', 'drama', 'skulptur',
]

results_weat(X=X, Y=Y, A=A, B=B, embedding=embeddings3, p=p)

In [None]:
# Career vs. Family
# Attribute words: A = male terms ("mand"), B = female terms ("kvinde")
A = [
    'mandlig', 'mand', 'dreng', 'bror',
    'han', 'ham', 'hans', 'søn',
]
B = [
    'kvindelig', 'kvinde', 'pige', 'søster',
    'hun', 'hende', 'hendes', 'datter',
]
# Target words: X = Career, Y = Family
X = [
    'leder', 'ledelse', 'professionel', 'virksomhed',
    'løn', 'kontor', 'forretning', 'karriere',
]
Y = [
    'hjem', 'forældre', 'børn', 'familie',
    'bedsteforældre', 'ægteskab', 'bryllup', 'pårørende',
]

results_weat(X=X, Y=Y, A=A, B=B, embedding=embeddings3, p=p)

In [None]:
import gensim

# Vectors read from the local fastText binary 'datenten14_5.bin'
embeddings4 = gensim.models.fasttext.load_facebook_vectors('datenten14_5.bin')

# Number of permutations for the permutation test
p = 10000

# Science vs. Arts
# Attribute words: A = male terms ("mand"), B = female terms ("kvinde")
A = [
    'mandlig', 'mand', 'dreng', 'bror',
    'han', 'ham', 'hans', 'søn',
]
B = [
    'kvindelig', 'kvinde', 'pige', 'søster',
    'hun', 'hende', 'hendes', 'datter',
]
# Target words: X = Science, Y = Arts
X = [
    'videnskab', 'teknologi', 'fysik', 'kemi',
    'rumfart', 'eksperiment', 'astronomi', 'biologi',
]
Y = [
    'poesi', 'kunst', 'dans', 'litteratur',
    'roman', 'symfoni', 'drama', 'skulptur',
]

results_weat(X=X, Y=Y, A=A, B=B, embedding=embeddings4, p=p)

In [None]:
# Math vs. Arts
# Attribute words: A = male terms ("mand"), B = female terms ("kvinde")
A = [
    'mandlig', 'mand', 'dreng', 'bror',
    'han', 'ham', 'hans', 'søn',
]
B = [
    'kvindelig', 'kvinde', 'pige', 'søster',
    'hun', 'hende', 'hendes', 'datter',
]
# Target words: X = Math, Y = Arts
X = [
    'matematik', 'algebra', 'geometri', 'regning',
    'ligninger', 'beregning', 'tal', 'addition',
]
Y = [
    'poesi', 'kunst', 'dans', 'litteratur',
    'roman', 'symfoni', 'drama', 'skulptur',
]

results_weat(X=X, Y=Y, A=A, B=B, embedding=embeddings4, p=p)

In [None]:
# Career vs. Family
# Attribute words: A = male terms ("mand"), B = female terms ("kvinde")
A = [
    'mandlig', 'mand', 'dreng', 'bror',
    'han', 'ham', 'hans', 'søn',
]
B = [
    'kvindelig', 'kvinde', 'pige', 'søster',
    'hun', 'hende', 'hendes', 'datter',
]
# Target words: X = Career, Y = Family
X = [
    'leder', 'ledelse', 'professionel', 'virksomhed',
    'løn', 'kontor', 'forretning', 'karriere',
]
Y = [
    'hjem', 'forældre', 'børn', 'familie',
    'bedsteforældre', 'ægteskab', 'bryllup', 'pårørende',
]

results_weat(X=X, Y=Y, A=A, B=B, embedding=embeddings4, p=p)

In [None]:
from danlp.models.embeddings import load_wv_with_gensim, load_wv_with_spacy

# word2vec CBOW DSL Reddit embeddings
embeddings5 = load_wv_with_gensim('dslreddit.da.wv')

# Number of permutations for the permutation test
p = 10000

# Science vs. Arts
# Attribute words: A = male terms ("mand"), B = female terms ("kvinde")
A = [
    'mandlig', 'mand', 'dreng', 'bror',
    'han', 'ham', 'hans', 'søn',
]
B = [
    'kvindelig', 'kvinde', 'pige', 'søster',
    'hun', 'hende', 'hendes', 'datter',
]
# Target words: X = Science, Y = Arts
X = [
    'videnskab', 'teknologi', 'fysik', 'kemi',
    'rumfart', 'eksperiment', 'astronomi', 'biologi',
]
Y = [
    'poesi', 'kunst', 'dans', 'litteratur',
    'roman', 'symfoni', 'drama', 'skulptur',
]

results_weat(X=X, Y=Y, A=A, B=B, embedding=embeddings5, p=p)

In [None]:
# Math vs. Arts
# Attribute words: A = male terms ("mand"), B = female terms ("kvinde")
A = [
    'mandlig', 'mand', 'dreng', 'bror',
    'han', 'ham', 'hans', 'søn',
]
B = [
    'kvindelig', 'kvinde', 'pige', 'søster',
    'hun', 'hende', 'hendes', 'datter',
]
# Target words: X = Math, Y = Arts
X = [
    'matematik', 'algebra', 'geometri', 'regning',
    'ligninger', 'beregning', 'tal', 'addition',
]
Y = [
    'poesi', 'kunst', 'dans', 'litteratur',
    'roman', 'symfoni', 'drama', 'skulptur',
]

results_weat(X=X, Y=Y, A=A, B=B, embedding=embeddings5, p=p)

In [None]:
# Career vs. Family
# Attribute words: A = male terms ("mand"), B = female terms ("kvinde")
A = [
    'mandlig', 'mand', 'dreng', 'bror',
    'han', 'ham', 'hans', 'søn',
]
B = [
    'kvindelig', 'kvinde', 'pige', 'søster',
    'hun', 'hende', 'hendes', 'datter',
]
# Target words: X = Career, Y = Family
X = [
    'leder', 'ledelse', 'professionel', 'virksomhed',
    'løn', 'kontor', 'forretning', 'karriere',
]
Y = [
    'hjem', 'forældre', 'børn', 'familie',
    'bedsteforældre', 'ægteskab', 'bryllup', 'pårørende',
]

results_weat(X=X, Y=Y, A=A, B=B, embedding=embeddings5, p=p)

In [20]:
from gensim.models.keyedvectors import KeyedVectors

# NOTE(review): the load of `embeddings6` below is commented out, yet `embeddings6` is still
# referenced by later cells in this notebook - confirm which model those cells should use.
#embeddings6 = KeyedVectors.load_word2vec_format('danish_newspapers_1880To2013.txt', binary=False) 

# Load vectors stored in word2vec text format into `model_test`.
# NOTE(review): the path is absolute and machine-specific. The output recorded for this cell is
# "ValueError: cannot copy sequence with size 0 to array axis with dimension 300" - presumably
# raised while parsing this file; verify the file's format.
model_test = KeyedVectors.load_word2vec_format('/work/dagw_wordembeddings/fasttext_model/fasttext_vectors.txt')
# NOTE(review): printing the working directory looks like leftover debugging - confirm it is still needed.
import os
print(os.getcwd())
#Set the number of permutations
p =10000

#Set target and attribute words - Science vs. Arts
A = ['mandlig', 'mand','dreng','bror','han','ham','hans','søn'] #Attribute words for male terms ("mand")
B = ['kvindelig', 'kvinde', 'pige', 'søster', 'hun', 'hende', 'hendes', 'datter'] #Attribute words for female terms ("kvinde")
X = ['videnskab', 'teknologi', 'fysik', 'kemi', 'rumfart', 'eksperiment', 'astronomi', 'biologi'] #Target words for Science
Y = ['poesi', 'kunst', 'dans', 'litteratur', 'roman', 'symfoni', 'drama', 'skulptur'] #Target words for Arts

# The WEAT call is disabled, so this cell only loads the model and sets p, A, B, X and Y.
#results_weat(X, Y, A, B, model_test, p)


ValueError: cannot copy sequence with size 0 to array axis with dimension 300

In [None]:
#Set target and attribute words - Math vs. Arts
A = ['mandlig', 'mand','dreng','bror','han','ham','hans','søn'] #Attribute words for male terms ("mand")
B = ['kvindelig', 'kvinde', 'pige', 'søster', 'hun', 'hende', 'hendes', 'datter'] #Attribute words for female terms ("kvinde")
X = ['matematik', 'algebra', 'geometri', 'regning', 'ligninger', 'beregning', 'tal', 'addition'] #Target words for Math
Y = ['poesi', 'kunst', 'dans', 'litteratur',  'roman' ,'symfoni', 'drama', 'skulptur'] #Target words for Arts

# `embeddings6` is never assigned in this notebook (its load is commented out), so passing it
# raised a NameError on a fresh kernel. Use `model_test`, the model the preceding cell loads.
# NOTE(review): confirm that `model_test` is the embedding intended for this test.
results_weat(X, Y, A, B, model_test, p)

In [None]:
#Set target and attribute words - Career vs. Family
A = ['mandlig', 'mand','dreng','bror','han','ham','hans','søn'] #Attribute words for male terms ("mand")
B = ['kvindelig', 'kvinde', 'pige', 'søster', 'hun', 'hende', 'hendes', 'datter'] #Attribute words for female terms ("kvinde")
X = ['leder', 'ledelse', 'professionel', 'virksomhed', 'løn', 'kontor', 'forretning', 'karriere'] #Target words for Career
Y = ['hjem','forældre', 'børn', 'familie','bedsteforældre', 'ægteskab', 'bryllup', 'pårørende'] #Target words for Family

# `embeddings6` is never assigned in this notebook (its load is commented out), so passing it
# raised a NameError on a fresh kernel. Use `model_test`, the model loaded earlier in the notebook.
# NOTE(review): confirm that `model_test` is the embedding intended for this test.
results_weat(X, Y, A, B, model_test, p)