In [1]:
#Import relevant packages
import numpy as np
import pickle
import json
import scipy
import gensim
import danlp
import sys
import pandas as pd
from numpy import dot
from numpy.linalg import norm

In [2]:
#First, all functions needed to run the Word Embedding Association Test (WEAT) are defined

In [3]:
def cosine_sim(v1, v2, embedding):
    """
    Cosine similarity between the embedding vectors of two words.

    v1 and v2 are the keys (words) that are looked up in `embedding`; they are not vectors themselves.
    The similarity is computed as cos(x, y) = (x . y) / (||x|| * ||y||), i.e. the dot product of the
    two vectors divided by the product of their lengths, so differences in vector length are normalized away.

    Returns the cosine of the angle between the two looked-up vectors.
    """
    vec_a, vec_b = embedding[v1], embedding[v2]
    #Product of the two vector lengths (||x|| * ||y||)
    length_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    return np.dot(vec_a, vec_b) / length_product

In [4]:
def weat_association(W, A, B, embedding):
    """
    Association of one target word W with the attribute sets A and B.

    The cosine similarity (see cosine_sim) between W and every attribute word in A is collected in one list,
    and the cosine similarity between W and every attribute word in B in another.
    Returns the mean similarity to A minus the mean similarity to B,
    which corresponds to s(w, A, B) in the equations defined in the thesis.
    """
    #Similarities between W and each attribute word, one list per attribute set
    sims_to_A = [cosine_sim(W, attribute, embedding) for attribute in A]
    sims_to_B = [cosine_sim(W, attribute, embedding) for attribute in B]

    return np.mean(sims_to_A) - np.mean(sims_to_B)

In [5]:
def weat_differential_association(X, Y, A, B, embedding):
    """
    Differential association of the two target sets X and Y with the attribute sets A and B.

    For every word w in X and every word w in Y, s(w, A, B) is computed with weat_association.
    Returns the sum of these values over X minus the sum over Y,
    which corresponds to s(X, Y, A, B) in the equations.
    """
    #Summed s(w, A, B) for each target set
    total_X = np.sum([weat_association(word, A, B, embedding) for word in X])
    total_Y = np.sum([weat_association(word, A, B, embedding) for word in Y])

    return total_X - total_Y

In [6]:
def weat_effect_size(X, Y, A, B,embedding):
    """
    Effect size of the WEAT for target sets X and Y and attribute sets A and B.

    s(w, A, B) is retrieved with weat_association for every word w in X and in Y.
    The effect size is the difference between the mean of these values over X and the mean over Y,
    divided by the sample standard deviation (ddof=1) of the values of all words in X and Y together.
    """
    scores_X = [weat_association(word, A, B, embedding) for word in X]
    scores_Y = [weat_association(word, A, B, embedding) for word in Y]

    #Numerator: difference in mean association between the two target sets
    mean_gap = np.mean(scores_X) - np.mean(scores_Y)
    #Denominator: standard deviation over the pooled scores of X and Y
    pooled_std = np.std(scores_X + scores_Y, ddof=1)

    return mean_gap / pooled_std

In [7]:
#Permutation test p-value - has to be adjusted + code has to run more smoothly
from scipy import stats

def weat_p_value(X, Y, A, B, embedding, p, seed=None):
    """
    Returns the one-sided p-value of the WEAT permutation test.

    The target words of X and Y are pooled and randomly re-split p times. Every random split keeps the sizes
    of the original sets (the first len(X) words form the permuted X, the remaining words the permuted Y), so the
    sum-based statistic s(Xi, Yi, A, B) of each split is comparable with the observed s(X, Y, A, B).
    For target sets of equal size this is the same as splitting the pooled words in half.
    The p-value is the proportion of splits whose differential association is strictly higher than the
    observed one, with 1 added to both numerator and denominator.

    Parameters:
    X, Y      -- target word sets
    A, B      -- attribute word sets
    embedding -- mapping from word to vector (indexed with embedding[word])
    p         -- number of random permutations
    seed      -- optional seed; when given, a dedicated np.random.RandomState(seed) is used so the result
                 can be reproduced. When None (default) the global NumPy random state is used.

    Returns a tuple (diff_association, mean, stdev, pvalue):
    the observed s(X, Y, A, B), the mean and standard deviation of the permuted differential associations,
    and the p-value.
    """
    diff_association = weat_differential_association(X, Y, A, B, embedding)
    target_words = np.concatenate((X, Y), axis=0)
    size_X = len(X) #Every permuted split keeps the original set sizes

    #Test if target words can be divided into two sets of equal size - otherwise print warning
    if target_words.shape[0] % 2 != 0:
        print('WARNING - target word set can not be divided into two sets of equal size')

    #np.random (global state) and RandomState both provide permutation()
    rng = np.random if seed is None else np.random.RandomState(seed)

    partition_diff_association = [] #Create empty list to be filled during loop

    for _ in range(p): #Iterate p times (number of permutations)
        #permutation() returns a freshly shuffled copy, so no separate shuffle is needed beforehand
        seq = rng.permutation(target_words)
        partition_X = seq[:size_X] #Permuted stand-in for X
        partition_Y = seq[size_X:] #Permuted stand-in for Y
        #Calculate and append differential association for permuted target words to attributes
        partition_diff_association.append(weat_differential_association(partition_X, partition_Y, A, B, embedding))

    partition_diff_association = np.array(partition_diff_association) #Convert differential association for all permuted samples to numpy array

    mean = np.mean(partition_diff_association) #Mean differential association for permutations
    stdev = np.std(partition_diff_association) #Standard deviation of differential association for permutations
    #Proportion of permuted differential associations that are higher than the non permuted value (+1 correction)
    pvalue = (np.sum(partition_diff_association > diff_association) + 1) / (len(partition_diff_association) + 1)
    return diff_association, mean, stdev, pvalue

In [8]:
#The next section computes the results for each model individually, as it is too computationally expensive to run all models together

In [9]:
#First we make one function that returns all relevant values for one embedding and one kind of gender bias

def results_weat(X, Y, A, B, embedding, p):
    """
    Returns all relevant WEAT values for one embedding and one kind of gender bias.

    The permutation test (weat_p_value) is run exactly once, so the reported mean, standard deviation
    and p-value all describe the same set of p permutations.

    Parameters:
    X, Y      -- target word sets
    A, B      -- attribute word sets
    embedding -- mapping from word to vector
    p         -- number of permutations used by the permutation test

    Returns a dataframe with one row and the columns diff_association, effect_size,
    mean_diff_association_p, std_diff_association_p and pvalue.
    """
    #Retrieve values from already defined functions
    effect_size = weat_effect_size(X, Y, A, B, embedding)
    #weat_p_value also returns the observed differential association as its first element
    diff_association, mean_diff_association_p, std_diff_association_p, pvalue = weat_p_value(X, Y, A, B, embedding, p)

    #Combine all values in dataframe - one named single-value series per column
    columns = [
        pd.Series(diff_association, name='diff_association'),
        pd.Series(effect_size, name='effect_size'),
        pd.Series(mean_diff_association_p, name='mean_diff_association_p'),
        pd.Series(std_diff_association_p, name='std_diff_association_p'),
        pd.Series(pvalue, name='pvalue'),
    ]
    results = pd.concat(columns, axis=1)

    #Return dataframe with all results
    return results


In [9]:
#Now run the tests for all types of gender bias (science vs. arts, math vs. arts, career vs. family) and for all pre-trained models (each model separately, as the computational load is too high otherwise)

In [10]:
#Load the 'wiki.da.wv' model through danlp and run the WEAT for Science vs. Arts
from danlp.models.embeddings  import load_wv_with_gensim, load_wv_with_spacy #load_wv_with_spacy is not used in this cell

embeddings1 = load_wv_with_gensim('wiki.da.wv') #fastText Facebook wiki

#Set permutations (p is a notebook-level variable; the Math vs. Arts and Career vs. Family cells for embeddings1 reuse it)
p =10000

#Set target and attribute words - Science vs. Arts
A = ['mandlig', 'mand','dreng','bror','han','ham','hans','søn'] #Attribute words for "mand" (man)
B = ['kvindelig', 'kvinde', 'pige', 'søster', 'hun', 'hende', 'hendes', 'datter'] #Attribute words for "kvinde" (woman)
X = ['videnskab', 'teknologi', 'fysik', 'kemi', 'rumfart', 'eksperiment', 'astronomi', 'biologi'] #Target words for Science
Y = ['poesi', 'kunst', 'dans', 'litteratur', 'roman', 'symfoni', 'drama', 'skulptur'] #Target words for Arts


#Last expression of the cell: the one-row results table is shown as the cell output
results_weat(X, Y, A, B, embeddings1, p)

Unnamed: 0,diff_association,effect_size,mean_diff_association_p,std_diff_association_p,pvalue
0,0.168158,0.720477,0.001154,0.116908,0.072693


In [19]:
#Set target and attribute words - Math vs. Arts
#embeddings1 and p are defined in the cell that loads 'wiki.da.wv'
A = ['mandlig', 'mand','dreng','bror','han','ham','hans','søn'] #Attribute words for "mand" (man)
B = ['kvindelig', 'kvinde', 'pige', 'søster', 'hun', 'hende', 'hendes', 'datter'] #Attribute words for "kvinde" (woman)
X = ['matematik', 'algebra', 'geometri', 'regning', 'ligninger', 'beregning', 'tal', 'addition'] #Target words for Math
Y = ['poesi', 'kunst', 'dans', 'litteratur',  'roman' ,'symfoni', 'drama', 'skulptur'] #Target words for Arts

#Last expression of the cell: the one-row results table is shown as the cell output
results_weat(X, Y, A, B, embeddings1, p)

Unnamed: 0,diff_association,effect_size,mean_diff_association_p,std_diff_association_p,pvalue
0,0.411015,1.562719,-0.001012,0.132708,0.0002


In [26]:
#Set target and attribute words - Career vs. Family
#embeddings1 and p are defined in the cell that loads 'wiki.da.wv'
A = ['mandlig', 'mand','dreng','bror','han','ham','hans','søn'] #Attribute words for "mand" (man)
B = ['kvindelig', 'kvinde', 'pige', 'søster', 'hun', 'hende', 'hendes', 'datter'] #Attribute words for "kvinde" (woman)
X = ['leder', 'ledelse', 'professionel', 'virksomhed', 'løn', 'kontor', 'forretning', 'karriere'] #Target words for Career
Y = ['hjem','forældre', 'børn', 'familie','bedsteforældre', 'ægteskab', 'bryllup', 'pårørende'] #Target words for Family

#Last expression of the cell: the one-row results table is shown as the cell output
results_weat(X, Y, A, B, embeddings1, p)

Unnamed: 0,diff_association,effect_size,mean_diff_association_p,std_diff_association_p,pvalue
0,0.502864,1.406383,0.002253,0.180124,0.0019


In [27]:
#Load the 'cc.da.wv' model through danlp and run the WEAT for Science vs. Arts
from danlp.models.embeddings  import load_wv_with_gensim, load_wv_with_spacy #load_wv_with_spacy is not used in this cell

embeddings2 = load_wv_with_gensim('cc.da.wv') #fastText Facebook wiki + CommonCrawl

#Set permutations (p is a notebook-level variable; the Math vs. Arts and Career vs. Family cells for embeddings2 reuse it)
p =10000

#Set target and attribute words - Science vs. Arts
A = ['mandlig', 'mand','dreng','bror','han','ham','hans','søn'] #Attribute words for "mand" (man)
B = ['kvindelig', 'kvinde', 'pige', 'søster', 'hun', 'hende', 'hendes', 'datter'] #Attribute words for "kvinde" (woman)
X = ['videnskab', 'teknologi', 'fysik', 'kemi', 'rumfart', 'eksperiment', 'astronomi', 'biologi'] #Target words for Science
Y = ['poesi', 'kunst', 'dans', 'litteratur', 'roman', 'symfoni', 'drama', 'skulptur'] #Target words for Arts

#Last expression of the cell: the one-row results table is shown as the cell output
results_weat(X, Y, A, B, embeddings2, p)

Unnamed: 0,diff_association,effect_size,mean_diff_association_p,std_diff_association_p,pvalue
0,0.258944,1.083315,0.001102,0.11937,0.012599


In [28]:
#Set target and attribute words - Math vs. Arts
#embeddings2 and p are defined in the cell that loads 'cc.da.wv'
A = ['mandlig', 'mand','dreng','bror','han','ham','hans','søn'] #Attribute words for "mand" (man)
B = ['kvindelig', 'kvinde', 'pige', 'søster', 'hun', 'hende', 'hendes', 'datter'] #Attribute words for "kvinde" (woman)
X = ['matematik', 'algebra', 'geometri', 'regning', 'ligninger', 'beregning', 'tal', 'addition'] #Target words for Math
Y = ['poesi', 'kunst', 'dans', 'litteratur',  'roman' ,'symfoni', 'drama', 'skulptur'] #Target words for Arts

#Last expression of the cell: the one-row results table is shown as the cell output
results_weat(X, Y, A, B, embeddings2, p)

Unnamed: 0,diff_association,effect_size,mean_diff_association_p,std_diff_association_p,pvalue
0,0.350271,1.430992,-0.001092,0.120756,0.0016


In [29]:
#Set target and attribute words - Career vs. Family
#embeddings2 and p are defined in the cell that loads 'cc.da.wv'
A = ['mandlig', 'mand','dreng','bror','han','ham','hans','søn'] #Attribute words for "mand" (man)
B = ['kvindelig', 'kvinde', 'pige', 'søster', 'hun', 'hende', 'hendes', 'datter'] #Attribute words for "kvinde" (woman)
X = ['leder', 'ledelse', 'professionel', 'virksomhed', 'løn', 'kontor', 'forretning', 'karriere'] #Target words for Career
Y = ['hjem','forældre', 'børn', 'familie','bedsteforældre', 'ægteskab', 'bryllup', 'pårørende'] #Target words for Family

#Last expression of the cell: the one-row results table is shown as the cell output
results_weat(X, Y, A, B, embeddings2, p)

Unnamed: 0,diff_association,effect_size,mean_diff_association_p,std_diff_association_p,pvalue
0,0.381471,1.542061,0.000438,0.123613,0.0001


In [30]:
#Load the 'conll17.da.wv' model through danlp and run the WEAT for Science vs. Arts
from danlp.models.embeddings  import load_wv_with_gensim, load_wv_with_spacy #load_wv_with_spacy is not used in this cell

embeddings3 = load_wv_with_gensim('conll17.da.wv') #word2vec Skipgram CoNLL2017

#Set permutations (p is a notebook-level variable; the Math vs. Arts and Career vs. Family cells for embeddings3 reuse it)
p =10000

#Set target and attribute words - Science vs. Arts
A = ['mandlig', 'mand','dreng','bror','han','ham','hans','søn'] #Attribute words for "mand" (man)
B = ['kvindelig', 'kvinde', 'pige', 'søster', 'hun', 'hende', 'hendes', 'datter'] #Attribute words for "kvinde" (woman)
X = ['videnskab', 'teknologi', 'fysik', 'kemi', 'rumfart', 'eksperiment', 'astronomi', 'biologi'] #Target words for Science
Y = ['poesi', 'kunst', 'dans', 'litteratur', 'roman', 'symfoni', 'drama', 'skulptur'] #Target words for Arts

#Last expression of the cell: the one-row results table is shown as the cell output
results_weat(X, Y, A, B, embeddings3, p)

Unnamed: 0,diff_association,effect_size,mean_diff_association_p,std_diff_association_p,pvalue
0,0.356648,1.053621,0.001581,0.168108,0.016798


In [31]:
#Set target and attribute words - Math vs. Arts
#embeddings3 and p are defined in the cell that loads 'conll17.da.wv'
A = ['mandlig', 'mand','dreng','bror','han','ham','hans','søn'] #Attribute words for "mand" (man)
B = ['kvindelig', 'kvinde', 'pige', 'søster', 'hun', 'hende', 'hendes', 'datter'] #Attribute words for "kvinde" (woman)
X = ['matematik', 'algebra', 'geometri', 'regning', 'ligninger', 'beregning', 'tal', 'addition'] #Target words for Math
Y = ['poesi', 'kunst', 'dans', 'litteratur',  'roman' ,'symfoni', 'drama', 'skulptur'] #Target words for Arts

#Last expression of the cell: the one-row results table is shown as the cell output
results_weat(X, Y, A, B, embeddings3, p)

Unnamed: 0,diff_association,effect_size,mean_diff_association_p,std_diff_association_p,pvalue
0,0.239183,0.957526,-0.000551,0.12568,0.025197


In [32]:
#Set target and attribute words - Career vs. Family
#embeddings3 and p are defined in the cell that loads 'conll17.da.wv'
A = ['mandlig', 'mand','dreng','bror','han','ham','hans','søn'] #Attribute words for "mand" (man)
B = ['kvindelig', 'kvinde', 'pige', 'søster', 'hun', 'hende', 'hendes', 'datter'] #Attribute words for "kvinde" (woman)
X = ['leder', 'ledelse', 'professionel', 'virksomhed', 'løn', 'kontor', 'forretning', 'karriere'] #Target words for Career
Y = ['hjem','forældre', 'børn', 'familie','bedsteforældre', 'ægteskab', 'bryllup', 'pårørende'] #Target words for Family

#Last expression of the cell: the one-row results table is shown as the cell output
results_weat(X, Y, A, B, embeddings3, p)

Unnamed: 0,diff_association,effect_size,mean_diff_association_p,std_diff_association_p,pvalue
0,0.647286,1.687687,-0.003095,0.193504,0.0001


In [10]:
#Load the fourth model directly with gensim (not through danlp's load_wv_with_gensim) and run the WEAT for Science vs. Arts
import gensim
#Link to danlp's embeddings module:
#https://github.com/alexandrainst/danlp/blob/master/danlp/models/embeddings.py
embeddings4 = gensim.models.fasttext.load_facebook_vectors('datenten14_5.bin')  #Path is relative to the working directory

#Set permutations (p is a notebook-level variable; the Math vs. Arts and Career vs. Family cells for embeddings4 reuse it)
p =10000

#Set target and attribute words - Science vs. Arts
A = ['mandlig', 'mand','dreng','bror','han','ham','hans','søn'] #Attribute words for "mand" (man)
B = ['kvindelig', 'kvinde', 'pige', 'søster', 'hun', 'hende', 'hendes', 'datter'] #Attribute words for "kvinde" (woman)
X = ['videnskab', 'teknologi', 'fysik', 'kemi', 'rumfart', 'eksperiment', 'astronomi', 'biologi'] #Target words for Science
Y = ['poesi', 'kunst', 'dans', 'litteratur', 'roman', 'symfoni', 'drama', 'skulptur'] #Target words for Arts

#Last expression of the cell: the one-row results table is shown as the cell output
results_weat(X, Y, A, B, embeddings4, p)

Unnamed: 0,diff_association,effect_size,mean_diff_association_p,std_diff_association_p,pvalue
0,0.104216,0.522161,-0.000657,0.099883,0.157584


In [11]:
#Set target and attribute words - Math vs. Arts
#embeddings4 and p are defined in the cell that loads 'datenten14_5.bin'
A = ['mandlig', 'mand','dreng','bror','han','ham','hans','søn'] #Attribute words for "mand" (man)
B = ['kvindelig', 'kvinde', 'pige', 'søster', 'hun', 'hende', 'hendes', 'datter'] #Attribute words for "kvinde" (woman)
X = ['matematik', 'algebra', 'geometri', 'regning', 'ligninger', 'beregning', 'tal', 'addition'] #Target words for Math
Y = ['poesi', 'kunst', 'dans', 'litteratur',  'roman' ,'symfoni', 'drama', 'skulptur'] #Target words for Arts

#Last expression of the cell: the one-row results table is shown as the cell output
results_weat(X, Y, A, B, embeddings4, p)

Unnamed: 0,diff_association,effect_size,mean_diff_association_p,std_diff_association_p,pvalue
0,0.261126,1.256206,0.000962,0.10292,0.003


In [12]:
#Set target and attribute words - Career vs. Family
#embeddings4 and p are defined in the cell that loads 'datenten14_5.bin'
A = ['mandlig', 'mand','dreng','bror','han','ham','hans','søn'] #Attribute words for "mand" (man)
B = ['kvindelig', 'kvinde', 'pige', 'søster', 'hun', 'hende', 'hendes', 'datter'] #Attribute words for "kvinde" (woman)
X = ['leder', 'ledelse', 'professionel', 'virksomhed', 'løn', 'kontor', 'forretning', 'karriere'] #Target words for Career
Y = ['hjem','forældre', 'børn', 'familie','bedsteforældre', 'ægteskab', 'bryllup', 'pårørende'] #Target words for Family

#Last expression of the cell: the one-row results table is shown as the cell output
results_weat(X, Y, A, B, embeddings4, p)

Unnamed: 0,diff_association,effect_size,mean_diff_association_p,std_diff_association_p,pvalue
0,0.515193,1.483032,-7.2e-05,0.174904,0.0004


In [10]:
#Load the 'dslreddit.da.wv' model through danlp and run the WEAT for Science vs. Arts
from danlp.models.embeddings  import load_wv_with_gensim, load_wv_with_spacy #load_wv_with_spacy is not used in this cell

embeddings5 = load_wv_with_gensim('dslreddit.da.wv') #word2vec CBOW DSL Reddit

#Set permutations (p is a notebook-level variable; the Math vs. Arts and Career vs. Family cells for embeddings5 reuse it)
p =10000

#Set target and attribute words - Science vs. Arts
A = ['mandlig', 'mand','dreng','bror','han','ham','hans','søn'] #Attribute words for "mand" (man)
B = ['kvindelig', 'kvinde', 'pige', 'søster', 'hun', 'hende', 'hendes', 'datter'] #Attribute words for "kvinde" (woman)
X = ['videnskab', 'teknologi', 'fysik', 'kemi', 'rumfart', 'eksperiment', 'astronomi', 'biologi'] #Target words for Science
Y = ['poesi', 'kunst', 'dans', 'litteratur', 'roman', 'symfoni', 'drama', 'skulptur'] #Target words for Arts

#Last expression of the cell: the one-row results table is shown as the cell output
results_weat(X, Y, A, B, embeddings5, p)

Unnamed: 0,diff_association,effect_size,mean_diff_association_p,std_diff_association_p,pvalue
0,0.188072,0.978981,-4e-05,0.095957,0.015598


In [11]:
#Set target and attribute words - Math vs. Arts
#embeddings5 and p are defined in the cell that loads 'dslreddit.da.wv'
A = ['mandlig', 'mand','dreng','bror','han','ham','hans','søn'] #Attribute words for "mand" (man)
B = ['kvindelig', 'kvinde', 'pige', 'søster', 'hun', 'hende', 'hendes', 'datter'] #Attribute words for "kvinde" (woman)
X = ['matematik', 'algebra', 'geometri', 'regning', 'ligninger', 'beregning', 'tal', 'addition'] #Target words for Math
Y = ['poesi', 'kunst', 'dans', 'litteratur',  'roman' ,'symfoni', 'drama', 'skulptur'] #Target words for Arts

#Last expression of the cell: the one-row results table is shown as the cell output
results_weat(X, Y, A, B, embeddings5, p)

Unnamed: 0,diff_association,effect_size,mean_diff_association_p,std_diff_association_p,pvalue
0,0.099225,0.660592,-3.7e-05,0.074427,0.09679


In [12]:
#Set target and attribute words - Career vs. Family
#embeddings5 and p are defined in the cell that loads 'dslreddit.da.wv'
A = ['mandlig', 'mand','dreng','bror','han','ham','hans','søn'] #Attribute words for "mand" (man)
B = ['kvindelig', 'kvinde', 'pige', 'søster', 'hun', 'hende', 'hendes', 'datter'] #Attribute words for "kvinde" (woman)
X = ['leder', 'ledelse', 'professionel', 'virksomhed', 'løn', 'kontor', 'forretning', 'karriere'] #Target words for Career
Y = ['hjem','forældre', 'børn', 'familie','bedsteforældre', 'ægteskab', 'bryllup', 'pårørende'] #Target words for Family

#Last expression of the cell: the one-row results table is shown as the cell output
results_weat(X, Y, A, B, embeddings5, p)

Unnamed: 0,diff_association,effect_size,mean_diff_association_p,std_diff_association_p,pvalue
0,0.501554,1.7073,0.001247,0.147298,0.0001


In [13]:
#Load the sixth model from a word2vec-format text file with gensim and run the WEAT for Science vs. Arts
from gensim.models.keyedvectors import KeyedVectors

embeddings6 = KeyedVectors.load_word2vec_format('danish_newspapers_1880To2013.txt', binary=False) #binary=False: the file is read as text; path is relative to the working directory

#Set permutations (p is a notebook-level variable; the Math vs. Arts and Career vs. Family cells for embeddings6 reuse it)
p =10000

#Set target and attribute words - Science vs. Arts
A = ['mandlig', 'mand','dreng','bror','han','ham','hans','søn'] #Attribute words for "mand" (man)
B = ['kvindelig', 'kvinde', 'pige', 'søster', 'hun', 'hende', 'hendes', 'datter'] #Attribute words for "kvinde" (woman)
X = ['videnskab', 'teknologi', 'fysik', 'kemi', 'rumfart', 'eksperiment', 'astronomi', 'biologi'] #Target words for Science
Y = ['poesi', 'kunst', 'dans', 'litteratur', 'roman', 'symfoni', 'drama', 'skulptur'] #Target words for Arts

#Last expression of the cell: the one-row results table is shown as the cell output
results_weat(X, Y, A, B, embeddings6, p)


Unnamed: 0,diff_association,effect_size,mean_diff_association_p,std_diff_association_p,pvalue
0,0.243586,1.161395,-0.000599,0.10499,0.008899


In [14]:
#Set target and attribute words - Math vs. Arts
#embeddings6 and p are defined in the cell that loads 'danish_newspapers_1880To2013.txt'
A = ['mandlig', 'mand','dreng','bror','han','ham','hans','søn'] #Attribute words for "mand" (man)
B = ['kvindelig', 'kvinde', 'pige', 'søster', 'hun', 'hende', 'hendes', 'datter'] #Attribute words for "kvinde" (woman)
X = ['matematik', 'algebra', 'geometri', 'regning', 'ligninger', 'beregning', 'tal', 'addition'] #Target words for Math
Y = ['poesi', 'kunst', 'dans', 'litteratur',  'roman' ,'symfoni', 'drama', 'skulptur'] #Target words for Arts

#Last expression of the cell: the one-row results table is shown as the cell output
results_weat(X, Y, A, B, embeddings6, p)

Unnamed: 0,diff_association,effect_size,mean_diff_association_p,std_diff_association_p,pvalue
0,0.277163,1.344452,0.000462,0.102612,0.002


In [15]:
#Set target and attribute words - Career vs. Family
#embeddings6 and p are defined in the cell that loads 'danish_newspapers_1880To2013.txt'
A = ['mandlig', 'mand','dreng','bror','han','ham','hans','søn'] #Attribute words for "mand" (man)
B = ['kvindelig', 'kvinde', 'pige', 'søster', 'hun', 'hende', 'hendes', 'datter'] #Attribute words for "kvinde" (woman)
X = ['leder', 'ledelse', 'professionel', 'virksomhed', 'løn', 'kontor', 'forretning', 'karriere'] #Target words for Career
Y = ['hjem','forældre', 'børn', 'familie','bedsteforældre', 'ægteskab', 'bryllup', 'pårørende'] #Target words for Family

#Last expression of the cell: the one-row results table is shown as the cell output
results_weat(X, Y, A, B, embeddings6, p)

Unnamed: 0,diff_association,effect_size,mean_diff_association_p,std_diff_association_p,pvalue
0,0.491016,1.601569,-0.001639,0.153657,0.0002


In [None]:
#Overview
#from danlp.models.embeddings  import load_wv_with_gensim, load_wv_with_spacy

#embeddings1 = load_wv_with_gensim('wiki.da.wv') #fastText Facebook wiki
#embeddings2 = load_wv_with_gensim('cc.da.wv') #fastText Facebook wiki + CommonCrawl
#embeddings3 = load_wv_with_gensim('conll17.da.wv') #word2vec Skipgram CoNLL2017
#embeddings4 = load_wv_with_gensim('sketchengine.da.wv') #fastText SketchEngine
#embeddings5 = load_wv_with_gensim('dslreddit.da.wv') #word2vec CBOW DSL Reddit

#embeddingsX = load_wv_with_gensim('news.da.wv') #word2vec Skipgram Kongelige Bibliotek (nyhedsartikler) - will not use, requires too much memory
#Note: there were problems with loading model no. 4 (embeddings4) from the list above