In [1]:
# Reference
# [1] Caliskan, Aylin, Joanna J. Bryson, and Arvind Narayanan. "Semantics derived automatically from language corpora contain human-like biases." Science 356, no. 6334 (2017): 183-186.
# [2] https://psychbruce.github.io/PsychWordVec/reference/test_WEAT.html

In [1]:
import numpy as np
from tqdm import tqdm

In [3]:
# Loading word vector
def load_glove_vector(filename):
    """Load word vectors from a GloVe-style text file.

    Each non-blank line is split on whitespace; the first field is used as
    the token and the remaining fields are parsed as its vector components.

    Args:
        filename: Path of the text file to read (decoded as UTF-8).

    Returns:
        dict mapping each token (str) to a 1-D ``float32`` NumPy array.

    Notes:
        Blank lines are ignored. Lines whose remaining fields cannot be
        parsed as floats (for example when the token itself contains
        whitespace) are skipped; the number of skipped lines is reported
        once through ``warnings.warn`` rather than being dropped silently.
    """
    import warnings  # local import: only needed by this loader

    word2vec = {}
    skipped_lines = 0

    # First pass only counts the lines so the progress bar can show a total.
    # The context manager guarantees this extra handle is closed again.
    with open(filename, 'r', encoding='utf-8') as f:
        num_lines = sum(1 for _ in f)

    with open(filename, 'r', encoding='utf-8') as f:
        for line in tqdm(f, total=num_lines, desc="Loading GloVe"):
            values = line.split()
            if not values:
                # Blank / whitespace-only line: nothing to parse.
                continue
            word = values[0]
            try:
                word2vec[word] = np.asarray(values[1:], dtype='float32')
            except ValueError:
                # Non-numeric component: keep loading, but remember the loss.
                skipped_lines += 1

    if skipped_lines:
        warnings.warn(
            f"Skipped {skipped_lines} malformed line(s) while loading {filename!r}"
        )
    return word2vec

In [4]:
# Load the pre-trained word vectors into a {token: vector} dict.
# The path is relative, so the file must be in the notebook's working directory.
glove_vectors = load_glove_vector('glove.840B.300d.txt')

Loading GloVe: 100%|██████████| 2196017/2196017 [01:35<00:00, 23099.15it/s]


In [7]:
# --- Generic Functions ---
# Compute cosine similarity
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors.

    The value is the dot product of ``vec1`` and ``vec2`` divided by the
    product of their Euclidean norms.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return np.dot(vec1, vec2) / norm_product

In [8]:
# --- WEAT-related Functions ---

# Compute the association of word w with A & B
# refer: s_w(w, A, B) measures the association of w with the attribute
def s_w(w, A, B, embeddings):
    """Association of word ``w`` with attribute set ``A`` relative to ``B``.

    Args:
        w: The word whose association is measured (key of ``embeddings``).
        A: Iterable of attribute words (keys of ``embeddings``).
        B: Iterable of attribute words (keys of ``embeddings``).
        embeddings: Mapping from word to its vector.

    Returns:
        Mean cosine similarity between ``w`` and the words of ``A`` minus
        the mean cosine similarity between ``w`` and the words of ``B``.
    """
    def mean_similarity_to(attribute_words):
        # Average cosine similarity of w against one attribute set.
        sims = [
            cosine_similarity(embeddings[w], embeddings[other])
            for other in attribute_words
        ]
        return np.mean(sims)

    toward_A = mean_similarity_to(A)
    toward_B = mean_similarity_to(B)
    return toward_A - toward_B

# Compute the differential association between two sets of words (X & Y) and A & B
# refer: s_group(X,Y,A,B) measures the differential association of the two sets of target words with the attribute
def s_group(X, Y, A, B, embeddings):
    """WEAT test statistic for target sets ``X``/``Y`` and attributes ``A``/``B``.

    Returns:
        The sum of ``s_w(x, A, B)`` over the words of ``X`` minus the sum of
        ``s_w(y, A, B)`` over the words of ``Y``.
    """
    def summed_association(target_words):
        # Total per-word association of one target set.
        return np.sum([s_w(word, A, B, embeddings) for word in target_words])

    sum_over_X = summed_association(X)
    sum_over_Y = summed_association(Y)
    return sum_over_X - sum_over_Y

# Calculate effect size
def effect_size(X, Y, A, B, embeddings):
    """WEAT effect size for target sets ``X``/``Y`` and attributes ``A``/``B``.

    Returns:
        The difference between the mean per-word association of ``X`` and
        that of ``Y``, divided by the sample standard deviation (``ddof=1``)
        of the per-word associations of all words in ``X`` and ``Y`` together.
    """
    scores_X = [s_w(word, A, B, embeddings) for word in X]
    mean_X = np.mean(scores_X)

    scores_Y = [s_w(word, A, B, embeddings) for word in Y]
    mean_Y = np.mean(scores_Y)

    # Spread is measured over the union of both target sets.
    pooled_sd = np.std(scores_X + scores_Y, ddof=1)
    mean_gap = mean_X - mean_Y
    return mean_gap / pooled_sd

# Calculate one-side p-value
def p_value(X, Y, A, B, embeddings, n_iterations, seed=None):
    """Estimate the one-sided WEAT permutation-test p-value.

    The words of ``X`` and ``Y`` are pooled and randomly re-split into two
    groups of the original sizes ``n_iterations`` times. The p-value is the
    fraction of re-splits whose test statistic (sum of per-word associations
    of the first group minus that of the second) is strictly greater than the
    statistic of the original split.

    Args:
        X: List of target words (first group).
        Y: List of target words (second group).
        A: List of attribute words.
        B: List of attribute words.
        embeddings: Mapping from word to its vector.
        n_iterations: Number of random permutations to draw; must be positive.
        seed: Optional seed for a dedicated random generator, making the
            estimate reproducible. With the default ``None`` NumPy's global
            random state is used.

    Returns:
        float in [0, 1]. Because this is a Monte Carlo estimate it is exactly
        0.0 whenever none of the sampled permutations exceeds the original
        statistic.

    Raises:
        ValueError: If ``n_iterations`` is not positive.
    """
    if n_iterations <= 0:
        raise ValueError("n_iterations must be a positive integer")

    n_x = len(X)

    # s_w(w, A, B) does not depend on which target group w is assigned to, so
    # each word's association is computed once here instead of on every
    # iteration; a permutation then only has to reshuffle these numbers.
    scores = np.array(
        [s_w(word, A, B, embeddings) for word in list(X) + list(Y)],
        dtype=np.float64,
    )
    original_s = scores[:n_x].sum() - scores[n_x:].sum()

    # A seeded private generator for reproducibility, otherwise the global state.
    rng = np.random.default_rng(seed) if seed is not None else np.random

    count_exceeding_original = 0
    for _ in tqdm(range(n_iterations), desc="Calculating P value"):
        shuffled = rng.permutation(scores)
        # First n_x entries play the role of X, the rest the role of Y.
        if shuffled[:n_x].sum() - shuffled[n_x:].sum() > original_s:
            count_exceeding_original += 1

    return count_exceeding_original / n_iterations

In [9]:
# WEAT Experiment
# Two target word sets (X, Y) and two attribute word sets (A, B); the cells
# below pass them to effect_size() and p_value() in this order.

target1= ['woman', 'mother'] # X: first target set
target2= ['man', 'father'] # Y: second target set
attribute1= ['health', 'happy'] # A: first attribute set
attribute2= ['pollute', 'tragedy'] # B: second attribute set

In [10]:
# WEAT
# Compute the WEAT effect size, and a one-sided p-value estimated from an empirical
# permutation distribution of 1,000 random re-splits of the target words
# (this is a sampled permutation distribution, not a normal distribution).
# NOTE: no random seed is set in the cells shown, so the p-value varies slightly between runs.

es = effect_size(target1, target2, attribute1, attribute2, glove_vectors)
p_val = p_value(target1, target2, attribute1, attribute2, glove_vectors, n_iterations=1000)
print(f"Effect Size: {es}")
print(f"P Value: {p_val}")

Calculating P value:   0%|          | 0/1000 [00:00<?, ?it/s]

Calculating P value: 100%|██████████| 1000/1000 [00:00<00:00, 4344.18it/s]

Effect Size: 1.1705615520477295
P Value: 0.156





In [11]:
# WEAT
# Same test, with the empirical permutation distribution generated by 1,000,000 iterations.
# The effect size does not depend on the number of iterations; only the p-value estimate is refined.

es = effect_size(target1, target2, attribute1, attribute2, glove_vectors)
p_val = p_value(target1, target2, attribute1, attribute2, glove_vectors, n_iterations=1000000)
print(f"Effect Size: {es}")
print(f"P Value: {p_val}")

Calculating P value: 100%|██████████| 1000000/1000000 [03:39<00:00, 4565.27it/s]

Effect Size: 1.1705615520477295
P Value: 0.166081





In [12]:
# WEAT
# Here I changed the word sets to test for an obvious bias, to see whether it comes out significant.
# Here the p-value is indeed below 0.05.
# A printed p-value of 0.0 means that none of the 1,000 sampled permutations exceeded the
# observed statistic; it is a Monte Carlo estimate, not an exact zero.

target1= ["woman", "female", "girl", "lady", "daughter", "sister", "she", "her", "wife", "mother", "aunt", "niece", "grandmother", "bride", "madam", "mrs", "miss", "queen", "princess", "damsel"]# X: first target set
target2= ["man", "male", "boy", "gentleman", "son", "brother", "he", "him", "husband", "father", "uncle", "nephew", "grandfather", "groom", "sir", "mr", "king", "prince", "lord", "duke"] # Y: second target set
attribute1= ["home", "family", "children", "parent", "kitchen", "domestic", "nurture", "caring", "house", "marriage", "homemaker", "household", "birth", "rearing", "nanny", "caretaker", "cook", "clean", "maid", "housewife"] # A: first attribute set
attribute2= ["work", "career", "office", "professional", "salary", "job", "business", "corporate", "manager", "executive", "profession", "ceo", "entrepreneur", "employee", "occupation", "labor", "workplace", "boss", "hire", "industry"] # B: second attribute set

es = effect_size(target1, target2, attribute1, attribute2, glove_vectors)
p_val = p_value(target1, target2, attribute1, attribute2, glove_vectors, n_iterations=1000)
print(f"Effect Size: {es}")
print(f"P Value: {p_val}")

Calculating P value: 100%|██████████| 1000/1000 [00:12<00:00, 77.19it/s]

Effect Size: 0.9262961745262146
P Value: 0.0





In [13]:
# --- SC-WEAT-related Functions ---

# Compute the association of word w with attributes A & B
def s_w_sc_weat(w, A, B, embeddings):
    """Standardized association of word ``w`` with attributes ``A`` versus ``B``.

    Args:
        w: The word whose association is measured (key of ``embeddings``).
        A: List of attribute words.
        B: List of attribute words.
        embeddings: Mapping from word to its vector.

    Returns:
        The mean cosine similarity of ``w`` to ``A`` minus its mean cosine
        similarity to ``B``, divided by the sample standard deviation
        (``ddof=1``) of the similarities of ``w`` to all words of ``A`` and ``B``.
    """
    def similarities_to(attribute_words):
        # Cosine similarity of w against every word of one attribute set.
        return [
            cosine_similarity(embeddings[w], embeddings[other])
            for other in attribute_words
        ]

    sims_A = similarities_to(A)
    mean_A = np.mean(sims_A)

    sims_B = similarities_to(B)
    mean_B = np.mean(sims_B)

    # Normalise by the spread over both attribute sets taken together.
    spread = np.std(sims_A + sims_B, ddof=1)
    return (mean_A - mean_B) / spread

# Compute the average effect size for all target words
def effect_size_sc_weat(target, A, B, embeddings):
    """Average the per-word SC-WEAT effect size over all words in ``target``.

    Returns:
        The mean of ``s_w_sc_weat(word, A, B, embeddings)`` over ``target``.
    """
    per_word_effects = [s_w_sc_weat(word, A, B, embeddings) for word in target]
    return np.mean(per_word_effects)

# Analogous to WEAT, calculate p-value by permuting attributes A and B.
# This is a bit different from the approach in the Reference paper. In the reference paper, each word has a property (p_w), and linear regression is used for prediction.
# Here, while we are given a target_list of words, there is no pw provided. Hence, we still take an approach analogous to WEAT, permuting A and B, to determine significance.

def p_value_sc_weat(target, A, B, embeddings, n_iterations, seed=None):
    """Estimate a one-sided permutation p-value for the SC-WEAT effect size.

    The attribute words of ``A`` and ``B`` are pooled and randomly re-split
    into two groups of the original sizes ``n_iterations`` times. The p-value
    is the fraction of re-splits whose mean per-word effect size is strictly
    greater than the effect size of the original split.

    Args:
        target: List of target words.
        A: List of attribute words (first group).
        B: List of attribute words (second group).
        embeddings: Mapping from word to its vector.
        n_iterations: Number of random permutations to draw; must be positive.
        seed: Optional seed for a dedicated random generator, making the
            estimate reproducible. With the default ``None`` NumPy's global
            random state is used.

    Returns:
        float in [0, 1]. Because this is a Monte Carlo estimate it is exactly
        0.0 whenever none of the sampled permutations exceeds the original
        effect size.

    Raises:
        ValueError: If ``n_iterations`` is not positive.
    """
    if n_iterations <= 0:
        raise ValueError("n_iterations must be a positive integer")

    combined_attributes = list(A) + list(B)
    n_attributes = len(combined_attributes)
    n_a = len(A)

    # The cosine similarity between a target word and an attribute word does
    # not depend on which group the attribute is assigned to, so the whole
    # (target word x attribute word) table is computed once up front instead
    # of on every iteration. The reshape keeps the table 2-D for empty inputs.
    sims = np.array(
        [
            [cosine_similarity(embeddings[w], embeddings[a]) for a in combined_attributes]
            for w in target
        ],
        dtype=np.float64,
    ).reshape(len(target), n_attributes)

    # Each word's standard deviation is taken over all attribute words, so it
    # is unaffected by how they are split into two groups.
    per_word_sd = sims.std(axis=1, ddof=1)

    def mean_effect_size(column_order):
        # Mean over target words of (mean sim to group 1 - mean sim to group 2) / sd.
        reordered = sims[:, column_order]
        gaps = reordered[:, :n_a].mean(axis=1) - reordered[:, n_a:].mean(axis=1)
        return np.mean(gaps / per_word_sd)

    original_effect_size = mean_effect_size(np.arange(n_attributes))

    # A seeded private generator for reproducibility, otherwise the global state.
    rng = np.random.default_rng(seed) if seed is not None else np.random

    count_exceeding_original = 0
    for _ in tqdm(range(n_iterations), desc="Computing p-value"):
        # First n_a columns of the permutation play the role of A, the rest of B.
        if mean_effect_size(rng.permutation(n_attributes)) > original_effect_size:
            count_exceeding_original += 1

    return count_exceeding_original / n_iterations

In [14]:
# SC-WEAT Experiment
# "AylinCaliskan" isn't a single word and could not be found in glove_vectors,
# so it is split into two words here: first name and last name.

target = ["Xinyu", "weat", "Aylin", "Caliskan"] 
# First attribute set (A)
attribute1 = ["caress", "freedom", "health", "love", "peace", "cheer", "friend", "heaven", "loyal", "pleasure", "diamond", "gentle", "honest", "lucky", "rainbow", "diploma", "gift", "honor", "miracle", "sunrise", "family", "happy", "laughter", "paradise", "vacation"]
# Second attribute set (B)
attribute2 = ["abuse", "crash", "filth", "murder", "sickness", "accident", "death", "grief", "poison", "stink", "assault", "disaster", "hatred", "pollute", "tragedy", "divorce", "jail", "poverty", "ugly", "cancer", "kill", "rotten", "vomit", "agony", "prison"]

In [16]:
# Compute the SC-WEAT (aka WEFAT) effect size and a permutation p-value for the
# target words, which are looked up case-sensitively (e.g. "Xinyu", "Aylin").
# This cell uses 1,000 permutations as a quick test; the 1,000,000-permutation run follows below.

es_sc_weat = effect_size_sc_weat(target, attribute1, attribute2, glove_vectors)
p_val_sc_weat = p_value_sc_weat(target, attribute1, attribute2, glove_vectors, n_iterations=1000)
print(f"Effect Size: {es_sc_weat}")
print(f"P Value: {p_val_sc_weat}")

Computing p-value:   0%|          | 0/1000 [00:00<?, ?it/s]

Computing p-value: 100%|██████████| 1000/1000 [00:01<00:00, 592.30it/s]

Effect Size: -0.20858444273471832
P Value: 0.803





In [17]:
# Same SC-WEAT test with 1,000,000 permutations.
# The effect size does not depend on the number of permutations; only the p-value estimate is refined.

es_sc_weat = effect_size_sc_weat(target, attribute1, attribute2, glove_vectors)
p_val_sc_weat = p_value_sc_weat(target, attribute1, attribute2, glove_vectors, n_iterations=1000000)
print(f"Effect Size: {es_sc_weat}")
print(f"P Value: {p_val_sc_weat}")

Computing p-value: 100%|██████████| 1000000/1000000 [27:56<00:00, 596.51it/s]

Effect Size: -0.20858444273471832
P Value: 0.815381





In [18]:
# SC-WEAT
# Here I changed the word sets to test for an obvious bias, to see whether it comes out significant.
# Indeed, the p-value here is below 0.05.
# A printed p-value of 0.0 means that none of the 1,000 sampled permutations exceeded the
# observed effect size; it is a Monte Carlo estimate, not an exact zero.

target= ["woman", "female", "girl", "lady", "daughter", "sister", "she", "her", "wife", "mother", "aunt", "niece", "grandmother", "bride", "madam", "mrs", "miss", "queen", "princess", "damsel"]# target words
attribute1= ["home", "family", "children", "parent", "kitchen", "domestic", "nurture", "caring", "house", "marriage", "homemaker", "household", "birth", "rearing", "nanny", "caretaker", "cook", "clean", "maid", "housewife"] # A: first attribute set
attribute2= ["work", "career", "office", "professional", "salary", "job", "business", "corporate", "manager", "executive", "profession", "ceo", "entrepreneur", "employee", "occupation", "labor", "workplace", "boss", "hire", "industry"] # B: second attribute set

es_sc_weat = effect_size_sc_weat(target, attribute1, attribute2, glove_vectors)
p_val_sc_weat = p_value_sc_weat(target, attribute1, attribute2, glove_vectors, n_iterations=1000)
print(f"Effect Size: {es_sc_weat}")
print(f"P Value: {p_val_sc_weat}")

Computing p-value: 100%|██████████| 1000/1000 [00:06<00:00, 145.85it/s]

Effect Size: 1.0810129642486572
P Value: 0.0



