In [1]:
from transformers import BertTokenizer, BertForMaskedLM, AutoModel
from transformers import DistilBertModel, DistilBertConfig
import torch

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [219]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [318]:
from sklearn.decomposition import PCA
from scipy.spatial.distance import cosine
import pandas as pd

In [3]:
# Load the pretrained "bert-base-uncased" tokenizer and two models from the same checkpoint:
#   model     - BERT with the masked-language-modelling head
#   model_emb - the bare encoder; the cells below read its last hidden state
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")
model_emb = AutoModel.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.s

## 1. Computing the attribute subspace

First we find the direction or subspace within the embedding space that best encodes our attribute of interest. For gender, we will find the space that best encodes ***the difference*** between binary genders. 

In [173]:
# Select the vocabulary that "defines" the attribute of interest.

# If a template is used to create semantically bleached input sentences, take care to
# choose words that still produce coherent sentences once inserted into the template.
# For binary gender we define semantic pairs (words that differ only in gender).
# Each pair is ordered [female term, male term]: the template cell below puts index 0
# into the "She is a ..." sentence and index 1 into the "He is a ..." sentence.

def_pairs = [["mother", "father"], 
             ["woman", "man"],
             ["girl", "boy"], 
             ["gal", "guy"], 
             ["lady", "gentleman"], 
             ["aunt", "uncle"],
             ["grandmother", "grandfather"],
             ["grandma", "grandpa"],
             ["daughter", "son"],       
             ["actress", "actor"],
             ["waitress", "waiter"]]

In [174]:
# For each definitional word pair, embed one templated sentence per gender and store
# the unit-length difference between the two unit-length sentence representations.

diffs = []
for female_word, male_word in def_pairs:
    # create the input sentences (female term first, male term second)
    sentence_pair = ["She is a " + female_word + ".", "He is a " + male_word + "."]

    # pass each sentence through BERT to obtain a contextualized sentence representation
    reps = []
    for sentence in sentence_pair:
        # convert the text sentence to BERT input
        encoded = tokenizer(sentence, return_tensors="pt")

        # inference only: no_grad avoids building an autograd graph for every forward pass
        with torch.no_grad():
            outputs = model_emb(**encoded)

        # hidden state of the first token of the only sequence in the batch
        cls_rep = outputs['last_hidden_state'][0][0]
        sentence_repr = cls_rep.cpu().detach().numpy()
        reps.append(sentence_repr / np.linalg.norm(sentence_repr))

    # store the female-minus-male difference vector for this pair, rescaled to unit length
    diff = reps[0] - reps[1]
    diffs.append(diff / np.linalg.norm(diff))

In [175]:
# Stack the per-pair difference vectors into one 2-D array.
# We should have 11 rows (one difference vector per defined word pair), and the length
# of each row is the size of the model output (768 for base BERT).
diffs = np.array(diffs)
diffs.shape

(11, 768)

In [176]:
# Fit a 2-component PCA on the difference vectors and display the fraction of variance
# explained by each component. random_state is pinned so re-running the cell is repeatable.
pca = PCA(n_components=2, random_state = 1)
pca.fit(diffs)
pca.explained_variance_ratio_

array([0.25373462, 0.1690021 ], dtype=float32)

In [177]:
# Define the one-dimensional gender direction to be the first principal component
# (row 0 of pca.components_).
# NOTE(review): the projections further down compute emb - <g, emb> * g, which only
# removes the full component along g if g has unit length — presumably guaranteed by
# PCA's components; confirm.
g = pca.components_[0]

In [179]:
# sanity check: g is a single vector with one entry per model output dimension
g.shape

(768,)

## 2. Applying Hard Debias to gender neutral representations

Any sentence representation obtained through BERT can be hard debiased by projecting it onto the nullspace of the gender direction (i.e. making the representation orthogonal to g so that it is equally similar to both binary genders). Here we show an example application to some gender-neutral words of interest (occupations).

In [180]:
# Define some gender neutral words (taken from A1).
# The list mixes occupations, adjectives and a few other nouns. Each entry is later
# passed to BERT on its own as a complete input, including the two-word entry
# "software engineer".

gn_vocab = ["nurse", 
            "assistant",
            "housekeeper",
            "hairdresser",
            "nanny",
            "director",
            "programmer",
            "software engineer",
            "CEO", 
            "president",
            "lawyer",
            "doctor",
            "teacher",
            "pretty",
            "beautiful",
            "sweet",
            "quiet",
            "tough",
            "mean",
            "family",
            "loyal",
            "hero",
            "hysterical",
            "office",
            "business",
            "kitchen"]

In [332]:
def sent_rep(sentence):
    """Return the unit-length BERT sentence representation of `sentence`.

    The text is tokenized, passed through `model_emb`, and the last-layer hidden
    state of the first token of the (single) input sequence is used as the
    sentence representation.

    Parameters
    ----------
    sentence : str
        Text to embed; a single word or a full sentence.

    Returns
    -------
    numpy.ndarray
        1-D vector (768 entries for base BERT) scaled to Euclidean norm 1.
    """
    s_input = tokenizer(sentence, return_tensors="pt")
    # inference only: no_grad avoids building an autograd graph on every call
    with torch.no_grad():
        output = model_emb(**s_input)
    cls_rep = output['last_hidden_state'][0][0]
    sentence_repr = cls_rep.cpu().detach().numpy()
    return sentence_repr / np.linalg.norm(sentence_repr)

In [190]:
# build a dictionary from each gender-neutral word to its BERT representation
word_reps = {word: sent_rep(word) for word in gn_vocab}

In [191]:
# representations can now be accessed by word lookup; each one is a single 1-D vector
word_reps["CEO"].shape

(768,)

In [346]:
# Check, before debiasing, whether each word is more similar to "she" or to "he",
# following the experimental procedure from the GG paper (use the she-he axis).
# y collects one label per word: 0 if closer to "she", 1 otherwise.
f = sent_rep("she")
m = sent_rep("he")
y = []
for word, emb in word_reps.items():
    sim_f = np.inner(f, emb)
    sim_m = np.inner(m, emb)
    leans_female = sim_f > sim_m
    if leans_female:
        # report the word together with its she-minus-he similarity gap
        print(word, sim_f - sim_m)
    y.append(0 if leans_female else 1)

nurse 0.0038719177
assistant 0.00054454803
housekeeper 0.0026962757
hairdresser 0.006020844
nanny 0.0076822042
teacher 0.0052980185
pretty 0.0059719086
beautiful 0.008090854
sweet 0.008212507
quiet 0.003807187
family 0.0024880767
hysterical 0.0012001395
kitchen 0.0036526918


In [347]:
# Obtain the "debiased" representation of each word by subtracting its component along
# g, keeping the results both as a word -> vector lookup and as a feature matrix X
# (rows follow the iteration order of word_reps).
deb_word_reps = {word: emb - np.inner(g, emb)*g for word, emb in word_reps.items()}
X = np.array(list(deb_word_reps.values()))

In [348]:
# double check that the inner product (similarity) between a debiased gender-neutral
# word and the gender direction g is now ~0 (up to floating-point error)
np.inner(deb_word_reps["teacher"], g)

-2.6499947e-08

In [349]:
# one debiased row per word in gn_vocab, one column per model output dimension
X.shape

(26, 768)

In [350]:
def hard_debias(emb, g):
    """Remove from `emb` its component along the direction `g`.

    Computes ``emb - <g, emb> * g``. The result is orthogonal to `g` only when
    `g` has unit length; no normalisation of `g` is performed here.

    Parameters
    ----------
    emb : numpy.ndarray
        Representation to debias.
    g : numpy.ndarray
        Bias direction, same length as `emb`.

    Returns
    -------
    numpy.ndarray
        The debiased representation.
    """
    along_g = np.inner(g, emb) * g
    return emb - along_g

## 3. Recoverability experiment

To use the gender names data, download "name_gender_dataset.csv" from https://archive.ics.uci.edu/ml/datasets/Gender+by+Name and place it in the working directory.

In [320]:
# load the name/gender table; the path is relative, so the CSV must sit in the working directory
names = pd.read_csv("name_gender_dataset.csv")

In [406]:
# preview the first rows to check the columns (Name, Gender, Count, Probability)
names.head(5)

Unnamed: 0,Name,Gender,Count,Probability
0,James,M,5304407,0.014517
1,John,M,5260831,0.014398
2,Robert,M,4970386,0.013603
3,Michael,M,4579950,0.012534
4,William,M,4226608,0.011567


In [399]:
# Training set for the recoverability probe: hard-debiased representations of the names
# in rows 0..1999, labelled 0 when Gender is "F" and 1 otherwise.
X_train = []
y_train = []
for i in range(2000):
    name, gender = names.loc[i, "Name"], names.loc[i, "Gender"]
    X_train.append(hard_debias(sent_rep(name), g))
    y_train.append(0 if gender == "F" else 1)

In [400]:
# linear probe: logistic regression with the library's default settings
my_classifier = LogisticRegression()

In [401]:
# fit the probe on the debiased name representations (0 = "F", 1 = otherwise)
my_classifier.fit(X_train, y_train)

In [402]:
# Score the probe on the debiased gender-neutral words (X). The reference labels y are
# the she/he similarity labels computed from the representations *before* debiasing.
score = my_classifier.score(X,y) # predict gender labels given debiased representations of the gender-neutral words

In [403]:
# display the probe's score on the gender-neutral words
score

0.8846153846153846

## 4. Iterative Linear Nullspace Projection

Instead of estimating the gender direction from our defined vocabulary and applying PCA, we learn the most informative decision boundary and project to the nullspace iteratively, until information cannot be recovered by a linear classifier.

In [408]:
from sklearn import svm

In [415]:
from scipy.linalg import orth

In [407]:
# using our names data again construct a gender emb dataset, this time using their original (biased) representations
X_train = []
y_train = []
for i in range(2000):
    sent = names.loc[i, "Name"]
    emb = sent_rep(sent)
    X_train.append(emb)
    if names.loc[i, "Gender"] == "F":
        y_train.append(0)
    else:
        y_train.append(1)

In [411]:
# Train a linear SVM to classify the embeddings as either M or F.
# We can use the entire set for training, as the only objective is to find a decision boundary.
# Hold out a test set if you want to observe diminishing accuracy over iterations.

# We want a linear decision boundary, because its weight vector is what gets projected
# out in the following cells.
my_clf = svm.LinearSVC()
my_clf.fit(X_train, y_train)

LinearSVC()

In [427]:
# learned weights of the linear classifier as an array
# (presumably shape (1, 768) for this binary problem — TODO confirm)
W = np.array(my_clf.coef_) #obtain weights from classifier

In [429]:
# basis (as columns) for the space spanned by the classifier weight vector(s)
w_basis = orth(W.T) # orthogonal basis

In [430]:
# B @ B.T for the basis B: projection onto the span of the classifier weights.
# NOTE(review): this projects ONTO the classifier's direction(s); removing that
# information requires the complementary nullspace projection,
# np.eye(P_W.shape[0]) - P_W — confirm which matrix is meant to be passed to INLP.
P_W = w_basis.dot(w_basis.T) #ensure P is a projection

In [431]:
# the projection matrix is square, with one row/column per embedding dimension
P_W.shape #projection matrix

(768, 768)

In [432]:
# debias: w_deb = P @ w  (matrix-vector product)
def INLP(emb, P):
    """Apply the projection matrix `P` to the representation `emb`.

    Parameters
    ----------
    emb : numpy.ndarray
        1-D representation of length d. A 2-D array of shape (d, k) holding one
        representation per column is projected column by column.
    P : numpy.ndarray
        (d, d) projection matrix. To remove the information a linear classifier
        relies on, pass the projection onto the nullspace of its weights
        (identity minus the projection onto the weight directions).

    Returns
    -------
    numpy.ndarray
        The projected representation ``P @ emb``, with the same shape as `emb`.
    """
    # Must be a matrix product: `P * emb` broadcasts element-wise and yields a (d, d)
    # array instead of the projected d-vector.
    return np.dot(P, emb)

Repeat for as many iterations as needed until classification accuracy looks like random guessing.