In [1]:
import pickle
from collections import Counter, defaultdict
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
import pickle

def load_dataset(path):
    """Deserialize and return the pickled object stored in the file at ``path``."""
    # pickle can execute arbitrary code while loading: only use this on trusted files.
    with open(path, "rb") as handle:
        return pickle.load(handle)

def load_dictionary(path):
    """Read a tab-separated ``key<TAB>integer`` file into two lookup tables.

    :param path: path of a UTF-8 text file holding one ``key<TAB>value`` pair per line
    :return: tuple ``(k2v, v2k)``; ``k2v`` maps each key to its integer value and
             ``v2k`` maps each integer value back to its key
    :raises ValueError: if a non-blank line does not consist of exactly two
                        tab-separated fields or the value is not an integer
    """
    k2v, v2k = {}, {}
    with open(path, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Blank lines (e.g. an extra newline at the end of the file) hold no entry.
                continue
            k, v = line.split("\t")
            v = int(v)
            k2v[k] = v
            v2k[v] = k

    return k2v, v2k
    
def count_profs_and_gender(data):
    """Count how often each gender label occurs within each profession.

    :param data: iterable of entries indexable by the keys "g" (gender) and "p" (profession)
    :return: defaultdict mapping profession -> Counter mapping gender -> number of entries
    """
    counter = defaultdict(Counter)
    for entry in data:
        counter[entry["p"]][entry["g"]] += 1

    return counter

def count_profs_and_race(data):
    """Count how often each "economy" label occurs within each profession.

    Despite the function name, the attribute that is read from every entry is
    the "economy" field.

    :param data: iterable of entries indexable by the keys "economy" and "p" (profession)
    :return: defaultdict mapping profession -> Counter mapping economy label -> number of entries
    """
    counter = defaultdict(Counter)
    for entry in data:
        economy, prof = entry["economy"], entry["p"]
        counter[prof][economy] += 1

    return counter


In [3]:

def load_word_vectors(fname):
    """Load text-format word2vec vectors and return the model, its matrix and its words.

    NOTE(review): ``KeyedVectors`` is not imported anywhere in the visible
    notebook; presumably it is ``gensim.models.KeyedVectors`` -- confirm and
    import it before calling this function.

    :param fname: path of a word2vec file in text (non-binary) format
    :return: tuple ``(model, vecs, words)`` where ``vecs`` is ``model.vectors``
             and ``words`` is the list of vocabulary keys
    """
    model = KeyedVectors.load_word2vec_format(fname, binary=False)
    vecs = model.vectors
    # Newer gensim releases expose the vocabulary as `key_to_index`; older ones as `vocab`.
    vocab = getattr(model, "key_to_index", None)
    if vocab is None:
        vocab = model.vocab
    words = list(vocab.keys())
    return model, vecs, words


def get_embeddings_based_dataset(data, word2vec_model, p2i, filter_stopwords = False):
    """Build a bag-of-word-vectors dataset from biography entries.

    Each entry's "hard_text" is split on single spaces and represented by the
    sum of its word vectors; words missing from ``word2vec_model`` contribute
    the vector stored under "unk".

    :param data: sized sequence of entries with the keys "p" (profession) and "hard_text"
    :param word2vec_model: object supporting ``word in model`` and ``model[word]``;
                           it must contain an "unk" entry
    :param p2i: mapping from profession name to integer label
    :param filter_stopwords: if True, drop words whose lower-cased form is in the
                             global ``STOPWORDS`` (must be defined by the caller's environment)
    :return: tuple ``(X, Y, unknown, vocab_counter)``: array of summed vectors,
             array of labels, list of out-of-vocabulary words (with repeats)
             and a Counter of all words seen
    """
    X, Y = [], []
    unk, total = 0., 0.
    unknown = []
    vocab_counter = Counter()

    # `tqdm` may be bound to the module (which offers `tqdm_notebook`) or, after
    # `from tqdm import tqdm`, to the progress-bar class itself; accept both.
    progress = getattr(tqdm, "tqdm_notebook", tqdm)

    for entry in progress(data, total = len(data)):
        y = p2i[entry["p"]]
        words = entry["hard_text"].split(" ")
        if filter_stopwords:
            words = [w for w in words if w.lower() not in STOPWORDS]

        vocab_counter.update(words)

        # Single pass: collect the vectors and the out-of-vocabulary words together.
        vectors, unknown_entry = [], []
        for w in words:
            if w in word2vec_model:
                vectors.append(word2vec_model[w])
            else:
                vectors.append(word2vec_model["unk"])
                unknown_entry.append(w)

        X.append(np.sum(vectors, axis = 0))
        Y.append(y)
        total += len(words)

        unknown.extend(unknown_entry)
        unk += len(unknown_entry)

    X = np.array(X)
    Y = np.array(Y)
    # With no words at all there is nothing unknown; avoid dividing by zero.
    unknown_ratio = unk / total if total else 0.0
    print("% unknown: {}".format(unknown_ratio))
    return X,Y,unknown,vocab_counter



In [4]:
# Directory the train/dev/test pickles are read from; machine-specific absolute path, adjust when running elsewhere.
data_path = Path("/home/xudongh1/Project/joint_debiasing/data/bios/biasbios_location")

In [5]:
# Read the three splits through the load_dataset helper instead of repeating the open/pickle boilerplate.
train_loc_sub = load_dataset(data_path / "train_with_location_current_red.pkl")
dev_loc_sub = load_dataset(data_path / "dev_with_location_current_red.pkl")
test_loc_sub = load_dataset(data_path / "test_with_location_current_red.pkl")

In [6]:
# Root directory of the nullspace_projection files (machine-specific absolute path).
inlp_project_path = Path("/home/xudongh1/Project/nullspace_projection")

# Label vocabularies in both directions: name -> index and index -> name.
p2i, i2p = load_dictionary( inlp_project_path / "data/biasbios/profession2index.txt")
g2i, i2g = load_dictionary( inlp_project_path / "data/biasbios/gender2index.txt")

In [7]:
train_loc_sub[1].keys()


dict_keys(['g', 'p', 'text', 'start', 'hard_text', 'hard_text_untokenized', 'text_without_gender', 'location', 'country', 'economy', 'econ_class'])

In [8]:
Counter(entry['g'] for entry in train_loc_sub)

Counter({'m': 40115, 'f': 34753})

In [9]:
Counter(entry['p'] for entry in train_loc_sub)

Counter({'attorney': 5994,
         'photographer': 7440,
         'painter': 1695,
         'psychologist': 4310,
         'filmmaker': 1885,
         'architect': 1678,
         'rapper': 470,
         'physician': 4713,
         'professor': 13411,
         'dentist': 5423,
         'accountant': 585,
         'model': 2381,
         'nurse': 6853,
         'surgeon': 4226,
         'chiropractor': 880,
         'paralegal': 181,
         'journalist': 4038,
         'pastor': 610,
         'personal_trainer': 181,
         'comedian': 603,
         'dj': 393,
         'poet': 1589,
         'software_engineer': 640,
         'teacher': 2474,
         'dietitian': 681,
         'composer': 1024,
         'interior_designer': 285,
         'yoga_teacher': 225})

In [10]:
Counter(entry['econ_class'] for entry in train_loc_sub)

Counter({1: 62704, 0: 12164})

# Create DataFrame

In [11]:
# One DataFrame per split, built from the list of entry dicts
bios_train_df, bios_dev_df, bios_test_df = (
    pd.DataFrame(split) for split in (train_loc_sub, dev_loc_sub, test_loc_sub)
)

# Get BERT encoding

In [12]:
import torch
from transformers import *
import pickle
from tqdm import tqdm

In [14]:
import os
# Expose only the GPU with index 1 to this process.
# NOTE(review): presumably this has to run before the first CUDA call initializes the device -- confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = "1"

In [15]:
# Pretrained checkpoint name plus the matching model / tokenizer classes
pretrained_weights = 'bert-base-uncased'
model_class, tokenizer_class = BertModel, BertTokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [18]:
def tokenize(tokenizer, df, max_length=512):
    """
    Iterate over the "hard_text" column and tokenize every text.

    Sequences longer than ``max_length`` token ids are cut by plain slicing, so
    for such sequences whatever the tokenizer appended at the end (including
    any trailing special token) is dropped together with the overflow.

    :param tokenizer: tokenizer to use; must provide ``encode(text, add_special_tokens=True)``
    :param df: DataFrame (or mapping) whose "hard_text" entry holds the texts to tokenize
    :param max_length: maximum number of token ids kept per text (512 by default)
    :return: a list with one list of token ids per text
    """
    tokenized_data = []
    for row in tqdm(df['hard_text']):
        tokens = tokenizer.encode(row, add_special_tokens=True)
        # keep at most `max_length` token ids per text
        tokenized_data.append(tokens[:max_length])
    return tokenized_data

In [19]:
# Tokenize the three splits in train, dev, test order
bios_train_tokens, bios_dev_tokens, bios_test_tokens = (
    tokenize(tokenizer, frame) for frame in (bios_train_df, bios_dev_df, bios_test_df)
)

 56%|█████▋    | 42298/74868 [00:58<00:43, 753.40it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (593 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 74868/74868 [01:42<00:00, 727.35it/s]
100%|██████████| 11902/11902 [00:14<00:00, 805.84it/s]
100%|██████████| 26202/26202 [00:38<00:00, 675.23it/s]


In [20]:
# Store every split's token ids next to its source rows
for _frame, _tokens in (
    (bios_train_df, bios_train_tokens),
    (bios_dev_df, bios_dev_tokens),
    (bios_test_df, bios_test_tokens),
):
    _frame["BERT_tokens"] = _tokens

In [24]:
# bios_train_df.to_pickle("bios_train_df")
# bios_dev_df.to_pickle("bios_dev_df")
# bios_test_df.to_pickle("bios_test_df")

In [22]:
# Move the model to the GPU so the encoding pass below runs on CUDA.
model = model.cuda()

In [25]:
def encode_text(model, data):
    """
    Encode tokenized texts one at a time with the given model.

    :param model: encoding model (a torch module); ``model(input_ids)[0]`` must be
                  the per-token hidden states with a leading batch dimension of 1
    :param data: iterable of token-id lists, one per text
    :return: two numpy matrices of the data:
                first: average over all token states of each text
                second: state of the first token of each text
    """
    # Send the inputs to whichever device the model lives on instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    all_data_cls = []
    all_data_avg = []
    for row in tqdm(data):
        # Texts differ in length, so each one is run as its own batch of size 1.
        input_ids = torch.tensor([row], device=device)
        with torch.no_grad():
            last_hidden_states = model(input_ids)[0].detach().cpu()
        token_states = last_hidden_states.squeeze(0)
        all_data_avg.append(token_states.mean(dim=0).numpy())
        all_data_cls.append(token_states[0].numpy())
    return np.array(all_data_avg), np.array(all_data_cls)

In [26]:
train_avg_data, train_cls_data = encode_text(model, bios_train_tokens)

100%|██████████| 74868/74868 [17:54<00:00, 69.68it/s]


In [31]:
# list(...) splits each 2-D matrix into one vector per row so every DataFrame cell holds a single embedding.
bios_train_df["train_avg_data"] = list(train_avg_data)
bios_train_df["train_cls_data"] = list(train_cls_data)

In [32]:
# bios_train_df.to_pickle("bios_train_df")

In [34]:
dev_avg_data, dev_cls_data = encode_text(model, bios_dev_tokens)

100%|██████████| 11902/11902 [02:52<00:00, 68.84it/s]


In [35]:
# The dev embeddings are stored under the same "train_*" column names as in the training frame.
bios_dev_df["train_avg_data"] = list(dev_avg_data)
bios_dev_df["train_cls_data"] = list(dev_cls_data)

In [36]:
# bios_dev_df.to_pickle("bios_dev_df")

In [37]:
test_avg_data, test_cls_data = encode_text(model, bios_test_tokens)

100%|██████████| 26202/26202 [06:28<00:00, 67.38it/s]


In [38]:
# The test embeddings are stored under the same "train_*" column names as in the training frame.
bios_test_df["train_avg_data"] = list(test_avg_data)
bios_test_df["train_cls_data"] = list(test_cls_data)

In [39]:
# bios_test_df.to_pickle("bios_test_df")

In [42]:
# Integer profession labels for every split (KeyError on a profession missing from p2i)
bios_train_main_label, bios_dev_main_label, bios_test_main_label = (
    [p2i[profession] for profession in frame["p"]]
    for frame in (bios_train_df, bios_dev_df, bios_test_df)
)

In [43]:
# Integer gender labels for every split (KeyError on a gender missing from g2i)
bios_train_gender_label, bios_dev_gender_label, bios_test_gender_label = (
    [g2i[gender] for gender in frame["g"]]
    for frame in (bios_train_df, bios_dev_df, bios_test_df)
)

In [44]:
# Attach the integer gender labels to their frames
for _frame, _labels in (
    (bios_train_df, bios_train_gender_label),
    (bios_dev_df, bios_dev_gender_label),
    (bios_test_df, bios_test_gender_label),
):
    _frame["gender_class"] = _labels

In [45]:
# Attach the integer profession labels to their frames
for _frame, _labels in (
    (bios_train_df, bios_train_main_label),
    (bios_dev_df, bios_dev_main_label),
    (bios_test_df, bios_test_main_label),
):
    _frame["profession_class"] = _labels

In [46]:
# Persist the enriched frames into the current working directory
for _frame, _filename in (
    (bios_train_df, "bios_train_df.pkl"),
    (bios_dev_df, "bios_dev_df.pkl"),
    (bios_test_df, "bios_test_df.pkl"),
):
    _frame.to_pickle(_filename)

In [47]:
bios_test_df

Unnamed: 0,g,p,text,start,hard_text,hard_text_untokenized,text_without_gender,location,country,economy,econ_class,BERT_tokens,train_avg_data,train_cls_data,gender_class,profession_class
0,f,teacher,Samantha Gamble is a music teacher at Swift El...,83,Gamble works to integrate the arts into core s...,Gamble works to integrate the arts into core s...,_ works to integrate the arts into core subjec...,"[Chicago, Illinois]",united states,High income (H),1,"[101, 18503, 2573, 2000, 17409, 1996, 2840, 20...","[-0.21925582, 0.3185657, -0.082215905, -0.2758...","[-0.3803718, -0.097306155, -0.8822688, -0.6219...",1,26
1,m,professor,Miguel Esteban is an Assistant Professor in th...,126,"There , his research and teaching relates to t...","There, his research and teaching relates to th...","There, _ research and teaching relates to the ...",[Japan],japan,High income (H),1,"[101, 2045, 1010, 2010, 2470, 1998, 4252, 1462...","[-0.3420705, 0.08448499, -0.14151192, -0.07653...","[-0.39880997, -0.000994616, -0.7533077, -0.208...",0,21
2,m,professor,Dr. Neil Rubens is an Assistant Professor at t...,123,He is the Director of Active Intelligence Rese...,He is the Director of Active Intelligence Rese...,_ is the Director of Active Intelligence Resea...,[Japan],japan,High income (H),1,"[101, 2002, 2003, 1996, 2472, 1997, 3161, 4454...","[-0.44729003, 0.11454038, 0.05102728, -0.06796...","[-0.21576567, -0.2939665, -0.72540194, -0.2641...",0,21
3,m,dentist,"Dr. Bhagirath Rajpurohit is a Dentist in Kudi,...",102,Dr. Bhagirath Rajpurohit practices at Satyam D...,Dr. Bhagirath Rajpurohit practices at Satyam D...,Dr. _ _ practices at Satyam Dental Clinic in K...,"[Kudi, Jodhpur]",india,Lower middle income (LM),0,"[101, 2852, 1012, 1038, 3270, 5856, 27362, 119...","[0.33411, 0.121224634, -0.053565085, -0.329597...","[-0.40261286, 0.17509018, -0.30223414, -0.6536...",0,6
4,f,pastor,Carol Howard Merritt is a pastor at Western Pr...,83,"Western is a traditional , intergenerational c...","Western is a traditional, intergenerational co...","Western is a traditional, intergenerational co...",[Washington],united states,High income (H),1,"[101, 2530, 2003, 1037, 3151, 1010, 6970, 6914...","[-0.08476864, 0.15655439, 0.011266252, -0.3253...","[-0.17510362, 0.039392967, -0.7716085, -0.7649...",1,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26197,f,professor,Masami Toku is an associate professor of art e...,93,She is the general director of the project Pow...,She is the general director of the project Pow...,_ is the general director of the project Power...,[Chico],united states,High income (H),1,"[101, 2016, 2003, 1996, 2236, 2472, 1997, 1996...","[-0.27369493, 0.024090193, 0.105042726, -0.109...","[-0.1305432, 0.023827186, -0.52613086, -0.4801...",1,21
26198,f,journalist,Journalist Gabrielle Paluch is a freelance jou...,108,"She worked as an editor at the Myanmar Times ,...","She worked as an editor at the Myanmar Times, ...","_ worked as an editor at the Myanmar Times, wh...","[Myanmar, Thailand]",thailand,Upper middle income (UM),0,"[101, 2016, 2499, 2004, 2019, 3559, 2012, 1996...","[-0.08720053, -0.06562692, -0.008284882, -0.18...","[-0.11247709, -0.022806767, -0.78408843, -0.79...",1,11
26199,m,photographer,Photographer Alexander Lupascu is a photograph...,161,He started shooting back in 2006 and got into ...,He started shooting back in 2006 and got into ...,_ started shooting back in 2006 and got into w...,[Romania],romania,High income (H),1,"[101, 2002, 2318, 5008, 2067, 1999, 2294, 1998...","[0.0804553, 0.026444903, 0.13191348, -0.030254...","[-0.17706892, -0.03509802, -0.1651837, -0.4455...",0,18
26200,f,surgeon,Dr. G Santhi Vardhani is a General Surgeon in ...,115,Dr. G Santhi Vardhani practices at Tulasi Hosp...,Dr. G Santhi Vardhani practices at Tulasi Hosp...,Dr. G _ _ practices at Tulasi Hospitals in Kus...,"[Hyderabad, Kushaiguda]",india,Lower middle income (LM),0,"[101, 2852, 1012, 1043, 15548, 4048, 13075, 17...","[0.14391442, 0.14700025, -0.093546554, -0.2743...","[-0.18534903, -0.08657265, -0.2575438, -0.7811...",1,25
