In [None]:
pip install transformers

In [None]:
import time
import torch
import random
import datetime
import warnings
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import torch.nn as nn
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup,AdamW
from transformers import BertTokenizer,BertForSequenceClassification,BertConfig
from transformers import DistilBertTokenizer,DistilBertForSequenceClassification,DistilBertConfig
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.metrics import confusion_matrix
import torch.nn.functional as F

In [None]:
#Helper function for formatting time

def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second before formatting.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
def demographic_parity(label, sensitive_att):
  """Absolute gap in positive-label rate between the two sensitive groups.

  Computes ``|P(label = 1 | group = 1) - P(label = 1 | group = 0)|`` over the
  paired entries of ``label`` and ``sensitive_att``.

  Args:
    label: indexable sequence of labels/predictions; only entries equal to
      0 or 1 are counted.
    sensitive_att: indexable sequence of group ids aligned with ``label``;
      only entries equal to 0 or 1 are counted.

  Returns:
    The rate gap as a float, or ``nan`` when either group has no counted
    entries (the rate is undefined there; the previous code raised
    ZeroDivisionError in that case).
  """
  positives = {0: 0, 1: 0}   # per group: number of entries with label 1
  totals = {0: 0, 1: 0}      # per group: number of entries with label 0 or 1

  for i in range(len(label)):
    is_positive = label[i] == 1
    if not (is_positive or label[i] == 0):
      continue  # labels other than 0/1 are ignored

    if sensitive_att[i] == 1:
      group = 1
    elif sensitive_att[i] == 0:
      group = 0
    else:
      continue  # group ids other than 0/1 are ignored

    totals[group] += 1
    if is_positive:
      positives[group] += 1

  # A group without any samples has no defined rate.
  if totals[0] == 0 or totals[1] == 0:
    return float("nan")

  return abs((positives[1] / totals[1]) - (positives[0] / totals[0]))

In [None]:
# Pick the device used for training: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

In [None]:
main_df = pd.read_csv('review/train.csv')
test_df = pd.read_csv('review/test.csv')

# Hold out 10% of the training file for validation. Text, label and gender are
# split together so the three lists stay aligned row by row.
(train_post, val_post,
 train_label, val_label,
 train_gender, val_gender) = train_test_split(
    *(main_df[column].values.tolist() for column in ("text", "label", "gender")),
    test_size=0.10,
    random_state=42,
)

# The test file is used as-is.
test_posts, test_label, test_gender = (
    test_df[column].values.tolist() for column in ("text", "label", "gender")
)

In [None]:
def _label_rate_gap(table):
    """|P(label = 1 | gender = 0) - P(label = 1 | gender = 1)| from a gender-by-label crosstab.

    ``table`` is indexed column-first: ``table[label][gender]``.
    """
    rate_gender_0 = table[1][0] / (table[1][0] + table[0][0])
    rate_gender_1 = table[1][1] / (table[0][1] + table[1][1])
    return abs(rate_gender_0 - rate_gender_1)


def _report_split(table, heading):
    """Print the crosstab, a heading line, then the label-rate gap between genders."""
    print(table)
    print(heading)
    print(_label_rate_gap(table))


# Gender (rows) by label (columns) counts for every split, with margin totals.
train_crosstab = pd.crosstab(train_gender, train_label, margins=True)
_report_split(train_crosstab, "\nTrain DP:")

val_crosstab = pd.crosstab(val_gender, val_label, margins=True)
_report_split(val_crosstab, "\nValidation DP:")

test_crosstab = pd.crosstab(test_gender, test_label, margins=True)
_report_split(test_crosstab, "\nTest DP:")

In [None]:
# Initializing hyperparameters
batch_size = 50            # examples per DataLoader batch
learning_rate = 1e-5       # classifier optimizer step size
learning_rate_adv = 1e-4   # adversary optimizer step size
epochs = 5                 # used to size the learning-rate schedule
MAX_LEN = 256              # token length every post is padded/truncated to

epsilon = 1e-8             # eps passed to the classifier's AdamW
warmup_steps = 10          # warm-up steps of the learning-rate schedule

In [None]:
# Name of the pretrained checkpoint; also used further down when the classifier is loaded.
bert_model = "distilbert-base-uncased"
# Tokenizer for that checkpoint. tokenize_util and train_model read this global.
tokenizer = DistilBertTokenizer.from_pretrained(bert_model)

In [None]:
def tokenize_util(given_text,MAX_LEN):
  """Encode posts into fixed-length input-id and attention-mask tensors.

  Each post is encoded with the notebook-level ``tokenizer``: special tokens
  are added and the sequence is truncated or padded to exactly ``MAX_LEN``.

  Args:
    given_text: iterable of posts to encode.
    MAX_LEN: target sequence length after padding/truncation.

  Returns:
    ``(input_ids, attention_masks)``: two tensors with one row per post and
    ``MAX_LEN`` columns. For an empty ``given_text`` both are empty
    ``(0, MAX_LEN)`` long tensors (``torch.cat`` would otherwise raise).
  """
  input_ids=[]
  attention_masks=[]

  for post in given_text:
    encoded_dict = tokenizer.encode_plus(
                        post,                           # Text to encode.
                        add_special_tokens = True,      # Add the model's special tokens.
                        truncation = True,              # Cut posts longer than max_length.
                        max_length = MAX_LEN,
                        padding = 'max_length',         # Pad shorter posts up to max_length
                                                        # (replaces deprecated pad_to_max_length).
                        return_attention_mask = True,   # Marks real tokens vs. padding.
                        return_tensors = 'pt',          # Return pytorch tensors.
                   )

    # Each entry holds a single row; the rows are concatenated below.
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

  # Nothing to concatenate: hand back correctly shaped empty tensors.
  if not input_ids:
    empty = torch.empty((0, MAX_LEN), dtype=torch.long)
    return empty, empty.clone()

  # Converting the lists into tensors
  input_ids = torch.cat(input_ids, dim=0)
  attention_masks = torch.cat(attention_masks, dim=0)

  return input_ids,attention_masks

In [None]:
# Encode the training, validation and test posts into input ids and attention masks.
trn_inputs, trn_attentions = tokenize_util(train_post, MAX_LEN)
vldtn_inputs, vldtn_attentions = tokenize_util(val_post, MAX_LEN)
test_inputs, test_attentions = tokenize_util(test_posts, MAX_LEN)

# Labels and the sensitive attribute (gender) become tensors so they can be batched with the inputs.
train_label, train_sensitive = torch.tensor(train_label), torch.tensor(train_gender)
val_label, val_sensitive = torch.tensor(val_label), torch.tensor(val_gender)
test_label, test_sensitive = torch.tensor(test_label), torch.tensor(test_gender)

# Sanity check: one line per split with the shapes of (input ids, attention masks, labels).
for _ids, _masks, _targets in (
    (trn_inputs, trn_attentions, train_label),
    (vldtn_inputs, vldtn_attentions, val_label),
    (test_inputs, test_attentions, test_label),
):
    print(np.asarray(_ids).shape, np.asarray(_masks).shape, np.asarray(_targets).shape)

In [None]:
# Bundle each split's input ids, attention masks, labels and sensitive attribute
# into one dataset; a batch is therefore a 4-element sequence in that order.
train_dataset = TensorDataset(trn_inputs, trn_attentions, train_label, train_sensitive)
val_dataset = TensorDataset(vldtn_inputs, vldtn_attentions, val_label, val_sensitive)
test_dataset = TensorDataset(test_inputs, test_attentions, test_label, test_sensitive)

# Training batches are drawn in random order; validation and test keep dataset order.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    sampler=RandomSampler(train_dataset),
)
validation_dataloader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    sampler=SequentialSampler(val_dataset),
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    sampler=SequentialSampler(test_dataset),
)

In [None]:
# Classifier to be debiased, loaded from the checkpoint named by `bert_model`.
clf = DistilBertForSequenceClassification.from_pretrained(
    bert_model,
    num_labels = 2,               # The number of output labels--2 for binary classification.
                                  # You can increase this for multi-class tasks.
    output_attentions = True,     # train_model reads outputs.attentions.
    output_hidden_states = True,  # The adversary is fed from outputs.hidden_states.
)


# Move the model to the device selected earlier. `.cuda()` would raise on a
# CPU-only machine even though the device cell explicitly falls back to the CPU.
clf.to(device)

In [None]:
# Adversary: a small MLP that maps a 768-dimensional hidden vector from the
# classifier to 2 logits, trained against the sensitive attribute.
adv = nn.Sequential(
    nn.Linear(768, 512),
    nn.ReLU(),
    nn.Linear(512, 128),
    nn.ReLU(),
    nn.Linear(128, 2),
)
# Use the device selected earlier instead of `.cuda()`, which raises on a
# CPU-only machine despite the CPU fallback in the device cell.
adv.to(device)

In [None]:
# Directory holding a previously saved DistilBERT classifier `g`.
# NOTE(review): the path suggests this is the plain ("vanilla") fine-tuned model — confirm.
g_dir = 'review/models/distilbert/vanila/model_g'
# NOTE(review): tokenizer_g is not referenced anywhere else in the visible notebook.
tokenizer_g = DistilBertTokenizer.from_pretrained(g_dir)
# g is only queried for its attention maps (see evaluate_to_attention); train_model moves it to `device`.
g = DistilBertForSequenceClassification.from_pretrained(g_dir, num_labels = 2, output_attentions=True)

In [None]:
# The learning-rate schedule is sized for `epochs` passes over the training loader.
total_steps = epochs * len(train_dataloader)

# Classifier optimizer plus its warm-up schedule.
optim = AdamW(clf.parameters(), lr=learning_rate, eps=epsilon)
scheduler = get_linear_schedule_with_warmup(
    optim,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

# The adversary has its own optimizer and learning rate, and no scheduler.
adv_optim = AdamW(adv.parameters(), lr=learning_rate_adv)

In [None]:
def evaluate(model,data):
  """Score ``model`` on every batch of ``data``.

  The model is switched to eval mode and is left in that mode on return.

  Args:
    model: classifier called as ``model(ids, attention_mask=..., labels=...,
      output_attentions=True)`` whose output exposes ``.loss`` and ``.logits``.
    data: iterable of batches ``(input_ids, attention_mask, labels, sensitive)``.

  Returns:
    ``(accuracy, demographic-parity gap of the predictions, summed batch loss)``.
    The loss is the sum over batches, not the mean.
  """
  model.eval()

  batch_logits, batch_labels, batch_sensitives = [], [], []
  total_eval_loss = 0

  for batch in data:
    b_input_ids, b_input_mask, b_labels, b_sensitive = (
        batch[i].to(device) for i in range(4)
    )

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
      outputs = model(b_input_ids,
                      attention_mask=b_input_mask,
                      labels=b_labels,
                      output_attentions=True)

    total_eval_loss += outputs.loss.item()
    batch_logits.append(outputs.logits.detach().cpu().numpy())
    batch_labels.append(b_labels.to('cpu').numpy())
    batch_sensitives.append(b_sensitive.to('cpu').numpy())

  # Predicted class = arg-max over the logits of each example.
  preds = np.argmax(np.concatenate(batch_logits, axis=0), axis=1).flatten()
  labels = np.concatenate(batch_labels, axis=0).flatten()
  sensitives = np.concatenate(batch_sensitives, axis=0).flatten()

  acc = accuracy_score(labels, preds)
  rd = demographic_parity(preds, sensitives)

  return acc, rd, total_eval_loss

In [None]:
import nltk
import string
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus that stopwords.words() reads below.
nltk.download('stopwords')

# Lookup sets consulted by mask_preprocessing to decide which tokens to blank out.
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)

def mask_preprocessing(attention_tuple, input_ids, tokenizer):
    """Zero the attention rows and columns of stop-word and punctuation tokens.

    A token position is filtered when ``tokenizer.decode([token_id])`` is a
    member of the notebook-level ``stop_words`` or ``punctuation`` sets.

    The filter depends only on the tokens, not on the layer, so it is built
    once per call and each distinct token id is decoded once (the earlier
    version decoded every position again for every layer).

    Args:
        attention_tuple: iterable of 4-D attention tensors indexed as
            ``[batch, head, query position, key position]`` (axis 1 is
            presumably the attention heads; it is left untouched).
        input_ids: token ids, one row per sequence, as a tensor or rectangular
            nested list whose sequence length matches the attention maps.
        tokenizer: object providing ``decode(list_of_ids) -> str``.

    Returns:
        Tuple of new tensors, one per input layer, where every entry whose
        query or key position is a filtered token is 0. Inputs are not modified.
    """
    ids = torch.as_tensor(input_ids)

    # True at every (sequence, position) that holds a filtered token.
    drop = torch.zeros_like(ids, dtype=torch.bool)
    for token_id in torch.unique(ids).tolist():
        token = tokenizer.decode([token_id])
        if token in stop_words or token in punctuation:
            drop |= ids == token_id

    # Broadcastable (batch, 1, seq, seq) mask: query position OR key position filtered.
    pair_mask = drop[:, None, :, None] | drop[:, None, None, :]

    return tuple(
        attention.masked_fill(pair_mask.to(attention.device), 0)
        for attention in attention_tuple
    )


def min_max_normalize(attention):
    """Rescale a tensor to [0, 1] using its global minimum and maximum.

    Args:
        attention: non-empty tensor; the min and max are taken over all elements.

    Returns:
        ``(attention - min) / (max - min)``. When every element is equal the
        range is zero; the result is then all zeros instead of the NaNs that
        0/0 would produce (and that would poison any loss built on it).
    """
    min_val = torch.min(attention)
    max_val = torch.max(attention)
    span = max_val - min_val
    # Swap a zero range for 1 so the division is defined; the numerator is all zeros then.
    safe_span = torch.where(span == 0, torch.ones_like(span), span)
    normalized_attention = (attention - min_val) / safe_span
    return normalized_attention

In [None]:
def pretrain_model(clf, data, optimizer, scheduler, epoch):
  """Train the classifier on its own task loss for ``epoch`` passes over ``data``.

  After every pass the mean batch loss, the training accuracy and the micro
  F1 of that pass are printed.

  Args:
    clf: classifier called with ``input_ids``, ``attention_mask`` and
      ``labels``; its output must expose the loss at index 0 and ``.logits``.
    data: sized iterable of batches whose first three items are input ids,
      attention masks and labels.
    optimizer: optimizer over the classifier parameters.
    scheduler: learning-rate scheduler, stepped once per batch.
    epoch: number of passes over ``data``.

  Returns:
    The trained ``clf`` (the same object, updated in place).
  """
  clf.train()
  # Progress is reported against the loader that was passed in, not the
  # notebook-level train_dataloader.
  n_batches = len(data)

  for e in range(epoch):
    print("")
    print('epoch: ', e+1)
    epoch_loss = 0
    preds = []
    labels = []
    t0 = time.time()

    for step, batch in enumerate(data):

      # Progress line every 100 batches.
      if step % 100 == 0 and not step == 0:
          elapsed = format_time(time.time() - t0)
          print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, n_batches, elapsed))

      b_input_ids = batch[0].to(device)
      b_input_mask = batch[1].to(device)
      b_labels = batch[2].to(device)

      inputs = {'input_ids':      b_input_ids,
                'attention_mask': b_input_mask,
                'labels':         b_labels,
                }

      optimizer.zero_grad()
      outputs = clf(**inputs)
      loss = outputs[0]
      logits = outputs.logits
      logits = logits.detach().cpu().numpy()
      label_ids = b_labels.to('cpu').numpy()
      preds.append(logits)
      labels.append(label_ids)
      epoch_loss += loss.item()
      loss.backward()
      # Clip the gradient norm to 1.0 before the update.
      torch.nn.utils.clip_grad_norm_(clf.parameters(), 1.0)
      optimizer.step()
      scheduler.step()

    print("loss: {0:.7f}".format(epoch_loss/len(data)))
    preds = np.concatenate(preds,axis=0)
    preds = np.argmax(preds, axis=1).flatten()
    labels = np.concatenate(labels,axis=0)
    labels = labels.flatten()
    acc = accuracy_score(labels, preds)
    f1_micro = f1_score(labels, preds, average="micro")
    print("accuracy: {0:.7f}".format(acc))
    print("f1: {0:.7f}".format(f1_micro))

  return clf


def pretrain_debiasing_model(adv, clf, data, adv_optimizer, epoch):
  """Train the adversary to predict the sensitive attribute from classifier features.

  The feature fed to ``adv`` is position 0 of the classifier's last hidden
  state. Only ``adv`` is updated: the classifier forward pass runs under
  ``torch.no_grad()`` so no gradients are computed or stored for it, whether
  or not the caller froze its parameters.

  NOTE: only ``adv.train()`` is called here; ``clf`` keeps whatever
  train/eval mode the caller left it in.

  Args:
    adv: adversary network mapping the feature vector to 2 logits.
    clf: classifier whose output exposes ``.hidden_states``.
    data: sized iterable of batches ``(input_ids, attention_mask, labels, sensitive)``.
    adv_optimizer: optimizer over the adversary parameters.
    epoch: number of passes over ``data``.

  Returns:
    The trained ``adv`` (the same object, updated in place).
  """
  cross_entropy_loss = torch.nn.CrossEntropyLoss()
  adv.train()
  # Progress is reported against the loader that was passed in, not the
  # notebook-level train_dataloader.
  n_batches = len(data)

  for e in range(epoch):
    print("")
    print('epoch: ', e+1)
    epoch_loss = 0
    preds = []
    sensitives = []
    t0 = time.time()

    for step, batch in enumerate(data):

      # Progress line every 100 batches.
      if step % 100 == 0 and not step == 0:
          elapsed = format_time(time.time() - t0)
          print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, n_batches, elapsed))

      b_input_ids = batch[0].to(device)
      b_input_mask = batch[1].to(device)
      b_labels = batch[2].to(device)
      b_sensitive = batch[3].to(device)

      inputs = {'input_ids':      b_input_ids,
                'attention_mask': b_input_mask,
                'labels':         b_labels,
                }

      adv_optimizer.zero_grad()
      # The classifier is only a feature extractor here.
      with torch.no_grad():
        outputs = clf(**inputs)
        last_hidden_states = outputs.hidden_states[-1][:, 0, :]
      adv_outputs = adv(last_hidden_states)
      p = adv_outputs.detach().cpu().numpy()
      preds.append(p)
      adv_loss = cross_entropy_loss(adv_outputs, b_sensitive)
      sensitive_ids = b_sensitive.to('cpu').numpy()
      sensitives.append(sensitive_ids)
      epoch_loss += adv_loss.item()
      adv_loss.backward()
      adv_optimizer.step()

    print("loss: {0:.7f}".format(epoch_loss/len(data)))
    preds = np.concatenate(preds,axis=0)
    preds = np.argmax(preds, axis=1).flatten()
    sensitives = np.concatenate(sensitives,axis=0)
    sensitives = sensitives.flatten()
    acc = accuracy_score(sensitives, preds)
    f1_micro = f1_score(sensitives, preds, average="micro")
    print("accuracy: {0:.7f}".format(acc))
    print("f1: {0:.7f}".format(f1_micro))

  return adv


def evaluate_to_attention(model, b_input_ids, attention_mask):
    """Return the attention maps ``model`` produces for one batch, without gradients.

    The model is switched to eval mode (and left in it) and called under
    ``torch.no_grad()`` with ``output_attentions=True``.

    Args:
        model: model whose output exposes ``.attentions``.
        b_input_ids: batch of input ids passed positionally to the model.
        attention_mask: attention mask passed as ``attention_mask=``.

    Returns:
        The ``attentions`` attribute of the model output.
    """
    model.eval()

    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=attention_mask, output_attentions=True)

    return outputs.attentions


def train_model(clf, g, adv, data, optimizer, adv_optimizer, lmbda, alpha, lr_scheduler=None):
  """Apply ONE debiasing update to the classifier, using the first batch of ``data``.

  The minimised objective is::

      task_loss - lmbda * adversary_loss + alpha * mean(cos_sim(att_clf, att_g) ** 2)

  where ``att_clf`` / ``att_g`` are the layer-averaged, stop-word-masked,
  min-max-normalised attention maps of ``clf`` and of the reference model
  ``g``. The per-batch losses, accuracy and micro F1 are printed.

  Args:
    clf: classifier being trained; its output must expose the loss at index
      0, ``.logits``, ``.hidden_states`` and ``.attentions``.
    g: reference model, queried only for its attention maps (no gradients).
    adv: adversary network fed with position 0 of the last hidden state.
    data: iterable of batches ``(input_ids, attention_mask, labels, sensitive)``;
      only the first batch is consumed per call.
    optimizer: optimizer over the classifier parameters.
    adv_optimizer: accepted for call compatibility; not used here.
    lmbda: weight of the adversary loss term.
    alpha: weight of the attention-similarity term.
    lr_scheduler: scheduler stepped after the update. Defaults to the
      notebook-level ``scheduler`` so existing calls behave as before.

  Returns:
    ``clf`` (the same object, updated in place).
  """
  cross_entropy_loss = torch.nn.CrossEntropyLoss()
  clf.train()

  g.to(device)

  # One mini-batch per call; an empty loader leaves the classifier untouched.
  batch = next(iter(data), None)
  if batch is None:
    return clf

  if lr_scheduler is None:
    lr_scheduler = scheduler

  b_input_ids = batch[0].to(device)
  b_input_mask = batch[1].to(device)
  b_labels = batch[2].to(device)
  b_sensitive = batch[3].to(device)

  inputs = {'input_ids':      b_input_ids,
            'attention_mask': b_input_mask,
            'labels':         b_labels,
            }

  optimizer.zero_grad()
  outputs = clf(**inputs)
  loss = outputs[0]
  logits = outputs.logits
  last_hidden_states = outputs.hidden_states[-1][:, 0, :]

  f_attention = outputs.attentions
  g_attention = evaluate_to_attention(g, b_input_ids, b_input_mask)
  # evaluate_to_attention puts its model in eval mode; that is `g`, so clf stays in train mode.

  # Mask stop words and punctuation in attention matrices
  f_attention_masked = mask_preprocessing(f_attention, b_input_ids, tokenizer)
  g_attention_masked = mask_preprocessing(g_attention, b_input_ids, tokenizer)

  # Average over the layer dimension (dimension 0 after stacking).
  f_attention_aggregated = torch.mean(torch.stack(f_attention_masked), dim=0)
  g_attention_aggregated = torch.mean(torch.stack(g_attention_masked), dim=0)

  # Collapse all leading dimensions so each item is one (rows, cols) attention matrix.
  f_attention_aggregated = f_attention_aggregated.view(-1, f_attention_aggregated.size(-2), f_attention_aggregated.size(-1))
  g_attention_aggregated = g_attention_aggregated.view(-1, g_attention_aggregated.size(-2), g_attention_aggregated.size(-1))

  f_attention_normalized = min_max_normalize(f_attention_aggregated)
  g_attention_normalized = min_max_normalize(g_attention_aggregated)

  # Squared row-wise cosine similarity between the two models' attention, averaged.
  cosine_similarity_aggregated = F.cosine_similarity(f_attention_normalized, g_attention_normalized, dim=-1)
  cosine_similarity_aggregated = cosine_similarity_aggregated*cosine_similarity_aggregated
  mean_cosine_similarity = cosine_similarity_aggregated.mean()

  adv_outputs = adv(last_hidden_states)
  adv_loss = cross_entropy_loss(adv_outputs, b_sensitive)
  # The adversary loss enters with a minus sign: the classifier is rewarded
  # when the adversary does badly.
  total_loss = loss - (lmbda * adv_loss) + (alpha * mean_cosine_similarity)

  total_loss.backward()
  optimizer.step()
  lr_scheduler.step()

  print("clf loss: {0:.7f}".format(loss.item()))
  print("adv loss: {0:.7f}".format(adv_loss.item()))
  print("cosine similarity: {0:.7f}".format(mean_cosine_similarity.item()))
  print("total loss: {0:.7f}".format(total_loss.item()))

  # Metrics on this batch, from the logits computed before the update.
  preds = np.argmax(logits.detach().cpu().numpy(), axis=1).flatten()
  labels = b_labels.to('cpu').numpy().flatten()
  acc = accuracy_score(labels, preds)
  f1_micro = f1_score(labels, preds, average="micro")
  print("accuracy: {0:.7f}".format(acc))
  print("f1: {0:.7f}".format(f1_micro))

  return clf

In [None]:
print("~~~~~~~~~~~~~~~ CLF Pre-training ~~~~~~~~~~~~~~~")

# The adversary's parameters are frozen while the classifier is pre-trained
# for 3 epochs on its task loss, then unfrozen again.
adv.requires_grad_(False)
clf = pretrain_model(clf, train_dataloader, optim, scheduler, 3)
adv.requires_grad_(True)

print("")

In [None]:
print("~~~~~~~~~~~~~~~ ADV Pre-training ~~~~~~~~~~~~~~~")

# The classifier's parameters are frozen while the adversary is pre-trained
# for 25 epochs on the classifier's features, then unfrozen again.
clf.requires_grad_(False)
adv = pretrain_debiasing_model(adv, clf, train_dataloader, adv_optim, 25)
clf.requires_grad_(True)

print("")

In [None]:
# Alternating adversarial training.
iterations = 40   # number of (adversary epoch, classifier step) rounds
lbda = 10         # weight of the adversary loss in the classifier objective
alpha = 5         # weight of the attention-similarity term

for iteration in range(iterations):
    print("\n")
    print("Iteration: ", iteration)

    # Train the adversary for one epoch with the classifier frozen.
    for param in clf.parameters():
        param.requires_grad = False

    print("-------------for adv-------------")
    adv = pretrain_debiasing_model(adv, clf, train_dataloader, adv_optim, 1)

    for param in clf.parameters():
        param.requires_grad = True

    # Train the classifier for one mini-batch with the adversary frozen.
    for param in adv.parameters():
        param.requires_grad = False

    print("\n")
    print("-------------for clf-------------")
    clf = train_model(clf, g, adv, train_dataloader, optim, adv_optim, lbda, alpha)

    for param in adv.parameters():
        param.requires_grad = True

    # Every second iteration, report metrics on the train and validation splits.
    # evaluate() returns the demographic-parity gap of the predictions, so it is
    # labelled "DP" (it was previously mislabelled "E-Opp").
    if (iteration + 1) % 2 == 0:
        print("\n")
        print("evaluation\n")
        acc, rd, lss = evaluate(clf, train_dataloader)
        print("train loss: {0:.7f}".format(lss))
        print("train accuracy: {0:.7f}".format(acc))
        print("train DP: {0:.7f}".format(rd))
        print("\n")

        acc, rd, lss = evaluate(clf, validation_dataloader)
        print("validation loss: {0:.7f}".format(lss))
        print("validation accuracy: {0:.7f}".format(acc))
        print("validation DP: {0:.7f}".format(rd))

print("\n")
print("Finish")

In [None]:
# Final evaluation on the test split: evaluate() returns
# (accuracy, demographic-parity gap of the predictions, summed batch loss).
acc, rd, lss = evaluate(clf, test_dataloader)
print(acc, rd, lss)