In [None]:
pip install transformers

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import DistilBertTokenizer,DistilBertForSequenceClassification,DistilBertConfig
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import euclidean as euclidean_dist
from scipy.spatial.distance import cosine as cosine_sim
from scipy.integrate import trapz
import torch
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [None]:
def input_tokenizer(tokenizer, text, leng):
  """Encode every document in ``text`` separately with ``tokenizer``.

  Each document is passed to ``tokenizer.encode_plus`` with special tokens
  enabled, truncation to ``leng`` tokens, PyTorch tensors as the return format
  and the attention mask requested.

  Args:
    tokenizer: object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
    text: iterable of documents to encode.
    leng: value forwarded as ``max_length``.

  Returns:
    A tuple ``(input_tokens, input_ids, attention_mask)`` of three lists with
    one entry per document: the full encoding, its ``'input_ids'`` entry and
    its ``'attention_mask'`` entry.
  """
  encodings = [
      tokenizer.encode_plus(
          document,
          add_special_tokens=True,
          truncation=True,
          max_length=leng,
          return_tensors='pt',
          return_attention_mask=True,
      )
      for document in text
  ]

  # Pull the two fields the model call needs out of every encoding.
  ids_per_document = [encoding['input_ids'] for encoding in encodings]
  masks_per_document = [encoding['attention_mask'] for encoding in encodings]

  return encodings, ids_per_document, masks_per_document

In [None]:
def interpretation(tokenizer, input_ids, attention_weights):
  """Turn attention maps into one importance score per inner token.

  For every entry of ``attention_weights`` the heads are summed (``dim=1``),
  the first and last positions are dropped on both axes, and the remaining
  square matrix is summed over axis 0, i.e. the attention each token receives
  from the other kept tokens. The per-layer vectors are then added together.

  Args:
    tokenizer: object exposing ``convert_ids_to_tokens``.
    input_ids: token ids of a single example; ``squeeze()`` is applied, so a
      leading batch dimension of size 1 is accepted.
    attention_weights: iterable of attention tensors, one per layer.
      Assumed to be shaped (1, heads, seq, seq) -- TODO confirm against caller.

  Returns:
    ``(valid_tokens, importance_scores)``: the token strings without the first
    and last position (presumably the special tokens added at encoding time)
    and a NumPy array with one score per returned token. If
    ``attention_weights`` is empty the scores are all zero.
  """
  # The token strings do not depend on the layer, so resolve them once up
  # front. This also keeps them defined when there are no attention layers.
  token_ids = input_ids.squeeze().tolist()
  valid_tokens = tokenizer.convert_ids_to_tokens(token_ids)[1:-1]

  layer_scores = []
  for attention in attention_weights:
    head_sum = torch.sum(attention, dim=1)
    # .cpu() makes the NumPy conversion work for tensors living on another
    # device; it is a no-op for CPU tensors.
    head_sum = head_sum.squeeze().detach().cpu().numpy()

    # Drop the first/last position on both the query and the key axis.
    inner_attention = head_sum[1:-1, 1:-1]
    layer_scores.append(np.sum(inner_attention, axis=0))

  if not layer_scores:
    return valid_tokens, np.zeros(len(valid_tokens))

  importance_scores = np.sum(layer_scores, axis=0)

  return valid_tokens, importance_scores

In [None]:
def evaluation(tokenizer, model, input_ids, attention_mask):
  """Predict a class and compute token importances for every example.

  Each example is run through ``model`` on its own, without gradient tracking
  and with attentions requested. The predicted class is the argmax of the
  softmaxed logits; the attentions are handed to ``interpretation``.

  Args:
    tokenizer: forwarded to ``interpretation``.
    model: callable returning an object with ``logits`` and ``attentions``.
    input_ids: sequence of per-example id tensors.
    attention_mask: sequence of per-example masks, aligned with ``input_ids``.

  Returns:
    ``(pred, valid_tokens, importance_scores)``: three lists with one entry
    per example.
  """
  pred, valid_tokens, importance_scores = [], [], []

  for position, example_ids in enumerate(input_ids):
    example_mask = attention_mask[position]

    with torch.no_grad():
      outputs = model(example_ids, attention_mask=example_mask, output_attentions=True)

    # Softmax over the class dimension, then pick the most probable class.
    class_probabilities = torch.softmax(outputs.logits, dim=1)
    pred.append(torch.argmax(class_probabilities, dim=1).item())

    tokens, scores = interpretation(tokenizer, example_ids, outputs.attentions)
    valid_tokens.append(tokens)
    importance_scores.append(scores)

  return pred, valid_tokens, importance_scores

In [None]:
def alpha_filter(token, score_y, score_s):
    """Keep only purely alphabetic tokens and their aligned scores.

    A token is kept when ``str.isalpha()`` is true for it, so anything
    containing digits, punctuation or markers such as ``##`` is dropped.
    The function works on its arguments only and does not read notebook
    globals.

    Args:
        token: list of token lists, one inner list per example.
        score_y: list of score sequences aligned with ``token``.
        score_s: second list of score sequences aligned with ``token``.

    Returns:
        ``(new_tokens, new_scores_y, new_scores_s)``: three lists of lists,
        one inner list per example, containing only the kept positions.
    """
    new_tokens = []
    new_scores_y = []
    new_scores_s = []

    for doc_tokens, doc_scores_y, doc_scores_s in zip(token, score_y, score_s):
        # Filter the three aligned sequences together so positions stay in sync.
        kept = [
            (t, y, s)
            for t, y, s in zip(doc_tokens, doc_scores_y, doc_scores_s)
            if t.isalpha()
        ]
        new_tokens.append([t for t, _, _ in kept])
        new_scores_y.append([y for _, y, _ in kept])
        new_scores_s.append([s for _, _, s in kept])

    return new_tokens, new_scores_y, new_scores_s


def topK_extractor(token, score, percentile):
    """Select the highest-scoring ``percentile`` percent of tokens per example.

    For each example the number of tokens to keep is
    ``int(len(scores) * percentile / 100)``, clamped to ``[1, len(scores)]``.
    The clamp makes a small percentile on a short sequence keep the single
    best token instead of producing index ``-1`` (which would select the
    minimum score as threshold and therefore every token). Every token whose
    score is >= the k-th largest score is kept, so ties at the threshold are
    all included. Original token order is preserved.

    Args:
        token: list of token lists, one per example.
        score: list of score sequences aligned with ``token``.
        percentile: percentage (0-100) of tokens to keep.

    Returns:
        ``(t_new, s_new)``: the kept tokens and their scores, one inner list
        per example. An example without scores yields two empty lists.
    """
    t_new = []
    s_new = []

    for t, s in zip(token, score):
        scores = list(s)

        if not scores:
            # Nothing to rank (e.g. every token was filtered out earlier).
            t_new.append([])
            s_new.append([])
            continue

        keep_count = int(len(scores) * percentile / 100)
        keep_count = min(max(keep_count, 1), len(scores))
        threshold = sorted(scores, reverse=True)[keep_count - 1]

        kept = [(t[i], x) for i, x in enumerate(scores) if x >= threshold]
        t_new.append([tok for tok, _ in kept])
        s_new.append([x for _, x in kept])

    return t_new, s_new


def jaccard_similarity(arr1, arr2):
    """Compute the Jaccard similarity of paired collections.

    Each pair ``(arr1[i], arr2[i])`` is converted to sets and scored as
    ``|A & B| / |A | B|``. When both sets are empty the union is empty too;
    such a pair is scored 1.0 (two identical sets) instead of dividing by
    zero.

    Args:
        arr1: sequence of iterables of hashable items.
        arr2: sequence of iterables of hashable items, paired with ``arr1``.

    Returns:
        ``(sim, mean)``: the list of per-pair similarities and their
        ``np.mean``. With no pairs at all, ``np.mean`` of the empty list is
        NaN.
    """
    sim = []

    for items_a, items_b in zip(arr1, arr2):
        set_a = set(items_a)
        set_b = set(items_b)
        union = set_a | set_b

        if union:
            similarity = len(set_a & set_b) / len(union)
        else:
            # Both sides empty: treat as identical rather than 0/0.
            similarity = 1.0

        sim.append(similarity)

    return sim, np.mean(sim)

In [None]:
def demographic_parity(label, sensitive_att):
  """Absolute gap in positive-label rate between the two sensitive groups.

  Examples are bucketed by ``sensitive_att`` (0 or 1) and ``label`` (0 or 1);
  pairs with any other value are not counted. The result is
  ``|P(label=1 | att=0) - P(label=1 | att=1)|``.

  Args:
    label: indexable sequence of 0/1 outcomes.
    sensitive_att: indexable sequence of 0/1 group ids, aligned with ``label``.

  Returns:
    The absolute difference between the two positive rates.

  Raises:
    ZeroDivisionError: if either group has no counted examples.
  """
  pairs = [(label[i], sensitive_att[i]) for i in range(len(label))]

  def count(outcome, group):
    # Number of examples with exactly this (label, group) combination.
    return sum(1 for lab, att in pairs if lab == outcome and att == group)

  positives_g0, negatives_g0 = count(1, 0), count(0, 0)
  positives_g1, negatives_g1 = count(1, 1), count(0, 1)

  rate_g0 = positives_g0 / (positives_g0 + negatives_g0)
  rate_g1 = positives_g1 / (positives_g1 + negatives_g1)

  return abs(rate_g0 - rate_g1)


def euqal(label_flat, pred_flat, test_gender):
    """Compute equalized-odds and equal-opportunity gaps between two groups.

    Examples with ``test_gender == 0`` form the first group, all others the
    second. For each group a binary confusion matrix is built and the true
    positive rate (TPR) and false positive rate (FPR) are derived from it.

    ``labels=[0, 1]`` is passed to ``confusion_matrix`` so the matrix is
    always 2x2; without it a group in which only one class occurs produces a
    1x1 matrix and the four-way unpacking fails.

    (The function name is kept as spelled because callers reference it.)

    Args:
        label_flat: ground-truth 0/1 labels.
        pred_flat: predicted 0/1 labels, aligned with ``label_flat``.
        test_gender: group id per example, aligned with ``label_flat``.

    Returns:
        ``(equality_of_odds, equality_of_opportunity)`` where the first is
        ``|TPR_m - TPR_f| + |FPR_m - FPR_f|`` and the second is
        ``|TPR_m - TPR_f|``.
        NOTE(review): a group with no positives (or no negatives) has a zero
        denominator; with NumPy integer counts this presumably yields NaN
        rather than an exception -- confirm the desired handling.
    """
    m_true, m_pred = [], []
    f_true, f_pred = [], []

    for truth, guess, gender in zip(label_flat, pred_flat, test_gender):
        if gender == 0:
            m_true.append(truth)
            m_pred.append(guess)
        else:
            f_true.append(truth)
            f_pred.append(guess)

    tn_m, fp_m, fn_m, tp_m = confusion_matrix(m_true, m_pred, labels=[0, 1]).ravel()
    tn_f, fp_f, fn_f, tp_f = confusion_matrix(f_true, f_pred, labels=[0, 1]).ravel()

    TPR_m = tp_m / (tp_m + fn_m)
    FPR_m = fp_m / (fp_m + tn_m)

    TPR_f = tp_f / (tp_f + fn_f)
    FPR_f = fp_f / (fp_f + tn_f)

    equality_of_odds = abs(TPR_m - TPR_f) + abs(FPR_m - FPR_f)
    equality_of_opportunity = abs(TPR_m - TPR_f)

    return equality_of_odds, equality_of_opportunity


def metrics(label, pred):
  """Compute standard binary-classification metrics.

  ``labels=[0, 1]`` is passed to ``confusion_matrix`` so that the matrix is
  always 2x2 and can be unpacked into four counts even when only one class
  occurs in ``label`` and ``pred``.

  Args:
    label: ground-truth 0/1 labels.
    pred: predicted 0/1 labels, aligned with ``label``.

  Returns:
    A tuple ``(accuracy, f1_micro, f1_macro, f1_weighted, tn, fp, fn, tp,
    precision, recall)``. Precision is ``tp / (tp + fp)`` and recall is
    ``tp / (tp + fn)``, both computed for class 1.
    NOTE(review): when the denominator is zero (no predicted or no actual
    positives) the ratio on NumPy integer counts presumably evaluates to
    NaN -- confirm the desired handling.
  """
  accuracy = accuracy_score(label, pred)

  # F1 under the three averaging schemes reported by the notebook.
  f1_micro = f1_score(label, pred, average="micro")
  f1_macro = f1_score(label, pred, average="macro")
  f1_weighted = f1_score(label, pred, average="weighted")

  tn, fp, fn, tp = confusion_matrix(label, pred, labels=[0, 1]).ravel()
  precision = tp / (tp + fp)
  recall = tp / (tp + fn)

  return accuracy, f1_micro, f1_macro, f1_weighted, tn, fp, fn, tp, precision, recall

In [None]:
# Configuration: evaluation CSV, the two saved model directories, and the
# maximum token length handed to the tokenizer as `max_length`.
dataset_dir = 'toxic/test.csv'
model_y_dir = 'toxic/models/bert/adhe/model'
model_s_dir = 'toxic/models/old/dp-20/model_toxic_s'
leng = 128

# The CSV must provide `text`, `label` and `gender` columns.
main_df = pd.read_csv(filepath_or_buffer=dataset_dir)
text = main_df.text.values.tolist()
y = main_df.label.values.tolist()
s = main_df.gender.values.tolist()

# Model Y: its predictions are scored against `label` below.
tokenizer_y = BertTokenizer.from_pretrained(model_y_dir)
model_y = BertForSequenceClassification.from_pretrained(model_y_dir, num_labels = 2, output_attentions=True)

# Model S: its predictions are scored against `gender` below.
tokenizer_s = BertTokenizer.from_pretrained(model_s_dir)
model_s = BertForSequenceClassification.from_pretrained(model_s_dir, num_labels = 2, output_attentions=True)

# Each model's own tokenizer encodes the same texts.
input_tokens_y, input_ids_y, attention_mask_y = input_tokenizer(tokenizer_y, text, leng)
input_tokens_s, input_ids_s, attention_mask_s = input_tokenizer(tokenizer_s, text, leng)

# Per-example predictions plus attention-based token importance scores.
pred_y, valid_tokens_y, importance_scores_y = evaluation(tokenizer_y, model_y, input_ids_y, attention_mask_y)
pred_s, valid_tokens_s, importance_scores_s = evaluation(tokenizer_s, model_s, input_ids_s, attention_mask_s)

# Classification quality of each model on its own target.
accuracy_y, f1_micro_y, f1_macro_y, f1_weighted_y, tn_y, fp_y, fn_y, tp_y, precision_y, recall_y = metrics(y, pred_y)
accuracy_s, f1_micro_s, f1_macro_s, f1_weighted_s, tn_s, fp_s, fn_s, tp_s, precision_s, recall_s = metrics(s, pred_s)
# Fairness of model Y with respect to `gender`: demographic-parity gap of the
# ground-truth labels (rd_true) and of the predictions (rd_pred), then the
# equalized-odds / equal-opportunity gaps of the predictions.
rd_true = demographic_parity(y, s)
rd_pred = demographic_parity(pred_y, s)
odds, opp = euqal(y, pred_y, s)

# NOTE(review): f1_macro and f1_weighted are printed with 3 decimals while the
# other metrics use 6 -- confirm this is intentional.
print("For model Y")
print("accuracy: %.6f"%accuracy_y)
print("f1_micro: %.6f, f1_macro: %.3f, f1_weighted: %.3f"%(f1_micro_y, f1_macro_y, f1_weighted_y))
print("precision: %.6f"%precision_y)
print("recall: %.6f"%recall_y)
print("tp:",tp_y, "fp:",fp_y, "tn:",tn_y, "fn:",fn_y)

print("\nFor model S")
print("accuracy: %.6f"%accuracy_s)
print("f1_micro: %.6f, f1_macro: %.3f, f1_weighted: %.3f"%(f1_micro_s, f1_macro_s, f1_weighted_s))
print("precision: %.6f"%precision_s)
print("recall: %.6f"%recall_s)
print("tp:",tp_s, "fp:",fp_s, "tn:",tn_s, "fn:",fn_s)

print("\nFairness Metrics")
print("\nDemographic Parity")
print("Before prediction: %.6f"%rd_true)
print("After prediction: %.6f"%rd_pred)

print("\nEqualized Odds: %.6f"%odds)
print("Equal Opportunity: %.6f"%opp)

In [None]:
# Drop non-alphabetic tokens and the matching scores of both models.
# NOTE: this rebinds importance_scores_y / importance_scores_s to the filtered
# versions, so the cell is meant to run once after the evaluation cell.
valid_tokens, importance_scores_y, importance_scores_s = alpha_filter(
    valid_tokens_y, importance_scores_y, importance_scores_s
)

# Fractions of top-ranked tokens at which the two models are compared.
top = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# Top-k token lists per fraction, for model Y and model S respectively.
jaccard_ty = [
    topK_extractor(valid_tokens, importance_scores_y, fraction * 100)[0]
    for fraction in top
]
jaccard_ts = [
    topK_extractor(valid_tokens, importance_scores_s, fraction * 100)[0]
    for fraction in top
]

# Anchor the curve at (0.0, 0) before integrating.
top.insert(0, 0.0)

jaccard = [0] + [
    jaccard_similarity(tokens_y, tokens_s)[1]
    for tokens_y, tokens_s in zip(jaccard_ty, jaccard_ts)
]

print(jaccard)
# Area under the mean-Jaccard-vs-fraction curve (trapezoidal rule).
print("AUSC:", trapz(np.array(jaccard), top))