In [18]:
import json
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from collections import Counter, defaultdict
from experiments import HateXplainExperiments
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
f = open('dataset.json')
data = json.load(f)

vectorizer = pickle.load(open('models/vectorizer.pkl', 'rb'))

description = "Exponential function applied to tokens in the TF-IDF vectorised training matrix (1) tagged as not indicative in offensive and hate-speech related human rationales and (2) and not unique to sentences labelled as normal [b]"
hatexplain = HateXplainExperiments(description)
dataset = hatexplain.prepare_dataset()
hatexplain.prepare_properties(dataset)
X_train_vect_experiment_B = hatexplain.preprocess_training_data_option_two()
best_algo = hatexplain.get_performance_metrics(X_train_vect_experiment_B)

# description = "No injection"
# hatexplain = HateXplainExperiments(description)
# dataset = hatexplain.prepare_dataset()
# X_train_vect_no_injection = hatexplain.prepare_properties(dataset)
# best_algo = hatexplain.get_performance_metrics()

{'nb__alpha': 0.01, 'nb__class_prior': None, 'nb__norm': False, 'sdg__alpha': 0.0001, 'sdg__average': True, 'sdg__class_weight': {0: 1.085606519835657, 1: 0.8211319985735391, 2: 1.1614065427295}, 'sdg__learning_rate': 'optimal', 'sdg__max_iter': 1000}
{'Description': ['Exponential function applied to tokens in the TF-IDF vectorised training matrix (1) tagged as not indicative in offensive and hate-speech related human rationales and (2) and not unique to sentences labelled as normal [b]'], 'CV score': [0.6443101459808495], 'Accuracy': [0.6461538461538462], 'Balanced accuracy': [0.6210725722935097], 'AUC-ROC score': [0.8026349808176313], 'F1 score': [0.6291350326357898], 'Precision': [0.6412326891866073], 'Recall': [0.6461538461538462]}


In [3]:
cleaned_dataset = []

for k,v in data.items():
    
    individual_data = {}
    individual_data['post_id'] = v['post_id']
    individual_data['post_tokens'] = v['post_tokens']
    individual_data['post_tokens_joined'] = ' '.join(v['post_tokens'])
    individual_data['target1'] = v['annotators'][0]['target']
    individual_data['target2'] = v['annotators'][1]['target']
    individual_data['target3'] = v['annotators'][2]['target']
    individual_data['annotator_a_label'] = v['annotators'][0]['label']
    individual_data['annotator_b_label'] = v['annotators'][1]['label']
    individual_data['annotator_c_label'] = v['annotators'][2]['label']
    
    labels = [v['annotators'][0]['label'], v['annotators'][1]['label'], v['annotators'][2]['label']]
    final_label = max(labels, key=labels.count)
    individual_data['final_label'] = final_label
    
    if set(labels) != 3:
    
        if len(v['rationales']) == 0: 
            individual_data['annotator_a_rationale'] = []
            individual_data['annotator_b_rationale'] = []
            individual_data['annotator_c_rationale'] = []
            individual_data['final_rationale'] = []
            
        else:  
            individual_data['annotator_a_rationale'] = v['rationales'][0]
            individual_data['annotator_b_rationale'] = v['rationales'][1]
        
            if final_label != v['annotators'][1]['label']:
                individual_data['final_rationale'] = v['rationales'][0]
            else:
                individual_data['final_rationale'] = v['rationales'][1]
                       
        cleaned_dataset.append(individual_data)

In [4]:
df = pd.DataFrame(cleaned_dataset)
df.head()

Unnamed: 0,post_id,post_tokens,post_tokens_joined,target1,target2,target3,annotator_a_label,annotator_b_label,annotator_c_label,final_label,annotator_a_rationale,annotator_b_rationale,annotator_c_rationale,final_rationale
0,1179055004553900032_twitter,"[i, dont, think, im, getting, my, baby, them, ...",i dont think im getting my baby them white 9 h...,[None],[None],[None],normal,normal,normal,normal,[],[],[],[]
1,1179063826874032128_twitter,"[we, cannot, continue, calling, ourselves, fem...",we cannot continue calling ourselves feminists...,[None],[None],[None],normal,normal,normal,normal,[],[],[],[]
2,1178793830532956161_twitter,"[nawt, yall, niggers, ignoring, me]",nawt yall niggers ignoring me,[African],[None],[African],normal,normal,hatespeech,normal,[],[],[],[]
3,1179088797964763136_twitter,"[<user>, i, am, bit, confused, coz, chinese, p...",<user> i am bit confused coz chinese ppl can n...,[Asian],[Asian],[Asian],hatespeech,offensive,hatespeech,hatespeech,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,1179085312976445440_twitter,"[this, bitch, in, whataburger, eating, a, burg...",this bitch in whataburger eating a burger with...,"[Caucasian, Women]","[Women, Caucasian]","[Women, Caucasian]",hatespeech,hatespeech,offensive,hatespeech,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [5]:
train_indices, test_indices = train_test_split(df.index, test_size=0.2, random_state=42)
df_train = df.loc[train_indices]
df_test = df.loc[test_indices]

In [6]:
# BPSN (Background Positive, Subgroup Negative) AUC
# BNSP (Background Negative, Subgroup Positive) AUC
# GMB-Subgroup-AUC
# GMB-BPSN-AUC
# GMB-BNSP-AUC

In [7]:
# From https://github.com/hate-alert/HateXplain/blob/master/Bias_Calculation_NB.ipynb

In [8]:
def generate_target_information(dataset):
    
    final_target_output = defaultdict(list)
    all_communities_selected = []
    
    for each in dataset.iterrows(): 
        
        # All the target communities tagged for this post
        all_targets = each[1]['target1'] + each[1]['target2'] + each[1]['target3']  
        community_dict = dict(Counter(all_targets))
        
        # Select only those communities which are present more than once.
        for key in community_dict:
            if community_dict[key]>1:  
                final_target_output[each[1]['post_id']].append(key)
                all_communities_selected.append(key)
        
        # If no community is selected based on majority voting then we don't select any community
        if each[1]['post_id'] not in final_target_output:
            final_target_output[each[1]['post_id']].append('None')
            all_communities_selected.append(key)

    return final_target_output, all_communities_selected

In [9]:
target_information, all_communities_selected = generate_target_information(df_test)

In [10]:
community_count_dict = Counter(all_communities_selected)

# We remove None and Other from dictionary
community_count_dict.pop('None')
community_count_dict.pop('Other')

# For the bias calculation, we are considering the top 10 communites based on their count
list_selected_community = [community for community, value in community_count_dict.most_common(10)]
list_selected_community

['African',
 'Islam',
 'Jewish',
 'Homosexual',
 'Women',
 'Refugee',
 'Arab',
 'Caucasian',
 'Asian',
 'Hispanic']

In [11]:
final_target_information = {}
for each in target_information:
    temp = list(set(target_information[each]) & set(list_selected_community))
    if len(temp) == 0:
        final_target_information[each] = None
    else:
        final_target_information[each] = temp

In [12]:
df_test['final_target_category'] = df_test['post_id'].map(final_target_information) 

In [13]:
# The bias methods that will be considered

method_list = ['subgroup', 'bpsn', 'bnsp']
community_list = list(list_selected_community)

In [14]:
sentences_vect = vectorizer.transform(df_test['post_tokens_joined'])
df_test['classification'] = best_algo.predict(sentences_vect)

df_test['final_label'] = df_test['final_label'].replace({'offensive': 'toxic', 'hatespeech': 'toxic', 'normal': 'non-toxic'})
df_test['classification'] = df_test['classification'].replace({'offensive': 'toxic', 'hatespeech': 'toxic', 'normal': 'non-toxic'})

In [15]:
# Function to divide the ids into postive or class class based on the method.
def bias_evaluation_metric(dataset, method, community):
    
    positive_ids = []
    negative_ids = []
    
    if method=='subgroup':
        for eachrow in dataset.iterrows():
            if eachrow[1]['final_target_category'] == None:
                continue
            if community in eachrow[1]['final_target_category']:
                if eachrow[1]['final_label'] =='non-toxic':
                    negative_ids.append(eachrow[1]['post_id'])
                else:
                    positive_ids.append(eachrow[1]['post_id'])
            else:
                pass
    
    elif method=='bpsn':
        for eachrow in dataset.iterrows():
            if eachrow[1]['final_target_category'] == None:
                continue
            if community in eachrow[1]['final_target_category']:
                if eachrow[1]['final_label'] =='non-toxic':
                    negative_ids.append(eachrow[1]['post_id'])
                else:
                    pass
            else:
                if eachrow[1]['final_label'] !='non-toxic':
                    positive_ids.append(eachrow[1]['post_id'])
                else:
                    pass
    
    elif method=='bnsp':
        for eachrow in dataset.iterrows():
            if eachrow[1]['final_target_category'] == None:
                continue
            if community in eachrow[1]['final_target_category']:
                if eachrow[1]['final_label'] !='non-toxic':
                    positive_ids.append(eachrow[1]['post_id'])
                else:
                    pass
            else:
                if eachrow[1]['final_label'] =='non-toxic':
                    negative_ids.append(eachrow[1]['post_id'])
                else:
                    pass
                
    return {'positiveID':positive_ids, 'negativeID':negative_ids}

In [16]:
final_bias_dictionary = defaultdict(lambda: defaultdict(dict))

# We compute the bias score using each method for all the community
for each_method in method_list:
    for each_community in community_list:
        community_data = bias_evaluation_metric(df_test, each_method, each_community)
        
        truth_values = []
        prediction_values = []

        label_to_value = {'toxic': 1.0, 'non-toxic': 0.0}
        
        for each in community_data['positiveID']:
            truth_values.append(label_to_value[df_test[df_test['post_id'] == each]['final_label'].iloc[0]])
            prediction_values.append(label_to_value[df_test[df_test['post_id'] == each]['classification'].iloc[0]])

        for each in community_data['negativeID']:
            truth_values.append(label_to_value[df_test[df_test['post_id'] == each]['final_label'].iloc[0]])
            prediction_values.append(label_to_value[df_test[df_test['post_id'] == each]['classification'].iloc[0]])

        roc_output_value = roc_auc_score(truth_values, prediction_values)
        final_bias_dictionary[each_method][each_community] = roc_output_value

In [17]:
# To combine the per-identity Bias AUCs into one overall measure, we calculate their generalized mean as defined below:
power_value = -5
num_communities = len(community_list)


for each_method in final_bias_dictionary:
    temp_value =[]
    for each_community in final_bias_dictionary[each_method]:
        temp_value.append(pow(final_bias_dictionary[each_method][each_community], power_value))
    print(each_method, pow(np.sum(temp_value)/num_communities, 1/power_value))

subgroup 0.6712524178525073
bpsn 0.650849979794217
bnsp 0.6487227864376864
