In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from itertools import combinations_with_replacement  
from itertools import product
from tqdm import tqdm

In [2]:
# load data split
split_folder = ""
test = pd.read_csv(split_folder+"test.tsv", sep="\t", na_values='x')
train = pd.read_csv(split_folder+"train.tsv", sep="\t", na_values='x')
valid = pd.read_csv(split_folder+"valid.tsv", sep="\t", na_values='x')

# Load original dataset
original_dataset_dir = ""
full_corpus = pd.read_csv(original_dataset_dir+"corpus.tsv", sep="\t", na_values='x')

In [3]:
train_pkl_df = pd.read_pickle("train_set.pkl")
valid_pkl_df = pd.read_pickle("valid_set.pkl")
test_pkl_df = pd.read_pickle("test_set.pkl")

In [4]:
test_pkl_df.head()

Unnamed: 0,tid,uid,text_encoding,date,gender,age,city,state,country,ethnicity,label,text_hidden,text,target_gender,target_race,target_religion
0,-3610489237063955101,8952678256548929855,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,2017-4-4,,,,,,,1,"[0.47649562, 0.030928096, -0.41260943, 0.77192...",ebl 4 pcs 600mah 9v li-ion rechargeable batter...,,,
1,-4719811451085656482,7217783508079414686,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,2015-5-27,0.0,1.0,,,0.0,0.0,0,"[0.38377494, 0.42311928, -0.041753568, 0.20655...",hashtag user url,,,
2,-7441423804391677032,-2293180168487378857,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,2015-2-4,1.0,0.0,Independence,Kansas,1.0,0.0,1,"[0.7557561, 0.2791906, -0.3461117, 0.48534566,...",user feminazi . she changed it sarcastically f...,female,,
3,5036602591935647032,-6373731656100463832,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,2017-4-7,,,,,,,1,"[0.31144798, 0.06574376, -0.1957732, 0.3786313...",hashtag as515blu - lt-pk $ 1500 each ( cash / ...,,,
4,8231585118909995298,-1989034984099107849,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,2017-4-5,,,,,,,1,"[0.40854383, 0.20999284, -0.23556745, 0.486343...",help to save foxes and dogs from horrific crue...,,,


In [5]:
test.head()

Unnamed: 0,tid,uid,text,date,gender,age,city,state,country,ethnicity,label
0,-3610489237063955101,8952678256548929855,ebl 4 pcs 600mah 9v li-ion rechargeable batter...,2017-4-4,,,,,,,1
1,-4719811451085656482,7217783508079414686,hashtag user url,2015-5-27,0.0,1.0,,,0.0,0.0,0
2,-7441423804391677032,-2293180168487378857,user feminazi . she changed it sarcastically f...,2015-2-4,1.0,0.0,Independence,Kansas,1.0,0.0,1
3,5036602591935647032,-6373731656100463832,hashtag as515blu - lt-pk $ 1500 each ( cash / ...,2017-4-7,,,,,,,1
4,8231585118909995298,-1989034984099107849,help to save foxes and dogs from horrific crue...,2017-4-5,,,,,,,1


In [6]:
len(full_corpus)

83077

In [7]:
id2label = {
    "gender":["male", "female"],
    "age":["Over", "Under"],
    "country":["NotIn", "InUS"],
    "ethnicity":["White", "Other"]
}

In [8]:
tasks = ['gender', 'age', 'country', 'ethnicity']

def main_label_dist(df):
    label_dist = Counter(df["label"])
    label_dist = np.array([label_dist[0], label_dist[1]])
    label_dist = label_dist/sum(label_dist)
    return label_dist

def full_label_data(df, tasks):
    selected_rows = np.array([True]*len(df))
    for task in tasks:
        selected_rows = selected_rows & df[task].notnull().to_numpy()
    return selected_rows

def single_analysis(in_df, 
                    tasks = ['gender', 'age', 'country', 'ethnicity'],
                    all_labels = False
                    ):
    # save dist at different level
    dist_results = {}    
    
    if all_labels:

        df = in_df[full_label_data(in_df, tasks)]
    else:
        df = in_df

    overall_dist = main_label_dist(df)
    dist_results["overall"] = (overall_dist, len(df))
    
    '''fairness gaps'''
    for task in tasks:
        # filter out the one does not have attributes
        task_df = df[df[task].notnull()]
        dist_results[task] = (main_label_dist(task_df), len(task_df))

        # get the unique types of demographic groups
        uniq_types = task_df[task].unique()
        for group in uniq_types:
            # calculate group specific confusion matrix
            group_df = task_df[task_df[task] == group]
            group_key = "_".join([task, 
                                id2label[task][int(group)] if task in id2label.keys() else str(group)
                                ])
            dist_results[group_key] = (main_label_dist(group_df), len(group_df))
    return dist_results

In [9]:
# main label distribution
single_analysis(full_corpus)

{'overall': (array([0.62974108, 0.37025892]), 83077),
 'gender': (array([0.63377179, 0.36622821]), 47211),
 'gender_female': (array([0.63247767, 0.36752233]), 26763),
 'gender_male': (array([0.63546557, 0.36453443]), 20448),
 'age': (array([0.63326407, 0.36673593]), 44757),
 'age_Under': (array([0.62439711, 0.37560289]), 24880),
 'age_Over': (array([0.64436283, 0.35563717]), 19877),
 'country': (array([0.67253094, 0.32746906]), 51953),
 'country_InUS': (array([0.65189113, 0.34810887]), 32441),
 'country_NotIn': (array([0.70684707, 0.29315293]), 19512),
 'ethnicity': (array([0.63458454, 0.36541546]), 46755),
 'ethnicity_White': (array([0.68858981, 0.31141019]), 26897),
 'ethnicity_Other': (array([0.5614362, 0.4385638]), 19858)}

In [10]:
single_analysis(full_corpus, all_labels=True)

{'overall': (array([0.66865459, 0.33134541]), 32362),
 'gender': (array([0.66865459, 0.33134541]), 32362),
 'gender_female': (array([0.67731376, 0.32268624]), 18941),
 'gender_male': (array([0.65643395, 0.34356605]), 13421),
 'age': (array([0.66865459, 0.33134541]), 32362),
 'age_Under': (array([0.68269781, 0.31730219]), 17466),
 'age_Over': (array([0.65218851, 0.34781149]), 14896),
 'country': (array([0.66865459, 0.33134541]), 32362),
 'country_InUS': (array([0.64872495, 0.35127505]), 21960),
 'country_NotIn': (array([0.71072871, 0.28927129]), 10402),
 'ethnicity': (array([0.66865459, 0.33134541]), 32362),
 'ethnicity_White': (array([0.71929913, 0.28070087]), 19861),
 'ethnicity_Other': (array([0.58819294, 0.41180706]), 12501)}

In [11]:
# main label distribution in train set
single_analysis(train)

{'overall': (array([0.63203396, 0.36796604]), 57951),
 'gender': (array([0.63423013, 0.36576987]), 32895),
 'gender_female': (array([0.6299166, 0.3700834]), 18585),
 'gender_male': (array([0.63983229, 0.36016771]), 14310),
 'age': (array([0.63358485, 0.36641515]), 31145),
 'age_Under': (array([0.62507942, 0.37492058]), 17313),
 'age_Over': (array([0.64423077, 0.35576923]), 13832),
 'country': (array([0.67275026, 0.32724974]), 36382),
 'country_InUS': (array([0.65147111, 0.34852889]), 22704),
 'country_NotIn': (array([0.70807136, 0.29192864]), 13678),
 'ethnicity': (array([0.63500245, 0.36499755]), 32592),
 'ethnicity_White': (array([0.68690419, 0.31309581]), 18777),
 'ethnicity_Other': (array([0.56445892, 0.43554108]), 13815)}

In [12]:
# main label distribution in dev set
single_analysis(valid)

{'overall': (array([0.62632387, 0.37367613]), 12369),
 'gender': (array([0.63647959, 0.36352041]), 7056),
 'gender_male': (array([0.63144076, 0.36855924]), 3047),
 'gender_female': (array([0.6403093, 0.3596907]), 4009),
 'age': (array([0.63694647, 0.36305353]), 6707),
 'age_Over': (array([0.65061036, 0.34938964]), 3031),
 'age_Under': (array([0.62568009, 0.37431991]), 3676),
 'country': (array([0.67516586, 0.32483414]), 7687),
 'country_NotIn': (array([0.70687237, 0.29312763]), 2852),
 'country_InUS': (array([0.65646329, 0.34353671]), 4835),
 'ethnicity': (array([0.63698925, 0.36301075]), 6975),
 'ethnicity_Other': (array([0.55810856, 0.44189144]), 3003),
 'ethnicity_White': (array([0.69662638, 0.30337362]), 3972)}

In [13]:
# main label distribution in test set
single_analysis(test)

{'overall': (array([0.63618849, 0.36381151]), 12457),
 'gender': (array([0.64480565, 0.35519435]), 7075),
 'gender_male': (array([0.63618486, 0.36381514]), 3051),
 'gender_female': (array([0.65134195, 0.34865805]), 4024),
 'age': (array([0.64396423, 0.35603577]), 6710),
 'age_Under': (array([0.63983849, 0.36016151]), 3715),
 'age_Over': (array([0.6490818, 0.3509182]), 2995),
 'country': (array([0.68176569, 0.31823431]), 7793),
 'country_NotIn': (array([0.71722278, 0.28277722]), 2967),
 'country_InUS': (array([0.65996685, 0.34003315]), 4826),
 'ethnicity': (array([0.6465468, 0.3534532]), 7008),
 'ethnicity_White': (array([0.69758858, 0.30241142]), 4064),
 'ethnicity_Other': (array([0.57608696, 0.42391304]), 2944)}

In [14]:
def task_comb_data(df, 
                task_combs, 
                conditions,):
    selected_rows = np.array([True]*len(df))
    for task, condition in zip(task_combs, conditions):
        selected_rows = selected_rows & (df[task].to_numpy()==condition)
    return selected_rows

def get_all_combs(unique_type_list):
    numer_tasks = len(unique_type_list)
    no_unique_types = [len(unique_type) for unique_type in unique_type_list]+[1]
    total_number = np.prod(no_unique_types)
    # print(numer_tasks, total_number)
    
    # init 2d matrix
    group_combinations = [[None for j in range(numer_tasks)] for i in range(total_number)]

    for single_task_id, single_task_types in enumerate(unique_type_list):
        # print(single_task_id, single_task_types)

        # calculate single repeat time
        single_repeat_time = int(np.prod(no_unique_types[single_task_id+1:]))
        # calculate whole list repeat time
        whole_repeat_time = int(total_number/single_repeat_time/len(single_task_types))
        # print(single_repeat_time, whole_repeat_time)

        # create col number
        task_col = []
        # single repeat
        for t_type in single_task_types:
            task_col = task_col + [t_type]*single_repeat_time
        # whole repeat
        task_col = task_col * whole_repeat_time
        # print(task_col)

        # add to matrix
        for i, v in enumerate(task_col):
            group_combinations[i][single_task_id] = v
    return group_combinations


def cpmbination_analysis(in_df,
                         task_combs,
                         tasks = ['gender', 'age', 'country', 'ethnicity'],
                         all_labels = False
                         ):
    # save dist at different level
    dist_results = {}    
    
    if all_labels:
        df = in_df[full_label_data(in_df, tasks)]
    else:
        df = in_df

    overall_dist = main_label_dist(df)
    dist_results["overall"] = (overall_dist, len(df))
    
    '''fairness gaps'''
    for task_comb in task_combs:
        # group_combinations = [p for p in product([0, 1], repeat=len(task_comb))]
        comb_uniq_types = [df[t].unique() for t in task_comb]

        group_combinations = get_all_combs(comb_uniq_types)


        for group_comb in group_combinations:
            if "nan" in [str(i) for i in group_comb]:
                continue
            group_df = df[task_comb_data(df, task_comb, group_comb)]
            # group_key = "_".join(task_comb+[str(i) for i in group_comb])
            group_key = "_".join(task_comb+
                                [id2label[task_comb[j]][int(i)] 
                                    if task_comb[j] in id2label.keys()
                                    else str(i) 
                                for j,i in enumerate(group_comb)]
                                )
            
            dist_results[group_key] = (main_label_dist(group_df), len(group_df))
    return dist_results

In [15]:
comb_dists_subset = cpmbination_analysis(in_df = full_corpus,
                                task_combs = [ ['gender'], 
                                                ['age'], 
                                                ['country'], 
                                                ['ethnicity'],
                                                ['gender', 'age'],
                                                ['gender', 'country'],
                                                ['gender', 'ethnicity'],
                                                ['age', 'country'],
                                                ['age', 'ethnicity'],
                                                ['country', 'ethnicity'],
                                                ['age', 'country', 'ethnicity'],
                                                ['gender', 'country', 'ethnicity'],
                                                ['gender', 'age', 'ethnicity'],
                                                ['gender', 'age', 'country'],
                                                ['gender', 'age', 'country', 'ethnicity']
                                                ],
                                tasks = ['gender', 'age', 'country', 'ethnicity'],
                                all_labels = True
                                )

In [16]:
comb_dists_full = cpmbination_analysis(in_df = full_corpus,
                                        task_combs = [ ['gender'], 
                                                        ['age'], 
                                                        ['country'], 
                                                        ['ethnicity'],
                                                        ['gender', 'age'],
                                                        ['gender', 'country'],
                                                        ['gender', 'ethnicity'],
                                                        ['age', 'country'],
                                                        ['age', 'ethnicity'],
                                                        ['country', 'ethnicity'],
                                                        ['age', 'country', 'ethnicity'],
                                                        ['gender', 'country', 'ethnicity'],
                                                        ['gender', 'age', 'ethnicity'],
                                                        ['gender', 'age', 'country'],
                                                        ['gender', 'age', 'country', 'ethnicity']
                                                        ],
                                        tasks = ['gender', 'age', 'country', 'ethnicity'],
                                        all_labels = False
                                        )

In [17]:
def sort_by_dist_gap(results):
    Gap_keys = []
    overall_hatespeech_percentage = results["overall"][0][1]
    for k in results.keys():
        k_hatespeech_percentage = results[k][0][1]
        k_gap = k_hatespeech_percentage-overall_hatespeech_percentage
        Gap_keys.append({"abs_gap" : abs(k_gap),
                        "gap" : k_gap,
                        "hatespeech_percentage" : k_hatespeech_percentage,
                        "key" : k}
                        )
    # return sorted(Gap_keys, reverse=True)
    return Gap_keys[1:]

In [18]:
comb_dists_full

{'overall': (array([0.62974108, 0.37025892]), 83077),
 'gender_female': (array([0.63247767, 0.36752233]), 26763),
 'gender_male': (array([0.63546557, 0.36453443]), 20448),
 'age_Under': (array([0.62439711, 0.37560289]), 24880),
 'age_Over': (array([0.64436283, 0.35563717]), 19877),
 'country_InUS': (array([0.65189113, 0.34810887]), 32441),
 'country_NotIn': (array([0.70684707, 0.29315293]), 19512),
 'ethnicity_White': (array([0.68858981, 0.31141019]), 26897),
 'ethnicity_Other': (array([0.5614362, 0.4385638]), 19858),
 'gender_age_female_Under': (array([0.65045593, 0.34954407]), 18424),
 'gender_age_female_Over': (array([0.58850911, 0.41149089]), 7519),
 'gender_age_male_Under': (array([0.55004648, 0.44995352]), 6454),
 'gender_age_male_Over': (array([0.67831998, 0.32168002]), 12357),
 'gender_country_female_InUS': (array([0.66450974, 0.33549026]), 15145),
 'gender_country_female_NotIn': (array([0.70862471, 0.29137529]), 4719),
 'gender_country_male_InUS': (array([0.62009569, 0.3799043

In [19]:
#  Estimate overall hate speech rate from all rows
comb_dists_full_gap = sort_by_dist_gap(comb_dists_full)
comb_dists_full_gap[:2]

[{'abs_gap': 0.0027365908281548057,
  'gap': -0.0027365908281548057,
  'hatespeech_percentage': 0.3675223255987744,
  'key': 'gender_female'},
 {'abs_gap': 0.0057244876319370586,
  'gap': -0.0057244876319370586,
  'hatespeech_percentage': 0.36453442879499215,
  'key': 'gender_male'}]

In [20]:
sorted_by_abs_gap_full = sorted([list(i.values()) for i in comb_dists_full_gap], reverse=True)
sorted_by_abs_gap_full_keys = [(i[-1], i[1]) for i in sorted_by_abs_gap_full]
for i in sorted_by_abs_gap_full_keys[:10]:
    print(i)

('gender_age_country_ethnicity_female_Under_InUS_White', -0.20694824697086225)
('gender_age_country_ethnicity_female_Under_InUS_Other', 0.1926353321816051)
('age_country_ethnicity_Under_InUS_White', -0.17979396481021928)
('age_country_ethnicity_Under_InUS_Other', 0.1767224725880504)
('gender_age_country_ethnicity_female_Over_InUS_White', 0.16157132229986126)
('gender_age_ethnicity_female_Under_Other', 0.15439652002023774)
('gender_age_country_ethnicity_male_Under_InUS_Other', 0.15165809117096674)
('gender_country_ethnicity_female_InUS_Other', 0.14316476038207104)
('gender_age_country_ethnicity_female_Over_NotIn_White', -0.13676455249778266)
('gender_age_country_female_Over_InUS', 0.13545319324557648)


In [21]:
#  Estimate overall hate speech rate from rows that contains all 4 labels
comb_dists_gap_subset = sort_by_dist_gap(comb_dists_subset)
comb_dists_gap_subset[:2]

[{'abs_gap': 0.008659168897582836,
  'gap': -0.008659168897582836,
  'hatespeech_percentage': 0.32268623620716963,
  'key': 'gender_female'},
 {'abs_gap': 0.012220648095456144,
  'gap': 0.012220648095456144,
  'hatespeech_percentage': 0.3435660532002086,
  'key': 'gender_male'}]

In [22]:
sorted_by_abs_gap_subset = sorted([list(i.values()) for i in comb_dists_gap_subset], reverse=True)
sorted_by_abs_gap_keys_subset = [(i[-1], i[1]) for i in sorted_by_abs_gap_subset]
for i in sorted_by_abs_gap_keys_subset[:10]:
    print(i)

('gender_age_country_ethnicity_female_Under_InUS_Other', 0.23154884350378185)
('age_country_ethnicity_Under_InUS_Other', 0.21563598391022715)
('gender_age_country_ethnicity_female_Over_InUS_White', 0.200484833622038)
('gender_country_ethnicity_female_InUS_Other', 0.19536578020409562)
('gender_age_country_ethnicity_male_Under_InUS_Other', 0.19057160249314348)
('gender_age_country_female_Over_InUS', 0.1744017213320292)
('gender_age_country_ethnicity_female_Under_InUS_White', -0.1680347356486855)
('gender_age_country_male_Under_InUS', 0.16451584834420302)
('gender_age_ethnicity_female_Under_Other', 0.1629639585786412)
('country_ethnicity_InUS_Other', 0.1484630339322654)


In [23]:
full_corpus

Unnamed: 0,tid,uid,text,date,gender,age,city,state,country,ethnicity,label
0,3760865421407572231,4926862304955734550,cisco had to deal with a fat cash payout to th...,2015-5-11,1.0,1.0,Portland,Oregon,1.0,0.0,0
1,4740245386948307605,4926862304955734550,"user i'm decent at editing , no worries ^ . ^",2015-2-11,1.0,1.0,Portland,Oregon,1.0,0.0,0
2,-5338233947289977333,4926862304955734550,user will read . gotta go afk for a bit - stil...,2015-2-7,1.0,1.0,Portland,Oregon,1.0,0.0,0
3,-2507086924347139249,4926862304955734550,guys . show me the data . show me your github ...,2015-5-5,1.0,1.0,Portland,Oregon,1.0,0.0,0
4,-5650959460091509765,4926862304955734550,user nothings broken . i was just driving thro...,2015-2-6,1.0,1.0,Portland,Oregon,1.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...
83072,5414367322427879624,-6091053152482070248,rt user : plot twist : he never stopped fuckin...,2017-4-8,,,,,,,1
83073,7283184898252962785,5800779473908422678,rt user : just want one fucking person just one,2017-4-6,1.0,0.0,,,,1.0,1
83074,50259246356599598,-4195009631820497391,"for fans of simon r . green - fast-paced , gru...",2017-4-4,1.0,0.0,Evansville,Indiana,1.0,0.0,0
83075,5150719371086939990,3213952400245967536,rt user : i'm like 5 bad days away from saying...,2017-4-5,0.0,0.0,Oquawka,Illinois,1.0,0.0,1


## Target

In [24]:
# add target label
gender_lexicons = {"he" : ["male"],
                    "his" : ["male"],
                    "son" : ["male"],
                    "father" : ["male"],
                    "male" : ["male"],
                    "boy" : ["male"],
                    "uncle" : ["male"],
                    "man" : ["male"],
                    "brother" : ["male"],
                    "she" : ["female"],
                    "her" : ["female"],
                    "daughter" : ["female"],
                    "mother" : ["female"],
                    "female" : ["female"],
                    "girl" : ["female"],
                    "aunt" : ["female"],
                    "woman" : ["female"],
                    "sister" : ["female"]
                    }
race_lexicons = {
    "black" : ["Black"],
    "caucasian" : ["White"],
    "asian" : ["Asian"],
    "africa" : ["Black"],
    "america" : ["White"],
    "asia" : ["Asian"]
}
religion_lexicons = {
    "judaism" : ["Judaism"],
    "christianity" : ["Christianity"],
    "islam" : ["Christianity"],
    "jew" : ["Judaism"],
    "christian" : ["Christianity"],
    "muslim" : ["Islam"],
    "synagogue" : ["Judaism"],
    "church" : ["Christianity"],
    "mosque" : ["Islam"],
    "torah" : ["Judaism"],
    "bible" : ["Christianity"],
    "quran" : ["Islam"],
    "rabbi" : ["Judaism"],
    "priest" : ["Christianity"],
    "imam" : ["Islam"]
}

In [25]:
def label_target(sentences = full_corpus["text"]):
    gender_label = []
    race_label = []
    religion_label = []
    # iterate all sentecnes
    for sent in tqdm(sentences):
        sent_gender_label = []
        sent_race_label = []
        sent_religion_label = []
        for word in sent.split(" "):
            # gender
            sent_gender_label = sent_gender_label + gender_lexicons.get(word, [])
            # race
            sent_race_label = sent_race_label + race_lexicons.get(word, [])
            # religion
            sent_religion_label = sent_religion_label + religion_lexicons.get(word, [])

        # gender
        if len(sent_gender_label)==0:
            gender_label.append(np.nan)
        elif len(set(sent_gender_label)) == 1:
            gender_label.append(sent_gender_label[0])
        else:
            gender_label.append("Unk")
        # race
        if len(sent_race_label)==0:
            race_label.append(np.nan)
        elif len(set(sent_race_label)) == 1:
            race_label.append(sent_race_label[0])
        else:
            race_label.append("Unk")
        # religion
        if len(sent_religion_label)==0:
            religion_label.append(np.nan)
        elif len(set(sent_religion_label)) == 1:
            religion_label.append(sent_religion_label[0])
        else:
            religion_label.append("Unk")
    return gender_label, race_label, religion_label

In [26]:
full_corpus_gender_label, full_corpus_race_label, full_corpus_religion_label = label_target(sentences = full_corpus["text"])

full_corpus["target_gender"] = full_corpus_gender_label # 0: female; 1: male; 2: unk; 3:nan
full_corpus["target_race"] = full_corpus_race_label # 0:Black; 1: White; 2: Asian; 3: unk; 4: nan
full_corpus["target_religion"] = full_corpus_religion_label # 0: Judaism; 1: Christianity; 2: Islam; 3: unk; 4: nan

100%|██████████| 83077/83077 [00:00<00:00, 137283.27it/s]


In [27]:
len(full_corpus[full_corpus["target_gender"].notnull()])

9299

In [28]:
len(full_corpus[full_corpus["target_race"].notnull()])

777

In [29]:
len(full_corpus[full_corpus["target_religion"].notnull()])

1493

In [30]:
single_analysis(in_df = full_corpus,
                tasks = ['target_gender', 'target_race', 'target_religion'],
                all_labels = False
                )

{'overall': (array([0.62974108, 0.37025892]), 83077),
 'target_gender': (array([0.57457791, 0.42542209]), 9299),
 'target_gender_male': (array([0.61981567, 0.38018433]), 5208),
 'target_gender_female': (array([0.5136967, 0.4863033]), 3541),
 'target_gender_Unk': (array([0.53818182, 0.46181818]), 550),
 'target_race': (array([0.57786358, 0.42213642]), 777),
 'target_race_Black': (array([0.55105973, 0.44894027]), 519),
 'target_race_White': (array([0.62621359, 0.37378641]), 206),
 'target_race_Asian': (array([0.66666667, 0.33333333]), 45),
 'target_race_Unk': (array([0.57142857, 0.42857143]), 7),
 'target_religion': (array([0.39316812, 0.60683188]), 1493),
 'target_religion_Christianity': (array([0.42140468, 0.57859532]), 897),
 'target_religion_Islam': (array([0.36032389, 0.63967611]), 494),
 'target_religion_Unk': (array([0.29885057, 0.70114943]), 87),
 'target_religion_Judaism': (array([0.33333333, 0.66666667]), 15)}

In [31]:
# intersection of as and target

In [32]:
intersection_comb_dists_full = cpmbination_analysis(in_df = full_corpus,
                                        task_combs = [ ['gender'], 
                                                        ['age'], 
                                                        ["target_gender"],
                                                        ['gender', 'age'],
                                                        ['gender', 'target_gender'],
                                                        ['age', 'target_gender'],
                                                        ['age', 'gender', 'target_gender']
                                                        ],
                                        tasks = ['gender', 'age', "target_gender"],
                                        all_labels = False
                                        )

In [33]:
intersection_comb_dists_full

{'overall': (array([0.62974108, 0.37025892]), 83077),
 'gender_female': (array([0.63247767, 0.36752233]), 26763),
 'gender_male': (array([0.63546557, 0.36453443]), 20448),
 'age_Under': (array([0.62439711, 0.37560289]), 24880),
 'age_Over': (array([0.64436283, 0.35563717]), 19877),
 'target_gender_male': (array([0.61981567, 0.38018433]), 5208),
 'target_gender_female': (array([0.5136967, 0.4863033]), 3541),
 'target_gender_Unk': (array([0.53818182, 0.46181818]), 550),
 'gender_age_female_Under': (array([0.65045593, 0.34954407]), 18424),
 'gender_age_female_Over': (array([0.58850911, 0.41149089]), 7519),
 'gender_age_male_Under': (array([0.55004648, 0.44995352]), 6454),
 'gender_age_male_Over': (array([0.67831998, 0.32168002]), 12357),
 'gender_target_gender_female_male': (array([0.57469136, 0.42530864]), 1620),
 'gender_target_gender_female_female': (array([0.55971731, 0.44028269]), 1415),
 'gender_target_gender_female_Unk': (array([0.51923077, 0.48076923]), 208),
 'gender_target_gende

In [34]:
#  Estimate overall hate speech rate from all rows
intersection_comb_dists_full_gap = sort_by_dist_gap(intersection_comb_dists_full)
intersection_comb_dists_full_gap[:2]

[{'abs_gap': 0.0027365908281548057,
  'gap': -0.0027365908281548057,
  'hatespeech_percentage': 0.3675223255987744,
  'key': 'gender_female'},
 {'abs_gap': 0.0057244876319370586,
  'gap': -0.0057244876319370586,
  'hatespeech_percentage': 0.36453442879499215,
  'key': 'gender_male'}]

In [35]:
intersection_sorted_by_abs_gap_full = sorted([list(i.values()) for i in intersection_comb_dists_full_gap], reverse=True)
intersection_sorted_by_abs_gap_full_keys = [(i[-1], i[2]) for i in intersection_sorted_by_abs_gap_full]
for i in intersection_sorted_by_abs_gap_full_keys[:10]:
    print(i)

('age_gender_target_gender_Over_female_Unk', 0.6666666666666666)
('age_target_gender_Over_Unk', 0.5483870967741935)
('age_gender_target_gender_Over_female_female', 0.5339578454332553)
('age_gender_target_gender_Under_male_female', 0.5173611111111112)
('target_gender_female', 0.4863033041513697)
('gender_target_gender_female_Unk', 0.4807692307692308)
('age_target_gender_Over_female', 0.47648514851485146)
('target_gender_Unk', 0.4618181818181818)
('age_gender_target_gender_Under_male_male', 0.45794392523364486)
('gender_target_gender_male_Unk', 0.45)


In [36]:
intersection_comb_dists_full = cpmbination_analysis(in_df = train_pkl_df,
                                        task_combs = [ ['gender'], 
                                                        ['age'], 
                                                        ["target_gender"],
                                                        ['gender', 'age'],
                                                        ['gender', 'target_gender'],
                                                        ['age', 'target_gender'],
                                                        ['age', 'gender', 'target_gender']
                                                        ],
                                        tasks = ['gender', 'age', "target_gender"],
                                        all_labels = False
                                        )

In [37]:
intersection_comb_dists_full

{'overall': (array([0.63217201, 0.36782799]), 57951),
 'gender_female': (array([0.62964757, 0.37035243]), 18585),
 'gender_male': (array([0.64028525, 0.35971475]), 14303),
 'age_Under': (array([0.62500723, 0.37499277]), 17299),
 'age_Over': (array([0.64436416, 0.35563584]), 13840),
 'target_gender_male': (array([0.62752141, 0.37247859]), 3619),
 'target_gender_female': (array([0.52207371, 0.47792629]), 2469),
 'target_gender_Unk': (array([0.54187192, 0.45812808]), 406),
 'gender_age_female_Under': (array([0.65016476, 0.34983524]), 12746),
 'gender_age_female_Over': (array([0.57934845, 0.42065155]), 5249),
 'gender_age_male_Under': (array([0.55460338, 0.44539662]), 4551),
 'gender_age_male_Over': (array([0.68405122, 0.31594878]), 8590),
 'gender_target_gender_female_male': (array([0.5752051, 0.4247949]), 1097),
 'gender_target_gender_female_female': (array([0.57128514, 0.42871486]), 996),
 'gender_target_gender_female_Unk': (array([0.53333333, 0.46666667]), 150),
 'gender_target_gender_