# Experimiento de reproducción modelo paper arango-resources

Esto es una reproducción y adaptación del codigo encontrado en el repositorio:

https://github.com/hate-alert/DE-LIMIT

Asociado a la publicación:

***A Deep Dive into Multilingual Hate Speech Classification***

In [2]:
!pip install laserembeddings

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting laserembeddings
  Downloading laserembeddings-1.1.2-py3-none-any.whl (13 kB)
Collecting transliterate==1.10.2
  Downloading transliterate-1.10.2-py2.py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.8/45.8 KB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting subword-nmt<0.4.0,>=0.3.6
  Downloading subword_nmt-0.3.8-py3-none-any.whl (27 kB)
Collecting sacremoses==0.0.35
  Downloading sacremoses-0.0.35.tar.gz (859 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m859.8/859.8 KB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mock
  Downloading mock-5.0.1-py3-none-any.whl (30 kB)
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.35-py3-

In [3]:
!python -m laserembeddings download-models

Downloading models into /usr/local/lib/python3.8/dist-packages/laserembeddings/data

✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/93langs.fcodes    
✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/93langs.fvocab    
✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/bilstm.93langs.2018-12-26.pt    

✨ You're all set!


In [4]:
from sklearn.linear_model import LogisticRegression
from laserembeddings import Laser
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import pickle
import random

laser = Laser()

In [5]:
# We set our Seeds
random_state = 42 # our beloved seed the answer to the universe
seed = np.random.seed(random_state) # set numpy seed
random.seed(random_state) # set python seed

## Datos

In [6]:
google_drive = True
train_models = False

if google_drive:
  print("Mounting your Google Drive ...")

  from google.colab import drive
  drive.flush_and_unmount()
  drive.mount('/content/drive', force_remount=True)

Mounting your Google Drive ...
Drive not mounted, so nothing to flush and unmount.
Mounted at /content/drive


In [7]:
test_file = 'tweets_test.csv'
train_file = 'tweets_train.csv'
referenced = 'referenced_tweets_data.csv'
# Arango-resources
train_arango = 'aarango_HS.xlsx'
# hateval2019
train_hateval_es = "hateval2019/hateval2019_es_train.csv"
test_hateval_es = "hateval2019/hateval2019_es_test.csv"
train_hateval_en = "hateval2019/hateval2019_en_train.csv"
test_hateval_en = "hateval2019/hateval2019_en_test.csv"
# Hatecheck
test_hatecheck = "hatecheck_cases_final_spanish.csv"

if google_drive:
  path = "/content/drive/My Drive/Paper_dataton/Datos/" # Esta es la ruta
  save_path = "/content/drive/My Drive/Paper_dataton/models_reproduction/" # Esta es la ruta de los experimentos
else:
  path = ''

data_train = pd.read_csv(path + train_file, encoding='utf-8', index_col=0)
data_test = pd.read_csv(path + test_file, encoding='utf-8', index_col=0)
data_referenced = pd.read_csv(path + referenced, encoding='utf-8', index_col=0)
data_train.rename(columns={'Odio':'HS'}, inplace = True)
data_test.rename(columns={'Odio':'HS'}, inplace = True)
# Arango
data_arango_train = pd.read_excel(path + train_arango).dropna(subset=['Usuario'])
# preprocess arango columns
data_arango_train.rename(columns={'tweet a etiquetar':'text', '12. Estereotipo':'HS'}, inplace = True)
# Hateval
data_hateval_es_train = pd.read_csv(path + train_hateval_es)
data_hateval_es_test = pd.read_csv(path + test_hateval_es)
data_hateval_en_train = pd.read_csv(path + train_hateval_en)
data_hateval_en_test = pd.read_csv(path + test_hateval_en)
# Hatecheck
data_hatecheck = pd.read_csv(path + test_hatecheck)
data_hatecheck = data_hatecheck.drop(columns=['ref_case_id', 	'ref_templ_id', 	'templ_id', 	'case_templ', 	'gender_male', 	'gender_female', 	'label_annotated', 	'label_annotated_maj', 	'disagreement_in_case', 	'disagreement_in_template'])
data_hatecheck['HS'] = data_hatecheck['label_gold'].apply(lambda x: 1 if x == 'hateful' else 0)
data_hatecheck.rename(columns={'test_case':'text'}, inplace = True)

# print('Amount of different Authors: ' + 
# print(data_train.author_id.value_counts())


print('Len Arango: {}\nLen hateval es: {}\nLen hateval en: {}\nLen hatecheck en: {}'.format(len(data_arango_train),
                                                                      len(data_hateval_es_train) + len(data_hateval_es_test),
                                                                      len(data_hateval_en_train) + len(data_hateval_en_test),
                                                                      len(data_hatecheck)))
display(data_train.head())
display(data_arango_train.head())
display(data_hateval_es_train.head())
display(data_hateval_en_train.head())
display(data_hatecheck.head())

Len Arango: 9834
Len hateval es: 6100
Len hateval en: 12000
Len hatecheck en: 3745


Unnamed: 0,tweet_id,author_id,conversation_id,text,HS,Mujeres,Comunidad LGBTQ+,Comunidades Migrantes,Pueblos Originarios
0,1399516036240662528,1329989512391438336,(),En una amistad o soy tipo: \r\nLo peor es que ...,0,0,0,0,0
1,1320788179721560065,1319131581949378560,(),QUIEN FUE LA MARACA CULIA TE VOY A MATAR PERRA...,1,1,0,0,0
2,1079889645280944129,2328230546,(),menos mal q se recibe el año con ropa blanca j...,0,0,0,0,0
3,1369254390134145033,819218347049029633,"(1369117564811550720,)",Cuantos INFILTRADOS extranjeros hay ahi😡 https...,1,0,0,1,0
4,1533854824378290176,1229941558411243520,(),#ENCONTRADO #GUAYAQUIL\r\nFecha: 06/06/22\r\nS...,0,0,0,0,0


Unnamed: 0,CASO,Usuario,Link,text,Contexto,1. Anonimo,2. Genero,3. Mención migración,4. Mención Venezuela,5. Mención política nacional,...,10. Grosería s/Int.,11. Insulto/Sobrenombre,HS,13. Sarcasmo/Ironía/Burla,14. Pregunta Legítima,15. Evidencia,16. Figura femenina,17. Figura masculina,Codificador,Fecha
0,23036.0,Luna,https://twitter.com/labrigidagon/status/140128...,Eran tan pero tan feministas que invisibilizab...,,1.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,Ariel,semana 1
1,23057.0,Karen🌼,https://twitter.com/karenseguelr/status/140270...,@Eneatipo7 @Cooperativa @karina_ol Me carga en...,,1.0,2.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,Ariel,semana 1
4,23058.0,Patria Y Libertad,https://twitter.com/patrianueva2020/status/140...,", ¿Sabrán las femiorcas como @karina_ol y todo...",,1.0,3.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,Ariel,semana 1
5,23059.0,Nubecita.Roja,https://twitter.com/RojaNubecita/status/140247...,@AntoDelLaw @jonaovalle @CarolinaLeitao @penal...,,1.0,3.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,Ariel,semana 1
9,23061.0,Rodrigo Catalan,https://twitter.com/rodrigo140970/status/14017...,@Ciudadanolevi1 Debajo de que piedra estaba es...,,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,Ariel,semana 1


Unnamed: 0,id,text,HS,TR,AG
0,20001,Easyjet quiere duplicar el número de mujeres p...,1,0,0
1,20002,El gobierno debe crear un control estricto de ...,1,0,0
2,20003,Yo veo a mujeres destruidas por acoso laboral ...,0,0,0
3,20004,"— Yo soy respetuoso con los demás, sólamente l...",0,0,0
4,20007,Antonio Caballero y como ser de mal gusto e ig...,0,0,0


Unnamed: 0,id,text,HS,TR,AG
0,201,"Hurray, saving us $$$ in so many ways @potus @...",1,0,0
1,202,Why would young fighting age men be the vast m...,1,0,0
2,203,@KamalaHarris Illegals Dump their Kids at the ...,1,0,0
3,204,NY Times: 'Nearly All White' States Pose 'an A...,0,0,0
4,205,Orban in Brussels: European leaders are ignori...,0,0,0


Unnamed: 0,mhc_case_id,functionality,text,label_gold,target_ident,HS
0,spanish-1,derog_neg_emote_h,Odio a los negros.,hateful,black people,1
1,spanish-2,derog_neg_emote_h,Odio a los gays.,hateful,gay people,1
2,spanish-3,derog_neg_emote_h,Odio a los trans.,hateful,trans people,1
3,spanish-4,derog_neg_emote_h,Odio a los indígenas.,hateful,indigenous people,1
4,spanish-5,derog_neg_emote_h,Odio a los judíos.,hateful,jews,1


In [8]:
# Define Evaluation metrics
def compute_metrics(p):
    pred, labels = p
    #pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)
    auc = roc_auc_score(y_true=labels, y_score=pred)

    return {"accuracy": accuracy, "precision": precision,
            "recall": recall, "f1": f1, "AUC": auc}


def gen_data_laser(tweets_list, lang):
  embeddings = laser.embed_sentences(tweets_list, lang=lang) 
  embeddings = np.array(embeddings)
  return embeddings


def train(train, dataLabel, train_lang):
  model = LogisticRegression(C=10, solver='lbfgs', class_weight='balanced', random_state=2018)
  dataTrain = gen_data_laser(train, train_lang)
  model.fit(dataTrain, dataLabel)
  return model


def test(model, data_test, test_lang, save_file):
  test = data_test['text']
  dataTest = gen_data_laser(test, test_lang)
  raw_pred = model.predict(dataTest)
  #print(save_file.split('_')[2])
  if save_file.split('_')[3].split('.')[0] == 'cl2' or save_file.split('_')[2] == 'hatecheck':
    df =  data_test.copy()
    df['HS_predict'] = raw_pred
    df.to_csv(save_path + "/predictions/" + save_file)
    return compute_metrics((raw_pred, data_test['HS'].values))
  df = pd.DataFrame(raw_pred, columns=['HS_predict'])
  df['HS_real'] = data_test['HS'].values
  df.to_csv(save_path + "/predictions/" + save_file)
  return compute_metrics((raw_pred, data_test['HS'].values))


### Creating models

In [9]:
# Training

def model_training(training_configs):
  trained_models = []
  for training_config in tqdm(training_configs):
    
    trained_models.append(train(training_config['data']['text'].to_list(),
                                training_config['data']['HS'].to_list(),
                                training_config['lang']))
  return trained_models


# We create our Full spanish and english datasets
data_hateval_es_full = pd.concat([data_hateval_es_train, data_hateval_es_test])
data_hateval_en_full = pd.concat([data_hateval_en_train, data_hateval_en_test])
data_ours_full = pd.concat([data_train, data_test])

# CL1 + CL2
keep_cols = ['text', 'HS', 'text_translated']
drop_columns_ours = [col for col in data_ours_full.columns.tolist() if col not in keep_cols]
drop_columns_arango = [col for col in data_arango_train.columns.tolist() if col not in keep_cols]

data_chilean = pd.concat([data_ours_full.drop(columns=drop_columns_ours),
           data_arango_train.drop(columns=drop_columns_arango)]).reset_index(drop=True)




training_configs = [ # ES
                    {'data': data_hateval_es_train, 'lang': 'es'},
                    {'data': data_hateval_es_full, 'lang': 'es'}, # Trained on full spanish dataset
                     # EN
                    {'data': data_hateval_en_train, 'lang': 'en'},
                    {'data': data_hateval_en_full, 'lang': 'en'}, # Trained on full english dataset
                     # Train CL Arango
                    {'data': data_arango_train, 'lang': 'es'},
                    # Train CL ours
                    {'data': data_ours_full, 'lang': 'es'},
                    {'data': data_chilean, 'lang': 'es'}
                  ]


if train_models:
  trained_models = model_training(training_configs)

### Saving Models

In [10]:
# Saving

def save_models(models, models_path):
  for model, model_path in tqdm(zip(models, models_path), total=len(models)):
    with open(model_path,'wb') as f:
      pickle.dump(model, f)
  del models


models_path = [ # Spanish
                save_path + "/models/" + 'model_es_only.pkl',
                save_path + "/models/" + 'model_es_full.pkl',
               # English
                save_path + "/models/" + 'model_en_only.pkl',
                save_path + "/models/" + 'model_en_full.pkl',
                # Chilean Arango
                save_path + "/models/" + 'model_cl_full.pkl',
                # Chilean Ours
                save_path + "/models/" + 'model_cl_ours_full.pkl',
                 # Chilean all
                save_path + "/models/" + 'model_clT_full.pkl'
               ]

if train_models:
  save_models(trained_models, models_path)

### Loading Models

In [11]:
# load function

def load_models(models_path):
  models = []
  for model_path in tqdm(models_path):
    with open(model_path, 'rb') as f:
      models.append(pickle.load(f))
  
  return models

# Spanish
models_path = [
               save_path + "/models/" + 'model_es_full.pkl', # Spanish full
               save_path + "/models/" + 'model_en_full.pkl', # English full
               save_path + "/models/" + 'model_cl_full.pkl', # Chilean Arango
               save_path + "/models/" + 'model_cl_ours_full.pkl', # Chilean Ours
               save_path + "/models/" + 'model_clT_full.pkl'] # Chilean all

if not train_models:
  loaded_models = load_models(models_path)

  0%|          | 0/5 [00:00<?, ?it/s]

### predictions

In [12]:
def save_predictions(models, test_sets, save_files):
  results_dict = {} 
  for model, test_set, save_file in tqdm(zip(models, test_sets, save_files), total=len(models)):
    for test_, save_ in tqdm(zip(test_set, save_file), total=len(test_set)):
      # if its testing on full set we must use the target language not source
      if 'only' in save_.split('_')[3]:
        test_lang = save_.split('_')[2]
      elif 'cl' in save_.split('_')[3]:
        test_lang = 'es'
      else:
        test_lang = save_.split('_')[3].split('.')[0]
      print(save_.split('_')[3], save_.split('_')[2], test_lang)
      if 'hatecheck' not in save_.split('_')[2] and 'clT' not in save_.split('_')[2]:
        results_dict[save_.split('_')[2] + '_' + save_.split('_')[3].split('.')[0]] = test(model, test_, test_lang, save_)
      elif 'hatecheck' in save_.split('_')[2]:
        results_dict[save_.split('_')[3] + '_' + save_.split('_')[2]] = test(model, test_, test_lang, save_)
      else:
        test(model, test_, test_lang, save_)
  return results_dict

experiments_es = [# Using whole set in english as test
                  data_hateval_en_full,
                  # Using Arango cl
                  data_arango_train, 
                  # Using ours
                  data_ours_full,
                  # Hatecheck
                  data_hatecheck]

experiments_en = [# Using whole set in spanish as test
                  data_hateval_es_full, 
                  # Using Arango cl
                  data_arango_train, 
                  # Using ours
                  data_ours_full,
                  # Hatecheck
                  data_hatecheck]

experiments_cl1 = [# Using whole set in spanish as test
                  data_hateval_es_full,
                  # Using whole set in english as test
                  data_hateval_en_full,
                  # Using ours
                   data_ours_full,
                  # Hatecheck
                  data_hatecheck]

experiments_cl2 = [# Using whole set in spanish as test
                  data_hateval_es_full,
                  # Using whole set in english as test
                  data_hateval_en_full,
                  # Using Arango cl
                  data_arango_train,
                  # Hatecheck
                  data_hatecheck]

experiments_cl1_cl2 = [# Using whole set in spanish as test
                       data_hateval_es_full,
                       # Using whole set in english as test
                       data_hateval_en_full,
                       # Hatecheck
                       data_hatecheck]



# save files path
save_files = [
              ['ECML20_predictions_es_en.csv',   # spanish training, english predictions
               'ECML20_predictions_es_cl1.csv',  # transfer hateval es to cl1
               'ECML20_predictions_es_cl2.csv',  # transfer hateval es to cl2
               'ECML20_predictions_hatecheck_es_es.csv'],
              ['ECML20_predictions_en_es.csv',   # english training, spanish predictions
               'ECML20_predictions_en_cl1.csv',  # transfer hateval es to cl
               'ECML20_predictions_en_cl2.csv',
               'ECML20_predictions_hatecheck_en_es.csv'],  # transfer hateval es to cl
              ['ECML20_predictions_cl1_es.csv',  # spanish training, english predictions
               'ECML20_predictions_cl1_en.csv', 
               'ECML20_predictions_cl1_cl2.csv',
               'ECML20_predictions_hatecheck_cl1_es.csv'],
              ['ECML20_predictions_cl2_es.csv',  # spanish training, english predictions
               'ECML20_predictions_cl2_en.csv',  
               'ECML20_predictions_cl2_cl1.csv', 
               'ECML20_predictions_hatecheck_cl2_es.csv'],
              ['ECML20_predictions_clT_es.csv',  # spanish training, english predictions
               'ECML20_predictions_clT_en.csv',
               'ECML20_predictions_hatecheck_clT_es.csv'], 
              ]

experiments_configs = [
                       experiments_es,          # full spanish model
                       experiments_en,          # full english model
                       experiments_cl1,         # cl1 model
                       experiments_cl2 ,         # cl2 model
                       experiments_cl1_cl2      # cl1 + cl2 model
                       ]
      
if not train_models:
  results = save_predictions(loaded_models, experiments_configs, save_files)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

en.csv es en
cl1.csv es es
cl2.csv es es
es hatecheck es


  0%|          | 0/4 [00:00<?, ?it/s]

es.csv en es
cl1.csv en es
cl2.csv en es
en hatecheck en


  0%|          | 0/4 [00:00<?, ?it/s]

es.csv cl1 es
en.csv cl1 en
cl2.csv cl1 es
cl1 hatecheck es


  0%|          | 0/4 [00:00<?, ?it/s]

es.csv cl2 es
en.csv cl2 en
cl1.csv cl2 es
cl2 hatecheck es


  0%|          | 0/3 [00:00<?, ?it/s]

es.csv clT es
en.csv clT en
clT hatecheck es


In [13]:
results
display(pd.DataFrame.from_dict(results, orient='index'))

Unnamed: 0,accuracy,precision,recall,f1,AUC
es_en,0.648917,0.597418,0.50466,0.547135,0.629073
es_cl1,0.695851,0.096785,0.447077,0.159123,0.580022
es_cl2,0.610293,0.582651,0.514699,0.546571,0.602616
es_hatecheck,0.553271,0.816502,0.469985,0.596576,0.610105
en_es,0.639672,0.561846,0.575685,0.568681,0.630154
en_cl1,0.784116,0.097732,0.28594,0.145674,0.552165
en_cl2,0.581922,0.557923,0.403855,0.468549,0.567624
en_hatecheck,0.452069,0.807203,0.289514,0.426174,0.562996
cl1_es,0.589672,0.508393,0.168455,0.253059,0.527013
cl1_en,0.588667,0.527677,0.202261,0.292431,0.535513


In [14]:
pd.DataFrame.from_dict(results, orient='index').to_csv(save_path + 'ECML20_evaluation_results.csv')
print(pd.DataFrame.from_dict(results, orient='index').to_latex())

\begin{tabular}{lrrrrr}
\toprule
{} &  accuracy &  precision &    recall &        f1 &       AUC \\
\midrule
es\_en         &  0.648917 &   0.597418 &  0.504660 &  0.547135 &  0.629073 \\
es\_cl1        &  0.695851 &   0.096785 &  0.447077 &  0.159123 &  0.580022 \\
es\_cl2        &  0.610293 &   0.582651 &  0.514699 &  0.546571 &  0.602616 \\
es\_hatecheck  &  0.553271 &   0.816502 &  0.469985 &  0.596576 &  0.610105 \\
en\_es         &  0.639672 &   0.561846 &  0.575685 &  0.568681 &  0.630154 \\
en\_cl1        &  0.784116 &   0.097732 &  0.285940 &  0.145674 &  0.552165 \\
en\_cl2        &  0.581922 &   0.557923 &  0.403855 &  0.468549 &  0.567624 \\
en\_hatecheck  &  0.452069 &   0.807203 &  0.289514 &  0.426174 &  0.562996 \\
cl1\_es        &  0.589672 &   0.508393 &  0.168455 &  0.253059 &  0.527013 \\
cl1\_en        &  0.588667 &   0.527677 &  0.202261 &  0.292431 &  0.535513 \\
cl1\_cl2       &  0.627227 &   0.692308 &  0.329639 &  0.446621 &  0.603331 \\
cl1\_hatecheck &  0.40

### Bias Metrics

In [None]:
!pip install scipy --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from scipy.stats import pmean, hmean, gmean

bias_files = ['ECML20_predictions_hatecheck_es_es.csv',
              'ECML20_predictions_hatecheck_en_es.csv',
              'ECML20_predictions_hatecheck_cl1_es.csv',
              'ECML20_predictions_hatecheck_cl2_es.csv',
              'ECML20_predictions_hatecheck_clT_es.csv']


# Define Evaluation metrics
def compute_metrics_II(p):
    pred, labels = p
    auc = roc_auc_score(y_true=labels, y_score=pred)

    return auc

def bias_metrics(bias_files):
  results_dict = dict()
  for file_ in bias_files:
    df = pd.read_csv(save_path + "/predictions/" + file_, index_col=0)
    key_lang = file_.split('_')[2]
    lang = file_.split('_')[3].split('.')[0]
    #print(df.columns.tolist())
    #print(df['target_ident'].unique()[:-1])
    for col in df['target_ident'].unique()[:-1]:
      dicc = dict()
      # subgroup AUC
      df_ = df[df['target_ident'] == col]
      l = df_['HS'].to_numpy()
      p = df_['HS_predict'].to_numpy()
      dicc["Subgroup AUC"] = compute_metrics_II((p, l))
      # BPSN AUC
      df_ = df[(df['target_ident'] == col) & (df['HS'] == 0) | (df['target_ident'] != col) & (df['HS'] == 1)]
      l = df_['HS'].to_numpy()
      p = df_['HS_predict'].to_numpy()
      dicc['BPSN AUC'] = compute_metrics_II((p, l))
      # BNSP AUC
      df_ = df[(df['target_ident'] == col) & (df['HS'] == 1) | (df['target_ident'] != col) & (df['HS'] == 0)]
      l = df_['HS'].to_numpy()
      p = df_['HS_predict'].to_numpy()
      dicc['BNSP AUC'] = compute_metrics_II((p, l))
      results_dict[ key_lang + "_" + lang + '_' + col] = dicc
  
  display(pd.DataFrame.from_dict(results_dict, orient='index'))
  print(pd.DataFrame.from_dict(results_dict, orient='index').to_latex())
  dict_results = dict()
  # GMB AUC
  for like in ['es_', 'en_', 'cl1_', 'cl2_', 'clT_']:
    df = pd.DataFrame.from_dict(results_dict, orient='index').filter(like=like, axis=0)
    dict_temp = dict()
    for col in df.columns.tolist():
      dict_temp['GMB ' + col] = pmean(df[col].tolist(), -5)
    dict_results[like.strip('_')] = dict_temp
  display(pd.DataFrame.from_dict(dict_results, orient='index'))
  print(pd.DataFrame.from_dict(dict_results, orient='index').to_latex())


bias_metrics(bias_files)

Unnamed: 0,Subgroup AUC,BPSN AUC,BNSP AUC
hatecheck_es_black people,0.567679,0.406687,0.769325
hatecheck_es_gay people,0.606516,0.714232,0.50768
hatecheck_es_trans people,0.594907,0.692176,0.519085
hatecheck_es_indigenous people,0.632333,0.56991,0.666981
hatecheck_es_jews,0.626812,0.624064,0.611026
hatecheck_es_disabled people,0.611197,0.727867,0.498552
hatecheck_es_women,0.586387,0.467778,0.726902
hatecheck_en_black people,0.581289,0.488667,0.650197
hatecheck_en_gay people,0.567092,0.571672,0.558118
hatecheck_en_trans people,0.53106,0.611754,0.489866


\begin{tabular}{lrrr}
\toprule
{} &  Subgroup AUC &  BPSN AUC &  BNSP AUC \\
\midrule
hatecheck\_es\_black people       &      0.567679 &  0.406687 &  0.769325 \\
hatecheck\_es\_gay people         &      0.606516 &  0.714232 &  0.507680 \\
hatecheck\_es\_trans people       &      0.594907 &  0.692176 &  0.519085 \\
hatecheck\_es\_indigenous people  &      0.632333 &  0.569910 &  0.666981 \\
hatecheck\_es\_jews               &      0.626812 &  0.624064 &  0.611026 \\
hatecheck\_es\_disabled people    &      0.611197 &  0.727867 &  0.498552 \\
hatecheck\_es\_women              &      0.586387 &  0.467778 &  0.726902 \\
hatecheck\_en\_black people       &      0.581289 &  0.488667 &  0.650197 \\
hatecheck\_en\_gay people         &      0.567092 &  0.571672 &  0.558118 \\
hatecheck\_en\_trans people       &      0.531060 &  0.611754 &  0.489866 \\
hatecheck\_en\_indigenous people  &      0.551469 &  0.588882 &  0.528686 \\
hatecheck\_en\_jews               &      0.557621 &  0.607929 &  0.

Unnamed: 0,GMB Subgroup AUC,GMB BPSN AUC,GMB BNSP AUC
es,0.601449,0.523345,0.569736
en,0.558345,0.529736,0.539042
cl1,0.547545,0.504037,0.534048
cl2,0.603355,0.52047,0.546218
clT,0.578193,0.461843,0.555172


\begin{tabular}{lrrr}
\toprule
{} &  GMB Subgroup AUC &  GMB BPSN AUC &  GMB BNSP AUC \\
\midrule
es  &          0.601449 &      0.523345 &      0.569736 \\
en  &          0.558345 &      0.529736 &      0.539042 \\
cl1 &          0.547545 &      0.504037 &      0.534048 \\
cl2 &          0.603355 &      0.520470 &      0.546218 \\
clT &          0.578193 &      0.461843 &      0.555172 \\
\bottomrule
\end{tabular}



### Hatecheck

In [None]:
files = ['ECML20_predictions_hatecheck_es_es.csv',
         'ECML20_predictions_hatecheck_en_es.csv',
         'ECML20_predictions_hatecheck_cl1_es.csv',
         'ECML20_predictions_hatecheck_cl2_es.csv',
         'ECML20_predictions_hatecheck_clT_es.csv']

def hatecheck(files):
  dicc_ = dict()
  dicc__2 = dict()
  for file_ in files:
    dicc = dict()
    df = pd.read_csv(save_path + "/predictions/" + file_)
    filters = df['functionality'].unique()
    for filter in filters:
      df_ = df[df['functionality'] == filter]
      dicc[filter] = accuracy_score(df_['HS'].values, df_['HS_predict'].values)
    dicc_2 = dict()
    filters = df['target_ident'].unique()
    for filter in filters:
      df_ = df[df['target_ident'] == filter]
      dicc_2[filter] = f1_score(df_['HS'].values, df_['HS_predict'].values, average='macro')
    key = file_.split('_')[3]
    dicc_[key] = dicc
    dicc__2[key] = dicc_2

  df = pd.DataFrame.from_dict(dicc_, orient='index').transpose()
  print(df.columns.tolist())
  df['delta_cl'] = df['cl2'] - df['cl1']
  display(df)
  print(df.to_latex())
  df = pd.DataFrame.from_dict(dicc__2, orient='index').transpose()
  print(df.columns.tolist())
  df['delta_cl'] = df['cl2'] - df['cl1']
  df = df.drop(index=df.index.tolist()[-1])
  display(df)
  print(df.to_latex())


hatecheck(files)

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


['es', 'en', 'cl1', 'cl2', 'clT']


  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,es,en,cl1,cl2,clT,delta_cl
derog_neg_emote_h,0.307143,0.014286,0.107143,0.342857,0.214286,0.235714
derog_neg_attrib_h,0.65,0.2,0.45,0.628571,0.592857,0.178571
derog_dehum_h,0.635714,0.264286,0.392857,0.592857,0.521429,0.2
derog_impl_h,0.485714,0.407143,0.014286,0.242857,0.171429,0.228571
threat_dir_h,0.45,0.3,0.107143,0.392857,0.292857,0.285714
threat_norm_h,0.634483,0.668966,0.131034,0.77931,0.496552,0.648276
slur_h,0.486667,0.42,0.06,0.333333,0.393333,0.273333
profanity_h,0.492857,0.485714,0.2,0.55,0.5,0.35
profanity_nh,0.97,0.87,1.0,0.98,0.98,-0.02
ref_subs_clause_h,0.441379,0.296552,0.331034,0.572414,0.634483,0.241379


\begin{tabular}{lrrrrrr}
\toprule
{} &        es &        en &       cl1 &       cl2 &       clT &  delta\_cl \\
\midrule
derog\_neg\_emote\_h  &  0.307143 &  0.014286 &  0.107143 &  0.342857 &  0.214286 &  0.235714 \\
derog\_neg\_attrib\_h &  0.650000 &  0.200000 &  0.450000 &  0.628571 &  0.592857 &  0.178571 \\
derog\_dehum\_h      &  0.635714 &  0.264286 &  0.392857 &  0.592857 &  0.521429 &  0.200000 \\
derog\_impl\_h       &  0.485714 &  0.407143 &  0.014286 &  0.242857 &  0.171429 &  0.228571 \\
threat\_dir\_h       &  0.450000 &  0.300000 &  0.107143 &  0.392857 &  0.292857 &  0.285714 \\
threat\_norm\_h      &  0.634483 &  0.668966 &  0.131034 &  0.779310 &  0.496552 &  0.648276 \\
slur\_h             &  0.486667 &  0.420000 &  0.060000 &  0.333333 &  0.393333 &  0.273333 \\
profanity\_h        &  0.492857 &  0.485714 &  0.200000 &  0.550000 &  0.500000 &  0.350000 \\
profanity\_nh       &  0.970000 &  0.870000 &  1.000000 &  0.980000 &  0.980000 & -0.020000 \\
ref\_subs\_clau

Unnamed: 0,es,en,cl1,cl2,clT,delta_cl
black people,0.562289,0.496728,0.326949,0.302776,0.332766,-0.024173
gay people,0.437984,0.416559,0.360498,0.567959,0.553228,0.207461
trans people,0.441558,0.324799,0.286686,0.541956,0.466504,0.255269
indigenous people,0.569662,0.379089,0.510491,0.561862,0.543223,0.051371
jews,0.531449,0.367243,0.398784,0.512586,0.41914,0.113802
disabled people,0.431998,0.333134,0.232501,0.60521,0.467569,0.372708
women,0.565195,0.549012,0.315281,0.438384,0.422599,0.123103


\begin{tabular}{lrrrrrr}
\toprule
{} &        es &        en &       cl1 &       cl2 &       clT &  delta\_cl \\
\midrule
black people      &  0.562289 &  0.496728 &  0.326949 &  0.302776 &  0.332766 & -0.024173 \\
gay people        &  0.437984 &  0.416559 &  0.360498 &  0.567959 &  0.553228 &  0.207461 \\
trans people      &  0.441558 &  0.324799 &  0.286686 &  0.541956 &  0.466504 &  0.255269 \\
indigenous people &  0.569662 &  0.379089 &  0.510491 &  0.561862 &  0.543223 &  0.051371 \\
jews              &  0.531449 &  0.367243 &  0.398784 &  0.512586 &  0.419140 &  0.113802 \\
disabled people   &  0.431998 &  0.333134 &  0.232501 &  0.605210 &  0.467569 &  0.372708 \\
women             &  0.565195 &  0.549012 &  0.315281 &  0.438384 &  0.422599 &  0.123103 \\
\bottomrule
\end{tabular}

