# **Training a hate-sensitive BERT**
Author: Markus Reichel


In this notebook, we will train BERT on the HASOC datasets with the huggingface transformers library.
Set the DATASET variable to 'hasoc2019' for 2019, or to 'hasoc2020' for 2020 data.


In [1]:
#DATASET = 'hasoc2019'
#DATASET = 'hasoc2020'
DATASET = 'hasoc2019-2020'

In [2]:
#basic imports & installs
import numpy as np
import pandas as pd

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

!pip install --upgrade fsspec
!pip install fsspec==0.9.0
!pip install datasets
# restart notebook if newly installing!


/kaggle/input/hasoc-real2019/english_dataset.tsv
/kaggle/input/hasoc-real2019/hasoc2019_en_test-2919.tsv
/kaggle/input/hasoc-real2020/hasoc_2020_en_test_new.csv
/kaggle/input/hasoc-real2020/hasoc_2020_en_train_new.csv
Collecting fsspec
  Downloading fsspec-2021.7.0-py3-none-any.whl (118 kB)
[K     |████████████████████████████████| 118 kB 3.1 MB/s 
[?25hInstalling collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 0.8.7
    Uninstalling fsspec-0.8.7:
      Successfully uninstalled fsspec-0.8.7
Successfully installed fsspec-2021.7.0
Collecting fsspec==0.9.0
  Downloading fsspec-0.9.0-py3-none-any.whl (107 kB)
[K     |████████████████████████████████| 107 kB 3.1 MB/s 
Installing collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2021.7.0
    Uninstalling fsspec-2021.7.0:
      Successfully uninstalled fsspec-2021.7.0
Successfully installed fsspec-0.9.0
Collecting datasets
 

In [3]:
#how the data looks like
raw = pd.read_csv('../input/hasoc-real2019/english_dataset.tsv', sep='\t')
rawtest = pd.read_csv('../input/hasoc-real2019/hasoc2019_en_test-2919.tsv', sep='\t')
rawtest = pd.read_csv('../input/hasoc-real2020/hasoc_2020_en_train_new.csv', sep=',', encoding='cp1252')

X_train = raw.task_1
y_train = raw.text

raw.head()
rawtest.head()

Unnamed: 0,tweet_id,text,task1,task2,ID
0,1123757263427186690,"hate wen females hit ah nigga with tht bro ??,...",HOF,PRFN,hasoc_2020_en_2574
1,1123733301397733380,RT @airjunebug: When you're from the Bay but y...,HOF,PRFN,hasoc_2020_en_3627
2,1123734094108659712,RT @DonaldJTrumpJr: Dear Democrats: The Americ...,NOT,NONE,hasoc_2020_en_3108
3,1126951188170199049,RT @SheLoveTimothy: He ain’t on drugs he just ...,HOF,PRFN,hasoc_2020_en_3986
4,1126863510447710208,RT @TavianJordan: Summer ‘19 I’m coming for yo...,NOT,NONE,hasoc_2020_en_5152


In [4]:
# load the data
# using huggingface datasets
from datasets import load_dataset, concatenate_datasets

#2019
if DATASET == 'hasoc2019':
    dataset = load_dataset('csv', data_files='../input/hasoc-real2019/english_dataset.tsv', delimiter='\t')
    dataset_test = load_dataset('csv', data_files='../input/hasoc-real2019/hasoc2019_en_test-2919.tsv', delimiter='\t')

#2020
if DATASET == 'hasoc2020':
    dataset = load_dataset('csv', data_files='../input/hasoc-real2020/hasoc_2020_en_train_new.csv', delimiter=',', encoding='cp1252')
    dataset_test = load_dataset('csv', data_files='../input/hasoc-real2020/hasoc_2020_en_test_new.csv', delimiter=',', encoding='cp1252')

#both
if DATASET == 'hasoc2019-2020':
    dataset2019 = load_dataset('csv', data_files='../input/hasoc-real2019/english_dataset.tsv', delimiter='\t')
    dataset2019_test = load_dataset('csv', data_files='../input/hasoc-real2019/hasoc2019_en_test-2919.tsv', delimiter='\t')
    dataset2020 = load_dataset('csv', data_files='../input/hasoc-real2020/hasoc_2020_en_train_new.csv', delimiter=',', encoding='cp1252')
    dataset2020_test = load_dataset('csv', data_files='../input/hasoc-real2020/hasoc_2020_en_test_new.csv', delimiter=',', encoding='cp1252')
    
def flag_to_num(dat):
    if(dat['label'] == 'NOT'):
        dat['label'] = 0
    else:
        dat['label'] = 1
    return dat

def select_task1(dat):
    #2019
    if DATASET == 'hasoc2019':
        dat = dat.remove_columns(["text_id", "task_2", "task_3"])
        dat = dat.rename_column("task_1", "label")
    
    #2020
    if DATASET == 'hasoc2020':
        dat = dat.remove_columns(["tweet_id", "task2", "ID"])
        dat = dat.rename_column("task1", "label")
        
    #both
    if DATASET == 'hasoc2019-2020':
        dat[0] = dat[0].remove_columns(["text_id", "task_2", "task_3"])
        dat[0] = dat[0].rename_column("task_1", "label")
        dat[1] = dat[1].remove_columns(["tweet_id", "task2", "ID"])
        dat[1] = dat[1].rename_column("task1", "label")
        dat = concatenate_datasets([dat[0]["train"], dat[1]["train"]])
    
    dat = dat.map(flag_to_num)
    return dat

if DATASET == 'hasoc2019-2020':
    dataset = select_task1([dataset2019, dataset2020])
    dataset_test = select_task1([dataset2019_test, dataset2020_test])
else:
    dataset = select_task1(dataset)
    dataset_test = select_task1(dataset_test)

dataset

    
#dataset['train']['label']

Downloading and preparing dataset csv/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/csv/default-e59ffc7e0963ea66/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff...


0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-e59ffc7e0963ea66/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff. Subsequent calls will reuse this data.
Downloading and preparing dataset csv/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/csv/default-df9691cfe94459d7/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff...


0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-df9691cfe94459d7/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff. Subsequent calls will reuse this data.
Downloading and preparing dataset csv/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/csv/default-3faf5de3a1f031a3/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff...


0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-3faf5de3a1f031a3/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff. Subsequent calls will reuse this data.
Downloading and preparing dataset csv/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/csv/default-0adb7f0d389bedfd/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff...


0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-0adb7f0d389bedfd/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff. Subsequent calls will reuse this data.


  0%|          | 0/9560 [00:00<?, ?ex/s]

  0%|          | 0/2745 [00:00<?, ?ex/s]

Dataset({
    features: ['text', 'label'],
    num_rows: 9560
})

In [5]:
#tokenizer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [6]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets_test = dataset_test.map(tokenize_function, batched=True)

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [7]:

if DATASET == 'hasoc2019-2020':
    full_train_dataset = tokenized_datasets
    full_eval_dataset = tokenized_datasets_test
else:
    #train keyword everywhere since we are using two different objects for train and test
    single_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1))
    small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
    small_eval_dataset = tokenized_datasets_test["train"].shuffle(seed=42).select(range(1000))

    full_train_dataset = tokenized_datasets["train"]
    full_eval_dataset = tokenized_datasets_test["train"]

In [8]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [9]:
from transformers import TrainingArguments

training_args = TrainingArguments("test_trainer")

In [10]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

Downloading:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

In [11]:
from transformers import Trainer

trainer = Trainer(
    #model=model, args=training_args, train_dataset=small_train_dataset, eval_dataset=small_eval_dataset, compute_metrics=compute_metrics
    model=model, args=training_args, train_dataset=full_train_dataset, eval_dataset=full_eval_dataset, compute_metrics=compute_metrics
)

In [12]:
#train
trainer.train()

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/wandb/sdk/wandb_init.py", line 741, in init
    wi.setup(kwargs)
  File "/opt/conda/lib/python3.7/site-packages/wandb/sdk/wandb_init.py", line 155, in setup
  File "/opt/conda/lib/python3.7/site-packages/wandb/sdk/wandb_login.py", line 210, in _login
    wlogin.prompt_api_key()
  File "/opt/conda/lib/python3.7/site-packages/wandb/sdk/wandb_login.py", line 144, in prompt_api_key
    no_create=self._settings.force,
  File "/opt/conda/lib/python3.7/site-packages/wandb/sdk/lib/apikey.py", line 135, in prompt_api_key
    key = input_callback(api_ask).strip()
  File "/opt/conda/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 825, in getpass
    "getpass was called, but this frontend does not support input requests."
IPython.core.error.StdinNotImplementedError: getpass was called, but this frontend does not 

Exception: problem

In [None]:
#trainer = Trainer(
#    model=model,
#    args=training_args,
#    train_dataset=small_train_dataset,
#    eval_dataset=small_eval_dataset,
#    compute_metrics=compute_metrics,
#)
trainer.evaluate()

In [None]:
#get predictions
out = trainer.predict(full_eval_dataset)
predictions = np.argmax(out.predictions, axis=-1)
predictions

In [None]:
full_eval_dataset["label"][0:10]

In [None]:
predictions[0:10]

In [None]:
#METRICS
from sklearn.metrics import confusion_matrix
y_true = full_eval_dataset["label"]
y_pred = predictions
conf = confusion_matrix(y_true, y_pred)
conf

In [None]:
TN, FP, FN, TP = conf.ravel()
print("TN: "+str(TN)+" FP: "+str(FP)+" FN: "+str(FN)+" TP: "+str(TP))

precision = TP/(TP+FP)
recall = TP/(TP+FN)
print("precision: "+str(precision))
print("recall: "+str(recall))

F1 = 2*(precision*recall)/(precision+recall)
print ("\nF1: "+str(F1))

In [None]:
trainer.model.save_pretrained("MYMODEL3")
%ls ./MYMODEL3
#for param in model.parameters():
#  print(param.data)

In [None]:
from sklearn.metrics import precision_score, \
    recall_score, confusion_matrix, classification_report, \
    accuracy_score, f1_score

print ('Accuracy:', accuracy_score(y_true, y_pred))
print ('F1 score:', f1_score(y_true, y_pred))
print ('Recall:', recall_score(y_true, y_pred))
print ('Precision:', precision_score(y_true, y_pred))
print ('\n clasification report:\n', classification_report(y_true,y_pred))
print ('\n confusion matrix:\n',confusion_matrix(y_true, y_pred))

print('macro F1:')
f1_score(y_true, y_pred, average='macro')


In [None]:
#interactive
#add test to try with add_sample
from datasets import *

oldmodel = model

list1 = []
list2 = []

def add_sample(text):
    global list1
    list1 = list1 + [text]

################### add here ###################

add_sample("You are not fucking serious")
add_sample("I like it")
add_sample("Donald Trump, why can't you be normal")
add_sample("Yeah. Fuck this night. I’m out. ✌🏻")
add_sample("Bruh we dont fuck with the opps... Hell type time you on.. how about you request a trade so I can stop rooting for… https://t.co/dwoAMbW6GK")
add_sample("@bigjosh002 Bitch")

###############################################

print(list1)  
d = {'text': list1}
data_pandas = pd.DataFrame(data=d)
data_pandas.to_csv("tmp.csv")
interactive_data =  load_dataset('csv', data_files='tmp.csv', delimiter=',')
interactive_data = interactive_data.map(tokenize_function, batched=True)
interactive_data = interactive_data["train"]
out2 = trainer.predict(interactive_data)
predictions2 = np.argmax(out2.predictions, axis=-1)
predictions2

In [None]:
#try to predict 2019 again
interactive_data =  load_dataset('csv', data_files='../input/hasoc-real2019/hasoc2019_en_test-2919.tsv', delimiter='\t')
interactive_data = interactive_data.remove_columns(["text_id", "task_2", "task_3"])
interactive_data = interactive_data.rename_column("task_1", "label")
interactive_data = interactive_data.map(flag_to_num)
interactive_data = interactive_data.map(tokenize_function, batched=True)
interactive_data = interactive_data["train"]
interactive_data
metric = load_metric("accuracy")
out2 = trainer.predict(interactive_data)
predictions2 = np.argmax(out2.predictions, axis=-1)

f1_score(interactive_data["label"], predictions2, average='macro')

(Refresh page first) <br>
<a href="./MYMODEL3/pytorch_model.bin"> Download Model </a> <br>
<a href="./MYMODEL3/config.json"> Download Config </a>



In [None]:
import torch
import torch.nn.functional as F 
import numpy as np
import tensorflow as tf

model = AutoModelForSequenceClassification.from_pretrained("./MYMODEL3")

def is_hate(string):    
    inputs = tokenizer(string, return_tensors="pt", padding="max_length", truncation=True)

    #labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
    with torch.no_grad():
        outputs = model(**inputs)
    prob = F.softmax(outputs.logits, dim=-1)[0]
    print(prob)
    #print(outputs.loss)
    print(outputs.logits)
    if(prob[0] > prob[1]):
        print("This is not hate.")
    else:
        print("I consider this as hate.")


print("Welcome to hate-sensitive BERT.")

import sys

is_hate("You are not fucking serious")
is_hate("I love you")
is_hate("Donald Trump, why can't you be normal")
is_hate("Yeah. Fuck this night. I'm out.")
is_hate("Bruh we dont fuck with the opps... Hell type time you on.. how about you request a trade so I can stop rooting for https://t.co/dwoAMbW6GK")
is_hate("@bigjosh002 Bitch")

In [None]:
full_path = "./MYMODEL3/pytorch_model.bin"

import hashlib
def file_as_bytes(file):
    with file:
        return file.read()

print (hashlib.md5(file_as_bytes(open(full_path, 'rb'))).hexdigest())