In [2]:
import pandas as pd
import numpy as np
from simpletransformers.classification import ClassificationModel

In [1]:
import torch
# Report whether PyTorch can see a CUDA device; the result is shown as the cell output.
torch.cuda.is_available()

True

### Read in data

In [4]:
# Read the three CSV files (train split, test split, diagnostic subset) into
# one DataFrame each, in that order.
train, test, diagnostics = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/olid-train.csv",
        "data/olid-test.csv",
        "data/olid-subset-diagnostic-tests.csv",
    )
)

In [5]:
# Preview the first five training rows.
train.head(n=5)

Unnamed: 0,id,text,labels
0,86426,@USER She should ask a few native Americans wh...,1
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,1
2,16820,Amazon is investigating Chinese employees who ...,0
3,62688,"@USER Someone should'veTaken"" this piece of sh...",1
4,43605,@USER @USER Obama wanted liberals &amp; illega...,0


#### Quest 1. Some basic statistics

In [6]:
# Number of training rows whose label is 1.
int(train['labels'].eq(1).sum())

4400

In [7]:
# Fraction of training rows whose label is 1.
int(train['labels'].eq(1).sum()) / len(train)

0.3323262839879154

In [8]:
# Show one example text labelled 1.
# `.iloc[0]` takes the first matching row by position, so the lookup no longer
# depends on index label 0 happening to belong to this class (a label-based
# `[0]` raises KeyError otherwise). Selecting the column inside `.loc` also
# avoids chained indexing.
train.loc[train['labels'] == 1, 'text'].iloc[0]

'@USER She should ask a few native Americans what their take on this is.'

In [9]:
# Number of training rows whose label is 0.
int(train['labels'].eq(0).sum())

8840

In [10]:
# Fraction of training rows whose label is 0.
int(train['labels'].eq(0).sum()) / len(train)

0.6676737160120846

In [11]:
# Show one example text labelled 0.
# `.iloc[0]` takes the first matching row by position instead of relying on the
# hard-coded index label 2 belonging to this class (a label-based lookup raises
# KeyError otherwise). Selecting the column inside `.loc` also avoids chained
# indexing.
train.loc[train['labels'] == 0, 'text'].iloc[0]

'Amazon is investigating Chinese employees who are selling internal data to third-party sellers looking for an edge in the competitive marketplace. URL #Amazon #MAGA #KAG #CHINA #TCOT'

#### Quest 2. Baselines

In [12]:
from sklearn.metrics import (f1_score, recall_score, precision_score)

In [13]:
# Random baseline: one prediction is needed for every test row.
rows = len(test)

In [14]:
# Draw a uniformly random 0/1 label for every test row.
# A dedicated, seeded generator keeps the baseline scores reproducible across
# re-runs and kernel restarts without touching NumPy's global random state.
predicts = np.random.RandomState(42).randint(0, 2, rows)
# Gold labels the predictions are scored against.
gold = test['labels']

In [15]:
# Score `predicts` against `gold`. Every group prints recall, precision and F1
# (in that order); groups are separated by a blank line.
score_settings = [
    {"pos_label": 1},         # class 1
    {"pos_label": 0},         # class 0
    {"average": "macro"},     # macro average: unweighted mean over both classes
    {"average": "weighted"},  # support-weighted average (this is NOT the micro average)
]

for group_index, settings in enumerate(score_settings):
    if group_index:
        print()
    # `pos_label` is only meaningful for the default binary averaging, so it is
    # omitted for the averaged groups.
    for scorer in (recall_score, precision_score, f1_score):
        print(scorer(gold, predicts, **settings))

0.4791666666666667
0.2764423076923077
0.350609756097561

0.5145161290322581
0.7184684684684685
0.599624060150376

0.49684139784946235
0.4974553880803881
0.47511690812396845

0.5046511627906977
0.5951123305774468
0.5301316962286601


In [16]:
# Majority baseline: predict label 0 for every test row.
predicts = np.full(rows, 0.0)

In [17]:
# Score `predicts` against `gold`. Every group prints recall, precision and F1
# (in that order); groups are separated by a blank line.
score_settings = [
    {"pos_label": 1},         # class 1
    {"pos_label": 0},         # class 0
    {"average": "macro"},     # macro average: unweighted mean over both classes
    {"average": "weighted"},  # support-weighted average (this is NOT the micro average)
]

for group_index, settings in enumerate(score_settings):
    if group_index:
        print()
    for scorer in (recall_score, precision_score, f1_score):
        # This baseline never predicts class 1, so precision/F1 for that class
        # are undefined. `zero_division=0` reports them as 0.0 (the value
        # scikit-learn falls back to anyway) without emitting a warning.
        print(scorer(gold, predicts, zero_division=0, **settings))

0.0
0.0
0.0

1.0
0.7209302325581395
0.8378378378378378

0.5
0.36046511627906974
0.4189189189189189

0.7209302325581395
0.519740400216333
0.6040226272784412


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Quest 3. Fine-tuning BERT for hate speech detection

In [18]:
# Build a BERT classifier from the "bert-base-cased" checkpoint; the GPU is
# used only when CUDA is available.
model = ClassificationModel(
    model_type="bert",
    model_name="bert-base-cased",
    use_cuda=torch.cuda.is_available(),
)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

In [19]:
# Fine-tune the classifier on the training frame; the value returned by
# `train_model` is shown as the cell output.
model.train_model(train_df=train)

  0%|          | 0/13240 [00:00<?, ?it/s]



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/1655 [00:00<?, ?it/s]

(1655, 0.4937190058728359)

### Run from here if you are short on time

In [34]:
import joblib

# Persist the fine-tuned model, then reload it from disk.
# The dump is skipped when no `model` exists in the kernel, so a fresh session
# can start from this cell and pick up the previously saved model instead of
# failing with a NameError.
saved_model_path = "outputs/best_model.sav"
if "model" in globals():
    joblib.dump(model, saved_model_path)

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code. Only load model files that were produced by this notebook.
model = joblib.load(saved_model_path)

In [36]:
# Preview the first five test rows.
test.head(n=5)

Unnamed: 0,id,text,labels
0,15923,#WhoIsQ #WheresTheServer #DumpNike #DECLASFISA...,1
1,27014,"#ConstitutionDay is revered by Conservatives, ...",0
2,30530,#FOXNews #NRA #MAGA #POTUS #TRUMP #2ndAmendmen...,0
3,13876,#Watching #Boomer getting the news that she is...,0
4,60133,#NoPasaran: Unity demo to oppose the far-right...,1


In [39]:
# Run the model over the raw test texts, passed as a plain Python list.
predicts = model.predict(to_predict=test['text'].tolist())

  0%|          | 0/860 [00:00<?, ?it/s]

  0%|          | 0/108 [00:00<?, ?it/s]

In [40]:
# Gold test labels as a plain Python list.
gold = test['labels'].to_list()

In [43]:
# `model.predict` returns a (predictions, raw_outputs) tuple; keep only the
# predicted labels. The guard makes this cell safe to re-run: once `predicts`
# already holds the labels, indexing `[0]` again would leave a single
# prediction instead of the full array.
if isinstance(predicts, tuple):
    predicts = predicts[0]

# Score `predicts` against `gold`. Every group prints recall, precision and F1
# (in that order); groups are separated by a blank line.
score_settings = [
    {"pos_label": 1},         # class 1
    {"pos_label": 0},         # class 0
    {"average": "macro"},     # macro average: unweighted mean over both classes
    {"average": "weighted"},  # support-weighted average (this is NOT the micro average)
]

for group_index, settings in enumerate(score_settings):
    if group_index:
        print()
    # `pos_label` is only meaningful for the default binary averaging, so it is
    # omitted for the averaged groups.
    for scorer in (recall_score, precision_score, f1_score):
        print(scorer(gold, predicts, **settings))

0.6666666666666666
0.7766990291262136
0.7174887892376681

0.9258064516129032
0.8776758409785933
0.901098901098901

0.7962365591397849
0.8271874350524034
0.8092938451682845

0.8534883720930233
0.8494962655779293
0.8498588698818128


In [44]:
from sklearn.metrics import confusion_matrix

In [48]:
# Confusion matrix of the gold labels against the model's predictions.
confusion_matrix(y_true=gold, y_pred=predicts)

array([[574,  46],
       [ 80, 160]], dtype=int64)