In [1]:
import pandas as pd
import numpy as np
from simpletransformers.classification import ClassificationModel

In [2]:
import torch
torch.cuda.is_available()

True

### Read in data

In [3]:
train = pd.read_csv("data/olid-train.csv")
test = pd.read_csv("data/olid-test.csv")
diagnostics = pd.read_csv("data/olid-subset-diagnostic-tests.csv")

In [4]:
train.head()

Unnamed: 0,id,text,labels
0,86426,@USER She should ask a few native Americans wh...,1
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,1
2,16820,Amazon is investigating Chinese employees who ...,0
3,62688,"@USER Someone should'veTaken"" this piece of sh...",1
4,43605,@USER @USER Obama wanted liberals &amp; illega...,0


#### Quest 1. Some basic statistics

In [6]:
len(train.loc[train['labels'] == 1])

4400

In [7]:
len(train.loc[train['labels'] == 1]) / len(train)

0.3323262839879154

In [17]:
train.loc[train['labels'] == 1]['text'][3]

'@USER Someone should\'veTaken" this piece of shit to a volcano. 😂"'

In [9]:
len(train.loc[train['labels'] == 0])

8840

In [10]:
len(train.loc[train['labels'] == 0]) / len(train)

0.6676737160120846

In [11]:
train.loc[train['labels'] == 0]['text'][2]

'Amazon is investigating Chinese employees who are selling internal data to third-party sellers looking for an edge in the competitive marketplace. URL #Amazon #MAGA #KAG #CHINA #TCOT'

#### Quest 2. Baselines

In [18]:
from sklearn.metrics import (f1_score, recall_score, precision_score)

In [19]:
# Random baseline
rows = test.shape[0]

In [28]:
predicts = np.random.randint(0,2, rows)
gold = test['labels']

In [31]:
one_recall = []
one_precision = []
one_f1 = []

zero_recall = []
zero_precision = []
zero_f1 = []

macro_recall = []
macro_precision = []
macro_f1 = []

micro_recall = []
micro_precision = []
micro_f1 = []
for i in range(1000):
    # class 1
    one_recall.append(recall_score(gold, predicts, pos_label=1))
    one_precision.append(precision_score(gold, predicts, pos_label=1))
    one_f1.append(f1_score(gold, predicts, pos_label=1))


    # class 0
    zero_recall.append(recall_score(gold, predicts, pos_label=0))
    zero_precision.append(precision_score(gold, predicts, pos_label=0))
    zero_f1.append(f1_score(gold, predicts, pos_label=0))


    # macro average
    one_recall.append(recall_score(gold, predicts, pos_label=1))
    one_precision.append(precision_score(gold, predicts, pos_label=1))
    one_f1.append(f1_score(gold, predicts, pos_label=1))
    
    
    macro_recall.append(recall_score(gold, predicts, pos_label=1, average='macro'))
    macro_precision.append(precision_score(gold, predicts, pos_label=1, average='macro'))
    macro_f1.append(f1_score(gold, predicts, pos_label=1, average='macro'))


    # micro average
    micro_recall.append(recall_score(gold, predicts, pos_label=1, average='weighted'))
    micro_precision.append(precision_score(gold, predicts, pos_label=1, average='weighted'))
    micro_f1.append(f1_score(gold, predicts, pos_label=1, average='weighted'))

In [33]:
print(np.mean(one_precision))
print(np.mean(one_recall))
print(np.mean(one_f1))

print()
print(np.mean(zero_precision))
print(np.mean(zero_recall))
print(np.mean(zero_f1))

print()
print(np.mean(macro_precision))
print(np.mean(macro_recall))
print(np.mean(macro_f1))

print()
print(np.mean(micro_precision))
print(np.mean(micro_recall))
print(np.mean(micro_f1))

0.297423887587822
0.5291666666666668
0.3808095952023987

0.7390300230946883
0.5161290322580643
0.6077872744539411

0.5182269553412552
0.5226478494623654
0.49429843482816976

0.6157911015578884
0.5197674418604652
0.54444466629072


In [34]:
## Majority
predicts = np.zeros(rows)

In [36]:
# class 1
print(precision_score(gold, predicts, pos_label=1))
print(recall_score(gold, predicts, pos_label=1))
print(f1_score(gold, predicts, pos_label=1))


# class 0
print()
print(precision_score(gold, predicts, pos_label=0))
print(recall_score(gold, predicts, pos_label=0))
print(f1_score(gold, predicts, pos_label=0))

# macro average
print()
print(precision_score(gold, predicts, pos_label=1, average='macro'))
print(recall_score(gold, predicts, pos_label=1, average='macro'))
print(f1_score(gold, predicts, pos_label=1, average='macro'))


# micro average
print()
print(precision_score(gold, predicts, pos_label=1, average='weighted'))
print(recall_score(gold, predicts, pos_label=1, average='weighted'))
print(f1_score(gold, predicts, pos_label=1, average='weighted'))

0.0
0.0
0.0

0.7209302325581395
1.0
0.8378378378378378

0.36046511627906974
0.5
0.4189189189189189

0.519740400216333
0.7209302325581395
0.6040226272784412


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Quest 3. Fine-tuning BERT for hate speech detection

In [37]:
model = ClassificationModel("bert", "bert-base-cased", use_cuda=torch.cuda.is_available())

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [19]:
model.train_model(train)

  0%|          | 0/13240 [00:00<?, ?it/s]



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/1655 [00:00<?, ?it/s]

(1655, 0.4937190058728359)

### Run from here if time is of value

In [38]:
import joblib
#joblib.dump(model, "outputs/best_model.sav")
model = joblib.load("outputs/best_model.sav")

In [39]:
test.head()

Unnamed: 0,id,text,labels
0,15923,#WhoIsQ #WheresTheServer #DumpNike #DECLASFISA...,1
1,27014,"#ConstitutionDay is revered by Conservatives, ...",0
2,30530,#FOXNews #NRA #MAGA #POTUS #TRUMP #2ndAmendmen...,0
3,13876,#Watching #Boomer getting the news that she is...,0
4,60133,#NoPasaran: Unity demo to oppose the far-right...,1


In [44]:
predicts = model.predict(test['text'].tolist())
predicts = predicts[0]

  0%|          | 0/860 [00:00<?, ?it/s]

  0%|          | 0/108 [00:00<?, ?it/s]

In [45]:
gold = test['labels'].tolist()

In [46]:
# class 1
print(precision_score(gold, predicts, pos_label=1))
print(recall_score(gold, predicts, pos_label=1))
print(f1_score(gold, predicts, pos_label=1))


# class 0
print()
print(precision_score(gold, predicts, pos_label=0))
print(recall_score(gold, predicts, pos_label=0))
print(f1_score(gold, predicts, pos_label=0))

# macro average
print()
print(precision_score(gold, predicts, pos_label=1, average='macro'))
print(recall_score(gold, predicts, pos_label=1, average='macro'))
print(f1_score(gold, predicts, pos_label=1, average='macro'))


# micro average
print()
print(precision_score(gold, predicts, pos_label=1, average='weighted'))
print(recall_score(gold, predicts, pos_label=1, average='weighted'))
print(f1_score(gold, predicts, pos_label=1, average='weighted'))

0.7766990291262136
0.6666666666666666
0.7174887892376681

0.8776758409785933
0.9258064516129032
0.901098901098901

0.8271874350524034
0.7962365591397849
0.8092938451682845

0.8494962655779293
0.8534883720930233
0.8498588698818128


In [47]:
from sklearn.metrics import confusion_matrix

In [48]:
confusion_matrix(  gold, predicts)

array([[574,  46],
       [ 80, 160]], dtype=int64)

#### Quest 4. Tokenization within BERT

In [49]:
one_string = " ".join(train['text'].tolist())

In [50]:
tokens = one_string.split(" ")

In [51]:
print(f"tokens: {len(tokens)}")

tokens: 300836


In [52]:
after_split = model.tokenizer.tokenize(one_string)

Token indices sequence length is longer than the specified maximum sequence length for this model (478955 > 512). Running this sequence through the model will result in indexing errors


In [53]:
print(f"words split: {len(set(tokens) - set(after_split))}")

words split: 29717


### average split

In [56]:
(len(after_split) - len(tokens)) / len(set(tokens) - set(after_split))

5.993841908671804

### Meaningful splits

In [58]:
tokens

['@USER',
 'She',
 'should',
 'ask',
 'a',
 'few',
 'native',
 'Americans',
 'what',
 'their',
 'take',
 'on',
 'this',
 'is.',
 '@USER',
 '@USER',
 'Go',
 'home',
 'you’re',
 'drunk!!!',
 '@USER',
 '#MAGA',
 '#Trump2020',
 '👊🇺🇸👊',
 'URL',
 'Amazon',
 'is',
 'investigating',
 'Chinese',
 'employees',
 'who',
 'are',
 'selling',
 'internal',
 'data',
 'to',
 'third-party',
 'sellers',
 'looking',
 'for',
 'an',
 'edge',
 'in',
 'the',
 'competitive',
 'marketplace.',
 'URL',
 '#Amazon',
 '#MAGA',
 '#KAG',
 '#CHINA',
 '#TCOT',
 '@USER',
 'Someone',
 'should\'veTaken"',
 'this',
 'piece',
 'of',
 'shit',
 'to',
 'a',
 'volcano.',
 '😂"',
 '@USER',
 '@USER',
 'Obama',
 'wanted',
 'liberals',
 '&amp;',
 'illegals',
 'to',
 'move',
 'into',
 'red',
 'states',
 '@USER',
 'Liberals',
 'are',
 'all',
 'Kookoo',
 '!!!',
 '@USER',
 '@USER',
 'Oh',
 'noes!',
 'Tough',
 'shit.',
 '@USER',
 'was',
 'literally',
 'just',
 'talking',
 'about',
 'this',
 'lol',
 'all',
 'mass',
 'shootings',
 'like',
 '

In [57]:
after_split

['@',
 'US',
 '##ER',
 'She',
 'should',
 'ask',
 'a',
 'few',
 'native',
 'Americans',
 'what',
 'their',
 'take',
 'on',
 'this',
 'is',
 '.',
 '@',
 'US',
 '##ER',
 '@',
 'US',
 '##ER',
 'Go',
 'home',
 'you',
 '’',
 're',
 'drunk',
 '!',
 '!',
 '!',
 '@',
 'US',
 '##ER',
 '#',
 'MA',
 '##GA',
 '#',
 'Trump',
 '##20',
 '##20',
 '[UNK]',
 'U',
 '##RL',
 'Amazon',
 'is',
 'investigating',
 'Chinese',
 'employees',
 'who',
 'are',
 'selling',
 'internal',
 'data',
 'to',
 'third',
 '-',
 'party',
 'seller',
 '##s',
 'looking',
 'for',
 'an',
 'edge',
 'in',
 'the',
 'competitive',
 'marketplace',
 '.',
 'U',
 '##RL',
 '#',
 'Amazon',
 '#',
 'MA',
 '##GA',
 '#',
 'K',
 '##AG',
 '#',
 'CH',
 '##IN',
 '##A',
 '#',
 'T',
 '##CO',
 '##T',
 '@',
 'US',
 '##ER',
 'Someone',
 'should',
 "'",
 've',
 '##T',
 '##ake',
 '##n',
 '"',
 'this',
 'piece',
 'of',
 'shit',
 'to',
 'a',
 'volcano',
 '.',
 '[UNK]',
 '"',
 '@',
 'US',
 '##ER',
 '@',
 'US',
 '##ER',
 'Obama',
 'wanted',
 'liberal',
 '##s',

### Vocab

In [62]:
length = 0
word=""
for key in model.tokenizer.vocab.keys():
    if len(key) > length:
        length=len(key)
        word = key

In [64]:
length

18

In [63]:
word

'Telecommunications'