# Intro to NLP: Assignment 2
## Part A
Nils Breeman, Sebastiaan Bye, Julius Wantenaar

In [13]:
import pandas as pd
import numpy as np
from simpletransformers.classification import ClassificationModel
import torch
from sklearn.metrics import (f1_score, recall_score, precision_score)
from sklearn.metrics import confusion_matrix
import joblib

In [2]:
torch.cuda.is_available()

True

In [4]:
train = pd.read_csv("data/olid-train.csv")
test = pd.read_csv("data/olid-test.csv")
diagnostics = pd.read_csv("data/olid-subset-diagnostic-tests.csv")

### Question 1. Class distributions

In [5]:
print(f"Number of instance with class label 1: {len(train.loc[train['labels'] == 1])}")
print(f"Relative label frequency with class label 1: {len(train.loc[train['labels'] == 1]) / len(train)}")
print(f"Example sentence: {train.loc[train['labels'] == 1]['text'][3]}")
print()
print(f"Number of instance with class label 2: {len(train.loc[train['labels'] == 0])}")
print(f"Relative label frequency with class label 2: {len(train.loc[train['labels'] == 0]) / len(train)}")
print(f"Example sentence: {train.loc[train['labels'] == 0]['text'][2]}")


Number of instance with class label 1: 4400
Relative label frequency with class label 1: 0.3323262839879154
Example sentence: @USER Someone should'veTaken" this piece of shit to a volcano. 😂"

Number of instance with class label 2: 8840
Relative label frequency with class label 2: 0.6676737160120846
Example sentence: Amazon is investigating Chinese employees who are selling internal data to third-party sellers looking for an edge in the competitive marketplace. URL #Amazon #MAGA #KAG #CHINA #TCOT


### Question 2. Baselines

In [6]:
rows = test.shape[0]
predicts = np.random.randint(0,2, rows)
gold = test['labels']

one_recall = []
one_precision = []
one_f1 = []

zero_recall = []
zero_precision = []
zero_f1 = []

macro_recall = []
macro_precision = []
macro_f1 = []

micro_recall = []
micro_precision = []
micro_f1 = []
for i in range(1000):
    # class 1
    one_recall.append(recall_score(gold, predicts, pos_label=1))
    one_precision.append(precision_score(gold, predicts, pos_label=1))
    one_f1.append(f1_score(gold, predicts, pos_label=1))


    # class 0
    zero_recall.append(recall_score(gold, predicts, pos_label=0))
    zero_precision.append(precision_score(gold, predicts, pos_label=0))
    zero_f1.append(f1_score(gold, predicts, pos_label=0))


    # macro average
    one_recall.append(recall_score(gold, predicts, pos_label=1))
    one_precision.append(precision_score(gold, predicts, pos_label=1))
    one_f1.append(f1_score(gold, predicts, pos_label=1))
    
    
    macro_recall.append(recall_score(gold, predicts, pos_label=1, average='macro'))
    macro_precision.append(precision_score(gold, predicts, pos_label=1, average='macro'))
    macro_f1.append(f1_score(gold, predicts, pos_label=1, average='macro'))


    # micro average
    micro_recall.append(recall_score(gold, predicts, pos_label=1, average='weighted'))
    micro_precision.append(precision_score(gold, predicts, pos_label=1, average='weighted'))
    micro_f1.append(f1_score(gold, predicts, pos_label=1, average='weighted'))

In [8]:
print("Random Baseline")
print('Class 1')
print(f'Precision: {np.mean(one_precision)}')
print(f'Recall: {np.mean(one_recall)}')
print(f'F1: {np.mean(one_f1)}')

print()
print('Class 0')
print(f'Precision {np.mean(zero_precision)}')
print(f'Recall {np.mean(zero_recall)}')
print(f'F1{np.mean(zero_f1)}')

print()
print('Macro-average')
print(f'Precision {np.mean(macro_precision)}')
print(f'Recall {np.mean(macro_recall)}')
print(f'F1 {np.mean(macro_f1)}')

print('Weighted average')
print(f'Precision {np.mean(micro_precision)}')
print(f'Recall {np.mean(micro_recall)}')
print(f'F1 {np.mean(micro_f1)}')

Random Baseline
Class 1
Precision: 0.28365384615384626
Recall: 0.4916666666666665
F1: 0.35975609756097554

Class 0
Precision 0.7252252252252253
Recall 0.5193548387096775
F10.6052631578947368

Macro-average
Precision 0.5044395356895358
Recall 0.5055107526881721
F1 0.482509627727856
Weighted average
Precision 0.6019960031587939
Recall 0.5116279069767442
F1 0.5367495596620593


In [9]:
## Majority
predicts = np.zeros(rows)

In [11]:
print('class 1')
print(f'precision {precision_score(gold, predicts, pos_label=1)}')
print(f'recall {recall_score(gold, predicts, pos_label=1)}')
print(f'f1 score {f1_score(gold, predicts, pos_label=1)}')

print()
print('class 0')
print(f'precision {precision_score(gold, predicts, pos_label=0)}')
print(f'recall {recall_score(gold, predicts, pos_label=0)}')
print(f'f1 score {f1_score(gold, predicts, pos_label=0)}')

# macro average
print()
print(precision_score(gold, predicts, pos_label=1, average='macro'))
print(recall_score(gold, predicts, pos_label=1, average='macro'))
print(f1_score(gold, predicts, pos_label=1, average='macro'))


# micro average
print()
print(precision_score(gold, predicts, pos_label=1, average='weighted'))
print(recall_score(gold, predicts, pos_label=1, average='weighted'))
print(f1_score(gold, predicts, pos_label=1, average='weighted'))

class 1
precision 0.0
recall 0.0
f1 score 0.0

class 0
precision 0.7209302325581395
recall 1.0
f1 score 0.8378378378378378

0.36046511627906974
0.5
0.4189189189189189

0.519740400216333
0.7209302325581395
0.6040226272784412


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Question 3. Classification by fine-tuning BERT

In [None]:
model = ClassificationModel("bert", "bert-base-cased", use_cuda=torch.cuda.is_available())

In [None]:
model.train_model(train)

In [15]:
model = joblib.load("outputs/best_model.sav")

In [16]:
predicts = model.predict(test['text'].tolist())
predicts = predicts[0]

  0%|          | 0/860 [00:00<?, ?it/s]

  0%|          | 0/108 [00:00<?, ?it/s]

In [17]:
gold = test['labels'].tolist()

In [19]:
print('class 1')
print(f'precision {precision_score(gold, predicts, pos_label=1)}')
print(f'recall {recall_score(gold, predicts, pos_label=1)}')
print(f'f1 score {f1_score(gold, predicts, pos_label=1)}')

print()
print('class 0')
print(f'Precision {precision_score(gold, predicts, pos_label=0)}')
print(f'Recall {recall_score(gold, predicts, pos_label=0)}')
print(f'F1 score {f1_score(gold, predicts, pos_label=0)}')

print()
print('macro average')
print(precision_score(gold, predicts, pos_label=1, average='macro'))
print(recall_score(gold, predicts, pos_label=1, average='macro'))
print(f1_score(gold, predicts, pos_label=1, average='macro'))

print()
print('micro average')
print(precision_score(gold, predicts, pos_label=1, average='weighted'))
print(recall_score(gold, predicts, pos_label=1, average='weighted'))
print(f1_score(gold, predicts, pos_label=1, average='weighted'))

class 1
precision 0.7766990291262136
recall 0.6666666666666666
f1 score 0.7174887892376681

class 0
Precision 0.8776758409785933
Recall 0.9258064516129032
F1 score 0.901098901098901

macro average
0.8271874350524034
0.7962365591397849
0.8092938451682845

micro average
0.8494962655779293
0.8534883720930233
0.8498588698818128


In [20]:
confusion_matrix(  gold, predicts)

array([[574,  46],
       [ 80, 160]], dtype=int64)

### Question 4. Inspect the tokenization

In [26]:
'''
Part a
'''

one_string = " ".join(train['text'].tolist())
tokens = one_string.split(" ")
print(f"Number of tokens: {len(tokens)}")

Number of tokens: 300836


In [32]:
after_splits = model.tokenizer.tokenize(one_string)
print(f"Number of words split: {len(set(tokens) - set(after_splits))}")

Number of words split: 29717


In [28]:
'''
b. 
'''
#print(f"Average number {(len(after_split) - len(tokens)) / len(set(tokens) - set(after_split))}")
## Final answer
count_all_tokens = 0
count_split_words = 0
count_subwords = 0
for text in train['text']:
    toks = text.split(" ")
    count_all_tokens += len(toks)
    after_split = []
    for tok in toks:
        split_token = model.tokenizer.tokenize(tok)
        after_split.append(split_token)
        if len(split_token) > 1:
            count_split_words += 1
            count_subwords += len(split_token)

print('average number of subwords per token: ', count_subwords / count_split_words)


average number of subwords per token:  2.9271822752127963


In [30]:
'''
c. Not meaningful: Pick manually not meaningful
'''
tokens

['@USER',
 'She',
 'should',
 'ask',
 'a',
 'few',
 'native',
 'Americans',
 'what',
 'their',
 'take',
 'on',
 'this',
 'is.',
 '@USER',
 '@USER',
 'Go',
 'home',
 'you’re',
 'drunk!!!',
 '@USER',
 '#MAGA',
 '#Trump2020',
 '👊🇺🇸👊',
 'URL',
 'Amazon',
 'is',
 'investigating',
 'Chinese',
 'employees',
 'who',
 'are',
 'selling',
 'internal',
 'data',
 'to',
 'third-party',
 'sellers',
 'looking',
 'for',
 'an',
 'edge',
 'in',
 'the',
 'competitive',
 'marketplace.',
 'URL',
 '#Amazon',
 '#MAGA',
 '#KAG',
 '#CHINA',
 '#TCOT',
 '@USER',
 'Someone',
 'should\'veTaken"',
 'this',
 'piece',
 'of',
 'shit',
 'to',
 'a',
 'volcano.',
 '😂"',
 '@USER',
 '@USER',
 'Obama',
 'wanted',
 'liberals',
 '&amp;',
 'illegals',
 'to',
 'move',
 'into',
 'red',
 'states',
 '@USER',
 'Liberals',
 'are',
 'all',
 'Kookoo',
 '!!!',
 '@USER',
 '@USER',
 'Oh',
 'noes!',
 'Tough',
 'shit.',
 '@USER',
 'was',
 'literally',
 'just',
 'talking',
 'about',
 'this',
 'lol',
 'all',
 'mass',
 'shootings',
 'like',
 '

In [33]:
after_splits

['@',
 'US',
 '##ER',
 'She',
 'should',
 'ask',
 'a',
 'few',
 'native',
 'Americans',
 'what',
 'their',
 'take',
 'on',
 'this',
 'is',
 '.',
 '@',
 'US',
 '##ER',
 '@',
 'US',
 '##ER',
 'Go',
 'home',
 'you',
 '’',
 're',
 'drunk',
 '!',
 '!',
 '!',
 '@',
 'US',
 '##ER',
 '#',
 'MA',
 '##GA',
 '#',
 'Trump',
 '##20',
 '##20',
 '[UNK]',
 'U',
 '##RL',
 'Amazon',
 'is',
 'investigating',
 'Chinese',
 'employees',
 'who',
 'are',
 'selling',
 'internal',
 'data',
 'to',
 'third',
 '-',
 'party',
 'seller',
 '##s',
 'looking',
 'for',
 'an',
 'edge',
 'in',
 'the',
 'competitive',
 'marketplace',
 '.',
 'U',
 '##RL',
 '#',
 'Amazon',
 '#',
 'MA',
 '##GA',
 '#',
 'K',
 '##AG',
 '#',
 'CH',
 '##IN',
 '##A',
 '#',
 'T',
 '##CO',
 '##T',
 '@',
 'US',
 '##ER',
 'Someone',
 'should',
 "'",
 've',
 '##T',
 '##ake',
 '##n',
 '"',
 'this',
 'piece',
 'of',
 'shit',
 'to',
 'a',
 'volcano',
 '.',
 '[UNK]',
 '"',
 '@',
 'US',
 '##ER',
 '@',
 'US',
 '##ER',
 'Obama',
 'wanted',
 'liberal',
 '##s',

In [35]:
'''
d. Longest subword
'''

length = 0
word=""
for key in model.tokenizer.vocab.keys():
    if len(key) > 2:
        if key[0] == "#" and key[1] == "#":
            if len(key) > length:
                length=len(key)
                word = key
                
print(length)
print(word)

16
##sunderstanding
