In [51]:
import numpy as np

import pandas as pd

from NNs import NeuralNetwork

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [33]:
# Read the processed table from disk and preview its first rows.
df = pd.read_csv('data/processed_bert.csv')
print(df.head())

# Columns named "0" .. "767" are selected as features; "target" is appended
# last so that the label ends up in the final column of the array.
embedding_columns = [str(i) for i in range(768)]
data = df[embedding_columns + ["target"]].values

# Split the array column-wise: all but the last column are the inputs,
# the last column is the label.
text_embeddings, targets = data[:, :-1], data[:, -1]

          0         1         2         3         4         5         6  \
0 -0.822546 -0.472057 -0.579868  0.601334  0.001131 -0.033307  0.621883   
1 -0.932106 -0.444832 -0.946184  0.783482  0.649199 -0.417868  0.810984   
2 -0.786213 -0.454594 -0.989117  0.766382  0.825029 -0.260707  0.601579   
3 -0.929848 -0.641187 -0.984187  0.838084  0.698074 -0.439465  0.848216   
4 -0.782167 -0.475203 -0.785371  0.579609  0.718675 -0.169704  0.333027   

          7         8         9  ...       760       761       762       763  \
0  0.219169 -0.297440 -0.999933  ...  0.746353  0.789280  0.454223  0.652186   
1  0.553593 -0.841144 -0.999991  ...  0.972591  0.666550 -0.809021  0.077952   
2  0.324533 -0.961853 -0.999995  ...  0.992954  0.505823 -0.130431 -0.429334   
3  0.544355 -0.940431 -0.999996  ...  0.995336  0.807030 -0.736760 -0.092651   
4  0.279532 -0.594330 -0.999871  ...  0.968012  0.792530  0.080506  0.567232   

        764       765       766       767  target  \
0  0.384898 -0.

In [34]:
from sklearn.model_selection import train_test_split


# Report the class balance. This treats `targets` as 0/1 labels, so the sum
# is the number of positive rows and the remainder is the negatives.
print(f"Number of positive samples: {np.sum(targets)}")
print(f"Number of negative samples: {len(targets) - np.sum(targets)}")

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
split_parts = train_test_split(
    text_embeddings,
    targets,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

Number of positive samples: 3271.0
Number of negative samples: 4342.0


## Logistic Regression

In [35]:
from sklearn.linear_model import LogisticRegression

# Build the logistic-regression model, then fit it on the training split.
clf = LogisticRegression(random_state=0, max_iter=1000)
clf.fit(X_train, y_train)

# Predict every row (training and test rows alike); these predictions are
# compared against the other models' predictions further down the notebook.
LR_predictions = clf.predict(text_embeddings)

# Accuracy on the training split and on the held-out split.
train_acc, test_acc = clf.score(X_train, y_train), clf.score(X_test, y_test)

print(f'Train accuracy: {train_acc}')
print(f'Test accuracy: {test_acc}')

Train accuracy: 0.8288998357963875
Test accuracy: 0.8056467498358503


## Random Forest Classifier

In [36]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the training split.
# The seed is fixed (same value as the logistic-regression cell) because the
# forest draws bootstrap samples and random feature subsets; without it the
# accuracies printed below and RF_predictions change on every re-run.
rf_clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=200,
    n_jobs=10,
    random_state=0,
).fit(X_train, y_train)

# Accuracy on the training split and on the held-out split.
train_acc = rf_clf.score(X_train, y_train)
test_acc = rf_clf.score(X_test, y_test)

# Predict every row (training and test rows alike); these predictions are
# compared against the other models' predictions further down the notebook.
RF_predictions = rf_clf.predict(text_embeddings)

print(f'Train accuracy: {train_acc}')
print(f'Test accuracy: {test_acc}')

Train accuracy: 0.9977011494252873
Test accuracy: 0.7754432042022325


## SVM

In [37]:
from sklearn.svm import SVC

# Build the support-vector classifier, then fit it on the training split.
svm_clf = SVC(C=6)
svm_clf.fit(X_train, y_train)

# Predict every row (training and test rows alike); these predictions are
# compared against the other models' predictions further down the notebook.
SVM_predictions = svm_clf.predict(text_embeddings)

# Accuracy on the training split and on the held-out split.
train_acc, test_acc = svm_clf.score(X_train, y_train), svm_clf.score(X_test, y_test)

print(f'Train accuracy: {train_acc}')
print(f'Test accuracy: {test_acc}')

Train accuracy: 0.8037766830870279
Test accuracy: 0.8089297439264609


## Neural Network

In [54]:
from torch.optim import Adam
import torch
import skorch

from skorch.callbacks import EarlyStopping, Checkpoint

num_epochs = 100
checkpoint_dir = 'checkpoints/'

# Seed torch before the network is built so that weight initialisation (and
# therefore the training curve and NN_predictions) is repeatable across
# re-runs. NOTE: some CUDA kernels may still be nondeterministic.
torch.manual_seed(0)

# Stop once validation accuracy has not improved by at least 0.001 (absolute)
# for 20 consecutive epochs.
early_stopping = EarlyStopping(
    patience=20,
    threshold=0.001,
    threshold_mode='abs',
    monitor='valid_acc',
    lower_is_better=False,
)

# Save the model each time validation accuracy reaches a new best.
checkpoint = Checkpoint(
    monitor='valid_acc_best',
    f_params='DisasterClassifier.pt',
    dirname=checkpoint_dir,
)

callbacks = [early_stopping, checkpoint]

net = skorch.NeuralNetBinaryClassifier(
    NeuralNetwork,
    module__input_size=768,
    module__hidden_size=100,
    module__output_size=1,
    module__num_layers=8,
    optimizer=Adam,
    optimizer__weight_decay=0.0001,
    lr=0.0001,
    max_epochs=num_epochs,
    batch_size=32,
    device=torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu'),
    callbacks=callbacks
)

# Cast each array to float32 once instead of re-casting at every call site.
X_train_f32 = X_train.astype(np.float32)
y_train_f32 = y_train.astype(np.float32)
X_test_f32 = X_test.astype(np.float32)
y_test_f32 = y_test.astype(np.float32)

net.fit(X_train_f32, y_train_f32)

# Restore the best checkpoint written during training. Passing the callback
# itself lets skorch resolve the saved file paths, rather than re-building
# them here by string concatenation.
net.load_params(checkpoint=checkpoint)

# Accuracy on the training split and on the held-out split.
train_acc = net.score(X_train_f32, y_train_f32)
test_acc = net.score(X_test_f32, y_test_f32)

# Predict every row (training and test rows alike); these predictions are
# compared against the other models' predictions further down the notebook.
NN_predictions = net.predict(text_embeddings.astype(np.float32))

print(f'Train accuracy: {train_acc}')
print(f'Test accuracy: {test_acc}')


  epoch    train_loss    valid_acc    valid_loss    cp     dur
-------  ------------  -----------  ------------  ----  ------
      1        [36m0.7274[0m       [32m0.7167[0m        [35m0.5700[0m     +  0.4992
      2        [36m0.6071[0m       [32m0.7332[0m        [35m0.5502[0m     +  0.4874
      3        [36m0.5875[0m       [32m0.7430[0m        [35m0.5393[0m     +  0.4786
      4        [36m0.5680[0m       [32m0.7504[0m        [35m0.5244[0m     +  0.4736
      5        [36m0.5565[0m       [32m0.7586[0m        [35m0.5169[0m     +  0.4963
      6        [36m0.5451[0m       0.7586        [35m0.5058[0m        0.4970
      7        [36m0.5399[0m       [32m0.7652[0m        [35m0.5033[0m     +  0.4875
      8        [36m0.5311[0m       [32m0.7685[0m        [35m0.4952[0m     +  0.4904
      9        0.5347       [32m0.7718[0m        0.4974     +  0.4846
     10        [36m0.5265[0m       [32m0.7750[0m        [35m0.4938[0m     +  0.4876
 

# Finding Common Incorrect Predictions

In [57]:
# One boolean mask per model: True where that model's prediction differs
# from the label.
lr_wrong = LR_predictions != targets
rf_wrong = RF_predictions != targets
svm_wrong = SVM_predictions != targets
nn_wrong = NN_predictions != targets

# Rows that all four models got wrong. np.where on a 1-D mask returns a
# one-element tuple holding the matching row positions.
all_wrong = lr_wrong & rf_wrong & svm_wrong & nn_wrong
common_incorrects = np.where(all_wrong)

# Show the text of each such row followed by every model's prediction and
# the true label.
(wrong_rows,) = common_incorrects
for idx in wrong_rows:
    print(df['text'].iloc[idx])
    print(f'LR: {LR_predictions[idx]}, RF: {RF_predictions[idx]}, SVM: {SVM_predictions[idx]}, NN: {NN_predictions[idx]}, Target: {targets[idx]}')

BigRigRadio Live Accident Awareness
LR: 0.0, RF: 0.0, SVM: 0.0, NN: 0, Target: 1.0
the pastor was not in the scene of the accident......who was the owner of the range rover ?
LR: 0.0, RF: 0.0, SVM: 0.0, NN: 0, Target: 1.0
FYI CAD:FYI: ;ACCIDENT PROPERTY DAMAGE;WPD;1600 S 17TH ST
LR: 0.0, RF: 0.0, SVM: 0.0, NN: 0, Target: 1.0
*to Luka* They should all die! All of them! Everything annihilated! - Alois Trancy
LR: 1.0, RF: 1.0, SVM: 1.0, NN: 1, Target: 0.0
(To Luka) 'They should all die! All of them! Everything annihilated!' - Alois Trancy -
LR: 1.0, RF: 1.0, SVM: 1.0, NN: 1, Target: 0.0
@PhilipDuncan @breakfastone People 'annihilated by last nights weather'... Really Philip thought you would have forecast that...
LR: 0.0, RF: 0.0, SVM: 0.0, NN: 0, Target: 1.0
annihilating quarterstaff of annihilation
LR: 0.0, RF: 0.0, SVM: 0.0, NN: 0, Target: 1.0
Armageddon https://t.co/uCSUDk3q1d
LR: 0.0, RF: 0.0, SVM: 0.0, NN: 0, Target: 1.0
Salvation Army hosts rally to reconnect fathers with children: