In [3]:
import numpy as np

import pandas as pd

from NNs import NeuralNetwork

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Embedding export under analysis. Alternative exports that can be swapped in:
#   'data/processed_bert.csv', 'data/processed_fine_pool_bert.csv'
df = pd.read_csv('data/processed_fine_encode4_bert.csv')

print(df.head())

# Columns "0".."767" hold the embedding features; "target" is the label.
feature_columns = list(map(str, range(768)))
data = df[feature_columns + ["target"]].values

# The label is the last column of `data`; everything before it is a feature.
text_embeddings, targets = data[:, :-1], data[:, -1]

          0         1         2         3         4         5         6  \
0  0.487306 -0.395449  0.227371 -0.557317 -0.071077 -0.160110 -0.986834   
1  0.185772 -0.816681 -0.999828 -0.292069  0.995954 -0.779973 -0.969660   
2  0.551382 -0.382743 -0.996989 -0.072605  0.969204 -0.422071 -0.918933   
3  0.428486 -0.747063 -0.999662 -0.425291  0.983544 -0.670548 -0.982080   
4  0.792350 -0.606614 -0.993326 -0.723623  0.978465 -0.495949 -0.996044   

          7         8         9  ...       760       761       762       763  \
0  0.195573  0.151923  0.948185  ...  0.165386 -0.737758  0.477984  0.822272   
1  0.831493 -0.999127 -0.866050  ...  0.999826 -0.940624 -0.942294 -0.900074   
2  0.404051 -0.989704 -0.951597  ...  0.998382 -0.825524 -0.105512 -0.669057   
3  0.747862 -0.997347 -0.849096  ...  0.999540 -0.921272 -0.704638 -0.817295   
4  0.553705 -0.972228  0.881664  ...  0.998808 -0.952635 -0.463717 -0.563676   

        764       765       766       767  target  \
0 -0.803042 -0.

In [5]:
from sklearn.model_selection import train_test_split

# Class balance of the full data set (targets are 0/1, so the sum counts the positives).
num_positive = np.sum(targets)
print(f"Number of positive samples: {num_positive}")
print(f"Number of negative samples: {len(targets) - num_positive}")

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    text_embeddings,
    targets,
    test_size=0.2,
    random_state=42,
)

Number of positive samples: 3271.0
Number of negative samples: 4342.0


## Logistic Regression

In [6]:
from sklearn.linear_model import LogisticRegression

# Fit a seeded logistic regression on the training split.
clf = LogisticRegression(random_state=0, max_iter=1000)
clf.fit(X_train, y_train)

# Mean accuracy on each split.
train_acc = clf.score(X_train, y_train)
test_acc = clf.score(X_test, y_test)

# Predictions for every row (train and test alike), kept for the
# cross-model error comparison.
LR_predictions = clf.predict(text_embeddings)

print(f'Train accuracy: {train_acc}')
print(f'Test accuracy: {test_acc}')

Train accuracy: 0.9154351395730707
Test accuracy: 0.8975705843729481


## Random Forest Classifier

In [7]:
from sklearn.ensemble import RandomForestClassifier

# A random forest draws bootstrap samples and feature subsets at random, so the
# seed is pinned (same value as the logistic-regression cell) to make the
# accuracies and RF_predictions repeatable across kernel restarts.
rf_clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=200,
    n_jobs=10,
    random_state=0,
).fit(X_train, y_train)

# Mean accuracy on each split.
train_acc = rf_clf.score(X_train, y_train)
test_acc = rf_clf.score(X_test, y_test)

# Predictions for every row (train and test alike), kept for the
# cross-model error comparison.
RF_predictions = rf_clf.predict(text_embeddings)

print(f'Train accuracy: {train_acc}')
print(f'Test accuracy: {test_acc}')

Train accuracy: 0.9977011494252873
Test accuracy: 0.8942875902823375


## SVM

In [8]:
from sklearn.svm import SVC

# Support-vector classifier with regularisation strength C=6 (other settings default).
svm_clf = SVC(C=6)
svm_clf.fit(X_train, y_train)

# Mean accuracy on each split.
train_acc = svm_clf.score(X_train, y_train)
test_acc = svm_clf.score(X_test, y_test)

# Predictions for every row (train and test alike), kept for the
# cross-model error comparison.
SVM_predictions = svm_clf.predict(text_embeddings)

print(f'Train accuracy: {train_acc}')
print(f'Test accuracy: {test_acc}')

Train accuracy: 0.9055829228243021
Test accuracy: 0.9041365725541693


## Neural Network

In [9]:
from torch.optim import Adam
import torch
import skorch

from skorch.callbacks import EarlyStopping, Checkpoint

num_epochs = 100
checkpoint_dir = 'checkpoints/'

# Seed torch's RNGs so that weight initialisation (and any other torch-level
# randomness inside NeuralNetwork) repeats across kernel restarts.
torch.manual_seed(0)

# Keep a handle on the Checkpoint callback so the best weights can be reloaded
# through it instead of re-deriving its file names by hand.
checkpoint = Checkpoint(
    monitor='valid_acc_best',
    f_params='DisasterClassifier.pt',
    dirname=checkpoint_dir,
)

callbacks = [
    # Stop once validation accuracy has not improved by at least 0.001
    # (absolute) for 20 consecutive epochs.
    EarlyStopping(
        patience=20,
        threshold=0.001,
        threshold_mode='abs',
        monitor='valid_acc',
        lower_is_better=False,
    ),
    # Snapshot the model whenever validation accuracy reaches a new best.
    checkpoint,
]

net = skorch.NeuralNetBinaryClassifier(
    NeuralNetwork,
    module__input_size=768,
    module__hidden_size=100,
    module__output_size=1,
    module__num_layers=4,
    optimizer=Adam,
    optimizer__weight_decay=0.00001,
    lr=0.0001,
    max_epochs=num_epochs,
    batch_size=32,
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    callbacks=callbacks,
)

# The net is fed float32 inputs and labels; cast once and reuse the arrays.
X_train_f32 = X_train.astype(np.float32)
y_train_f32 = y_train.astype(np.float32)
X_test_f32 = X_test.astype(np.float32)
y_test_f32 = y_test.astype(np.float32)

net.fit(X_train_f32, y_train_f32)

# Training may run past the best epoch before early stopping fires, so restore
# the state saved at the best validation accuracy before evaluating.
net.load_params(checkpoint=checkpoint)

train_acc = net.score(X_train_f32, y_train_f32)
test_acc = net.score(X_test_f32, y_test_f32)

# Predictions for every row (train and test alike), kept for the
# cross-model error comparison.
NN_predictions = net.predict(text_embeddings.astype(np.float32))

print(f'Train accuracy: {train_acc}')
print(f'Test accuracy: {test_acc}')


  epoch    train_loss    valid_acc    valid_loss    cp     dur
-------  ------------  -----------  ------------  ----  ------
      1        [36m0.3249[0m       [32m0.8949[0m        [35m0.2664[0m     +  0.5791
      2        [36m0.2893[0m       0.8892        [35m0.2661[0m        0.3952
      3        [36m0.2846[0m       0.8900        0.2665        0.3939
      4        [36m0.2814[0m       0.8933        [35m0.2657[0m        0.3935
      5        [36m0.2779[0m       0.8933        0.2668        0.4939
      6        [36m0.2756[0m       0.8933        0.2675        0.3938
      7        [36m0.2738[0m       [32m0.8957[0m        0.2681     +  0.4216
      8        [36m0.2718[0m       0.8949        0.2685        0.3936
      9        [36m0.2701[0m       0.8949        0.2683        0.3942
     10        [36m0.2688[0m       0.8924        0.2681        0.3930
     11        [36m0.2673[0m       0.8924        0.2684        0.3929
     12        [36m0.2660[0m       

# Finding Common Incorrect Predictions

In [10]:
# Rows that every one of the four models gets wrong. Each *_predictions array
# was computed on the full text_embeddings matrix, so the indices line up with
# the rows of `df` / `targets` (train and test rows alike).
model_predictions = (LR_predictions, RF_predictions, SVM_predictions, NN_predictions)
all_models_wrong = np.logical_and.reduce(
    [predictions != targets for predictions in model_predictions]
)
common_incorrects = np.where(all_models_wrong)

# Show each commonly misclassified text next to the four predictions and the label.
texts = df['text']
for idx in common_incorrects[0]:
    print(texts.iloc[idx])
    print(f'LR: {LR_predictions[idx]}, RF: {RF_predictions[idx]}, SVM: {SVM_predictions[idx]}, NN: {NN_predictions[idx]}, Target: {targets[idx]}')

BigRigRadio Live Accident Awareness
LR: 0.0, RF: 0.0, SVM: 0.0, NN: 0, Target: 1.0
U.S National Park Services Tonto National Forest: Stop the Annihilation of the Salt River Wild Horse... https://t.co/sW1sBua3mN via @Change
LR: 0.0, RF: 0.0, SVM: 0.0, NN: 0, Target: 1.0
Armageddon https://t.co/uCSUDk3q1d
LR: 0.0, RF: 0.0, SVM: 0.0, NN: 0, Target: 1.0
Salvation Army hosts rally to reconnect fathers with children: The Salvation Army is hosting a back to school rallyÛ_ http://t.co/rDjpor3AZg
LR: 0.0, RF: 0.0, SVM: 0.0, NN: 0, Target: 1.0
Once again black men didn't make it that way. White men did so why are black men getting attacked  https://t.co/chkP0GfyNJ
LR: 1.0, RF: 1.0, SVM: 1.0, NN: 1, Target: 0.0
Chiasson Sens can't come to deal #ColoradoAvalanche #Avalanche http://t.co/2bk7laGMa9 http://t.co/bkDGCfsuiQ
LR: 0.0, RF: 0.0, SVM: 0.0, NN: 0, Target: 1.0
Avalanche City - Sunset http://t.co/48h3tLvLXr #nowplay #listen #radio
LR: 0.0, RF: 0.0, SVM: 0.0, NN: 0, Target: 1.0
STAR WARS POWER