In [1]:
import numpy as np

import pandas as pd

from NNs import NeuralNetwork

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import f1_score

%load_ext autoreload
%autoreload 2

In [2]:
#df = pd.read_csv('data/processed_bert.csv')
df = pd.read_csv('data/processed_fine_encode4_pool_bert.csv')
#df = pd.read_csv('data/processed_fine_pool_bert.csv')

print(df.head())

data = df[[str(i) for i in range(768)]+["target"]].values

text_embeddings = data[:,:-1]
targets = data[:,-1]

          0         1         2         3         4         5         6  \
0 -0.706173 -0.479326  0.695775  0.105923  0.223339 -0.020102  0.166236   
1 -0.292061 -0.203339 -0.012935 -0.440923  0.521172  0.085015  0.007540   
2 -0.131100  0.053226 -0.548904 -0.121370  0.658592 -0.089726 -0.297121   
3  0.128701  0.005418  0.173510 -0.634204  0.039833  0.456838 -0.580948   
4 -0.220905 -0.434828 -0.033333 -0.451370  0.842869  0.286226 -0.604444   

          7         8         9  ...       760       761       762       763  \
0 -0.118495  0.227647 -0.517054  ... -0.092164  0.139747 -0.496852  0.922823   
1 -0.240759 -0.673398  0.952635  ... -0.586681 -0.748257 -0.970952  0.007169   
2 -0.192829 -0.740907 -0.001359  ...  0.929475 -0.510744 -0.698830  0.517608   
3 -0.463875 -0.434695  0.994325  ... -0.331720 -0.740078 -0.848269  0.427743   
4 -0.219417 -0.062626  0.475039  ...  0.892794 -0.146139 -0.736430  0.861435   

        764       765       766       767  target  \
0 -0.391260  0.

In [3]:
from sklearn.model_selection import train_test_split


print(f"Number of positive samples: {np.sum(targets)}")
print(f"Number of negative samples: {len(targets) - np.sum(targets)}")

x = text_embeddings
y = targets


Number of positive samples: 3271.0
Number of negative samples: 4342.0


## Logistic Regression

In [4]:
from sklearn.linear_model import LogisticRegression

gs = GridSearchCV(LogisticRegression(max_iter=1000, n_jobs=2), param_grid={'C': [0.1]}, cv=5, n_jobs=8)

gs = gs.fit(x, y)

best_lr = gs.best_estimator_

cv_acc = gs.best_score_

LR_predictions = best_lr.predict(x)

print(f'Cross Validation accuracy: {cv_acc}')
print(f'Best LR: {gs.best_params_}')
print(f'F1 Score: {f1_score(y, LR_predictions)}')

Cross Validation accuracy: 0.9059525298899139
Best LR: {'C': 0.1}
F1 Score: 0.8948545861297538


## Random Forest Classifier

In [8]:
from sklearn.ensemble import RandomForestClassifier

gs = GridSearchCV(RandomForestClassifier(n_jobs=2), param_grid={'n_estimators': [300]}, cv=5, n_jobs=8)
gs = gs.fit(x, y)

best_rf = gs.best_estimator_

cv_acc = gs.best_score_

RF_predictions = best_rf.predict(x)

print(f'Cross Validation accuracy: {cv_acc}')
print(f'Best RF params: {gs.best_params_}')
print(f'F1 Score: {f1_score(y, RF_predictions)}')

Cross Validation accuracy: 0.9016170795071281
Best RF params: {'n_estimators': 300}
F1 Score: 0.99663505659223


## SVM

In [9]:
from sklearn.svm import SVC

gs = GridSearchCV(SVC(), param_grid={'C': [5]}, cv=5, n_jobs=8)

gs = gs.fit(x, y)

best_svm = gs.best_estimator_

cv_acc = gs.best_score_

SVM_predictions = best_svm.predict(x)

print(f'Cross Validation accuracy: {cv_acc}')
print(f"Best SVM: {gs.best_params_}")
print(f'F1 Score: {f1_score(y, SVM_predictions)}')

Cross Validation accuracy: 0.9027993025039625
Best SVM: {'C': 5}
F1 Score: 0.8960411972964274


## Neural Network

In [10]:
from torch.optim import Adam
import torch
import skorch

from skorch.callbacks import EarlyStopping, Checkpoint

num_epochs = 100
checkpoint_dir = 'checkpoints/'

callbacks = [EarlyStopping(patience=20, threshold=0.001, threshold_mode='abs', monitor='valid_acc', lower_is_better=False),
            Checkpoint(monitor='valid_acc_best', f_params='DisasterClassifier.pt', dirname=checkpoint_dir)
]

net = skorch.NeuralNetBinaryClassifier(
    NeuralNetwork,
    module__input_size=768,
    module__hidden_size=100,
    module__output_size=1,
    module__num_layers=6,
    optimizer=Adam, 
    optimizer__weight_decay=0.00001,
    lr=0.0001,
    max_epochs=num_epochs, 
    batch_size=32, 
    device=torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu'),
    callbacks=callbacks,
    verbose=0
)

gs = GridSearchCV(net, param_grid={'lr': [0.001]}, cv=5, n_jobs=8)

gs = gs.fit(x.astype(np.float32), y.astype(np.float32))

cv_acc = gs.best_score_

best_net = gs.best_estimator_

best_net.load_params(f_params=checkpoint_dir+'DisasterClassifier.pt', f_optimizer=checkpoint_dir+'optimizer.pt', f_history=checkpoint_dir+'history.json')

NN_predictions = best_net.predict(text_embeddings.astype(np.float32))

print(f'Cross Validation accuracy: {cv_acc}')
print(f'Validation accuracy: {best_net.history[-1]["valid_acc"]}')
print(f'F1 Score: {f1_score(y, NN_predictions)}')

Cross Validation accuracy: 0.8907122759820293
Validation accuracy: 0.9061063690085358
F1 Score: 0.8884989631520178


# Finding Common Incorrect Predictions

In [11]:
common_incorrects = np.where(
    (LR_predictions != targets) &
    (RF_predictions != targets) &
    (SVM_predictions != targets) &
    (NN_predictions != targets)
)

for idx in common_incorrects[0]:
    print(df['text'].iloc[idx])
    print(f'LR: {LR_predictions[idx]}, RF: {RF_predictions[idx]}, SVM: {SVM_predictions[idx]}, NN: {NN_predictions[idx]}, Target: {targets[idx]}')

Mmmmmm I'm burning.... I'm burning buildings I'm building.... Oooooohhhh oooh ooh...
LR: 0.0, RF: 0.0, SVM: 0.0, NN: 0, Target: 1.0
like for the music video I want some real action shit like burning buildings and police chases not some weak ben winston shit
LR: 0.0, RF: 0.0, SVM: 0.0, NN: 0, Target: 1.0
.POTUS #StrategicPatience is a strategy for #Genocide; refugees; IDP Internally displaced people; horror; etc. https://t.co/rqWuoy1fm4
LR: 1.0, RF: 1.0, SVM: 1.0, NN: 1, Target: 0.0
He came to a land which was engulfed in tribal war and turned it into a land of peace i.e. Madinah. #ProphetMuhammad #islam
LR: 0.0, RF: 0.0, SVM: 0.0, NN: 0, Target: 1.0
He came to a land which was engulfed in tribal war and turned it into a land of peace i.e. Madinah. #ProphetMuhammad #islam
LR: 0.0, RF: 0.0, SVM: 0.0, NN: 0, Target: 1.0
Caution: breathing may be hazardous to your health.
LR: 0.0, RF: 0.0, SVM: 0.0, NN: 0, Target: 1.0
The Prophet (peace be upon him) said 'Save yourself from Hellfire even i