In [1]:
import numpy as np

import pandas as pd

from NNs import NeuralNetwork

In [2]:
df = pd.read_csv('data/processed_bert.csv')

print(df.head())

data = df[[str(i) for i in range(768)]+["target"]].values

np.random.shuffle(data)

          0         1         2         3         4         5         6  \
0 -0.822546 -0.472057 -0.579868  0.601334  0.001131 -0.033307  0.621883   
1 -0.932106 -0.444832 -0.946184  0.783482  0.649199 -0.417868  0.810984   
2 -0.786213 -0.454594 -0.989117  0.766382  0.825029 -0.260707  0.601579   
3 -0.929848 -0.641187 -0.984187  0.838084  0.698074 -0.439465  0.848216   
4 -0.782167 -0.475203 -0.785371  0.579609  0.718675 -0.169704  0.333027   

          7         8         9  ...       760       761       762       763  \
0  0.219169 -0.297440 -0.999933  ...  0.746353  0.789280  0.454223  0.652186   
1  0.553593 -0.841144 -0.999991  ...  0.972591  0.666550 -0.809021  0.077952   
2  0.324533 -0.961853 -0.999995  ...  0.992954  0.505823 -0.130431 -0.429334   
3  0.544355 -0.940431 -0.999996  ...  0.995336  0.807030 -0.736760 -0.092651   
4  0.279532 -0.594330 -0.999871  ...  0.968012  0.792530  0.080506  0.567232   

        764       765       766       767  target  \
0  0.384898 -0.

In [3]:
from sklearn.model_selection import train_test_split

text_embeddings = data[:,:-1]
targets = data[:,-1]

X_train, X_test, y_train, y_test = train_test_split(text_embeddings, targets, test_size=0.2, random_state=42)

## Logistic Regression

In [32]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0, max_iter=1000).fit(X_train, y_train)

train_acc = clf.score(X_train, y_train)
test_acc = clf.score(X_test, y_test)

print(f'Train accuracy: {train_acc}')
print(f'Test accuracy: {test_acc}')

Train accuracy: 0.832183908045977
Test accuracy: 0.7977675640183848


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Random Forest Classifier

In [33]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators=300, max_depth=200, n_jobs=10).fit(X_train, y_train)

train_acc = rf_clf.score(X_train, y_train)
test_acc = rf_clf.score(X_test, y_test)

print(f'Train accuracy: {train_acc}')
print(f'Test accuracy: {test_acc}')

Train accuracy: 0.997208538587849
Test accuracy: 0.7741300065659882


## SVM

In [6]:
from sklearn.svm import SVC

svm_clf = SVC(C=6).fit(X_train, y_train)

train_acc = svm_clf.score(X_train, y_train)
test_acc = svm_clf.score(X_test, y_test)

print(f'Train accuracy: {train_acc}')
print(f'Test accuracy: {test_acc}')

Train accuracy: 0.8044334975369458
Test accuracy: 0.7951411687458962


## Neural Network

In [23]:
from torch.optim import Adam
from torch.nn import BCELoss
import skorch

from skorch.callbacks import EarlyStopping, Checkpoint

num_epochs = 100
checkpoint_dir = 'checkpoints/'

callbacks = [EarlyStopping(patience=20, threshold=0.001, threshold_mode='abs', monitor='valid_acc', lower_is_better=False),
            Checkpoint(monitor='valid_acc_best', f_params='DisasterClassifier.pt', dirname=checkpoint_dir)
]

net = skorch.NeuralNetBinaryClassifier(
    NeuralNetwork,
    module__input_size=768,
    module__hidden_size=100,
    module__output_size=1,
    module__num_layers=6,
    optimizer=Adam, 
    optimizer__weight_decay=0.00001,
    lr=0.0001,
    max_epochs=num_epochs, 
    batch_size=32, 
    device='cuda:0',
    callbacks=callbacks
)

net.fit(X_train.astype(np.float32), y_train.astype(np.float32))
net.load_params(f_params=checkpoint_dir+'DisasterClassifier.pt', f_optimizer=checkpoint_dir+'optimizer.pt', f_history=checkpoint_dir+'history.json')

train_acc = net.score(X_train.astype(np.float32), y_train.astype(np.float32))
test_acc = net.score(X_test.astype(np.float32), y_test.astype(np.float32))

print(f'Train accuracy: {train_acc}')
print(f'Test accuracy: {test_acc}')


  epoch    train_loss    valid_acc    valid_loss    cp     dur
-------  ------------  -----------  ------------  ----  ------
      1        [36m0.6461[0m       [32m0.6773[0m        [35m0.6068[0m     +  0.5581
      2        [36m0.5940[0m       [32m0.7069[0m        [35m0.5813[0m     +  0.5616
      3        [36m0.5774[0m       [32m0.7266[0m        [35m0.5705[0m     +  0.5523
      4        [36m0.5685[0m       [32m0.7422[0m        [35m0.5599[0m     +  0.4420
      5        [36m0.5588[0m       [32m0.7537[0m        [35m0.5473[0m     +  0.4631
      6        [36m0.5514[0m       0.7504        [35m0.5448[0m        0.4907
      7        [36m0.5454[0m       [32m0.7594[0m        [35m0.5376[0m     +  0.3648
      8        [36m0.5399[0m       [32m0.7783[0m        [35m0.5284[0m     +  0.3639
      9        [36m0.5361[0m       [32m0.7808[0m        [35m0.5224[0m     +  0.3637
     10        [36m0.5339[0m       [32m0.7841[0m        [35m0.5193[