# Imports & Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, Sequential
from tensorflow.keras.callbacks import EarlyStopping

2022-04-08 14:23:38.080199: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-04-08 14:23:38.080441: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [4]:
from NLP_Natural_Disasters.data import get_data, clean_data

In [5]:
pd.set_option("display.max_rows", None, "display.max_columns", None, 'display.max_colwidth', None)

In [6]:
df = clean_data(get_data()).drop(columns=['id'])
df = df[df['text'] != '']
df.head(50)

Unnamed: 0,text,target
0,deed reason earthquake may allah forgive u,1
1,forest fire near la canada,1
2,resident asked shelter place officer evacuation shelter place order expected,1
3,people receive wildfire evacuation order california,1
4,got sent photo alaska smoke wildfire school,1
5,rockyfire update california hwy closed due lake county fire cafire wildfire,1
6,flood disaster heavy rain cause flash flooding street colorado spring area,1
7,im top hill see fire wood,1
8,there emergency evacuation happening building across street,1
9,im afraid tornado coming area,1


# Simple Embedding

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['target'], test_size=0.2)

In [8]:
# This initializes a Keras utilities that does all the tokenization for you
tokenizer = Tokenizer()

# The tokenization learns a dictionnary that maps a token (integer) to each word
# It can be done only on the train set - we are not supposed to know the test set !
# This tokenization also lower your words, apply some filters, and so on - you can check the doc if you want
tokenizer.fit_on_texts(X_train)
    
# We apply the tokenization to the train and test set
X_train_token = tokenizer.texts_to_sequences(X_train)
X_test_token = tokenizer.texts_to_sequences(X_test)

vocab_size = len(tokenizer.word_index)

In [9]:
X_train_pad = pad_sequences(X_train_token, dtype='float32', padding='post')
X_test_pad = pad_sequences(X_test_token, dtype='float32', padding='post')

# Simple Model

In [10]:
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [11]:
# Params dimension vectors
model = Sequential([
    layers.Embedding(input_dim=vocab_size+1, output_dim=50 , mask_zero=True),
    layers.LSTM(20),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['acc',f1_m,precision_m, recall_m]
)

model.summary()

2022-04-08 14:23:50.732324: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:922] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-04-08 14:23:50.732703: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-04-08 14:23:50.732779: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2022-04-08 14:23:50.732833: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2022-04-08 14:23:50.732877: W tensorflow/stream_executor/platform/default/dso_loader.cc:6

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 50)          274150    
                                                                 
 lstm (LSTM)                 (None, 20)                5680      
                                                                 
 dense (Dense)               (None, 10)                210       
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                                 
Total params: 280,051
Trainable params: 280,051
Non-trainable params: 0
_________________________________________________________________


In [None]:
es = EarlyStopping(patience=5)
history = model.fit(X_train_pad, y_train, validation_split=0.3, batch_size=16, epochs=20, callbacks=[es])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
 50/267 [====>.........................] - ETA: 6s - loss: 0.2775 - acc: 0.8900 - f1_m: 0.8734 - precision_m: 0.9023 - recall_m: 0.8685

In [149]:
loss, accuracy, f1_score, precision, recall = model.evaluate(X_test_pad, y_test, verbose=0)

In [150]:
f1_score

0.7345847487449646

In [165]:
model.predict(X_test_pad)

array([[0.00679779],
       [0.02298656],
       [0.70521927],
       ...,
       [0.04932457],
       [0.98169494],
       [0.00902265]], dtype=float32)

In [166]:
y_test

1948    0
112     0
4580    0
6323    0
3138    0
1921    0
5831    1
7070    0
5762    0
3545    0
5591    1
5039    0
2556    1
4133    1
5438    0
5632    0
1545    1
2441    1
1079    0
1187    1
4791    0
5718    1
5199    0
7432    0
432     0
3226    0
6129    0
1047    0
4841    1
3121    0
7579    0
1899    0
5000    0
1144    1
3957    1
1773    1
3561    1
117     0
5143    1
1870    0
1960    1
5690    0
4569    1
3816    0
2730    0
137     1
7302    1
6986    0
3335    1
3574    1
407     1
2027    0
4845    0
6242    0
1692    0
4277    0
6629    0
5774    0
7561    0
2891    0
6212    1
1959    1
5866    0
849     0
587     1
857     1
5350    0
5313    1
770     0
6952    0
1813    1
3583    1
2739    1
598     1
1596    1
525     0
4795    0
2085    1
5563    0
6886    0
7215    1
6818    1
3605    0
5990    0
536     0
2444    1
7554    0
7008    0
1774    1
47      0
5323    0
6922    0
7522    1
5613    1
3695    1
1374    0
3526    1
7548    0
6100    1
1156    1
