# Imports & Setup

In [12]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, Sequential
from tensorflow.keras.callbacks import EarlyStopping

In [15]:
from NLP_Natural_Disasters.data import get_data, clean_data

In [16]:
# Lift pandas' display truncation: no cap on rows, on columns, or on the
# width of the text shown inside a cell.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option('display.max_colwidth', None)

In [17]:
# Fetch the data, pass it through clean_data, drop the 'id' column, and keep
# only the rows whose 'text' is not the empty string.
df = (
    clean_data(get_data())
    .drop(columns=['id'])
    .loc[lambda frame: frame['text'] != '']
)
# Preview the first five rows.
df.head()

Unnamed: 0,text,target
0,deed reason earthquake may allah forgive u,1
1,forest fire near la canada,1
2,resident asked shelter place officer evacuation shelter place order expected,1
3,people receive wildfire evacuation order california,1
4,got sent photo alaska smoke wildfire school,1


# Simple Embedding

In [18]:
# Hold out 20% of the rows as a test set. The split is seeded so that a
# Restart & Run All produces the same train/test partition every time;
# without a fixed random_state the metrics below change from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['target'], test_size=0.2, random_state=42
)

In [19]:
# Keras Tokenizer: learns a word -> integer-id dictionary and converts texts
# into sequences of those ids.
tokenizer = Tokenizer()

# Fit the dictionary on the training texts only, so that nothing about the
# test set leaks into the vocabulary. The Tokenizer also lower-cases the text
# and applies some filters by default; see its documentation for details.
tokenizer.fit_on_texts(X_train)

# Encode both splits with the train-fitted dictionary.
X_train_token, X_test_token = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Number of distinct words learned from the training set.
vocab_size = len(tokenizer.word_index)

In [20]:
# Pad the id sequences at the end (padding='post') into rectangular float32
# arrays. No maxlen is passed, so the padded width is determined per call.
X_train_pad, X_test_pad = (
    pad_sequences(token_ids, dtype='float32', padding='post')
    for token_ids in (X_train_token, X_test_token)
)

In [21]:
# Inspect the padded training matrix (one row per text, token ids followed by padding).
X_train_pad

array([[ 110., 4796., 1015., ...,    0.,    0.,    0.],
       [ 732.,  333.,  282., ...,    0.,    0.,    0.],
       [3262.,  733.,  826., ...,    0.,    0.,    0.],
       ...,
       [  46.,   41., 1607., ...,    0.,    0.,    0.],
       [ 115.,  820.,  371., ...,    0.,    0.,    0.],
       [ 789., 2759., 1456., ...,    0.,    0.,    0.]], dtype=float32)

# Simple Model

In [22]:
from keras import backend as K

def recall_m(y_true, y_pred):
    """Recall computed with Keras backend ops.

    True positives are counted by rounding the clipped product of labels and
    predictions; they are divided by the rounded count of positive labels.
    K.epsilon() is added to the denominator to avoid division by zero.
    """
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    return hits / (actual_positives + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision computed with Keras backend ops.

    True positives are counted by rounding the clipped product of labels and
    predictions; they are divided by the rounded count of positive
    predictions. K.epsilon() is added to the denominator to avoid division
    by zero.
    """
    flagged_positive = K.sum(K.round(K.clip(y_pred, 0, 1)))
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    return hits / (flagged_positive + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of precision_m and recall_m.

    K.epsilon() is added to the denominator so the result stays finite when
    both precision and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    harmonic_half = (p * r) / (p + r + K.epsilon())
    return 2 * harmonic_half

In [23]:
# Sequence classifier: embedding -> LSTM -> small dense head -> sigmoid output.
model = Sequential([
    # input_dim is vocab_size + 1 because the Tokenizer's word ids start at 1
    # and id 0 is the padding value; mask_zero=True tells the LSTM to skip
    # those padded positions. Each word is embedded as a 50-dimensional vector.
    layers.Embedding(input_dim=vocab_size+1, output_dim=50, mask_zero=True),
    layers.LSTM(20),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

# Register the custom metrics next to accuracy. model.evaluate() returns the
# loss followed by the compiled metrics in this exact order, and the
# evaluation cell unpacks five values (loss, accuracy, f1_score, precision,
# recall); with only 'acc' compiled that unpacking raises a ValueError.
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['acc', f1_m, precision_m, recall_m]
)

model.summary()

2022-04-12 14:54:08.692711: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:922] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-04-12 14:54:08.694369: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-04-12 14:54:08.694426: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2022-04-12 14:54:08.694468: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2022-04-12 14:54:08.694507: W tensorflow/stream_executor/platform/default/dso_loader.cc:6

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 50)          273100    
                                                                 
 lstm (LSTM)                 (None, 20)                5680      
                                                                 
 dense (Dense)               (None, 10)                210       
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                                 
Total params: 279,001
Trainable params: 279,001
Non-trainable params: 0
_________________________________________________________________


In [24]:
# Stop training once the monitored validation metric (EarlyStopping's
# default monitor) has not improved for 5 consecutive epochs, and roll the
# model back to the weights of its best epoch. Without restore_best_weights
# the model would keep the weights from the last, already-degraded epoch.
es = EarlyStopping(patience=5, restore_best_weights=True)

# validation_split=0.3 holds out 30% of the training data for validation.
history = model.fit(
    X_train_pad,
    y_train,
    validation_split=0.3,
    batch_size=16,
    epochs=20,
    callbacks=[es],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


In [None]:
# Evaluate on the held-out test set. model.evaluate returns the loss followed
# by one value per compiled metric, in compile order.
# NOTE(review): unpacking five names requires the model to be compiled with
# four metrics (accuracy, F1, precision, recall, in that order) — confirm
# against the model.compile() call, otherwise this line raises a ValueError.
loss, accuracy, f1_score, precision, recall = model.evaluate(X_test_pad, y_test, verbose=0)

In [None]:
# Show the F1 value unpacked from model.evaluate on the test set.
f1_score

In [None]:
# Raw model outputs for the test set (the final layer is a sigmoid, so one value in [0, 1] per text).
model.predict(X_test_pad)

In [None]:
# True labels of the test set, for comparison with the predictions.
# NOTE(review): display.max_rows is set to None near the top of the notebook,
# so this renders the entire Series; consider y_test.head() if that is too long.
y_test