# Disaster Tweets
-------------------------------------------------------
>In this second phase of the project, we will:  
>> define and train the classification model based on the **preprocessed tweets**.

-------------------------------------------------

# Import useful Librairies 

In [1]:
import pandas as pd
import numpy as np

# Machine learning librairies
import tensorflow as tf

# global params
pre_file_path = "data/pre_train.csv"
models_path = 'models/'

2022-04-24 21:42:13.851155: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-04-24 21:42:13.851223: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


# Load Preprocessed Data

In [2]:
tweets = pd.read_csv(pre_file_path)
tweets.head()

Unnamed: 0,keyword,text,target,word_count,unique_word_count,stop_word_count,url_count,char_count,punctuation_count,hashtag_count,at_count,clean_text,clean_keyword,keyword_text
0,,Our Deeds are the Reason of this #earthquake M...,1,13,13,8,0,69,1,1,0,deed reason earthquake allah forgive,,deed reason earthquake allah forgive
1,,Forest fire near La Ronge Sask. Canada,1,7,7,0,0,38,1,0,0,forest fire near ronge sask canada,,forest fire near ronge sask canada
2,,All residents asked to 'shelter in place' are ...,1,22,20,11,0,133,3,0,0,resident ask shelter place notify officer evac...,,resident ask shelter place notify officer eva...
3,,"13,000 people receive #wildfires evacuation or...",1,8,8,1,0,65,2,1,0,people receive wildfire evacuation order calif...,,people receive wildfire evacuation order cali...
4,,Just got sent this photo from Ruby #Alaska as ...,1,16,15,7,0,88,2,2,0,got send photo ruby alaska smoke wildfire pour...,,got send photo ruby alaska smoke wildfire pou...


# Train the model

## Prepare data for training

1) Build the **vocabulary** using Tokenizer of keras  
2) Transform the plain texts to sequences of integers while only considering the *k* most common words  
3) Tensorify the list of sequences with their classes 
4) Split the data in train and test sets and generate the batchs

> Parameters :   
>> **_max_features_**: max number of words to take into account for the model training   
>> **_train_ratio_**: used to split the data into training and validation sets        
>> **_batch_size_** = size of the batches    
>> **_seq_length_** = the length of the integer sequences. It is induced from max_features   

###  Parameters

In [3]:
max_features = 5_000
train_ratio = 0.8     
batch_size = 32

###  Build Vocabulary

In [4]:
#build the vocab and keep the K most common word based on word frequency (K = max_features)
# max_features+ 1(1 OOV token)
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words = max_features + 1)
tokenizer.fit_on_texts(tweets["keyword_text"])
print(f'vocabulary size : {len(tokenizer.word_counts)}')

vocabulary size : 14378


###  Transform text to Integer Sequences of equal lengths

In [5]:
# Transforms each text to a sequence of integers, only the K most common words will be transformed (K = max_features)
tweets["tweet_encoded"] = tokenizer.texts_to_sequences(tweets.keyword_text)

# check whether we have empty lists
tweets['length'] = tweets['tweet_encoded'].apply(lambda x : len(x))
tweets = tweets[tweets["length"]!=0]

# add padding so that all sequences have the same length --> a numpy array of equal length sequences
tweet_pad = tf.keras.preprocessing.sequence.pad_sequences(tweets.tweet_encoded, padding="post")
tweet_pad

array([[3491,  457,   94, ...,    0,    0,    0],
       [ 195,    1,  245, ...,    0,    0,    0],
       [1356,  497, 1668, ...,    0,    0,    0],
       ...,
       [3254,  239, 1235, ...,    0,    0,    0],
       [  18,  751, 1800, ...,    0,    0,    0],
       [ 177,   46,  179, ...,    0,    0,    0]], dtype=int32)

###  Tensorify the Sequences

In [6]:
# transform the data to a tensor (TensorSliceDataset)
full_ds = tf.data.Dataset.from_tensor_slices((tweet_pad, tweets.target.values))
list(full_ds.as_numpy_iterator())[0]

2022-04-24 21:42:15.945984: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-04-24 21:42:15.946037: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-04-24 21:42:15.946063: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (jupyter-rouahi-2eaouatef-40gmail-2ecom): /proc/driver/nvidia/version does not exist
2022-04-24 21:42:15.946421: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


(array([3491,  457,   94, 1246, 1667,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0], dtype=int32),
 1)

###  Training and Validation sets & Batching

In [7]:
# Train Test Split
train_size = int(train_ratio * tweets.shape[0])

train_data = full_ds.take(train_size).shuffle(train_size).batch(batch_size)
test_data = full_ds.skip(train_size).batch(batch_size)

for tweet, label in train_data.take(1):
    print(tweet, label)

tf.Tensor(
[[ 219  219  292  412    1  453  210   66 2278 2279  832    0    0    0
     0    0    0    0    0    0]
 [  26 1921 3191  776  344   26  303   29  483  540    2 3199   75    0
     0    0    0    0    0    0]
 [ 261 1979  598  374  887  261 2354    0    0    0    0    0    0    0
     0    0    0    0    0    0]
 [ 437    2  115 2907    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0]
 [  51   51  114 2439 3079 4129 2124 4130    0    0    0    0    0    0
     0    0    0    0    0    0]
 [ 383 1116 1659    7 1242    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0]
 [  89 2539   22  928   89    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0]
 [ 184  959  568  856  165  297  184    0    0    0    0    0    0    0
     0    0    0    0    0    0]
 [  34 1106 2239 1393 1339 1030 1354  865  993  760    0    0    0    0
     0    0    0    0    0    0]
 [ 163 2072 1106  312  163    0    0    0   

In [8]:
# features input_shape = (bs, seq_len)
# target input_shape = (bs,) 

In [9]:
print('number of train batches :', len(train_data))

number of train batches : 191


## baseline model

>🗒 A **baseline** model (**dummy** predictor) allows us to set a **lower bound** on performance for model evaluation. We expect that a trained model outperforms this baseline model.

> Given the marginal probabilities **_p(y=1)=r_** and **_p(y=0)=1−r_**, assume that the baseline model **always predicts the majority class** . 
>> Baseline **Accuracy = max (r, 1-r)**

In [10]:
# baseline_model : always predicts the majority class
p_1 = tweets[tweets['target']==1].shape[0] / tweets.shape[0]
p_0 = tweets[tweets['target']==0].shape[0] / tweets.shape[0]
print('p_1 = {}'.format(p_1))
print('p_0 = {}'.format(p_0))

baseline_accuracy = round(max(p_1, p_0),2)
print(f'Baseline Accuracy Score = {baseline_accuracy}')

p_1 = 0.4298291721419185
p_0 = 0.5701708278580815
Baseline Accuracy Score = 0.57


## LSTM model

### Parameters

In [11]:
# input_dim = vocab_size + 1 = len(tokenizer.word_index or index_word or word_counts) +1  (+1 for the padding 0)
vocab_size = len(tokenizer.word_index)

# input_length or input_shape ([seq_len,]) = length of input sequences after padding 
seq_length = tweet_pad[0].shape[0]

# output_dim =  size of the vector space in which words will be embedded

# Dropout ratio to avoid overfitting

# output_layer activation function = 'sigmoid' for binary classification

### Architecture

<img src="img/lstm.png" width="500" height="600">

In [12]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim = vocab_size +1, output_dim = 16, input_length = seq_length),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.LSTM(units = 32, return_sequences=False), # maintains the sequential nature
    tf.keras.layers.Dropout(0.2),
    
    tf.keras.layers.Dense(1, activation="sigmoid")
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 16)            230064    
                                                                 
 dropout (Dropout)           (None, 20, 16)            0         
                                                                 
 lstm (LSTM)                 (None, 32)                6272      
                                                                 
 dropout_1 (Dropout)         (None, 32)                0         
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 236,369
Trainable params: 236,369
Non-trainable params: 0
_________________________________________________________________


### Optimizer

In [13]:
optimizer= tf.keras.optimizers.Adam()

model.compile(optimizer = optimizer,
              loss = tf.keras.losses.BinaryCrossentropy(),
              metrics = [tf.keras.metrics.BinaryAccuracy()])

### Train the model

In [16]:
es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience = 3)

history = model.fit(
    train_data,
    epochs=10, 
    validation_data = test_data,
    callbacks = [es_callback]        
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


In [17]:
print('\n---------------------------- Train Accuracy ------------------------------\n')
print('Mean: ', np.mean(history.history['binary_accuracy']))
print('Std: ', np.std(history.history['binary_accuracy']))
print('\n---------------------------- Validation Accuracy ------------------------------\n')
print('Mean: ', np.mean(history.history['val_binary_accuracy']))
print('Std: ', np.std(history.history['val_binary_accuracy']))


---------------------------- Train Accuracy ------------------------------

Mean:  0.9277923822402954
Std:  0.011095163291250277

---------------------------- Validation Accuracy ------------------------------

Mean:  0.6909329771995545
Std:  0.01605950484051095


>🗒 We obtain a mean accuracy of **69%** over the validation set. We will try to improve this mean score by taking into account the numerical metadata extracted from the tweets text.

### Save the model

In [15]:
model.save(models_path + "model_lstm.h5")