In [1]:
# RNN for NLP - Disaster Tweets Analysis
# https://www.kaggle.com/c/nlp-getting-started

import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import time

In [2]:
# Load train data
df = pd.read_csv("./data/train.csv")
print(df.shape)
# Show samples of the data
df.head()

(7613, 5)


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
# Data summary: number of tweets carrying each target label
print(df.target.eq(1).sum()) # Disaster
print(df.target.eq(0).sum()) # No Disaster

3271
4342


In [4]:
# Data preprocessing
import re
import string

# Function for removing url
def remove_URL(text):
    """Return `text` with every URL removed.

    A URL is any run of non-whitespace characters starting with
    ``http://``, ``https://`` or ``www.``; each match is replaced by the
    empty string, so the whitespace around it is left untouched.
    """
    return re.sub(r"https?://\S+|www\.\S+", "", text)

# Function for removing punctuation
def remove_punct(text):
    """Return `text` with every ASCII punctuation character deleted.

    The characters removed are exactly those in ``string.punctuation``;
    letters, digits, whitespace and any other symbols are kept as they are.
    """
    # Mapping an ordinal to None tells str.translate to delete that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [5]:
# Remove stopwords
# pip install nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Stop Words: A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine
# has been programmed to ignore, both when indexing entries for searching and when retrieving them 
# as the result of a search query.
stop = set(stopwords.words("english"))
print(stop)

# Function for removing stopwords
def remove_stopwords(text, stop_words=None):
    """Lower-case `text` and drop its stop words.

    Args:
        text: String to filter; it is split on whitespace.
        stop_words: Optional container of lower-case words to drop. When
            omitted, the notebook-level `stop` set is used, so existing
            one-argument calls (e.g. through ``Series.map``) are unchanged.

    Returns:
        The remaining words, lower-cased and joined by single spaces.
    """
    if stop_words is None:
        stop_words = stop
    # Lower-case each word once, then filter on the lowered form.
    lowered_words = (word.lower() for word in text.split())
    return " ".join(word for word in lowered_words if word not in stop_words)

{'my', 'wouldn', 'with', 'not', "needn't", 'by', "couldn't", 'isn', 'couldn', 'how', "mightn't", 'is', "didn't", 'very', 'during', 'doesn', 'hadn', 'weren', 'we', 'or', 'who', 'there', 'to', 'was', 'they', 'didn', 'that', 'o', 'did', 'when', 'hasn', 'his', 'any', "weren't", 'from', 'her', 'below', 'further', 're', 'so', 'haven', "should've", 'had', 'only', 'yourself', 'than', 'll', 'shouldn', "wasn't", 'now', 'won', "you're", 'at', 'shan', 'into', 'most', 'which', 'been', 'other', 'because', "won't", 'he', 'mightn', "you'll", "it's", 'of', 'through', "don't", "you'd", 'once', 'him', 'myself', "haven't", 'all', 'being', 'each', 'few', 'some', 'your', 'hers', 'doing', 'over', 'here', 'just', 'them', 'more', 'its', 'after', 'd', 'm', 'on', 'again', 'whom', 'wasn', 't', "doesn't", "mustn't", 'herself', 'am', 'it', 'don', 'against', 'aren', 'were', 'yourselves', "hasn't", "wouldn't", 'those', 'until', 'both', "you've", 'ourselves', 'mustn', 'what', 'the', 'our', 'their', 'under', 'these', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\7000028508\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Clean the training text in three passes (URLs, then punctuation, then
# stopwords), printing the column before the first pass and after each one
print(df.text)

# URLs go first: the URL pattern needs "://" or "www.", which the punctuation
# pass would otherwise strip
for clean_step in (remove_URL, remove_punct, remove_stopwords):
    df["text"] = df.text.map(clean_step)
    print(df.text)

0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @TheTawniest The out of control w...
7610    M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611    Police investigating after an e-bike collided ...
7612    The Latest: More Homes Razed by Northern Calif...
Name: text, Length: 7613, dtype: object
0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
                              ..

In [7]:
# Count unique words
from collections import Counter

def counter_word(text_col):
    """Count how often each whitespace-separated word occurs in a text column.

    Args:
        text_col: Object exposing `.values` as an iterable of strings.

    Returns:
        collections.Counter mapping each word to its number of occurrences.
    """
    return Counter(word for text in text_col.values for word in text.split())

counter = counter_word(df.text)
num_unique_words = len(counter)

print(num_unique_words)
print(counter.most_common(5))

17971
[('like', 345), ('im', 299), ('amp', 298), ('fire', 250), ('get', 229)]


In [8]:
# Hold out the last 20% of the rows for validation (sequential split, no shuffling)
train_size = int(len(df) * 0.8)

train_df, val_df = df.iloc[:train_size], df.iloc[train_size:]

# Separate the text (inputs) from the target (labels) as NumPy arrays
train_sentences, train_labels = train_df.text.to_numpy(), train_df.target.to_numpy()
val_sentences, val_labels = val_df.text.to_numpy(), val_df.target.to_numpy()

print(train_sentences.shape, val_sentences.shape)

(6090,) (1523,)


In [None]:
# Tokenize
from tensorflow.keras.preprocessing.text import Tokenizer

# Vectorize the corpus: each text becomes a sequence of integer word indices.
# NOTE(review): num_unique_words was counted over the whole DataFrame (training
# and validation rows), while the tokenizer below is fitted on the training
# split only.
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(train_sentences) # fit on the training split only, so validation text does not shape the vocabulary

# word -> integer index mapping learned from the training sentences
word_index = tokenizer.word_index
# print(word_index)

# Encode both splits with the same fitted tokenizer.
# NOTE(review): no oov_token is configured, so validation words absent from the
# training vocabulary are presumably dropped from the sequences - confirm.
train_sequences = tokenizer.texts_to_sequences(train_sentences)
val_sequences = tokenizer.texts_to_sequences(val_sentences)

# print(train_sentences[10:15])
# print(train_sequences[10:15])

In [10]:
# Pad the sequences to have the same length
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Max number of words kept per sequence; also used as the model's input length
max_length = 20

# padding="post" appends zeros after shorter sequences (visible in the sample
# printed in a later cell).
# NOTE(review): truncating="post" presumably drops tokens beyond max_length
# from the end of longer sequences - confirm against the Keras docs.
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding="post", truncating="post")
val_padded = pad_sequences(val_sequences, maxlen=max_length, padding="post", truncating="post")

# Both matrices should have max_length columns
print(train_padded.shape, val_padded.shape)

(6090, 20) (1523, 20)


In [11]:
# Sample
print(train_sentences[42])
print(train_sequences[42])
print(train_padded[42])

awesome time visiting cfc head office ancop site ablaze thanks tita vida taking care us
[751, 33, 5695, 3752, 313, 1074, 5696, 803, 375, 454, 5697, 3753, 896, 397, 14]
[ 751   33 5695 3752  313 1074 5696  803  375  454 5697 3753  896  397
   14    0    0    0    0    0]


In [None]:
# Invert word_index so that integer indices map back to their words
reverse_word_index = {idx: word for word, idx in word_index.items()}
# print(reverse_word_index)

In [13]:
# Decode from sequence: sequence -> sentence
def decode(sequence):
    """Turn a sequence of word indices back into a space-separated sentence.

    Indices that are missing from `reverse_word_index` are rendered as "?".
    """
    words = (reverse_word_index.get(token_id, "?") for token_id in sequence)
    return " ".join(words)

decoded_text = decode(train_sequences[42])

print(train_sequences[42])
print(decoded_text)

[751, 33, 5695, 3752, 313, 1074, 5696, 803, 375, 454, 5697, 3753, 896, 397, 14]
awesome time visiting cfc head office ancop site ablaze thanks tita vida taking care us


In [22]:
# Create LSTM model
from tensorflow.keras import layers

# Embedding: Turns positive integers (indexes) into dense vectors of fixed size. 
# Word embeddings give us a way to use an efficient, dense representation in which similar words have 
# a similar encoding. Importantly, you do not have to specify this encoding by hand. An embedding is a 
# dense vector of floating point values (the length of the vector is a parameter you specify).

model = keras.models.Sequential()
model.add(layers.Embedding(num_unique_words, 32, input_length=max_length))
# -> (batch_size, input_length, output_dim) = (batch_size, 20, 32)

# The layer will take as input an integer matrix of size (batch, input_length),
# and every integer (i.e. word index) in the input must be strictly smaller than num_unique_words (the Embedding's input_dim, i.e. the vocabulary size).
# Now model.output_shape is (None, input_length, 32), where `None` is the batch dimension.
# https://www.tensorflow.org/api_docs/python/tf/keras/layers/Embedding

model.add(layers.LSTM(8, dropout=0.4))
# -> (batch_size, output_dim) = (batch_size, 8)

model.add(layers.Dense(1, activation="sigmoid")) # value range in [0, 1]
# -> (batch_size, output_dim) = (batch_size, 1)

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 20, 32)            575072    
                                                                 
 lstm_1 (LSTM)               (None, 8)                 1312      
                                                                 
 dense_1 (Dense)             (None, 1)                 9         
                                                                 
Total params: 576,393
Trainable params: 576,393
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Configure loss, optimizer and metrics
# from_logits=False because the final Dense layer already applies a sigmoid,
# so the model outputs probabilities rather than raw logits
loss = keras.losses.BinaryCrossentropy(from_logits=False) # for binary classification
optim = keras.optimizers.Adam(learning_rate=0.0002)
metrics = ["accuracy"]

# Attach the loss, optimizer and metric list to the model before training
model.compile(loss=loss, optimizer=optim, metrics=metrics)

In [24]:
# Start the training
model.fit(train_padded, train_labels, epochs=10, validation_data=(val_padded, val_labels), verbose=2)

Epoch 1/10
191/191 - 4s - loss: 0.6864 - accuracy: 0.5511 - val_loss: 0.6910 - val_accuracy: 0.5345 - 4s/epoch - 20ms/step
Epoch 2/10
191/191 - 2s - loss: 0.6723 - accuracy: 0.5793 - val_loss: 0.6852 - val_accuracy: 0.5345 - 2s/epoch - 9ms/step
Epoch 3/10
191/191 - 2s - loss: 0.5758 - accuracy: 0.6869 - val_loss: 0.5396 - val_accuracy: 0.7971 - 2s/epoch - 10ms/step
Epoch 4/10
191/191 - 2s - loss: 0.3926 - accuracy: 0.8639 - val_loss: 0.5134 - val_accuracy: 0.7800 - 2s/epoch - 10ms/step
Epoch 5/10
191/191 - 2s - loss: 0.3151 - accuracy: 0.8949 - val_loss: 0.5240 - val_accuracy: 0.7728 - 2s/epoch - 10ms/step
Epoch 6/10
191/191 - 2s - loss: 0.2618 - accuracy: 0.9156 - val_loss: 0.5392 - val_accuracy: 0.7787 - 2s/epoch - 10ms/step
Epoch 7/10
191/191 - 2s - loss: 0.2188 - accuracy: 0.9365 - val_loss: 0.5577 - val_accuracy: 0.7781 - 2s/epoch - 9ms/step
Epoch 8/10
191/191 - 2s - loss: 0.1936 - accuracy: 0.9458 - val_loss: 0.5851 - val_accuracy: 0.7754 - 2s/epoch - 9ms/step
Epoch 9/10
191/191 

<keras.callbacks.History at 0x1324fdc3dc0>

In [25]:
# Load test data
# NOTE: this rebinds `df`, which held the training data until now; earlier
# cells that read `df` will operate on the test set if re-run after this point.
df = pd.read_csv("./data/test.csv")
print(df.shape)
# Show samples of the data
df.head()

(3263, 4)


Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [26]:
# Clean the test text with the same three passes, in the same order, as the
# training text: URLs, then punctuation, then stopwords
df["text"] = (
    df.text
    .map(remove_URL)
    .map(remove_punct)
    .map(remove_stopwords)
)

In [27]:
# Get test_sentences (the test text column as a NumPy array)
test_sentences = df.text.to_numpy()

# Tokenization: reuse the tokenizer fitted on the training sentences (no refit)
test_sequences = tokenizer.texts_to_sequences(test_sentences)

# Padding: same max_length and post padding/truncation as the training data
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding="post", truncating="post")

In [28]:
# Predictions on the test data
# Keep the raw sigmoid outputs under their own name instead of overwriting
# them with the thresholded labels
probabilities = model.predict(test_padded)
# The model ends in Dense(1), so predict yields one value per tweet; flatten it
# and threshold scalars rather than relying on the truth value of 1-element arrays
predictions = [1 if p > 0.5 else 0 for p in probabilities.ravel()]



In [29]:
# Sample results of the predictions on the test data (first 9 tweets)
for idx in range(9):
    # 1 -> Disaster, 0 -> No disaster
    verdict = "Disaster" if predictions[idx] == 1 else "No disaster"
    # .iloc keeps the lookup positional, matching how `predictions` is indexed
    print(f"Tweets: {df.text.iloc[idx]}")
    print(f"Result: {verdict}")
    print("===================================================================")

Tweets: happened terrible car crash
Result: Disaster
Tweets: heard earthquake different cities stay safe everyone
Result: Disaster
Tweets: forest fire spot pond geese fleeing across street cannot save
Result: Disaster
Tweets: apocalypse lighting spokane wildfires
Result: Disaster
Tweets: typhoon soudelor kills 28 china taiwan
Result: Disaster
Tweets: shakingits earthquake
Result: Disaster
Tweets: theyd probably still show life arsenal yesterday eh eh
Result: No disaster
Tweets: hey
Result: No disaster
Tweets: nice hat
Result: No disaster
