In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

import spacy
nlp = spacy.load('en_core_web_sm')
from spacy.lang.en.stop_words import STOP_WORDS

import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout, SimpleRNN, GRU
from tensorflow.keras.optimizers import Adam

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.callbacks import ModelCheckpoint

from termcolor import colored

from sklearn.model_selection import train_test_split

In [2]:
# Dataset named "tweet" will be used to train and evaluate the model
tweet = pd.read_csv('train.csv')

# Dataset named "test_tweet" will be used to make submission to the Kaggle platform
test_tweet = pd.read_csv('test.csv')

display (tweet.head())
print (tweet.shape)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


(7613, 5)


In [3]:
#find nan values
tweet.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [4]:
# Copy the dataframe to avoid modifying the original one
tweet_target = tweet.copy()

# Update the target column to use as label for the bar chart
tweet_target["target"] = tweet_target["target"].apply(lambda x: "Disaster" if x == 1 else "Not a disaster")

# Bar chart
fig = px.bar(round(tweet_target["target"].value_counts(normalize=True)*100, 2),
            template='plotly_dark', 
            title = "Disaster Tweets Percentage",
            text="value",
            width=1000, height=500,
            labels = {"index" : ""},
            color=tweet_target["target"].value_counts(normalize=True).index,
            color_discrete_map={"Disaster": "#880808", "Not a disaster": "royalblue"},

            )

fig.update_traces(texttemplate='%{text:.4s}' + '%', textposition='outside')
fig.update_layout(title_x=0.5, 
                    yaxis={'visible': False}, 
                    showlegend=False
                    )
fig.show()

In [5]:
# Function to clean the text
def clean_text(text):
    # remove punctuation
    text = ''.join([c for c in text if c not in r"[!\"#$%&()'ûª*+,-./:;<=>?@[\\\]^_`{|}~]+"])
    # remove stopwords
    text = ' '.join([c for c in text.split() if c not in STOP_WORDS])
    # remove numbers
    text = ''.join([c for c in text if not c.isdigit()])
    # lowercase
    text = ' '.join(text.split()).lower()
    # lemmatize
    text = ' '.join([c.lemma_ for c in nlp(text)])
    # remove words starting by http
    text = ' '.join([c for c in text.split() if not c.startswith("http")])
    
    return text

# Text test
test_function = "I'm a 17 years old student at the University of Washington :)"

# Test of the function
clean_text(test_function)

'I m year old student university washington'

In [6]:
# Using the function to clean the text on "Tweet" and "test_tweet"
tweet["text_clean"] = tweet["text"].apply(clean_text)
test_tweet["text_clean"] = test_tweet["text"].apply(clean_text)

tweet.head()

Unnamed: 0,id,keyword,location,text,target,text_clean
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deed reason earthquake may allah forgive
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,all resident ask shelter place notify officer ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfire evacuation order calif...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got send photo ruby alaska smoke wildfire...


In [7]:
# Creation of a tokenizer
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000, oov_token="out_of_vocab")

# Training the tokenizer on the training set
tokenizer.fit_on_texts(tweet['text_clean'])

# Transform the text into sequences of integers
tweet["text_encoded"] = tokenizer.texts_to_sequences(tweet["text_clean"])

tweet.head()

Unnamed: 0,id,keyword,location,text,target,text_clean,text_encoded
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deed reason earthquake may allah forgive,"[562, 3730, 465, 218, 876, 1349, 1797]"
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,"[155, 7, 200, 563, 5489, 5490, 1104]"
2,5,,,All residents asked to 'shelter in place' are ...,1,all resident ask shelter place notify officer ...,"[119, 1472, 513, 1798, 390, 5491, 356, 143, 22..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfire evacuation order calif...,"[16, 2410, 104, 222, 343, 52]"
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got send photo ruby alaska smoke wildfire...,"[201, 223, 224, 160, 5492, 1799, 230, 104, 241..."


In [8]:
# Padding of the text
reviews_pad = tf.keras.preprocessing.sequence.pad_sequences(tweet["text_encoded"], padding="post")

# Train test split
X_train, X_val, y_train, y_val = train_test_split(reviews_pad, tweet["target"], test_size=0.3)

# Turns dataframe into tensors
train = tf.data.Dataset.from_tensor_slices((X_train, y_train))
val = tf.data.Dataset.from_tensor_slices((X_val, y_val))

# Create random batches
train_batch = train.shuffle(len(train)).batch(64)
val_batch = val.shuffle(len(val)).batch(64)

 # Display a batch
for review, star in train_batch.take(1):
  print(review, star)

tf.Tensor(
[[   1    1    1 ...    0    0    0]
 [ 511   24  967 ...    0    0    0]
 [8808 8809 1206 ...    0    0    0]
 ...
 [4685   12  864 ...    0    0    0]
 [  43    2 1127 ...    0    0    0]
 [ 250  311  132 ...    0    0    0]], shape=(64, 28), dtype=int32) tf.Tensor(
[1 0 0 1 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 1 1 1 0 1 1 0 1 0 0 1 1 0 1 0 0 1
 1 1 1 0 1 0 0 1 0 1 1 1 0 1 1 0 1 0 0 0 1 0 0 1 1 1 0], shape=(64,), dtype=int64)


In [9]:
vocab_size = tokenizer.num_words # the number of words in the vocabulary

# Defines a text classifier model
model = tf.keras.Sequential([
    Embedding(input_dim=vocab_size, output_dim=7, input_shape=[review.shape[1],]),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(14, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')  # Binary classification between 0 and 1
])

model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy']
)

checkpoint = ModelCheckpoint("model/disaster_analysis.h5", monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=False, mode='auto') 

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 28, 7)             70000     
                                                                 
 global_average_pooling1d (G  (None, 7)                0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 14)                112       
                                                                 
 dense_1 (Dense)             (None, 1)                 15        
                                                                 
Total params: 70,127
Trainable params: 70,127
Non-trainable params: 0
_________________________________________________________________


In [10]:
# Trains the model
history = model.fit(
    np.array(X_train), #must convert to numpy array before sending to model
    np.array(y_train), #must convert to numpy array before sending to model
    epochs=15, 
    batch_size=64,
    validation_data=(X_val, y_val),
    callbacks = [checkpoint])

Epoch 1/15
Epoch 1: val_accuracy improved from -inf to 0.56961, saving model to model\disaster_analysis.h5
Epoch 2/15
Epoch 2: val_accuracy improved from 0.56961 to 0.57618, saving model to model\disaster_analysis.h5
Epoch 3/15
Epoch 3: val_accuracy improved from 0.57618 to 0.69221, saving model to model\disaster_analysis.h5
Epoch 4/15
Epoch 4: val_accuracy improved from 0.69221 to 0.74387, saving model to model\disaster_analysis.h5
Epoch 5/15
Epoch 5: val_accuracy improved from 0.74387 to 0.78371, saving model to model\disaster_analysis.h5
Epoch 6/15
Epoch 6: val_accuracy improved from 0.78371 to 0.79072, saving model to model\disaster_analysis.h5
Epoch 7/15
Epoch 7: val_accuracy did not improve from 0.79072
Epoch 8/15
Epoch 8: val_accuracy improved from 0.79072 to 0.80166, saving model to model\disaster_analysis.h5
Epoch 9/15
Epoch 9: val_accuracy did not improve from 0.80166
Epoch 10/15
Epoch 10: val_accuracy did not improve from 0.80166
Epoch 11/15
Epoch 11: val_accuracy did not im

In [11]:
model_loss = pd.DataFrame(model.history.history)
model_loss.head()

Unnamed: 0,loss,accuracy,val_loss,val_accuracy
0,0.684271,0.570839,0.677083,0.569615
1,0.666057,0.571026,0.656991,0.576182
2,0.627859,0.6307,0.61775,0.692207
3,0.56394,0.760931,0.567065,0.74387
4,0.487757,0.825483,0.520313,0.783713


In [12]:
# Plot the accuracy and validation accuracy
fig = go.Figure(data=[
                      go.Scatter(
                          y=history.history["accuracy"],
                          name="accuracy",
                          mode="lines",),
                      go.Scatter(
                          y=history.history["val_accuracy"],
                          name="val_accuracy",
                          mode="lines")
])

fig.update_layout(
    title='Accuracy and Validation Accuracy by Epoch',
    xaxis_title='epochs',
    yaxis_title='',
    yaxis=dict(range=[0, 1])

)
fig.show()

In [13]:
model = tf.keras.models.load_model("model/disaster_analysis.h5")

# Create list of custom strings to predict
custom_tweet = [
    "my cat is sick",
    "my rabbit is sick",
    "A country fired a nuclear bomb",
    "a volcano eruption killed my family not only because of the earthquake but also because of the volcano itself.",
    "Pick up artists and garbage men should switch names.",
    "I once took a girl to Starbucks because I forgot her name",
    "Three people died from the heat wave",
    "Someone ate my dog",
    "Strong possibility of typhoon in Texas",
    "A plane crashed on the Eiffel Tower in Paris. One hundred casualties",
    "LOOK AT THE SKY LAST NIGHT IT WAS ABLAZE",
    "The titanic sank",
    "Damage to school bus on 80 in multi car crash",
    "I would like to travel to the moon",
    "I saw a falling star",
    "the volcano in sicilia is asleep",
    "it is snowing because it is winter",
    "it snows so much there is a risk of avalanche",
    "Zelda is a princess and she has been kidnapped by the evil Ganon",
    "I am a great fan of the movie The Lion King",
    "Mortal Kombat has great fatalities"
]

for i in custom_tweet:
    i = clean_text(i)
    
    tokenized = tokenizer.texts_to_sequences([i])
    reviews_pad_test = pad_sequences(tokenized, maxlen=28)

    # Predict the sentiment of the custom string
    new_review_prediction = round(model.predict(np.array(reviews_pad_test))[0][0])
    sentiment = colored("NOT A DISASTER:    ", "green") if new_review_prediction == 0 else colored("DISASTER:          ", "red")
    i = colored(i, 'magenta')
    percent = round((model.predict(np.array(reviews_pad_test))[0][0])*100, 2)
    percent = colored(str(percent), 'yellow') + colored(" %", 'yellow')

    # Display the prediction
    print (f"{sentiment} {i} has {percent} chance of being a disaster.")

[32mNOT A DISASTER:    [0m [35mcat sick[0m has [33m25.25[0m[33m %[0m chance of being a disaster.
[32mNOT A DISASTER:    [0m [35mrabbit sick[0m has [33m36.47[0m[33m %[0m chance of being a disaster.
[31mDISASTER:          [0m [35ma country fire nuclear bomb[0m has [33m95.34[0m[33m %[0m chance of being a disaster.
[31mDISASTER:          [0m [35mvolcano eruption kill family earthquake volcano[0m has [33m99.01[0m[33m %[0m chance of being a disaster.
[32mNOT A DISASTER:    [0m [35mpick artist garbage man switch name[0m has [33m14.99[0m[33m %[0m chance of being a disaster.
[32mNOT A DISASTER:    [0m [35mI take girl starbuck I forget[0m has [33m10.91[0m[33m %[0m chance of being a disaster.
[31mDISASTER:          [0m [35mthree people die heat wave[0m has [33m94.7[0m[33m %[0m chance of being a disaster.
[32mNOT A DISASTER:    [0m [35msomeone eat dog[0m has [33m8.97[0m[33m %[0m chance of being a disaster.
[31mDISASTER:          [0m

In [14]:
# Create a function to make prediction
def predict_sentiment(text):
    tokenized = tokenizer.texts_to_sequences([text])
    reviews_pad_test = pad_sequences(tokenized, maxlen=28)
    return round(model.predict(np.array(reviews_pad_test))[0][0])

# Prediction on test_tweet for submission
test_tweet["target"] = test_tweet["text_clean"].apply(predict_sentiment)
test_tweet

Unnamed: 0,id,keyword,location,text,text_clean,target
0,0,,,Just happened a terrible car crash,just happen terrible car crash,1
1,2,,,"Heard about #earthquake is different cities, s...",hear earthquake different city stay safe,1
2,3,,,"there is a forest fire at spot pond, geese are...",forest fire spot pond geese flee street I save,1
3,9,,,Apocalypse lighting. #Spokane #wildfires,apocalypse light spokane wildfire,1
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kill china taiwan,1
...,...,...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,earthquake safety los angeles ûò safety fasten...,1
3259,10865,,,Storm in RI worse than last hurricane. My city...,storm ri bad hurricane my cityampother hard hi...,1
3260,10868,,,Green Line derailment in Chicago http://t.co/U...,green line derailment chicago,1
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...,meg issue hazardous weather outlook hwo,1


In [15]:
# Check percentage of disaster in test_tweet (0 = not disaster, 1 = disaster)
test_tweet["target"].value_counts(normalize=True)*100

0    62.519154
1    37.480846
Name: target, dtype: float64

In [16]:
check_tweet = test_tweet.head(25)

for i, p in zip(check_tweet["text_clean"], check_tweet["target"]):
    p = colored(p, 'yellow')
    i = colored(i, 'red')
    print(i, p)

[31mjust happen terrible car crash[0m [33m1[0m
[31mhear earthquake different city stay safe[0m [33m1[0m
[31mforest fire spot pond geese flee street I save[0m [33m1[0m
[31mapocalypse light spokane wildfire[0m [33m1[0m
[31mtyphoon soudelor kill china taiwan[0m [33m1[0m
[31mbe shakingit earthquake[0m [33m1[0m
[31mthey d probably life arsenal yesterday eh eh[0m [33m0[0m
[31mhey how[0m [33m0[0m
[31mwhat nice hat[0m [33m0[0m
[31mfuck[0m [33m0[0m
[31mno I do not like cold[0m [33m0[0m
[31mnooooooooo do not[0m [33m0[0m
[31mno do not tell[0m [33m0[0m
[31mwhat[0m [33m0[0m
[31mawesome[0m [33m0[0m
[31mbirmingham wholesale market ablaze bbc news fire break birmingham wholesale market[0m [33m1[0m
[31msunkxssedharry wear short race ablaze[0m [33m0[0m
[31mpreviouslyondoyintv toke makinwaûs marriage crisis set nigerian twitter ablaze[0m [33m1[0m
[31mcheck nsfw[0m [33m0[0m
[31mpsa iûm splitting personality techie follow ablazeco

In [17]:
# Create a submission file for Kaggle
submission = test_tweet[["id", "target"]]

# Save the submission file
submission.to_csv("submission.csv", index=False, header=True)

##### Result : F1-Score on Kaggle (Actual ranking: 328 on 914)

![alt text](result.png "F1-Score")