In [1]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Embedding, LSTM, GRU, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import KeyedVectors
import re

# Load the dataset
df = pd.read_csv('/Users/adefebrian/disaster.csv')  # Replace with the actual path to your CSV file

# Define the preprocess_text function
def preprocess_text(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenization (split into words)
    words = text.split()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    # Remove duplicate words
    words = list(dict.fromkeys(words))
    # Join the words back to text
    text = ' '.join(words)
    return text

# Assuming you've defined your preprocess_text function
df['processed_text'] = df['text'].apply(preprocess_text)

# Split data
X = df['processed_text']
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenization and padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
vocab_size = len(tokenizer.word_index) + 1
max_len = max([len(seq) for seq in X_train_seq])
X_train_padded = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

# # Convert labels to binary format
label_binarizer = LabelBinarizer()
y_train_binary = label_binarizer.fit_transform(y_train)
y_test_binary = label_binarizer.transform(y_test)

# Load pre-trained word embeddings
def load_word_embeddings(embedding_file, embedding_dim):
    print("Loading Word Embeddings")
    embeddings_index = {}
    with open(embedding_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # Skip empty lines
                
            values = re.split(r'\s+', line)
            word = values[0]
            
            try:
                coefs = np.asarray(values[1:], dtype=np.float32)
            except ValueError:
                continue  # Skip lines with non-numeric values
            
            if len(coefs) != embedding_dim:
                continue  # Skip embeddings with incorrect dimensions
                
            embeddings_index[word] = coefs
    print("Word Embeddings Loaded")
    return embeddings_index

glove_file = '/Users/adefebrian/glove.840B.300d.txt'
fasttext_file = '/Users/adefebrian/wiki.en.vec'

In [2]:

embedding_dim = 300
embedding_matrix_glove = np.zeros((vocab_size, embedding_dim))
embedding_matrix_fasttext = np.zeros((vocab_size, embedding_dim))

glove_embeddings = load_word_embeddings(glove_file, embedding_dim)
fasttext_embeddings = load_word_embeddings(fasttext_file, embedding_dim)

for word, i in tokenizer.word_index.items():
    glove_embedding_vector = glove_embeddings.get(word)
    fasttext_embedding_vector = fasttext_embeddings.get(word)
    
    if glove_embedding_vector is not None:
        embedding_matrix_glove[i] = glove_embedding_vector
        
    if fasttext_embedding_vector is not None:
        embedding_matrix_fasttext[i] = fasttext_embedding_vector


Loading Word Embeddings
Word Embeddings Loaded
Loading Word Embeddings
Word Embeddings Loaded


In [3]:
# Create models
models = [
    (LSTM, 'GloVe', embedding_matrix_glove),
    (GRU, 'GloVe', embedding_matrix_glove),
    (LSTM, 'FastText', embedding_matrix_fasttext),
    (GRU, 'FastText', embedding_matrix_fasttext)
]

for rnn_type, emb_type, emb_matrix in models:
    print(f"Training {rnn_type.__name__} with {emb_type} embeddings")

    model = Sequential([
        Embedding(vocab_size, embedding_dim, weights=[emb_matrix], input_length=max_len, trainable=False),
        rnn_type(128, dropout=0.2, recurrent_dropout=0.2),
        Dense(128, activation='relu'),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    model.fit(X_train_padded, y_train_binary, epochs=20, batch_size=64, validation_data=(X_test_padded, y_test_binary))

    y_pred = model.predict(X_test_padded)
    y_pred_binary = np.round(y_pred).flatten()

    target_names = ['Not Disaster', 'Disaster']
    print(classification_report(y_test_binary, y_pred_binary, target_names=target_names))
    print("\n")


Training LSTM with GloVe embeddings
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
              precision    recall  f1-score   support

Not Disaster       0.81      0.81      0.81       874
    Disaster       0.75      0.75      0.75       649

    accuracy                           0.79      1523
   macro avg       0.78      0.78      0.78      1523
weighted avg       0.79      0.79      0.79      1523



Training GRU with GloVe embeddings
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
              precision    recall  f1-score   support

Not Disaster       0.83      0.75      0.79       874
    Disaster       0.70      

In [4]:
# Create models
models = [
    (LSTM, 'GloVe', embedding_matrix_glove),
    (GRU, 'GloVe', embedding_matrix_glove),
    (LSTM, 'FastText', embedding_matrix_fasttext),
    (GRU, 'FastText', embedding_matrix_fasttext)
]

for rnn_type, emb_type, emb_matrix in models:
    print(f"Training {rnn_type.__name__} with {emb_type} embeddings")

    model = Sequential([
        Embedding(vocab_size, embedding_dim, weights=[emb_matrix], input_length=max_len, trainable=False),
        rnn_type(128, dropout=0.2, recurrent_dropout=0.2),
        Dense(128, activation='relu'),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
 
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    model.fit(X_train_padded, y_train_binary, epochs=20, batch_size=64, validation_data=(X_test_padded, y_test_binary))

    y_pred = model.predict(X_test_padded)
    y_pred_binary = np.round(y_pred).flatten()

    target_names = ['Not Disaster', 'Disaster']
    print(classification_report(y_test_binary, y_pred_binary, target_names=target_names))
    print("\n")


Training LSTM with GloVe embeddings
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
              precision    recall  f1-score   support

Not Disaster       0.80      0.84      0.82       874
    Disaster       0.77      0.71      0.74       649

    accuracy                           0.79      1523
   macro avg       0.78      0.78      0.78      1523
weighted avg       0.79      0.79      0.78      1523



Training GRU with GloVe embeddings
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
              precision    recall  f1-score   support

Not Disaster       0.79      0.84      0.82       874
    Disaster       0.77      

In [5]:
# Create models
models = [
    (LSTM, 'GloVe', embedding_matrix_glove),
    (GRU, 'GloVe', embedding_matrix_glove),
    (LSTM, 'FastText', embedding_matrix_fasttext),
    (GRU, 'FastText', embedding_matrix_fasttext)
]

for rnn_type, emb_type, emb_matrix in models:
    print(f"Training {rnn_type.__name__} with {emb_type} embeddings")

    model = Sequential([
        Embedding(vocab_size, embedding_dim, weights=[emb_matrix], input_length=max_len, trainable=False),
        rnn_type(128, dropout=0.2, recurrent_dropout=0.2),
        Dense(128, activation='relu'),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    model.fit(X_train_padded, y_train_binary, epochs=20, batch_size=64, validation_data=(X_test_padded, y_test_binary))

    y_pred = model.predict(X_test_padded)
    y_pred_binary = np.round(y_pred).flatten()

    target_names = ['Not Disaster', 'Disaster']
    print(classification_report(y_test_binary, y_pred_binary, target_names=target_names))
    print("\n")


Training LSTM with GloVe embeddings
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
              precision    recall  f1-score   support

Not Disaster       0.80      0.80      0.80       874
    Disaster       0.73      0.73      0.73       649

    accuracy                           0.77      1523
   macro avg       0.77      0.76      0.76      1523
weighted avg       0.77      0.77      0.77      1523



Training GRU with GloVe embeddings
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
              precision    recall  f1-score   support

Not Disaster       0.78      0.84      0.81       874
    Disaster       0.76      

In [6]:
# Create models
models = [
    (LSTM, 'GloVe', embedding_matrix_glove),
    (GRU, 'GloVe', embedding_matrix_glove),
    (LSTM, 'FastText', embedding_matrix_fasttext),
    (GRU, 'FastText', embedding_matrix_fasttext)
]

for rnn_type, emb_type, emb_matrix in models:
    print(f"Training {rnn_type.__name__} with {emb_type} embeddings")

    model = Sequential([
        Embedding(vocab_size, embedding_dim, weights=[emb_matrix], input_length=max_len, trainable=False),
        rnn_type(128, dropout=0.2, recurrent_dropout=0.2),
        Dense(128, activation='relu'),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    model.fit(X_train_padded, y_train_binary, epochs=20, batch_size=64, validation_data=(X_test_padded, y_test_binary))

    y_pred = model.predict(X_test_padded)
    y_pred_binary = np.round(y_pred).flatten()

    target_names = ['Not Disaster', 'Disaster']
    print(classification_report(y_test_binary, y_pred_binary, target_names=target_names))
    print("\n")


Training LSTM with GloVe embeddings
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
              precision    recall  f1-score   support

Not Disaster       0.83      0.81      0.82       874
    Disaster       0.75      0.77      0.76       649

    accuracy                           0.79      1523
   macro avg       0.79      0.79      0.79      1523
weighted avg       0.79      0.79      0.79      1523



Training GRU with GloVe embeddings
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
              precision    recall  f1-score   support

Not Disaster       0.80      0.83      0.82       874
    Disaster       0.76      