In [3]:
import pandas as pd
import re

# Define file paths for the Kaggle disaster tweets dataset.
train_path = '/Users/daniel/Northwestern Local/MSDS-422/Module 9/nlp-getting-started/train.csv'
test_path  = '/Users/daniel/Northwestern Local/MSDS-422/Module 9/nlp-getting-started/test.csv'
sample_path = '/Users/daniel/Northwestern Local/MSDS-422/Module 9/nlp-getting-started/sample_submission.csv'

# The program reads the CSV files into dataframes.
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
sample_df = pd.read_csv(sample_path)

# A text cleaning function is created to normalize the tweets and remove noise.
def clean_text(text):
    text = text.lower()                              # Converts text to lowercase
    text = re.sub(r"http\S+", "url", text)           # Replaces URLs with a placeholder
    text = re.sub(r"@\w+", "user", text)             # Replaces user mentions
    text = re.sub(r"[^a-z0-9# ]", "", text)          # Removes non-alphanumeric characters
    return text

# The cleaning function is applied to training and test tweets.
train_df["text_clean"] = train_df["text"].apply(clean_text)
test_df["text_clean"] = test_df["text"].apply(clean_text)

# A preview is printed to confirm the data was loaded and cleaned properly.
train_df[["text", "text_clean"]].head()

Unnamed: 0,text,text_clean
0,Our Deeds are the Reason of this #earthquake M...,our deeds are the reason of this #earthquake m...
1,Forest fire near La Ronge Sask. Canada,forest fire near la ronge sask canada
2,All residents asked to 'shelter in place' are ...,all residents asked to shelter in place are be...
3,"13,000 people receive #wildfires evacuation or...",13000 people receive #wildfires evacuation ord...
4,Just got sent this photo from Ruby #Alaska as ...,just got sent this photo from ruby #alaska as ...


In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Hyperparameters for the tokenizer and maximum tweet length.
MAX_WORDS = 20000     # Maximum vocabulary size
MAX_LEN = 40          # Maximum sequence length based on typical tweet size

# The tokenizer converts text into integer sequences.
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df["text_clean"])

# The text is transformed into sequences of integers.
X_train = tokenizer.texts_to_sequences(train_df["text_clean"])
X_test = tokenizer.texts_to_sequences(test_df["text_clean"])

# All sequences are padded to the same length to meet RNN input requirements.
X_train = pad_sequences(X_train, maxlen=MAX_LEN, padding='post', truncating='post')
X_test = pad_sequences(X_test, maxlen=MAX_LEN, padding='post', truncating='post')

# The target variable is extracted as a NumPy array.
y_train = train_df["target"].values

# Shapes are displayed to confirm correct preprocessing.
X_train.shape, X_test.shape

((7613, 40), (3263, 40))

In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# The model uses an embedding layer followed by a single LSTM layer.
model_lstm_1 = Sequential([
    Embedding(input_dim=MAX_WORDS, output_dim=64, input_length=MAX_LEN),  # Embedding converts words to vectors
    LSTM(64, return_sequences=False),                                      # A basic LSTM layer with 64 units
    Dropout(0.3),                                                          # Dropout reduces overfitting
    Dense(1, activation='sigmoid')                                         # Output layer for binary classification
])

# The model is compiled using binary cross-entropy, which is appropriate for a two-class problem.
model_lstm_1.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

# The model summary provides visibility into the number of parameters and architecture.
model_lstm_1.summary()

# The model is trained with a validation split to monitor overfitting.
history_lstm_1 = model_lstm_1.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=5,
    batch_size=64
)

2025-11-13 18:19:46.168782: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4 Pro
2025-11-13 18:19:46.168905: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 64.00 GB
2025-11-13 18:19:46.168923: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 24.00 GB
2025-11-13 18:19:46.168998: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-11-13 18:19:46.169022: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epoch 1/5


2025-11-13 18:19:46.531709: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.
2025-11-13 18:19:46.537643: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] PluggableGraphOptimizer failed: INVALID_ARGUMENT: Failed to deserialize the `graph_buf`.


[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 20ms/step - accuracy: 0.5764 - loss: 0.6815 - val_accuracy: 0.5345 - val_loss: 0.6906
Epoch 2/5
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.7194 - loss: 0.5794 - val_accuracy: 0.7689 - val_loss: 0.5046
Epoch 3/5
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.8645 - loss: 0.3590 - val_accuracy: 0.7630 - val_loss: 0.5204
Epoch 4/5
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.9186 - loss: 0.2400 - val_accuracy: 0.7827 - val_loss: 0.6005
Epoch 5/5
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.9450 - loss: 0.1667 - val_accuracy: 0.7827 - val_loss: 0.6047


In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout

# The model uses a bidirectional wrapper to allow learning from both forward and reverse text sequences.
model_bilstm = Sequential([
    Embedding(input_dim=MAX_WORDS, output_dim=64),       # Embedding converts words to vector representations
    Bidirectional(LSTM(64, return_sequences=False)),     # Bidirectional LSTM processes sequences in both directions
    Dropout(0.4),                                        # Dropout is used to reduce overfitting
    Dense(1, activation='sigmoid')                       # Binary output layer for disaster classification
])

# The model is compiled using binary cross-entropy, suitable for two classes.
model_bilstm.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

# The summary provides visibility into model structure and total parameter count.
model_bilstm.summary()

# The model is trained with a validation split for performance comparison with other architectures.
history_bilstm = model_bilstm.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=5,
    batch_size=64
)

Epoch 1/5


2025-11-13 18:23:02.387065: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] PluggableGraphOptimizer failed: INVALID_ARGUMENT: Failed to deserialize the `graph_buf`.


[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 46ms/step - accuracy: 0.6747 - loss: 0.5965 - val_accuracy: 0.7840 - val_loss: 0.4978
Epoch 2/5
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 42ms/step - accuracy: 0.8576 - loss: 0.3510 - val_accuracy: 0.8083 - val_loss: 0.4554
Epoch 3/5
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 43ms/step - accuracy: 0.9205 - loss: 0.2221 - val_accuracy: 0.7708 - val_loss: 0.5177
Epoch 4/5
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 42ms/step - accuracy: 0.9506 - loss: 0.1467 - val_accuracy: 0.7741 - val_loss: 0.5763
Epoch 5/5
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 42ms/step - accuracy: 0.9619 - loss: 0.1128 - val_accuracy: 0.7590 - val_loss: 0.6965


In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout

# The model uses a GRU layer, which is a simplified version of LSTM it's designed to capture temporal relationships while reducing computation.
model_gru = Sequential([
    Embedding(input_dim=MAX_WORDS, output_dim=64),   # Embedding for vector representation of tokens
    GRU(64, return_sequences=False),                 # GRU layer with 64 units
    Dropout(0.3),                                    # Dropout to reduce overfitting
    Dense(1, activation='sigmoid')                   # Output layer for binary prediction
])

# The model is compiled with binary cross-entropy loss for classification tasks.
model_gru.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

# The model summary shows the architecture and parameter details.
model_gru.summary()

# Training includes a validation split to compare model performance across architectures.
history_gru = model_gru.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=5,
    batch_size=64
)

Epoch 1/5


2025-11-13 18:24:14.688963: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] PluggableGraphOptimizer failed: INVALID_ARGUMENT: Failed to deserialize the `graph_buf`.


[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.5764 - loss: 0.6823 - val_accuracy: 0.5345 - val_loss: 0.6937
Epoch 2/5
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.5793 - loss: 0.6814 - val_accuracy: 0.5345 - val_loss: 0.6930
Epoch 3/5
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.5793 - loss: 0.6813 - val_accuracy: 0.5345 - val_loss: 0.6955
Epoch 4/5
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.5793 - loss: 0.6817 - val_accuracy: 0.5345 - val_loss: 0.6937
Epoch 5/5
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.5793 - loss: 0.6810 - val_accuracy: 0.5345 - val_loss: 0.6961


In [13]:
import numpy as np
import pandas as pd

# The model predicts probabilities on the test data.
test_predictions = model_bilstm.predict(X_test)

# Predictions are converted into binary classes based on a 0.5 threshold.
test_predictions_binary = (test_predictions > 0.5).astype(int)

submission_df = pd.DataFrame({
    "id": test_df["id"],
    "target": test_predictions_binary.reshape(-1)
})

submission_df.to_csv("submission_bilstm.csv", index=False)

submission_df.head()

[1m  6/102[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 11ms/step  

2025-11-13 18:26:27.948595: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] PluggableGraphOptimizer failed: INVALID_ARGUMENT: Failed to deserialize the `graph_buf`.


[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step


Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [15]:
import pandas as pd

best_model = model_lstm_1  

# The model predicts probabilities for the test set.
test_predictions = best_model.predict(X_test)

test_binary = (test_predictions > 0.5).astype(int).reshape(-1)

submission = pd.DataFrame({
    "id": test_df["id"],
    "target": test_binary
})

submission.to_csv("submission_model1_lstm.csv", index=False)

submission.head()

[1m 55/102[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 3ms/step

2025-11-13 18:27:45.938973: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] PluggableGraphOptimizer failed: INVALID_ARGUMENT: Failed to deserialize the `graph_buf`.


[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


Unnamed: 0,id,target
0,0,0
1,2,0
2,3,1
3,9,1
4,11,1


In [17]:
submission.to_csv("submission_final.csv", index=False)
submission.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,1
3,9,1
4,11,1
