In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import keras_nlp
from sklearn.model_selection import train_test_split
from keras_tuner import Hyperband

# Load the competition data. The rest of the notebook refers to the training
# frame as `df_train` and to the test frame as both `df_test` and `test_data`,
# so every one of those names is bound here to keep a fresh-kernel
# "Run All" from failing with a NameError.
df_train = pd.read_csv("/kaggle/input/disaster-nlp/train.csv")
df_test = pd.read_csv("/kaggle/input/disaster-nlp/test.csv")

# Aliases for the other names used for these frames (same objects, not copies).
train_data = df_train
test_data = df_test

df_train.head()


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [34]:
import keras_nlp


# Tokenizer loaded from the "bert_base_en" preset; shared by the helper below.
tokenizer = keras_nlp.tokenizers.BertTokenizer.from_preset("bert_base_en")


def preprocess_text(text):
    """Round-trip ``text`` through the BERT tokenizer.

    The text is tokenized with the module-level ``tokenizer`` and the tokens
    are immediately detokenized again.

    NOTE: a later cell defines another function with this same name; once that
    cell has run, it shadows this definition.

    Args:
        text: The raw text to pass through the tokenizer.

    Returns:
        Whatever ``tokenizer.detokenize`` produces for the tokenized text.
    """
    return tokenizer.detokenize(tokenizer.tokenize(text))

# Replace the `text` column with the output of `preprocess_text` for each row.
# NOTE(review): this overwrites the column in place, so re-running the cell
# feeds already-processed values back through `preprocess_text`. It also
# assumes `df_train` has been bound by an earlier cell.
df_train['text'] = df_train['text'].apply(preprocess_text)


In [69]:
import keras_nlp
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = keras_nlp.tokenizers.BertTokenizer.from_preset("bert_base_en")


def preprocess_text(text):
    """Convert one text into unpadded BERT model inputs.

    The raw tokenizer only emits word-piece ids, so the ``[CLS]`` and ``[SEP]``
    special tokens that BERT was pretrained with are added here explicitly.

    NOTE: this shadows the earlier ``preprocess_text`` (tokenize/detokenize
    round trip) defined in a previous cell.

    Args:
        text: The text to encode.

    Returns:
        A ``(token_ids, padding_mask, segment_ids)`` tuple where ``token_ids``
        is a 1-D NumPy array of ``[CLS] + word pieces + [SEP]`` ids, and
        ``padding_mask`` / ``segment_ids`` are Python lists of the same length
        filled with 1s and 0s respectively. Nothing is padded or truncated
        here; callers are expected to do that afterwards.
    """
    piece_ids = tokenizer(text).numpy().flatten()
    # Wrap the word pieces in the special tokens, keeping the tokenizer's dtype.
    # NOTE(review): if a caller later truncates from the end ("post"), an
    # over-long sequence loses its trailing [SEP].
    token_ids = np.concatenate(
        ([tokenizer.cls_token_id], piece_ids, [tokenizer.sep_token_id])
    ).astype(piece_ids.dtype)
    padding_mask = [1] * len(token_ids)  # every real token is attended to
    segment_ids = [0] * len(token_ids)   # single-sentence input: one segment

    return token_ids, padding_mask, segment_ids

# Sequence length shared by the padded arrays and the model's Input layers.
# No cell in this notebook assigns `max_len`, so on a fresh kernel the padding
# below raised a NameError. An already-bound value is kept; otherwise a default
# is used.
# NOTE(review): 128 is an assumed default - confirm the length used originally.
max_len = globals().get("max_len", 128)

# Tokenize every training text exactly once. (The corpus used to be run
# through `preprocess_text` twice, with the first pass's results never used.)
token_ids_list = []
padding_mask_list = []
segment_ids_list = []

for text in df_train["text"]:
    token_ids, padding_mask, segment_ids = preprocess_text(text)
    token_ids_list.append(token_ids)
    padding_mask_list.append(padding_mask)
    segment_ids_list.append(segment_ids)

# Earlier names for the same per-example lists, kept as aliases so any code
# still referring to them continues to work.
input_ids_list = token_ids_list
attention_mask_list = padding_mask_list
token_type_ids_list = segment_ids_list

# Right-pad (and right-truncate) every sequence to `max_len`. The default pad
# value of 0 marks padded positions as "ignore" in the padding mask.
token_ids_padded = pad_sequences(token_ids_list, maxlen=max_len, padding='post', truncating='post')
padding_mask_padded = pad_sequences(padding_mask_list, maxlen=max_len, padding='post', truncating='post')
segment_ids_padded = pad_sequences(segment_ids_list, maxlen=max_len, padding='post', truncating='post')

# Training features keyed by the model's input names; each value is a fresh
# array built from the corresponding padded sequences.
X_train = {
    feature_name: np.array(padded)
    for feature_name, padded in (
        ("token_ids", token_ids_padded),
        ("padding_mask", padding_mask_padded),
        ("segment_ids", segment_ids_padded),
    )
}
# Binary labels taken from the `target` column of the training frame.
y_train = df_train["target"].values

# No explicit train/validation split is made in this cell; the call below is disabled.
#X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [70]:
from tensorflow.keras import layers
import keras_nlp

def build_model(hp):
    """Build and compile a BERT-based binary classifier for one tuner trial.

    A ``bert_base_en`` backbone is loaded, its ``sequence_output`` is
    average-pooled over the sequence axis and passed through a tunable
    Dense/Dropout head ending in a single sigmoid unit.

    Tuned hyperparameters:
        units: width of the hidden Dense layer (64 to 256, step 64).
        dropout: dropout rate after the hidden layer (0.2 to 0.5, step 0.1).
        learning_rate: Adam learning rate, one of 1e-5, 3e-5, 5e-5.

    Args:
        hp: The hyperparameter container supplied by the tuner.

    Returns:
        A compiled ``keras.Model`` taking ``token_ids``, ``padding_mask`` and
        ``segment_ids`` inputs of length ``max_len`` (a module-level value).
    """
    backbone = keras_nlp.models.BertBackbone.from_preset("bert_base_en")

    # One int32 Input per backbone feature, named after that feature.
    model_inputs = {
        feature: layers.Input(shape=(max_len,), dtype=tf.int32, name=feature)
        for feature in ("token_ids", "padding_mask", "segment_ids")
    }
    sequence_output = backbone(model_inputs)["sequence_output"]

    # NOTE(review): no mask is passed to the pooling layer, so padded positions
    # presumably contribute to the mean - confirm this is intended.
    pooled = layers.GlobalAveragePooling1D()(sequence_output)

    hidden_units = hp.Int("units", min_value=64, max_value=256, step=64)
    hidden = layers.Dense(hidden_units, activation="relu")(pooled)

    dropout_rate = hp.Float("dropout", min_value=0.2, max_value=0.5, step=0.1)
    regularized = layers.Dropout(dropout_rate)(hidden)

    probability = layers.Dense(1, activation="sigmoid")(regularized)

    model = keras.Model(inputs=list(model_inputs.values()), outputs=probability)

    learning_rate = hp.Choice("learning_rate", [1e-5, 3e-5, 5e-5])
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model


In [None]:
# Pull each model input out of X_train as its own (copied) array.
token_ids_input, padding_mask_input, segment_ids_input = (
    np.array(X_train[feature])
    for feature in ("token_ids", "padding_mask", "segment_ids")
)

# Hyperband search over `build_model`, ranked by validation accuracy.
hyperband_settings = dict(
    objective="val_accuracy",
    max_epochs=10,
    factor=3,
    directory="hyperband",
    project_name="disaster_tweets",
)
tuner = Hyperband(build_model, **hyperband_settings)

# Trials stop early on validation loss, while the tuner objective above is
# validation accuracy.
stop_early = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=3)

# Inputs are passed as a list in the same order as the model's inputs.
search_inputs = [token_ids_input, padding_mask_input, segment_ids_input]
tuner.search(
    search_inputs,
    y_train,
    epochs=10,
    validation_split=0.2,
    callbacks=[stop_early],
)

# Keep only the single best model found by the search.
best_model = tuner.get_best_models(num_models=1)[0]


Trial 4 Complete [03h 32m 51s]
val_accuracy: 0.829940915107727

Best val_accuracy So Far: 0.829940915107727
Total elapsed time: 07h 47m 24s

Search: Running Trial #5

Value             |Best Value So Far |Hyperparameter
192               |192               |units
0.3               |0.4               |dropout
5e-05             |3e-05             |learning_rate
2                 |2                 |tuner/epochs
0                 |0                 |tuner/initial_epoch
2                 |2                 |tuner/bracket
0                 |0                 |tuner/round

Epoch 1/2
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6420s[0m 33s/step - accuracy: 0.7327 - loss: 0.5351 - val_accuracy: 0.8194 - val_loss: 0.4103
Epoch 2/2
[1m 88/191[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m53:46[0m 31s/step - accuracy: 0.8856 - loss: 0.3101

In [None]:
# The explicit train/validation split was never performed, so `X_val` and
# `y_val` did not exist. Build the hold-out here instead.
#
# The tuner searched with `validation_split=0.2`, which Keras takes from the
# *last* samples of the arrays (before shuffling). Holding out that same
# trailing 20% keeps the validation rows unseen by `best_model`; a random split
# of the full data would mix rows it already trained on into the validation set.
VAL_FRACTION = 0.2
split_at = int(len(y_train) * (1.0 - VAL_FRACTION))

# Slice every input array identically; X_train / y_train themselves are left
# untouched so this cell can be re-run safely.
X_fit = {name: values[:split_at] for name, values in X_train.items()}
X_val = {name: values[split_at:] for name, values in X_train.items()}
y_fit, y_val = y_train[:split_at], y_train[split_at:]

history = best_model.fit(X_fit, y_fit, validation_data=(X_val, y_val), epochs=5, batch_size=32)

val_loss, val_accuracy = best_model.evaluate(X_val, y_val)
print(f"Validation Accuracy: {val_accuracy:.4f}")


In [None]:
# Side-by-side training curves: accuracy on the left, loss on the right.
fig, (accuracy_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 5))
curves = history.history

accuracy_ax.plot(curves["accuracy"], label="Training Accuracy")
accuracy_ax.plot(curves["val_accuracy"], label="Validation Accuracy")
accuracy_ax.set_xlabel("Epochs")
accuracy_ax.set_ylabel("Accuracy")
accuracy_ax.legend()

loss_ax.plot(curves["loss"], label="Training Loss")
loss_ax.plot(curves["val_loss"], label="Validation Loss")
loss_ax.set_xlabel("Epochs")
loss_ax.set_ylabel("Loss")
loss_ax.legend()

plt.show()


In [None]:
# Encode the test tweets exactly like the training tweets: the same
# `preprocess_text` helper, padded to the same width as the training arrays,
# and passed under the three input names the model was built with.
#
# Previously this cell referenced an undefined frame (`test_data`) and an
# undefined helper (`tokenize_text`), overwrote the text column with the tuples
# returned by `preprocess_text`, and fed the model a single ragged array of ids.
# NOTE(review): the training text may also have been passed through the
# earlier tokenize/detokenize round trip; confirm whether the test text needs
# the same treatment.
test_token_ids, test_padding_mask, test_segment_ids = [], [], []
for text in df_test["text"]:
    ids, mask, segments = preprocess_text(text)
    test_token_ids.append(ids)
    test_padding_mask.append(mask)
    test_segment_ids.append(segments)

# Use the width of the training inputs so the shapes match what the model expects.
sequence_length = X_train["token_ids"].shape[1]
X_test = {
    "token_ids": pad_sequences(test_token_ids, maxlen=sequence_length, padding="post", truncating="post"),
    "padding_mask": pad_sequences(test_padding_mask, maxlen=sequence_length, padding="post", truncating="post"),
    "segment_ids": pad_sequences(test_segment_ids, maxlen=sequence_length, padding="post", truncating="post"),
}

# Threshold the sigmoid output at 0.5 to obtain hard 0/1 labels.
predictions = (best_model.predict(X_test) > 0.5).astype("int32")
submission = pd.DataFrame({"id": df_test["id"], "target": predictions.flatten()})
submission.to_csv("submission.csv", index=False)
