In [1]:
pip install keras_tuner

Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import keras_tuner as kt
from sklearn.model_selection import train_test_split

---

Text Cleaning

---

In [None]:
def dataprep(df):
    df['text'] =  df['text'].astype(str)
    df['text'] =  df['text'].str.replace(r"@[\w]+", '', regex=True)
    df['text'] =  df['text'].str.replace(r"http[s]?://\S+", '', regex=True)
    df['text'] = df['text'].str.replace(r"[/'-]", ' ', regex=True)
    df['text'] =  df['text'].str.replace(r"&", ' & ', regex=True)
    df['text'] =  df['text'].str.replace(r'[?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~“”’]', '', regex=True)


    return df




---

Positional Encoding Function



---



In [None]:
def positional_encoding(position, d_model):
    angle_rads = tf.range(position, dtype=tf.float32)[:, tf.newaxis] / tf.pow(10000.0, (2 * (tf.range(d_model, dtype=tf.float32)[tf.newaxis, :] // 2)) / tf.cast(d_model, tf.float32))
    angle_rads_sin = tf.sin(angle_rads[:, 0::2])
    angle_rads_cos = tf.cos(angle_rads[:, 1::2])
    angle_rads_updated = tf.concat([angle_rads_sin, angle_rads_cos], axis=-1)
    pos_encoding = angle_rads_updated[tf.newaxis, ...]
    return tf.cast(pos_encoding, dtype=tf.float32)



---


Obtaining the max length of a tweet - Used for Padding, Input Layer Shape and Positional Encoding



---


In [None]:
def determine_maxlen(sequences, percentile=99):
    seq_lengths = [len(seq) for seq in sequences]
    maxlen = int(np.percentile(seq_lengths, percentile))

    return maxlen



---



**Build Function Breakdown**


---


*  Step 1 - Setup Hyperparam options
*  Step 2 - Update optimizer based on selection
*  Step 3 - Create input Layer
*  Step 4 - Converts tokens into dense vectors
*  Step 5 - Apply positional encoding
*  Step 6 - Add sentence context to words (Self-Attention)
*  Step 7 - Dropout to avoid overfitting
*  Step 8 - Pool (Equivalent of flatten - Groups all vectors into 1) and Normalize
*  Step 9 - Add dense and dropout layers
*  Step 10 - Create sigmoid (binary) output layer
*  Step 11 - Compile the model

In [None]:
def build_model(hp):
    max_length = hp.Int('max_length', min_value=20, max_value=50, step=5)
    d_model = 128

    # Step 1: Tuning hyperparameters
    num_heads = hp.Int('num_heads', min_value=2, max_value=8, step=2)
    key_dim = hp.Int('key_dim', min_value=40, max_value=56, step=4)
    dense_units = hp.Int('dense_units', min_value=64, max_value=512, step=32)
    dropout_rate = hp.Float('dropout_rate', min_value=0.2, max_value=0.5, step=0.05)
    optimizer_choice = hp.Choice('optimizer', values=['adam', 'sgd', 'rmsprop'])
    lr = hp.Float('lr', min_value=1e-5, max_value=1e-2, sampling='log')
    activation = hp.Choice('activation', values=['relu', 'tanh', 'swish'])

    # Step 2: Select optimizer
    if optimizer_choice == "sgd":
        optimizer = tf.keras.optimizers.SGD(learning_rate=lr)
    elif optimizer_choice == "rmsprop":
        optimizer = tf.keras.optimizers.RMSprop(learning_rate=lr)
    else:
        optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

    # Step 3: Build model
    inputs = layers.Input(shape=(max_length,))

    # Step 4:Embedding + Positional Encoding
    embedding = layers.Embedding(input_dim=len(tokenizer.word_index) + 1,
                                 output_dim=d_model,
                                 input_length=max_length)(inputs)
    positional_encoding_output = positional_encoding(max_length, d_model)
    embedding = embedding + positional_encoding_output

    # Step 5:Multi-Head Attention
    transformer_block = layers.MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)(embedding, embedding)
    transformer_block = layers.Dropout(rate=dropout_rate)(transformer_block)
    transformer_block = layers.LayerNormalization()(transformer_block)

    # Step 6:Pooling and Dense Layer
    x = layers.GlobalAveragePooling1D()(transformer_block)
    x = layers.Dense(units=dense_units, activation=activation)(x)
    x = layers.Dropout(rate=dropout_rate)(x)

    # Step 7:Output Layer
    outputs = layers.Dense(1, activation='sigmoid')(x)

    # Step 8:Compile Model
    model = tf.keras.models.Model(inputs, outputs)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    return model




---

Load Data

---



In [None]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
train_df = dataprep(train_df)
train_df = dataprep(train_df)



---

Sample dataset to see if cleaning worked

---




In [None]:
train_df.sample(10)

Unnamed: 0,id,keyword,location,text,target
270,393,annihilation,BIG D HOUSTON/BOSTON/DENVER,U.S National Park Services Tonto National Fore...,0
922,1335,blown%20up,"801 SL,UT",Damn greinke got blown up in that first inning,0
178,254,ambulance,Happily Married with 2 kids,AMBULANCE SPRINTER AUTOMATIC FRONTLINE VEHICLE...,0
3646,5195,fatalities,Wisconsin,Sharing to help our cousin s family,0
3733,5305,fear,Halifax,The number of people denying climate change on...,0
6694,9591,thunder,,My brother is crying cause the thunder lmao,0
2047,2937,danger,,The Danger and Excitement of Underwater Cave D...,0
7505,10735,wreckage,Mumbai,Wreckage Conclusively Confirmed as From MH37...,1
1832,2633,crashed,"Gujranwala, Pakistan",Maj Muzzamil Pilot Offr of MI 17 crashed near ...,1
6477,9264,sunk,"London, England",It still hasn t sunk in that I ve actually met...,0




---



Convert Text to Ints



---



In [None]:
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(train_df['text'])

---

Converts Tokens to Sequences of tokens

---

In [None]:
X_train = tokenizer.texts_to_sequences(train_df['text'])
X_test = tokenizer.texts_to_sequences(test_df['text'])

---

Determine Max Length of a Tweet

---

In [None]:
maxlen = determine_maxlen(X_train, percentile=99)
print(maxlen)

28


---

Add padding to ensure all token sequences are the same shape.

---

In [None]:
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

---

Split Train and Dev Sets

---

In [None]:
y_train = train_df['target'].values
X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

---

Run Hyperband Search on data

---

In [None]:

out_folder = "Transformer"
hyp_search_folder = "hyperband_search"
batch_sizes = [64, 128, 256, 512, 1024, 2048]

hyperband = kt.Hyperband(
    build_model,
    objective='val_accuracy',
    max_epochs=15,
    factor=4,
    hyperband_iterations=5,
    directory=out_folder,
    project_name=hyp_search_folder,
    seed=42
)


hyperband.search(X_train, y_train, epochs=5, validation_data=(X_dev, y_dev))


best_model = hyperband.get_best_models(num_models=1)[0]

for batch_size in batch_sizes:
    print(f"Evaluating batch size {batch_size}")
    loss, mape = best_model.evaluate(X_dev, y_dev, batch_size=batch_size)
    print(f"Loss: {loss}, MAPE: {mape}")


Trial 55 Complete [00h 01m 21s]
val_accuracy: 0.7754431962966919

Best val_accuracy So Far: 0.8049901723861694
Total elapsed time: 00h 41m 19s
Evaluating batch size 64
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.7906 - loss: 0.5110
Loss: 0.4946240484714508, MAPE: 0.8049901723861694
Evaluating batch size 128
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.7914 - loss: 0.5105
Loss: 0.4946240186691284, MAPE: 0.8049901723861694
Evaluating batch size 256
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.7933 - loss: 0.5067
Loss: 0.4946240484714508, MAPE: 0.8049901723861694
Evaluating batch size 512
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step - accuracy: 0.7978 - loss: 0.5016
Loss: 0.4946240186691284, MAPE: 0.8049901723861694
Evaluating batch size 1024
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step - accuracy: 0.8020 - los

---

Print Summary of Search for param optimization

---

In [None]:
hyperband.search_space_summary()

hyperband.results_summary()

Search space summary
Default search space size: 4
key_dim (Int)
{'default': None, 'conditions': [], 'min_value': 40, 'max_value': 56, 'step': 4, 'sampling': 'linear'}
dense_units (Int)
{'default': None, 'conditions': [], 'min_value': 88, 'max_value': 128, 'step': 8, 'sampling': 'linear'}
dropout_rate (Float)
{'default': 0.5, 'conditions': [], 'min_value': 0.5, 'max_value': 0.6, 'step': 0.05, 'sampling': 'linear'}
activation (Choice)
{'default': 'relu', 'conditions': [], 'values': ['relu', 'tanh'], 'ordered': False}
Results summary
Results in Transformer\hyperband_search
Showing 10 best trials
Objective(name="val_accuracy", direction="max")

Trial 0003 summary
Hyperparameters:
key_dim: 56
dense_units: 88
dropout_rate: 0.55
activation: relu
tuner/epochs: 4
tuner/initial_epoch: 0
tuner/bracket: 1
tuner/round: 0
Score: 0.8049901723861694

Trial 0033 summary
Hyperparameters:
key_dim: 52
dense_units: 120
dropout_rate: 0.5
activation: relu
tuner/epochs: 4
tuner/initial_epoch: 0
tuner/bracket:

---



In [None]:
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=15)
best_model.build(input_shape=(None, 28))
best_model.summary()

history = best_model.fit(X_train, y_train,epochs=100,batch_size=2048,validation_data=(X_dev, y_dev),callbacks= [stop_early])


Epoch 1/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1s/step - accuracy: 0.9133 - loss: 0.2549 - val_accuracy: 0.7873 - val_loss: 0.7445
Epoch 2/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1s/step - accuracy: 0.9233 - loss: 0.2158 - val_accuracy: 0.7781 - val_loss: 0.8546
Epoch 3/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1s/step - accuracy: 0.9291 - loss: 0.2031 - val_accuracy: 0.7859 - val_loss: 0.7664
Epoch 4/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1s/step - accuracy: 0.9349 - loss: 0.1821 - val_accuracy: 0.7846 - val_loss: 0.7160
Epoch 5/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1s/step - accuracy: 0.9405 - loss: 0.1689 - val_accuracy: 0.7781 - val_loss: 0.7569
Epoch 6/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 972ms/step - accuracy: 0.9450 - loss: 0.1607 - val_accuracy: 0.7781 - val_loss: 0.8650
Epoch 7/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━

In [None]:
predictions = best_model.predict(X_test)
predictions = (predictions > 0.5).astype(int)

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step


In [None]:
submission = pd.DataFrame({'id': test_df['id'], 'target': predictions.flatten()})
submission.to_csv('submission.csv', index=False)