In [15]:
pip install keras_tuner



In [16]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import keras_tuner as kt
from sklearn.model_selection import train_test_split

---

Text Cleaning

---

In [17]:
def dataprep(df):
    df['text'] =  df['text'].astype(str)
    df['text'] = df['text'].str.replace(r"[/'-]", ' ', regex=True)
    df['text'] =  df['text'].str.replace(r"&", ' & ', regex=True)
    df['text'] =  df['text'].str.replace(r'[?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~“”’]', '', regex=True)
    df['text'] =  df['text'].str.replace(r"@[\w]+", '', regex=True)
    df['text'] =  df['text'].str.replace(r"http[s]?://\S+", '', regex=True)

    return df




---


Don't ask me to explain, stole the function from someone else



---



In [18]:
def positional_encoding(position, d_model):
    angle_rads = tf.range(position, dtype=tf.float32)[:, tf.newaxis] / tf.pow(10000.0, (2 * (tf.range(d_model, dtype=tf.float32)[tf.newaxis, :] // 2)) / tf.cast(d_model, tf.float32))
    angle_rads_sin = tf.sin(angle_rads[:, 0::2])
    angle_rads_cos = tf.cos(angle_rads[:, 1::2])
    angle_rads_updated = tf.concat([angle_rads_sin, angle_rads_cos], axis=-1)
    pos_encoding = angle_rads_updated[tf.newaxis, ...]
    return tf.cast(pos_encoding, dtype=tf.float32)



---


Obtaining the max length of a tweet - Used for Padding, Input Layer Shape and Positional Encoding



---


In [19]:
def determine_maxlen(sequences, percentile=99):
    seq_lengths = [len(seq) for seq in sequences]
    maxlen = int(np.percentile(seq_lengths, percentile))

    return maxlen



---



**Build Function Breakdown**


---


*  Step 1 - Setup Hyperparam options
*  Step 2 - Update optimizer based on selection
*  Step 3 - Create input Layer
*  Step 4 - Converts tokens into dense vectors
*  Step 5 - Apply positional encoding
*  Step 6 - Add sentence context to words (Self-Attention)
*  Step 7 - Dropout to avoid overfitting
*  Step 8 - Pool (Equivalent of flatten - Groups all vectors into 1) and Normalize
*  Step 9 - Add dense and dropout layers
*  Step 10 - Create sigmoid (binary) output layer
*  Step 11 - Compile the model

In [20]:
def build_model(hp):
    max_length = 28
    d_model = 128

    # Step 1
    num_heads = hp.Int('num_heads', min_value=2, max_value=4, step=1)
    key_dim = hp.Int('key_dim', min_value=32, max_value=64, step=8)
    dense_units = hp.Int('dense_units', min_value=64, max_value=128, step=8)
    dropout_rate = hp.Float('dropout_rate', min_value=0.2, max_value=0.4, step=0.05)
    optimizer = hp.Choice("optimizer", values=["RMSprop", "adam", "sgd"])

    # Step 2
    if optimizer == "sgd":
        optimizer = tf.keras.optimizers.SGD()
    elif optimizer == "RMSprop":
        optimizer = tf.keras.optimizers.RMSprop()
    else:
        optimizer = tf.keras.optimizers.Adam()

    # Step 3
    inputs = layers.Input(shape=(max_length,))

    # Step 4
    embedding = layers.Embedding(input_dim=len(tokenizer.word_index) + 1,
                                 output_dim=d_model,
                                 input_length=max_length)(inputs)

    # Step 5
    positional_encoding_output = positional_encoding(max_length, d_model)
    embedding = embedding + positional_encoding_output

    # Step 6
    transformer_block = layers.MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)(embedding, embedding)

    # Step 7
    transformer_block = layers.Dropout(rate=dropout_rate)(transformer_block)

    # Step 8
    transformer_block = layers.LayerNormalization()(transformer_block)
    x = layers.GlobalAveragePooling1D()(transformer_block)

    # Step 9
    x = layers.Dense(units=dense_units, activation = hp.Choice("activation", values = ["relu", "tanh", "selu"]))(x)
    x = layers.Dropout(rate=dropout_rate)(x)

    # Step 10
    outputs = layers.Dense(1, activation='sigmoid')(x)

    # Step 11
    model = tf.keras.models.Model(inputs, outputs)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    return model




---

Load Data

---



In [21]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
train_df = dataprep(train_df)
train_df = dataprep(train_df)



---

Sample dataset to see if cleaning worked

---




In [22]:
train_df.sample(10)

Unnamed: 0,id,keyword,location,text,target
606,875,bioterror,,FedEx no longer to transport bioterror germs i...,0
2778,3995,devastation,"Mount Vernon, NY",Devastation: coming to a and find the closed...,0
4882,6951,massacre,"Ashburn, VA",I just bought tickets to DEATH BED DUDE BRO ...,0
3941,5603,flood,New York,Spot Flood Combo 53inch 300W Curved Cree LED W...,0
2765,3973,devastation,Vancouver BC,Is This Country Latin America s Next Argentin...,0
2938,4221,drowned,IG: AyshBanaysh,Sometimes logic gets drowned out in emotion bu...,0
1049,1518,body%20bags,"southwest, Tx",Shoot shit up till we see body bags,0
141,203,airplane%20accident,"Spain but Opa-Locka, FL",family members of osama bin laden have died in...,1
619,892,bioterrorism,"Netherlands,Amsterdam-Virtual",In Lies We Trust #dvd CIA Hollywood and Bioter...,0
1941,2792,curfew,"Adelaide, Australia",INFO U. CLD: SCT012 BKN025. EXP INST APCH. RWY...,0




---



Convert Text to Ints



---



In [23]:
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(train_df['text'])

---

Converts Tokens to Sequences of tokens

---

In [24]:
X_train = tokenizer.texts_to_sequences(train_df['text'])
X_test = tokenizer.texts_to_sequences(test_df['text'])

---

Determine Max Length of a Tweet

---

In [25]:
maxlen = determine_maxlen(X_train, percentile=99)
print(maxlen)

28


---

Add padding to ensure all token sequences are the same shape.

---

In [26]:
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

---

Split Train and Dev Sets

---

In [27]:
y_train = train_df['target'].values
X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

---

Run Hyperband Search on data

---

In [None]:

out_folder = "Transformer_DNN"
hyp_search_folder = "hyperband_search"
batch_sizes = [64, 128, 256, 512, 1024, 2048]

hyperband = kt.Hyperband(
    build_model,
    objective='val_accuracy',
    max_epochs=15,
    factor=2,
    hyperband_iterations=3,
    directory=out_folder,
    project_name=hyp_search_folder,
    seed=42
)


hyperband.search(X_train, y_train, epochs=5, validation_data=(X_dev, y_dev))


best_model = hyperband.get_best_models(num_models=1)[0]

for batch_size in batch_sizes:
    print(f"Evaluating batch size {batch_size}")
    loss, mape = best_model.evaluate(X_dev, y_dev, batch_size=batch_size)
    print(f"Loss: {loss}, MAPE: {mape}")


Trial 15 Complete [00h 00m 14s]
val_accuracy: 0.7754431962966919

Best val_accuracy So Far: 0.803676962852478
Total elapsed time: 00h 03m 41s

Search: Running Trial #16

Value             |Best Value So Far |Hyperparameter
2                 |2                 |num_heads
32                |32                |key_dim
88                |88                |dense_units
0.4               |0.4               |dropout_rate
adam              |adam              |optimizer
selu              |selu              |activation
8                 |4                 |tuner/epochs
4                 |2                 |tuner/initial_epoch
3                 |3                 |tuner/bracket
2                 |1                 |tuner/round
0011              |0008              |tuner/trial_id

Epoch 5/8
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 22ms/step - accuracy: 0.9212 - loss: 0.2137 - val_accuracy: 0.7945 - val_loss: 0.6492


---

Print Summary of Search for param optimization

---

In [None]:
random_search.search_space_summary()

random_search.results_summary()

---



In [None]:
best_model.build(input_shape=(None, 28))
best_model.summary()

history = best_model.fit(X_train, y_train,epochs=100,batch_size=256,validation_data=(X_dev, y_dev))


In [None]:
predictions = best_model.predict(X_test)
predictions = (predictions > 0.5).astype(int)

In [None]:
submission = pd.DataFrame({'id': test_df['id'], 'target': predictions.flatten()})
submission.to_csv('submission.csv', index=False)