In [1]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Load data
train_data = pd.read_csv("data/nlp-tweets/train.csv")
test_data = pd.read_csv("data/nlp-tweets/test.csv")

print(test_data.head())
print(train_data.sample())

   id keyword location                                               text
0   0     NaN      NaN                 Just happened a terrible car crash
1   2     NaN      NaN  Heard about #earthquake is different cities, s...
2   3     NaN      NaN  there is a forest fire at spot pond, geese are...
3   9     NaN      NaN           Apocalypse lighting. #Spokane #wildfires
4  11     NaN      NaN      Typhoon Soudelor kills 28 in China and Taiwan
        id       keyword location  \
3007  4321  dust%20storm      NaN   

                                                   text  target  
3007  Good way to end the day!!! Geyser plus dust st...       1  


In [3]:
train_features = train_data.drop(['id', 'keyword', 'location', 'target'], axis=1)
train_labels = train_data['target'].values
test_features = test_data.drop(['id', 'keyword', 'location'], axis=1)

all_text = np.concatenate((train_features['text'].values, test_features['text'].values))

In [4]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_text)
vocab_size = len(tokenizer.word_index) + 1
print('Size of Vocabulary:', vocab_size)

Size of Vocabulary: 29320


In [5]:
max_tweet_length = max(len(tweet.split()) for tweet in all_text)
print('Maximum tweet length:', max_tweet_length)

Maximum tweet length: 31


In [6]:
train_sequences = tokenizer.texts_to_sequences(train_features['text'].values)
test_sequences = tokenizer.texts_to_sequences(test_features['text'].values)

# Pad sequences
train_padded = pad_sequences(train_sequences, maxlen=max_tweet_length, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_tweet_length, padding='post')

In [7]:
vocab_size = 10000
embedding_size = 200
hidden_units = 256
max_tweet_length = 120

# Build model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=max_tweet_length),
    Bidirectional(LSTM(hidden_units)),
    Dropout(0.2),
    Dense(256, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

model.build(input_shape=(None, max_tweet_length))
model.summary()



In [9]:
maxlen = max(len(x.split()) for x in all_text)
print('Maximum length of tweet : ', maxlen)

Maximum length of tweet :  31


In [10]:
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.0001), metrics=['accuracy'])

# Train model
batch_size = 256
epochs = 10
model_history = model.fit(train_padded, train_labels, batch_size=batch_size, epochs=epochs)

Epoch 1/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 166ms/step - accuracy: 0.5609 - loss: 0.6865
Epoch 2/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 160ms/step - accuracy: 0.5771 - loss: 0.6656
Epoch 3/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 170ms/step - accuracy: 0.7043 - loss: 0.6092
Epoch 4/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 199ms/step - accuracy: 0.7673 - loss: 0.5064
Epoch 5/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 161ms/step - accuracy: 0.8285 - loss: 0.4114
Epoch 6/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 182ms/step - accuracy: 0.8538 - loss: 0.3555
Epoch 7/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 199ms/step - accuracy: 0.8880 - loss: 0.2952
Epoch 8/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 159ms/step - accuracy: 0.9047 - loss: 0.2518
Epoch 9/10
[1m30/30[0m [32m━━━━━━━━━━

In [11]:
predictions = model.predict(test_padded)
predictions = (predictions > 0.5).astype(int).reshape(-1)
submission = pd.read_csv("data/nlp-tweets/sample_submission.csv")
submission["target"] = predictions
submission.to_csv("data/nlp-tweets/submission.csv", index=False)

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 32ms/step
