
In this notebook we will use a recurrent neural network to predict whether a *tweet* is about a real disaster. To do this, we will use *Kaggle.com*'s competition [Natural Language Processing with Disaster Tweets](https://www.kaggle.com/c/nlp-getting-started). Please follow the competition directions to obtain the data and evaluate your final model, noting the extra requirements below.

In [None]:
#import libraries
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import numpy as np




In [None]:
# Read the labelled training tweets from the working directory,
# then preview the first five rows as the cell output.
train_data = pd.read_csv(filepath_or_buffer='train.csv')
train_data.head(n=5)


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
# Hold out 20% of the labelled tweets for validation; the fixed random_state
# keeps the split identical from run to run.
from sklearn.model_selection import train_test_split

train_text, val_text, train_labels, val_labels = train_test_split(
    train_data['text'],
    train_data['target'],
    test_size=0.2,
    random_state=42,
)

In [None]:
!pip install tensorflow

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Vocabulary cap for the tokenizer and the fixed length every tweet is padded/cut to
vocab_size = 10000
max_seq_len = 50


def pad_to_max_len(sequences):
    """Return `sequences` post-padded / post-truncated to `max_seq_len` columns."""
    return pad_sequences(sequences, maxlen=max_seq_len, padding='post', truncating='post')


# Build the word index from the training tweets only
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(train_text)

# Training split: text -> integer ids -> fixed-length matrix
train_sequences = tokenizer.texts_to_sequences(train_text)
train_padded = pad_to_max_len(train_sequences)

# Validation split goes through the same fitted tokenizer and padding
val_sequences = tokenizer.texts_to_sequences(val_text)
val_padded = pad_to_max_len(val_sequences)




In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
# Model hyper-parameters
embedding_dim = 100
lstm_units = 64
output_dim = 1

# Seed the Python, NumPy and TensorFlow RNGs before any layer is created so that
# weight initialisation is repeatable between runs, as far as the backend allows
# (42 matches the random_state used for the train/validation split).
keras.utils.set_random_seed(42)

# Sequential classifier: token ids -> embeddings -> LSTM -> one sigmoid unit
# NOTE(review): the inputs are post-padded with 0 and the Embedding below does not
# mask them, so the LSTM also steps over the padding; consider mask_zero=True
# after confirming behaviour on the Keras version in use.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_seq_len))
model.add(LSTM(units=lstm_units))
model.add(Dense(units=output_dim, activation='sigmoid'))

# Binary target, so binary cross-entropy loss; accuracy is tracked as the metric
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
from sklearn.metrics import f1_score

# Training hyper-parameters
epochs = 15
batch_size = 20

# Train the model; validation_data makes Keras report loss/accuracy on the
# held-out split after every epoch.
model.fit(train_padded, train_labels, epochs=epochs, batch_size=batch_size, validation_data=(val_padded, val_labels))

# Evaluate on the validation split. The model is compiled with one metric, so
# evaluate() returns [loss, accuracy]. `accuracy` keeps that full list (its
# original value); the unpacked names expose each number on its own.
accuracy = model.evaluate(val_padded, val_labels)
val_loss, val_accuracy = accuracy
print('Validation loss:', val_loss)
print('Validation accuracy:', val_accuracy)

# Turn sigmoid outputs into hard 0/1 labels with an explicit 0.5 threshold
# (same result as rounding for outputs in [0, 1], but the cut-off is visible).
y_prob = model.predict(val_padded)
y_pred = (y_prob > 0.5).astype(int)

# F1 score on the validation split
f1 = f1_score(val_labels, y_pred)
print('F1 score:', f1)