# NLP Disaster Tweet Classifier

### Project Prerequisites

In [63]:
import re
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Discover the GPUs TensorFlow can see and switch on memory growth for each
# one before any model is built.
gpus = tf.config.experimental.list_physical_devices("GPU")
for device in gpus:
    tf.config.experimental.set_memory_growth(device, True)

# Echo the detected devices so the notebook records which hardware was used.
for device in gpus:
    print(device)

PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


### Preparing the Data

In [7]:
# Load the labelled tweets and use the "id" column as the row index.
train_df = pd.read_csv("Data/train.csv").set_index("id")
train_df.head()

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [8]:
# Keep only the tweet text and its label. Reassigning the result (rather than
# dropping in place) together with errors="ignore" makes this cell safe to
# re-run: a second execution no longer raises KeyError because the columns
# have already been removed.
train_df = train_df.drop(columns=["keyword","location"],errors="ignore")
train_df.head()

Unnamed: 0_level_0,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Our Deeds are the Reason of this #earthquake M...,1
4,Forest fire near La Ronge Sask. Canada,1
5,All residents asked to 'shelter in place' are ...,1
6,"13,000 people receive #wildfires evacuation or...",1
7,Just got sent this photo from Ruby #Alaska as ...,1


### Preprocessing

In [40]:
# Shared NLTK resources, filled on the first call to preprocessing().
_PREPROCESSING_CACHE = {}


def _preprocessing_resources():
    """Return the shared (lemmatizer, stop-word set), building them on first use."""
    if not _PREPROCESSING_CACHE:
        # Build both objects before storing either, so a failure while loading
        # one of them never leaves the cache half-populated.
        lemmatizer = nltk.stem.WordNetLemmatizer()
        stop_words = frozenset(nltk.corpus.stopwords.words("english"))
        _PREPROCESSING_CACHE.update(lemmatizer=lemmatizer, stop_words=stop_words)
    return _PREPROCESSING_CACHE["lemmatizer"], _PREPROCESSING_CACHE["stop_words"]


def preprocessing(x):
    """Clean one tweet and return it as a space-separated string of lemmas.

    Steps, in order:
      1. replace every character that is neither a word character nor
         whitespace with a space;
      2. collapse runs of spaces and lower-case the text;
      3. tokenize with NLTK's ``word_tokenize``;
      4. drop English stop words and lemmatize the remaining tokens.

    The lemmatizer and the stop-word set are created once and reused across
    calls, so applying this function to a whole column does not reload them
    for every row.

    Parameters
    ----------
    x : str
        Raw tweet text.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces.
    """
    without_punctuation = re.sub(r"[^\w\s]"," ",x)
    normalized = re.sub(r" +"," ",without_punctuation).lower()

    wnl, stop_words = _preprocessing_resources()
    tokens = word_tokenize(normalized,language='english')

    return " ".join(wnl.lemmatize(token) for token in tokens if token not in stop_words)

In [41]:
# Clean every raw tweet, then store the result next to the original text.
cleaned_tweets = train_df["text"].apply(preprocessing)
train_df["processed_text"] = cleaned_tweets
train_df.head()

Unnamed: 0_level_0,text,target,processed_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquake may allah forgive u
4,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
5,All residents asked to 'shelter in place' are ...,1,resident asked shelter place notified officer ...
6,"13,000 people receive #wildfires evacuation or...",1,13 000 people receive wildfire evacuation orde...
7,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo ruby alaska smoke wildfire pour...


In [53]:
# Bag-of-words features: fit the vocabulary on the cleaned tweets and expand
# the resulting count matrix into a dense array in one step.
# NOTE(review): the dense array has one column per vocabulary term, so it can
# be large in memory - confirm this fits on the target machine.
vec = CountVectorizer()
X = vec.fit_transform(train_df["processed_text"].values).toarray()

# Labels taken from the "target" column.
y = train_df["target"].values

In [54]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Model Training and Validation

In [68]:
# Feed-forward classifier: two ReLU hidden layers (64 and 32 units) followed
# by a single sigmoid unit. The input width is the number of feature columns
# in X_train.
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64,activation="relu",input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(32,activation="relu"),
    tf.keras.layers.Dense(1,activation="sigmoid"),
])

model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_14 (Dense)             (None, 64)                1296832   
_________________________________________________________________
dense_15 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_16 (Dense)             (None, 1)                 33        
Total params: 1,298,945
Trainable params: 1,298,945
Non-trainable params: 0
_________________________________________________________________


In [69]:
# Training configuration.
OPTIMIZERS = "adam"
# The network ends in a single sigmoid unit and is trained on 0/1 labels, so
# binary cross-entropy is the matching loss; mean squared error was used
# before and is a poor fit for a probability output.
LOSS = "binary_crossentropy"
METRICS = ["accuracy"]

model.compile(loss=LOSS,optimizer=OPTIMIZERS,metrics=METRICS)

In [72]:
# Training schedule.
EPOCHS = 5
# NOTE(review): a batch size of 1 means one optimizer step per training row,
# which is slow and gives noisy updates - confirm this is intentional.
BATCH_SIZE = 1

# Keyword options for model.fit, gathered in one place for readability.
fit_options = dict(
    validation_data=(X_val,y_val),
    batch_size=BATCH_SIZE,
    validation_batch_size=BATCH_SIZE,
    epochs=EPOCHS,
)

# Keep the returned History object so the training curves can be inspected.
hist = model.fit(X_train,y_train,**fit_options)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
