In [1]:
import numpy as np 
import pandas as pd 
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Flatten,Dense,LSTM


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [2]:
tweets_df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")

### **Looking at the data**

In [3]:
tweets_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
np.shape(tweets_df)

(7613, 5)

In [6]:
tweets_df.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [7]:
tweets_df.location.isnull().sum()

2533

In [8]:
tweets_df.keyword.isnull().sum()

61

In [9]:
# Replace every missing value with a placeholder string. Per the .info() output
# above, only `keyword` and `location` contain nulls.
tweets_df = tweets_df.fillna(value='Not Available')

In [10]:
tweets_df.isnull().sum()

id          0
keyword     0
location    0
text        0
target      0
dtype: int64

#### **Before building the model, it is essential to clean the data by performing the following steps:**
<b>
1) Removing punctuation<br>
2) Removing HTML tags<br>
3) Removing twitter handles along with the '@'<br>
4) Removing URLs<br>
5) Removing stop words<br>    
6) Converting each tweet to lower case<br></b>

In [11]:
from nltk.corpus import stopwords
import string
import re

In [12]:
# NLTK's English stop-word list, held as a set.
stop_words = set(stopwords.words('english'))
print(stop_words)
# Words listed here are kept in the tweets even though NLTK treats them as
# stop words; "not" is retained so that negation survives cleaning.
exclude_words = {'not'}
updated_stop_words = stop_words.difference(exclude_words)

{'more', 'of', 'there', 'haven', 'shouldn', 'needn', 'on', 's', 'the', "you'll", 'is', 'as', 'it', "shouldn't", "mustn't", "don't", 'these', 'hasn', 'been', 'from', 'because', "wasn't", 'its', 'ma', 'ain', 'herself', 'am', 'whom', 'are', 'if', 'where', 'being', 'about', 'myself', "that'll", 'an', 'them', 'themselves', 'each', 've', "you're", 'just', 'will', "didn't", 'above', 'for', "couldn't", 'weren', 'his', 'doing', 'a', 'why', "mightn't", 'who', 'now', 'mightn', 'their', 'won', 'does', 'should', 'this', 'over', 'what', 'he', 'those', 'we', 'same', 'between', 'isn', 'any', "weren't", 'here', 'ourselves', 're', 'again', 'very', "shan't", 'or', 'hadn', 'm', 'and', 'other', 'few', 'she', 'you', 'd', 'nor', "won't", 'me', 't', 'most', "you've", 'yourselves', 'no', 'not', 'down', 'until', 'only', 'didn', 'y', "isn't", 'below', 'mustn', 'some', 'so', 'too', 'did', 'when', "doesn't", 'has', 'during', 'your', 'out', 'off', 'but', 'wouldn', "she's", 'to', 'such', 'itself', 'was', 'aren', "ha

In [13]:
def clean_data(tweet, stop_words=None):
    """Normalise a raw tweet for tokenisation.

    Steps, in order:
      1. remove URLs,
      2. remove Twitter handles (``@name``),
      3. remove HTML tags,
      4. remove punctuation,
      5. convert to lower case,
      6. drop stop words and collapse runs of whitespace.

    URLs, handles and tags are removed *before* punctuation because their
    patterns rely on the characters ``: / @ < >``; stripping punctuation first
    would leave those patterns with nothing to match.

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    stop_words : collection of str, optional
        Lower-case words to drop. Defaults to the notebook-level
        ``updated_stop_words``.

    Returns
    -------
    str
        The cleaned tweet, with tokens separated by single spaces.
    """
    if stop_words is None:
        stop_words = updated_stop_words

    text = re.sub(r'https?://\S+', '', tweet)
    text = re.sub(r'@\S+', '', text)
    # Matches opening, closing and attribute-carrying tags, e.g. <b>, </b>, <a href="...">.
    text = re.sub(r'</?[a-zA-Z][^>]*>', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Lower-case before filtering so capitalised stop words ("The") are matched too.
    text = text.lower()
    # NOTE: stop words that contain an apostrophe (e.g. "don't") cannot match
    # here because punctuation has already been stripped from the tweet.
    # split() without an argument also discards the empty tokens left behind
    # by the removals above.
    return " ".join(word for word in text.split() if word not in stop_words)

#### **Splitting the dataset into training and test set**

In [14]:
from sklearn.model_selection import train_test_split as tts

In [15]:
tweets = tweets_df.text

In [16]:
# Features: the cleaned tweet text. Labels: the `target` column.
X = tweets.apply(clean_data)
Y = tweets_df['target']

In [17]:
# Hold out 25% of the labelled tweets; the fixed random_state keeps the split
# the same across runs.
X_train, X_test, Y_train, Y_test = tts(
    X,
    Y,
    test_size=0.25,
    random_state=1,
)

In [18]:
X_train.shape,Y_train.shape

((5709,), (5709,))

#### **Performing text preprocessing**

In [19]:
# Word-level tokenizer with num_words=5000, which is also the vocabulary size
# passed to the Embedding layer.
token = Tokenizer(
    num_words=5000,
    lower=True,
    split=' ',
    char_level=False,
)

#### **Tokenizing the tweets and converting the tokens to integer sequences using Tokenizer**

In [20]:
# Build the vocabulary from the training split only, then encode both splits
# with that same vocabulary.
token.fit_on_texts(X_train.values)
X_train_seq, X_test_seq = (
    token.texts_to_sequences(texts) for texts in (X_train, X_test)
)

#### **Padding the training and testing data**

In [21]:
# Bring every sequence to a fixed length of 25 tokens, the input length the
# Embedding layer is built with.
X_train_padded, X_test_padded = (
    pad_sequences(seqs, maxlen=25) for seqs in (X_train_seq, X_test_seq)
)

### **Building the model**

In [22]:
model = Sequential()

In [23]:
# Stack: token-index embedding -> single LSTM layer -> sigmoid unit for the
# binary target.
for layer in (
    Embedding(5000, 16, input_length=25),
    LSTM(20, dropout=0.5),
    Dense(1, activation='sigmoid'),
):
    model.add(layer)

In [24]:
# Binary cross-entropy pairs with the single sigmoid output unit.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [25]:
# Train for 25 epochs, reporting loss/accuracy on the held-out split after each epoch.
model.fit(
    x=X_train_padded,
    y=Y_train,
    validation_data=(X_test_padded, Y_test),
    epochs=25,
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7f106c289710>

In [26]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 25, 16)            80000     
_________________________________________________________________
lstm (LSTM)                  (None, 20)                2960      
_________________________________________________________________
dense (Dense)                (None, 1)                 21        
Total params: 82,981
Trainable params: 82,981
Non-trainable params: 0
_________________________________________________________________


#### **Loading the competition test set to generate predictions**

In [27]:
test_data = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

In [28]:
test_data.shape

(3263, 4)

In [29]:
X_test_data = test_data.text

In [30]:
# Clean the competition test tweets with the same function used for training.
# Reading from test_data (rather than reassigning X_test_data from itself)
# keeps this cell idempotent when it is re-run on its own.
X_test_data = test_data['text'].apply(clean_data)

In [31]:
# Encode with the tokenizer fitted on the training split, then pad to the
# model's input length of 25.
X_test_data_seq = token.texts_to_sequences(X_test_data)
X_test_data_padded = pad_sequences(sequences=X_test_data_seq, maxlen=25)

In [32]:
Y_predicted_test_data = model.predict(X_test_data_padded)

In [33]:
# The model ends in a single sigmoid unit, so predictions have one column per
# row; keep that first column as a flat array. Using an array instead of a
# lazy, single-use `map` iterator means the cell can be re-run safely.
Y_predicted_test_data = np.asarray(Y_predicted_test_data)
if Y_predicted_test_data.ndim > 1:
    Y_predicted_test_data = Y_predicted_test_data[:, 0]

In [34]:
Y_predicted_test_data = pd.Series(Y_predicted_test_data)

In [35]:
def convert_to_label(value, threshold=0.5):
    """Convert a predicted probability into a binary class label.

    Parameters
    ----------
    value : float
        Model output for one sample.
    threshold : float, optional
        Decision boundary; values greater than or equal to it map to 1.
        Defaults to 0.5.

    Returns
    -------
    int
        1 if ``value >= threshold``, otherwise 0. The function always returns
        an integer: inputs below zero, and NaN (which compares false), map to
        0 instead of falling through to an implicit ``None``.
    """
    return int(value >= threshold)

In [36]:
# Threshold each predicted probability into a 0/1 label.
Y_predicted_test_data = Y_predicted_test_data.apply(convert_to_label)

In [37]:
Y_predicted_test_data

0       1
1       1
2       1
3       1
4       1
       ..
3258    1
3259    1
3260    1
3261    1
3262    1
Length: 3263, dtype: int64

In [38]:
Y_predicted_test_data.value_counts()

0    1959
1    1304
dtype: int64

In [39]:
test_data['target'] = Y_predicted_test_data

In [40]:
test_data.head()

Unnamed: 0,id,keyword,location,text,target
0,0,,,Just happened a terrible car crash,1
1,2,,,"Heard about #earthquake is different cities, s...",1
2,3,,,"there is a forest fire at spot pond, geese are...",1
3,9,,,Apocalypse lighting. #Spokane #wildfires,1
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,1


In [41]:
# Keep only the two columns that make up the submission: the tweet id and the
# predicted label.
final_submission = test_data[['id', 'target']]
# index=False: by default to_csv also writes the DataFrame's row index as an
# extra unnamed first column, which does not belong in the submission file.
final_submission.to_csv('disaster_tweets_submission.csv', index=False)