In [1]:
#Import Libraries

import tensorflow as tf
import nltk 
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import gensim
from gensim import models
from gensim.models import Word2Vec, KeyedVectors
from tensorflow.keras.layers import Dropout

from sklearn.metrics import confusion_matrix

In [2]:
dataset = pd.read_csv('train.csv')
dataset.head(5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
y= dataset['target'].values
x= dataset.drop(columns='target',axis=1)

In [4]:
x

Unnamed: 0,id,keyword,location,text
0,1,,,Our Deeds are the Reason of this #earthquake M...
1,4,,,Forest fire near La Ronge Sask. Canada
2,5,,,All residents asked to 'shelter in place' are ...
3,6,,,"13,000 people receive #wildfires evacuation or..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...
...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611,10872,,,Police investigating after an e-bike collided ...


In [5]:
def clean(data):
    data = data.lower()
    data = re.sub('https?://\S+|www\.\S+', ' ', data)
    data = re.sub('\\W', ' ', data)
    data = re.sub('\n', ' ', data)
    data = re.sub(' +', ' ', data)
    data = re.sub('^ ', ' ', data)
    data = re.sub(' $', ' ', data)
    data = re.sub('#', ' ', data)
    data = re.sub('@', ' ', data)
    data = re.sub('[^a-zA-Z]',' ', data)
    data = data.replace('%20',' ')
    return data

In [6]:
for i in range(len(x)):
    x['text'][i] = clean(x['text'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['text'][i] = clean(x['text'][i])


In [7]:
x

Unnamed: 0,id,keyword,location,text
0,1,,,our deeds are the reason of this earthquake ma...
1,4,,,forest fire near la ronge sask canada
2,5,,,all residents asked to shelter in place are be...
3,6,,,people receive wildfires evacuation ord...
4,7,,,just got sent this photo from ruby alaska as s...
...,...,...,...,...
7608,10869,,,two giant cranes holding a bridge collapse int...
7609,10870,,,aria ahrary thetawniest the out of control wi...
7610,10871,,,m utc km s of volcano hawaii
7611,10872,,,police investigating after an e bike collided ...


In [8]:
corpus=[]
lemmatizer = WordNetLemmatizer()
for i in range(len(x)):
    words = nltk.word_tokenize(x['text'][i])
    words = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
    words = ' '.join(words)
    corpus.append(words)
    #x['text'][i] =  re.sub(r"[^a-zA-Z]+", ' ', x['text'][i])
    #x['text'][i] =  x['text'][i].lower()

In [9]:
corpus

['deed reason earthquake may allah forgive u',
 'forest fire near la ronge sask canada',
 'resident asked shelter place notified officer evacuation shelter place order expected',
 'people receive wildfire evacuation order california',
 'got sent photo ruby alaska smoke wildfire pours school',
 'rockyfire update california hwy closed direction due lake county fire cafire wildfire',
 'flood disaster heavy rain cause flash flooding street manitou colorado spring area',
 'top hill see fire wood',
 'emergency evacuation happening building across street',
 'afraid tornado coming area',
 'three people died heat wave far',
 'haha south tampa getting flooded hah wait second live south tampa gon na gon na fvck flooding',
 'raining flooding florida tampabay tampa day lost count',
 'flood bago myanmar arrived bago',
 'damage school bus multi car crash breaking',
 'man',
 'love fruit',
 'summer lovely',
 'car fast',
 'goooooooaaaaaal',
 'ridiculous',
 'london cool',
 'love skiing',
 'wonderful day'

In [10]:
tf.__version__

'2.6.0'

In [11]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.preprocessing.text import one_hot

In [12]:
voc_size=5000

In [13]:
one_hotrep = [one_hot(word,voc_size) for word in corpus] 

In [14]:
one_hotrep[0]

[1203, 879, 1972, 2984, 2061, 4458, 3727]

# Embedding Representation

In [34]:
sentence_lenth = 8
embeeddocs = pad_sequences(one_hotrep,padding='pre',maxlen=sentence_lenth)

In [35]:
embeeddocs

array([[   0, 1203,  879, ..., 2061, 4458, 3727],
       [   0, 1921, 2497, ..., 1104, 2079, 1143],
       [ 329, 3679, 1786, ...,  329, 4252, 3089],
       ...,
       [   0,    0,    0, ..., 3267, 4412, 2577],
       [1702,   68,  673, ..., 1975,  860, 3752],
       [1053, 3980, 2905, ..., 2407, 4049, 2274]])

# Create Model

In [36]:
embeddingvectorfeature = 40
model = Sequential()
model.add(Embedding(voc_size,embeddingvectorfeature,input_length=sentence_lenth))
model.add(Dropout(0.4))
model.add(LSTM(1000))
model.add(Dropout(0.4))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model

<keras.engine.sequential.Sequential at 0x2a3e1150550>

In [37]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 8, 40)             200000    
_________________________________________________________________
dropout_2 (Dropout)          (None, 8, 40)             0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 1000)              4164000   
_________________________________________________________________
dropout_3 (Dropout)          (None, 1000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 1001      
Total params: 4,365,001
Trainable params: 4,365,001
Non-trainable params: 0
_________________________________________________________________


In [38]:
len(embeeddocs),y.shape

(7613, (7613,))

In [39]:
x_final=np.array(embeeddocs)
y_final=np.array(y)

In [40]:
x_final.shape,y_final.shape

((7613, 8), (7613,))

In [41]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x_final, y_final, test_size=0.33, random_state=42)

In [42]:
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=50,batch_size=100)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x2a3e122d4f0>

# Performance Metrics And Accuracy

In [43]:
y_pred=model.predict(X_test) 
y_pred=np.argmax(y_pred,axis=1)

In [44]:
confusion_matrix(y_test,y_pred)

array([[1446,    0],
       [1067,    0]], dtype=int64)

In [45]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.575407879029049

In [46]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.58      1.00      0.73      1446
           1       0.00      0.00      0.00      1067

    accuracy                           0.58      2513
   macro avg       0.29      0.50      0.37      2513
weighted avg       0.33      0.58      0.42      2513



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
