In [None]:
# from google.colab import files
# uploaded = files.upload()

In [1]:
%tensorflow_version 2.x
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing import sequence
import pandas as pd

Using TensorFlow backend.


In [2]:
train = pd.read_csv('train.csv')
print(train.head(10))

   id keyword  ...                                               text target
0   1     NaN  ...  Our Deeds are the Reason of this #earthquake M...      1
1   4     NaN  ...             Forest fire near La Ronge Sask. Canada      1
2   5     NaN  ...  All residents asked to 'shelter in place' are ...      1
3   6     NaN  ...  13,000 people receive #wildfires evacuation or...      1
4   7     NaN  ...  Just got sent this photo from Ruby #Alaska as ...      1
5   8     NaN  ...  #RockyFire Update => California Hwy. 20 closed...      1
6  10     NaN  ...  #flood #disaster Heavy rain causes flash flood...      1
7  13     NaN  ...  I'm on top of the hill and I can see a fire in...      1
8  14     NaN  ...  There's an emergency evacuation happening now ...      1
9  15     NaN  ...  I'm afraid that the tornado is coming to our a...      1

[10 rows x 5 columns]


In [3]:
test = pd.read_csv('test.csv')


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
train.describe()

Unnamed: 0,id,target
count,7613.0,7613.0
mean,5441.934848,0.42966
std,3137.11609,0.49506
min,1.0,0.0
25%,2734.0,0.0
50%,5408.0,0.0
75%,8146.0,1.0
max,10873.0,1.0


# **Convert to Lower Case**

In [6]:
train["text"] = train["text"].str.lower()
train.head()

test["text"] = test["text"].str.lower()
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,just happened a terrible car crash
1,2,,,"heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,apocalypse lighting. #spokane #wildfires
4,11,,,typhoon soudelor kills 28 in china and taiwan


# **Remove Punctuations**

In [7]:
import string
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

train["text"] = train["text"].apply(lambda text: remove_punctuation(text))
train.head()

test["text"] = test["text"].apply(lambda text: remove_punctuation(text))
test.head()


Unnamed: 0,id,keyword,location,text
0,0,,,just happened a terrible car crash
1,2,,,heard about earthquake is different cities sta...
2,3,,,there is a forest fire at spot pond geese are ...
3,9,,,apocalypse lighting spokane wildfires
4,11,,,typhoon soudelor kills 28 in china and taiwan


In [8]:
l = ' '.join(list(train['text'])).split()
print(l) 
print(len(set(l)))

l = ' '.join(list(test['text'])).split()
print(l) 
print(len(set(l)))


22697
12721


# **STOPWORDS**

In [9]:
!pip install nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))
STOPWORDS = set(stopwords.words('english'))


def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

train["text"] = train["text"].apply(lambda text: remove_stopwords(text))
train.head()

test["text"] = test["text"].apply(lambda text: remove_stopwords(text))
train.head()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,id,keyword,location,text,target
0,1,,,deeds reason earthquake may allah forgive us,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,residents asked shelter place notified officer...,1
3,6,,,13000 people receive wildfires evacuation orde...,1
4,7,,,got sent photo ruby alaska smoke wildfires pou...,1


In [10]:

l = ' '.join(list(train['text'])).split()
print(l) 
print(len(set(l)))

l = ' '.join(list(test['text'])).split()
print(l) 
print(len(set(l)))

22564
12590


# **STEMMING**

In [11]:
from nltk.stem.porter import PorterStemmer

 

stemmer = PorterStemmer()
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

train["text"] = train["text"].apply(lambda text: stem_words(text))
train.head()

test["text"] = test["text"].apply(lambda text: stem_words(text))
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,happen terribl car crash
1,2,,,heard earthquak differ citi stay safe everyon
2,3,,,forest fire spot pond gees flee across street ...
3,9,,,apocalyps light spokan wildfir
4,11,,,typhoon soudelor kill 28 china taiwan


In [12]:
l = ' '.join(list(train['text'])).split()
print(l) 
print(len(set(l)))

l = ' '.join(list(test['text'])).split()
print(l) 
print(len(set(l)))

19548
10917


# **LEMMATIZATION**

In [13]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

train["text"] = train["text"].apply(lambda text: lemmatize_words(text))
train.head()


test["text"] = test["text"].apply(lambda text: lemmatize_words(text))
test.head()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Unnamed: 0,id,keyword,location,text
0,0,,,happen terribl car crash
1,2,,,heard earthquak differ citi stay safe everyon
2,3,,,forest fire spot pond gee flee across street c...
3,9,,,apocalyps light spokan wildfir
4,11,,,typhoon soudelor kill 28 china taiwan


In [14]:
l = ' '.join(list(train['text'])).split()
print(l) 
print(len(set(l)))

l = ' '.join(list(test['text'])).split()
print(l) 
print(len(set(l)))

19456
10851


# **REMOVE URLS**

In [15]:
import re
def remove_urls(text):
    url_pattern = re.compile(r'http\S+|www\.\S+')
    return url_pattern.sub(r'', text)


train["text"] = train["text"].apply(lambda text: remove_urls(text))
train.head() 

test["text"] = test["text"].apply(lambda text: remove_urls(text))
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,deed reason earthquak may allah forgiv u,1
1,4,,,forest fire near la rong sask canada,1
2,5,,,resid ask shelter place notifi offic evacu she...,1
3,6,,,13000 peopl receiv wildfir evacu order california,1
4,7,,,get sent photo rubi alaska smoke wildfir pour ...,1


In [16]:
l = ' '.join(list(test['text'])).split()
print(l) 
print(len(set(l)))

8820


**UNIQUE** **KEYWORDS**

In [17]:
print (train.keyword.nunique(), test.keyword.nunique())
print (set(train.keyword.unique()) - set(test.keyword.unique()))

221 221
set()


In [18]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score

In [19]:
y_train = train.pop('target')
train = train.drop(['id', 'keyword','location'], axis = 1)
test = test.drop(['id', 'keyword','location'], axis = 1)
print(train.head())

                                                text
0           deed reason earthquak may allah forgiv u
1               forest fire near la rong sask canada
2  resid ask shelter place notifi offic evacu she...
3  13000 peopl receiv wildfir evacu order california
4  get sent photo rubi alaska smoke wildfir pour ...


In [20]:
from keras.preprocessing.text import Tokenizer
from keras_preprocessing import sequence
max_words = len(set(l)) + 2
max_len = len(max(list(train['text']))) + 1
tok = Tokenizer(num_words = max_words)
print(tok)
tok.fit_on_texts(train['text'])

<keras_preprocessing.text.Tokenizer object at 0x7efcd7c60ac8>


# TEXT TO TOKEN **CONVERSION**

In [21]:
sequences = tok.texts_to_sequences(train['text'])
print(sequences)

sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)
print(sequences_matrix)

[[3771, 466, 221, 90, 1375, 2976, 6], [149, 3, 199, 529, 5470, 5471, 1051], [1376, 530, 1863, 396, 5472, 234, 43, 1863, 396, 353, 452], [2469, 12, 2470, 101, 43, 353, 48], [1, 2471, 153, 3772, 1662, 212, 101, 2472, 133], [2473, 207, 48, 1377, 531, 327, 804, 467, 891, 354, 3, 3773, 101], [20, 27, 730, 190, 63, 700, 20, 453, 5473, 892, 805, 252], [4, 200, 1119, 30, 3, 1864], [272, 21, 43, 208, 26, 731, 453], [4, 2116, 355, 24, 252], [501, 12, 114, 264, 116, 502], [732, 549, 2474, 1, 20, 2977, 338, 414, 52, 549, 2474, 290, 290, 5474, 20], [190, 20, 1663, 5475, 2474, 1204, 1052, 34, 291, 312, 2978], [20, 3774, 765, 766, 3774], [134, 133, 328, 1664, 3775, 58, 22, 117], [503, 59], [39, 1504], [292, 39], [58, 640], [5476], [2117], [942, 415], [39, 2475], [504, 34], [5477], [5478, 65, 1120, 201], [1121, 94, 293], [39, 1865], [5479], [2, 3776], [191], [5480, 2118, 279, 532], [265, 155, 339, 730, 1053, 61], [5481, 117, 5482, 593, 165, 532, 996], [701, 165, 532], [1505, 505, 32, 806, 94, 227, 532

# **K FOLD CROSS VALIDATION + MODEL**

In [22]:
from tensorflow.keras.models import Model,Sequential
from tensorflow.keras.layers import LSTM,Activation,Dense,Dropout,Input,Embedding
from sklearn.model_selection import StratifiedKFold
import numpy

In [23]:

seed = 7
numpy.random.seed(seed)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
cvscores = []
for tr, validate in kfold.split(sequences_matrix, y_train):
    
    model = Sequential()
    model.add(Embedding(max_words,32)),
    model.add(LSTM(32))
    model.add(Dense(1,activation='sigmoid'))
    model.compile(loss='binary_crossentropy',optimizer='rmsprop',metrics=['accuracy'])
    history = model.fit(sequences_matrix[tr],y_train[tr],epochs = 5,batch_size=200)
    scores = model.evaluate(sequences_matrix[validate], y_train[validate], verbose=2)
	
    cvscores.append(scores[1] * 100)
print((numpy.mean(cvscores), numpy.std(cvscores))) 

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
48/48 - 0s - loss: 0.4927 - accuracy: 0.7892
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
48/48 - 0s - loss: 0.4800 - accuracy: 0.7754
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
48/48 - 0s - loss: 0.4279 - accuracy: 0.8194
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
48/48 - 0s - loss: 0.4605 - accuracy: 0.8081
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
48/48 - 0s - loss: 0.4700 - accuracy: 0.7924
(79.69271779060364, 1.5323028499026372)


In [24]:
sequences_test = tok.texts_to_sequences(test['text'])
print(sequences_test)

sequences_matrix_test = sequence.pad_sequences(sequences_test,maxlen=max_len)
print(sequences_matrix_test)

[[208, 1687, 58, 22], [482, 221, 834, 140, 420, 1262, 192], [149, 3, 607, 2957, 2259, 731, 453, 1212, 112], [401, 303, 5924, 101], [465, 587, 17, 4253, 411, 1039], [221], [2626, 633, 44, 193, 95, 2299, 1305, 3046, 3046], [620], [1075, 697], [84], [15, 2, 1086], [15], [15, 266], [], [893], [3442, 2118, 279, 532, 796, 14, 3, 117, 3442, 2118, 279], [1379, 1063, 1739, 532], [7173, 675, 165, 885, 421, 532], [239, 2979], [4551, 2070, 2987, 323, 240, 240, 532], [3645, 53, 532, 5], [16, 59, 532, 10], [4125, 423, 12, 35, 166, 123, 113, 1571, 315, 374, 690, 25, 165, 242, 532, 6865], [1923, 553, 114, 165, 532, 230, 114, 16, 103, 165], [165, 532], [1291, 807, 2024, 135, 165, 532, 34, 179, 7, 7903, 7632, 504], [551, 532, 178, 740], [16, 77, 167, 140, 4135, 26, 692, 819, 970, 8388, 5, 165, 532], [1090, 315, 532, 4696, 2759], [79, 1458, 2732, 1382, 47, 468], [1, 39, 16, 715, 683, 2606, 282, 67, 35, 358, 4, 79, 3614], [534, 5184, 201, 79], [75, 58, 793, 58, 3199, 614, 830], [4345, 4349, 44, 612, 398, 

In [25]:
pred = model.predict(sequences_matrix_test)
pred = (pred.reshape(3263,))
pred = (pred.tolist())

print(pred)
print(len(pred))
for i in range(0,len(pred)):
    
    if pred[i]>= 0.5:
        pred[i] = 1
    else:
        pred[i] = 0  

print(pred)

[0.6486251950263977, 0.6747185587882996, 0.9005126357078552, 0.5705950260162354, 0.9476877450942993, 0.41277042031288147, 0.04064643383026123, 0.13974881172180176, 0.08891898393630981, 0.1157045066356659, 0.09497672319412231, 0.15868404507637024, 0.1466411054134369, 0.17347344756126404, 0.10504856705665588, 0.8820221424102783, 0.11014291644096375, 0.8022310733795166, 0.08335140347480774, 0.09350112080574036, 0.08239522576332092, 0.2588297724723816, 0.04639121890068054, 0.8494317531585693, 0.2627912759780884, 0.7468857169151306, 0.09657731652259827, 0.6004639267921448, 0.04747477173805237, 0.8435869812965393, 0.03238481283187866, 0.3686036467552185, 0.7530173063278198, 0.243003249168396, 0.5824182629585266, 0.05564379692077637, 0.6743093729019165, 0.08289828896522522, 0.058425456285476685, 0.7691811323165894, 0.1875000298023224, 0.6456369161605835, 0.27347972989082336, 0.6455075740814209, 0.02224266529083252, 0.19134867191314697, 0.03876978158950806, 0.0054943859577178955, 0.14067798852