# Import The Required Packages

In [1]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import time

# Load The Dataset

In [3]:
df = pd.read_csv("D:/DataSets/tweets.csv")
df.head()

Unnamed: 0,id,keyword,location,text,target
0,0,ablaze,,"Communal violence in Bhainsa, Telangana. ""Ston...",1
1,1,ablaze,,Telangana: Section 144 has been imposed in Bha...,1
2,2,ablaze,New York City,Arsonist sets cars ablaze at dealership https:...,1
3,3,ablaze,"Morgantown, WV",Arsonist sets cars ablaze at dealership https:...,1
4,4,ablaze,,"""Lord Jesus, your love brings freedom and pard...",0


In [4]:
df.shape

(11370, 5)

In [5]:
print((df.target == 1).sum()) # Disaster
print((df.target == 0).sum()) # No Disaster

2114
9256


In [6]:
# Preprocessing
import re
import string

def remove_URL(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r"", text)

# https://stackoverflow.com/questions/34293875/how-to-remove-punctuation-marks-from-a-string-in-python-3-x-using-translate/34294022
def remove_punct(text):
    translator = str.maketrans("", "", string.punctuation)
    return text.translate(translator)

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [7]:

pattern = re.compile(r"https?://(\S+|www)\.\S+")
for t in df.text:
    matches = pattern.findall(t)
    for match in matches:
        print(t)
        print(match)
        print(pattern.sub(r"", t))
    if len(matches) > 0:
        break

Arsonist sets cars ablaze at dealership https://t.co/gOQvyJbpVI
t
Arsonist sets cars ablaze at dealership 


In [8]:
df["text"] = df.text.map(remove_URL) # map(lambda x: remove_URL(x))
df["text"] = df.text.map(remove_punct)

In [9]:
# remove stopwords
# pip install nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Stop Words: A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine
# has been programmed to ignore, both when indexing entries for searching and when retrieving them 
# as the result of a search query.
stop = set(stopwords.words("english"))

# https://stackoverflow.com/questions/5486337/how-to-remove-stop-words-using-nltk-or-python
def remove_stopwords(text):
    filtered_words = [word.lower() for word in text.split() if word.lower() not in stop]
    return " ".join(filtered_words)

[nltk_data] Downloading package stopwords to C:\Users\Devmallya
[nltk_data]     Karar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
stop

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [11]:
df["text"] = df.text.map(remove_stopwords)

In [12]:
df.text

0        communal violence bhainsa telangana stones pel...
1        telangana section 144 imposed bhainsa january ...
2                     arsonist sets cars ablaze dealership
3                     arsonist sets cars ablaze dealership
4        lord jesus love brings freedom pardon fill hol...
                               ...                        
11365    media warned us well advance wrecked whole nig...
11366    feel directly attacked 💀 consider moonbin amp ...
11367    feel directly attacked 💀 consider moonbin amp ...
11368    ok remember outcast nd dora au au wrecked nerv...
11369                 jake corway wrecked running 14th irp
Name: text, Length: 11370, dtype: object

In [13]:
from collections import Counter

# Count unique words
def counter_word(text_col):
    count = Counter()
    for text in text_col.values:
        for word in text.split():
            count[word] += 1
    return count


counter = counter_word(df.text)

In [14]:
len(counter)

25889

In [15]:
counter

Counter({'communal': 4,
         'violence': 30,
         'bhainsa': 14,
         'telangana': 13,
         'stones': 8,
         'pelted': 4,
         'muslims': 20,
         'houses': 32,
         'vehicles': 17,
         'set': 76,
         'ablaze…': 2,
         'section': 9,
         '144': 1,
         'imposed': 2,
         'january': 73,
         '13': 39,
         '15': 50,
         'clash': 8,
         'erupted': 9,
         'two': 177,
         'groups': 12,
         '12': 31,
         'po…': 9,
         'arsonist': 13,
         'sets': 8,
         'cars': 17,
         'ablaze': 22,
         'dealership': 2,
         'lord': 18,
         'jesus': 28,
         'love': 135,
         'brings': 11,
         'freedom': 16,
         'pardon': 1,
         'fill': 3,
         'holy': 14,
         'spirit': 4,
         'heart': 91,
         'l…': 30,
         'child': 33,
         'chinese': 28,
         'tweet': 41,
         'would': 252,
         'gone': 30,
         'viral': 8,
   

In [16]:
counter.most_common(5)

[('amp', 556), ('like', 526), ('people', 505), ('one', 413), ('us', 331)]

In [17]:
num_unique_words = len(counter)

In [18]:
# Split dataset into training and validation set
train_size = int(df.shape[0] * 0.8)

train_df = df[:train_size]
val_df = df[train_size:]

# split text and labels
train_sentences = train_df.text.to_numpy()
train_labels = train_df.target.to_numpy()
val_sentences = val_df.text.to_numpy()
val_labels = val_df.target.to_numpy()

In [19]:
train_sentences.shape, val_sentences.shape

((9096,), (2274,))

In [20]:
# Tokenize
from tensorflow.keras.preprocessing.text import Tokenizer

# vectorize a text corpus by turning each text into a sequence of integers
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(train_sentences) # fit only to training

In [21]:
# each word has unique index
word_index = tokenizer.word_index

In [22]:
word_index

{'amp': 1,
 'like': 2,
 'people': 3,
 'one': 4,
 'fire': 5,
 'us': 6,
 'get': 7,
 'would': 8,
 'new': 9,
 'im': 10,
 'know': 11,
 'need': 12,
 'time': 13,
 'first': 14,
 'emergency': 15,
 'iran': 16,
 't…': 17,
 'two': 18,
 'dont': 19,
 'see': 20,
 'think': 21,
 '2': 22,
 'body': 23,
 'back': 24,
 '1': 25,
 'today': 26,
 'still': 27,
 'years': 28,
 'it’s': 29,
 'many': 30,
 'fires': 31,
 'i’m': 32,
 'news': 33,
 'want': 34,
 'really': 35,
 'trump': 36,
 'could': 37,
 'got': 38,
 'nuclear': 39,
 'world': 40,
 'day': 41,
 'australia': 42,
 'good': 43,
 'please': 44,
 'go': 45,
 'man': 46,
 'attack': 47,
 '…': 48,
 'also': 49,
 'help': 50,
 'say': 51,
 'said': 52,
 'right': 53,
 'police': 54,
 'last': 55,
 'much': 56,
 'death': 57,
 'never': 58,
 'a…': 59,
 'burning': 60,
 'even': 61,
 'going': 62,
 'mass': 63,
 'love': 64,
 'disaster': 65,
 'killed': 66,
 'another': 67,
 'climate': 68,
 'year': 69,
 'come': 70,
 'home': 71,
 'due': 72,
 '3': 73,
 'air': 74,
 'don’t': 75,
 '2020': 76,
 't

In [23]:
train_sequences = tokenizer.texts_to_sequences(train_sentences)
val_sequences = tokenizer.texts_to_sequences(val_sentences)

In [24]:
print(train_sentences[10:15])
print(train_sequences[10:15])

['images showing havoc caused cameroon military torched houses okuthe shameless military reported…'
 'social media went bananas chuba hubbard announced monday evening plans return okstate'
 'hausa youths set area office apapaiganmu local council development area ablaze okada riders stormed lg area office…'
 'mamatabanerjee political violence amp vandalism continues unabated west bengal office asanol was…'
 'amen set whole system ablaze man']
[[1048, 783, 3484, 258, 3482, 100, 5770, 505, 5771, 5772, 100, 4329], [1047, 150, 358, 4330, 9148, 9149, 2226, 694, 1128, 1398, 661, 9150], [9151, 3485, 122, 246, 1207, 9152, 386, 1129, 2227, 246, 756, 9153, 4331, 9154, 9155, 246, 9156], [9157, 758, 689, 1, 9158, 929, 9159, 660, 2909, 1207, 9160, 990], [4332, 122, 338, 455, 756, 46]]


In [25]:
# Pad the sequences to have the same length
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Max number of words in a sequence
max_length = 20

train_padded = pad_sequences(train_sequences, maxlen=max_length, padding="post", truncating="post")
val_padded = pad_sequences(val_sequences, maxlen=max_length, padding="post", truncating="post")
train_padded.shape, val_padded.shape

((9096, 20), (2274, 20))

In [26]:
train_padded[10]

array([1048,  783, 3484,  258, 3482,  100, 5770,  505, 5771, 5772,  100,
       4329,    0,    0,    0,    0,    0,    0,    0,    0])

In [27]:
print(train_sentences[10])
print(train_sequences[10])
print(train_padded[10])

images showing havoc caused cameroon military torched houses okuthe shameless military reported…
[1048, 783, 3484, 258, 3482, 100, 5770, 505, 5771, 5772, 100, 4329]
[1048  783 3484  258 3482  100 5770  505 5771 5772  100 4329    0    0
    0    0    0    0    0    0]


In [28]:
# Check reversing the indices

# flip (key, value)
reverse_word_index = dict([(idx, word) for (word, idx) in word_index.items()])

In [29]:
reverse_word_index

{1: 'amp',
 2: 'like',
 3: 'people',
 4: 'one',
 5: 'fire',
 6: 'us',
 7: 'get',
 8: 'would',
 9: 'new',
 10: 'im',
 11: 'know',
 12: 'need',
 13: 'time',
 14: 'first',
 15: 'emergency',
 16: 'iran',
 17: 't…',
 18: 'two',
 19: 'dont',
 20: 'see',
 21: 'think',
 22: '2',
 23: 'body',
 24: 'back',
 25: '1',
 26: 'today',
 27: 'still',
 28: 'years',
 29: 'it’s',
 30: 'many',
 31: 'fires',
 32: 'i’m',
 33: 'news',
 34: 'want',
 35: 'really',
 36: 'trump',
 37: 'could',
 38: 'got',
 39: 'nuclear',
 40: 'world',
 41: 'day',
 42: 'australia',
 43: 'good',
 44: 'please',
 45: 'go',
 46: 'man',
 47: 'attack',
 48: '…',
 49: 'also',
 50: 'help',
 51: 'say',
 52: 'said',
 53: 'right',
 54: 'police',
 55: 'last',
 56: 'much',
 57: 'death',
 58: 'never',
 59: 'a…',
 60: 'burning',
 61: 'even',
 62: 'going',
 63: 'mass',
 64: 'love',
 65: 'disaster',
 66: 'killed',
 67: 'another',
 68: 'climate',
 69: 'year',
 70: 'come',
 71: 'home',
 72: 'due',
 73: '3',
 74: 'air',
 75: 'don’t',
 76: '2020',
 77

In [30]:
def decode(sequence):
    return " ".join([reverse_word_index.get(idx, "?") for idx in sequence])

In [31]:
decoded_text = decode(train_sequences[10])

print(train_sequences[10])
print(decoded_text)

[1048, 783, 3484, 258, 3482, 100, 5770, 505, 5771, 5772, 100, 4329]
images showing havoc caused cameroon military torched houses okuthe shameless military reported…


In [32]:
# Create LSTM model
from tensorflow.keras import layers

# Embedding: https://www.tensorflow.org/tutorials/text/word_embeddings
# Turns positive integers (indexes) into dense vectors of fixed size. (other approach could be one-hot-encoding)

# Word embeddings give us a way to use an efficient, dense representation in which similar words have 
# a similar encoding. Importantly, you do not have to specify this encoding by hand. An embedding is a 
# dense vector of floating point values (the length of the vector is a parameter you specify).

model = keras.models.Sequential()
model.add(layers.Embedding(num_unique_words, 32, input_length=max_length))

# The layer will take as input an integer matrix of size (batch, input_length),
# and the largest integer (i.e. word index) in the input should be no larger than num_words (vocabulary size).
# Now model.output_shape is (None, input_length, 32), where `None` is the batch dimension.


model.add(layers.LSTM(64, dropout=0.1))
model.add(layers.Dense(1, activation="sigmoid"))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 32)            828448    
_________________________________________________________________
lstm (LSTM)                  (None, 64)                24832     
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 853,345
Trainable params: 853,345
Non-trainable params: 0
_________________________________________________________________


In [34]:
loss = keras.losses.BinaryCrossentropy(from_logits=False)
optim = keras.optimizers.Adam(lr=0.001)
metrics = ["accuracy"]

model.compile(loss=loss, optimizer=optim, metrics=metrics)

In [35]:
model.fit(train_padded, train_labels, epochs=20, validation_data=(val_padded, val_labels), verbose=2)

Epoch 1/20
285/285 - 14s - loss: 0.3552 - accuracy: 0.8670 - val_loss: 0.4492 - val_accuracy: 0.8241
Epoch 2/20
285/285 - 7s - loss: 0.1384 - accuracy: 0.9548 - val_loss: 0.4907 - val_accuracy: 0.8399
Epoch 3/20
285/285 - 7s - loss: 0.0537 - accuracy: 0.9852 - val_loss: 0.6071 - val_accuracy: 0.8294
Epoch 4/20
285/285 - 7s - loss: 0.0307 - accuracy: 0.9919 - val_loss: 0.9175 - val_accuracy: 0.8303
Epoch 5/20
285/285 - 7s - loss: 0.0210 - accuracy: 0.9946 - val_loss: 0.8140 - val_accuracy: 0.8307
Epoch 6/20
285/285 - 6s - loss: 0.0160 - accuracy: 0.9966 - val_loss: 0.6301 - val_accuracy: 0.8272
Epoch 7/20
285/285 - 6s - loss: 0.0129 - accuracy: 0.9977 - val_loss: 0.8521 - val_accuracy: 0.8276
Epoch 8/20
285/285 - 6s - loss: 0.0085 - accuracy: 0.9979 - val_loss: 1.0498 - val_accuracy: 0.8281
Epoch 9/20
285/285 - 6s - loss: 0.0056 - accuracy: 0.9988 - val_loss: 1.2015 - val_accuracy: 0.8307
Epoch 10/20
285/285 - 7s - loss: 0.0060 - accuracy: 0.9981 - val_loss: 1.0590 - val_accuracy: 0.832

<tensorflow.python.keras.callbacks.History at 0x2210aa30a90>

In [36]:
predictions = model.predict(train_padded)
predictions = [1 if p > 0.5 else 0 for p in predictions]

In [37]:
print(train_sentences[10:20])
print(train_labels[10:20])
print(predictions[10:20])

['images showing havoc caused cameroon military torched houses okuthe shameless military reported…'
 'social media went bananas chuba hubbard announced monday evening plans return okstate'
 'hausa youths set area office apapaiganmu local council development area ablaze okada riders stormed lg area office…'
 'mamatabanerjee political violence amp vandalism continues unabated west bengal office asanol was…'
 'amen set whole system ablaze man'
 'images showing havoc caused cameroon military torched houses okuthe shameless military is…'
 'cows today local factory sadly still ablaze redjanuary2020'
 'rengoku sets heart ablaze😔❤️🔥 ps missed style coloring c 鬼滅の刃'
 'paulzizkaphoto “rundle ablaze” wishing good evening'
 'french cameroun set houses ablaze ndu roasted two young boys homes targeted killings genocideinsou…']
[1 0 1 1 0 1 1 0 0 1]
[1, 0, 1, 1, 0, 1, 1, 0, 0, 1]
