In [None]:
##TWEET CLASSIFICATION - Identify tweets that indicate a disaster event. 

In [21]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import spacy
import re
import en_core_web_sm
import keras
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.sequence import TimeseriesGenerator
from keras.models import Sequential
from keras.layers import Embedding,Flatten,Dense,LSTM,Dropout,SpatialDropout1D
from sklearn.metrics import classification_report,confusion_matrix
%matplotlib inline

In [2]:
df=pd.read_csv('train.csv')

In [3]:
df.head(15)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [4]:
def clean_data(tweet):
    tweet = re.sub("RT @[\w]*:", "", tweet)
    tweet = re.sub("@[\w]*", "", tweet)
    tweet = re.sub("https://[A-Za-z0-9./]", "", tweet)
    tweet = re.sub("\n", "", tweet)
    tweet = re.sub("&amp", "", tweet)
    tweet = re.sub("#", "", tweet)
    tweet = re.sub(r"[^\w]", ' ', tweet )
    return tweet

In [5]:
##clean sentences contains tweets with hashtags and .com's removed
clean_sentences=[]
for i in df['text']:
    clean_sentences.append(clean_data(i))

In [262]:
clean_sentences[:5]

['Our Deeds are the Reason of this earthquake May ALLAH Forgive us all',
 'Forest fire near La Ronge Sask  Canada',
 'All residents asked to  shelter in place  are being notified by officers  No other evacuation or shelter in place orders are expected',
 '13 000 people receive wildfires evacuation orders in California ',
 'Just got sent this photo from Ruby Alaska as smoke from wildfires pours into a school ']

In [6]:
##function to tokenize words and append to all_words list
all_words = []
for sent in clean_sentences:
    tokenize_word = word_tokenize(sent)
    for word in tokenize_word:
        all_words.append(word)

In [7]:
len(set(all_words))

23754

In [9]:
##converts text to one hot form
sentence_vector=[one_hot(i,len(set(all_words)))for i in clean_sentences]

In [13]:
sentence_vector[0]

[19790,
 3192,
 18546,
 23120,
 17317,
 9208,
 11910,
 20655,
 12226,
 16209,
 15813,
 14432,
 7143]

In [10]:
#function to find the length of the sentence with maximum words.
li=[]
for i in sentence_vector:
    li.append(len(i))
print(max(li))   

34


In [33]:
##converts sentence vector to pad_sequences so that all sentences can be of uniform length
pad_sentence_vector=pad_sequences(sentence_vector,34,padding='post')   

In [12]:
pad_sentence_vector.shape

(7613, 50)

In [34]:
##Neural network model with embedding layer(converts each word into 64 dimensional vector), 
##followed by a LSTM layer and dense layers.

model = Sequential()
model.add(Embedding(len(set(all_words)),64, input_length=34))
model.add(LSTM(128))
model.add(Dense(32,activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

In [35]:
model.compile(optimizer='rmsprop',loss="binary_crossentropy",metrics=["accuracy"])

In [36]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 34, 64)            1520256   
_________________________________________________________________
lstm_6 (LSTM)                (None, 128)               98816     
_________________________________________________________________
dense_5 (Dense)              (None, 32)                4128      
_________________________________________________________________
dropout_3 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 33        
Total params: 1,623,233
Trainable params: 1,623,233
Non-trainable params: 0
_________________________________________________________________


In [37]:
typ=np.array(df['target'])


In [38]:
model.fit(pad_sentence_vector,typ,epochs=10)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x2584ef8dc18>

In [24]:
##function to get the output of intermediate layers in neural network
outputs = []
for layer in model.layers:
    keras_function = keras.backend.function([model.input], [layer.output])
    outputs.append(keras_function([pad_sentence_vector, 1]))

In [32]:
##'all_word_vector' contains the vector format of sentences
all_words_vector=[]
for i in range(7613):
    all_words_vector.append(outputs[0][0][i])

In [124]:
##64 neuron embedding of a word
all_words_vector[7612][33]

array([-0.04336546, -0.01990451, -0.04326888, -0.03171958,  0.00591424,
       -0.02984506,  0.0374441 ,  0.02929184, -0.03230392,  0.01904315,
       -0.02626904, -0.00419082, -0.01574385,  0.0102398 , -0.02165961,
       -0.00351894, -0.00101176,  0.02399922,  0.02646653, -0.0155422 ,
       -0.01411964,  0.00994042,  0.041251  ,  0.03864434,  0.02422207,
       -0.01930986,  0.03358909, -0.01479995,  0.02264045,  0.0414146 ,
        0.00995773, -0.02630647,  0.01438494, -0.04745055,  0.01046242,
        0.02966463, -0.00706748, -0.013628  , -0.04172553,  0.01919276,
        0.00211193, -0.0234073 , -0.03346163, -0.00834046, -0.02264348,
        0.02043033,  0.03566115, -0.03324674,  0.02708175,  0.02326102,
       -0.01369928,  0.01785562,  0.02121887,  0.02717801,  0.03503483,
        0.04712018,  0.00561329,  0.00258492,  0.01998884, -0.02362551,
       -0.00655126,  0.0238639 , -0.02272739, -0.04257928], dtype=float32)

In [149]:
##creates a dictionary with encoded version of each word as the key and 64 dimensional embeddings as values(34:[0.231,0.115,...])
d={}
for i in range(7613):
    for j in range(34):
        if pad_sentence_vector[i][j]!=0:
            d[pad_sentence_vector[i][j]]=all_words_vector[i][j]

In [243]:
##creates a dictionary with words as the key and encoded version of each word as values('our':2012)
d_sentence={}
for i in range(7613):
    a=clean_sentences[i].split(" ")
    while '' in a:
        a.remove('')
    while '_' in a:
        a.remove('_')
    while '__' in a:
        a.remove('__')
    for j in range(len(a)):
        d_sentence[a[j]]=sentence_vector[i][j]
        

In [249]:
d

{19790: array([-0.0575928 , -0.06316222, -0.02138799,  0.04691613, -0.01098062,
        -0.0648561 ,  0.04043918,  0.07169309, -0.07243172, -0.06090378,
        -0.09398786, -0.00158454,  0.13023089, -0.0246022 ,  0.06783589,
         0.06262786,  0.02204265,  0.02283886,  0.00740943, -0.04528404,
        -0.14583132, -0.05096967,  0.01107216,  0.04832347,  0.03166929,
        -0.06701685,  0.10250603, -0.02593023,  0.00092214,  0.05402223,
         0.04608398, -0.11056229,  0.03433543, -0.04033037,  0.04250871,
         0.02093419,  0.04025043, -0.05462016,  0.02087396,  0.04672729,
        -0.01910744, -0.06379203, -0.01902942, -0.02549666, -0.00867012,
         0.00280629,  0.06881952, -0.0299418 ,  0.09101164,  0.05947851,
         0.08207233,  0.08935583,  0.02223668,  0.07174747, -0.01355325,
         0.0455383 ,  0.00978222,  0.03261782,  0.16899326, -0.01281193,
        -0.02053435,  0.06870633,  0.00618593, -0.02083559], dtype=float32),
 3192: array([ 0.02543842, -0.00808294, 

In [250]:
##creates a dictionary with words as key and embeddings as values('our':[0.912,0.541,...])
embeddings={}
for i in range(len(d_sentence)):
    embeddings[list(d_sentence)[i]]=d[list(d_sentence.values())[i]]
    
    

In [267]:
a=embeddings['earthquake']## '#earthquake'
b=embeddings['flood']## '#flood'

In [268]:
##function to compute cosine similarity
dot = np.dot(a,b)
    # Compute the L2 norm of u (≈1 line)
norm_u = np.sqrt(np.sum(a**2))
    
    # Compute the L2 norm of v (≈1 line)
norm_v = np.sqrt(np.sum(b**2))
    # Compute the cosine similarity defined by formula (1) (≈1 line)
cosine_similarity = dot/(norm_u*norm_v)

In [269]:
##higher the cosine value similar the words are.
cosine_similarity

0.95509

In [39]:
##computes predictions using the neural network model
predictions=model.predict(pad_sentence_vector[5000:7613])
pred=predictions.squeeze().tolist()

In [30]:
##To fix a range to convert predictions in terms of zeros and ones
a=pd.DataFrame(pred)
a[0].mean()

0.4283656686761121

In [40]:
pred_final=[]
for i in pred:
    if i>0.4:
        pred_final.append(1)
    else:
        pred_final.append(0)
        

In [41]:
print(classification_report(df['target'][5000:7613],pred_final))
print(confusion_matrix(df['target'][5000:7613],pred_final))

              precision    recall  f1-score   support

           0       0.94      0.97      0.95      1436
           1       0.96      0.93      0.94      1177

    accuracy                           0.95      2613
   macro avg       0.95      0.95      0.95      2613
weighted avg       0.95      0.95      0.95      2613

[[1387   49]
 [  87 1090]]


In [44]:
test=pd.read_csv('test.csv')

In [45]:
test.head(10)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
5,12,,,We're shaking...It's an earthquake
6,21,,,They'd probably still show more life than Arse...
7,22,,,Hey! How are you?
8,27,,,What a nice hat?
9,29,,,Fuck off!


In [210]:
len(test)

3263

In [46]:
clean_sentences_test=[]
for i in test['text']:
    clean_sentences_test.append(clean_data(i))

In [47]:
all_words_test = []
for sent in clean_sentences_test:
    tokenize_word = word_tokenize(sent)
    for word in tokenize_word:
        all_words_test.append(word)

In [48]:
sentence_vector2=[one_hot(i,len(set(all_words)))for i in clean_sentences_test]

In [49]:
pad_sentence_vector2=pad_sequences(sentence_vector2,34,padding='post')   

In [93]:
predictions_test=model.predict(pad_sentence_vector2[0:3263])

In [113]:
pred_test=[]
for i in predictions_test:
    if i>=0.1:
        pred_test.append(1)
    else:
        pred_test.append(0)


In [114]:
test_results=pd.DataFrame(test['id'],columns=['id'])
test_results['text']=test['text']
test_results['target']=pred_test

In [115]:
test_results.head()

Unnamed: 0,id,text,target
0,0,Just happened a terrible car crash,1
1,2,"Heard about #earthquake is different cities, s...",1
2,3,"there is a forest fire at spot pond, geese are...",1
3,9,Apocalypse lighting. #Spokane #wildfires,1
4,11,Typhoon Soudelor kills 28 in China and Taiwan,1


In [116]:
test_results.to_csv('submit.csv')