# IMPORTING LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,SpatialDropout1D
from tensorflow.keras.optimizers import Adam


# LOADING DATASETS


In [2]:
# Read the training and test CSV files from the mounted Drive folder.
train_csv_path = "/content/drive/MyDrive/dpAssingment/assingment 2/train.csv"
test_csv_path = "/content/drive/MyDrive/dpAssingment/assingment 2/test.csv"
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)


In [3]:
# Element count and (rows, columns) shape of each frame.
for frame in (train, test):
    print(frame.size, frame.shape)

# First ten training rows as plain text.
print(train.head(10))

# Class balance: number of training rows with target 1, then with target 0.
for label in (1, 0):
    print(len(train[train['target'] == label]))


38065 (7613, 5)
13052 (3263, 4)
   id keyword  ...                                               text target
0   1     NaN  ...  Our Deeds are the Reason of this #earthquake M...      1
1   4     NaN  ...             Forest fire near La Ronge Sask. Canada      1
2   5     NaN  ...  All residents asked to 'shelter in place' are ...      1
3   6     NaN  ...  13,000 people receive #wildfires evacuation or...      1
4   7     NaN  ...  Just got sent this photo from Ruby #Alaska as ...      1
5   8     NaN  ...  #RockyFire Update => California Hwy. 20 closed...      1
6  10     NaN  ...  #flood #disaster Heavy rain causes flash flood...      1
7  13     NaN  ...  I'm on top of the hill and I can see a fire in...      1
8  14     NaN  ...  There's an emergency evacuation happening now ...      1
9  15     NaN  ...  I'm afraid that the tornado is coming to our a...      1

[10 rows x 5 columns]
3271
4342


# PREPROCESSING OF DATA

In [5]:
def removeURL(text):
    """Return ``text`` with every ``http://``, ``https://`` or ``www.`` URL deleted.

    A URL is taken to extend up to (not including) the next whitespace
    character. The whitespace around a removed URL is left untouched.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)


In [6]:
def removeTags(text):
    """Return ``text`` with every ``<...>`` span removed.

    The match is non-greedy, so each tag is removed on its own rather than
    everything between the first ``<`` and the last ``>``.
    """
    tag_pattern = r'<.*?>'
    return re.sub(tag_pattern, r'', text)

In [7]:
def onlyWords(text):
    """Reduce ``text`` to ASCII letters separated by single spaces.

    Three substitutions are applied in order:
      1. every character that is not a-z / A-Z becomes a space;
      2. a single letter surrounded by whitespace is dropped;
      3. each run of whitespace collapses to one space.

    Leading/trailing spaces are not stripped, and case is left unchanged.
    """
    substitutions = (
        ('[^a-zA-Z]', ' '),           # keep letters only
        (r"\s+[a-zA-Z]\s+", ' '),     # single character removal
        (r'\s+', ' '),                # removing multiple spaces
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

In [8]:
def removeEmoji(text):
    """Return ``text`` with all characters in the listed Unicode ranges removed.

    Note that the last range (U+24C2 to U+1F251) is very wide: besides emoji it
    also covers many other code points, so those are removed as well.
    """
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
    )
    matcher = re.compile("[" + emoji_ranges + "]+", flags=re.UNICODE)
    return matcher.sub(r'', text)

In [9]:
# Lookup table: lower-case abbreviation / slang token -> expanded phrase.
# Keys MUST be lower-case because fullAbb() lower-cases each token before the
# lookup. Built once here instead of on every call to fullAbb().
_ABBREVIATIONS = {
    "$" : " dollar ",
    "€" : " euro ",
    "4ao" : "for adults only",
    "a.m" : "before midday",
    "a3" : "anytime anywhere anyplace",
    "aamof" : "as a matter of fact",
    "acct" : "account",
    "adih" : "another day in hell",
    "afaic" : "as far as i am concerned",
    "afaict" : "as far as i can tell",
    "afaik" : "as far as i know",
    "afair" : "as far as i remember",
    "afk" : "away from keyboard",
    "app" : "application",
    "approx" : "approximately",
    "apps" : "applications",
    "asap" : "as soon as possible",
    "asl" : "age, sex, location",
    "atk" : "at the keyboard",
    "ave." : "avenue",
    "aymm" : "are you my mother",
    "ayor" : "at your own risk",
    "b&b" : "bed and breakfast",
    "b+b" : "bed and breakfast",
    "b.c" : "before christ",
    "b2b" : "business to business",
    "b2c" : "business to customer",
    "b4" : "before",
    "b4n" : "bye for now",
    "b@u" : "back at you",
    "bae" : "before anyone else",
    "bak" : "back at keyboard",
    "bbbg" : "bye bye be good",
    "bbc" : "british broadcasting corporation",
    "bbias" : "be back in a second",
    "bbl" : "be back later",
    "bbs" : "be back soon",
    "be4" : "before",
    "bfn" : "bye for now",
    "blvd" : "boulevard",
    "bout" : "about",
    "brb" : "be right back",
    "bros" : "brothers",
    "brt" : "be right there",
    "bsaaw" : "big smile and a wink",
    "btw" : "by the way",
    "bwl" : "bursting with laughter",
    "c/o" : "care of",
    "cet" : "central european time",
    "cf" : "compare",
    "cia" : "central intelligence agency",
    "csl" : "can not stop laughing",
    "cu" : "see you",
    "cul8r" : "see you later",
    "cv" : "curriculum vitae",
    "cwot" : "complete waste of time",
    "cya" : "see you",
    "cyt" : "see you tomorrow",
    "dae" : "does anyone else",
    "dbmib" : "do not bother me i am busy",
    "diy" : "do it yourself",
    "dm" : "direct message",
    "dwh" : "during work hours",
    "e123" : "easy as one two three",
    "eet" : "eastern european time",
    "eg" : "example",
    "embm" : "early morning business meeting",
    "encl" : "enclosed",
    "encl." : "enclosed",
    "etc" : "and so on",
    "faq" : "frequently asked questions",
    "fawc" : "for anyone who cares",
    "fb" : "facebook",
    "fc" : "fingers crossed",
    "fig" : "figure",
    "fimh" : "forever in my heart",
    "ft." : "feet",
    "ft" : "featuring",
    "ftl" : "for the loss",
    "ftw" : "for the win",
    "fwiw" : "for what it is worth",
    "fyi" : "for your information",
    "g9" : "genius",
    "gahoy" : "get a hold of yourself",
    "gal" : "get a life",
    "gcse" : "general certificate of secondary education",
    "gfn" : "gone for now",
    "gg" : "good game",
    "gl" : "good luck",
    "glhf" : "good luck have fun",
    "gmt" : "greenwich mean time",
    "gmta" : "great minds think alike",
    "gn" : "good night",
    "g.o.a.t" : "greatest of all time",
    "goat" : "greatest of all time",
    "goi" : "get over it",
    "gps" : "global positioning system",
    "gr8" : "great",
    "gratz" : "congratulations",
    "gyal" : "girl",
    "h&c" : "hot and cold",
    "hp" : "horsepower",
    "hr" : "hour",
    "hrh" : "his royal highness",
    "ht" : "height",
    "ibrb" : "i will be right back",
    "ic" : "i see",
    "icq" : "i seek you",
    "icymi" : "in case you missed it",
    "idc" : "i do not care",
    "idgadf" : "i do not give a damn fuck",
    "idgaf" : "i do not give a fuck",
    "idk" : "i do not know",
    "ie" : "that is",
    "i.e" : "that is",
    "ifyp" : "i feel your pain",
    "ig" : "instagram",  # was "IG": unreachable, since tokens are lower-cased before lookup
    "iirc" : "if i remember correctly",
    "ilu" : "i love you",
    "ily" : "i love you",
    "imho" : "in my humble opinion",
    "imo" : "in my opinion",
    "imu" : "i miss you",
    "iow" : "in other words",
    "irl" : "in real life",
    "j4f" : "just for fun",
    "jic" : "just in case",
    "jk" : "just kidding",
    "jsyk" : "just so you know",
    "l8r" : "later",
    "lb" : "pound",
    "lbs" : "pounds",
    "ldr" : "long distance relationship",
    "lmao" : "laugh my ass off",
    "lmfao" : "laugh my fucking ass off",
    "lol" : "laughing out loud",
    "ltd" : "limited",
    "ltns" : "long time no see",
    "m8" : "mate",
    "mf" : "motherfucker",
    "mfs" : "motherfuckers",
    "mfw" : "my face when",
    "mofo" : "motherfucker",
    "mph" : "miles per hour",
    "mr" : "mister",
    "mrw" : "my reaction when",
    "ms" : "miss",
    "mte" : "my thoughts exactly",
    "nagi" : "not a good idea",
    "nbc" : "national broadcasting company",
    "nbd" : "not big deal",
    "nfs" : "not for sale",
    "ngl" : "not going to lie",
    "nhs" : "national health service",
    "nrn" : "no reply necessary",
    "nsfl" : "not safe for life",
    "nsfw" : "not safe for work",
    "nth" : "nice to have",
    "nvr" : "never",
    "nyc" : "new york city",
    "oc" : "original content",
    "og" : "original",
    "ohp" : "overhead projector",
    "oic" : "oh i see",
    "omdb" : "over my dead body",
    "omg" : "oh my god",
    "omw" : "on my way",
    "p.a" : "per annum",
    "p.m" : "after midday",
    "pm" : "prime minister",
    "poc" : "people of color",
    "pov" : "point of view",
    "pp" : "pages",
    "ppl" : "people",
    "prw" : "parents are watching",
    "ps" : "postscript",
    "pt" : "point",
    "ptb" : "please text back",
    "pto" : "please turn over",
    "qpsa" : "what happens", #"que pasa",
    "ratchet" : "rude",
    "rbtl" : "read between the lines",
    "rlrt" : "real life retweet",
    "rofl" : "rolling on the floor laughing",
    "roflol" : "rolling on the floor laughing out loud",
    "rotflmao" : "rolling on the floor laughing my ass off",
    "rt" : "retweet",
    "ruok" : "are you ok",
    "sfw" : "safe for work",
    "sk8" : "skate",
    "smh" : "shake my head",
    "sq" : "square",
    "srsly" : "seriously",
    "ssdd" : "same stuff different day",
    "tbh" : "to be honest",
    "tbs" : "tablespoonful",   # spelling fixed (was "tablespooful")
    "tbsp" : "tablespoonful",  # spelling fixed (was "tablespooful")
    "tfw" : "that feeling when",
    "thks" : "thank you",
    "tho" : "though",
    "thx" : "thank you",
    "tia" : "thanks in advance",
    "til" : "today i learned",
    "tl;dr" : "too long i did not read",
    "tldr" : "too long i did not read",
    "tmb" : "tweet me back",
    "tntl" : "trying not to laugh",
    "ttyl" : "talk to you later",
    "u" : "you",
    "u2" : "you too",
    "u4e" : "yours for ever",
    "utc" : "coordinated universal time",
    "w/" : "with",
    "w/o" : "without",
    "w8" : "wait",
    "wassup" : "what is up",
    "wb" : "welcome back",
    "wtf" : "what the fuck",
    "wtg" : "way to go",
    "wtpa" : "where the party at",
    "wuf" : "where are you from",
    "wuzup" : "what is up",
    "wywh" : "wish you were here",
    "yd" : "yard",
    "wyd":"what are you doing",
    "doin":"doing",
    "ygtr" : "you got that right",
    "ynk" : "you never know",
    "zzz" : "sleeping bored and tired"}


def fullAbb(text):
    """Expand known abbreviations / slang tokens in ``text``.

    The text is split on whitespace; each token is looked up case-insensitively
    in the abbreviation table and, when found, replaced by its expansion.
    Tokens that are not in the table are kept exactly as written (original
    case included). A token only matches when the whole token equals a key, so
    an abbreviation with punctuation attached (e.g. ``"lol!"``) is left alone
    unless the key itself contains that punctuation.

    Args:
        text: input string.

    Returns:
        The tokens re-joined with single spaces (so runs of whitespace and
        leading/trailing whitespace in the input are normalised away).
    """
    expanded = [_ABBREVIATIONS.get(token.lower(), token) for token in text.split()]
    return ' '.join(expanded)


In [10]:
def preProcessing(x):
    """Clean every text in ``x`` and return the cleaned texts.

    Each text goes through, in order: URL removal, ``<...>`` tag removal,
    emoji removal, abbreviation expansion, reduction to letters/single spaces,
    and finally lower-casing.

    The input is NOT modified. The previous version overwrote ``x`` element by
    element, which also altered whatever the array was a view of and made the
    calling cell non-idempotent when re-run.

    Args:
        x: sequence of strings (NumPy array or other iterable of str).

    Returns:
        A new 1-D object-dtype NumPy array when ``x`` is a NumPy array,
        otherwise a new list, holding the cleaned strings in input order.
    """
    def _clean(text):
        # Order matters: abbreviations are expanded before punctuation and
        # digits are stripped, because some keys contain them.
        text = removeURL(text)
        text = removeTags(text)
        text = removeEmoji(text)
        text = fullAbb(text)
        text = onlyWords(text)
        return text.lower()

    cleaned = [_clean(text) for text in x]
    if isinstance(x, np.ndarray):
        return np.array(cleaned, dtype=object)
    return cleaned

In [11]:
# Positional selection: per the head() output above, column 3 of train is
# `text` and column 4 is `target`. test is assumed to have its text in
# column 3 as well -- TODO confirm.
x = train.iloc[:, 3].values
y = train.iloc[:, 4]
x_test = test.iloc[:, 3].values

# Run both corpora through the cleaning pipeline.
x = preProcessing(x)
x_test = preProcessing(x_test)

In [12]:
# Preview the first five cleaned training texts.
x[:5]

array(['our deeds are the reason of this earthquake may allah forgive us all',
       'forest fire near la ronge sask canada',
       'all residents asked to shelter in place are being notified by officers no other evacuation or shelter in place orders are expected',
       ' people receive wildfires evacuation orders in california',
       'just got sent this photo from ruby alaska as smoke from wildfires pours into school'],
      dtype=object)

In [13]:
# Preview the first five cleaned test texts.
x_test[:5]

array(['just happened terrible car crash',
       'heard about earthquake is different cities stay safe everyone ',
       'there is forest fire at spot pond geese are fleeing across the street cannot save them all',
       'apocalypse lighting spokane wildfires',
       'typhoon soudelor kills in china and taiwan'], dtype=object)

In [14]:
# Display the training labels (column 4 of train).
y

0       1
1       1
2       1
3       1
4       1
       ..
7608    1
7609    1
7610    1
7611    1
7612    1
Name: target, Length: 7613, dtype: int64

In [15]:
# Corpus statistics over the cleaned training texts:
#   u        - distinct whitespace-separated tokens, in first-seen order
#   words    - number of distinct tokens
#   maxwords - length IN CHARACTERS of the longest text (despite the name)
#   maxlen   - largest number of tokens in a single text
# dict.fromkeys de-duplicates in one pass while keeping insertion order; the
# previous `if j not in u` test against a list was quadratic in vocabulary size.
token_lists = [text.split() for text in x]
u = list(dict.fromkeys(token for tokens in token_lists for token in tokens))
words = len(u)
maxwords = max((len(text) for text in x), default=0)
maxlen = max((len(tokens) for tokens in token_lists), default=0)
print(words)
print(maxwords)
print(maxlen)

16203
156
31


# Tokenization

In [16]:
# Fit the word index on the training texts only, then convert both corpora
# from strings to lists of integer word ids using that same index.
# NOTE(review): no oov_token is configured -- presumably test words unseen in
# training are dropped from the sequences; confirm against the Keras docs.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x)
x, x_test = [tokenizer.texts_to_sequences(texts) for texts in (x, x_test)]

In [17]:
# Preview the first five training texts as integer id sequences.
x[:5]

[[108, 4396, 19, 1, 835, 4, 16, 245, 129, 1556, 4397, 82, 35],
 [183, 40, 215, 716, 6501, 6502, 1168],
 [35,
  1686,
  1441,
  3,
  1864,
  2,
  664,
  19,
  127,
  6503,
  15,
  1687,
  36,
  373,
  246,
  57,
  1864,
  2,
  664,
  1346,
  19,
  1064],
 [54, 4398, 1442, 246, 1346, 2, 85],
 [29, 95, 1169, 16, 315, 17, 6504, 1688, 25, 263, 17, 1442, 6505, 66, 180]]

In [18]:
# Preview the first three test texts as integer id sequences.
x_test[:3]

[[29, 881, 1884, 118, 88],
 [468, 51, 245, 7, 1153, 2546, 590, 1984, 210],
 [64, 7, 183, 40, 14, 793, 3410, 19, 4888, 837, 1, 717, 1355, 341, 93, 35]]

In [19]:
# Fixed sequence length used for padding. This overrides the `maxlen` measured
# in the corpus-statistics cell above (31 tokens in the recorded output).
maxlen = 65

# One more than the number of indexed words -- presumably so that id 0 stays
# free for padding; confirm against the Keras Tokenizer docs.
vocab_size = 1 + len(tokenizer.word_index)
print(vocab_size)

16204


In [20]:
# Bring every id sequence in both corpora to length `maxlen`, padding at the end.
x, x_test = (
    pad_sequences(sequences, padding='post', maxlen=maxlen)
    for sequences in (x, x_test)
)

# EMBEDDING

In [21]:
# word -> vector mapping, filled by the next cell from the GloVe text file.
embeddings_dictionary = {}
# The handle is left open here on purpose: the next cell reads and closes it.
glove_path = '/content/drive/MyDrive/dpAssingment/assingment 2/glove.6B.300d.txt'
glove_file1 = open(glove_path, encoding="utf8")


In [22]:
# Parse the GloVe file: each line is a word followed by its vector components,
# all separated by whitespace. try/finally guarantees the handle is closed
# even if a line fails to parse (previously it leaked in that case).
try:
    for line in glove_file1:
        records = line.split()
        if not records:
            # Skip blank lines instead of failing on records[0].
            continue
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions
finally:
    glove_file1.close()

In [23]:
# Row i holds the 300-d GloVe vector of the word whose tokenizer id is i.
# Words without a GloVe entry keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, 300))
for token, row in tokenizer.word_index.items():
    glove_vector = embeddings_dictionary.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[row] = glove_vector

In [24]:
# Embedding layer initialised from embedding_matrix and frozen
# (trainable=False); it expects inputs of length `maxlen`.
# NOTE(review): this one layer object is added to model1, model2 and model3
# below, so all three models share it.
embedding_layer = Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=maxlen , trainable=False)

# MODEL 1

In [25]:
# Model 1: frozen GloVe embedding -> LSTM(128) -> single sigmoid output.
model1 = Sequential([
    embedding_layer,
    LSTM(128),
    Dense(1, activation='sigmoid'),
])
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Trains on the full training set; no validation data is passed.
model1.fit(x, y, batch_size=128, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7fbda56ae588>

In [27]:
# Save the fitted model1 to Drive as model1.h5.
model1.save('/content/drive/MyDrive/dpAssingment/assingment 2/model1.h5')

In [28]:
# Print model1's layers, output shapes and parameter counts.
model1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 65, 300)           4861200   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               219648    
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 5,080,977
Trainable params: 219,777
Non-trainable params: 4,861,200
_________________________________________________________________


In [29]:
# Load the sample submission; its `target` column is overwritten with
# predictions in the cells below and the frame is written back out per model.
sub=pd.read_csv("/content/drive/MyDrive/dpAssingment/assingment 2/sample_submission.csv")

In [30]:
# Flatten model1's predictions to 1-D and threshold at 0.5 -> boolean labels.
# ypred1 stays a 1-D boolean array because the ensemble cell indexes it.
ypred1 = model1.predict(x_test).reshape(-1) > 0.5
# Assign the whole column at once. The previous element-wise
# `sub['target'][i] = ...` was chained assignment, which triggers
# SettingWithCopyWarning and does not write through under copy-on-write.
sub['target'] = ypred1.astype(int)
sub.to_csv('/content/drive/MyDrive/dpAssingment/assingment 2/submission1.csv',index=False)

In [31]:
# Preview the first ten rows of the submission built from model1.
sub.head(10)

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
5,12,0
6,21,1
7,22,0
8,27,0
9,29,0


# MODEL 2

In [32]:
# Model 2: frozen GloVe embedding -> spatial dropout -> LSTM(128) with
# input/recurrent dropout -> Dense(10, relu) -> single sigmoid output.
model2 = Sequential([
    embedding_layer,
    SpatialDropout1D(0.2),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(units=10, activation="relu"),
    Dense(1, activation='sigmoid'),
])
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Trains on the full training set; no validation data is passed.
model2.fit(x, y, batch_size=128, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7fbda20c7fd0>

In [33]:
# Save the fitted model2 to Drive as model2.h5.
model2.save("/content/drive/MyDrive/dpAssingment/assingment 2/model2.h5")

In [34]:
# Print model2's layers, output shapes and parameter counts.
model2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 65, 300)           4861200   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 65, 300)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               219648    
_________________________________________________________________
dense_1 (Dense)              (None, 10)                1290      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
Total params: 5,082,149
Trainable params: 220,949
Non-trainable params: 4,861,200
_________________________________________________________________


In [35]:
# Flatten model2's predictions to 1-D and threshold at 0.5 -> boolean labels.
# ypred2 stays a 1-D boolean array because the ensemble cell indexes it.
ypred2 = model2.predict(x_test).reshape(-1) > 0.5
# Assign the whole column at once. The previous element-wise
# `sub['target'][i] = ...` was chained assignment, which triggers
# SettingWithCopyWarning and does not write through under copy-on-write.
sub['target'] = ypred2.astype(int)
sub.to_csv('/content/drive/MyDrive/dpAssingment/assingment 2/submission2.csv',index=False)
sub.head(10)

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
5,12,0
6,21,0
7,22,0
8,27,0
9,29,0



# MODEL 3

In [36]:
# Model 3: like model 2 but with two stacked LSTM(128) layers; the first
# returns the full sequence so the second can consume it.
model3 = Sequential([
    embedding_layer,
    SpatialDropout1D(0.2),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(units=10, activation="relu"),
    Dense(1, activation='sigmoid'),
])
model3.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Trains on the full training set; no validation data is passed.
model3.fit(x, y, batch_size=128, epochs=15)


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7fbda098eac8>

In [37]:
# Save the fitted model3 to Drive as model3.h5.
model3.save("/content/drive/MyDrive/dpAssingment/assingment 2/model3.h5")

In [38]:
# Print model3's layers, output shapes and parameter counts.
model3.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 65, 300)           4861200   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 65, 300)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 65, 128)           219648    
_________________________________________________________________
lstm_3 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_3 (Dense)              (None, 10)                1290      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 11        
Total params: 5,213,733
Trainable params: 352,533
Non-trainable params: 4,861,200
______________________________________

In [39]:
# Flatten model3's predictions to 1-D and threshold at 0.5 -> boolean labels.
# ypred3 stays a 1-D boolean array because the ensemble cell indexes it.
ypred3 = model3.predict(x_test).reshape(-1) > 0.5
# Assign the whole column at once. The previous element-wise
# `sub['target'][i] = ...` was chained assignment, which triggers
# SettingWithCopyWarning and does not write through under copy-on-write.
sub['target'] = ypred3.astype(int)
sub.to_csv('/content/drive/MyDrive/dpAssingment/assingment 2/submission3.csv',index=False)
sub.head(10)

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1
5,12,0
6,21,0
7,22,0
8,27,0
9,29,0


# ENSEMBLE APPROACH

In [41]:
from statistics import mode

# Majority vote: for each test row take the most common of the three models'
# boolean predictions. The column is built in one pass and assigned once; the
# previous element-wise `sub['target'][i] = ...` was chained assignment, which
# triggers SettingWithCopyWarning and does not write through under
# copy-on-write.
sub['target'] = [int(mode(votes)) for votes in zip(ypred1, ypred2, ypred3)]
sub.to_csv('/content/drive/MyDrive/dpAssingment/assingment 2/submissionEnsemble.csv',index=False)
sub.head(10)

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
5,12,0
6,21,0
7,22,0
8,27,0
9,29,0
