### Importing Basic Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Importing Data

In [2]:
raw_data = pd.read_csv('twitter_disaster_prediction_dataset.csv')

### Exploring the Data

In [3]:
raw_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
raw_data.shape

(7613, 5)

In [5]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [6]:
data=raw_data.drop(columns=["id","keyword","location"])

In [7]:
data.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [8]:
# checking imbalance in data
data["target"].value_counts()

0    4342
1    3271
Name: target, dtype: int64

### Text Preprocessing

In [9]:
# Seeing the text
for i in range(5):
  print(np.random.choice(data["text"].values))
  print("*" * 100)

Officials rescue 367 migrants off Libya; 25 bodies found - Fox News http://t.co/cEdCUgEuWs #News
****************************************************************************************************
Schools in Western Uganda still Burning down Buildings during Strikes....Strikes in Western Uganda always Lit literally..
****************************************************************************************************
York Co. first responders compete to save lives in Û÷Badges for BloodÛª #paramedic #EMS http://t.co/E65V80FCus
****************************************************************************************************
****************************************************************************************************
@CrowtherJohn @Effiedeans  you just keep ur head in the sand john. The best place for it.  Lbr after 97 landslide. Couldnt imagine situ now
****************************************************************************************************


In [10]:
for i in range(5):
  print(np.random.choice(data["text"].values))
  print("*" * 100)

incident with injury:I-495  inner loop Exit 31 - MD 97/Georgia Ave Silver Spring
****************************************************************************************************
Worked in factory pressing designs onto T-shirts was electrocuted 
d/t faulty ground. Boss docked my pay while I was at ER #WorstSummerJob
****************************************************************************************************
@SirTitan45  Mega mood swing on a 24 hr schedule. Isn't that how structural failure occurs?
****************************************************************************************************
@phnotf sometimes your cheekiness bleeds through my computer screen and i recoil in fear
****************************************************************************************************
Your brain is particularly vulnerable to trauma at two distinct ages http://t.co/KnBv2YtNWc @qz @TaraSwart @vivian_giang
**************************************************************************

#### Removing HTML Tags

In [11]:
from bs4 import BeautifulSoup

In [12]:
def remove_html_tags(text_inp):
  soupp=BeautifulSoup(text_inp,"html.parser")
  plain_text=soupp.get_text()
  return plain_text

In [13]:
no_html_text=data["text"].apply(remove_html_tags)

  soupp=BeautifulSoup(text_inp,"html.parser")


In [14]:
for i in range(5):
  print(np.random.choice(no_html_text.values))
  print("*" * 100)

@JasonPope2 @JohnFugelsang again I didn't say it was. I was referring to the main 2 buildings. 7 was hit by rubble
****************************************************************************************************
RT @tonyhsieh: 'The person who dances with you in the rain will most likely walk with you in the storm.' -Anonymous
****************************************************************************************************
@mallelis have you gotten to the post-battle we're-on-a-desolate-planet below-the-Mason-Dixon-Line style electro violin playing yet?
****************************************************************************************************
HURRICANE GUILLERMO LIVE NOAA TRACKING / LOOPING WED.AUG.5TH ~ http://t.co/AuruGJEGIQ ~  http://t.co/L3w8miPvnT http://t.co/O85M1bJFRW
****************************************************************************************************
Evacuation order lifted for town of Roosevelt Wash. though residents warned to be ready to leave 

#### Removing URL

In [15]:
no_html_url=no_html_text.str.replace("https?://t.co/\w","",regex=True)

In [16]:
for i in range(5):
  print(np.random.choice(no_html_url.values))
  print("*" * 100)

Shadowflame and the Wraith: Bombed DBaO0rSuz via @amazon
****************************************************************************************************
@estellasrevenge the first time i went swiming in it i was basically screaming WHY DOES IT SMELL/TASTE SO BAD
****************************************************************************************************
Look at the previous battles. Citizens were committing suicide so to not be under American control. The bomb was the only way. @NBCNews
****************************************************************************************************
I liked a @YouTube video K7nPdpWRo J. Cole - Fire Squad (2014 Forest Hills Drive)
****************************************************************************************************
A GPM satellite 'bullseye' in Typhoon Soudelor iVeUPiRKY
****************************************************************************************************


#### Removing non text characters

In [17]:
no_html_url_nontext=no_html_url.str.replace("[^\w\s]","",regex=True)

In [18]:
for i in range(5):
  print(np.random.choice(no_html_url_nontext.values))
  print("*" * 100)

Detonate feat MOP by Apollo Brown llaBzGCRc
****************************************************************************************************
If you are going to achieve excellence in big things you develop the habit in little matters dont know the author
****************************************************************************************************
GodOf_Mischief_ of Lokis daggers she pulled it out and jammed it into Minas thigh When Mina screamed and grabbed at her leg sif
****************************************************************************************************
This past week has been an absolute whirlwind Athens bound
****************************************************************************************************
The hatchetwielding gunman had PEPPER SPRAY AND A FAKE BOMB
****************************************************************************************************


#### Removing Punctuations

In [19]:
#This may be redundant
import string
no_html_url_nontext_punc=no_html_url_nontext.str.translate(str.maketrans("","",string.punctuation))

In [20]:
for i in range(5):
  print(np.random.choice(no_html_url_nontext_punc.values))
  print("*" * 100)

CloydRivers there were plenty of black people rioting when tOSU won the championship as well
****************************************************************************************************
I have an unexplainable desire to watch The Rescuers childhooddefined
****************************************************************************************************
Sinkhole Selfies You Wont Believe Whats In The Brooklyn Sinkhole 
        Sinkhole Selfies You Wont Belie 3b5n3rcr5
****************************************************************************************************
annihilating quarterstaff of annihilation
****************************************************************************************************
Carterville High School coaches prepare for gameday injuries KiMMBUe04
****************************************************************************************************


#### Removing numbers

In [21]:
no_html_url_nontext_punc_num=no_html_url_nontext_punc.str.replace("[\d]","",regex=True)

In [22]:
for i in range(5):
  print(np.random.choice(no_html_url_nontext_punc_num.values))
  print("*" * 100)

family members of osama bin laden have died in an airplane accident how ironic  mhmmm gov shit i suspect
****************************************************************************************************
sriramk DLin pmarca Tragedy of commons pertains to public ownership Not property rights based markets The opposite of what you say
****************************************************************************************************
hiphop news indie Apollo Brown ÛÒ ÛÏDetonateÛ Ft MOP  a hrefnowfVCbMs xWOjxqndC
****************************************************************************************************
Inbounds Out of Bounds

While many picked the Nats to win the NL East in a landslide they currently sit  dEoCxUo
****************************************************************************************************
Hollywood Movie About Trapped Miners Released in Chile The  Hollywood movie about trapped miners starring moYeVjsJ
****************************************************

#### Lower Casing

In [23]:
clean_text=no_html_url_nontext_punc_num.str.lower()

In [24]:
for i in range(5):
  print(np.random.choice(clean_text.values))
  print("*" * 100)

oh teamhennessy nj obliteration  tprimo round  happy birthday djeddygnj colombian festivalû rvfidfn
****************************************************************************************************
policyholders object to clico rescue plan dvivuxz ycpfiyhg
****************************************************************************************************
sinkhole selfies you wont believe whats in the brooklyn sinkhole glyoyfoc
****************************************************************************************************
but the government will not care police will stop rioting eventually of protestors eventually some skyscrapers become plantcovered 
****************************************************************************************************
sarniamakchris hromadske kasiadear  how silly that one of only two countries that can destroy the world has a say about world security
************************************************************************************************

#### Removing stopwords

In [25]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [26]:
stop_words=stopwords.words("english")

In [27]:
def remove_stopword(inp):
  li=inp.split()
  sent=[word for word in li if word not in stop_words]
  return " ".join(sent)

In [28]:
Req_text=clean_text.apply(remove_stopword)

In [29]:
for i in range(5):
  print(np.random.choice(Req_text.values))
  print("*" * 100)

businesses deluged invoices make stand ogt colomr shape likely rise top pay pile
****************************************************************************************************
flowri marinading accident
****************************************************************************************************
newberg upheaval jacque betz responds looking forward day answer questions orcot orpol azqamooc
****************************************************************************************************
johnsontionne except idk really burning
****************************************************************************************************
blaaaaaaa said sunk face stomach making vibrate resolutevanity
****************************************************************************************************


### Model Building

#### SVM

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [31]:
text=Req_text
label=data[["target"]]

In [32]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(text.values,label,test_size=0.2,random_state=0)

In [33]:
tfidf=TfidfVectorizer()

In [34]:
X_train=tfidf.fit_transform(X_train)

In [35]:
X_train

<6090x18539 sparse matrix of type '<class 'numpy.float64'>'
	with 57650 stored elements in Compressed Sparse Row format>

In [36]:
X_test=tfidf.transform(X_test)

In [37]:
from sklearn.svm import LinearSVC
svc=LinearSVC()

In [38]:
svc.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


In [39]:
y_pred=svc.predict(X_test)

In [40]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.81      0.87      0.84       886
           1       0.79      0.71      0.75       637

    accuracy                           0.80      1523
   macro avg       0.80      0.79      0.79      1523
weighted avg       0.80      0.80      0.80      1523



#### Neural network

In [41]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Flatten,Embedding,Dropout,SimpleRNN
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D

In [42]:
token=Tokenizer()
token.fit_on_texts(text.values)

In [43]:
len(token.word_index)

21652

In [44]:
seq=token.texts_to_sequences(text.values)

In [45]:
X=pad_sequences(seq,maxlen=50,padding="post")

In [46]:
X_train,X_test,y_train,y_test=train_test_split(X,label,test_size=0.2, random_state=0)

In [47]:
from sklearn.svm import LinearSVC
svc=LinearSVC()
svc.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


In [48]:
ypred=svc.predict(X_test)

In [49]:
print(classification_report(y_test,ypred))

              precision    recall  f1-score   support

           0       0.61      0.88      0.72       886
           1       0.58      0.22      0.32       637

    accuracy                           0.61      1523
   macro avg       0.60      0.55      0.52      1523
weighted avg       0.60      0.61      0.55      1523



##### CNN

In [50]:
model = Sequential()
model.add(Embedding(21653, 100, input_length=50))

model.add(Conv1D(32, 2, activation='relu'))
model.add(MaxPooling1D(2))
model.add(Dropout(0.5))

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(16, activation='relu'))

model.add(GlobalMaxPooling1D())

model.add(Dense(1, activation='sigmoid'))

In [51]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7baf99adb5b0>

##### RNN

In [52]:
model1=Sequential([
    Embedding(21653, 100, input_length=50),
    SimpleRNN(32),
    Dense(32,activation="relu"),
    Dropout(0.5),
    Dense(16,activation="relu"),
    Dense(1, activation="sigmoid")
])

In [53]:
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model1.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7baf81697040>

In [54]:
!pip install ktrain

Collecting ktrain
  Downloading ktrain-0.37.6.tar.gz (25.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.3/25.3 MB[0m [31m68.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting langdetect (from ktrain)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m82.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting cchardet (from ktrain)
  Downloading cchardet-2.1.7.tar.gz (653 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m653.6/653.6 kB[0m [31m44.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting syntok>1.3.3 (from ktrain)
  Downloading syntok-1.4.4-py3-none-any.whl (24 kB)
Collecting tika (from ktrain)
  Downloading tika-2.6.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformer

In [55]:
import ktrain
from ktrain import text

In [56]:
(X_train, y_train), (X_test, y_test), preproc = text.texts_from_df(train_df=raw_data, text_column='text', label_columns='target', maxlen=40, preprocess_mode='bert')

['not_target', 'target']
      not_target  target
4008         0.0     1.0
4710         1.0     0.0
3795         0.0     1.0
5753         1.0     0.0
6452         0.0     1.0
['not_target', 'target']
      not_target  target
2896         1.0     0.0
5673         0.0     1.0
5154         1.0     0.0
6522         1.0     0.0
4770         0.0     1.0
downloading pretrained BERT model (uncased_L-12_H-768_A-12.zip)...
[██████████████████████████████████████████████████]
extracting pretrained BERT model...
done.

cleanup downloaded zip...
done.

preprocessing train...
language: en


Is Multi-Label? False
preprocessing test...
language: en


In [57]:
model = text.text_classifier(name='bert', train_data=(X_train, y_train), preproc=preproc)

Is Multi-Label? False
maxlen is 40




done.


In [58]:
learner = ktrain.get_learner(model=model, train_data=(X_train, y_train), val_data=(X_test, y_test), batch_size=64)

In [78]:
learner.fit_onecycle(lr=0.001,epochs=5)



begin training using onecycle policy with max lr of 0.001...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7baef68b7760>

In [60]:
predictor = ktrain.get_predictor(learner.model, preproc)

In [76]:
inp="Forest fire near La Ronge Sask"
yprediction=predictor.predict(inp,pred)
yprediction

'not_target'

In [73]:
classes=predictor.get_classes()
classes

['not_target', 'target']

In [70]:
classes[(np.argmax(yprediction))]

'not_target'

##### DistilBERT

In [82]:
train,test,preproc= text.texts_from_df(train_df=raw_data, text_column='text', label_columns='target', maxlen=40, preprocess_mode='distilbert')
model2=text.text_classifier(name='distilbert', train_data=train, preproc=preproc)

['not_target', 'target']
      not_target  target
6115         1.0     0.0
7009         1.0     0.0
7090         0.0     1.0
3956         0.0     1.0
4637         0.0     1.0
['not_target', 'target']
      not_target  target
6307         1.0     0.0
7153         1.0     0.0
6209         1.0     0.0
1238         1.0     0.0
4373         0.0     1.0
preprocessing train...
language: en
train sequence lengths:
	mean : 15
	95percentile : 24
	99percentile : 27


Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 15
	95percentile : 24
	99percentile : 27


Is Multi-Label? False
maxlen is 40
done.


In [83]:
learner=ktrain.get_learner(model=model2,train_data=train,val_data=test,batch_size=32)

In [84]:
learner.fit_onecycle(lr = 0.001, epochs=5)



begin training using onecycle policy with max lr of 0.001...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7baf8163c550>

In [None]:
predictor = ktrain.get_predictor(learner.model, preproc)

In [86]:
inp="Forest fire near La Ronge Sask"
yprediction=predictor.predict(inp)
yprediction

'not_target'