### Importing Basic Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Importing Data

In [4]:
raw_data = pd.read_csv('twitter-disaster-prediction-dataset.csv')

### Exploring the Data

In [5]:
raw_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
raw_data.shape

(7613, 5)

In [7]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [8]:
data=raw_data.drop(columns=["id","keyword","location"])

In [9]:
data.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [11]:
# checking imbalance in data
data["target"].value_counts()

0    4342
1    3271
Name: target, dtype: int64

### Text Preprocessing

In [12]:
# Seeing the text
for i in range(5):
  print(np.random.choice(data["text"].values))
  print("*" * 100)

@Dead_Dreamer15 ...because if it were on fire that'd be a safety hazard
****************************************************************************************************
Only been back 10 &amp; a whirlwind has hit jaiden started open his present straight away didn't even get chance get in &amp; sit down lol
****************************************************************************************************
As of 2010 there were 17 Beluga deaths reported at #SeaWorld their average age 15 1/2 years #OpSeaWorld http://t.co/MZk5UjlFCV
****************************************************************************************************
|| So.... I just watched the trailed for The Dust Storm and I think part of me just died.... Colin is so perfect my goodness.
****************************************************************************************************
This is the natural and unavoidable consequence of socialism everywhere it has been tried.
http://t.co/BbDpnj8XSx A
****************

In [16]:
for i in range(5):
  print(np.random.choice(data["text"].values))
  print("*" * 100)

Urgent! Save the Salt River #WildHorses! Mass murder by the very ppl supposed to protect them?  --&gt; http://t.co/14wH0pJJ2C @CNN @CBC
****************************************************************************************************
#Reddit updates #content #policy promises to quarantine Û÷extremely offensiveÛª communities http://t.co/EHGtZhKAn4
****************************************************************************************************
Why does my phone electrocute me when it's charging
****************************************************************************************************
Uhhhhh demon hunters. But not the whole Burning Crusade v 2.0 thing.  https://t.co/oPtpS1lgKC
****************************************************************************************************
(#LosDelSonido) Obama Declares Disaster for Typhoon-Devastated Saipan: Obama signs disaster declaration for Northern Ma...  (#IvanBerroa)
***********************************************************

#### Removing HTML Tags

In [13]:
from bs4 import BeautifulSoup

In [14]:
def remove_html_tags(text_inp):
  soupp=BeautifulSoup(text_inp,"html.parser")
  plain_text=soupp.get_text()
  return plain_text

In [15]:
no_html_text=data["text"].apply(remove_html_tags)

  soupp=BeautifulSoup(text_inp,"html.parser")


In [17]:
for i in range(5):
  print(np.random.choice(no_html_text.values))
  print("*" * 100)

Gas leak forces evacuation in east Saint John http://t.co/E1vkc2efsT #NB http://t.co/BeUa507Iug
****************************************************************************************************
The @rbcinsurance quote website = disaster. Tried 3 browsers & 3 machines. Always get 'Missing Info' error due to a non-existant drop down.
****************************************************************************************************
I get to smoke my shit in peace
****************************************************************************************************
The Catastrophic Effects of Hiroshima and Nagasaki Atomic Bombings Still Being Felt Today http://t.co/TzxeG4gOkD
****************************************************************************************************
Summer is lovely
****************************************************************************************************


#### Removing URL

In [19]:
no_html_url=no_html_text.str.replace("https?://t.co/\w","",regex=True)

In [22]:
for i in range(5):
  print(np.random.choice(no_html_url.values))
  print("*" * 100)

TENSION IN ABIA AS INEC OFFICEÛªS RAZEDåÊÛÒ GOVERNOR IKPEAZU PDP APGA REACT KzZOe5CE6
****************************************************************************************************
Ranking #artectura #pop2015 #Nå¼36 Florence + The Machine - Ship To Wreck LE0B19lVF  #music #playlist #YouTube
****************************************************************************************************
@BrookTekle_ didn't look like a murder scene just 1 cops a fire truck and 2 fire assistance cars along with a helicopter
****************************************************************************************************
@stury Note there were no passengers on board when the train derailed this morning.
****************************************************************************************************
i'm sorry i'm so wild in 1d shows like in my wwa show niall started singing steal my girl I literally screamed shut the fuck up
****************************************************************

#### Removing non text characters

In [37]:
no_html_url_nontext=no_html_url.str.replace("[^\w\s]","",regex=True)

In [43]:
for i in range(5):
  print(np.random.choice(no_html_url_nontext.values))
  print("*" * 100)

Accident in Ashville on US 23 SB before SR 752 traffic ylMo0WgFI
****************************************************************************************************
Poor Jack 
****************************************************************************************************
Bloody ell Let it burn RubyBot
****************************************************************************************************
Militants attack police post in Udhampur 2 SPOs injured

Suspected militants Thursday  attacked a police post in o0j9FCPBi
****************************************************************************************************
Vince McMahon once again a billionaire I remember reading a deluge of posts about Vince McMahon losing 350 m o0oz3RYFg
****************************************************************************************************


#### Removing Punctuations

In [45]:
#This may be redundant
import string
no_html_url_nontext_punc=no_html_url_nontext.str.translate(str.maketrans("","",string.punctuation))

In [47]:
for i in range(5):
  print(np.random.choice(no_html_url_nontext_punc.values))
  print("*" * 100)

WillHillBet what is double result live on the app
****************************************************************************************************
Firefigthers Evacuate from Northampton Township House Fire PplD1jHtZ
****************************************************************************************************
CONFIRMED Sanchez Hazard and Bolasie will be out for the rest of the season Ct01nEptL
****************************************************************************************************
malistkiss Sunnis continue to believe they are more righteous and they continually harm Shias Defeats the ideals of Islam
****************************************************************************************************
Hi yall this poem is called is the one about the snowstorm when we meet in space and that one time it rained Thx Ur watching disney chann
****************************************************************************************************


#### Removing numbers

In [48]:
no_html_url_nontext_punc_num=no_html_url_nontext_punc.str.replace("[\d]","",regex=True)

In [49]:
for i in range(5):
  print(np.random.choice(no_html_url_nontext_punc_num.values))
  print("*" * 100)

PTSDChat Yes I feel the root of that is Shame  which can be found in the rubble of most trauma PTSDchat
****************************************************************************************************
Severe Thunderstorm pictures from across the MidSouth ZWLgJQzNS
****************************************************************************************************
This bowl got me thinking Damn Ive been blazing for so damn long
****************************************************************************************************
ictyosaur I never thought it would be a wtf moment yet its here after months of  degree heat
Next we will have flash floods
****************************************************************************************************
Maj Muzzamil Pilot Offr of MI crashed near Mansehra today LRccWct
****************************************************************************************************


#### Lower Casing

In [50]:
clean_text=no_html_url_nontext_punc_num.str.lower()

In [51]:
for i in range(5):
  print(np.random.choice(clean_text.values))
  print("*" * 100)

banditregina i also loved the episode bang in season  when caroline bigsby took hostages in the supermarket
****************************************************************************************************
related news isis video threatens hostage  europe  cnn   kbzo
****************************************************************************************************
blizzarddraco lonewolffur i need this
****************************************************************************************************
a little filming inside a nuclear reactor at chernobyl sonyprousa lumixusa djiglobal profbriancox rtamerica gljhvead
****************************************************************************************************
yearold boy charged with manslaughter of toddler report an yearold boy has been charged with manslaughter over the fatal sh
****************************************************************************************************


#### Removing stopwords

In [58]:
from nltk.corpus import stopwords

In [63]:
stop_words=stopwords.words("english")

In [64]:
def remove_stopword(inp):
  li=inp.split()
  sent=[word for word in li if word not in stop_words]
  return " ".join(sent)

In [65]:
Req_text=clean_text.apply(remove_stopword)

In [66]:
for i in range(5):
  print(np.random.choice(Req_text.values))
  print("*" * 100)

yearold boy charged manslaughter toddler report yearold boy charged manslaughter fatal sh
****************************************************************************************************
doctorfluxx stefanejones spinnellii themermacorn burning buildings rob riot thats embarrassing ruining nation
****************************************************************************************************
entension bayelsa patience jonathan plans hijack apcåêpdp ufeibala iulqprxke
****************************************************************************************************
fifth predynastic king legendary period deluge rknephgu dumuzid shepherd
****************************************************************************************************
unlicensed teen driver among killed nc crash ocakehyx
****************************************************************************************************
