**# Importing Libraries**

In [1]:
import numpy as np
import pandas as pd

Loading and Reading DataSet

In [2]:
df = pd.read_csv("disaster_tweets_data(DS).csv")

In [3]:
df.head()

Unnamed: 0,tweets,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


Preprocessing

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

In [5]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
sw = set(stopwords.words('english'))
sw

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [7]:
lm = WordNetLemmatizer()

Iterating through dataset

In [8]:
for i in df['tweets']:
  print(i)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Our doctors and nurses in the new Pediatric Emergency Department are all specialized in child services! http://t.co/k1TMLWvjmJ
#MissionHills CA #Nursing : Registered Nurse - Emergency Department ( Full Time... at Providence Health &amp; Services http://t.co/Z5grLREy6V
Just saw a car on the I-77 Fully engulfed in flames hahah
Men escape car engulfed in flames in Parley's Canyon crews investigating cause - http://t.co/P6cyLz5lpt http://t.co/Jpu9gIps9f
Men escape car engulfed in flames in Parley's Canyon crews investigating cause - http://t.co/CYzlshlQhG http://t.co/nDiS8f1vzt
He came to a land which was engulfed in tribal war and turned it into a land of peace i.e. Madinah. #ProphetMuhammad #islam
#TRAFFICALERT  Eastbound 210 Freeway at Citrus Ave in Azusa. Two motorcycles involved in accident with one fully engulfed in flames in lanes
Men escape car engulfed in flames in Parley's Canyon crews investigating cause - http://t

In [9]:
import re

In [10]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Preprocessing on first 5 Lines of Dataset

In [12]:
data = []
for i in df['tweets'].iloc[:5]:
  t = i.lower()                         # Converting to lowercase
  t = re.sub('[^A-Za-z0-9]',' ',t)      # removing punc  --> List of sentences
  t = word_tokenize(t)                  # Word tokenization --> List of list of words
  t =  [i for i in t if i not in sw]    # Removing the stopwords
  t = [lm.lemmatize(i) for i in t]      # Lemmatization
  t = " ".join(t)                       # Words to sentence formation
  data.append(t)
print(data)

['deed reason earthquake may allah forgive u', 'forest fire near la ronge sask canada', 'resident asked shelter place notified officer evacuation shelter place order expected', '13 000 people receive wildfire evacuation order california', 'got sent photo ruby alaska smoke wildfire pours school']


In [13]:
cleaned_tweets = []
for i in df['tweets']:
  t = i.lower()                         # Converting to lowercase
  t = re.sub('[^A-Za-z0-9]',' ',t)      # removing punc  --> List of sentences
  t = word_tokenize(t)                  # Word tokenization --> List of list of words
  t =  [i for i in t if i not in sw]    # Removing the stopwords
  t = [lm.lemmatize(i) for i in t]      # Lemmatization
  t = " ".join(t)                       # Words to sentence formation
  cleaned_tweets.append(t)
print(cleaned_tweets)



In [14]:
print(len(cleaned_tweets))

7613


In [15]:
x = np.array(cleaned_tweets)
y = df['target']
print(type(x))
print(type(y))

<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>


Splitting training and testing Dataset

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(6090,)
(1523,)
(6090,)
(1523,)


Vectorization

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
cv = CountVectorizer(max_features=4800)
cv.fit(x_train)
x_train_cv = cv.transform(x_train).toarray()
x_test_cv = cv.transform(x_test).toarray()

In [20]:
print(x_train_cv.shape)
print(x_test_cv.shape)

(6090, 4800)
(1523, 4800)


Multinomial Naïve Bayes Classification

In [21]:
from sklearn.naive_bayes import MultinomialNB

In [22]:
mnb = MultinomialNB()
mnb.fit(x_train_cv,y_train)

In [23]:
print("Train Score : ", mnb.score(x_train_cv,y_train))
print("Test Score : ", mnb.score(x_test_cv,y_test))

Train Score :  0.8673234811165845
Test Score :  0.7984241628365069


In [24]:
ypred = mnb.predict(x_test_cv)
print(len(ypred))

1523


In [25]:
from sklearn.metrics import confusion_matrix, classification_report

In [26]:
print(confusion_matrix(y_test,ypred))
print(classification_report(y_test,ypred))

[[748 126]
 [181 468]]
              precision    recall  f1-score   support

           0       0.81      0.86      0.83       874
           1       0.79      0.72      0.75       649

    accuracy                           0.80      1523
   macro avg       0.80      0.79      0.79      1523
weighted avg       0.80      0.80      0.80      1523



Logistic Regression

In [27]:
from sklearn.linear_model import LogisticRegression

In [28]:
logReg = LogisticRegression()
logReg.fit(x_train_cv,y_train)

In [29]:
print("Train Score : ", logReg.score(x_train_cv,y_train))
print("Test Score : ", logReg.score(x_test_cv,y_test))

Train Score :  0.929064039408867
Test Score :  0.8003939592908733


In [30]:
ypred = mnb.predict(x_test_cv)
print(len(ypred))

1523


In [31]:
print(confusion_matrix(y_test,ypred))
print(classification_report(y_test,ypred))

[[748 126]
 [181 468]]
              precision    recall  f1-score   support

           0       0.81      0.86      0.83       874
           1       0.79      0.72      0.75       649

    accuracy                           0.80      1523
   macro avg       0.80      0.79      0.79      1523
weighted avg       0.80      0.80      0.80      1523



KNN Classification

In [32]:
from sklearn.neighbors import KNeighborsClassifier

In [33]:
knnClassifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p=2)
knnClassifier.fit(x_train_cv,y_train)

In [34]:
print("Train Score : ", knnClassifier.score(x_train_cv,y_train))
print("Test Score : ", knnClassifier.score(x_test_cv,y_test))

Train Score :  0.7540229885057471
Test Score :  0.7019041365725541


In [35]:
ypred = knnClassifier.predict(x_test_cv)
print(len(ypred))

1523


In [36]:
print(confusion_matrix(y_test,ypred))
print(classification_report(y_test,ypred))

[[838  36]
 [418 231]]
              precision    recall  f1-score   support

           0       0.67      0.96      0.79       874
           1       0.87      0.36      0.50       649

    accuracy                           0.70      1523
   macro avg       0.77      0.66      0.65      1523
weighted avg       0.75      0.70      0.67      1523



Logistic Regression Performs well Here...