# Disaster Tweet Prediction

## i. Introduction

Name : Darmawan Wijaya </br>
Purpose: Apply Natural Language Processing to predict tweet

## ii. Import Library

In [1]:
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import sklearn
import tensorflow as tf
from tensorflow import keras
import os
import re
import string

In [2]:
import itertools
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.dummy import DummyClassifier

## iii. Data Loading

In [3]:
df = pd.read_csv('train.csv')

In [4]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [6]:
df.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [7]:
df.location.nunique()

3341

In [8]:
df.keyword.nunique()

221

In [9]:
disastertweet = df[['target', 'text']].copy()

In [10]:
disastertweet.head()

Unnamed: 0,target,text
0,1,Our Deeds are the Reason of this #earthquake M...
1,1,Forest fire near La Ronge Sask. Canada
2,1,All residents asked to 'shelter in place' are ...
3,1,"13,000 people receive #wildfires evacuation or..."
4,1,Just got sent this photo from Ruby #Alaska as ...


## Preprocessing

In [11]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
disastertweet.reset_index(inplace=True)

In [13]:
# stopwords
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))
print(stop_words)

{'her', 've', 'themselves', "wasn't", 'above', 'she', 'whom', 'out', 'he', 'each', 'too', 'll', 'aren', 'they', 'there', "that'll", 'haven', "shouldn't", 'all', 'in', 'them', 'if', 't', 'couldn', 'who', "weren't", 'while', 'and', 'under', 'his', "hadn't", 'himself', 'weren', 'what', 'me', 'your', 'through', 'any', 'shouldn', 'until', 'this', "she's", 'because', 'from', 'that', 'not', 'you', 'down', 'further', 'o', 'having', 'most', 'only', "you'd", 'am', 'didn', 'nor', 'here', 'y', 'itself', "aren't", "isn't", 'myself', 'of', 'its', 'were', "should've", 'between', 'ourselves', 'once', 'more', "haven't", 'our', 'isn', 'some', 'we', 'have', 'theirs', 'when', 'i', 'own', 'those', 'had', 'yourselves', 'do', 'hers', 'hasn', 'mustn', 'for', 'during', 'same', 'again', 'my', 'just', 'd', 'into', 'few', "you'll", 'where', 'these', 'before', 'did', 'against', "it's", 'been', 'other', 'below', 'which', "didn't", 'yours', 'over', "wouldn't", 'how', 'with', 'm', 'hadn', 'be', 'a', 'very', 'shan', '

In [14]:
# Stopwords
disastertweet['stopwords'] = disastertweet['text'].apply(lambda x: len([x for x in x.split() if x in stop_words]))
disastertweet[['text','stopwords']].head()

Unnamed: 0,text,stopwords
0,Our Deeds are the Reason of this #earthquake M...,5
1,Forest fire near La Ronge Sask. Canada,0
2,All residents asked to 'shelter in place' are ...,9
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,6


In [15]:
# Punctuation
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return count

disastertweet['punctuation'] = disastertweet['text'].apply(lambda x: count_punct(x))
disastertweet[['text','punctuation']].head()

Unnamed: 0,text,punctuation
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,3
3,"13,000 people receive #wildfires evacuation or...",2
4,Just got sent this photo from Ruby #Alaska as ...,2


In [16]:
# Hastag
disastertweet['hastags'] = disastertweet['text'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
disastertweet[['text','hastags']].head()

Unnamed: 0,text,hastags
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,0
2,All residents asked to 'shelter in place' are ...,0
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,2


In [17]:
# Numbers / Digits
disastertweet['numerics'] = disastertweet['text'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
disastertweet[['text','numerics']].head()

Unnamed: 0,text,numerics
0,Our Deeds are the Reason of this #earthquake M...,0
1,Forest fire near La Ronge Sask. Canada,0
2,All residents asked to 'shelter in place' are ...,0
3,"13,000 people receive #wildfires evacuation or...",0
4,Just got sent this photo from Ruby #Alaska as ...,0


In [18]:
# Mention
disastertweet['mention'] = disastertweet['text'].apply(lambda x: len([x for x in x.split() if x.startswith('@')]))
disastertweet[['text','mention']].head()

Unnamed: 0,text,mention
0,Our Deeds are the Reason of this #earthquake M...,0
1,Forest fire near La Ronge Sask. Canada,0
2,All residents asked to 'shelter in place' are ...,0
3,"13,000 people receive #wildfires evacuation or...",0
4,Just got sent this photo from Ruby #Alaska as ...,0


In [19]:
# URL
disastertweet['URL'] = disastertweet['text'].apply(lambda x: len([x for x in x.split() if x.startswith('http' or 'https')]))
disastertweet[['text','URL']].head()

Unnamed: 0,text,URL
0,Our Deeds are the Reason of this #earthquake M...,0
1,Forest fire near La Ronge Sask. Canada,0
2,All residents asked to 'shelter in place' are ...,0
3,"13,000 people receive #wildfires evacuation or...",0
4,Just got sent this photo from Ruby #Alaska as ...,0


In [20]:
disastertweet

Unnamed: 0,index,target,text,stopwords,punctuation,hastags,numerics,mention,URL
0,0,1,Our Deeds are the Reason of this #earthquake M...,5,1,1,0,0,0
1,1,1,Forest fire near La Ronge Sask. Canada,0,1,0,0,0,0
2,2,1,All residents asked to 'shelter in place' are ...,9,3,0,0,0,0
3,3,1,"13,000 people receive #wildfires evacuation or...",1,2,1,0,0,0
4,4,1,Just got sent this photo from Ruby #Alaska as ...,6,2,2,0,0,0
...,...,...,...,...,...,...,...,...,...
7608,7608,1,Two giant cranes holding a bridge collapse int...,2,5,0,0,0,1
7609,7609,1,@aria_ahrary @TheTawniest The out of control w...,7,5,0,0,2,0
7610,7610,1,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,11,0,0,0,1
7611,7611,1,Police investigating after an e-bike collided ...,5,5,0,0,0,0


In [21]:
# Uppercase
disastertweet['upper'] = disastertweet['text'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
disastertweet[['text','upper']].head()

Unnamed: 0,text,upper
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,0
2,All residents asked to 'shelter in place' are ...,0
3,"13,000 people receive #wildfires evacuation or...",0
4,Just got sent this photo from Ruby #Alaska as ...,0


In [22]:
# Make all Text LowerCase
disastertweet['text'] = disastertweet['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
disastertweet['text'].head()

0    our deeds are the reason of this #earthquake m...
1               forest fire near la ronge sask. canada
2    all residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    just got sent this photo from ruby #alaska as ...
Name: text, dtype: object

In [23]:
disastertweet.tail()

Unnamed: 0,index,target,text,stopwords,punctuation,hastags,numerics,mention,URL,upper
7608,7608,1,two giant cranes holding a bridge collapse int...,2,5,0,0,0,1,0
7609,7609,1,@aria_ahrary @thetawniest the out of control w...,7,5,0,0,2,0,0
7610,7610,1,m1.94 [01:04 utc]?5km s of volcano hawaii. htt...,1,11,0,0,0,1,2
7611,7611,1,police investigating after an e-bike collided ...,5,5,0,0,0,0,0
7612,7612,1,the latest: more homes razed by northern calif...,1,7,0,0,0,1,1


In [24]:
#Remove URL
def remove_url(text):
  text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
  return text
nourl = lambda x: remove_url(x)

In [25]:
disastertweet['text']=disastertweet['text'].apply(nourl)
disastertweet['text']

0       our deeds are the reason of this #earthquake m...
1                  forest fire near la ronge sask. canada
2       all residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       just got sent this photo from ruby #alaska as ...
                              ...                        
7608    two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @thetawniest the out of control w...
7610    m1.94 [01:04 utc]?5km s of volcano hawaii. htt...
7611    police investigating after an e-bike collided ...
7612    the latest: more homes razed by northern calif...
Name: text, Length: 7613, dtype: object

In [26]:
disastertweet['text'].tail()

7608    two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @thetawniest the out of control w...
7610    m1.94 [01:04 utc]?5km s of volcano hawaii. htt...
7611    police investigating after an e-bike collided ...
7612    the latest: more homes razed by northern calif...
Name: text, dtype: object

In [27]:
#Remove Mentions
def remove_mention(text):
  text = re.sub("@[A-Za-z0-9_]+","", text)
  return text
nomention = lambda x: remove_mention(x)

In [28]:
disastertweet['text']=disastertweet['text'].apply(nomention)
disastertweet['text']

0       our deeds are the reason of this #earthquake m...
1                  forest fire near la ronge sask. canada
2       all residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       just got sent this photo from ruby #alaska as ...
                              ...                        
7608    two giant cranes holding a bridge collapse int...
7609      the out of control wild fires in california ...
7610    m1.94 [01:04 utc]?5km s of volcano hawaii. htt...
7611    police investigating after an e-bike collided ...
7612    the latest: more homes razed by northern calif...
Name: text, Length: 7613, dtype: object

In [29]:
def remove_hashtag(text):
  text = re.sub("#[A-Za-z0-9_]+","", text)
  return text
nohash = lambda x: remove_hashtag(x)

In [30]:
disastertweet['text']=disastertweet['text'].apply(nohash)
disastertweet['text']

0       our deeds are the reason of this  may allah fo...
1                  forest fire near la ronge sask. canada
2       all residents asked to 'shelter in place' are ...
3       13,000 people receive  evacuation orders in ca...
4       just got sent this photo from ruby  as smoke f...
                              ...                        
7608    two giant cranes holding a bridge collapse int...
7609      the out of control wild fires in california ...
7610    m1.94 [01:04 utc]?5km s of volcano hawaii. htt...
7611    police investigating after an e-bike collided ...
7612    the latest: more homes razed by northern calif...
Name: text, Length: 7613, dtype: object

In [31]:
disastertweet

Unnamed: 0,index,target,text,stopwords,punctuation,hastags,numerics,mention,URL,upper
0,0,1,our deeds are the reason of this may allah fo...,5,1,1,0,0,0,1
1,1,1,forest fire near la ronge sask. canada,0,1,0,0,0,0,0
2,2,1,all residents asked to 'shelter in place' are ...,9,3,0,0,0,0,0
3,3,1,"13,000 people receive evacuation orders in ca...",1,2,1,0,0,0,0
4,4,1,just got sent this photo from ruby as smoke f...,6,2,2,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
7608,7608,1,two giant cranes holding a bridge collapse int...,2,5,0,0,0,1,0
7609,7609,1,the out of control wild fires in california ...,7,5,0,0,2,0,0
7610,7610,1,m1.94 [01:04 utc]?5km s of volcano hawaii. htt...,1,11,0,0,0,1,2
7611,7611,1,police investigating after an e-bike collided ...,5,5,0,0,0,0,0


In [32]:
# Remove Punctuation
disastertweet['text'] = disastertweet['text'].str.replace('[^\w\s]','')
disastertweet['text'].head()

0    our deeds are the reason of this  may allah fo...
1                forest fire near la ronge sask canada
2    all residents asked to shelter in place are be...
3    13000 people receive  evacuation orders in cal...
4    just got sent this photo from ruby  as smoke f...
Name: text, dtype: object

In [33]:
# Remove Stopwords
from nltk.corpus import stopwords
stop = stopwords.words('english')

disastertweet['text'] = disastertweet['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
disastertweet['text'].sample(10)

3642    govt allocating 13 bn flood action issue takes...
3497    super freestyle explosion live concert amalie ...
380     two jewish terrorists charged historicchurch a...
748                            go look blew w atomic bomb
3046    usgs reports m194 5km volcano hawaii 8615 1040...
6748    73rd goode water ski national championships go...
2562    get 50 eur free bet bwin use sports markets ht...
76            accident knew gon happen httpstcoysxun5vceh
5758    meatloving feminists world riot grill arrived ...
806     put taint magisters open gates let blight get ...
Name: text, dtype: object

In [34]:
def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

round1 = lambda x: clean_text_round1(x)

In [35]:
disastertweet['text']=disastertweet['text'].apply(round1)
disastertweet['text']

0                       deeds reason may allah forgive us
1                   forest fire near la ronge sask canada
2       residents asked shelter place notified officer...
3             people receive evacuation orders california
4                  got sent photo ruby smoke pours school
                              ...                        
7608    two giant cranes holding bridge collapse nearb...
7609    control wild fires california even northern pa...
7610                                      volcano hawaii 
7611    police investigating ebike collided car little...
7612    latest homes razed northern california wildfir...
Name: text, Length: 7613, dtype: object

In [36]:
def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    return text

round2 = lambda x: clean_text_round2(x)

In [37]:
disastertweet['text']=disastertweet['text'].apply(round2)
disastertweet['text']

0                       deeds reason may allah forgive us
1                   forest fire near la ronge sask canada
2       residents asked shelter place notified officer...
3             people receive evacuation orders california
4                  got sent photo ruby smoke pours school
                              ...                        
7608    two giant cranes holding bridge collapse nearb...
7609    control wild fires california even northern pa...
7610                                      volcano hawaii 
7611    police investigating ebike collided car little...
7612    latest homes razed northern california wildfir...
Name: text, Length: 7613, dtype: object

In [38]:
# Number of Character
disastertweet['char_count'] = disastertweet['text'].str.len()
disastertweet[['text','char_count']].head()

Unnamed: 0,text,char_count
0,deeds reason may allah forgive us,33
1,forest fire near la ronge sask canada,37
2,residents asked shelter place notified officer...,88
3,people receive evacuation orders california,44
4,got sent photo ruby smoke pours school,38


In [39]:
# Average Words
def avg_word(sentence):
  words = sentence.split()
  return (sum(len(word) for word in words)/(len(words)+0.000001))

In [40]:
disastertweet['avg_word'] = disastertweet['text'].apply(lambda x: avg_word(x)).round(1)
disastertweet[['text','avg_word']].head()

Unnamed: 0,text,avg_word
0,deeds reason may allah forgive us,4.7
1,forest fire near la ronge sask canada,4.4
2,residents asked shelter place notified officer...,7.1
3,people receive evacuation orders california,7.8
4,got sent photo ruby smoke pours school,4.6


In [41]:
# Write to CSV
disastertweet.to_csv('Disaster_tweet_processed.csv', index=False)

## Define Model

In [42]:
tweet = pd.read_csv('Disaster_tweet_processed.csv')

In [43]:
tweet.isnull().sum()

index           0
target          0
text           58
stopwords       0
punctuation     0
hastags         0
numerics        0
mention         0
URL             0
upper           0
char_count      0
avg_word        0
dtype: int64

In [44]:
tweet.dropna(subset=['text'], inplace=True)

In [45]:
X = tweet.text
y = tweet.target
print(X.shape, y.shape)

(7555,) (7555,)


In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
print ('Train Set Shape\t\t:{}\nTest Set Shape\t\t:{}'.format(X_train.shape, X_test.shape))

Train Set Shape		:(6044,)
Test Set Shape		:(1511,)


## CountVectorizer Bag of Word

In [47]:
# Create the word vector with CountVectorizer

count_vect = CountVectorizer(ngram_range=(1,1))
count_vect_train = count_vect.fit_transform(X_train)
count_vect_train = count_vect_train.toarray()
count_vect_test = count_vect.transform(X_test)
count_vect_test = count_vect_test.toarray()

In [48]:
# Print vocabulary length
print('Vocabulary length :', len(count_vect.get_feature_names()))

Vocabulary length : 12484


In [49]:
# Assign feature names of vector into a variable
vocab = count_vect.get_feature_names()

In [50]:
# Dataframe for train countvectorizer dataset
pd.DataFrame(count_vect_train, columns = vocab).head()

Unnamed: 0,aa,aaaa,aaaaaaallll,aaaaaand,aaarrrgghhh,aampb,aampw,aan,aannnnd,aar,aashiqui,aba,abandon,abandoned,abandoning,abbandoned,abbott,abbswinston,abc,abcnews,abcs,abe,aberdeen,aberystwythshrewsbury,abes,abia,ability,abject,ablaze,able,aboard,abomb,abombed,abomination,abortion,abortions,abouts,abs,absence,absolute,...,ûnotherû,ûplot,ûpolitics,ûransomwareûª,ûvulnerableûª,ûªs,ûåêdemolition,ûï,ûïa,ûïafter,ûïairplaneû,ûïall,ûïcat,ûïdetonateû,ûïfor,ûïhatchet,ûïi,ûïlittle,ûïlove,ûïmake,ûïnewsû,ûïnobody,ûïnumbers,ûïparties,ûïplans,ûïrichmond,ûïsippinûª,ûïstretcher,ûïthe,ûïwe,ûïwhen,ûïyou,ûïû,ûò,ûòthe,ûòåêcnbc,ûó,ûóher,ûókody,ûûif
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Naive Bayes

In [51]:
NB = MultinomialNB()

In [52]:
NB.fit(count_vect_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [53]:
y_pred = NB.predict(count_vect_test)


In [54]:
# Compute and print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.87      0.82       840
           1       0.81      0.68      0.74       671

    accuracy                           0.79      1511
   macro avg       0.79      0.78      0.78      1511
weighted avg       0.79      0.79      0.78      1511



In [55]:
#Confusion Matrix
cmnb = confusion_matrix(y_test, y_pred)
print('Confusion matrix\n\n', cmnb)

Confusion matrix

 [[732 108]
 [215 456]]


In [56]:
#Perhitungan kategori dalam confusion matrix
tpnb = cmnb[1,1]
tnnb = cmnb[0,0]
fpnb = cmnb[0,1]
fnnb = cmnb[1,0]

print('True Positives(TP) = ', tpnb)
print('True Negatives(TN) = ', tnnb)
print('False Positives(FP) = ', fpnb)
print('False Negatives(FN) = ', fnnb)

True Positives(TP) =  456
True Negatives(TN) =  732
False Positives(FP) =  108
False Negatives(FN) =  215


In [57]:
# Perhitungan evaluation metrics
accunb = (tpnb+tnnb)/(tpnb+tnnb+fpnb+fnnb)
misclassnb = (fnnb+fpnb)/(tpnb+tnnb+fpnb+fnnb)
precisnb = tpnb/(tpnb+fpnb)
sensinb = tpnb/(tpnb+fnnb)
specinb = tnnb/(tnnb+fpnb)
f1nb = 2 * (precisnb*sensinb) / (precisnb+sensinb)


print('Accuracy is ',accunb)
print('Misclassification is ',misclassnb)
print('Precision is ',precisnb)
print('Sensitivity is ',sensinb)
print('Specificity is',specinb)
print('F-1 Score is ',f1nb)


Accuracy is  0.786234281932495
Misclassification is  0.21376571806750497
Precision is  0.8085106382978723
Sensitivity is  0.6795827123695977
Specificity is 0.8714285714285714
F-1 Score is  0.7384615384615386


## Logistic Regression

In [58]:
LogReg = LogisticRegression()

In [59]:
LogReg.fit(count_vect_train,y_train)
y_predlogreg = LogReg.predict(count_vect_test)


In [60]:
# Compute and print the classification report
print(classification_report(y_test, y_predlogreg))

              precision    recall  f1-score   support

           0       0.77      0.88      0.82       840
           1       0.82      0.66      0.73       671

    accuracy                           0.79      1511
   macro avg       0.79      0.77      0.78      1511
weighted avg       0.79      0.79      0.78      1511



In [61]:
#Confusion Matrix
cmlogreg = confusion_matrix(y_test, y_predlogreg)
print('Confusion matrix\n\n', cmlogreg)

#Perhitungan kategori dalam confusion matrix
tplogreg = cmlogreg[1,1]
tnlogreg = cmlogreg[0,0]
fplogreg = cmlogreg[0,1]
fnlogreg = cmlogreg[1,0]

print('True Positives(TP) = ', tplogreg)
print('True Negatives(TN) = ', tnlogreg)
print('False Positives(FP) = ', fplogreg)
print('False Negatives(FN) = ', fnlogreg)

# Perhitungan evaluation metrics
acculogreg = (tplogreg+tnlogreg)/(tplogreg+tnlogreg+fplogreg+fnlogreg)
misclasslogreg = (fnlogreg+fplogreg)/(tplogreg+tnlogreg+fplogreg+fnlogreg)
precislogreg = tplogreg/(tplogreg+fplogreg)
sensilogreg = tplogreg/(tplogreg+fnlogreg)
specilogreg = tnlogreg/(tnlogreg+fplogreg)
f1logreg = 2 * (precislogreg*sensilogreg) / (precislogreg+sensilogreg)


print('Accuracy is ',acculogreg)
print('Misclassification is ',misclasslogreg)
print('Precision is ',precislogreg)
print('Sensitivity is ',sensilogreg)
print('Specificity is',specilogreg)
print('F-1 Score is ',f1logreg)


Confusion matrix

 [[742  98]
 [225 446]]
True Positives(TP) =  446
True Negatives(TN) =  742
False Positives(FP) =  98
False Negatives(FN) =  225
Accuracy is  0.786234281932495
Misclassification is  0.21376571806750497
Precision is  0.8198529411764706
Sensitivity is  0.6646795827123696
Specificity is 0.8833333333333333
F-1 Score is  0.7341563786008231


## SVM Model

In [62]:
SVM = SVC()

In [63]:
SVM.fit(count_vect_train,y_train)
y_predsvm = SVM.predict(count_vect_test)


In [64]:
# Compute and print the classification report
print(classification_report(y_test, y_predsvm))

              precision    recall  f1-score   support

           0       0.74      0.93      0.82       840
           1       0.87      0.60      0.71       671

    accuracy                           0.78      1511
   macro avg       0.80      0.76      0.77      1511
weighted avg       0.80      0.78      0.77      1511



In [65]:
#Confusion Matrix
cmsvm = confusion_matrix(y_test, y_predsvm)
print('Confusion matrix\n\n', cmsvm)

#Perhitungan kategori dalam confusion matrix
tpsvm = cmsvm[1,1]
tnsvm = cmsvm[0,0]
fpsvm = cmsvm[0,1]
fnsvm = cmsvm[1,0]

print('True Positives(TP) = ', tpsvm)
print('True Negatives(TN) = ', tnsvm)
print('False Positives(FP) = ', fpsvm)
print('False Negatives(FN) = ', fnsvm)

# Perhitungan evaluation metrics
accusvm = (tpsvm+tnsvm)/(tpsvm+tnsvm+fpsvm+fnsvm)
misclasssvm = (fnsvm+fpsvm)/(tpsvm+tnsvm+fpsvm+fnsvm)
precissvm = tpsvm/(tpsvm+fpsvm)
sensisvm = tpsvm/(tpsvm+fnsvm)
specisvm = tnsvm/(tnsvm+fpsvm)
f1svm = 2 * (precissvm*sensisvm) / (precissvm+sensisvm)


print('Accuracy is ',accusvm)
print('Misclassification is ',misclasssvm)
print('Precision is ',precissvm)
print('Sensitivity is ',sensisvm)
print('Specificity is',specisvm)
print('F-1 Score is ',f1svm)


Confusion matrix

 [[779  61]
 [271 400]]
True Positives(TP) =  400
True Negatives(TN) =  779
False Positives(FP) =  61
False Negatives(FN) =  271
Accuracy is  0.7802779616148247
Misclassification is  0.21972203838517537
Precision is  0.8676789587852495
Sensitivity is  0.5961251862891207
Specificity is 0.9273809523809524
F-1 Score is  0.7067137809187278


## Evaluation

In [66]:
#Definisikan hasil evaluasi
evallogreg = [acculogreg,misclasslogreg,precislogreg,sensilogreg,specilogreg,f1logreg]
evalnb = [accunb,misclassnb,precisnb,sensinb,specinb,f1nb]
evalsvm = [accusvm,misclasssvm,precissvm,sensisvm,specisvm,f1svm]


In [67]:
evaldf = pd.DataFrame(evallogreg, columns=['Logistic Regression'], index=['Accuracy','Misclassification','Precision','Sensitivity','Specificity','F-1 Score'])

In [68]:
evaldf['Naive Bayes'] = evalnb
evaldf['Support Vector Machine'] = evalsvm

In [69]:
evaldf

Unnamed: 0,Logistic Regression,Naive Bayes,Support Vector Machine
Accuracy,0.786234,0.786234,0.780278
Misclassification,0.213766,0.213766,0.219722
Precision,0.819853,0.808511,0.867679
Sensitivity,0.66468,0.679583,0.596125
Specificity,0.883333,0.871429,0.927381
F-1 Score,0.734156,0.738462,0.706714


## Kesimpulan
1. Model Logistic Regresson memiliki nilai akurasi yang tinggi, namun untuk sensitivity dan f1 score masih lebih rendah dibandingkan dengan Naive Bayes, dan untuk precision dan specificity masih lebih rendah dibandingkan dengan SVM.
2. Model SVM memiliki tingkat specificity dan precision yang paling tinggi, namun untuk sensitivity, akurasi dan f1 score masih lebih rendah dibandingkan dengan model lainnya
3. Model Naive Bayes memiliki tingkat akurasi, sensitivity dan f1 score yang paling tinggi, namun untuk precision dan specificity masih lebih rendah dibandingkan dengan model lainnya
4. Ketiga model tersebut sudah cukup baik dalam memprediksi disaster tweet.