# Natural Language Processing with Disaster Tweets  
### (Predict which Tweets are about real disasters and which ones are not)

[Link to problem statement](https://www.kaggle.com/competitions/nlp-getting-started/overview)

In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

# Data Cleaning

In [25]:
data = pd.read_csv('../nlp-getting-started/train.csv')
data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [26]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [27]:
data.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [28]:
data.fillna("NA", inplace=True)

In [29]:
from string import punctuation
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def remove_link(sentence):
    cleaned_sentence = re.sub(r'http\S+|www\S+', '', sentence)
    return cleaned_sentence.strip()

def extractHashtags(text):
    li = re.findall(r'\#([a-zA-Z0-9_]+)', text)
    return li

def remove_words_starting_with_at(sentence):
    cleaned_sentence = re.sub(r'\@\w+\s*', '', sentence)
    return cleaned_sentence.strip()

def removePunctuations(text):
    newText = "".join([i for i in text if i not in punctuation])
    return newText

def removeStopwords(text):
    newtext = [i for i in text.split() if i not in stopwords.words("english")]
    return newtext

wnl = WordNetLemmatizer()

def lemmatize(text):
    newText = [wnl.lemmatize(ele) for ele in text]
    return newText

In [30]:
data['text_nolink'] = data['text'].apply(lambda x:remove_link(x))
data['hashtags'] = data['text_nolink'].apply(lambda x:extractHashtags(x))
data['text_nomentions'] = data['text_nolink'].apply(lambda x:remove_words_starting_with_at(x))

In [31]:
import spacy
nlp = spacy.load('en_core_web_sm')

def findLocations(text):
    doc = nlp(text)
    locations = [entity.text for entity in doc.ents if entity.label_ == 'GPE' or entity.label_ == 'LOC']
    return locations

data['extracted_locations'] = data['text_nomentions'].apply(lambda x:findLocations(x))

In [32]:
data['noPunctuations'] = data['text_nomentions'].apply(lambda x:removePunctuations(x))
data['noStopwordsTokenized'] = data['noPunctuations'].apply(lambda x:removeStopwords(x))
data['lemmatized'] = data['noStopwordsTokenized'].apply(lambda x:lemmatize(x))

In [33]:
# Join the each string in the array with a space in between
def finalize(textList):
    text = " ".join(textList)
    return text

In [34]:
data['text_cleaned'] = data['lemmatized'].apply(lambda x:finalize(x).lower())
data.head()

Unnamed: 0,id,keyword,location,text,target,text_nolink,hashtags,text_nomentions,extracted_locations,noPunctuations,noStopwordsTokenized,lemmatized,text_cleaned
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this #earthquake M...,[earthquake],Our Deeds are the Reason of this #earthquake M...,[],Our Deeds are the Reason of this earthquake Ma...,"[Our, Deeds, Reason, earthquake, May, ALLAH, F...","[Our, Deeds, Reason, earthquake, May, ALLAH, F...",our deeds reason earthquake may allah forgive u
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada,[],Forest fire near La Ronge Sask. Canada,[Canada],Forest fire near La Ronge Sask Canada,"[Forest, fire, near, La, Ronge, Sask, Canada]","[Forest, fire, near, La, Ronge, Sask, Canada]",forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...,[],All residents asked to 'shelter in place' are ...,[],All residents asked to shelter in place are be...,"[All, residents, asked, shelter, place, notifi...","[All, resident, asked, shelter, place, notifie...",all resident asked shelter place notified offi...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive #wildfires evacuation or...",[wildfires],"13,000 people receive #wildfires evacuation or...",[California],13000 people receive wildfires evacuation orde...,"[13000, people, receive, wildfires, evacuation...","[13000, people, receive, wildfire, evacuation,...",13000 people receive wildfire evacuation order...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby #Alaska as ...,"[Alaska, wildfires]",Just got sent this photo from Ruby #Alaska as ...,"[Ruby, Alaska]",Just got sent this photo from Ruby Alaska as s...,"[Just, got, sent, photo, Ruby, Alaska, smoke, ...","[Just, got, sent, photo, Ruby, Alaska, smoke, ...",just got sent photo ruby alaska smoke wildfire...


In [35]:
data['location'] = data['location'].apply(lambda x:remove_link(x))
data['location'] = data['location'].apply(lambda x:remove_words_starting_with_at(x))
data['location'] = data['location'].apply(lambda x:removePunctuations(x))
data['location'] = data['location'].apply(lambda x:removeStopwords(x))
data['location'] = data['location'].apply(lambda x:lemmatize(x))

In [36]:
data['keyword'] = data['keyword'].apply(lambda x:remove_link(x))
data['keyword'] = data['keyword'].apply(lambda x:remove_words_starting_with_at(x))
data['keyword'] = data['keyword'].apply(lambda x:removePunctuations(x))
data['keyword'] = data['keyword'].apply(lambda x:removeStopwords(x))
data['keyword'] = data['keyword'].apply(lambda x:lemmatize(x))

In [37]:
data['location'] = data['location'].apply(lambda x:finalize(x))
data['keyword'] = data['keyword'].apply(lambda x:finalize(x))

In [38]:
data['all_locations'] = data['location'] + ' ' + data['extracted_locations'].apply(lambda x: ' '.join(x))
data['all_locations'] = data['all_locations'].apply(lambda x:x.split())

In [39]:
def removeNA(li):
    ans = []
    for el in li:
        if el != 'NA':
            ans.append(el)
    return ans

In [40]:
data['all_locations'] = data['all_locations'].apply(lambda x:removeNA(x))
data['all_locations'] = data['all_locations'].apply(lambda x:list(set(x)))

In [41]:
def extract_unique_elements(row):
    return [item for item in row['hashtags'] if item not in row['all_locations']]

data['extra_keywords'] = data.apply(extract_unique_elements, axis=1)

In [42]:
data['all_keywords'] = data['keyword'] + ' ' + data['extra_keywords'].apply(lambda x:' '.join(x))
data['all_keywords'] = data['all_keywords'].apply(lambda x:x.split())
data['all_keywords'] = data['all_keywords'].apply(lambda x:removeNA(x))

In [43]:
data['all_locations'] = data['all_locations'].apply(lambda x:finalize(x))
data['all_keywords'] = data['all_keywords'].apply(lambda x:finalize(x))

In [44]:
data.head()

Unnamed: 0,id,keyword,location,text,target,text_nolink,hashtags,text_nomentions,extracted_locations,noPunctuations,noStopwordsTokenized,lemmatized,text_cleaned,all_locations,extra_keywords,all_keywords
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this #earthquake M...,[earthquake],Our Deeds are the Reason of this #earthquake M...,[],Our Deeds are the Reason of this earthquake Ma...,"[Our, Deeds, Reason, earthquake, May, ALLAH, F...","[Our, Deeds, Reason, earthquake, May, ALLAH, F...",our deeds reason earthquake may allah forgive u,,[earthquake],earthquake
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada,[],Forest fire near La Ronge Sask. Canada,[Canada],Forest fire near La Ronge Sask Canada,"[Forest, fire, near, La, Ronge, Sask, Canada]","[Forest, fire, near, La, Ronge, Sask, Canada]",forest fire near la ronge sask canada,Canada,[],
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...,[],All residents asked to 'shelter in place' are ...,[],All residents asked to shelter in place are be...,"[All, residents, asked, shelter, place, notifi...","[All, resident, asked, shelter, place, notifie...",all resident asked shelter place notified offi...,,[],
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive #wildfires evacuation or...",[wildfires],"13,000 people receive #wildfires evacuation or...",[California],13000 people receive wildfires evacuation orde...,"[13000, people, receive, wildfires, evacuation...","[13000, people, receive, wildfire, evacuation,...",13000 people receive wildfire evacuation order...,California,[wildfires],wildfires
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby #Alaska as ...,"[Alaska, wildfires]",Just got sent this photo from Ruby #Alaska as ...,"[Ruby, Alaska]",Just got sent this photo from Ruby Alaska as s...,"[Just, got, sent, photo, Ruby, Alaska, smoke, ...","[Just, got, sent, photo, Ruby, Alaska, smoke, ...",just got sent photo ruby alaska smoke wildfire...,Ruby Alaska,[wildfires],wildfires


In [45]:
data_cleaned = data

In [46]:
df = data_cleaned[['text_cleaned', 'all_locations', 'all_keywords', 'target']]

In [47]:
df['text_length'] = df['text_cleaned'].apply(lambda x:len(x))
df['word_count'] = df['text_cleaned'].apply(lambda x:len(x.split()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text_length'] = df['text_cleaned'].apply(lambda x:len(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['word_count'] = df['text_cleaned'].apply(lambda x:len(x.split()))


In [48]:
df.isna().sum()

text_cleaned     0
all_locations    0
all_keywords     0
target           0
text_length      0
word_count       0
dtype: int64

In [49]:
df.fillna("NA", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.fillna("NA", inplace=True)


In [50]:
df.isna().sum()

text_cleaned     0
all_locations    0
all_keywords     0
target           0
text_length      0
word_count       0
dtype: int64

In [51]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import make_column_transformer
cv = CountVectorizer()

text_vectorized = cv.fit_transform(df['text_cleaned']).toarray()

In [52]:
data = pd.DataFrame(text_vectorized, columns=cv.get_feature_names_out())

In [53]:
for el in ['text_length', 'word_count', 'target']:
    data[el] = df[el]

data.head()

Unnamed: 0,0011,001116,0025,005225,010156,010217,0104,010401,0106,0111,...,ûòåêcnbc,ûó,ûóbbc,ûóher,ûókody,ûónegligence,ûótech,ûówe,text_length,word_count
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,47,8
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,37,7
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,92,13
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,57,7
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,59,10


In [54]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Columns: 14996 entries, 0011 to word_count
dtypes: int64(14996)
memory usage: 871.0 MB


In [55]:
from sklearn.model_selection import train_test_split

X = data.drop(['target'], axis=1)
Y = data['target']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

In [56]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix

classifier1 = GaussianNB()
classifier1.fit(X_train, Y_train)

Y_pred = classifier1.predict(X_test)

print(confusion_matrix(Y_test, Y_pred))
print(accuracy_score(Y_test, Y_pred))

[[630 688]
 [177 789]]
0.6212784588441331


In [57]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

treeClassifier1 = DecisionTreeClassifier()
treeClassifier1.fit(X_train, Y_train)

Y_pred = treeClassifier1.predict(X_test)

print(confusion_matrix(Y_test, Y_pred))
print(accuracy_score(Y_test, Y_pred))

[[1049  269]
 [ 342  624]]
0.7324868651488616


In [58]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf.fit(X_train, Y_train)

Y_pred = clf.predict(X_test)

print(confusion_matrix(Y_test, Y_pred))
print(accuracy_score(Y_test, Y_pred))

[[1198  120]
 [ 368  598]]
0.7863397548161121


In [59]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

lr = LogisticRegression(penalty='l2', C=1, max_iter=7600, random_state=42)
lr.fit(X_train, Y_train)

Y_pred = lr.predict(X_test)

print(confusion_matrix(Y_test, Y_pred))
print(accuracy_score(Y_test, Y_pred))

[[1134  184]
 [ 272  694]]
0.8003502626970228


In [60]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix

bnb = BernoulliNB()
bnb.fit(X_train, Y_train)

Y_pred = bnb.predict(X_test)

print(confusion_matrix(Y_test, Y_pred))
print(accuracy_score(Y_test, Y_pred))

[[1204  114]
 [ 337  629]]
0.8025394045534151
