<a href="https://colab.research.google.com/github/Arimoro2020/Natural-Language-Processing-NLP-with-Disaster-Tweets/blob/main/02_Data_Cleaning_Natural_Language_Processing_with_Disaster_Tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Mount Drive**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Import Pandas & Numpy Libraries**

In [2]:
import pandas as pd
import numpy as np

In [3]:
pd.set_option('display.max_colwidth', 70)

**Load Tweet Data**

In [4]:
train_tweets = pd.read_excel('/content/drive/MyDrive/PTDataScience/Project2/NLP/NLP2/feature.xlsx')
train_tweets.head()

Unnamed: 0,id,text,target,text_len,%punc,t_%punc,x_feature
0,1,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,57,1.8,1.124746,64.110528
1,4,Forest fire near La Ronge Sask. Canada,1,32,3.1,1.253927,40.125672
2,5,All residents asked to 'shelter in place' are being notified by of...,1,112,2.7,1.219755,136.612606
3,6,"13,000 people receive #wildfires evacuation orders in California",1,57,3.5,1.284735,73.229904
4,7,Just got sent this photo from Ruby #Alaska as smoke from #wildfire...,1,72,2.8,1.22866,88.463497


**Remove Punctuation**

In [5]:
import string

In [6]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [7]:
def remove_punc(message):
  txt_nopunc =''.join([txt for txt in message if txt not in string.punctuation])
  return txt_nopunc

In [8]:
train_tweets['text_clean'] = train_tweets['text'].apply(lambda x: remove_punc(x))
train_tweets.head()

Unnamed: 0,id,text,target,text_len,%punc,t_%punc,x_feature,text_clean
0,1,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,57,1.8,1.124746,64.110528,Our Deeds are the Reason of this earthquake May ALLAH Forgive us all
1,4,Forest fire near La Ronge Sask. Canada,1,32,3.1,1.253927,40.125672,Forest fire near La Ronge Sask Canada
2,5,All residents asked to 'shelter in place' are being notified by of...,1,112,2.7,1.219755,136.612606,All residents asked to shelter in place are being notified by offi...
3,6,"13,000 people receive #wildfires evacuation orders in California",1,57,3.5,1.284735,73.229904,13000 people receive wildfires evacuation orders in California
4,7,Just got sent this photo from Ruby #Alaska as smoke from #wildfire...,1,72,2.8,1.22866,88.463497,Just got sent this photo from Ruby Alaska as smoke from wildfires ...


**Tokenization**

In [9]:
import re

In [10]:
def tokenize(message):
  token = re.split('\W+', message)
  return token

In [11]:
train_tweets['txt_tokenized'] = train_tweets['text_clean'].apply(lambda x: tokenize(x.lower()))
train_tweets.head()

Unnamed: 0,id,text,target,text_len,%punc,t_%punc,x_feature,text_clean,txt_tokenized
0,1,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,57,1.8,1.124746,64.110528,Our Deeds are the Reason of this earthquake May ALLAH Forgive us all,"[our, deeds, are, the, reason, of, this, earthquake, may, allah, f..."
1,4,Forest fire near La Ronge Sask. Canada,1,32,3.1,1.253927,40.125672,Forest fire near La Ronge Sask Canada,"[forest, fire, near, la, ronge, sask, canada]"
2,5,All residents asked to 'shelter in place' are being notified by of...,1,112,2.7,1.219755,136.612606,All residents asked to shelter in place are being notified by offi...,"[all, residents, asked, to, shelter, in, place, are, being, notifi..."
3,6,"13,000 people receive #wildfires evacuation orders in California",1,57,3.5,1.284735,73.229904,13000 people receive wildfires evacuation orders in California,"[13000, people, receive, wildfires, evacuation, orders, in, califo..."
4,7,Just got sent this photo from Ruby #Alaska as smoke from #wildfire...,1,72,2.8,1.22866,88.463497,Just got sent this photo from Ruby Alaska as smoke from wildfires ...,"[just, got, sent, this, photo, from, ruby, alaska, as, smoke, from..."


**Remove Stopwords**

In [12]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
stopword = nltk.corpus.stopwords.words('english')

In [14]:
stopword[0:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [15]:
def no_stpwords(token_list):
  mssg = [word for word in token_list if word not in stopword]
  return mssg

In [16]:
train_tweets['txt_nostop'] = train_tweets['txt_tokenized'].apply(lambda x: no_stpwords(x))
train_tweets.head()

Unnamed: 0,id,text,target,text_len,%punc,t_%punc,x_feature,text_clean,txt_tokenized,txt_nostop
0,1,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,57,1.8,1.124746,64.110528,Our Deeds are the Reason of this earthquake May ALLAH Forgive us all,"[our, deeds, are, the, reason, of, this, earthquake, may, allah, f...","[deeds, reason, earthquake, may, allah, forgive, us]"
1,4,Forest fire near La Ronge Sask. Canada,1,32,3.1,1.253927,40.125672,Forest fire near La Ronge Sask Canada,"[forest, fire, near, la, ronge, sask, canada]","[forest, fire, near, la, ronge, sask, canada]"
2,5,All residents asked to 'shelter in place' are being notified by of...,1,112,2.7,1.219755,136.612606,All residents asked to shelter in place are being notified by offi...,"[all, residents, asked, to, shelter, in, place, are, being, notifi...","[residents, asked, shelter, place, notified, officers, evacuation,..."
3,6,"13,000 people receive #wildfires evacuation orders in California",1,57,3.5,1.284735,73.229904,13000 people receive wildfires evacuation orders in California,"[13000, people, receive, wildfires, evacuation, orders, in, califo...","[13000, people, receive, wildfires, evacuation, orders, california, ]"
4,7,Just got sent this photo from Ruby #Alaska as smoke from #wildfire...,1,72,2.8,1.22866,88.463497,Just got sent this photo from Ruby Alaska as smoke from wildfires ...,"[just, got, sent, this, photo, from, ruby, alaska, as, smoke, from...","[got, sent, photo, ruby, alaska, smoke, wildfires, pours, school, ]"


**Lemmatization**

In [17]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [18]:
word_net = nltk.WordNetLemmatizer()

In [19]:
def lemmatization(tokenized_list):
  msg = [word_net.lemmatize(word) for word in tokenized_list]
  return msg

In [20]:
train_tweets['txt_lemmatized'] = train_tweets['txt_nostop'].apply(lambda x: lemmatization(x))
train_tweets.head(10)

Unnamed: 0,id,text,target,text_len,%punc,t_%punc,x_feature,text_clean,txt_tokenized,txt_nostop,txt_lemmatized
0,1,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,57,1.8,1.124746,64.110528,Our Deeds are the Reason of this earthquake May ALLAH Forgive us all,"[our, deeds, are, the, reason, of, this, earthquake, may, allah, f...","[deeds, reason, earthquake, may, allah, forgive, us]","[deed, reason, earthquake, may, allah, forgive, u]"
1,4,Forest fire near La Ronge Sask. Canada,1,32,3.1,1.253927,40.125672,Forest fire near La Ronge Sask Canada,"[forest, fire, near, la, ronge, sask, canada]","[forest, fire, near, la, ronge, sask, canada]","[forest, fire, near, la, ronge, sask, canada]"
2,5,All residents asked to 'shelter in place' are being notified by of...,1,112,2.7,1.219755,136.612606,All residents asked to shelter in place are being notified by offi...,"[all, residents, asked, to, shelter, in, place, are, being, notifi...","[residents, asked, shelter, place, notified, officers, evacuation,...","[resident, asked, shelter, place, notified, officer, evacuation, s..."
3,6,"13,000 people receive #wildfires evacuation orders in California",1,57,3.5,1.284735,73.229904,13000 people receive wildfires evacuation orders in California,"[13000, people, receive, wildfires, evacuation, orders, in, califo...","[13000, people, receive, wildfires, evacuation, orders, california, ]","[13000, people, receive, wildfire, evacuation, order, california, ]"
4,7,Just got sent this photo from Ruby #Alaska as smoke from #wildfire...,1,72,2.8,1.22866,88.463497,Just got sent this photo from Ruby Alaska as smoke from wildfires ...,"[just, got, sent, this, photo, from, ruby, alaska, as, smoke, from...","[got, sent, photo, ruby, alaska, smoke, wildfires, pours, school, ]","[got, sent, photo, ruby, alaska, smoke, wildfire, pours, school, ]"
5,8,#RockyFire Update => California Hwy. 20 closed in both directions ...,1,93,7.5,1.496278,139.153842,RockyFire Update California Hwy 20 closed in both directions due ...,"[rockyfire, update, california, hwy, 20, closed, in, both, directi...","[rockyfire, update, california, hwy, 20, closed, directions, due, ...","[rockyfire, update, california, hwy, 20, closed, direction, due, l..."
6,10,#flood #disaster Heavy rain causes flash flooding of streets in Ma...,1,82,3.7,1.299093,106.525651,flood disaster Heavy rain causes flash flooding of streets in Mani...,"[flood, disaster, heavy, rain, causes, flash, flooding, of, street...","[flood, disaster, heavy, rain, causes, flash, flooding, streets, m...","[flood, disaster, heavy, rain, cause, flash, flooding, street, man..."
7,13,I'm on top of the hill and I can see a fire in the woods...,1,45,8.9,1.548382,69.677172,Im on top of the hill and I can see a fire in the woods,"[im, on, top, of, the, hill, and, i, can, see, a, fire, in, the, w...","[im, top, hill, see, fire, woods]","[im, top, hill, see, fire, wood]"
8,14,There's an emergency evacuation happening now in the building acro...,1,68,1.5,1.084472,73.74408,Theres an emergency evacuation happening now in the building acros...,"[theres, an, emergency, evacuation, happening, now, in, the, build...","[theres, emergency, evacuation, happening, building, across, street]","[there, emergency, evacuation, happening, building, across, street]"
9,15,I'm afraid that the tornado is coming to our area...,1,43,9.3,1.562056,67.168407,Im afraid that the tornado is coming to our area,"[im, afraid, that, the, tornado, is, coming, to, our, area]","[im, afraid, tornado, coming, area]","[im, afraid, tornado, coming, area]"


**Save DataFrame with Feature Engineering in Excel for Use in Vectorization and ML in Diffrent Collab**

In [21]:
train_tweets.to_excel('/content/drive/MyDrive/PTDataScience/Project2/NLP/NLP2/lemmatized.xlsx', index=False)