In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [2]:
data = pd.read_csv('../nlp-getting-started/train.csv')
data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [4]:
data.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [5]:
data.fillna("NA", inplace=True)

In [6]:
from string import punctuation
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def remove_link(sentence):
    cleaned_sentence = re.sub(r'http\S+|www\S+', '', sentence)
    return cleaned_sentence.strip()

def extractHashtags(text):
    li = re.findall(r'\#([a-zA-Z0-9_]+)', text)
    return li

def remove_words_starting_with_at(sentence):
    cleaned_sentence = re.sub(r'\@\w+\s*', '', sentence)
    return cleaned_sentence.strip()

def removePunctuations(text):
    newText = "".join([i for i in text if i not in punctuation])
    return newText

def removeStopwords(text):
    newtext = [i for i in text.split() if i not in stopwords.words("english")]
    return newtext

wnl = WordNetLemmatizer()

def lemmatize(text):
    newText = [wnl.lemmatize(ele) for ele in text]
    return newText

In [7]:
data['text_nolink'] = data['text'].apply(lambda x:remove_link(x))
data['hashtags'] = data['text_nolink'].apply(lambda x:extractHashtags(x))
data['text_nomentions'] = data['text_nolink'].apply(lambda x:remove_words_starting_with_at(x))

In [8]:
import spacy
nlp = spacy.load('en_core_web_sm')

def findLocations(text):
    doc = nlp(text)
    locations = [entity.text for entity in doc.ents if entity.label_ == 'GPE' or entity.label_ == 'LOC']
    return locations

data['extracted_locations'] = data['text_nomentions'].apply(lambda x:findLocations(x))

In [9]:
data['noPunctuations'] = data['text_nomentions'].apply(lambda x:removePunctuations(x))
data['noStopwordsTokenized'] = data['noPunctuations'].apply(lambda x:removeStopwords(x))
data['lemmatized'] = data['noStopwordsTokenized'].apply(lambda x:lemmatize(x))

In [10]:
# Join the each string in the array with a space in between
def finalize(textList):
    text = " ".join(textList)
    return text

In [11]:
data['text_cleaned'] = data['lemmatized'].apply(lambda x:finalize(x).lower())
data.head()

Unnamed: 0,id,keyword,location,text,target,text_nolink,hashtags,text_nomentions,extracted_locations,noPunctuations,noStopwordsTokenized,lemmatized,text_cleaned
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this #earthquake M...,[earthquake],Our Deeds are the Reason of this #earthquake M...,[],Our Deeds are the Reason of this earthquake Ma...,"[Our, Deeds, Reason, earthquake, May, ALLAH, F...","[Our, Deeds, Reason, earthquake, May, ALLAH, F...",our deeds reason earthquake may allah forgive u
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada,[],Forest fire near La Ronge Sask. Canada,[Canada],Forest fire near La Ronge Sask Canada,"[Forest, fire, near, La, Ronge, Sask, Canada]","[Forest, fire, near, La, Ronge, Sask, Canada]",forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...,[],All residents asked to 'shelter in place' are ...,[],All residents asked to shelter in place are be...,"[All, residents, asked, shelter, place, notifi...","[All, resident, asked, shelter, place, notifie...",all resident asked shelter place notified offi...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive #wildfires evacuation or...",[wildfires],"13,000 people receive #wildfires evacuation or...",[California],13000 people receive wildfires evacuation orde...,"[13000, people, receive, wildfires, evacuation...","[13000, people, receive, wildfire, evacuation,...",13000 people receive wildfire evacuation order...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby #Alaska as ...,"[Alaska, wildfires]",Just got sent this photo from Ruby #Alaska as ...,"[Ruby, Alaska]",Just got sent this photo from Ruby Alaska as s...,"[Just, got, sent, photo, Ruby, Alaska, smoke, ...","[Just, got, sent, photo, Ruby, Alaska, smoke, ...",just got sent photo ruby alaska smoke wildfire...


In [12]:
data['location'] = data['location'].apply(lambda x:remove_link(x))
data['location'] = data['location'].apply(lambda x:remove_words_starting_with_at(x))
data['location'] = data['location'].apply(lambda x:removePunctuations(x))
data['location'] = data['location'].apply(lambda x:removeStopwords(x))
data['location'] = data['location'].apply(lambda x:lemmatize(x))

In [13]:
data['keyword'] = data['keyword'].apply(lambda x:remove_link(x))
data['keyword'] = data['keyword'].apply(lambda x:remove_words_starting_with_at(x))
data['keyword'] = data['keyword'].apply(lambda x:removePunctuations(x))
data['keyword'] = data['keyword'].apply(lambda x:removeStopwords(x))
data['keyword'] = data['keyword'].apply(lambda x:lemmatize(x))

In [14]:
data.head(100)

Unnamed: 0,id,keyword,location,text,target,text_nolink,hashtags,text_nomentions,extracted_locations,noPunctuations,noStopwordsTokenized,lemmatized,text_cleaned
0,1,[NA],[NA],Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this #earthquake M...,[earthquake],Our Deeds are the Reason of this #earthquake M...,[],Our Deeds are the Reason of this earthquake Ma...,"[Our, Deeds, Reason, earthquake, May, ALLAH, F...","[Our, Deeds, Reason, earthquake, May, ALLAH, F...",our deeds reason earthquake may allah forgive u
1,4,[NA],[NA],Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada,[],Forest fire near La Ronge Sask. Canada,[Canada],Forest fire near La Ronge Sask Canada,"[Forest, fire, near, La, Ronge, Sask, Canada]","[Forest, fire, near, La, Ronge, Sask, Canada]",forest fire near la ronge sask canada
2,5,[NA],[NA],All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...,[],All residents asked to 'shelter in place' are ...,[],All residents asked to shelter in place are be...,"[All, residents, asked, shelter, place, notifi...","[All, resident, asked, shelter, place, notifie...",all resident asked shelter place notified offi...
3,6,[NA],[NA],"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive #wildfires evacuation or...",[wildfires],"13,000 people receive #wildfires evacuation or...",[California],13000 people receive wildfires evacuation orde...,"[13000, people, receive, wildfires, evacuation...","[13000, people, receive, wildfire, evacuation,...",13000 people receive wildfire evacuation order...
4,7,[NA],[NA],Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby #Alaska as ...,"[Alaska, wildfires]",Just got sent this photo from Ruby #Alaska as ...,"[Ruby, Alaska]",Just got sent this photo from Ruby Alaska as s...,"[Just, got, sent, photo, Ruby, Alaska, smoke, ...","[Just, got, sent, photo, Ruby, Alaska, smoke, ...",just got sent photo ruby alaska smoke wildfire...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,137,[accident],[Charlotte],9 Mile backup on I-77 South...accident blockin...,1,9 Mile backup on I-77 South...accident blockin...,[],9 Mile backup on I-77 South...accident blockin...,"[NC, NC, NC]",9 Mile backup on I77 Southaccident blocking th...,"[9, Mile, backup, I77, Southaccident, blocking...","[9, Mile, backup, I77, Southaccident, blocking...",9 mile backup i77 southaccident blocking right...
96,138,[accident],"[Baton, Rouge, LA]",Has an accident changed your life? We will hel...,0,Has an accident changed your life? We will hel...,[],Has an accident changed your life? We will hel...,[],Has an accident changed your life We will help...,"[Has, accident, changed, life, We, help, deter...","[Has, accident, changed, life, We, help, deter...",has accident changed life we help determine op...
97,139,[accident],"[Hagerstown, MD]",#BREAKING: there was a deadly motorcycle car a...,1,#BREAKING: there was a deadly motorcycle car a...,"[BREAKING, Hagerstown, WHAG]",#BREAKING: there was a deadly motorcycle car a...,[Hagerstown],BREAKING there was a deadly motorcycle car acc...,"[BREAKING, deadly, motorcycle, car, accident, ...","[BREAKING, deadly, motorcycle, car, accident, ...",breaking deadly motorcycle car accident happen...
98,141,[accident],"[Gloucestershire, UK]",@flowri were you marinading it or was it an ac...,0,@flowri were you marinading it or was it an ac...,[],were you marinading it or was it an accident?,[],were you marinading it or was it an accident,"[marinading, accident]","[marinading, accident]",marinading accident


In [15]:
data['location'] = data['location'].apply(lambda x:finalize(x))
data['keyword'] = data['keyword'].apply(lambda x:finalize(x))

In [16]:
data['all_locations'] = data['location'] + ' ' + data['extracted_locations'].apply(lambda x: ' '.join(x))
data['all_locations'] = data['all_locations'].apply(lambda x:x.split())

In [17]:
def removeNA(li):
    ans = []
    for el in li:
        if el != 'NA':
            ans.append(el)
    return ans

In [18]:
data['all_locations'] = data['all_locations'].apply(lambda x:removeNA(x))
data['all_locations'] = data['all_locations'].apply(lambda x:list(set(x)))

In [19]:
data.head(25)

Unnamed: 0,id,keyword,location,text,target,text_nolink,hashtags,text_nomentions,extracted_locations,noPunctuations,noStopwordsTokenized,lemmatized,text_cleaned,all_locations
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this #earthquake M...,[earthquake],Our Deeds are the Reason of this #earthquake M...,[],Our Deeds are the Reason of this earthquake Ma...,"[Our, Deeds, Reason, earthquake, May, ALLAH, F...","[Our, Deeds, Reason, earthquake, May, ALLAH, F...",our deeds reason earthquake may allah forgive u,[]
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada,[],Forest fire near La Ronge Sask. Canada,[Canada],Forest fire near La Ronge Sask Canada,"[Forest, fire, near, La, Ronge, Sask, Canada]","[Forest, fire, near, La, Ronge, Sask, Canada]",forest fire near la ronge sask canada,[Canada]
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...,[],All residents asked to 'shelter in place' are ...,[],All residents asked to shelter in place are be...,"[All, residents, asked, shelter, place, notifi...","[All, resident, asked, shelter, place, notifie...",all resident asked shelter place notified offi...,[]
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive #wildfires evacuation or...",[wildfires],"13,000 people receive #wildfires evacuation or...",[California],13000 people receive wildfires evacuation orde...,"[13000, people, receive, wildfires, evacuation...","[13000, people, receive, wildfire, evacuation,...",13000 people receive wildfire evacuation order...,[California]
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby #Alaska as ...,"[Alaska, wildfires]",Just got sent this photo from Ruby #Alaska as ...,"[Ruby, Alaska]",Just got sent this photo from Ruby Alaska as s...,"[Just, got, sent, photo, Ruby, Alaska, smoke, ...","[Just, got, sent, photo, Ruby, Alaska, smoke, ...",just got sent photo ruby alaska smoke wildfire...,"[Ruby, Alaska]"
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1,#RockyFire Update => California Hwy. 20 closed...,"[RockyFire, CAfire, wildfires]",#RockyFire Update => California Hwy. 20 closed...,[Lake County],RockyFire Update California Hwy 20 closed in ...,"[RockyFire, Update, California, Hwy, 20, close...","[RockyFire, Update, California, Hwy, 20, close...",rockyfire update california hwy 20 closed dire...,"[County, Lake]"
6,10,,,#flood #disaster Heavy rain causes flash flood...,1,#flood #disaster Heavy rain causes flash flood...,"[flood, disaster]",#flood #disaster Heavy rain causes flash flood...,"[Manitou, Colorado Springs]",flood disaster Heavy rain causes flash floodin...,"[flood, disaster, Heavy, rain, causes, flash, ...","[flood, disaster, Heavy, rain, cause, flash, f...",flood disaster heavy rain cause flash flooding...,"[Springs, Manitou, Colorado]"
7,13,,,I'm on top of the hill and I can see a fire in...,1,I'm on top of the hill and I can see a fire in...,[],I'm on top of the hill and I can see a fire in...,[],Im on top of the hill and I can see a fire in ...,"[Im, top, hill, I, see, fire, woods]","[Im, top, hill, I, see, fire, wood]",im top hill i see fire wood,[]
8,14,,,There's an emergency evacuation happening now ...,1,There's an emergency evacuation happening now ...,[],There's an emergency evacuation happening now ...,[],Theres an emergency evacuation happening now i...,"[Theres, emergency, evacuation, happening, bui...","[Theres, emergency, evacuation, happening, bui...",theres emergency evacuation happening building...,[]
9,15,,,I'm afraid that the tornado is coming to our a...,1,I'm afraid that the tornado is coming to our a...,[],I'm afraid that the tornado is coming to our a...,[],Im afraid that the tornado is coming to our area,"[Im, afraid, tornado, coming, area]","[Im, afraid, tornado, coming, area]",im afraid tornado coming area,[]


In [20]:
def extract_unique_elements(row):
    return [item for item in row['hashtags'] if item not in row['all_locations']]

data['extra_keywords'] = data.apply(extract_unique_elements, axis=1)

In [21]:
data['all_keywords'] = data['keyword'] + ' ' + data['extra_keywords'].apply(lambda x:' '.join(x))
data['all_keywords'] = data['all_keywords'].apply(lambda x:x.split())
data['all_keywords'] = data['all_keywords'].apply(lambda x:removeNA(x))

In [22]:
data['all_locations'] = data['all_locations'].apply(lambda x:finalize(x))
data['all_keywords'] = data['all_keywords'].apply(lambda x:finalize(x))

In [23]:
data.to_excel('data_engineered.xlsx', index=False)