# **DATA PREPROCESSING**

### Importing Necessary Packages for Data Pre-processing

In [None]:
import nltk
from nltk.corpus import stopwords # for removing stopwords
from nltk.tokenize import RegexpTokenizer # for tokenizing
from nltk.stem import WordNetLemmatizer # for lemmatizing
from nltk.stem.porter import PorterStemmer # for stemming

import string # provides string.punctuation, used when removing punctuation

import re # for pattern matching, used for removing URLs, Usernames and Punctuations

In [None]:
# fetch the NLTK stopword corpus that stopwords.words(...) reads in the stopword-removal step
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# fetch the WordNet corpus; presumably required by the WordNetLemmatizer used in the lemmatizing step
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

### Hinglish Stopwords (Hinglish is an amalgamation of Hindi and English)


**Hinglish Stopwords**
> This list will be used to remove Hindi stopwords in the dataset as nltk.stopwords does not include the Hindi language.

In [None]:
#this list was derived from this GitHub link: https://github.com/TrigonaMinima/HinglishNLP/blob/master/data/assets/stop_hinglish

hindi_s=('''
a
aadi
aaj
aap
aapne
aata
aati
aaya
aaye
ab
abbe
abbey
abe
abhi
able
about
above
accha
according
accordingly
acha
achcha
across
actually
after
afterwards
again
against
agar
ain
aint
ain't
aisa
aise
aisi
alag
all
allow
allows
almost
alone
along
already
also
although
always
am
among
amongst
an
and
andar
another
any
anybody
anyhow
anyone
anything
anyway
anyways
anywhere
ap
apan
apart
apna
apnaa
apne
apni
appear
are
aren
arent
aren't
around
arre
as
aside
ask
asking
at
aur
avum
aya
aye
baad
baar
bad
bahut
bana
banae
banai
banao
banaya
banaye
banayi
banda
bande
bandi
bane
bani
bas
bata
batao
bc
be
became
because
become
becomes
becoming
been
before
beforehand
behind
being
below
beside
besides
best
better
between
beyond
bhai
bheetar
bhi
bhitar
bht
bilkul
bohot
bol
bola
bole
boli
bolo
bolta
bolte
bolti
both
brief
bro
btw
but
by
came
can
cannot
cant
can't
cause
causes
certain
certainly
chahiye
chaiye
chal
chalega
chhaiye
clearly
c'mon
com
come
comes
could
couldn
couldnt
couldn't
d
de
dede
dega
degi
dekh
dekha
dekhe
dekhi
dekho
denge
dhang
di
did
didn
didnt
didn't
dijiye
diya
diyaa
diye
diyo
do
does
doesn
doesnt
doesn't
doing
done
dono
dont
don't
doosra
doosre
down
downwards
dude
dunga
dungi
during
dusra
dusre
dusri
dvaara
dvara
dwaara
dwara
each
edu
eg
eight
either
ek
else
elsewhere
enough
etc
even
ever
every
everybody
everyone
everything
everywhere
ex
exactly
example
except
far
few
fifth
fir
first
five
followed
following
follows
for
forth
four
from
further
furthermore
gaya
gaye
gayi
get
gets
getting
ghar
given
gives
go
goes
going
gone
good
got
gotten
greetings
haan
had
hadd
hadn
hadnt
hadn't
hai
hain
hamara
hamare
hamari
hamne
han
happens
har
hardly
has
hasn
hasnt
hasn't
have
haven
havent
haven't
having
he
hello
help
hence
her
here
hereafter
hereby
herein
here's
hereupon
hers
herself
he's
hi
him
himself
his
hither
hm
hmm
ho
hoga
hoge
hogi
hona
honaa
hone
honge
hongi
honi
hopefully
hota
hotaa
hote
hoti
how
howbeit
however
hoyenge
hoyengi
hu
hua
hue
huh
hui
hum
humein
humne
hun
huye
huyi
i
i'd
idk
ie
if
i'll
i'm
imo
in
inasmuch
inc
inhe
inhi
inho
inka
inkaa
inke
inki
inn
inner
inse
insofar
into
inward
is
ise
isi
iska
iskaa
iske
iski
isme
isn
isne
isnt
isn't
iss
isse
issi
isski
it
it'd
it'll
itna
itne
itni
itno
its
it's
itself
ityaadi
ityadi
i've
ja
jaa
jab
jabh
jaha
jahaan
jahan
jaisa
jaise
jaisi
jata
jayega
jidhar
jin
jinhe
jinhi
jinho
jinhone
jinka
jinke
jinki
jinn
jis
jise
jiska
jiske
jiski
jisme
jiss
jisse
jitna
jitne
jitni
jo
just
jyaada
jyada
k
ka
kaafi
kab
kabhi
kafi
kaha
kahaa
kahaan
kahan
kahi
kahin
kahte
kaisa
kaise
kaisi
kal
kam
kar
kara
kare
karega
karegi
karen
karenge
kari
karke
karna
karne
karni
karo
karta
karte
karti
karu
karun
karunga
karungi
kaun
kaunsa
kayi
kch
ke
keep
keeps
keh
kehte
kept
khud
ki
kin
kine
kinhe
kinho
kinka
kinke
kinki
kinko
kinn
kino
kis
kise
kisi
kiska
kiske
kiski
kisko
kisliye
kisne
kitna
kitne
kitni
kitno
kiya
kiye
know
known
knows
ko
koi
kon
konsa
koyi
krna
krne
kuch
kuchch
kuchh
kul
kull
kya
kyaa
kyu
kyuki
kyun
kyunki
lagta
lagte
lagti
last
lately
later
le
least
lekar
lekin
less
lest
let
let's
li
like
liked
likely
little
liya
liye
ll
lo
log
logon
lol
look
looking
looks
ltd
lunga
m
maan
maana
maane
maani
maano
magar
mai
main
maine
mainly
mana
mane
mani
mano
many
mat
may
maybe
me
mean
meanwhile
mein
mera
mere
merely
meri
might
mightn
mightnt
mightn't
mil
mjhe
more
moreover
most
mostly
much
mujhe
must
mustn
mustnt
mustn't
my
myself
na
naa
naah
nahi
nahin
nai
name
namely
nd
ne
near
nearly
necessary
neeche
need
needn
neednt
needn't
needs
neither
never
nevertheless
new
next
nhi
nine
no
nobody
non
none
noone
nope
nor
normally
not
nothing
novel
now
nowhere
o
obviously
of
off
often
oh
ok
okay
old
on
once
one
ones
only
onto
or
other
others
otherwise
ought
our
ours
ourselves
out
outside
over
overall
own
par
pata
pe
pehla
pehle
pehli
people
per
perhaps
phla
phle
phli
placed
please
plus
poora
poori
provides
pura
puri
q
que
quite
raha
rahaa
rahe
rahi
rakh
rakha
rakhe
rakhen
rakhi
rakho
rather
re
really
reasonably
regarding
regardless
regards
rehte
rha
rhaa
rhe
rhi
ri
right
s
sa
saara
saare
saath
sab
sabhi
sabse
sahi
said
sakta
saktaa
sakte
sakti
same
sang
sara
sath
saw
say
saying
says
se
second
secondly
see
seeing
seem
seemed
seeming
seems
seen
self
selves
sensible
sent
serious
seriously
seven
several
shall
shan
shant
shan't
she
she's
should
shouldn
shouldnt
shouldn't
should've
si
since
six
so
soch
some
somebody
somehow
someone
something
sometime
sometimes
somewhat
somewhere
soon
still
sub
such
sup
sure
t
tab
tabh
tak
take
taken
tarah
teen
teeno
teesra
teesre
teesri
tell
tends
tera
tere
teri
th
tha
than
thank
thanks
thanx
that
that'll
thats
that's
the
theek
their
theirs
them
themselves
then
thence
there
thereafter
thereby
therefore
therein
theres
there's
thereupon
these
they
they'd
they'll
they're
they've
thi
thik
thing
think
thinking
third
this
tho
thoda
thodi
thorough
thoroughly
those
though
thought
three
through
throughout
thru
thus
tjhe
to
together
toh
too
took
toward
towards
tried
tries
true
truly
try
trying
tu
tujhe
tum
tumhara
tumhare
tumhari
tune
twice
two
um
umm
un
under
unhe
unhi
unho
unhone
unka
unkaa
unke
unki
unko
unless
unlikely
unn
unse
until
unto
up
upar
upon
us
use
used
useful
uses
usi
using
uska
uske
usne
uss
usse
ussi
usually
vaala
vaale
vaali
vahaan
vahan
vahi
vahin
vaisa
vaise
vaisi
vala
vale
vali
various
ve
very
via
viz
vo
waala
waale
waali
wagaira
wagairah
wagerah
waha
wahaan
wahan
wahi
wahin
waisa
waise
waisi
wala
wale
wali
want
wants
was
wasn
wasnt
wasn't
way
we
we'd
well
we'll
went
were
we're
weren
werent
weren't
we've
what
whatever
what's
when
whence
whenever
where
whereafter
whereas
whereby
wherein
where's
whereupon
wherever
whether
which
while
who
whoever
whole
whom
who's
whose
why
will
willing
with
within
without
wo
woh
wohi
won
wont
won't
would
wouldn
wouldnt
wouldn't
y
ya
yadi
yah
yaha
yahaan
yahan
yahi
yahin
ye
yeah
yeh
yehi
yes
yet
you
you'd
you'll
your
you're
yours
yourself
yourselves
you've
yup''')
# str.split() with no argument splits on any run of whitespace (including the
# newlines separating the entries above) and already returns a list of str,
# so no extra list(map(str, ...)) wrapping is needed.
# Kept as a list (not a set) because remove_stopwords concatenates it with `+`.
hindi_list = hindi_s.split()

In [None]:
# display the parsed stopword list
print(hindi_list)

['a', 'aadi', 'aaj', 'aap', 'aapne', 'aata', 'aati', 'aaya', 'aaye', 'ab', 'abbe', 'abbey', 'abe', 'abhi', 'able', 'about', 'above', 'accha', 'according', 'accordingly', 'acha', 'achcha', 'across', 'actually', 'after', 'afterwards', 'again', 'against', 'agar', 'ain', 'aint', "ain't", 'aisa', 'aise', 'aisi', 'alag', 'all', 'allow', 'allows', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'an', 'and', 'andar', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'ap', 'apan', 'apart', 'apna', 'apnaa', 'apne', 'apni', 'appear', 'are', 'aren', 'arent', "aren't", 'around', 'arre', 'as', 'aside', 'ask', 'asking', 'at', 'aur', 'avum', 'aya', 'aye', 'baad', 'baar', 'bad', 'bahut', 'bana', 'banae', 'banai', 'banao', 'banaya', 'banaye', 'banayi', 'banda', 'bande', 'bandi', 'bane', 'bani', 'bas', 'bata', 'batao', 'bc', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand',

### The dataset

In [None]:
import pandas as pd

# load the raw tweets; the path is relative to the notebook's working directory
data=pd.read_csv("Depression_Tweet.csv")
# (rows, columns) of the loaded frame
data.shape

(670, 3)

In [None]:
# None lifts the column-width limit so each tweet is displayed in full
pd.set_option('display.max_colwidth',None)
# double brackets select a one-column DataFrame rather than a Series
data[['TWEET']]

Unnamed: 0,TWEET
0,@triniteddybear thank you have an awesome week.
1,the of end the new JONAS was amazing... aw :'( yep dats a tear. (kinda not really). @ItsChelseaStaub you were awesome.. nice spit take
2,"@rhyzome the blog is pretty lame, but the story of the unreleased bike seat as a tip was awesome"
3,"""Tweeting for Fun, Cruise, Vibes and Insha Allah... #IghaloFC Stan account #Bridget Bema Stan Account \ud83d\ude47\ud83c\udffe\u200d\u2642\ufe0f\ud83d\ude47\ud83c\udffe\u200d\u2642\ufe0f"","
4,finds it interesting that a Vespa counts as a motorcycle in Maryland and Florida AWESOME!!!!
...,...
665,"http://recipekey.com/r/zykfx a great veggie lasagna!! happy marathon monday, boston folks"
666,Some people may say you can't be sad because someone else may have it worse is just like saying someone can't be happy because someone else might have it better. Depression is one of the most dehumanization and tiring emotional experience anyone can encounter in their life
667,"@MissPureGold *Hug..... I simply meant alot of us are sad here, not sure of depression though."
668,"Happy 19th Birthday, Chryslyn."


In [None]:
# str() every entry so the regex/string helpers in the following steps always receive text
data['TWEET'] = list(map(str, data['TWEET']))
data[['TWEET']]

Unnamed: 0,TWEET
0,@triniteddybear thank you have an awesome week.
1,the of end the new JONAS was amazing... aw :'( yep dats a tear. (kinda not really). @ItsChelseaStaub you were awesome.. nice spit take
2,"@rhyzome the blog is pretty lame, but the story of the unreleased bike seat as a tip was awesome"
3,"""Tweeting for Fun, Cruise, Vibes and Insha Allah... #IghaloFC Stan account #Bridget Bema Stan Account \ud83d\ude47\ud83c\udffe\u200d\u2642\ufe0f\ud83d\ude47\ud83c\udffe\u200d\u2642\ufe0f"","
4,finds it interesting that a Vespa counts as a motorcycle in Maryland and Florida AWESOME!!!!
...,...
665,"http://recipekey.com/r/zykfx a great veggie lasagna!! happy marathon monday, boston folks"
666,Some people may say you can't be sad because someone else may have it worse is just like saying someone can't be happy because someone else might have it better. Depression is one of the most dehumanization and tiring emotional experience anyone can encounter in their life
667,"@MissPureGold *Hug..... I simply meant alot of us are sad here, not sure of depression though."
668,"Happy 19th Birthday, Chryslyn."


### **1. Removing URLs**

In [None]:
def remove_url(text):
  """Strip URLs, escaped-unicode residue and digits from a tweet.

  Despite its name this helper removes several kinds of noise:
  ``http...`` links, ``pic.<domain>`` media links, emoji that arrived as
  literal ``\\uXXXX`` escape text, non-breaking spaces (as the character or
  as literal ``\\xa0`` text) and all ASCII digits.

  Args:
    text: the raw tweet as a ``str``.

  Returns:
    The cleaned tweet. Whitespace around removed spans is left as is, so a
    removed link may leave two consecutive spaces behind.
  """
  # Links go first, while the digits inside them are still present.
  cleaned = re.sub(r'http\S+', '', text)
  # Media links such as pic.twitter.com/...; the dot is escaped so ordinary
  # words like "picture" or "epic story" are no longer swallowed.
  cleaned = re.sub(r'pic\.\S+', '', cleaned)
  # Emoji/surrogates stored as literal escape text, e.g. "\ud83d\ude47".
  # Matching the full backslash-u-4-hex form avoids truncating normal words
  # that merely contain "ud" ("student", "loudly") and leaves no stray "\".
  cleaned = re.sub(r'\\u[0-9a-fA-F]{4}', '', cleaned)
  # Non-breaking spaces, either as literal "\xa0" text or as the character.
  # NOTE(review): replaced by a plain space so the following word survives;
  # the previous pattern also deleted that word — confirm this is wanted.
  cleaned = re.sub(r'\\xa0|\xa0', ' ', cleaned)
  # Digits last: stripping them earlier would break the "xa0" and "\uXXXX"
  # patterns above before they get a chance to match.
  cleaned = re.sub(r'[0-9]', '', cleaned)
  return cleaned

# run the URL/noise cleaner over every tweet and store the result back in place
data['TWEET'] = data['TWEET'].apply(remove_url)
# (list(data['TWEET']) shows every row instead of the truncated preview)
data[['TWEET']]


Unnamed: 0,TWEET
0,@triniteddybear thank you have an awesome week.
1,the of end the new JONAS was amazing... aw :'( yep dats a tear. (kinda not really). @ItsChelseaStaub you were awesome.. nice spit take
2,"@rhyzome the blog is pretty lame, but the story of the unreleased bike seat as a tip was awesome"
3,"""Tweeting for Fun, Cruise, Vibes and Insha Allah... #IghaloFC Stan account #Bridget Bema Stan Account \"
4,finds it interesting that a Vespa counts as a motorcycle in Maryland and Florida AWESOME!!!!
...,...
665,"a great veggie lasagna!! happy marathon monday, boston folks"
666,Some people may say you can't be sad because someone else may have it worse is just like saying someone can't be happy because someone else might have it better. Depression is one of the most dehumanization and tiring emotional experience anyone can encounter in their life
667,"@MissPureGold *Hug..... I simply meant alot of us are sad here, not sure of depression though."
668,"Happy th Birthday, Chryslyn."


### **2. Removing Usernames (@abc)**

In [None]:
def remove_username(text):
  """Remove Twitter-style ``@username`` mentions from a tweet.

  Every ``@`` followed by one or more word characters (letters, digits,
  underscore) is deleted. Whitespace around the mention is kept, and an
  ``@`` inside an e-mail address matches as well.

  Args:
    text: the tweet as a ``str``.

  Returns:
    The tweet with all mentions removed.
  """
  # Raw string: "\w" in a normal string literal is an invalid escape
  # sequence and triggers a warning on recent Python versions.
  return re.sub(r'@\w+', '', text)

# strip @mentions from every tweet and store the result back in place
data['TWEET'] = data['TWEET'].apply(remove_username)
data[['TWEET']]

Unnamed: 0,TWEET
0,thank you have an awesome week.
1,the of end the new JONAS was amazing... aw :'( yep dats a tear. (kinda not really). you were awesome.. nice spit take
2,"the blog is pretty lame, but the story of the unreleased bike seat as a tip was awesome"
3,"""Tweeting for Fun, Cruise, Vibes and Insha Allah... #IghaloFC Stan account #Bridget Bema Stan Account \"
4,finds it interesting that a Vespa counts as a motorcycle in Maryland and Florida AWESOME!!!!
...,...
665,"a great veggie lasagna!! happy marathon monday, boston folks"
666,Some people may say you can't be sad because someone else may have it worse is just like saying someone can't be happy because someone else might have it better. Depression is one of the most dehumanization and tiring emotional experience anyone can encounter in their life
667,"*Hug..... I simply meant alot of us are sad here, not sure of depression though."
668,"Happy th Birthday, Chryslyn."


### **3. Removing Punctuation**

In [None]:
def remove_punctuation(text):
  """Delete every ASCII punctuation character from a tweet.

  Characters listed in ``string.punctuation`` are dropped without a
  replacement, so ``"can't"`` becomes ``"cant"``; everything else
  (letters, digits, whitespace, non-ASCII symbols) is kept unchanged.

  Args:
    text: the tweet as a ``str``.

  Returns:
    The tweet without ASCII punctuation.
  """
  # a translation table whose third argument lists the characters to delete
  drop_table = str.maketrans('', '', string.punctuation)
  return text.translate(drop_table)

# drop punctuation from every tweet and store the result back in place
data['TWEET'] = data['TWEET'].apply(remove_punctuation)
data[['TWEET']]


Unnamed: 0,TWEET
0,thank you have an awesome week
1,the of end the new JONAS was amazing aw yep dats a tear kinda not really you were awesome nice spit take
2,the blog is pretty lame but the story of the unreleased bike seat as a tip was awesome
3,Tweeting for Fun Cruise Vibes and Insha Allah IghaloFC Stan account Bridget Bema Stan Account
4,finds it interesting that a Vespa counts as a motorcycle in Maryland and Florida AWESOME
...,...
665,a great veggie lasagna happy marathon monday boston folks
666,Some people may say you cant be sad because someone else may have it worse is just like saying someone cant be happy because someone else might have it better Depression is one of the most dehumanization and tiring emotional experience anyone can encounter in their life
667,Hug I simply meant alot of us are sad here not sure of depression though
668,Happy th Birthday Chryslyn


### **4. Tokenize**

In [None]:
# r'\w+' makes every run of word characters one token
tokenizer = RegexpTokenizer(r'\w+')

# lower-case each tweet first, then split it; from here on a tweet is a list of tokens
data['TWEET'] = data['TWEET'].apply(
    lambda tweet: tokenizer.tokenize(tweet.lower())
)
data[['TWEET']]


Unnamed: 0,TWEET
0,"[thank, you, have, an, awesome, week]"
1,"[the, of, end, the, new, jonas, was, amazing, aw, yep, dats, a, tear, kinda, not, really, you, were, awesome, nice, spit, take]"
2,"[the, blog, is, pretty, lame, but, the, story, of, the, unreleased, bike, seat, as, a, tip, was, awesome]"
3,"[tweeting, for, fun, cruise, vibes, and, insha, allah, ighalofc, stan, account, bridget, bema, stan, account]"
4,"[finds, it, interesting, that, a, vespa, counts, as, a, motorcycle, in, maryland, and, florida, awesome]"
...,...
665,"[a, great, veggie, lasagna, happy, marathon, monday, boston, folks]"
666,"[some, people, may, say, you, cant, be, sad, because, someone, else, may, have, it, worse, is, just, like, saying, someone, cant, be, happy, because, someone, else, might, have, it, better, depression, is, one, of, the, most, dehumanization, and, tiring, emotional, experience, anyone, can, encounter, in, their, life]"
667,"[hug, i, simply, meant, alot, of, us, are, sad, here, not, sure, of, depression, though]"
668,"[happy, th, birthday, chryslyn]"


### **5. Removing Stopwords**

In [None]:
def remove_stopwords(text):
  """Drop stopwords from a tokenized tweet.

  A token is removed when it appears in NLTK's English or French stopword
  lists or in the Hinglish ``hindi_list`` defined earlier in this notebook.
  Order and duplicates of the remaining tokens are preserved.

  Args:
    text: the tweet as a list of (lower-cased) tokens.

  Returns:
    A new list holding only the tokens that are not stopwords.
  """
  # Build the lookup once per call instead of once per token: the original
  # re-created the concatenated list (including the corpus lookups) for every
  # word and then scanned it linearly. A set gives constant-time membership.
  # NOTE(review): because the French list is included, any English word that
  # is spelled like a French stopword is dropped too — confirm this is intended.
  stop_set = set(stopwords.words('english'))
  stop_set.update(stopwords.words('french'), hindi_list)
  return [word for word in text if word not in stop_set]

# filter the stopwords out of every token list and store the result back in place
data['TWEET'] = data['TWEET'].apply(remove_stopwords)
data[['TWEET']]

Unnamed: 0,TWEET
0,"[awesome, week]"
1,"[end, jonas, amazing, aw, yep, dats, tear, kinda, awesome, nice, spit]"
2,"[blog, pretty, lame, story, unreleased, bike, seat, tip, awesome]"
3,"[tweeting, fun, cruise, vibes, insha, allah, ighalofc, stan, account, bridget, bema, stan, account]"
4,"[finds, interesting, vespa, counts, motorcycle, maryland, florida, awesome]"
...,...
665,"[great, veggie, lasagna, happy, marathon, monday, boston, folks]"
666,"[sad, worse, happy, depression, dehumanization, tiring, emotional, experience, encounter, life]"
667,"[hug, simply, meant, alot, sad, depression]"
668,"[happy, birthday, chryslyn]"


### **6. Stemming and Lemmatizing**

#### **6.1. Lemmatizing**

In [None]:
# one shared WordNet lemmatizer instance, reused for every tweet
lemmatizer = WordNetLemmatizer()

def word_lemmatizer(text):
  """Lemmatize each token of a tokenized tweet.

  Args:
    text: the tweet as a list of tokens.

  Returns:
    A new list with ``lemmatizer.lemmatize`` applied to every token, in the
    original order. No part-of-speech tag is passed, so the lemmatizer's
    default is used.
  """
  return list(map(lemmatizer.lemmatize, text))

# NOTE(review): the lemmatized result is only displayed, never assigned back to
# data['TWEET'], so the stemming step below runs on the un-lemmatized tokens —
# confirm whether this cell is meant as a preview only
data['TWEET'].apply(word_lemmatizer)


0                                                                                         [awesome, week]
1                                   [end, jonas, amazing, aw, yep, dat, tear, kinda, awesome, nice, spit]
2                                       [blog, pretty, lame, story, unreleased, bike, seat, tip, awesome]
3      [tweeting, fun, cruise, vibe, insha, allah, ighalofc, stan, account, bridget, bema, stan, account]
4                               [find, interesting, vespa, count, motorcycle, maryland, florida, awesome]
                                                      ...                                                
665                                       [great, veggie, lasagna, happy, marathon, monday, boston, folk]
666       [sad, worse, happy, depression, dehumanization, tiring, emotional, experience, encounter, life]
667                                                           [hug, simply, meant, alot, sad, depression]
668                                           

#### **6.2. Stemming**

In [None]:
# one shared Porter stemmer instance, reused for every tweet
stemmer = PorterStemmer()

def word_stemmer(text):
  """Stem each token of a tokenized tweet and join the stems into one string.

  Args:
    text: the tweet as a list of tokens.

  Returns:
    A single ``str`` with the stemmed tokens separated by one space; an
    empty token list yields an empty string.
  """
  stems = map(stemmer.stem, text)
  return " ".join(stems)

# stem every token list; after this step each tweet is a plain string again
data['TWEET'] = data['TWEET'].apply(word_stemmer)
data[['TWEET']]

Unnamed: 0,TWEET
0,awesom week
1,end jona amaz aw yep dat tear kinda awesom nice spit
2,blog pretti lame stori unreleas bike seat tip awesom
3,tweet fun cruis vibe insha allah ighalofc stan account bridget bema stan account
4,find interest vespa count motorcycl maryland florida awesom
...,...
665,great veggi lasagna happi marathon monday boston folk
666,sad wors happi depress dehuman tire emot experi encount life
667,hug simpli meant alot sad depress
668,happi birthday chryslyn


# **CREATING NEW DATASET WITH PRE-PROCESSED DATA**
> This dataset will be used for further data analysis

In [None]:
# show the whole frame (all columns) after the last pre-processing step
data

Unnamed: 0,TID,TWEET,LABEL
0,250,awesom week,1
1,95,end jona amaz aw yep dat tear kinda awesom nice spit,1
2,554,blog pretti lame stori unreleas bike seat tip awesom,1
3,766,tweet fun cruis vibe insha allah ighalofc stan account bridget bema stan account,1
4,617,find interest vespa count motorcycl maryland florida awesom,1
...,...,...,...
665,137,great veggi lasagna happi marathon monday boston folk,1
666,255,sad wors happi depress dehuman tire emot experi encount life,0
667,636,hug simpli meant alot sad depress,0
668,562,happi birthday chryslyn,1


#### Creating a New Dataset with the Pre-processed data

In [None]:
# index=False keeps the DataFrame index out of the written file.
# NOTE(review): the reloaded file shows one missing TWEET; presumably a tweet that was
# reduced to an empty string by the cleaning steps — confirm whether such rows should be
# dropped before saving
data.to_csv('Depression_Tweet_Preprocessed.csv', index=False)

In [None]:
# reload the saved file into a separate DataFrame
pdata=pd.read_csv('Depression_Tweet_Preprocessed.csv')
pdata

Unnamed: 0,TID,TWEET,LABEL
0,250,awesom week,1
1,95,end jona amaz aw yep dat tear kinda awesom nice spit,1
2,554,blog pretti lame stori unreleas bike seat tip awesom,1
3,766,tweet fun cruis vibe insha allah ighalofc stan account bridget bema stan account,1
4,617,find interest vespa count motorcycl maryland florida awesom,1
...,...,...,...
665,137,great veggi lasagna happi marathon monday boston folk,1
666,255,sad wors happi depress dehuman tire emot experi encount life,0
667,636,hug simpli meant alot sad depress,0
668,562,happi birthday chryslyn,1


In [None]:
# check for any null values: isnull() flags missing cells, sum() counts them per column

pdata.isnull().sum()

TID      0
TWEET    1
LABEL    0
dtype: int64

In [None]:
# 1 null value exists

# keep only the rows in which at least one column is missing
null_val = pdata[pdata.isnull().any(axis=1)]
null_val

Unnamed: 0,TID,TWEET,LABEL
244,614,,1


In [None]:
# removing the null value: dropna() returns a new DataFrame and leaves the
# original untouched, so the result has to be bound back to pdata — otherwise
# the row with the missing tweet is still present in every later cell
pdata = pdata.dropna()
pdata

Unnamed: 0,TID,TWEET,LABEL
0,250,awesom week,1
1,95,end jona amaz aw yep dat tear kinda awesom nice spit,1
2,554,blog pretti lame stori unreleas bike seat tip awesom,1
3,766,tweet fun cruis vibe insha allah ighalofc stan account bridget bema stan account,1
4,617,find interest vespa count motorcycl maryland florida awesom,1
...,...,...,...
665,137,great veggi lasagna happi marathon monday boston folk,1
666,255,sad wors happi depress dehuman tire emot experi encount life,0
667,636,hug simpli meant alot sad depress,0
668,562,happi birthday chryslyn,1
