In [1]:
import warnings
import numpy as np
import pandas as pd
import nltk
warnings.filterwarnings('ignore')

In [2]:
# Three short sample documents that the lower-casing and tokenizing cells
# operate on. The text (spelling mistakes, mixed case and stray symbols
# included) is kept exactly as written.
raw_docs = [
    "I am writing very basic english sentenses",
    "I'm just writing for demo PURPOSE to make audiance understand the basics.",
    "The point is to _learn how it _works on #simple # data.",
]

### Convert to lower case

In [3]:
# `string` is imported here and used later to build the punctuation regex.
import string

# NOTE(review): `new_raw_docs` is not read by any cell visible in this
# notebook; kept in case something else relies on it.
new_raw_docs = []

# Lower-case every document; the result replaces `raw_docs`.
raw_docs = [document.lower() for document in raw_docs]
    

In [4]:
# Display the documents to confirm they are now lower-cased.
raw_docs

['i am writing very basic english sentenses',
 "i'm just writing for demo purpose to make audiance understand the basics.",
 'the point is to _learn how it _works on #simple # data.']

### Tokenizer

In [5]:
from nltk.tokenize import word_tokenize,sent_tokenize

In [6]:
# Split each document into word-level tokens: one token list per document.
raw_docs_word = [
    word_tokenize(document)
    for document in raw_docs
]

In [7]:
# Show the per-document word token lists.
print(raw_docs_word)

[['i', 'am', 'writing', 'very', 'basic', 'english', 'sentenses'], ['i', "'m", 'just', 'writing', 'for', 'demo', 'purpose', 'to', 'make', 'audiance', 'understand', 'the', 'basics', '.'], ['the', 'point', 'is', 'to', '_learn', 'how', 'it', '_works', 'on', '#', 'simple', '#', 'data', '.']]


In [8]:
# Split each document into sentences: one sentence list per document.
raw_docs_sent = [
    sent_tokenize(document)
    for document in raw_docs
]

In [9]:
# Show the per-document sentence lists.
print(raw_docs_sent)

[['i am writing very basic english sentenses'], ["i'm just writing for demo purpose to make audiance understand the basics."], ['the point is to _learn how it _works on #simple # data.']]


### Removing Punctuation

In [10]:
import re

In [11]:
# Character class matching any single character from string.punctuation.
# The punctuation is escaped so that characters such as `]`, `\` and `^`
# are treated literally inside the brackets.
punctuation_class = '[' + re.escape(string.punctuation) + ']'
regex = re.compile(punctuation_class)

In [12]:
# Strip punctuation characters from every token and flatten the
# per-document token lists into one list.
# Note: a token made up only of punctuation (e.g. '.' or '#') becomes an
# empty string and is still kept in the result.
no_punct = [
    regex.sub('', token)
    for doc_tokens in raw_docs_word
    for token in doc_tokens
]
print(no_punct)


['i', 'am', 'writing', 'very', 'basic', 'english', 'sentenses', 'i', 'm', 'just', 'writing', 'for', 'demo', 'purpose', 'to', 'make', 'audiance', 'understand', 'the', 'basics', '', 'the', 'point', 'is', 'to', 'learn', 'how', 'it', 'works', 'on', '', 'simple', '', 'data', '']


### Removing Stopwords

In [13]:
from nltk.corpus import stopwords

In [14]:
# Build the English stop-word set once. The original rebuilt it (and
# re-read the corpus word list) for every token inside the loop.
english_stop_words = set(stopwords.words('english'))

# Keep only the tokens that are not English stop words, preserving order.
# Empty-string tokens left over from punctuation removal are not stop
# words, so they pass through unchanged.
no_stop_words = [word for word in no_punct if word not in english_stop_words]
print(no_stop_words)

['writing', 'basic', 'english', 'sentenses', 'writing', 'demo', 'purpose', 'make', 'audiance', 'understand', 'basics', '', 'point', 'learn', 'works', '', 'simple', '', 'data', '']


### Stemming

In [15]:
from nltk.stem import PorterStemmer

In [16]:
# Create the Porter stemmer instance used by the stemming cell below.
ps=PorterStemmer()

In [17]:
# Reduce every remaining token to its Porter stem, preserving order.
stemmed_words = list(map(ps.stem, no_stop_words))

In [18]:
# Show the stemmed tokens.
print(stemmed_words)

['write', 'basic', 'english', 'sentens', 'write', 'demo', 'purpos', 'make', 'audianc', 'understand', 'basic', '', 'point', 'learn', 'work', '', 'simpl', '', 'data', '']


### Lemmatization

In [19]:
from nltk.stem import WordNetLemmatizer

In [20]:
# Create the WordNet lemmatizer instance used by the lemmatization cell below.
wordnetlemm=WordNetLemmatizer()

In [23]:
# Lemmatize each token with the WordNet lemmatizer, preserving order.
# NOTE(review): the input here is `stemmed_words`, i.e. the lemmatizer is
# given Porter stems rather than the original words, and the recorded
# output is identical to the stemmed list. Confirm whether
# `no_stop_words` was the intended input.
lemmetized_words = [
    wordnetlemm.lemmatize(stem)
    for stem in stemmed_words
]

In [24]:
# Show the lemmatized tokens.
print(lemmetized_words)

['write', 'basic', 'english', 'sentens', 'write', 'demo', 'purpos', 'make', 'audianc', 'understand', 'basic', '', 'point', 'learn', 'work', '', 'simpl', '', 'data', '']


### Advanced Cleaning Technique: Normalization

In [33]:
# Sample sentence for the normalization demo: it contains an ordinal
# ("10th"), a year, a plain number and two dotted abbreviations.
text = (
    'Today 10th June 2020,G.O.I has around 2 Lakh cases of Corona. '
    'M.H has highest numbers !'
)

In [34]:
from normalise import normalise

In [35]:
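# One-time setup required before `from normalise import normalise` works
# (run these once, then leave them commented out):
#   pip install normalise
#   nltk.download('brown')
#   nltk.download('names')
#   nltk.download('universal_tagset')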

In [36]:
# User-supplied abbreviation expansions handed to `normalise` through its
# `user_abbrevs` argument: abbreviation -> full form.
custom_abbr = {
    'G.O.I': 'Government of India',
    'M.H': 'Maharastra',
}

In [47]:
# Tokenize the sample sentence, then normalise the tokens using the
# custom abbreviation table, with verbose output switched off.
text_tokens = word_tokenize(text)
normalized_tokens = normalise(
    text_tokens,
    user_abbrevs=custom_abbr,
    verbose=False,
)

In [48]:
# Display the normalised tokens.
normalized_tokens

['Today',
 'the tenth of',
 'June',
 'twenty twenty',
 ',',
 'Government of India',
 'has',
 'around',
 'two',
 'Lakh',
 'cases',
 'of',
 'Corona',
 '.',
 'Maharastra',
 'has',
 'highest',
 'numbers',
 '!']