### Importing necessary dependencies

In [16]:
'''For debugging puopose ,set_trace for setting breakpoint
n - > New Line
c - > Continue'''
from IPython.core.debugger import set_trace


import pandas as pd
import numpy as np
import re #Regex
import nltk #Natural language toolkit
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import spacy
from nltk.tokenize.toktok import ToktokTokenizer
import contractions
#from contractions import CONTRACTION_MAP
import unicodedata

pd.options.display.max_colwidth=200
%matplotlib inline
#autoreload all modules automatically before entering execution code typed
%load_ext autoreload
#Reload all modules (Except those excluded by %aimport)
%autoreload 2 


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
#nlp =spacy.load('en_core_web_sm',parse=False,tag=False,entity=False)
'''It returns Language object containing all components and data needed to process text'''
nlp=spacy.load('en',parse=False,tag=False,entity=False)
#print(nlp)
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
print('Text Lemma POS Tag Dep Shape is alpha is stop')
for token in doc:#Tokenization
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)
#Spacy.expain will show a short descripton
#print('\n',spacy.explain("PRON"))

tokenizer=ToktokTokenizer()
stopwords_list=nltk.corpus.stopwords.words('english')
stopwords_list.remove('no')
stopwords_list.remove('not')

Text Lemma POS Tag Dep Shape is alpha is stop
Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False

 pronoun


## SPACY
* Text: The original word text.
2. Lemma: The base form of the word.
3. POS: The simple part-of-speech tag.
4. Tag: The detailed part-of-speech tag.
5. Dep: Syntactic dependency, i.e. the relation between tokens.
6. Shape: The word shape – capitalization, punctuation, digits.
7. is alpha: Is the token an alpha character?
8. is stop: Is the token part of a stop list, i.e. the most common words of the language?

#### Cleaning Texts- Strip html

In [18]:
'''Running the document through Beautiful Soup gives us a BeautifulSoup object,
    which represents the document as a nested data structure'''
def strip_html_tags(text):
    soup=BeautifulSoup(text,"html.parser")
    #print(soup.prettify())
    stripped_text=soup.get_text()#extracting all the text from object
    return stripped_text    

#### Removing accented characters(Text Normalization)

In [19]:
#print(unicodedata.normalize('NFKD','Amélie').encode('ascii','ignore').decode('utf-8','ignore'))
'''This is the process of text normalization'''
def remove_accented_chars(text):
    text=unicodedata.normalize('NFKD',text).encode('ascii',
                                                   'ignore').decode('utf-8','ignore')
    return text

#### Expanding Contraction

In [20]:
def expand_contractions(text):
    return contractions.fix(text)

#print(expand_contractions("Cant've"))

#### Removing Special characters

In [21]:
def remove_special_characters(text):
    text = re.sub('[^a-zA-z0-9\s]',' ',str(text))
    return text

#### Lemmatizing text

In [28]:
def lemmatize_text(text):
    text=nlp(text)
    '''If word.lemma_ is not -PRON- then join it\'s lemma otherwise plain text '''
    text=' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

#### Removing Stopwords

In [23]:
def remove_stopwords(text,is_lower_case=False):
    tokens=tokenizer.tokenize(text)
    tokens=[token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens=[token for token in tokens if token not in stopwords_list]
    else:
        filtered_tokens=[token for token in tokens if token.lower() not in stopwords_list]
        
    filtered_text=' '.join(filtered_tokens)
        
    return filtered_text   
        

#### Normalize text corpus - tying it all together

In [24]:
def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True, 
                     text_lemmatization=True, special_char_removal=True, 
                     stopword_removal=True):
    
    normalized_corpus = []
    set_trace() #Setting breakpoint
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # expand contractions    
        if contraction_expansion:
            doc = expand_contractions(doc)
        # lowercase the text    
        if text_lower_case:
            doc = doc.lower()
        # remove extra newlines
        doc = re.sub(r'[\r|\n|\r\n]+', ' ',doc)
        # insert spaces between special characters to isolate them    
        special_char_pattern = re.compile(r'([{.(-)!}])')
        doc = special_char_pattern.sub(" \\1 ", doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters    
        if special_char_removal:
            doc = remove_special_characters(doc)  
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)
            
        normalized_corpus.append(doc)
        #set_trace()
        
    return normalized_corpus

#### Sample demo

In [25]:
document = """<p>Héllo! Héllo! can you hear me! I just heard about <b>Python</b>!<br/>\r\n 
              It's an amazing language which can be used for Scripting, Web development,\r\n\r\n
              Information Retrieval, Natural Language Processing, Machine Learning & Artificial Intelligence!\n
              What are you waiting for? Go and get started.<br/> He's learning, she's learning, they've already\n\n
              got a headstart!</p>
           """
document

"<p>Héllo! Héllo! can you hear me! I just heard about <b>Python</b>!<br/>\r\n \n              It's an amazing language which can be used for Scripting, Web development,\r\n\r\n\n              Information Retrieval, Natural Language Processing, Machine Learning & Artificial Intelligence!\n\n              What are you waiting for? Go and get started.<br/> He's learning, she's learning, they've already\n\n\n              got a headstart!</p>\n           "

In [26]:
normalize_corpus([document],text_lemmatization=False,
                 stopword_removal=False,text_lower_case=False)

> [1;32m<ipython-input-24-067515717a2e>[0m(9)[0;36mnormalize_corpus[1;34m()[0m
[1;32m      7 [1;33m    [0mset_trace[0m[1;33m([0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m      8 [1;33m    [1;31m# normalize each document in the corpus[0m[1;33m[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m----> 9 [1;33m    [1;32mfor[0m [0mdoc[0m [1;32min[0m [0mcorpus[0m[1;33m:[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m     10 [1;33m        [1;31m# strip HTML[0m[1;33m[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m     11 [1;33m        [1;32mif[0m [0mhtml_stripping[0m[1;33m:[0m[1;33m[0m[1;33m[0m[0m
[0m
ipdb> c


['Hello Hello can you hear me I just heard about Python it is an amazing language which can be used for Scripting Web development Information Retrieval Natural Language Processing Machine Learning Artificial Intelligence What are you waiting for Go and get started he is learning she is learning they have already got a headstart ']

In [27]:
normalize_corpus([document])

> [1;32m<ipython-input-24-067515717a2e>[0m(9)[0;36mnormalize_corpus[1;34m()[0m
[1;32m      7 [1;33m    [0mset_trace[0m[1;33m([0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m      8 [1;33m    [1;31m# normalize each document in the corpus[0m[1;33m[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m----> 9 [1;33m    [1;32mfor[0m [0mdoc[0m [1;32min[0m [0mcorpus[0m[1;33m:[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m     10 [1;33m        [1;31m# strip HTML[0m[1;33m[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m     11 [1;33m        [1;32mif[0m [0mhtml_stripping[0m[1;33m:[0m[1;33m[0m[1;33m[0m[0m
[0m
ipdb> c


['hello hello hear hear python amazing language use script web development information retrieval natural language processing machine learning artificial intelligence wait go get start learn learn already get headstart']