# Tutorial 2: Build your own text pre-processor

This shows a sequence of common pre-processing functions you can use to build your own text pre-processor.

We also show you how to build a fast text pre-processor which can use multi-threading if your CPU is fast enough.

__Note:__ There is no perfect set of pre-processing steps. The right combination depends on the problem at hand, and is best found by trying different steps and inspecting the results.

# Install Dependencies

In [1]:
!pip install textsearch
!pip install contractions
!pip install tqdm
import nltk
nltk.download('punkt')
nltk.download('stopwords')

Collecting textsearch
  Downloading https://files.pythonhosted.org/packages/42/a8/03407021f9555043de5492a2bd7a35c56cc03c2510092b5ec018cae1bbf1/textsearch-0.0.17-py2.py3-none-any.whl
Collecting pyahocorasick
[?25l  Downloading https://files.pythonhosted.org/packages/f4/9f/f0d8e8850e12829eea2e778f1c90e3c53a9a799b7f412082a5d21cd19ae1/pyahocorasick-1.4.0.tar.gz (312kB)
[K     |████████████████████████████████| 317kB 8.0MB/s 
[?25hCollecting Unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/74/65/91eab655041e9e92f948cb7302e54962035762ce7b518272ed9d6b269e93/Unidecode-1.1.2-py2.py3-none-any.whl (239kB)
[K     |████████████████████████████████| 245kB 12.0MB/s 
[?25hBuilding wheels for collected packages: pyahocorasick
  Building wheel for pyahocorasick (setup.py) ... [?25l[?25hdone
  Created wheel for pyahocorasick: filename=pyahocorasick-1.4.0-cp36-cp36m-linux_x86_64.whl size=81697 sha256=5872ba40dd08acb8c2a58c6442c9d4ee39aecc2b893ade734032031f0a1b9070
  Stored in d

True

In [2]:
import re
from bs4 import BeautifulSoup
import unicodedata
import contractions
import spacy
import nltk

# Load the spaCy English pipeline once; spacy_lemmatize_text() reuses this object.
# NOTE(review): the 'en' shortcut name looks spaCy 2.x-specific - confirm against the installed version.
nlp = spacy.load('en')
# Shared Porter stemmer, used as the default `stemmer` argument of simple_stemming().
ps = nltk.porter.PorterStemmer()

# HTML removal
def strip_html_tags(text):
    """Return the text content of an HTML fragment.

    Args:
        text: String that may contain HTML markup.

    Returns:
        The text extracted by BeautifulSoup with ``iframe`` and ``script``
        elements removed, and every run of carriage returns / newlines
        collapsed into a single ``\\n``.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Remove embedded frames and scripts so their contents do not leak into the text.
    for tag in soup(['iframe', 'script']):
        tag.extract()
    stripped_text = soup.get_text()
    # Only CR and LF belong in the character class; a '|' inside [...] is a
    # literal pipe and would turn every pipe in the text into a newline.
    stripped_text = re.sub(r'[\r\n]+', '\n', stripped_text)
    return stripped_text

# accent removal
def remove_accented_chars(text):
    """Replace accented characters with their unaccented ASCII base letters.

    The text is decomposed with NFKD normalization, encoded to ASCII while
    ignoring anything that cannot be encoded, and decoded back to ``str``.
    Characters with no ASCII decomposition are dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

# contraction expansion
def expand_contractions(text):
    """Return `text` after passing it through ``contractions.fix``."""
    expanded_text = contractions.fix(text)
    return expanded_text

# lemmatization
def spacy_lemmatize_text(text):
    """Lemmatize `text` with the module-level spaCy pipeline `nlp`.

    Each token is replaced by its lemma, except tokens whose lemma is the
    placeholder ``'-PRON-'``, which keep their original surface text. The
    resulting pieces are joined with single spaces.
    """
    pieces = []
    for token in nlp(text):
        if token.lemma_ != '-PRON-':
            pieces.append(token.lemma_)
        else:
            # Keep the original text rather than the '-PRON-' placeholder.
            pieces.append(token.text)
    return ' '.join(pieces)

# stemming
def simple_stemming(text, stemmer=ps):
    """Stem every whitespace-separated word of `text`.

    Args:
        text: Input string; it is split on whitespace.
        stemmer: Object exposing ``stem(word)``; defaults to the module-level
            Porter stemmer `ps`.

    Returns:
        The stemmed words joined with single spaces.
    """
    stemmed_words = []
    for word in text.split():
        stemmed_words.append(stemmer.stem(word))
    return ' '.join(stemmed_words)

# special character removal
def remove_special_characters(text, remove_digits=False):
    """Delete every character that is not an ASCII letter or whitespace.

    Args:
        text: Input string.
        remove_digits: When False (default) digits 0-9 are kept as well;
            when True they are removed too.

    Returns:
        The filtered string.
    """
    if remove_digits:
        unwanted = r'[^a-zA-Z\s]'
    else:
        unwanted = r'[^a-zA-Z0-9\s]'
    return re.sub(unwanted, '', text)

# stopword removal
def remove_stopwords(text, is_lower_case=False, stopwords=None):
    """Remove stopword tokens from `text`.

    Args:
        text: Input string; tokenized with ``nltk.word_tokenize``.
        is_lower_case: When True the tokens are compared against the stopword
            collection as-is; when False each token is lower-cased for the
            comparison (the kept tokens retain their original casing).
        stopwords: Collection of stopwords. When omitted or empty, NLTK's
            English stopword list is used.

    Returns:
        The remaining tokens joined with single spaces.
    """
    if not stopwords:
        stopwords = nltk.corpus.stopwords.words('english')
    # Build the lookup set once: testing each token against a list is a linear
    # scan per token.
    stopword_set = set(stopwords)
    tokens = [token.strip() for token in nltk.word_tokenize(text)]

    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_set]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_set]

    return ' '.join(filtered_tokens)

Handling extra newlines and carriage returns

In [3]:
# Sample string that mixes CRLF line endings with tab separators.
s = 'hello\r\nhow are you doing\r\nI\tam\tdoing\tgreat\r\n:)'
print(s)

hello
how are you doing
I	am	doing	great
:)


In [4]:
# Map each newline, tab and carriage return to a single space.
s.translate(s.maketrans("\n\t\r", "   "))

'hello  how are you doing  I am doing great  :)'

## Your Turn: Add in all the necessary functions and build your pre-processor!

In [5]:
import tqdm # for nice progressbar

def text_pre_processor(text, html_strip=True, accented_char_removal=True, contraction_expansion=True,
                       text_lower_case=True, text_stemming=False, text_lemmatization=True, 
                       special_char_removal=True, remove_digits=True, stopword_removal=True, 
                       stopword_list=None):
    """Normalize one document by running the enabled pre-processing steps in order.

    Steps, in execution order: HTML stripping, newline/tab/carriage-return
    replacement (always applied), accent removal, contraction expansion,
    lemmatization, special-character removal, stemming, lower-casing,
    stopword removal and whitespace clean-up (always applied).

    Args:
        text: Raw document string.
        html_strip: Strip HTML markup with ``strip_html_tags``.
        accented_char_removal: Apply ``remove_accented_chars``.
        contraction_expansion: Apply ``expand_contractions``.
        text_lower_case: Lower-case the text.
        text_stemming: Apply ``simple_stemming``; ignored when
            `text_lemmatization` is also True.
        text_lemmatization: Apply ``spacy_lemmatize_text``.
        special_char_removal: Apply ``remove_special_characters``.
        remove_digits: Forwarded to ``remove_special_characters``.
        stopword_removal: Apply ``remove_stopwords``.
        stopword_list: Custom stopwords forwarded to ``remove_stopwords``.

    Returns:
        The normalized text with runs of spaces collapsed and the ends stripped.
    """
    # strip HTML
    if html_strip:
        text = strip_html_tags(text)

    # replace newlines, tabs and carriage returns with spaces
    # (often present in really noisy text)
    text = text.translate(text.maketrans("\n\t\r", "   "))

    # remove accented characters
    if accented_char_removal:
        text = remove_accented_chars(text)

    # expand contractions
    if contraction_expansion:
        text = expand_contractions(text)

    # lemmatize text
    if text_lemmatization:
        text = spacy_lemmatize_text(text)

    # remove special characters and/or digits
    if special_char_removal:
        # Insert spaces around these characters first, so the words they
        # separate stay apart once the characters are removed. The hyphen is
        # escaped so it is matched literally: written as `(-)` inside the
        # class it is the range '(' .. ')' and hyphens are never isolated
        # (e.g. 'record-holder' would collapse to 'recordholder').
        special_char_pattern = re.compile(r'([{.()\-!}])')
        text = special_char_pattern.sub(" \\1 ", text)
        text = remove_special_characters(text, remove_digits=remove_digits)

    # stem text (skipped when the text was already lemmatized)
    if text_stemming and not text_lemmatization:
        text = simple_stemming(text)

    # lowercase the text
    if text_lower_case:
        text = text.lower()

    # remove stopwords
    if stopword_removal:
        text = remove_stopwords(text, is_lower_case=text_lower_case,
                                stopwords=stopword_list)

    # remove extra whitespace
    text = re.sub(' +', ' ', text)
    text = text.strip()

    return text

  
def corpus_pre_processor(corpus):
    """Run ``text_pre_processor`` with default options on every document.

    Iteration is wrapped in ``tqdm.tqdm`` to show a progress bar. Returns a
    list of normalized documents in the same order as `corpus`.
    """
    return [text_pre_processor(doc) for doc in tqdm.tqdm(corpus)]

# Test on a single document

In [6]:
document = """<p>Héllo! Héllo! can you hear me! I just heard about <b>Python</b>!<br/>\r\n 
              It's an amazing language which can be used for [Scripting\tWeb development\tBackend development],\r\n\r\n
              Information Retrieval, Natural Language Processing, Machine Learning & Artificial Intelligence!\n
              What are you waiting for? Go and get started.<br/> He's learning, she's learning, they've already\n\n
              got a headstart! GET PYTHON 3.6 NOW!</p>
           """
print(document)

<p>Héllo! Héllo! can you hear me! I just heard about <b>Python</b>!<br/>
 
              It's an amazing language which can be used for [Scripting	Web development	Backend development],


              Information Retrieval, Natural Language Processing, Machine Learning & Artificial Intelligence!

              What are you waiting for? Go and get started.<br/> He's learning, she's learning, they've already


              got a headstart! GET PYTHON 3.6 NOW!</p>
           


In [8]:
# Run the pipeline with all default options on the sample document.
text_pre_processor(document)

'hello hello hear hear python amazing language use scripting web development backend development information retrieval natural language processing machine learning artificial intelligence wait go get start learn learn already get headstart get python'

# Test on a corpus of documents

In [9]:
corpus = ["""<p>Héllo! Héllo! can you hear me! I just heard about <b>Python</b>!<br/>\r\n 
              It's an amazing language which can be used for [Scripting\tWeb development\tBackend development],\r\n\r\n
              Information Retrieval, Natural Language Processing, Machine Learning & Artificial Intelligence!\n
              What are you waiting for? Go and get started.<br/> He's learning, she's learning, they've already\n\n
              got a headstart! GET PYTHON 3.6 NOW!</p>
           """,
          """US unveils world's most powerful supercomputer, beats China. 
             The US has unveiled the world's most powerful supercomputer 
             called 'Summit', beating the previous record-holder China's Sunway 
             TaihuLight. With a peak performance of 200,000 trillion calculations 
             per second, it is over twice as fast as Sunway TaihuLight, which is capable 
             of 93,000 trillion calculations per second. Summit has 4,608 servers, 
             which reportedly take up the size of two tennis courts.""",
          """The Lord of the Rings is an epic high fantasy novel written by English author and scholar J. R. R. Tolkien. 
            The story began as a sequel to Tolkien's 1937 fantasy novel The Hobbit, but eventually developed into 
            a much larger work. Written in stages between 1937 and 1949, The Lord of the Rings is one of the 
            best-selling novels ever written, with over 150 million copies sold.[1]
          """,
          """The title of the novel refers to the story's main antagonist, the Dark Lord Sauron,[a] 
             who had in an earlier age created the One Ring to rule the other Rings of Power as the ultimate weapon 
             in his campaign to conquer and rule all of Middle-earth. From quiet beginnings in the Shire, a hobbit 
             land not unlike the English countryside, the story ranges across Middle-earth, following the course 
             of the War of the Ring through the eyes of its characters, not only the hobbits Frodo Baggins, 
             Samwise "Sam" Gamgee, Meriadoc "Merry" Brandybuck and Peregrin "Pippin" Took, but also the hobbits' 
             chief allies and travelling companions: the Men, Aragorn, a Ranger of the North, and Boromir, 
             a Captain of Gondor; Gimli, a Dwarf warrior; Legolas Greenleaf, an Elven prince; and Gandalf, a wizard.
          """
]

In [10]:
# Normalize every document sequentially, then preview the first two results.
norm_docs = corpus_pre_processor(corpus)
norm_docs[:2]

100%|██████████| 4/4 [00:00<00:00, 34.50it/s]


['hello hello hear hear python amazing language use scripting web development backend development information retrieval natural language processing machine learning artificial intelligence wait go get start learn learn already get headstart get python',
 'us unveil world powerful supercomputer beat china us unveil world powerful supercomputer call summit beat previous record holder china sunway taihulight peak performance trillion calculation per second twice fast sunway taihulight capable trillion calculation per second summit server reportedly take size two tennis court']

# Optional: Pre-processor with multi-threading

In [11]:
from concurrent import futures
import threading

def parallel_preprocessing(idx, doc, total_docs):
    """Worker for the thread pool: normalize a single document.

    Args:
        idx: Position of the document in the batch (not used here).
        doc: Raw document string.
        total_docs: Size of the batch (not used here).

    Returns:
        The result of ``text_pre_processor(doc)`` with default options.
    """
    normalized = text_pre_processor(doc)
    return normalized


def pre_process_documents_parallel(documents):
    """Normalize a collection of documents using a thread pool.

    Each document is handed to ``parallel_preprocessing`` together with its
    index and the total number of documents.

    Args:
        documents: Sized sequence of raw document strings.

    Returns:
        List of normalized documents in the same order as `documents`.
    """
    total_docs = len(documents)

    # The context manager shuts the executor down once the work is done,
    # instead of leaving worker-thread cleanup to garbage collection.
    with futures.ThreadPoolExecutor(max_workers=None) as ex:
        print('preprocessing: starting')
        norm_descriptions_map = ex.map(parallel_preprocessing,
                                       range(total_docs),
                                       documents,
                                       [total_docs] * total_docs)
        # Materialize inside the `with` block; map() yields results in input order.
        norm_descriptions = list(norm_descriptions_map)
    return norm_descriptions

In [12]:
# Normalize the same corpus through the thread pool and display every result.
norm_docs = pre_process_documents_parallel(corpus)
norm_docs

preprocessing: starting


['hello hello hear hear python amazing language use scripting web development backend development information retrieval natural language processing machine learning artificial intelligence wait go get start learn learn already get headstart get python',
 'us unveil world powerful supercomputer beat china us unveil world powerful supercomputer call summit beat previous record holder china sunway taihulight peak performance trillion calculation per second twice fast sunway taihulight capable trillion calculation per second summit server reportedly take size two tennis court',
 'lord rings epic high fantasy novel write english author scholar j r r tolkien story begin sequel tolkien fantasy novel hobbit eventually develop much large work write stage lord rings one best sell novel ever write million copy sold',
 'title novel refer story main antagonist dark lord saurona early age create one ring rule rings power ultimate weapon campaign conquer rule middle earth quiet beginning shire hobbit