[Reference](https://towardsdatascience.com/text-preprocessing-steps-and-universal-pipeline-94233cb6725a)

In [1]:
# Sample passage for the tokenization cells. It mixes dotted abbreviations (E.U., N.A.T.O.),
# a URL, a Twitter-style handle, an e-mail address and a date.
example_text = '''An explosion targeting a tourist bus has injured at least 16 people near the Grand Egyptian Museum, 
next to the pyramids in Giza, security sources say E.U.

South African tourists are among the injured. Most of those hurt suffered minor injuries, 
while three were treated in hospital, N.A.T.O. say.

http://localhost:8888/notebooks/Text%20preprocessing.ipynb

@nickname of twitter user and his email is email@gmail.com . 

A device went off close to the museum fence as the bus was passing on 16/02/2012.'''

In [3]:
import nltk
# Fetch the 'punkt' tokenizer package (the download log below shows it being unzipped).
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize

# Word-level tokenization of the sample passage with NLTK.
# In the recorded output the URL and the e-mail address are broken into several tokens.
nltk_words = word_tokenize(example_text)
display(f"Tokenized words: {nltk_words}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


"Tokenized words: ['An', 'explosion', 'targeting', 'a', 'tourist', 'bus', 'has', 'injured', 'at', 'least', '16', 'people', 'near', 'the', 'Grand', 'Egyptian', 'Museum', ',', 'next', 'to', 'the', 'pyramids', 'in', 'Giza', ',', 'security', 'sources', 'say', 'E.U', '.', 'South', 'African', 'tourists', 'are', 'among', 'the', 'injured', '.', 'Most', 'of', 'those', 'hurt', 'suffered', 'minor', 'injuries', ',', 'while', 'three', 'were', 'treated', 'in', 'hospital', ',', 'N.A.T.O', '.', 'say', '.', 'http', ':', '//localhost:8888/notebooks/Text', '%', '20preprocessing.ipynb', '@', 'nickname', 'of', 'twitter', 'user', 'and', 'his', 'email', 'is', 'email', '@', 'gmail.com', '.', 'A', 'device', 'went', 'off', 'close', 'to', 'the', 'museum', 'fence', 'as', 'the', 'bus', 'was', 'passing', 'on', '16/02/2012', '.']"

In [4]:
import spacy
import en_core_web_sm

# Load the en_core_web_sm spaCy pipeline.
nlp = en_core_web_sm.load()

# Run the pipeline over the sample passage and collect the raw text of every token.
# In the recorded output '@nickname' and 'email@gmail.com' stay whole, and line breaks
# show up as their own tokens.
doc = nlp(example_text)
spacy_words = [token.text for token in doc]
display(f"Tokenized words: {spacy_words}")

"Tokenized words: ['An', 'explosion', 'targeting', 'a', 'tourist', 'bus', 'has', 'injured', 'at', 'least', '16', 'people', 'near', 'the', 'Grand', 'Egyptian', 'Museum', ',', '\\n', 'next', 'to', 'the', 'pyramids', 'in', 'Giza', ',', 'security', 'sources', 'say', 'E.U.', '\\n\\n', 'South', 'African', 'tourists', 'are', 'among', 'the', 'injured', '.', 'Most', 'of', 'those', 'hurt', 'suffered', 'minor', 'injuries', ',', '\\n', 'while', 'three', 'were', 'treated', 'in', 'hospital', ',', 'N.A.T.O.', 'say', '.', '\\n\\n', 'http://localhost:8888', '/', 'notebooks', '/', 'Text%20preprocessing.ipynb', '\\n\\n', '@nickname', 'of', 'twitter', 'user', 'and', 'his', 'email', 'is', 'email@gmail.com', '.', '\\n\\n', 'A', 'device', 'went', 'off', 'close', 'to', 'the', 'museum', 'fence', 'as', 'the', 'bus', 'was', 'passing', 'on', '16/02/2012', '.']"

In [6]:
# Short sample containing punctuation both between words and inside a handle / e-mail address.
text_with_punct = '@nickname of twitter user, and his email is email@gmail.com .'

In [8]:
import string
# Character-level removal: a table mapping every code point of string.punctuation to None
# makes str.translate delete those characters wherever they occur (also inside words).
text_without_punct = text_with_punct.translate(
    dict.fromkeys(map(ord, string.punctuation))
)
display(f"Text without punctuation: {text_without_punct}")

'Text without punctuation: nickname of twitter user and his email is emailgmailcom '

In [10]:
import spacy
import en_core_web_sm

nlp = en_core_web_sm.load()
doc = nlp(text_with_punct)
tokens = [token.text for token in doc]

# Variant 1 (plain Python): drop each token whose text occurs inside string.punctuation.
tokens_without_punct_python = list(
    filter(lambda token_text: token_text not in string.punctuation, tokens)
)
display(f"Python based removal: {tokens_without_punct_python}")

# Variant 2 (spaCy): drop each token whose part-of-speech tag is PUNCT.
tokens_without_punct_spacy = [token.text for token in doc if not token.pos_ == 'PUNCT']
display(f"Spacy based removal: {tokens_without_punct_spacy}")

"Python based removal: ['@nickname', 'of', 'twitter', 'user', 'and', 'his', 'email', 'is', 'email@gmail.com']"

"Spacy based removal: ['@nickname', 'of', 'twitter', 'user', 'and', 'his', 'email', 'is', 'email@gmail.com']"

In [11]:
# Sample sentence for the stop-word cells; its meaning hinges on the word 'not'.
text = 'This movie is just not good enough'

In [12]:
import spacy
import en_core_web_sm

nlp = en_core_web_sm.load()

# Keep only the tokens spaCy does not flag as stop words.
# The recorded output shows that 'not' is removed too, leaving ['movie', 'good'].
text_without_stop_words = [t.text for t in nlp(text) if not t.is_stop]
display(f"Spacy text without stop words: {text_without_stop_words}")

"Spacy text without stop words: ['movie', 'good']"

In [14]:
import nltk
# Fetch NLTK's 'stopwords' corpus (see the download log below).
nltk.download('stopwords')

nltk_stop_words = nltk.corpus.stopwords.words('english')
# Tokens are compared exactly as written (no lower-casing), and the recorded output
# still contains the capitalised 'This'.
# NOTE(review): presumably the list holds lower-case entries only; compare t.lower()
# if capitalised stop words should be dropped as well — confirm intent.
text_without_stop_words = [t for t in word_tokenize(text) if t not in nltk_stop_words]
display(f"nltk text without stop words: {text_without_stop_words}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


"nltk text without stop words: ['This', 'movie', 'good', 'enough']"

In [15]:
import en_core_web_sm

# Load a new pipeline object for this cell.
nlp = en_core_web_sm.load()

# Despite the name, these are words to take OUT of the stop list so they survive filtering.
customize_stop_words = [
    'not'
]

# Clear the stop-word flag on the vocabulary entry of each listed word.
for w in customize_stop_words:
    nlp.vocab[w].is_stop = False

# Same filter as before; with the flag cleared, 'not' now appears in the recorded output.
text_without_stop_words = [t.text for t in nlp(text) if not t.is_stop]
display(f"Spacy text without updated stop words: {text_without_stop_words}")

"Spacy text without updated stop words: ['movie', 'not', 'good']"

In [16]:
# Input for the normalisation cell: contains a date, abbreviations, a numeric range with a
# percent sign, a currency amount and a misspelling ('childhod').
text = 'On the 13 Feb. 2007, Theresa May announced on MTV news that the rate of childhod obesity had risen from 7.3-9.6% in just 3 years , costing the N.A.T.O £20m'

In [27]:
#!pip install --upgrade pip setuptools
!pip install normalise
for dependency in ("brown", "names", "wordnet", "averaged_perceptron_tagger", "universal_tagset"):
    nltk.download(dependency)
from normalise import normalise

user_abbr = {
    "N.A.T.O": "North Atlantic Treaty Organization"
}

normalized_tokens = normalise(word_tokenize(text), user_abbrevs=user_abbr, verbose=False)
display(f"Normalized text: {' '.join(normalized_tokens)}")

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


'Normalized text: On the thirteenth of February two thousand and seven , Theresa May announced on M T V news that the rate of childhood obesity had risen from seven point three to nine point six % in just three years , costing the North Atlantic Treaty Organization twenty million pounds'

In [28]:
# Re-bind `text` to the normalised sentence (the same string as the normalise output above)
# so the stemming and lemmatization cells operate on it.
text ='On the thirteenth of February two thousand and seven , Theresa May announced on M T V news that the rate of childhood obesity had risen from seven point three to nine point six % in just three years , costing the North Atlantic Treaty Organization twenty million pounds'

In [31]:
import numpy as np
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
tokens = word_tokenize(text)
porter=PorterStemmer()


def stem_words(tokens):
    """Return the Porter stem of every token, in input order.

    A plain loop replaces np.vectorize: vectorize is only a convenience wrapper around a
    Python-level loop (no speed gain) and raises on an empty input, whereas this returns [].
    """
    return [porter.stem(token) for token in tokens]


stemed_text = ' '.join(stem_words(tokens))
display(f"Stemed text: {stemed_text}")

'Stemed text: On the thirteenth of februari two thousand and seven , theresa may announc on M T V news that the rate of childhood obes had risen from seven point three to nine point six % in just three year , cost the north atlant treati organ twenti million pound'

In [34]:
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
tokens = word_tokenize(text)
wordnet_lemmatizer = WordNetLemmatizer()


def lemmatize_words(tokens):
    """Return the WordNet lemma of every token, in input order.

    A plain loop replaces np.vectorize: vectorize is only a convenience wrapper around a
    Python-level loop (no speed gain) and raises on an empty input, whereas this returns [].
    """
    return [wordnet_lemmatizer.lemmatize(token) for token in tokens]


lemmatized_text = ' '.join(lemmatize_words(tokens))
display(f"nltk lemmatized text: {lemmatized_text}")

'nltk lemmatized text: On the thirteenth of February two thousand and seven , Theresa May announced on M T V news that the rate of childhood obesity had risen from seven point three to nine point six % in just three year , costing the North Atlantic Treaty Organization twenty million pound'

In [None]:
import multiprocessing as mp
import string

import en_core_web_sm
import numpy as np
import pandas as pd
import spacy
from nltk.tokenize import word_tokenize
from normalise import normalise
from sklearn.base import TransformerMixin, BaseEstimator

# Module-level spaCy pipeline; TextPreprocessor._preprocess_text reads this global `nlp`.
nlp = en_core_web_sm.load()


class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Transformer that turns each raw text into a space-joined string of lemmas.

    Text preprocessing transformer includes steps:
        1. Text normalization
        2. Punctuation removal
        3. Stop words removal
        4. Lemmatization

    The input is expected to be a pandas Series (``.copy()`` and ``.apply()`` are
    called on it). Tokenization and lemmatization use the module-level ``nlp``
    spaCy pipeline.
    """

    def __init__(self,
                 variety="BrE",
                 user_abbrevs=None,
                 n_jobs=1):
        """
        variety - format of date (AmE - american type, BrE - british format)
        user_abbrevs - dict of user abbreviations mappings (from normalise package);
            None (the default) means no custom abbreviations
        n_jobs - parallel jobs to run: 0 keeps all work in the current process,
            a negative value uses one job per CPU core, a positive value is
            capped at the number of CPU cores
        """
        # Parameters are stored untouched; the None default avoids sharing one
        # mutable dict between every instance created without user_abbrevs.
        self.variety = variety
        self.user_abbrevs = user_abbrevs
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, *_):
        """Preprocess every text in X and return the processed texts.

        X - pandas Series of texts
        Extra positional arguments are accepted and ignored.
        """
        X_copy = X.copy()

        cores = mp.cpu_count()
        if self.n_jobs <= -1:
            partitions = cores
        elif self.n_jobs <= 0:
            return X_copy.apply(self._preprocess_text)
        else:
            partitions = min(self.n_jobs, cores)

        # A single partition gains nothing from worker processes, so skip the pool.
        if partitions == 1:
            return X_copy.apply(self._preprocess_text)

        data_split = np.array_split(X_copy, partitions)
        # The pool is sized by the number of partitions so that n_jobs really
        # limits the worker count; the context manager releases the workers
        # even when one of them raises.
        with mp.Pool(partitions) as pool:
            processed_parts = pool.map(self._preprocess_part, data_split)

        return pd.concat(processed_parts)

    def _preprocess_part(self, part):
        """Apply the full per-text pipeline to one partition of the input."""
        return part.apply(self._preprocess_text)

    def _preprocess_text(self, text):
        """Normalize one text, then drop punctuation and stop words and lemmatize."""
        normalized_text = self._normalize(text)
        doc = nlp(normalized_text)
        removed_punct = self._remove_punct(doc)
        removed_stop_words = self._remove_stop_words(removed_punct)
        return self._lemmatize(removed_stop_words)

    def _normalize(self, text):
        """Run the normalise package over text; return text unchanged if it fails."""
        user_abbrevs = {} if self.user_abbrevs is None else self.user_abbrevs
        # some issues in normalise package
        try:
            return ' '.join(normalise(text, variety=self.variety, user_abbrevs=user_abbrevs, verbose=False))
        except Exception:
            # Best-effort fallback. Unlike a bare `except`, this still lets
            # KeyboardInterrupt and SystemExit propagate.
            return text

    def _remove_punct(self, doc):
        """Keep the tokens whose text does not occur inside string.punctuation."""
        # This is a substring test against the punctuation constant, so tokens
        # made of several punctuation characters (e.g. '...') are generally kept.
        return [t for t in doc if t.text not in string.punctuation]

    def _remove_stop_words(self, doc):
        """Keep the tokens that spaCy does not flag as stop words."""
        return [t for t in doc if not t.is_stop]

    def _lemmatize(self, doc):
        """Join the lemmas of the remaining tokens with single spaces."""
        return ' '.join([t.lemma_ for t in doc])