In [14]:
"Creating a custom text normalization transformer"

"Text normalization reduces the number of dimensions, decreasing sparsity. Besides the simple filtering of tokens (removing punctuation and stopwords),"
"there are two primary methods for text normalization: stemming and lemmatization."

'there are two primary methods for text normalization: stemming and lemmatization.'

In [9]:
import unicodedata
from sklearn.base import BaseEstimator, TransformerMixin

In [41]:
class TextNormalizer(BaseEstimator, TransformerMixin):
    """Transformer that filters out punctuation and stopwords from tokens.

    The constructor loads the NLTK stopword list for ``language`` and
    creates a ``WordNetLemmatizer``; the helper predicates below are used
    to decide which tokens to discard.

    NOTE(review): ``nltk`` and ``WordNetLemmatizer`` must already be in
    scope when this cell runs; they are not imported in the import cell
    visible here -- confirm they are imported elsewhere in the notebook.
    """

    def __init__(self, language='english'):
        """Build the stopword set and lemmatizer for ``language``.

        Parameters
        ----------
        language : str, default 'english'
            Name passed to ``nltk.corpus.stopwords.words``.
        """
        # Keep the constructor argument under its own name: BaseEstimator's
        # get_params() (used by repr() and sklearn.base.clone) reads every
        # __init__ parameter back with getattr(self, name) and fails with
        # AttributeError when the attribute is missing.
        self.language = language
        self.stopwords = set(nltk.corpus.stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer()

    def is_punct(self, token):
        """Return True when every character of ``token`` is punctuation.

        A character counts as punctuation when its Unicode general
        category starts with 'P'. An empty token also yields True,
        because ``all()`` of an empty iterable is True.
        """
        return all(
            unicodedata.category(char).startswith('P') for char in token)

    def is_stopword(self, token):
        """Return True when the lowercased ``token`` is in the stopword set."""
        return token.lower() in self.stopwords

# is_punct() checks whether every character in the token has a Unicode category that starts with 'P' (punctuation); is_stopword() checks the lowercased token against the stopword set.

In [33]:

     def normalize(self, document):
         """Flatten a tagged document into a list of lowercased lemmas.

         ``document`` is iterated as paragraphs, each paragraph as
         sentences, and each sentence as ``(token, tag)`` pairs. Tokens
         rejected by ``is_punct`` or ``is_stopword`` are dropped; every
         other token is passed to ``lemmatize`` with its tag and the
         result is lowercased.
         """
         normalized = []
         for paragraph in document:
             for sentence in paragraph:
                 for token, tag in sentence:
                     # Skip unwanted tokens before doing any lemmatizing.
                     if self.is_punct(token) or self.is_stopword(token):
                         continue
                     normalized.append(self.lemmatize(token, tag).lower())
         return normalized

# normalize() applies the filtering functions to drop punctuation and stopwords, then lemmatizes and lowercases the remaining tokens.

In [39]:
def lemmatize(self, token, pos_tag):
    """Lemmatize ``token`` using a WordNet POS derived from ``pos_tag``.

    The first character of ``pos_tag`` selects the WordNet part of
    speech: 'N' -> noun, 'V' -> verb, 'R' -> adverb, 'J' -> adjective.
    Any other tag, including an empty or ``None`` tag, falls back to
    noun.

    NOTE(review): ``wn`` must already be in scope (presumably
    ``nltk.corpus.wordnet``); it is not imported in the import cell
    visible here -- confirm.

    Returns whatever ``self.lemmatizer.lemmatize(token, tag)`` returns.
    """
    # Slice instead of indexing so an empty tag does not raise IndexError,
    # and treat a missing (None) tag like an empty one.
    initial = (pos_tag or '')[:1]
    tag = {
        'N': wn.NOUN,
        'V': wn.VERB,
        'R': wn.ADV,
        'J': wn.ADJ
    }.get(initial, wn.NOUN)
    return self.lemmatizer.lemmatize(token, tag)

#The lemmatize() method first converts the Penn
#Treebank part-of-speech tags that are the default tag set in the
#nltk.pos_tag function to WordNet tags, selecting nouns by default.

In [40]:
  def fit(self, X, y=None):
    """No-op fit: ignores ``X`` and ``y`` and returns the instance itself."""
    return self

  def transform(self, documents):
    """Lazily yield ``self.normalize(document)`` for each input document.

    This is a generator: nothing is normalized until the result is
    iterated, and the result can be consumed only once.
    """
    yield from map(self.normalize, documents)

#Finally, we add the Transformer interface so that this class can be used in a
#Scikit-Learn pipeline: fit() is a no-op that returns self, and transform() lazily yields one normalized document at a time.