In [1]:
import tensorflow
import pandas as pd
from termcolor import colored

## Keras Tokenizer

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
def keras_tokenizer(doc, test_raw_text=None):
    """Fit a Keras ``Tokenizer`` on ``doc`` and encode some test sentences.

    The learned word index and the integer sequences for ``doc`` are printed
    (coloured with ``termcolor``). No ``oov_token`` is configured, so words
    that were not seen while fitting are simply left out of the encoded
    test sequences.

    Parameters
    ----------
    doc : list of str
        Texts used to fit the tokenizer.
    test_raw_text : list of str, optional
        Sentences to encode with the fitted tokenizer. When omitted, three
        built-in sample sentences are used.

    Returns
    -------
    pandas.DataFrame
        ``DataFrame.head()`` of a frame with the columns ``test_raw_data``
        (the raw sentence) and ``test_sequence`` (its list of word ids).
    """
    if test_raw_text is None:
        test_raw_text = ["But a bit of better butter will make my batter better",
                         "So ‘twas better Betty Botter bought a bit of better butter",
                         "So better butter is good for our health"]

    token = Tokenizer()
    token.fit_on_texts(doc)
    print(colored("Word Index:","red"),colored(token.word_index,"green"))
    print(colored("The Sequences are:","red"),colored(token.texts_to_sequences(doc),"green"))

    df = pd.DataFrame(data=test_raw_text,columns=['test_raw_data'])
    # texts_to_sequences expects a list of texts, so each sentence is wrapped
    # in a one-element list and the single resulting sequence is unwrapped.
    df['test_sequence'] = df.test_raw_data.apply(lambda x: token.texts_to_sequences([x])[0])
    return df.head()

In [4]:
# Sentences the tokenizer is fitted on.
doc = [
    "Betty Botter bought some butter",
    "But she said the butter’s bitter",
    "If I put it in my batter, it will make my batter bitter",
    "But a bit of better butter will make my batter better",
]
# Kept as the last expression so the returned DataFrame is rendered by the cell.
keras_tokenizer(doc)

[31mWord Index:[0m [32m{'my': 1, 'batter': 2, 'butter': 3, 'but': 4, 'bitter': 5, 'it': 6, 'will': 7, 'make': 8, 'better': 9, 'betty': 10, 'botter': 11, 'bought': 12, 'some': 13, 'she': 14, 'said': 15, 'the': 16, 'butter’s': 17, 'if': 18, 'i': 19, 'put': 20, 'in': 21, 'a': 22, 'bit': 23, 'of': 24}[0m
[31mThe Sequences are:[0m [32m[[10, 11, 12, 13, 3], [4, 14, 15, 16, 17, 5], [18, 19, 20, 6, 21, 1, 2, 6, 7, 8, 1, 2, 5], [4, 22, 23, 24, 9, 3, 7, 8, 1, 2, 9]][0m


Unnamed: 0,test_raw_data,test_sequence
0,But a bit of better butter will make my batter...,"[4, 22, 23, 24, 9, 3, 7, 8, 1, 2, 9]"
1,So ‘twas better Betty Botter bought a bit of b...,"[9, 10, 11, 12, 22, 23, 24, 9, 3]"
2,So better butter is good for our health,"[9, 3]"


In [5]:
## usage of out of vocabulary token (oov_token)
def keras_tokenizer_with_oov(doc, oov_token='UNK', test_raw_text=None):
    """Fit a Keras ``Tokenizer`` with an out-of-vocabulary token and encode test sentences.

    The learned word index and the integer sequences for ``doc`` are printed
    (coloured with ``termcolor``). Because ``oov_token`` is passed to the
    tokenizer, words that were not seen while fitting are encoded with the
    id of that token instead of being left out.

    Parameters
    ----------
    doc : list of str
        Texts used to fit the tokenizer.
    oov_token : str, optional
        Placeholder token handed to ``Tokenizer(oov_token=...)``.
        Defaults to ``'UNK'``.
    test_raw_text : list of str, optional
        Sentences to encode with the fitted tokenizer. When omitted, three
        built-in sample sentences are used.

    Returns
    -------
    pandas.DataFrame
        ``DataFrame.head()`` of a frame with the columns ``test_raw_data``
        (the raw sentence) and ``test_sequence`` (its list of word ids).
    """
    if test_raw_text is None:
        test_raw_text = ["But a bit of better butter will make my batter better",
                         "So ‘twas better Betty Botter bought a bit of better butter",
                         "So better butter is good for our health"]

    token = Tokenizer(oov_token=oov_token)
    token.fit_on_texts(doc)
    print(colored("Word Index:","red"),colored(token.word_index,"green"))
    print(colored("The Sequences are:","red"),colored(token.texts_to_sequences(doc),"green"))

    df = pd.DataFrame(data=test_raw_text,columns=['test_raw_data'])
    # texts_to_sequences expects a list of texts, so each sentence is wrapped
    # in a one-element list and the single resulting sequence is unwrapped.
    df['test_sequence'] = df.test_raw_data.apply(lambda x: token.texts_to_sequences([x])[0])
    return df.head()

In [6]:
# Sentences the tokenizer is fitted on.
doc = [
    "Betty Botter bought some butter",
    "But she said the butter’s bitter",
    "If I put it in my batter, it will make my batter bitter",
    "But a bit of better butter will make my batter better",
]
# Kept as the last expression so the returned DataFrame is rendered by the cell.
keras_tokenizer_with_oov(doc)

[31mWord Index:[0m [32m{'UNK': 1, 'my': 2, 'batter': 3, 'butter': 4, 'but': 5, 'bitter': 6, 'it': 7, 'will': 8, 'make': 9, 'better': 10, 'betty': 11, 'botter': 12, 'bought': 13, 'some': 14, 'she': 15, 'said': 16, 'the': 17, 'butter’s': 18, 'if': 19, 'i': 20, 'put': 21, 'in': 22, 'a': 23, 'bit': 24, 'of': 25}[0m
[31mThe Sequences are:[0m [32m[[11, 12, 13, 14, 4], [5, 15, 16, 17, 18, 6], [19, 20, 21, 7, 22, 2, 3, 7, 8, 9, 2, 3, 6], [5, 23, 24, 25, 10, 4, 8, 9, 2, 3, 10]][0m


Unnamed: 0,test_raw_data,test_sequence
0,But a bit of better butter will make my batter...,"[5, 23, 24, 25, 10, 4, 8, 9, 2, 3, 10]"
1,So ‘twas better Betty Botter bought a bit of b...,"[1, 1, 10, 11, 12, 13, 23, 24, 25, 10, 4]"
2,So better butter is good for our health,"[1, 10, 4, 1, 1, 1, 1, 1]"


## Padding

In [7]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [8]:
def tokenizer_with_padding(doc, max_length=15, test_raw_text=None):
    """Tokenize test sentences and show them pre- and post-padded to a fixed length.

    A Keras ``Tokenizer`` with ``oov_token='UNK'`` is fitted on ``doc``; its
    word index and the integer sequences for ``doc`` are printed (coloured
    with ``termcolor``). Each test sentence is then encoded and padded or
    truncated to ``max_length`` twice: once at the front and once at the end.

    Parameters
    ----------
    doc : list of str
        Texts used to fit the tokenizer.
    max_length : int, optional
        Target length passed to ``pad_sequences`` as ``maxlen``.
        Defaults to 15.
    test_raw_text : list of str, optional
        Sentences to encode and pad. When omitted, three built-in sample
        sentences are used.

    Returns
    -------
    pandas.DataFrame
        ``DataFrame.head()`` of a frame with the columns ``test_raw_data``,
        ``test_sequence``, ``pre_padding_seq`` (``padding='pre'``,
        ``truncating='pre'``) and ``post_padding_seq`` (``padding='post'``,
        ``truncating='post'``).
    """
    if test_raw_text is None:
        test_raw_text = ["As a woodchuck would if a woodchuck could chuck wood",
                         "he could, and alsoit will chuck as much wood",
                         "Good wood good chuck its woodchuck"]

    token = Tokenizer(oov_token='UNK')
    token.fit_on_texts(doc)
    print(colored("Word Index:","red"),colored(token.word_index,"green"))
    print(colored("The Sequences are:","red"),colored(token.texts_to_sequences(doc),"green"))

    df = pd.DataFrame(data=test_raw_text,columns=['test_raw_data'])
    # texts_to_sequences expects a list of texts, so each sentence is wrapped
    # in a one-element list and the single resulting sequence is unwrapped.
    df['test_sequence'] = df.test_raw_data.apply(lambda x: token.texts_to_sequences([x])[0])

    # Hand pad_sequences a plain list of lists rather than a pandas Series.
    encoded = df['test_sequence'].tolist()
    df['pre_padding_seq'] = pad_sequences(encoded,
                                          maxlen=max_length,
                                          padding='pre',
                                          truncating='pre').tolist()
    df['post_padding_seq'] = pad_sequences(encoded,
                                           maxlen=max_length,
                                           padding='post',
                                           truncating='post').tolist()
    return df.head()

In [9]:
# A single text to fit the tokenizer on; the literal is split across two
# source lines only for readability.
doc = [
    "How much wood would a woodchuck chuck if a woodchuck could chuck wood? "
    "He would chuck, he would, as much as he could, and chuck as much wood"
]
# Kept as the last expression so the returned DataFrame is rendered by the cell.
tokenizer_with_padding(doc)

[31mWord Index:[0m [32m{'UNK': 1, 'chuck': 2, 'much': 3, 'wood': 4, 'would': 5, 'he': 6, 'as': 7, 'a': 8, 'woodchuck': 9, 'could': 10, 'how': 11, 'if': 12, 'and': 13}[0m
[31mThe Sequences are:[0m [32m[[11, 3, 4, 5, 8, 9, 2, 12, 8, 9, 10, 2, 4, 6, 5, 2, 6, 5, 7, 3, 7, 6, 10, 13, 2, 7, 3, 4]][0m


Unnamed: 0,test_raw_data,test_sequence,pre_padding_seq,post_padding_seq
0,As a woodchuck would if a woodchuck could chuc...,"[7, 8, 9, 5, 12, 8, 9, 10, 2, 4]","[0, 0, 0, 0, 0, 7, 8, 9, 5, 12, 8, 9, 10, 2, 4]","[7, 8, 9, 5, 12, 8, 9, 10, 2, 4, 0, 0, 0, 0, 0]"
1,"he could, and alsoit will chuck as much wood","[6, 10, 13, 1, 1, 2, 7, 3, 4]","[0, 0, 0, 0, 0, 0, 6, 10, 13, 1, 1, 2, 7, 3, 4]","[6, 10, 13, 1, 1, 2, 7, 3, 4, 0, 0, 0, 0, 0, 0]"
2,Good wood good chuck its woodchuck,"[1, 4, 1, 2, 1, 9]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 1, 2, 1, 9]","[1, 4, 1, 2, 1, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


## Stemming
Stemming removes the suffix from a word, reducing it to its root form (the stem).

##### Porter Stemmer
The most common and gentlest stemmer. It's fast, but not very precise and not always correct.

In [10]:
import nltk
from nltk.stem import PorterStemmer
porter = PorterStemmer()
sentence = "generously Provision Maximum multiply owed caring on go gone going was this universal university universe alumnus alumni alumnae"
# The sentence is tokenised with a plain whitespace split;
# nltk.word_tokenize(sentence) would be the tokenizer-based alternative.
stemWords = list(map(porter.stem, sentence.split()))
print(' '.join(stemWords))

# Inflected and derived forms used to compare stemmer behaviour.
plurals = [
    'caresses', 'flies', 'dies', 'mules', 'denied',
    'died', 'agreed', 'owned', 'humbled', 'sized',
    'meeting', 'stating', 'siezing', 'itemization',
    'sensational', 'traditional', 'reference', 'colonizer',
    'likes', 'liked', 'likely', 'liking', 'democratization',
    'plotted', 'stripes',
]
singles = list(map(porter.stem, plurals))
print(singles)

gener provis maximum multipli owe care on go gone go wa thi univers univers univers alumnu alumni alumna
['caress', 'fli', 'die', 'mule', 'deni', 'die', 'agre', 'own', 'humbl', 'size', 'meet', 'state', 'siez', 'item', 'sensat', 'tradit', 'refer', 'colon', 'like', 'like', 'like', 'like', 'democrat', 'plot', 'stripe']


##### Snowball Stemmer
More precise and meaningful over large datasets.

In [11]:
from nltk.stem import SnowballStemmer
snow = SnowballStemmer("english")
# List every language the Snowball stemmer can be constructed for.
print(
    colored("SnowballStemmer supported languages:", "red"),
    " ".join(SnowballStemmer.languages),
)

sentence = "Generously Provision Maximum multiply owed caring on go gone going was this universal university universe alumnus alumni alumnae"
stemWords = list(map(snow.stem, sentence.split()))
print(' '.join(stemWords))

# Inflected and derived forms used to compare stemmer behaviour.
plurals = [
    'caresses', 'flies', 'dies', 'mules', 'denied',
    'died', 'agreed', 'owned', 'humbled', 'sized',
    'meeting', 'stating', 'siezing', 'itemization',
    'sensational', 'traditional', 'reference', 'colonizer',
    'likes', 'liked', 'likely', 'liking', 'democratization',
    'plotted', 'stripes',
]
singles = list(map(snow.stem, plurals))
print(singles)

# The same stemmer class, constructed for German.
snow_german = SnowballStemmer("german")
wordlists = "Guten Morgen"
root = list(map(snow_german.stem, wordlists.split()))
print(colored("German:", "red"), ' '.join(root))

[31mSnowballStemmer supported languages:[0m arabic danish dutch english finnish french german hungarian italian norwegian porter portuguese romanian russian spanish swedish
generous provis maximum multipli owe care on go gone go was this univers univers univers alumnus alumni alumna
['caress', 'fli', 'die', 'mule', 'deni', 'die', 'agre', 'own', 'humbl', 'size', 'meet', 'state', 'siez', 'item', 'sensat', 'tradit', 'refer', 'colon', 'like', 'like', 'like', 'like', 'democrat', 'plot', 'stripe']
[31mGerman:[0m gut morg


##### Lancaster Stemmer
Very aggressive (really confusing when dealing with small words): it trims words down so heavily that in most cases the resulting stem no longer conveys the original meaning.

In [12]:
from nltk.stem import LancasterStemmer
lancaster = LancasterStemmer()
sentence = "generously Provision Maximum multiply owed caring on go gone going was this universal university universe alumnus alumni alumnae"
stemWords = list(map(lancaster.stem, sentence.split()))
print(' '.join(stemWords))

# Inflected and derived forms used to compare stemmer behaviour.
plurals = [
    'caresses', 'flies', 'dies', 'mules', 'denied',
    'died', 'agreed', 'owned', 'humbled', 'sized',
    'meeting', 'stating', 'siezing', 'itemization',
    'sensational', 'traditional', 'reference', 'colonizer',
    'likes', 'liked', 'likely', 'liking', 'democratization',
    'plotted', 'stripes',
]
singles = list(map(lancaster.stem, plurals))
print(singles)

gen provid maxim multiply ow car on go gon going was thi univers univers univers alumn alumn alumna
['caress', 'fli', 'die', 'mul', 'deny', 'died', 'agree', 'own', 'humbl', 'siz', 'meet', 'stat', 'siez', 'item', 'sens', 'tradit', 'ref', 'colon', 'lik', 'lik', 'lik', 'lik', 'democr', 'plot', 'stripes']


## Lemmatization

In [13]:
from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()
# Words to lemmatize under each part-of-speech tag below.
docs = [
    "am", "are", "is", "feet", "car", "cars", "car's", "cars'",
    "different", "colors", "corpora", "better", "plays", "playing",
    "stripes", "ponies", "cares", "denied",
    'sensational', 'traditional', 'reference', 'colonizer',
    'likes', 'liked', 'likely', 'liking',
    'democratization', 'plotted', 'hangs', 'ones', 'fishes',
]

# pos="n": treat every word as a noun.
lemma_noun_words = [lemma.lemmatize(word, pos="n") for word in docs]
print(colored("Lemma NOUN Words:", "red"), lemma_noun_words)

# pos="v": treat every word as a verb.
lemma_verb_words = [lemma.lemmatize(word, pos="v") for word in docs]
print(colored("Lemma VERB Words:", "red"), lemma_verb_words)

# pos="a": treat every word as an adjective.
lemma_adj_words = [lemma.lemmatize(word, pos="a") for word in docs]
print(colored("Lemma ADJECTIVE Words:", "red"), lemma_adj_words)

# pos="r": treat every word as an adverb.
lemma_adv_words = [lemma.lemmatize(word, pos="r") for word in docs]
print(colored("Lemma ADVERB Words:", "red"), lemma_adv_words)

[31mLemma NOUN Words:[0m ['am', 'are', 'is', 'foot', 'car', 'car', "car's", "cars'", 'different', 'color', 'corpus', 'better', 'play', 'playing', 'stripe', 'pony', 'care', 'denied', 'sensational', 'traditional', 'reference', 'colonizer', 'like', 'liked', 'likely', 'liking', 'democratization', 'plotted', 'hang', 'one', 'fish']
[31mLemma VERB Words:[0m ['be', 'be', 'be', 'feet', 'car', 'cars', "car's", "cars'", 'different', 'color', 'corpora', 'better', 'play', 'play', 'strip', 'ponies', 'care', 'deny', 'sensational', 'traditional', 'reference', 'colonizer', 'like', 'like', 'likely', 'like', 'democratization', 'plot', 'hang', 'ones', 'fish']
[31mLemma ADJECTIVE Words:[0m ['am', 'are', 'is', 'feet', 'car', 'cars', "car's", "cars'", 'different', 'colors', 'corpora', 'good', 'plays', 'playing', 'stripes', 'ponies', 'cares', 'denied', 'sensational', 'traditional', 'reference', 'colonizer', 'likes', 'liked', 'likely', 'liking', 'democratization', 'plotted', 'hangs', 'ones', 'fishes']
[