<a href="https://colab.research.google.com/github/BeLeap/nlp-prac/blob/main/02/Integer-Encoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Integer Encoding

In [1]:
import nltk

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

In [2]:
# Sample corpus: eleven short sentences with deliberately mixed capitalization.
# Adjacent string literals are joined by Python into one single-line string.
raw_text = (
  "A barber is a person. "
  "a barber is good person. "
  "a barber is huge person. "
  "he Knew A Secret! "
  "The Secret He Kept is huge secret. "
  "Huge secret. "
  "His barber kept his word. "
  "a barber kept his word. "
  "His barber kept his secret. "
  "But keeping and keeping such a huge secret to himself was driving the barber crazy. "
  "the barber went up a huge mountain."
)

In [4]:
# Fetch the 'punkt' tokenizer data (presumably what sent_tokenize relies on; verify for
# the installed NLTK version).
nltk.download('punkt')

# Split the raw text into a list of sentence strings.
sentences = sent_tokenize(raw_text)
sentences

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['A barber is a person.',
 'a barber is good person.',
 'a barber is huge person.',
 'he Knew A Secret!',
 'The Secret He Kept is huge secret.',
 'Huge secret.',
 'His barber kept his word.',
 'a barber kept his word.',
 'His barber kept his secret.',
 'But keeping and keeping such a huge secret to himself was driving the barber crazy.',
 'the barber went up a huge mountain.']

In [6]:
# Fetch the stop-word corpus and keep the English list as a set for membership tests.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

vocab = {}                   # word -> number of occurrences over all sentences
preprocessed_sentences = []  # one list of kept words per input sentence

for sentence in sentences:
  lowered_tokens = (token.lower() for token in word_tokenize(sentence))

  # Keep a token only if it is not a stop word and is longer than two characters.
  kept_words = [
    token for token in lowered_tokens
    if token not in stop_words and len(token) > 2
  ]

  # Tally every kept token; a word seen for the first time starts from zero.
  for token in kept_words:
    vocab[token] = vocab.get(token, 0) + 1

  preprocessed_sentences.append(kept_words)

preprocessed_sentences, vocab

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


([['barber', 'person'],
  ['barber', 'good', 'person'],
  ['barber', 'huge', 'person'],
  ['knew', 'secret'],
  ['secret', 'kept', 'huge', 'secret'],
  ['huge', 'secret'],
  ['barber', 'kept', 'word'],
  ['barber', 'kept', 'word'],
  ['barber', 'kept', 'secret'],
  ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'],
  ['barber', 'went', 'huge', 'mountain']],
 {'barber': 8,
  'crazy': 1,
  'driving': 1,
  'good': 1,
  'huge': 5,
  'keeping': 2,
  'kept': 4,
  'knew': 1,
  'mountain': 1,
  'person': 3,
  'secret': 6,
  'went': 1,
  'word': 2})

In [7]:
# Order the (word, count) pairs from most to least frequent.
# The sort is stable, so words with equal counts keep their original relative order.
vocab_sorted = list(vocab.items())
vocab_sorted.sort(key=lambda pair: pair[1], reverse=True)
vocab_sorted

[('barber', 8),
 ('secret', 6),
 ('huge', 5),
 ('kept', 4),
 ('person', 3),
 ('word', 2),
 ('keeping', 2),
 ('good', 1),
 ('knew', 1),
 ('driving', 1),
 ('crazy', 1),
 ('went', 1),
 ('mountain', 1)]

In [13]:
# Words that occur only once are left out; the remaining words are numbered
# 1, 2, 3, ... in the order they appear in vocab_sorted (most frequent first).
frequent_words = [word for word, frequency in vocab_sorted if frequency > 1]
word_to_index = {word: rank for rank, word in enumerate(frequent_words, start=1)}

word_to_index

{'barber': 1,
 'huge': 3,
 'keeping': 7,
 'kept': 4,
 'person': 5,
 'secret': 2,
 'word': 6}

In [19]:
vocab_size = 5

# Collect the words to drop first and remove them afterwards: a dict must not
# change size while it is being iterated.
words_frequency = [word for word in word_to_index if word_to_index[word] > vocab_size]

for rare_word in words_frequency:
  word_to_index.pop(rare_word)

word_to_index

{'barber': 1, 'huge': 3, 'kept': 4, 'person': 5, 'secret': 2}

In [20]:
# Reserve the next free index for out-of-vocabulary words.
# The membership guard keeps this cell idempotent: without it, re-running the cell
# would count the existing 'OOV' entry in len() and bump its index by one.
if 'OOV' not in word_to_index:
  word_to_index['OOV'] = len(word_to_index) + 1 # Out Of Vocabulary
word_to_index

{'OOV': 6, 'barber': 1, 'huge': 3, 'kept': 4, 'person': 5, 'secret': 2}

In [21]:
# Look the fallback index up once; a missing 'OOV' entry fails loudly here
# instead of inside an exception handler.
oov_index = word_to_index['OOV']

# Replace every word with its integer index. Words that are not in the vocabulary
# map to the OOV index. dict.get handles only the "missing key" case, so unrelated
# errors (or a keyboard interrupt) are no longer silently swallowed.
encoded_sentences = [
  [word_to_index.get(word, oov_index) for word in sentence]
  for sentence in preprocessed_sentences
]

encoded_sentences

[[1, 5],
 [1, 6, 5],
 [1, 3, 5],
 [6, 2],
 [2, 4, 3, 2],
 [3, 2],
 [1, 4, 6],
 [1, 4, 6],
 [1, 4, 2],
 [6, 6, 3, 2, 6, 1, 6],
 [1, 6, 3, 6]]

## Counter

In [22]:
from collections import Counter

In [23]:
# Show the preprocessed (tokenized, filtered) sentences again as the starting point for this section.
preprocessed_sentences

[['barber', 'person'],
 ['barber', 'good', 'person'],
 ['barber', 'huge', 'person'],
 ['knew', 'secret'],
 ['secret', 'kept', 'huge', 'secret'],
 ['huge', 'secret'],
 ['barber', 'kept', 'word'],
 ['barber', 'kept', 'word'],
 ['barber', 'kept', 'secret'],
 ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'],
 ['barber', 'went', 'huge', 'mountain']]

In [24]:
# Flatten the per-sentence word lists into one list, preserving order.
# A single-pass comprehension replaces sum(lists, []), which builds a new list
# for every addition and therefore scales quadratically with the number of words.
all_words_list = [word for sentence in preprocessed_sentences for word in sentence]
all_words_list

['barber',
 'person',
 'barber',
 'good',
 'person',
 'barber',
 'huge',
 'person',
 'knew',
 'secret',
 'secret',
 'kept',
 'huge',
 'secret',
 'huge',
 'secret',
 'barber',
 'kept',
 'word',
 'barber',
 'kept',
 'word',
 'barber',
 'kept',
 'secret',
 'keeping',
 'keeping',
 'huge',
 'secret',
 'driving',
 'barber',
 'crazy',
 'barber',
 'went',
 'huge',
 'mountain']

In [25]:
# Count how often each word occurs in the flattened word list.
vocab = Counter(all_words_list)
vocab

Counter({'barber': 8,
         'crazy': 1,
         'driving': 1,
         'good': 1,
         'huge': 5,
         'keeping': 2,
         'kept': 4,
         'knew': 1,
         'mountain': 1,
         'person': 3,
         'secret': 6,
         'went': 1,
         'word': 2})

In [26]:
vocab_size = 5
# Keep only the vocab_size most frequent (word, count) pairs.
# The counts are rebuilt from all_words_list rather than read from `vocab` itself:
# this cell rebinds `vocab` to a plain list, so calling vocab.most_common(...) on a
# second run of the cell would raise AttributeError.
vocab = Counter(all_words_list).most_common(vocab_size)
vocab

[('barber', 8), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3)]

In [27]:
# Number the retained words 1, 2, 3, ... in the order they appear in `vocab`
# (a list of (word, count) pairs, most frequent first).
word_to_index = {word: rank for rank, (word, freq) in enumerate(vocab, start=1)}

word_to_index

{'barber': 1, 'huge': 3, 'kept': 4, 'person': 5, 'secret': 2}

## nltk FreqDist

In [28]:
from nltk import FreqDist
import numpy as np

In [30]:
# Join the per-sentence word lists into one flat array, then count occurrences per word.
flat_words = np.hstack(preprocessed_sentences)
vocab = FreqDist(flat_words)
vocab

FreqDist({'barber': 8,
          'crazy': 1,
          'driving': 1,
          'good': 1,
          'huge': 5,
          'keeping': 2,
          'kept': 4,
          'knew': 1,
          'mountain': 1,
          'person': 3,
          'secret': 6,
          'went': 1,
          'word': 2})

In [31]:
vocab_size = 5
# Keep only the vocab_size most frequent (word, count) pairs.
# The distribution is rebuilt from preprocessed_sentences rather than read from
# `vocab` itself: this cell rebinds `vocab` to a plain list, so calling
# vocab.most_common(...) on a second run of the cell would raise AttributeError.
vocab = FreqDist(np.hstack(preprocessed_sentences)).most_common(vocab_size)
vocab

[('barber', 8), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3)]

In [32]:
# Assign 1-based indices in the order of `vocab`; each entry is a (word, count)
# pair and only the word (element 0) is used as the key.
word_to_index = {}
for rank, entry in enumerate(vocab, start=1):
  word_to_index[entry[0]] = rank
word_to_index

{'barber': 1, 'huge': 3, 'kept': 4, 'person': 5, 'secret': 2}

# Keras Preprocessing

In [33]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [35]:
# Build a Keras Tokenizer with default settings and fit it on the already-tokenized sentences.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(preprocessed_sentences)
# word_index maps each word to an integer index (in the displayed result the most
# frequent word gets index 1); word_counts maps each word to its number of occurrences.
tokenizer.word_index, tokenizer.word_counts

({'barber': 1,
  'crazy': 11,
  'driving': 10,
  'good': 8,
  'huge': 3,
  'keeping': 7,
  'kept': 4,
  'knew': 9,
  'mountain': 13,
  'person': 5,
  'secret': 2,
  'went': 12,
  'word': 6},
 OrderedDict([('barber', 8),
              ('person', 3),
              ('good', 1),
              ('huge', 5),
              ('knew', 1),
              ('secret', 6),
              ('kept', 4),
              ('word', 2),
              ('keeping', 2),
              ('driving', 1),
              ('crazy', 1),
              ('went', 1),
              ('mountain', 1)]))

In [36]:
# Replace every word in each sentence with its index from tokenizer.word_index.
tokenizer.texts_to_sequences(preprocessed_sentences)

[[1, 5],
 [1, 8, 5],
 [1, 3, 5],
 [9, 2],
 [2, 4, 3, 2],
 [3, 2],
 [1, 4, 6],
 [1, 4, 6],
 [1, 4, 2],
 [7, 7, 3, 2, 10, 1, 11],
 [1, 12, 3, 13]]

In [37]:
vocab_size = 5
# Limit the tokenizer via num_words. It is set to vocab_size + 1 so that indices
# 1..vocab_size survive encoding (the encoded output below contains only 1..5).
tokenizer = Tokenizer(num_words = vocab_size + 1)
tokenizer.fit_on_texts(preprocessed_sentences)
# As the displayed result shows, word_index and word_counts still list every word;
# num_words does not shrink these mappings.
tokenizer.word_index, tokenizer.word_counts

({'barber': 1,
  'crazy': 11,
  'driving': 10,
  'good': 8,
  'huge': 3,
  'keeping': 7,
  'kept': 4,
  'knew': 9,
  'mountain': 13,
  'person': 5,
  'secret': 2,
  'went': 12,
  'word': 6},
 OrderedDict([('barber', 8),
              ('person', 3),
              ('good', 1),
              ('huge', 5),
              ('knew', 1),
              ('secret', 6),
              ('kept', 4),
              ('word', 2),
              ('keeping', 2),
              ('driving', 1),
              ('crazy', 1),
              ('went', 1),
              ('mountain', 1)]))

In [38]:
# Encode with the num_words-limited tokenizer: words outside the top vocab_size are
# dropped from the sequences (no OOV placeholder is inserted), as the output shows.
tokenizer.texts_to_sequences(preprocessed_sentences)

[[1, 5],
 [1, 5],
 [1, 3, 5],
 [2],
 [2, 4, 3, 2],
 [3, 2],
 [1, 4],
 [1, 4],
 [1, 4, 2],
 [3, 2, 1],
 [1, 3]]