# **Intro to NLTK**

In [276]:
!pip install nltk



In [277]:
# NLTK -- Natural Language Toolkit
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [278]:
txt = "Hello Geeks. We're hoping you guys are doing great."
txt

"Hello Geeks. We're hoping you guys are doing great."

In [279]:
txt.split('.')

['Hello Geeks', " We're hoping you guys are doing great", '']

In [280]:
len(txt.split(' '))

9

In [281]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [282]:
word_tokenize(txt)

['Hello',
 'Geeks',
 '.',
 'We',
 "'re",
 'hoping',
 'you',
 'guys',
 'are',
 'doing',
 'great',
 '.']

In [283]:
sent_tokenize(txt)

['Hello Geeks.', "We're hoping you guys are doing great."]

In [284]:
for word in word_tokenize(txt):
  if word!='.':
    print(word)

Hello
Geeks
We
're
hoping
you
guys
are
doing
great


# **Stemming and Lemmatisation**

In [285]:
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.stem import WordNetLemmatizer, PorterStemmer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [286]:
stem = PorterStemmer()
lam = WordNetLemmatizer()

In [287]:
# both techniques are used to find root words but root word in lemmatization, root word makes sense all the time but in stemming, it may or may not
# most of the time, lemmatization is used

**Lemmatization**

In [288]:
print(lam.lemmatize('change'))
print(lam.lemmatize('changes'))
print(lam.lemmatize('changed'))
print(lam.lemmatize('changer'))

change
change
changed
changer


**Stemming**

In [289]:
print(stem.stem('change'))
print(stem.stem('changes'))
print(stem.stem('changed'))
print(stem.stem('changer'))

chang
chang
chang
changer


# **Stop Words**

In [290]:
import nltk
txt = 'This is not a good time to talk. Can we do it now?'
txt

'This is not a good time to talk. Can we do it now?'

In [291]:
# in above text, 'is','a','to' are stop words as even without them, meaning of sentence can be recognized
# we need to remove such words and repetitive words in order to reduce computations

In [292]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [293]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [294]:
stopwords.words()

['إذ',
 'إذا',
 'إذما',
 'إذن',
 'أف',
 'أقل',
 'أكثر',
 'ألا',
 'إلا',
 'التي',
 'الذي',
 'الذين',
 'اللاتي',
 'اللائي',
 'اللتان',
 'اللتيا',
 'اللتين',
 'اللذان',
 'اللذين',
 'اللواتي',
 'إلى',
 'إليك',
 'إليكم',
 'إليكما',
 'إليكن',
 'أم',
 'أما',
 'أما',
 'إما',
 'أن',
 'إن',
 'إنا',
 'أنا',
 'أنت',
 'أنتم',
 'أنتما',
 'أنتن',
 'إنما',
 'إنه',
 'أنى',
 'أنى',
 'آه',
 'آها',
 'أو',
 'أولاء',
 'أولئك',
 'أوه',
 'آي',
 'أي',
 'أيها',
 'إي',
 'أين',
 'أين',
 'أينما',
 'إيه',
 'بخ',
 'بس',
 'بعد',
 'بعض',
 'بك',
 'بكم',
 'بكم',
 'بكما',
 'بكن',
 'بل',
 'بلى',
 'بما',
 'بماذا',
 'بمن',
 'بنا',
 'به',
 'بها',
 'بهم',
 'بهما',
 'بهن',
 'بي',
 'بين',
 'بيد',
 'تلك',
 'تلكم',
 'تلكما',
 'ته',
 'تي',
 'تين',
 'تينك',
 'ثم',
 'ثمة',
 'حاشا',
 'حبذا',
 'حتى',
 'حيث',
 'حيثما',
 'حين',
 'خلا',
 'دون',
 'ذا',
 'ذات',
 'ذاك',
 'ذان',
 'ذانك',
 'ذلك',
 'ذلكم',
 'ذلكما',
 'ذلكن',
 'ذه',
 'ذو',
 'ذوا',
 'ذواتا',
 'ذواتي',
 'ذي',
 'ذين',
 'ذينك',
 'ريث',
 'سوف',
 'سوى',
 'شتان',
 'عدا',
 'عسى',
 'عل'

In [295]:
stopword = stopwords.words('english')

In [296]:
stopword

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [297]:
txt = word_tokenize(txt)
txt

['This',
 'is',
 'not',
 'a',
 'good',
 'time',
 'to',
 'talk',
 '.',
 'Can',
 'we',
 'do',
 'it',
 'now',
 '?']

In [298]:
for word1 in txt:
  # print(word1, word1.lower() in stopword)
  if (word1.lower() not in stopword) and len(word1)!=1 :
    print(word1)

good
time
talk


# **Corpus and Vocabulary**

In [299]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
stopword = stopwords.words('english')
corpus = "India, officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area, the most populous country as of June 2023, and from the time of its independence in 1947, the world's most populous democracy. Bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the southeast, it shares land borders with Pakistan to the west; China, Nepal, and Bhutan to the north; and Bangladesh and Myanmar to the east. In the Indian Ocean, India is in the vicinity of Sri Lanka and the Maldives; its Andaman and Nicobar Islands share a maritime border with Thailand, Myanmar, and Indonesia."
corpus

"India, officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area, the most populous country as of June 2023, and from the time of its independence in 1947, the world's most populous democracy. Bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the southeast, it shares land borders with Pakistan to the west; China, Nepal, and Bhutan to the north; and Bangladesh and Myanmar to the east. In the Indian Ocean, India is in the vicinity of Sri Lanka and the Maldives; its Andaman and Nicobar Islands share a maritime border with Thailand, Myanmar, and Indonesia."

In [300]:
word_tokenize(corpus)

['India',
 ',',
 'officially',
 'the',
 'Republic',
 'of',
 'India',
 ',',
 'is',
 'a',
 'country',
 'in',
 'South',
 'Asia',
 '.',
 'It',
 'is',
 'the',
 'seventh-largest',
 'country',
 'by',
 'area',
 ',',
 'the',
 'most',
 'populous',
 'country',
 'as',
 'of',
 'June',
 '2023',
 ',',
 'and',
 'from',
 'the',
 'time',
 'of',
 'its',
 'independence',
 'in',
 '1947',
 ',',
 'the',
 'world',
 "'s",
 'most',
 'populous',
 'democracy',
 '.',
 'Bounded',
 'by',
 'the',
 'Indian',
 'Ocean',
 'on',
 'the',
 'south',
 ',',
 'the',
 'Arabian',
 'Sea',
 'on',
 'the',
 'southwest',
 ',',
 'and',
 'the',
 'Bay',
 'of',
 'Bengal',
 'on',
 'the',
 'southeast',
 ',',
 'it',
 'shares',
 'land',
 'borders',
 'with',
 'Pakistan',
 'to',
 'the',
 'west',
 ';',
 'China',
 ',',
 'Nepal',
 ',',
 'and',
 'Bhutan',
 'to',
 'the',
 'north',
 ';',
 'and',
 'Bangladesh',
 'and',
 'Myanmar',
 'to',
 'the',
 'east',
 '.',
 'In',
 'the',
 'Indian',
 'Ocean',
 ',',
 'India',
 'is',
 'in',
 'the',
 'vicinity',
 'of'

stop word removal

In [301]:
words = []
for word2 in word_tokenize(corpus):
  if (word2.lower() not in stopword) and (len(word2)>=2):
    words.append(word2.lower())
words

['india',
 'officially',
 'republic',
 'india',
 'country',
 'south',
 'asia',
 'seventh-largest',
 'country',
 'area',
 'populous',
 'country',
 'june',
 '2023',
 'time',
 'independence',
 '1947',
 'world',
 "'s",
 'populous',
 'democracy',
 'bounded',
 'indian',
 'ocean',
 'south',
 'arabian',
 'sea',
 'southwest',
 'bay',
 'bengal',
 'southeast',
 'shares',
 'land',
 'borders',
 'pakistan',
 'west',
 'china',
 'nepal',
 'bhutan',
 'north',
 'bangladesh',
 'myanmar',
 'east',
 'indian',
 'ocean',
 'india',
 'vicinity',
 'sri',
 'lanka',
 'maldives',
 'andaman',
 'nicobar',
 'islands',
 'share',
 'maritime',
 'border',
 'thailand',
 'myanmar',
 'indonesia']

In [302]:
print(len(word_tokenize(corpus)), len(words),sep = '\n')
words = set(words)
print(len(words))

136
59
50


In [303]:
vocab = list(words)
vocab

['land',
 'andaman',
 'bangladesh',
 'asia',
 '1947',
 'area',
 '2023',
 'arabian',
 'ocean',
 'bay',
 'west',
 'southwest',
 'shares',
 'islands',
 'myanmar',
 'south',
 'officially',
 'china',
 'time',
 'populous',
 'borders',
 'independence',
 'democracy',
 'bounded',
 'india',
 'bhutan',
 'bengal',
 'southeast',
 "'s",
 'sri',
 'indonesia',
 'indian',
 'thailand',
 'vicinity',
 'share',
 'sea',
 'border',
 'maritime',
 'east',
 'country',
 'lanka',
 'pakistan',
 'world',
 'seventh-largest',
 'june',
 'nepal',
 'nicobar',
 'republic',
 'north',
 'maldives']

In [304]:
for sent in sent_tokenize(corpus):
  print(sent)
# next task is to asssign numbers/encoding to each word in vocab
# reason -- Computer understands numbers

India, officially the Republic of India, is a country in South Asia.
It is the seventh-largest country by area, the most populous country as of June 2023, and from the time of its independence in 1947, the world's most populous democracy.
Bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the southeast, it shares land borders with Pakistan to the west; China, Nepal, and Bhutan to the north; and Bangladesh and Myanmar to the east.
In the Indian Ocean, India is in the vicinity of Sri Lanka and the Maldives; its Andaman and Nicobar Islands share a maritime border with Thailand, Myanmar, and Indonesia.


# **Vocab with Keras**

text to sequences

In [305]:
from tensorflow.keras.preprocessing.text import Tokenizer
tok = Tokenizer()

In [306]:
corp = ['coffee is hot','water is cold']

tok.fit_on_texts(corp)
tok

<keras.preprocessing.text.Tokenizer at 0x7b13fe6dc220>

In [307]:
tok.word_index

{'is': 1, 'coffee': 2, 'hot': 3, 'water': 4, 'cold': 5}

In [308]:
tok.texts_to_sequences(corp)

[[2, 1, 3], [4, 1, 5]]

In [309]:
tok.texts_to_sequences(['water is hot','black coffee is cold']) # as word black not when tok is initialized, it gets neglected

[[4, 1, 3], [2, 1, 5]]

adding oov token(out of vocabulary)

In [310]:
from tensorflow.keras.preprocessing.text import Tokenizer
tok = Tokenizer(oov_token = 'black')
corp = ['coffee is hot','water is cold']
tok.fit_on_texts(corp)

print(tok.word_index)
tok.texts_to_sequences(['water is hot','black coffee is cold'])

{'black': 1, 'is': 2, 'coffee': 3, 'hot': 4, 'water': 5, 'cold': 6}


[[5, 2, 4], [1, 3, 2, 6]]

Limiting the number of words

In [311]:
from tensorflow.keras.preprocessing.text import Tokenizer
tok = Tokenizer(num_words = 5)
corp = ['coffee is hot','water is cold']
tok.fit_on_texts(corp)

print(tok.word_index)
tok.texts_to_sequences(['water is hot','black coffee is cold']) # only 5 words in output

{'is': 1, 'coffee': 2, 'hot': 3, 'water': 4, 'cold': 5}


[[4, 1, 3], [2, 1]]