# NLTK - Natural Language Processing Toolkit
- A leading platform for building Python programs that work with human language data.

In [77]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk

In [78]:
#nltk.download()

## Bag of Words Pipeline
A way to convert text into numerical data that can then be fed to a classifier to get predictions.
- Get the Data/Corpus 
- Tokenisation/Stopword Removal 
- Stemming 
- Building a Vocab 
- Vectorization 
- Classification 

### 1. Getting the Data/Corpus
Corpus - A large collection of text

In [79]:

from nltk.corpus import brown

In [80]:
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [81]:
data = brown.sents(categories='fiction')
len(data)

4249

In [82]:
# First sentence in fiction category
' '.join(data[0])

'Thirty-three'

### 2. Tokenisation and Stopword Removal

- Tokenization  
    - Splitting text into small pieces called tokens (usually words or sentences).  
    - Example: "I went to the market." → ["I", "went", "to", "the", "market", "."]

- Stopword removal  
    - Removing very common words that usually add little meaning (like "the", "is", "and").  
    - Example: ["I", "went", "to", "the", "market"] → ["went", "market"]

#### Tokenization

In [83]:
document = """It was a very pleasant day. The weather was cool and there were light showers. I went to the market to by some fruits. """

sentence = "Send all the 50 documents related to chapters 1,2,3 at divij@cb.com"

In [84]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [85]:
sents = sent_tokenize(document)
print(sents, len(sents))

['It was a very pleasant day.', 'The weather was cool and there were light showers.', 'I went to the market to by some fruits.'] 3


In [86]:
sentence.split() #python function - not useful for NLP as it doesn't handle punctuation

['Send',
 'all',
 'the',
 '50',
 'documents',
 'related',
 'to',
 'chapters',
 '1,2,3',
 'at',
 'divij@cb.com']

In [87]:
# Tokenization using NLTK
words = word_tokenize(sentence)
words

['Send',
 'all',
 'the',
 '50',
 'documents',
 'related',
 'to',
 'chapters',
 '1,2,3',
 'at',
 'divij',
 '@',
 'cb.com']

#### Stopwords

In [88]:
from nltk.corpus import stopwords

sw = set(stopwords.words('english'))
print(sw)

{'wasn', 'm', "aren't", 'all', 'are', "couldn't", 'not', "weren't", 'don', 'hadn', 'd', 'its', 'until', 'in', 'same', 'when', 'being', 'did', 'herself', 'themselves', "needn't", "he's", 't', 'will', 'having', 've', 'who', 'been', "he'll", "they'd", 'be', "you've", 'doesn', 'myself', 'any', 'shouldn', 'than', 'whom', "we've", 'ourselves', 'needn', 'and', "we're", 'has', 'me', "wouldn't", 'up', 'if', 'ma', 'both', "hadn't", 'we', 'hers', 'before', 'himself', 'yourself', "it's", 'out', 'because', 'those', 'other', 'through', "hasn't", 'haven', 'ours', 'shan', 'on', 'under', 'a', 'mightn', 'once', "it'd", 'him', "he'd", "she'd", 'that', 'them', 'mustn', "you'd", 's', 'there', 'can', 'which', 'down', 'most', 'were', 'further', 'wouldn', 'ain', 'about', 'from', "haven't", 'how', 'my', 'these', 'was', 'i', 'into', 'too', "didn't", 'at', "i'll", 'couldn', 'between', "she'll", "isn't", "that'll", 'is', 'isn', 'they', 'to', 'yours', 'of', 'had', 'doing', 'above', 'she', "you're", 'theirs', 'here

In [89]:
# Function to remove stopwords from a tokenized sentence (a list of words)

def remove_stopwords(text, stopwords):
    """Return the tokens of *text* that are not stopwords.

    Args:
        text: Iterable of word tokens (e.g. the output of a tokenizer).
        stopwords: Collection of lower-case stopwords to filter out.

    Returns:
        A list with the surviving tokens in their original order and
        original casing.
    """
    kept = []
    for token in text:
        # Compare case-insensitively, but keep the token exactly as given.
        if token.lower() in stopwords:
            continue
        kept.append(token)
    return kept

In [90]:
# Testing stopwords removal
text = "i am not bothered about her very much".split()

useful_text = remove_stopwords(text, sw)
print(useful_text)

['bothered', 'much']


#### Regex (Regular Expression) Based Tokenisation

In [91]:
sentence = "Send all the 50 documents related to chapters 1,2,3 at divij@cb.com"

In [92]:
from nltk.tokenize import RegexpTokenizer

In [93]:
# The pattern keeps runs of letters plus '@' and '.', so an e-mail address
# stays in one piece while digits and commas (e.g. '50', '1,2,3') are dropped.
tokenizer = RegexpTokenizer('[a-zA-Z@.]+')
useful_text = tokenizer.tokenize(sentence)
useful_text

['Send',
 'all',
 'the',
 'documents',
 'related',
 'to',
 'chapters',
 'at',
 'divij@cb.com']

### 3. Stemming
- Process that transforms particular words (verbs, plurals) into their root form
- Preserve the semantics of the sentence without increasing the number of unique tokens
- Example - jumps, jumping, jumped, jump ==> jump

In [94]:
text= """Foxes love to make jumps.The quick brown fox was seen jumping over the 
        lovely dog from a 6ft feet high wall"""

In [95]:
from nltk.stem.snowball import SnowballStemmer, PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
#Snowball Stemmer, Porter, Lancaster Stemmer

In [96]:
ps = PorterStemmer()
ps.stem('jumping')

'jump'

In [97]:
ps.stem('jumps')

'jump'

In [114]:
print(ps.stem('lovely'))
print(ps.stem('loving'))

love
love


In [113]:
# Snowball Stemmer
ss = SnowballStemmer('english')
print(ss.stem('lovely'))
print(ss.stem('jumping'))

love
jump


In [115]:
## Lemmatization - map a word to its dictionary form (lemma) using WordNet
from nltk.stem import WordNetLemmatizer

wn = WordNetLemmatizer()
# lemmatize() treats the word as a noun unless a `pos` tag is passed, so the
# verb form 'jumping' comes back unchanged here; wn.lemmatize('jumping', pos='v')
# is needed to get 'jump'.
wn.lemmatize('jumping')

'jumping'

### 4. Building a Vocab and Vectorization

In [101]:
# Sample Corpus - Contains 4 Documents, each document can have 1 or more sentences
corpus = [
        'Indian cricket team will wins World Cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka.',
        'We will win next Lok Sabha Elections, says confident Indian PM',
        'The nobel laurate won the hearts of the people.',
        'The movie Raazi is an exciting Indian Spy thriller based upon a real story.'
]

In [102]:
from sklearn.feature_extraction.text import CountVectorizer

In [118]:
cv = CountVectorizer()
# fit_transform learns the vocabulary from the corpus and returns one count
# vector per document; toarray() converts the result to a dense NumPy array.
vectorized_corpus = cv.fit_transform(corpus)
vectorized_corpus = vectorized_corpus.toarray()

# Each row has one column per vocabulary word; show the vector length and the
# word counts for the first document.
print(len(vectorized_corpus[0]))
print(vectorized_corpus[0])

42
[0 1 0 1 1 0 1 2 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0
 2 0 1 0 2]


In [104]:
print(cv.vocabulary_, len(cv.vocabulary_.keys()))

{'indian': 12, 'cricket': 6, 'team': 31, 'will': 37, 'wins': 39, 'world': 41, 'cup': 7, 'says': 27, 'capt': 4, 'virat': 35, 'kohli': 14, 'be': 3, 'held': 11, 'at': 1, 'sri': 29, 'lanka': 15, 'we': 36, 'win': 38, 'next': 19, 'lok': 17, 'sabha': 26, 'elections': 8, 'confident': 5, 'pm': 23, 'the': 32, 'nobel': 20, 'laurate': 16, 'won': 40, 'hearts': 10, 'of': 21, 'people': 22, 'movie': 18, 'raazi': 24, 'is': 13, 'an': 0, 'exciting': 9, 'spy': 28, 'thriller': 33, 'based': 2, 'upon': 34, 'real': 25, 'story': 30} 42


In [105]:
# Reverse Mapping!
numbers = vectorized_corpus[2]
numbers

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0])

In [119]:
# Reshape numbers into 2D array with 1 sample
numbers = numbers.reshape(1, -1)  # or numbers[np.newaxis, :]
s = cv.inverse_transform(numbers)
print(s)

[array(['hearts', 'laurate', 'nobel', 'of', 'people', 'the', 'won'],
      dtype='<U9')]


#### Vectorization with Stopword Removal

In [107]:
def myTokenizer(document):
    """Tokenize *document* and drop English stopwords.

    The text is lower-cased, split with the module-level regex ``tokenizer``
    and then filtered through ``remove_stopwords`` using the ``sw`` set.

    Args:
        document: Raw text of a single document.

    Returns:
        List of lower-case tokens with stopwords removed.
    """
    lowered = document.lower()
    tokens = tokenizer.tokenize(lowered)
    # Filter out the stopwords before handing the tokens to the vectorizer.
    return remove_stopwords(tokens, sw)

In [120]:
myTokenizer(sentence)
#print(sentence)

['send', 'documents', 'related', 'chapters', 'divij@cb.com']

In [109]:
# myTokenizer replaces the built-in tokenisation, so the default token_pattern
# is never used; setting it to None avoids scikit-learn's
# "token_pattern will not be used" UserWarning.
cv = CountVectorizer(tokenizer=myTokenizer, token_pattern=None)


In [110]:
vectorized_corpus = cv.fit_transform(corpus).toarray()
print(vectorized_corpus)
print(len(vectorized_corpus[0]))


[[0 1 0 1 2 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 2]
 [0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 1 0 0 1 0 1 0 1 1 0 0 0 0]]
33




In [111]:
cv.inverse_transform(vectorized_corpus)


[array(['capt.', 'cricket', 'cup', 'held', 'indian', 'kohli.', 'lanka.',
        'says', 'sri', 'team', 'virat', 'wins', 'world'], dtype='<U9'),
 array(['confident', 'elections', 'indian', 'lok', 'next', 'pm', 'sabha',
        'says', 'win'], dtype='<U9'),
 array(['hearts', 'laurate', 'nobel', 'people.'], dtype='<U9'),
 array(['based', 'exciting', 'indian', 'movie', 'raazi', 'real', 'spy',
        'story.', 'thriller', 'upon'], dtype='<U9')]

In [112]:
# For Test Data
test_corpus = [
        'Indian cricket rock !',        
]

In [None]:
cv.transform(test_corpus).toarray()

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

### More Ways to Create Features
- Unigram - every word as a feature
- Bigrams
- Trigrams
- n-grams
- TF-IDF Normalisation

In [121]:
sent_1  = ["this is good movie"]
sent_2 = ["this is good movie but actor is not present"]
sent_3 = ["this is not good movie"]

In [124]:
cv = CountVectorizer(ngram_range=(1,3))

In [125]:
docs = [sent_1[0],sent_2[0]]
cv.fit_transform(docs).toarray()

array([[0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
        1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1]])

In [126]:
cv.vocabulary_

{'this': 20,
 'is': 9,
 'good': 6,
 'movie': 14,
 'this is': 21,
 'is good': 10,
 'good movie': 7,
 'this is good': 22,
 'is good movie': 11,
 'but': 3,
 'actor': 0,
 'not': 17,
 'present': 19,
 'movie but': 15,
 'but actor': 4,
 'actor is': 1,
 'is not': 12,
 'not present': 18,
 'good movie but': 8,
 'movie but actor': 16,
 'but actor is': 5,
 'actor is not': 2,
 'is not present': 13}

#### Tf-idf Normalisation
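- Avoid features that occur very often, because they contain less information
- Information decreases as the number of occurrences increases across different types of documents
- So we define another term - inverse document frequency (IDF) - which associates a weight with every term; the weight shrinks as the term appears in more documents, and the TF-IDF score is the term frequency multiplied by this weight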

In [None]:
sent_1  = "this is good movie"
sent_2 = "this was good movie"
sent_3 = "this is not good movie"

corpus = [sent_1,sent_2,sent_3]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tfidf = TfidfVectorizer()

vc = tfidf.fit_transform(corpus).toarray()
print(vc)

[[0.46333427 0.59662724 0.46333427 0.         0.46333427 0.        ]
 [0.41285857 0.         0.41285857 0.         0.41285857 0.69903033]
 [0.3645444  0.46941728 0.3645444  0.61722732 0.3645444  0.        ]]


In [None]:
tfidf.vocabulary_

{'this': 4, 'is': 1, 'good': 0, 'movie': 2, 'was': 5, 'not': 3}