# Installed Natural Language Preprocessing Toolkit using pip install nltk , using nltk.download() downloaded data.

In [1]:
from nltk.corpus import brown

In [2]:
print(brown.words())

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]


In [3]:
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [4]:
print(len(brown.categories()))

15


### To get the sentences from the advwnture category we have sents function

In [5]:
data = brown.sents(categories='adventure')

In [7]:
data # its a list of list as each sentence is a list of words.

[['Dan', 'Morgan', 'told', 'himself', 'he', 'would', 'forget', 'Ann', 'Turner', '.'], ['He', 'was', 'well', 'rid', 'of', 'her', '.'], ...]

In [8]:
len(data)

4637

In [10]:
' '.join(data[1]) #1st sentence in the fiction category.

'He was well rid of her .'

# Bag of Words Pipeline

In [17]:
document = """It was a very pleasant day. The weather was cool and there were light showers. 
I went to the market to buy some fruits."""
# document is a multiline string

sentence = "Send all the 50 documents related to chapters 1,2,3 at prateek@cb.com"

In [15]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [19]:
sensts = sent_tokenize(document)
print(sensts)
print(len(sensts))

['It was a very pleasant day.', 'The weather was cool and there were light showers.', 'I went to the market to buy some fruits.']
3


In [20]:
sentence.split()

['Send',
 'all',
 'the',
 '50',
 'documents',
 'related',
 'to',
 'chapters',
 '1,2,3',
 'at',
 'prateek@cb.com']

In [21]:
words = word_tokenize(sentence)

In [22]:
print(words)

['Send', 'all', 'the', '50', 'documents', 'related', 'to', 'chapters', '1,2,3', 'at', 'prateek', '@', 'cb.com']


### In split function , it splits the sentence according to sentences , but word tokenizer splits email among name,  @, domain

## Stopword Removal using nltk

In [23]:
from nltk.corpus import stopwords

In [24]:
sw = set(stopwords.words('english'))

In [26]:
print(sw) # These are the common words that we can skip in english

{'through', 'until', 'myself', 'having', 'hasn', 'aren', 'during', 'above', 'below', 'most', "weren't", 'have', 'to', 'won', 'where', 'be', 'y', 'my', 'ma', 'why', 'not', 'their', "won't", 'that', "doesn't", 'other', "you'd", 'him', 'his', 'am', 'which', 'over', "aren't", 'this', 'just', 'up', 'and', 'against', 'from', 'they', 'o', 'too', 'your', 'whom', 'is', 'isn', 'when', "couldn't", 'all', "she's", 'because', 'by', 'how', 'were', "that'll", 'does', 'haven', 'only', 'few', 'such', 'ours', 've', 'down', 'those', 'so', "wouldn't", 'you', 'shouldn', 'of', 'as', 'i', "didn't", 'needn', 'had', 'did', "mustn't", 'after', 'hadn', 'being', 'while', 'if', 's', 'will', 're', 'd', 'same', 'didn', "you've", 'or', 'some', 'out', 'its', 'again', 'should', 'each', 'mustn', 'doing', "should've", 'weren', 'between', 'don', 'shan', 'them', 'a', 'what', 'no', 'off', "needn't", 'was', "you'll", 'then', 'for', "haven't", "isn't", 'very', 'themselves', 'who', 'both', 'wasn', 'ourselves', "wasn't", "you'r

In [28]:
def remove_stopwords(text,stopwords):
    useful_words = [w for w in text if w not in stopwords] # List comprehension
    return useful_words

In [30]:
text = "i am not bothered about her very much".split() # We have to tokenize as by default it will take it as string
useful_words = remove_stopwords(text,sw)
print(useful_words)

['bothered', 'much']


## Tokenization using Regular Expression , for practise regexpal.com

In [32]:
sentence = "Send all the 50 documents related to chapters 1,2,3 at prateek@cb.com"

In [33]:
from nltk.tokenize import RegexpTokenizer

In [37]:
tokenizer = RegexpTokenizer('[a-zA-Z]+') #Regular Expression for acceptance of only words.
useful_text = tokenizer.tokenize(sentence)

In [39]:
print(useful_text)

['Send', 'all', 'the', 'documents', 'related', 'to', 'chapters', 'at', 'prateek', 'cb', 'com']


In [40]:
# if we ewant to capture the email id also then
tokenizer = RegexpTokenizer('[a-zA-Z@.]+') #Regular Expression for acceptance of only words.
useful_text = tokenizer.tokenize(sentence)

In [41]:
print(useful_text)

['Send', 'all', 'the', 'documents', 'related', 'to', 'chapters', 'at', 'prateek@cb.com']


## Stemming - Process that transforms particular words(verbs , plurals) into their radical form. nltk provides 3 stemmers - Snowball , Porter , Lancaster semmer

In [42]:
text = """Foxes love to make jumps. The quick brown fox was seen jumping over the 
lovely dog from a 6ft feet high wall"""

In [43]:
from nltk.stem import SnowballStemmer,PorterStemmer,LancasterStemmer

In [44]:
ps = PorterStemmer()

In [45]:
ps.stem('jumping')

'jump'

In [46]:
print(ps.stem('lovely'))
print(ps.stem('loves'))
print(ps.stem('loveing'))
print(ps.stem('jumps'))

love
love
love
jump


In [47]:
# Snowball stemer is multilingual , french and german are supported

In [48]:
ss = SnowballStemmer('english') # its multilingual so  we have to pass language also

In [49]:
print(ss.stem('lovely'))
print(ss.stem('loves'))
print(ss.stem('loveing'))
print(ss.stem('jumping'))
print(ss.stem('jumps'))

love
love
love
jump
jump


## Lemmatization

In [50]:
from nltk.stem import WordNetLemmatizer

In [51]:
wn = WordNetLemmatizer()

In [52]:
print(wn.stem('lovely'))
print(wn.stem('loves'))
print(wn.stem('loveing'))
print(wn.stem('jumping'))
print(wn.stem('jumps'))

AttributeError: 'WordNetLemmatizer' object has no attribute 'stem'

##### Lammatizer not installed so showing error

# Building vocab and Vectorization

In [53]:
# 1. sports , 2. Politics , 3.Economy , 4.Movies
corpus = [
        'Indian cricket team will wins World Cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka.',
        'We will win next Lok Sabha Elections, says confident Indian PM',
        'The nobel laurate won the hearts of the people.',
        'The movie Raazi is an exciting Indian Spy thriller based upon a real story.'
]

## We will maintain a dictionary called vocab for storing indices and the word . Vector will conain the frequncy

In [54]:
# We can automate the work by sklearn countvectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [55]:
cv = CountVectorizer()

In [56]:
vectorized_corpus = cv.fit_transform(corpus) # fit will learn from the dictionary and transform will count

In [59]:
#vectorized_corpus? will give  (0, 6)	1 , this means that 0th word is at 6th index occuring once
vectorized_corpus = vectorized_corpus.toarray()
vectorized_corpus[0]

array([0, 1, 0, 1, 1, 0, 1, 2, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 2, 0, 1, 0, 2],
      dtype=int64)

In [64]:
print(cv.vocabulary_) # Mapping
print(len(cv.vocabulary_.keys())) # 42 Unique words

{'indian': 12, 'cricket': 6, 'team': 31, 'will': 37, 'wins': 39, 'world': 41, 'cup': 7, 'says': 27, 'capt': 4, 'virat': 35, 'kohli': 14, 'be': 3, 'held': 11, 'at': 1, 'sri': 29, 'lanka': 15, 'we': 36, 'win': 38, 'next': 19, 'lok': 17, 'sabha': 26, 'elections': 8, 'confident': 5, 'pm': 23, 'the': 32, 'nobel': 20, 'laurate': 16, 'won': 40, 'hearts': 10, 'of': 21, 'people': 22, 'movie': 18, 'raazi': 24, 'is': 13, 'an': 0, 'exciting': 9, 'spy': 28, 'thriller': 33, 'based': 2, 'upon': 34, 'real': 25, 'story': 30}
42


In [66]:
# Rewverse mapping
numbers = vectorized_corpus[2]
print(numbers)
s = cv.inverse_transform(numbers)
print(s)

[0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 3 0 0 0 0
 0 0 0 1 0]
[array(['hearts', 'laurate', 'nobel', 'of', 'people', 'the', 'won'],
      dtype='<U9')]


# Vectorization with Stopward Removal

In [70]:
def myTokenizer(document):
    words = tokenizer.tokenize(document.lower()) # so that Indian and indian are considered same
    words = remove_stopwords(words,sw)
    return words

In [71]:
myTokenizer(sentence)

['send', 'documents', 'related', 'chapters', 'prateek@cb.com']

In [72]:
cv = CountVectorizer(tokenizer=myTokenizer) # We can make our custom tokenizer and pass it to countVectorizer

In [73]:
vectorized_corpus = cv.fit_transform(corpus).toarray()

In [74]:
print(vectorized_corpus)

[[0 1 0 1 2 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 2]
 [0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 1 0 0 1 0 1 0 1 1 0 0 0 0]]


In [75]:
print(len(vectorized_corpus[0]))

33


### Even for small senetences we are getting large vectors so we need dimentionality reduction 

In [76]:
cv.inverse_transform(vectorized_corpus)

[array(['capt.', 'cricket', 'cup', 'held', 'indian', 'kohli.', 'lanka.',
        'says', 'sri', 'team', 'virat', 'wins', 'world'], dtype='<U9'),
 array(['confident', 'elections', 'indian', 'lok', 'next', 'pm', 'sabha',
        'says', 'win'], dtype='<U9'),
 array(['hearts', 'laurate', 'nobel', 'people.'], dtype='<U9'),
 array(['based', 'exciting', 'indian', 'movie', 'raazi', 'real', 'spy',
        'story.', 'thriller', 'upon'], dtype='<U9')]

In [77]:
# For test data
test_corpus = [
        'Indian cricket rocks!',
]

In [80]:
cv.transform(test_corpus).toarray() # If we use fit_transform then it will alter our vocab and will learn again.

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

# More ways to Create Features
- Unigram - every word as a feature
- Bigrams - two consecutive words can be treated as a single feature
- Trigrams
- n-grams
- TF-IDF Normalisation

In [81]:
sent_1  = ["this is good movie"]
sent_2 = ["this is good movie but actor is not present"]

In [82]:
cv = CountVectorizer()

In [83]:
docs = [sent_1[0],sent_2[0]]
cv.fit_transform(docs).toarray()

array([[0, 0, 1, 1, 1, 0, 0, 1],
       [1, 1, 1, 2, 1, 1, 1, 1]], dtype=int64)

In [84]:
sent_1  = ["this is good movie"]
sent_2 = ["this is good movie but actor is not present"]
sent_3 = ["this is not good movie"]
# Here ML model might not calssify sent3 as not good as its not identifying not good as one

### Bigram

In [85]:
cv = CountVectorizer(ngram_range=(2,2)) # by default ngram_range is (1,1) so its unigram (2,2) is bigram

In [87]:
docs = [sent_1[0],sent_2[0]]
cv.fit_transform(docs).toarray()

array([[0, 0, 1, 1, 0, 0, 0, 1],
       [1, 1, 1, 1, 1, 1, 1, 1]], dtype=int64)

In [88]:
cv.vocabulary_

{'this is': 7,
 'is good': 3,
 'good movie': 2,
 'movie but': 5,
 'but actor': 1,
 'actor is': 0,
 'is not': 4,
 'not present': 6}

### Trigram

In [89]:
cv = CountVectorizer(ngram_range=(3,3))

In [90]:
docs = [sent_1[0],sent_2[0]]
cv.fit_transform(docs).toarray()

array([[0, 0, 0, 1, 0, 0, 1],
       [1, 1, 1, 1, 1, 1, 1]], dtype=int64)

In [91]:
cv.vocabulary_

{'this is good': 6,
 'is good movie': 3,
 'good movie but': 2,
 'movie but actor': 5,
 'but actor is': 1,
 'actor is not': 0,
 'is not present': 4}

### n-gram which has a range say 1 to 3

In [92]:
cv = CountVectorizer(ngram_range=(1,3))

In [93]:
docs = [sent_1[0],sent_2[0]]
cv.fit_transform(docs).toarray()

array([[0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
        1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1]], dtype=int64)

In [94]:
cv.vocabulary_

{'this': 20,
 'is': 9,
 'good': 6,
 'movie': 14,
 'this is': 21,
 'is good': 10,
 'good movie': 7,
 'this is good': 22,
 'is good movie': 11,
 'but': 3,
 'actor': 0,
 'not': 17,
 'present': 19,
 'movie but': 15,
 'but actor': 4,
 'actor is': 1,
 'is not': 12,
 'not present': 18,
 'good movie but': 8,
 'movie but actor': 16,
 'but actor is': 5,
 'actor is not': 2,
 'is not present': 13}

# Tf-idf Normalization
- Avoid features that occur very often, becauase they contain less information
- Information decreases as the number of occurences increases across different type of documents
- So we define another term - term-document-frequency which associates a weight with every term
- It has two terms term frequency and inverse document frequency.

In [95]:
sent_1  = "this is good movie"
sent_2 = "this was good movie"
sent_3 = "this is not good movie"

corpus = [sent_1,sent_2,sent_3]

In [96]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [97]:
tfidf = TfidfVectorizer()

In [98]:
vc = tfidf.fit_transform(corpus).toarray() # Vectorized corpus

In [99]:
print(vc)

[[0.46333427 0.59662724 0.46333427 0.         0.46333427 0.        ]
 [0.41285857 0.         0.41285857 0.         0.41285857 0.69903033]
 [0.3645444  0.46941728 0.3645444  0.61722732 0.3645444  0.        ]]


In [100]:
tfidf.vocabulary_

{'this': 4, 'is': 1, 'good': 0, 'movie': 2, 'was': 5, 'not': 3}

## This is occuring at 4rth index and its weight is 0.46 ,0.41 and 0.36 in all three sentences as is less informative as is in all the sentences.
## is similarly is at 1 and its weight is 0.5 ,0 ,0.46 as in 2nd sentence is is not present.
## not has the highest weight as is only present in the 3rd sentence.