## Basic NLP Pipeline
- Data Collection
- Tokenization, Stopwords, Stemming, Lemmatization
- Building a common vocab
- Vectorizing the documents
- Performing classification/Clusteringrpus

### Data Collection

In [1]:
from nltk.corpus import brown

In [3]:
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [8]:
data = brown.sents(categories='editorial')[:100]

In [9]:
print(data)

[['Assembly', 'session', 'brought', 'much', 'good'], ['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.'], ...]


### Tokenization

In [6]:
text = "It was a very pleasant day, the weather was cool and there were light showers. I went to the market to buy some fruits."
print(text)

It was a very pleasant day, the weather was cool and there were light showers. I went to the market to buy some fruits.


In [7]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [8]:
sents = sent_tokenize(text)
print(sents)

['It was a very pleasant day, the weather was cool and there were light showers.', 'I went to the market to buy some fruits.']


In [9]:
words = word_tokenize(sents[0].lower())
print(words)

['it', 'was', 'a', 'very', 'pleasant', 'day', ',', 'the', 'weather', 'was', 'cool', 'and', 'there', 'were', 'light', 'showers', '.']


### Stopword Removal

In [10]:
from nltk.corpus import stopwords

sw = set(stopwords.words('english'))
print(sw)

{'should', 'will', 'being', 'for', 'hadn', "shouldn't", 'the', 'mightn', 'i', 'from', "isn't", 'not', 'her', 'of', 'before', "hadn't", 'own', 'same', "wouldn't", 'hers', 'having', 'under', 'once', 'any', 'were', 'than', 'be', 'at', 'an', 'these', 'above', 'down', 'yours', "should've", 'until', 's', "haven't", 'now', 'so', 'o', 'themselves', 'hasn', 'doing', 'won', 'isn', 'in', 'don', 'shouldn', 'weren', 'your', 'aren', 'they', "weren't", 'then', 'what', 'most', 'to', 'more', 'but', 'himself', 'theirs', 'was', 'and', 'between', "won't", "hasn't", 'does', 'here', 'ain', "aren't", 'yourself', 'how', 'very', 'with', 'below', 'wasn', 've', 'shan', 'she', 'itself', 'which', 'been', 'couldn', "don't", 'while', 'a', 'about', 'had', 'only', 'just', 'do', 'doesn', 'can', 'where', 'their', "mustn't", 'against', 'why', 'on', 'as', "wasn't", 'by', 'he', 'nor', 'wouldn', 'if', 'out', "doesn't", 'such', 'into', 'y', 'few', 'too', "it's", 'll', 'it', 'our', 'd', 'yourselves', 'ourselves', 'over', "cou

#### Filter words from the sentence

In [11]:
useful_words = [w for w in words if w not in sw]
print(useful_words)

['pleasant', 'day', ',', 'weather', 'cool', 'light', 'showers', '.']


### Tokenization using Regular Expression
Problem with Word Tokenizer - Can't handle complex tokenizations. So we use a Regexp Tokenizer Class in NLTK

In [12]:
from nltk.tokenize import RegexpTokenizer

In [13]:
tokenizer = RegexpTokenizer("[a-zA-Z@]+")

In [14]:
text = "Send all the 50 documents related to clauses 1,2,3 at abc@xyz.com"

print(tokenizer.tokenize(text))

['Send', 'all', 'the', 'documents', 'related', 'to', 'clauses', 'at', 'abc@xyz', 'com']


### Stemming
- Process that transforms particular words(verbs, plural) into their radical form
- Preserve the semantics of the sentence without increasing the number of unique tokens
- jumps, jumping, jumped, jump ==> jum

In [15]:
text = "Foxes love to make jumps. The quick brown fox was seen jumping over the lovely dog from a 6ft feet high wall"

words = tokenizer.tokenize(text.lower())
print(words)

['foxes', 'love', 'to', 'make', 'jumps', 'the', 'quick', 'brown', 'fox', 'was', 'seen', 'jumping', 'over', 'the', 'lovely', 'dog', 'from', 'a', 'ft', 'feet', 'high', 'wall']


In [16]:
words = [w for w in words if w not in sw]
print(words)

['foxes', 'love', 'make', 'jumps', 'quick', 'brown', 'fox', 'seen', 'jumping', 'lovely', 'dog', 'ft', 'feet', 'high', 'wall']


### Stemmers 
- 1) Snowball Stemmer (Multilingual) 
- 2) Porter Stemmer (English)
- 3) Lancaster Stemmer (English)

In [17]:
from nltk.stem.snowball import PorterStemmer, SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

ps = PorterStemmer()

In [18]:
ps.stem("jumps")

'jump'

In [19]:
ps.stem("lovely")

'love'

In [20]:
ps.stem("awesome")

'awesom'

In [21]:
ls = LancasterStemmer()
ls.stem("teeth")

'tee'

In [22]:
print(ps.stem("teenager"))
print(ls.stem("teenager"))

teenag
teen


In [23]:
ss = SnowballStemmer('english')
print(ss.stem('lovely'))
print(ss.stem('teenager'))

love
teenag


In [24]:
ss_french = SnowballStemmer('french')
print(ss.stem('courais'))

courai


### Lemmatization

In [25]:
from nltk.stem import WordNetLemmatizer

l = WordNetLemmatizer()
l.lemmatize("crying")

'cry'

### Building Common Vocabulary and Vectoring Documents (based upon Bag of Words Model)

In [26]:
corpus = ['Indian cricket team will win World Cup, says Capt. Virat Kohli', 
          'We will win the next Lok Sabha Elections, says confident Indian PM', 
          'The nobel laurate won the hearts of the people', 
          'The movie Raazi is an exciting Indian spy thriller based upon a real story']

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

In [28]:
cv = CountVectorizer()

In [29]:
vectorized_corpus = cv.fit_transform(corpus).toarray()

In [30]:
vectorized_corpus

array([[0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1],
       [0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
        1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
        0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [31]:
print(cv.vocabulary_) #Dictionary - Word -> Index

{'indian': 9, 'cricket': 4, 'team': 26, 'will': 32, 'win': 33, 'world': 35, 'cup': 5, 'says': 23, 'capt': 2, 'virat': 30, 'kohli': 11, 'we': 31, 'the': 27, 'next': 15, 'lok': 13, 'sabha': 22, 'elections': 6, 'confident': 3, 'pm': 19, 'nobel': 16, 'laurate': 12, 'won': 34, 'hearts': 8, 'of': 17, 'people': 18, 'movie': 14, 'raazi': 20, 'is': 10, 'an': 0, 'exciting': 7, 'spy': 24, 'thriller': 28, 'based': 1, 'upon': 29, 'real': 21, 'story': 25}


In [32]:
## Given a vector what is the sentence?

import numpy as np

vector = np.ones((36,))
vector[3:7] = 0
print(vector)

[1. 1. 1. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [33]:
print(cv.inverse_transform(vector))

[array(['an', 'based', 'capt', 'exciting', 'hearts', 'indian', 'is',
       'kohli', 'laurate', 'lok', 'movie', 'next', 'nobel', 'of',
       'people', 'pm', 'raazi', 'real', 'sabha', 'says', 'spy', 'story',
       'team', 'the', 'thriller', 'upon', 'virat', 'we', 'will', 'win',
       'won', 'world'], dtype='<U9')]


In [34]:
cv.vocabulary_["capt"]

2

### Effectively reduce the size of the vector

In [35]:
def myTokenizer(sentence):
    words = tokenizer.tokenize(sentence.lower())
    return [w for w in words if w not in sw]

myTokenizer(corpus[0])

['indian',
 'cricket',
 'team',
 'win',
 'world',
 'cup',
 'says',
 'capt',
 'virat',
 'kohli']

In [36]:
cv = CountVectorizer(tokenizer=myTokenizer)
vectorized_corpus = cv.fit_transform(corpus)
vc = vectorized_corpus.toarray()

print(vc[0])
print(len(vc[0]))

[0 1 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 1 1]
29


In [37]:
v = vc[0]
cv.inverse_transform(v)

[array(['capt', 'cricket', 'cup', 'indian', 'kohli', 'says', 'team',
        'virat', 'win', 'world'], dtype='<U9')]

In [38]:
vc[0][0] = 1
print(vc[0])

v = vc[0]
cv.inverse_transform(v)

[1 1 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 1 1]


[array(['based', 'capt', 'cricket', 'cup', 'indian', 'kohli', 'says',
        'team', 'virat', 'win', 'world'], dtype='<U9')]

### Features in Bag of Words model
- Unigrams
- Bigrams, Trigrams
- N-Grams

In [39]:
cv = CountVectorizer(tokenizer=myTokenizer, ngram_range=(1,3))
vectorized_corpus = cv.fit_transform(corpus)
vc = vectorized_corpus.toarray()
print(vc[0])
print(len(vc[0]))
print(cv.vocabulary_)

[0 0 0 1 1 1 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1
 1 1 0 0 1 1 1 1 1]
83
{'indian': 23, 'cricket': 9, 'team': 64, 'win': 75, 'world': 80, 'cup': 12, 'says': 55, 'capt': 3, 'virat': 73, 'kohli': 29, 'indian cricket': 24, 'cricket team': 10, 'team win': 65, 'win world': 78, 'world cup': 81, 'cup says': 13, 'says capt': 56, 'capt virat': 4, 'virat kohli': 74, 'indian cricket team': 25, 'cricket team win': 11, 'team win world': 66, 'win world cup': 79, 'world cup says': 82, 'cup says capt': 14, 'says capt virat': 57, 'capt virat kohli': 5, 'next': 39, 'lok': 33, 'sabha': 52, 'elections': 15, 'confident': 6, 'pm': 46, 'win next': 76, 'next lok': 40, 'lok sabha': 34, 'sabha elections': 53, 'elections says': 16, 'says confident': 58, 'confident indian': 7, 'indian pm': 26, 'win next lok': 77, 'next lok sabha': 41, 'lok sabha elections': 35, 'sabha elections says': 54, 'elections says confident'

### TF-IDF Normalisation
- Avoid features that occur very often, because they contain less information
- Information decreases as the number of occurences increases across different type of documents
- So we define another term - term-document-frequency which associates a weight with every term

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [42]:
tfidf_vectorizer = TfidfVectorizer(tokenizer=myTokenizer,ngram_range=(1,2))
vectorized_corpus = tfidf_vectorizer.fit_transform(corpus).toarray()

print(vectorized_corpus)

[[0.         0.         0.23802376 0.23802376 0.         0.
  0.23802376 0.23802376 0.23802376 0.23802376 0.         0.
  0.         0.         0.         0.         0.15192748 0.23802376
  0.         0.         0.23802376 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.18766067 0.23802376 0.
  0.         0.         0.         0.23802376 0.23802376 0.
  0.         0.         0.         0.23802376 0.23802376 0.18766067
  0.         0.23802376 0.23802376 0.23802376]
 [0.         0.         0.         0.         0.25277526 0.25277526
  0.         0.         0.         0.         0.25277526 0.25277526
  0.         0.         0.         0.         0.16134317 0.
  0.25277526 0.         0.         0.         0.         0.25277526
  0.25277526 0.         0.         0.25277526 0.25277526 0.
  0.         0.         0.25277526 0.         0.         0.
  0.         

In [43]:
print(tfidf_vectorizer.vocabulary_)

{'indian': 16, 'cricket': 6, 'team': 45, 'win': 53, 'world': 56, 'cup': 8, 'says': 39, 'capt': 2, 'virat': 51, 'kohli': 20, 'indian cricket': 17, 'cricket team': 7, 'team win': 46, 'win world': 55, 'world cup': 57, 'cup says': 9, 'says capt': 40, 'capt virat': 3, 'virat kohli': 52, 'next': 27, 'lok': 23, 'sabha': 37, 'elections': 10, 'confident': 4, 'pm': 32, 'win next': 54, 'next lok': 28, 'lok sabha': 24, 'sabha elections': 38, 'elections says': 11, 'says confident': 41, 'confident indian': 5, 'indian pm': 18, 'nobel': 29, 'laurate': 21, 'hearts': 14, 'people': 31, 'nobel laurate': 30, 'laurate hearts': 22, 'hearts people': 15, 'movie': 25, 'raazi': 33, 'exciting': 12, 'spy': 42, 'thriller': 47, 'based': 0, 'upon': 49, 'real': 35, 'story': 44, 'movie raazi': 26, 'raazi exciting': 34, 'exciting indian': 13, 'indian spy': 19, 'spy thriller': 43, 'thriller based': 48, 'based upon': 1, 'upon real': 50, 'real story': 36}
