## Basic NLP Pipeline

#### Tokenization

In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
text = "It was a very pleasant and cool day, their were light showers. I went to the Mall road for some shopping."
print(text)

It was a very pleasant and cool day, their were light showers. I went to the Mall road for some shopping.


In [3]:
sents = sent_tokenize(text)
print(sents)

['It was a very pleasant and cool day, their were light showers.', 'I went to the Mall road for some shopping.']


In [4]:
word_list = word_tokenize(sents[0].lower())
print(word_list)

['it', 'was', 'a', 'very', 'pleasant', 'and', 'cool', 'day', ',', 'their', 'were', 'light', 'showers', '.']


#### Stopwords removal

In [5]:
from nltk.corpus import stopwords

In [6]:
sw = set(stopwords.words('english'))

In [7]:
print(sw)
print(len(sw))

{'and', 'doesn', 'where', 'who', "should've", 'have', 'them', 'down', 'don', 'their', 'own', 'yourselves', 'further', 'were', 'i', 'or', 'o', 'shouldn', 'during', 'below', 'such', 'because', 'whom', 'll', 'needn', 'here', 'he', 'himself', 'there', 'are', 'each', 'you', "mightn't", "don't", 'but', 'themselves', 'against', "it's", 'am', 'couldn', 'nor', "she's", 'had', 'do', 'what', 'they', 'any', 'she', 'won', 'yourself', 'has', 'an', 'how', 'y', 'didn', 'off', 'herself', 'him', 'no', 'only', 'some', 'we', "hadn't", 'mustn', 'now', "wasn't", 'to', 'been', 'if', 'again', 'ma', 'is', "you're", 'this', 'then', 'when', 'why', 'can', 'of', 'about', 'by', "shouldn't", "that'll", 'out', 'just', 'wasn', 't', 'myself', "hasn't", 'as', 'few', "mustn't", 'my', 'hadn', 'having', 'with', 'more', 'same', 'haven', 'his', 'into', 'isn', 'should', "weren't", 'on', 'through', 'at', 'it', 'her', 'other', 'so', 'its', 'which', "aren't", 'before', 'from', 'did', 'itself', 'above', 'wouldn', 'your', "you've"

In [8]:
useful_words = [w for w in word_list if w not in sw]
print(useful_words)


['pleasant', 'cool', 'day', ',', 'light', 'showers', '.']


#### Tokenisation using regex

In [9]:
from nltk.tokenize import RegexpTokenizer

In [10]:
tokenizer = RegexpTokenizer("[A-Za-z@]+")

In [11]:
text1 = "Send all the related documents to clauses 1,2,3 at siddhu15798@gmail.com"

In [12]:
print(tokenizer.tokenize(text1))

['Send', 'all', 'the', 'related', 'documents', 'to', 'clauses', 'at', 'siddhu', '@gmail', 'com']


In [13]:
tokenizer1 = RegexpTokenizer("[A-Za-z]+")

In [14]:
print(tokenizer.tokenize(text))

['It', 'was', 'a', 'very', 'pleasant', 'and', 'cool', 'day', 'their', 'were', 'light', 'showers', 'I', 'went', 'to', 'the', 'Mall', 'road', 'for', 'some', 'shopping']


#### Stemming

In [15]:
text2 = "Foxes love to make jumps. The quick brown fox was seen jumping over the lovely dog from a 6ft high wall"

In [16]:
wordlist1 = tokenizer.tokenize(text2.lower())

In [17]:
wordlist1 = [w for w in wordlist1 if w not in sw]
print(wordlist1)

['foxes', 'love', 'make', 'jumps', 'quick', 'brown', 'fox', 'seen', 'jumping', 'lovely', 'dog', 'ft', 'high', 'wall']


### Types of Stemmer
- 1) Snowball Stemmer - Supports multilingual stemming
- 2) Porter Stemmer - Supports only English
- 3) Lancaster Stemmer - Supports only English

In [18]:
from nltk.stem.snowball import PorterStemmer, SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

In [19]:
ps = PorterStemmer()

In [74]:
print(ps.stem("lovely"))
print(ps.stem("quickly"))

love
quickli


In [22]:
j = []
for i in wordlist1:
    a = ps.stem(i)
    j.append(a)
print(set(j))

{'make', 'seen', 'jump', 'brown', 'dog', 'high', 'quick', 'fox', 'love', 'ft', 'wall'}


In [23]:
ls = LancasterStemmer()
ls.stem("teeth")

'tee'

In [24]:
k = []
for i in wordlist1:
    a = ls.stem(i)
    k.append(a)
print(set(j))

{'make', 'seen', 'jump', 'brown', 'dog', 'high', 'quick', 'fox', 'love', 'ft', 'wall'}


In [25]:
ss = SnowballStemmer('english')
ss.stem("teeth")

'teeth'

In [26]:
l = []
for i in wordlist1:
    a = ss.stem(i)
    l.append(a)
print(set(j))

{'make', 'seen', 'jump', 'brown', 'dog', 'high', 'quick', 'fox', 'love', 'ft', 'wall'}


In [27]:
def func(text):
    tokenizer = RegexpTokenizer("[A-Za-z]+")
    words = tokenizer.tokenize(text.lower())
    sw = set(stopwords.words('english'))
    useful_words = [w for w in words if w not in sw]
    ss = SnowballStemmer('english')
    x = []
    for i in useful_words:
        a = ss.stem(i)
        x.append(a)
    return x

In [28]:
y = func(text)
print(y)

['pleasant', 'cool', 'day', 'light', 'shower', 'went', 'mall', 'road', 'shop']


#### Building common vocab and vectorizing documents (Bag of word)

In [29]:
corpus = [
    'Indian cricket team will win the World Cup, says Capt. Virat Kohli',
    'We will win next Lok Sabha elections, said confident Indian PM',
    'The nobel laurate won the hearts of the people',
    'The movie Raazi is an exciting Indian Spy Thriller movie based upon a real story'
]

In [30]:
from sklearn.feature_extraction.text import CountVectorizer

In [31]:
cv = CountVectorizer()

In [37]:
#vectorized_corpus = cv.fit_transform(corpus)
vectorized_corpus = cv.fit_transform(corpus).toarray()


In [38]:
vectorized_corpus

array([[0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1],
       [0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
        1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 1,
        0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [43]:
print(vectorized_corpus)
print(len(vectorized_corpus[0]))

[[0 0 1 0 1 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 1 1 0
  1]
 [0 0 0 1 0 0 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 1 1 1 0
  0]
 [0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 1
  0]
 [1 1 0 0 0 0 0 1 0 1 1 0 0 0 2 0 0 0 0 0 1 1 0 0 0 1 1 0 1 1 1 0 0 0 0 0
  0]]
37


In [40]:
print(cv.vocabulary_)

{'indian': 9, 'cricket': 4, 'team': 27, 'will': 33, 'win': 34, 'the': 28, 'world': 36, 'cup': 5, 'says': 24, 'capt': 2, 'virat': 31, 'kohli': 11, 'we': 32, 'next': 15, 'lok': 13, 'sabha': 22, 'elections': 6, 'said': 23, 'confident': 3, 'pm': 19, 'nobel': 16, 'laurate': 12, 'won': 35, 'hearts': 8, 'of': 17, 'people': 18, 'movie': 14, 'raazi': 20, 'is': 10, 'an': 0, 'exciting': 7, 'spy': 25, 'thriller': 29, 'based': 1, 'upon': 30, 'real': 21, 'story': 26}


In [41]:
import numpy as np

In [47]:
vector = np.ones((37,))
print(vector)

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [48]:
print(cv.inverse_transform(vector))

[array(['an', 'based', 'capt', 'confident', 'cricket', 'cup', 'elections',
       'exciting', 'hearts', 'indian', 'is', 'kohli', 'laurate', 'lok',
       'movie', 'next', 'nobel', 'of', 'people', 'pm', 'raazi', 'real',
       'sabha', 'said', 'says', 'spy', 'story', 'team', 'the', 'thriller',
       'upon', 'virat', 'we', 'will', 'win', 'won', 'world'], dtype='<U9')]


In [49]:
cv.vocabulary_["exciting"]

7

In [54]:
## Effectively reduce the size of the vectors

def myTokenizer(sentence):
    words = tokenizer.tokenize(sentence.lower())
    return words

myTokenizer(corpus[0])

['indian',
 'cricket',
 'team',
 'will',
 'win',
 'the',
 'world',
 'cup',
 'says',
 'capt',
 'virat',
 'kohli']

In [62]:
cv = CountVectorizer(tokenizer=myTokenizer)
vectorized_corpus = cv.fit_transform(corpus)
vc = vectorized_corpus.toarray()
print(vc[0])
print(len(vc[0]))
vc[0][0] = 1
v = vc[0]
print(vc[0])
cv.inverse_transform(v)

[0 0 0 1 0 1 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 1 1 0
 1]
38
[1 0 0 1 0 1 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 1 1 0
 1]


[array(['a', 'capt', 'cricket', 'cup', 'indian', 'kohli', 'says', 'team',
        'the', 'virat', 'will', 'win', 'world'], dtype='<U9')]

#### Features of Bag of Words Model
- unigram
- bigrams, trigrams
- ngrams

In [63]:
cv = CountVectorizer(tokenizer=myTokenizer, ngram_range=(1,2))
vectorized_corpus = cv.fit_transform(corpus)
vc = vectorized_corpus.toarray()

print(cv.vocabulary_)

{'indian': 20, 'cricket': 10, 'team': 55, 'will': 71, 'win': 73, 'the': 57, 'world': 78, 'cup': 12, 'says': 50, 'capt': 6, 'virat': 67, 'kohli': 26, 'indian cricket': 21, 'cricket team': 11, 'team will': 56, 'will win': 72, 'win the': 75, 'the world': 62, 'world cup': 79, 'cup says': 13, 'says capt': 51, 'capt virat': 7, 'virat kohli': 68, 'we': 69, 'next': 34, 'lok': 29, 'sabha': 46, 'elections': 14, 'said': 48, 'confident': 8, 'pm': 41, 'we will': 70, 'win next': 74, 'next lok': 35, 'lok sabha': 30, 'sabha elections': 47, 'elections said': 15, 'said confident': 49, 'confident indian': 9, 'indian pm': 22, 'nobel': 36, 'laurate': 27, 'won': 76, 'hearts': 18, 'of': 38, 'people': 40, 'the nobel': 60, 'nobel laurate': 37, 'laurate won': 28, 'won the': 77, 'the hearts': 58, 'hearts of': 19, 'of the': 39, 'the people': 61, 'movie': 31, 'raazi': 42, 'is': 24, 'an': 2, 'exciting': 16, 'spy': 52, 'thriller': 63, 'based': 4, 'upon': 65, 'a': 0, 'real': 44, 'story': 54, 'the movie': 59, 'movie r

#### TF-IDF Normalisation
- Avoid features that occur very often, because they contain less info.
- info dec as the no of occurences inc across different type of docs.
- we define another term - term Doc Frequency which associates a weight with every term.

In [66]:
corpus = [
    'Indian cricket team will win the World Cup, says Capt. Virat Kohli. World cup will held at Sri Lanka',
    'We will win next Lok Sabha elections, said confident Indian PM',
    'The nobel laurate won the hearts of the people',
    'The movie Raazi is an exciting Indian Spy Thriller movie based upon a real story'
]

In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [72]:
tfidf_vectorizer = TfidfVectorizer(tokenizer=myTokenizer,ngram_range=(1,2))
vectorized_corpus = tfidf_vectorizer.fit_transform(corpus).toarray()
print(vectorized_corpus)

[[0.         0.         0.         0.         0.15514718 0.15514718
  0.         0.         0.15514718 0.15514718 0.         0.
  0.15514718 0.15514718 0.31029435 0.15514718 0.15514718 0.
  0.         0.         0.         0.         0.         0.15514718
  0.15514718 0.09902843 0.15514718 0.         0.         0.
  0.         0.15514718 0.15514718 0.15514718 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.15514718 0.15514718 0.
  0.         0.15514718 0.15514718 0.         0.15514718 0.15514718
  0.09902843 0.         0.         0.         0.         0.15514718
  0.         0.         0.         0.         0.15514718 0.15514718
  0.         0.         0.24463963 0.15514718 0.12231982 0.12231982
  0.         0.15514718 0.         0.         0.31029435 0.31029435]
 [0.         0.         0.         0.      

In [73]:
print(tfidf_vectorizer.vocabulary_)

{'indian': 25, 'cricket': 12, 'team': 64, 'will': 80, 'win': 83, 'the': 66, 'world': 88, 'cup': 14, 'says': 57, 'capt': 8, 'virat': 76, 'kohli': 31, 'held': 23, 'at': 4, 'sri': 61, 'lanka': 33, 'indian cricket': 26, 'cricket team': 13, 'team will': 65, 'will win': 82, 'win the': 85, 'the world': 71, 'world cup': 89, 'cup says': 15, 'says capt': 58, 'capt virat': 9, 'virat kohli': 77, 'kohli world': 32, 'cup will': 16, 'will held': 81, 'held at': 24, 'at sri': 5, 'sri lanka': 62, 'we': 78, 'next': 41, 'lok': 36, 'sabha': 53, 'elections': 17, 'said': 55, 'confident': 10, 'pm': 48, 'we will': 79, 'win next': 84, 'next lok': 42, 'lok sabha': 37, 'sabha elections': 54, 'elections said': 18, 'said confident': 56, 'confident indian': 11, 'indian pm': 27, 'nobel': 43, 'laurate': 34, 'won': 86, 'hearts': 21, 'of': 45, 'people': 47, 'the nobel': 69, 'nobel laurate': 44, 'laurate won': 35, 'won the': 87, 'the hearts': 67, 'hearts of': 22, 'of the': 46, 'the people': 70, 'movie': 38, 'raazi': 49, 