## Basic NLP Pipeline

In [1]:
## Tokenisation

In [15]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [65]:
text = "It was a very pleasant and cool day, their were light showers. I went to the Mall road for some shopping."
print(text)

It was a very pleasant and cool day, their were light showers. I went to the Mall road for some shopping.


In [17]:
sents = sent_tokenize(text)
print(sents)

['It was a very pleasant and cool day, their were light showers.', 'I went to the Mall road for some shopping.']


In [19]:
word_list = word_tokenize(sents[0].lower())
print(word_list)

['it', 'was', 'a', 'very', 'pleasant', 'and', 'cool', 'day', ',', 'their', 'were', 'light', 'showers', '.']


In [20]:
## Stopwords removal

In [21]:
from nltk.corpus import stopwords

In [22]:
sw = set(stopwords.words('english'))

In [23]:
print(sw)

{"should've", 'no', 'mustn', 'off', 'same', 'up', "doesn't", 'they', 'our', 'was', 'with', "shouldn't", 'y', 's', 'as', "wasn't", 'for', 'all', 'some', 'the', "that'll", 'weren', 'in', 'himself', "needn't", 'only', 'why', 'can', 'nor', 'above', 'this', 'should', 'at', 'into', 'are', 'ours', 'under', 'your', "mustn't", 'being', 'having', 'now', 'ourselves', 'but', 'don', "don't", 'yourself', 'or', 'is', 'whom', 'down', "haven't", 'how', 'once', 'hers', 'yourselves', 'then', 'more', 'most', 'if', 'few', 'i', "won't", 'their', 'didn', 'myself', "it's", "you're", 'very', "didn't", 'during', 're', 'have', "you'll", 'his', 'these', 'were', 'you', 'has', 'both', 'ain', 'an', 'until', 'hasn', 'further', "hadn't", 'herself', 'where', 'aren', 'each', "you'd", 'wouldn', 'other', "she's", 'so', 'here', "weren't", 'and', 'by', 'be', 'doesn', 'o', 'than', 'we', 'ma', 've', 'not', 'before', 'haven', 'from', 'that', 'won', 'mightn', 'been', 'on', 'wasn', 'hadn', 'd', 'needn', 'to', 'she', 'while', 'ha

['pleasant', 'cool', 'day', ',', 'light', 'showers', '.']


In [25]:
## Tokenisation using regex

In [26]:
from nltk.tokenize import RegexpTokenizer

In [27]:
tokenizer = RegexpTokenizer("[A-Za-z@]+")

In [28]:
text1 = "Send all the related documents to clauses 1,2,3 at siddhu15798@gmail.com"

In [29]:
print(tokenizer.tokenize(text1))

['Send', 'all', 'the', 'related', 'documents', 'to', 'clauses', 'at', 'siddhu', '@gmail', 'com']


In [30]:
tokenizer1 = RegexpTokenizer("[A-Za-z]+")

In [32]:
print(tokenizer.tokenize(text))

['It', 'was', 'a', 'very', 'pleasant', 'and', 'cool', 'day', 'their', 'were', 'light', 'showers', 'I', 'went', 'to', 'the', 'Mall', 'road', 'for', 'some', 'shopping']


In [33]:
## Stemming

In [40]:
text2 = "Foxes love to make jumps. The quick brown fox was seen jumping over the lovely dog from a 6ft high wall"

In [41]:
wordlist1 = tokenizer.tokenize(text2.lower())

In [42]:
wordlist1 = [w for w in wordlist1 if w not in sw]
print(wordlist1)

['foxes', 'love', 'make', 'jumps', 'quick', 'brown', 'fox', 'seen', 'jumping', 'lovely', 'dog', 'ft', 'high', 'wall']


### Types of Stemmer
- 1) Snowball Stemmer - Supports multilingual stemming
- 2) Porter Stemmer - Supports only English
- 3) Lancaster Stemmer - Supports only English

In [60]:
from nltk.stem.snowball import PorterStemmer, SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

In [53]:
ps = PorterStemmer()

In [58]:
ps.stem("lovely")
ps.stem("teeth")

'teeth'

In [55]:
j = []
for i in wordlist1:
    a = ps.stem(i)
    j.append(a)
print(set(j))

{'dog', 'jump', 'quick', 'high', 'fox', 'brown', 'wall', 'love', 'make', 'ft', 'seen'}


In [57]:
ls = LancasterStemmer()
ls.stem("teeth")

'tee'

In [59]:
k = []
for i in wordlist1:
    a = ls.stem(i)
    k.append(a)
print(set(j))

{'dog', 'jump', 'quick', 'high', 'fox', 'brown', 'wall', 'love', 'make', 'ft', 'seen'}


In [62]:
ss = SnowballStemmer('english')
ss.stem("teeth")

'teeth'

In [63]:
l = []
for i in wordlist1:
    a = ss.stem(i)
    l.append(a)
print(set(j))

{'dog', 'jump', 'quick', 'high', 'fox', 'brown', 'wall', 'love', 'make', 'ft', 'seen'}


In [68]:
def func(text):
    tokenizer = RegexpTokenizer("[A-Za-z]+")
    words = tokenizer.tokenize(text.lower())
    sw = set(stopwords.words('english'))
    useful_words = [w for w in words if w not in sw]
    ss = SnowballStemmer('english')
    x = []
    for i in useful_words:
        a = ss.stem(i)
        x.append(a)
    return x

In [70]:
y = func(text)
print(y)

['pleasant', 'cool', 'day', 'light', 'shower', 'went', 'mall', 'road', 'shop']
