## Natural Language Processing using NLTK

In [1]:
from nltk.corpus import brown

### Data collection

In [2]:
# Show the category labels under which the Brown corpus texts are grouped.
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [7]:
# Take the first 100 sentences of the 'editorial' category; each sentence is a list of word tokens.
data = brown.sents(categories='editorial')[:100]
# The slice is a lazy NLTK sequence rather than a plain list (see the printed type).
print(type(data), len(data))
print(data)
# NOTE(review): this repeats the length already printed two lines above.
print(len(data))

<class 'nltk.collections.LazySubsequence'> 100
[['Assembly', 'session', 'brought', 'much', 'good'], ['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.'], ...]
100


# NLP Pipeline
- Data Collection 
- Tokenization, Stopword Removal, Stemming
- Building a common vocab 
- Vectorize the documents 
- Performing Classification/Clustering

## 2. Tokenization and Stopword Removal

In [10]:
# Two-sentence sample paragraph used for the tokenization examples.
text = "It was a very pleasant day, the weather was cool and there were showers. I went to market to buy some fruits."

In [11]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [12]:
# Split the paragraph into a list of sentence strings.
sents = sent_tokenize(text)

In [13]:
# Inspect the sentence split: one string per sentence.
print(sents)

['It was a very pleasant day, the weather was cool and there were showers.', 'I went to market to buy some fruits.']


In [15]:
# Tokenize only the first sentence. It is lowercased first so the tokens can match
# the lowercase entries of the stopword list used later in the notebook.
word_list = word_tokenize(sents[0].lower())
print(word_list)

['it', 'was', 'a', 'very', 'pleasant', 'day', ',', 'the', 'weather', 'was', 'cool', 'and', 'there', 'were', 'showers', '.']


## Stopwords removal

In [17]:
from nltk.corpus import stopwords

In [18]:
# Load NLTK's English stopword list into a set for fast membership tests.
sw = set(stopwords.words('english'))

In [20]:
# NOTE(review): this prints the entire stopword set; a small sample plus the count would be easier to read.
print(sw, len(sw))

{'is', "needn't", 'for', 'an', 'yourselves', 's', 'ours', 'and', 'all', 'hadn', 'on', 'no', 'wasn', 'by', 'each', "you'll", 'he', 'am', 'these', 'have', 'down', 'or', 'now', 'off', "hasn't", 'whom', 'most', 'shouldn', 'yours', 'too', 'yourself', 'just', 'has', 'so', "shouldn't", 'aren', "that'll", 'who', 'they', 'll', 'y', 'below', 'as', 'there', 'both', "hadn't", 'through', 'above', "haven't", 'can', 'weren', 'over', 'from', "didn't", 'are', 've', 'themselves', 'at', 'here', 'won', 'more', 'doing', 'further', 'if', 'when', 'be', 'should', 'which', 'own', 'mightn', 'shan', 'theirs', "shan't", 'this', 'hers', 'was', 'until', "you're", 'where', 'then', 'with', 'don', 'does', 'needn', 'do', "don't", 'will', 'about', 'against', 'd', 'any', 'out', 'a', 'the', 'did', 'in', 'myself', 're', "wouldn't", 'couldn', 'same', 'we', 'very', 'between', 'his', 'wouldn', 'him', 'were', 'why', 't', 'under', 'other', 'ain', 'that', 'some', 'herself', 'o', "mightn't", 'up', 'm', 'nor', "you've", 'she', 'ou

## Filter the stopwords out of the sentence

In [21]:
def filter_words(word_list, stop_words=None):
    """Return the tokens of ``word_list`` that are not stopwords.

    Parameters
    ----------
    word_list : iterable of str
        Tokens to filter. Order and duplicates are preserved.
    stop_words : collection of str, optional
        Words to drop. When omitted, the notebook-level ``sw`` set is used,
        so existing one-argument calls behave exactly as before.

    Returns
    -------
    list of str
        The tokens that are not members of the stopword collection.

    Notes
    -----
    Matching is exact and case-sensitive, so tokens should be normalised
    (e.g. lowercased) to the same form as the stopwords before calling.
    Punctuation tokens are kept unless they appear in the stopword collection.
    """
    if stop_words is None:
        # Fall back to the notebook-wide stopword set defined in an earlier cell.
        stop_words = sw
    return [w for w in word_list if w not in stop_words]

In [22]:
# Apply the stopword filter to the tokens of the first sentence.
useful_words = filter_words(word_list)
# Punctuation tokens (',' and '.') survive because they are not in the stopword set.
print(useful_words)

['pleasant', 'day', ',', 'weather', 'cool', 'showers', '.']


In [23]:
from nltk.tokenize import RegexpTokenizer

In [29]:
# Tokenizer that returns every run of ASCII letters/digits, so punctuation
# and whitespace never appear as tokens.
tokenizer = RegexpTokenizer("[a-zA-Z0-9]+")

In [30]:
# Use a dedicated name for this sample: `sents` already holds the list of
# sentences produced by sent_tokenize, and rebinding it to a plain string
# would silently change its type for any cell that is re-run afterwards.
sample_sentence = "send the 50 documents to abc, def, ghi."
# Only alphanumeric runs are kept, so the commas and the full stop are dropped.
print(tokenizer.tokenize(sample_sentence))

['send', 'the', '50', 'documents', 'to', 'abc', 'def', 'ghi']


# Stemming
- Stemming reduces inflected or derived word forms to a common root (the stem); the stem need not be a dictionary word (e.g. teenager => teenag)
- jumping, jump, jumps, jumped => jump

In [31]:
# New sample text for the stemming examples (replaces the earlier paragraph held in `text`).
text = "The quick brown fox was seen jumping over the lazy dog from high wall. Foxes love to make jumps"

In [32]:
# Lowercase the text and split it with the regex tokenizer; `word_list` is
# overwritten with the new tokens.
word_list = tokenizer.tokenize(text.lower())
print(word_list)

['the', 'quick', 'brown', 'fox', 'was', 'seen', 'jumping', 'over', 'the', 'lazy', 'dog', 'from', 'high', 'wall', 'foxes', 'love', 'to', 'make', 'jumps']


## Types of Stemmers 
- Snowball Stemmer (Multilingual)
- Porter Stemmer 
- Lancaster Stemmer 

In [33]:
from nltk.stem.snowball import PorterStemmer, SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

In [34]:
# Porter stemmer instance reused by the examples in the following cells.
ps = PorterStemmer()

In [36]:
# NOTE(review): only the value of the last expression in a cell is displayed,
# so the result for "jumped" is computed but never shown.
ps.stem("jumped")
ps.stem("jumping")

'jump'

In [37]:
# "lovely" is reduced to "love".
ps.stem("lovely")

'love'

In [41]:
# NOTE(review): the two "awesome" calls are bare expressions in the middle of
# the cell, so their results are not displayed.
ps.stem("awesome")
ls = LancasterStemmer()
ls.stem("awesome")

# Same word, two algorithms: Lancaster yields "teen", Porter yields "teenag".
print(ls.stem("teenager"))
print(ps.stem("teenager"))

teen
teenag


In [43]:
# Snowball stemmers are created per language; here the French one is used.
ss = SnowballStemmer('french')
ss.stem('courais')

'cour'