# **Natural Language Processing**

In [1]:
import nltk

## **Tokenize words**  
"How are you" => ["How", "are", "you"]

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [3]:
data = "All work and no play makes jack a dull boy. All work and no play !"
print(word_tokenize(data))

['All', 'work', 'and', 'no', 'play', 'makes', 'jack', 'a', 'dull', 'boy', '.', 'All', 'work', 'and', 'no', 'play', '!']


## **NLTK Stop words**

In [4]:
data = "All work and no play makes jack dull boy. All work and no play makes jack a dull boy."
words = word_tokenize(data)
print(words)

['All', 'work', 'and', 'no', 'play', 'makes', 'jack', 'dull', 'boy', '.', 'All', 'work', 'and', 'no', 'play', 'makes', 'jack', 'a', 'dull', 'boy', '.']


In [5]:
from nltk.corpus import stopwords

In [6]:
stopWords = set(stopwords.words('english'))

In [7]:
print(len(stopWords))
print(stopWords)

179
{'few', 'ourselves', 'by', 'such', 'shan', 'had', 're', 'our', 'with', "that'll", 'and', "mightn't", 'each', 'why', 'other', 'he', 'do', 'herself', 'it', 'hasn', 'doing', 'wouldn', 't', 'below', "hadn't", 'been', 'they', 'then', "should've", 'down', "aren't", 'couldn', 'from', 'how', 'what', 'this', 'were', "didn't", 'my', "you're", 'on', 'shouldn', 'himself', 'which', 'its', 'having', 'should', 'after', 'mightn', 'so', 'only', 's', 'where', "you've", "you'll", 'ours', 'me', "it's", 'weren', 'has', 'too', 'we', 'those', 'now', 'no', 'have', 'at', 'into', 'o', 'until', 'under', 'itself', 'haven', 'own', 'more', 'ma', 'you', 'their', 'be', 'over', 'during', 'that', 'is', 'these', 'won', 'between', 'hers', "needn't", 'y', 'if', 'theirs', 'does', 'for', 'am', 'a', "shouldn't", 'when', 'than', 'here', 'ain', 'above', 'all', 'doesn', 've', 'but', 'very', 'are', 'his', 'did', 'will', 'just', "mustn't", 'nor', "you'd", 'her', 'don', 'once', 'off', "she's", 'as', 'him', 'aren', 'hadn', 'abo

## **NLTK - Stemming**

<img src='stop_words.png'>

In [8]:
from nltk.stem import PorterStemmer

In [9]:
words = ["game","gaming","gamed","games"]
ps = PorterStemmer()
 
for word in words:
    print(ps.stem(word))


game
game
game
game


In [10]:
words = ['study', 'studying', 'studies', 'studious']

for word in words:
    print(word + ":" + ps.stem(word))


study:studi
studying:studi
studies:studi
studious:studiou


## **Lemmetization**

In [11]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

sentence_words = ['study', 'studying', 'studies', 'studious']

for word in sentence_words:
    print ("{0:20}{1:20}".format(word,wordnet_lemmatizer.lemmatize(word, pos="v")))


study               study               
studying            study               
studies             study               
studious            studious            


## **NLTK - Speech Tagging**

<img src='speech_tagging.png'>

In [12]:
import nltk
 
document = 'Whether you\'re new to programming or an experienced developer, it\'s easy \
to learn and use Python.'
sentences = nltk.sent_tokenize(document)   
for word in sentences:
    print(nltk.pos_tag(nltk.word_tokenize(word)))


[('Whether', 'IN'), ('you', 'PRP'), ("'re", 'VBP'), ('new', 'JJ'), ('to', 'TO'), ('programming', 'VBG'), ('or', 'CC'), ('an', 'DT'), ('experienced', 'JJ'), ('developer', 'NN'), (',', ','), ('it', 'PRP'), ("'s", 'VBZ'), ('easy', 'JJ'), ('to', 'TO'), ('learn', 'VB'), ('and', 'CC'), ('use', 'VB'), ('Python', 'NNP'), ('.', '.')]


In [13]:
document = 'Today the Netherlands celebrates King\'s Day. To honor this tradition, the Dutch embassy in San Francisco invited me to'
sentences = nltk.sent_tokenize(document)   
 
data = []
for sent in sentences:
    data = data + nltk.pos_tag(nltk.word_tokenize(sent))
 
for word in data: 
    if 'NNP' in word[1]: 
        print(word)


('Netherlands', 'NNP')
('King', 'NNP')
('Day', 'NNP')
('San', 'NNP')
('Francisco', 'NNP')


## **Count Vectorizer**

In [14]:
corpus = [ "This is my first corpus", "Processing it for ML", "Doing ML is awesome",
    "This is fun to look at", "ML is life, ML is interest"
]

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [16]:
x = cv.fit_transform(corpus)
x

<5x17 sparse matrix of type '<class 'numpy.int64'>'
	with 23 stored elements in Compressed Sparse Row format>

In [17]:
x.toarray()

array([[0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0],
       [0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 2, 0, 0, 0, 0]], dtype=int64)

In [18]:
cv.vocabulary_

{'this': 15,
 'is': 8,
 'my': 13,
 'first': 4,
 'corpus': 2,
 'processing': 14,
 'it': 9,
 'for': 5,
 'ml': 12,
 'doing': 3,
 'awesome': 1,
 'fun': 6,
 'to': 16,
 'look': 11,
 'at': 0,
 'life': 10,
 'interest': 7}

In [19]:
cv.get_feature_names()

['at',
 'awesome',
 'corpus',
 'doing',
 'first',
 'for',
 'fun',
 'interest',
 'is',
 'it',
 'life',
 'look',
 'ml',
 'my',
 'processing',
 'this',
 'to']

## **Wordcloud**

In [20]:
import pandas as pd
spam_df = pd.read_csv("spam.csv", encoding='ISO-8859-1', engine='c')
spam_df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [21]:
# select the required coolumns
spam_df = spam_df[['v1', 'v2']]
spam_df.rename(columns={'v1':'target', 'v2':'text'}, inplace=True)

In [22]:
spam_df.head(2)

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...


In [23]:
spam_df.shape

(5572, 2)

In [24]:
spam_df.target.value_counts()

ham     4825
spam     747
Name: target, dtype: int64

In [1]:
from wordcloud import Wordcloud

ModuleNotFoundError: No module named 'wordcloud'

In [26]:
len(spam_list)

NameError: name 'spam_list' is not defined

In [28]:
spam = " ".join(spam_list)
spam[:100]


NameError: name 'spam_list' is not defined

In [29]:
# create a word cloud for spam
spam_wordcloud = WordCloud().generate(spam)

NameError: name 'WordCloud' is not defined

In [30]:
plt.figure()

NameError: name 'plt' is not defined

In [31]:
plt.imshow(spam_wordcloud)
plt.show()

NameError: name 'plt' is not defined