### NLP

Computers to process and analyze large amount of natural data. Computers can read text, hear speech, interpret it.

#### Examples
Online chatbox, Speech Recognition etc

### Tokenization

In [3]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bhawindhital/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
#Tokenization
#Import libraries
#import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

text = "I believe this would help the reader understand how tokenization \
        works. as well as realize its importance."
        
sents = (sent_tokenize(text))
print(sents)
print(word_tokenize(text))

words = [word_tokenize(sent) for sent in sents]
print(words)
#perform tokenization


['I believe this would help the reader understand how tokenization         works.', 'as well as realize its importance.']
['I', 'believe', 'this', 'would', 'help', 'the', 'reader', 'understand', 'how', 'tokenization', 'works', '.', 'as', 'well', 'as', 'realize', 'its', 'importance', '.']
[['I', 'believe', 'this', 'would', 'help', 'the', 'reader', 'understand', 'how', 'tokenization', 'works', '.'], ['as', 'well', 'as', 'realize', 'its', 'importance', '.']]


##### Stop word removal

In [6]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bhawindhital/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
#Import libraries
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

text = "I believe this would help the reader understand how tokenization \
        works. as well as realize its importance (text) ."
        
print(punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [9]:
custom_list = set(stopwords.words('english')+list(punctuation))

word_list = [word for word in word_tokenize(text) if word not in custom_list]
print(word_list)

['I', 'believe', 'would', 'help', 'reader', 'understand', 'tokenization', 'works', 'well', 'realize', 'importance', 'text']


#### N-grams

In [11]:
#N-grams
from nltk.collocations import BigramCollocationFinder

word_list = ['I', 'believe', 'would', 'help', 'reader', 'understand', \
             'tokenization', 'works', 'well', 'realize', 'importance', 'text']

finde = BigramCollocationFinder.from_words(word_list)
print(finde.ngram_fd.items())

dict_items([(('I', 'believe'), 1), (('believe', 'would'), 1), (('would', 'help'), 1), (('help', 'reader'), 1), (('reader', 'understand'), 1), (('understand', 'tokenization'), 1), (('tokenization', 'works'), 1), (('works', 'well'), 1), (('well', 'realize'), 1), (('realize', 'importance'), 1), (('importance', 'text'), 1)])


#### Stemming

In [12]:
#Stemming
#Import Libraries
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer

l_s = LancasterStemmer()
new_text = "It is important to by very pythonly while you are pythoning\
             with python. All pythoners have pythoned poorly at least once."
             
stem_lan =  [l_s.stem(word) for word in word_tokenize(new_text)] 
print(stem_lan)       

['it', 'is', 'import', 'to', 'by', 'very', 'python', 'whil', 'you', 'ar', 'python', 'with', 'python', '.', 'al', 'python', 'hav', 'python', 'poor', 'at', 'least', 'ont', '.']


#### Word Sense Disambiguation

In [14]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/bhawindhital/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [15]:
#Word Sense Disambiguation
#Import Libraries
#nltk.download('wordnet')
from nltk.corpus import wordnet
for ss in wordnet.synsets('mouse'):
    print(ss, ss.definition())


from nltk.wsd import lesk
from nltk.tokenize import word_tokenize

context_1 = lesk(word_tokenize("Sing in a lower tone, along with the bass"), "bass")
print(context_1, context_1.definition())

context_2 = lesk(word_tokenize("The sea bass really very hard to catch"), "bass")
print(context_2, context_2.definition())

context_3 = lesk(word_tokenize("My mouse is not working, need to change it"), "mouse")
print(context_3, context_3.definition())


#"Sing in a lower tone, along with the bass"
#"The sea bass really very hard to catch"
#"My mouse is not working, need to change it"




Synset('mouse.n.01') any of numerous small rodents typically resembling diminutive rats having pointed snouts and small ears on elongated bodies with slender usually hairless tails
Synset('shiner.n.01') a swollen bruise caused by a blow to the eye
Synset('mouse.n.03') person who is quiet or timid
Synset('mouse.n.04') a hand-operated electronic device that controls the coordinates of a cursor on your computer screen as you move it around on a pad; on the bottom of the device is a ball that rolls on the surface of the pad
Synset('sneak.v.01') to go stealthily or furtively
Synset('mouse.v.02') manipulate the mouse of a computer
Synset('bass.n.07') the member with the lowest range of a family of musical instruments
Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae
Synset('mouse.n.04') a hand-operated electronic device that controls the coordinates of a cursor on your computer screen as you move it around on a pad; on the bottom of the device is a ball that

#### Count Vectorizer

In [17]:
import pandas as pd
corpus = [
     'This is the first document from heaven',
     'but the second document is from mars',
     'And this is the third one from nowhere',
     'Is this the first document from nowhere?',
]

df = pd.DataFrame({'text':corpus})
df

Unnamed: 0,text
0,This is the first document from heaven
1,but the second document is from mars
2,And this is the third one from nowhere
3,Is this the first document from nowhere?


In [19]:
from sklearn.feature_extraction.text import CountVectorizer
count_v = CountVectorizer()
X = count_v.fit_transform(df.text).toarray()
print(X)
print(count_v.vocabulary_)

[[0 0 1 1 1 1 1 0 0 0 0 1 0 1]
 [0 1 1 0 1 0 1 1 0 0 1 1 0 0]
 [1 0 0 0 1 0 1 0 1 1 0 1 1 1]
 [0 0 1 1 1 0 1 0 1 0 0 1 0 1]]
{'this': 13, 'is': 6, 'the': 11, 'first': 3, 'document': 2, 'from': 4, 'heaven': 5, 'but': 1, 'second': 10, 'mars': 7, 'and': 0, 'third': 12, 'one': 9, 'nowhere': 8}


In [20]:
count_v = CountVectorizer(stop_words=['this','is'])
X = count_v.fit_transform(df.text).toarray()
print(X)
print(count_v.vocabulary_)

[[0 0 1 1 1 1 0 0 0 0 1 0]
 [0 1 1 0 1 0 1 0 0 1 1 0]
 [1 0 0 0 1 0 0 1 1 0 1 1]
 [0 0 1 1 1 0 0 1 0 0 1 0]]
{'the': 10, 'first': 3, 'document': 2, 'from': 4, 'heaven': 5, 'but': 1, 'second': 9, 'mars': 6, 'and': 0, 'third': 11, 'one': 8, 'nowhere': 7}


### tfidf

In [21]:
#TD-IDF
#Import Libraries
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
     'This is the first document from heaven',
     'but the second document is from mars',
     'And this is the third one from nowhere',
     'Is this the first document from nowhere?',
]

vector = TfidfVectorizer()
vector.fit(corpus)
print(vector.vocabulary_)
print(vector.idf_)

{'this': 13, 'is': 6, 'the': 11, 'first': 3, 'document': 2, 'from': 4, 'heaven': 5, 'but': 1, 'second': 10, 'mars': 7, 'and': 0, 'third': 12, 'one': 9, 'nowhere': 8}
[1.91629073 1.91629073 1.22314355 1.51082562 1.         1.91629073
 1.         1.91629073 1.51082562 1.91629073 1.91629073 1.
 1.91629073 1.22314355]


#### Hashing

In [23]:
#Hashing
#Import Libraries
from sklearn.feature_extraction.text import HashingVectorizer
import pandas as pd
corpus = [
     'This is the first document from heaven',
     'but the second document is from mars',
     'And this is the third one from nowhere',
     'Is this the first document from nowhere?',
]

df = pd.DataFrame({'text':corpus})

hash_v = HashingVectorizer(n_features=8, norm=None,alternate_sign=False)
hash_v.fit_transform(df.text).toarray()

array([[2., 1., 0., 0., 1., 1., 2., 0.],
       [2., 0., 0., 1., 1., 1., 2., 0.],
       [0., 0., 0., 0., 2., 3., 3., 0.],
       [2., 0., 0., 0., 1., 1., 3., 0.]])