## Natural Language Processing Core

In [None]:
import nltk

# Fetch only the resources this notebook uses. A bare nltk.download() opens the
# interactive downloader, which blocks "Restart & Run All" and needs a display.
# NOTE(review): resource names differ between NLTK releases (newer releases look
# for the *_tab / *_eng variants) - confirm against the installed version.
for resource in (
    "punkt", "punkt_tab",                                            # sentence / word tokenizers
    "wordnet", "omw-1.4",                                            # WordNetLemmatizer data
    "stopwords",                                                     # stop-word lists
    "averaged_perceptron_tagger", "averaged_perceptron_tagger_eng",  # POS tagger
    "maxent_ne_chunker", "maxent_ne_chunker_tab", "words",           # named-entity chunker
):
    # Requested one at a time so an identifier unknown to this release does not
    # stop the remaining downloads.
    nltk.download(resource, quiet=True)

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


### 1.Tokenizing Words and Sentences

In [2]:
# Sample text reused as input by the tokenizing, stemming, lemmatization, stop-word and POS-tagging cells below.
paragraph = """Tell General Howard I know his heart. What he told me before, I have it in my heart. I am tired of fighting.
Our Chiefs are killed; Looking Glass is dead, Ta Hool Hool Shute is dead. The old men are all dead. It is the young men who
say yes or no. He who led on the young men is dead. It is cold, and we have no blankets; the little children are freezing 
to death. My people, some of them, have run away to the hills, and have no blankets, no food. No one knows where they are 
– perhaps freezing to death. I want to have time to look for my children, and see how many of them I can find. Maybe I 
shall find them among the dead. Hear me, my Chiefs! I am tired; my heart is sick and sad. From where the sun now stands 
I will fight no more forever."""

In [None]:
sentences = nltk.sent_tokenize(paragraph)   # split the paragraph into its individual sentences

In [None]:
words = nltk.word_tokenize(paragraph)       # split the whole paragraph into word tokens

### 2.Stemming
"Stemming is the process of reducing inflected or derived words to their word stem, base or root form"
###### The resulting word representations may not have any meaning.
##### Takes less time
##### Use stemming when the meaning of words is not important for the analysis, for example spam detection.

In [5]:
# import stemming from nltk
from nltk.stem import PorterStemmer

In [6]:
# Create the Porter stemmer instance whose .stem() is applied to every token in the stemming cell below
stemmer = PorterStemmer()

In [None]:
# Stemming: replace every sentence, in place, with the space-joined stems of its tokens.
# A comprehension keeps the loop temporaries out of the notebook namespace, so names
# such as `words` that other cells define are not overwritten by this cell.
sentences[:] = [
    ' '.join(stemmer.stem(token) for token in nltk.word_tokenize(sentence))
    for sentence in sentences
]

### 3.Lemmatization
"Same as stemming, but the intermediate representation/root form has a meaning"
###### The resulting word representations have meaning.
##### Takes more time than stemming
##### Use lemmatization when the meaning of words is important for the analysis, for example a question-answering application.

In [None]:
# import Lemmatization from NLTK
from nltk.stem import WordNetLemmatizer

In [None]:
sentences = nltk.sent_tokenize(paragraph)   # re-split the raw paragraph so this section does not start from the stemmed sentences

In [None]:
# Create the WordNet lemmatizer instance whose .lemmatize() is applied to every token in the lemmatization cell below
lemmatizer = WordNetLemmatizer()

In [None]:
# Lemmatization: replace every sentence, in place, with the space-joined lemmas of its tokens.
# The separator must be a single space: joining on '' fuses all tokens of a sentence
# into one unbroken string, unlike the stemming and stop-word cells.
# A comprehension keeps the loop temporaries out of the notebook namespace, so names
# such as `words` that other cells define are not overwritten by this cell.
sentences[:] = [
    ' '.join(lemmatizer.lemmatize(token) for token in nltk.word_tokenize(sentence))
    for sentence in sentences
]

### 4.Stop Word Removal using nltk

In [None]:
from nltk.corpus import stopwords

In [None]:
sentences = nltk.sent_tokenize(paragraph)   # re-split the raw paragraph so this section does not start from the lemmatized sentences

In [None]:
# Stop word removal: replace every sentence, in place, with its non-stop-word tokens.
# The stop-word collection is built once, as a set: evaluating stopwords.words('english')
# inside the filter rebuilds the list for every single token, and membership tests on a
# list are linear scans.
english_stopwords = set(stopwords.words('english'))

# Tokens are compared in lowercase because the stop-word list is lowercase; a
# case-sensitive test would let capitalised stop words ("The", "It", "What") through.
# The surviving tokens keep their original casing.
sentences[:] = [
    ' '.join(
        token
        for token in nltk.word_tokenize(sentence)
        if token.lower() not in english_stopwords
    )
    for sentence in sentences
]

### 5.Parts of Speech Tagging

In [None]:
# Tokenize the whole paragraph into words; these tokens are the input to the POS tagger in the next cell
words = nltk.word_tokenize(paragraph)

In [None]:
# Tag every token with its part of speech; each entry is read as [0] (word) and [1] (tag) by the next cell
tagged_words =nltk.pos_tag(words)

In [None]:
# Render every tagged entry as "<word>_<tag>" ...
word_tags = [pair[0] + "_" + pair[1] for pair in tagged_words]

# ... and stitch the rendered entries back into a single space-separated string.
tagged_paragraph = ' '.join(word_tags)

### 6.Named Entity Recognition

In [None]:
# Example sentence for the named-entity section. The variable must be spelled
# `paragraph1`, because that is the name the tokenizing cell right after this one reads.
paragraph1 = "Fouder of Wavy AI Research Foundation is from Pakistan"

In [None]:
# Tokenize the example sentence into words
words = nltk.word_tokenize(paragraph1)

In [None]:
# POS-tag the tokens; the tagged tokens are what ne_chunk consumes in the next cell
tagged_words = nltk.pos_tag(words)

In [None]:
# Run the named-entity chunker over the POS-tagged tokens
namedEnt = nltk.ne_chunk(tagged_words)
# NOTE(review): draw() presumably opens a separate GUI window rather than rendering inline -
# confirm it works in the target kernel (it may fail or block on a headless machine).
namedEnt.draw()

### 7.Building a Bag of Words Model

In [None]:
# import libraries
import nltk
import re
import heapq
import numpy as np

In [None]:
# Input text for the bag-of-words section; it is split into sentences (`dataset`) in the next cell.
paragraph = """Tell General Howard I know his heart. What he told me before, I have it in my heart. I am tired of fighting.
Our Chiefs are killed; Looking Glass is dead, Ta Hool Hool Shute is dead. The old men are all dead. It is the young men who
say yes or no. He who led on the young men is dead. It is cold, and we have no blankets; the little children are freezing 
to death. My people, some of them, have run away to the hills, and have no blankets, no food. No one knows where they are 
– perhaps freezing to death. I want to have time to look for my children, and see how many of them I can find. Maybe I 
shall find them among the dead. Hear me, my Chiefs! I am tired; my heart is sick and sad. From where the sun now stands 
I will fight no more forever."""

In [None]:
# Split the paragraph into sentences; each sentence becomes one row of the bag-of-words matrix built below
dataset= nltk.sent_tokenize(paragraph)

In [None]:
# Normalize every sentence in place: lowercase it, turn each non-word character
# into a space, then collapse runs of whitespace into a single space.
for idx, sentence in enumerate(dataset):
    lowered = sentence.lower()
    without_symbols = re.sub(r'\W',' ', lowered)
    dataset[idx] = re.sub(r'\s+',' ', without_symbols)

In [None]:
# Build the histogram: map every token to the number of times it occurs across all sentences
word2count = {}
for sentence in dataset:
    for token in nltk.word_tokenize(sentence):
        # a token seen for the first time starts from 0, so it ends up with a count of 1
        word2count[token] = word2count.get(token, 0) + 1

In [None]:
# Keep the (at most) 100 most frequent words, ranked by their count in word2count;
# these words become the columns of the bag-of-words matrix
freq_words = heapq.nlargest(100, word2count, key= word2count.get)

In [None]:
# Build the binary bag-of-words model: one row per sentence in `dataset`, one column
# per word in `freq_words`, holding 1 when the word occurs in the sentence and 0 otherwise.
X = []
for data in dataset:
    # Tokenize each sentence once (not once per frequent word) and keep the tokens
    # in a set so every membership test below is a constant-time lookup.
    tokens = set(nltk.word_tokenize(data))
    X.append([1 if word in tokens else 0 for word in freq_words])

In [None]:
# Convert the list of 0/1 row vectors into a NumPy array (one row per sentence, one column per entry of freq_words)
X = np.asarray(X)