## Natural Language Processing Core

In [1]:
import nltk

### 1.Tokenizing Words and Sentences

In [12]:
paragraph = """Tell General Howard I know his heart. What he told me before, I have it in my heart. I am tired of fighting.
Our Chiefs are killed; Looking Glass is dead, Ta Hool Hool Shute is dead. The old men are all dead. It is the young men who
say yes or no. He who led on the young men is dead. It is cold, and we have no blankets; the little children are freezing 
to death. My people, some of them, have run away to the hills, and have no blankets, no food. No one knows where they are 
– perhaps freezing to death. I want to have time to look for my children, and see how many of them I can find. Maybe I 
shall find them among the dead. Hear me, my Chiefs! I am tired; my heart is sick and sad. From where the sun now stands 
I will fight no more forever."""

In [4]:
sentences = nltk.sent_tokenize(paragraph)   # seperate each sentences of our paragraph
sentences

['Tell General Howard I know his heart.',
 'What he told me before, I have it in my heart.',
 'I am tired of fighting.',
 'Our Chiefs are killed; Looking Glass is dead, Ta Hool Hool Shute is dead.',
 'The old men are all dead.',
 'It is the young men who\nsay yes or no.',
 'He who led on the young men is dead.',
 'It is cold, and we have no blankets; the little children are freezing \nto death.',
 'My people, some of them, have run away to the hills, and have no blankets, no food.',
 'No one knows where they are \n– perhaps freezing to death.',
 'I want to have time to look for my children, and see how many of them I can find.',
 'Maybe I \nshall find them among the dead.',
 'Hear me, my Chiefs!',
 'I am tired; my heart is sick and sad.',
 'From where the sun now stands \nI will fight no more forever.']

In [5]:
words = nltk.word_tokenize(paragraph)       # seperate each words of sentences of our paragraph

In [6]:
words

['Tell',
 'General',
 'Howard',
 'I',
 'know',
 'his',
 'heart',
 '.',
 'What',
 'he',
 'told',
 'me',
 'before',
 ',',
 'I',
 'have',
 'it',
 'in',
 'my',
 'heart',
 '.',
 'I',
 'am',
 'tired',
 'of',
 'fighting',
 '.',
 'Our',
 'Chiefs',
 'are',
 'killed',
 ';',
 'Looking',
 'Glass',
 'is',
 'dead',
 ',',
 'Ta',
 'Hool',
 'Hool',
 'Shute',
 'is',
 'dead',
 '.',
 'The',
 'old',
 'men',
 'are',
 'all',
 'dead',
 '.',
 'It',
 'is',
 'the',
 'young',
 'men',
 'who',
 'say',
 'yes',
 'or',
 'no',
 '.',
 'He',
 'who',
 'led',
 'on',
 'the',
 'young',
 'men',
 'is',
 'dead',
 '.',
 'It',
 'is',
 'cold',
 ',',
 'and',
 'we',
 'have',
 'no',
 'blankets',
 ';',
 'the',
 'little',
 'children',
 'are',
 'freezing',
 'to',
 'death',
 '.',
 'My',
 'people',
 ',',
 'some',
 'of',
 'them',
 ',',
 'have',
 'run',
 'away',
 'to',
 'the',
 'hills',
 ',',
 'and',
 'have',
 'no',
 'blankets',
 ',',
 'no',
 'food',
 '.',
 'No',
 'one',
 'knows',
 'where',
 'they',
 'are',
 '–',
 'perhaps',
 'freezing',
 '

### 2.Stemming
"Stemming is process of reducing infected or derived words to their word stem, base or root form"
###### Words representation may not have any meaning.
##### Takes less time
##### Use stemming when meaning of words are not important for analysis. example spam detection

In [7]:
# import stemming from nltk
from nltk.stem import PorterStemmer

In [8]:
# init the stemming
stemmer = PorterStemmer()

In [12]:
#stemming
for i in range(len(sentences)):
    words = nltk.word_tokenize(sentences[i])
    newwords = [stemmer.stem(word) for word in words]
    sentences[i]= ' '.join(newwords)
    
sentences

['tell gener howard I know hi heart .',
 'what he told me befor , I have it in my heart .',
 'I am tire of fight .',
 'our chief are kill ; look glass is dead , Ta hool hool shute is dead .',
 'the old men are all dead .',
 'It is the young men who say ye or no .',
 'He who led on the young men is dead .',
 'It is cold , and we have no blanket ; the littl children are freez to death .',
 'My peopl , some of them , have run away to the hill , and have no blanket , no food .',
 'No one know where they are – perhap freez to death .',
 'I want to have time to look for my children , and see how mani of them I can find .',
 'mayb I shall find them among the dead .',
 'hear me , my chief !',
 'I am tire ; my heart is sick and sad .',
 'from where the sun now stand I will fight no more forev .']

### 3.Lemmatization
"Same as Stemming but intermediate representation/root form has a meaning"
###### Words representation  have  meaning.
##### Takes more time than Stemming
##### Use Lemmatization when meaning of words are  important for analysis. example Question answer application

In [13]:
# import Lemmatization from NLTK
from nltk.stem import WordNetLemmatizer

In [14]:
sentences = nltk.sent_tokenize(paragraph)   # seperate each sentences of our paragraph

In [15]:
# init the Lemmatization model
lemmatizer = WordNetLemmatizer()

In [16]:
# Lemmatization
for i in range(len(sentences)):
    words = nltk.word_tokenize(sentences[i])
    newwords = [lemmatizer.lemmatize(word) for word in words]
    sentences[i] = ''.join(newwords)
sentences

['TellGeneralHowardIknowhisheart.',
 'Whathetoldmebefore,Ihaveitinmyheart.',
 'Iamtiredoffighting.',
 'OurChiefsarekilled;LookingGlassisdead,TaHoolHoolShuteisdead.',
 'Theoldmenarealldead.',
 'Itistheyoungmenwhosayyesorno.',
 'Hewholedontheyoungmenisdead.',
 'Itiscold,andwehavenoblanket;thelittlechildarefreezingtodeath.',
 'Mypeople,someofthem,haverunawaytothehill,andhavenoblanket,nofood.',
 'Nooneknowwheretheyare–perhapsfreezingtodeath.',
 'Iwanttohavetimetolookformychild,andseehowmanyofthemIcanfind.',
 'MaybeIshallfindthemamongthedead.',
 'Hearme,myChiefs!',
 'Iamtired;myheartissickandsad.',
 'FromwherethesunnowstandIwillfightnomoreforever.']

### 4.Stop Word Removal using nltk

In [2]:
from nltk.corpus import stopwords

In [13]:
sentences = nltk.sent_tokenize(paragraph)   # seperate each sentences of our paragraph

In [14]:
# stop word removal
for i in range(len(sentences)):
    words = nltk.word_tokenize(sentences[i])
    newwords = [word for word in words if word not in stopwords.words('english')]
    sentences[i] = ' '.join(newwords)
sentences

['Tell General Howard I know heart .',
 'What told , I heart .',
 'I tired fighting .',
 'Our Chiefs killed ; Looking Glass dead , Ta Hool Hool Shute dead .',
 'The old men dead .',
 'It young men say yes .',
 'He led young men dead .',
 'It cold , blankets ; little children freezing death .',
 'My people , , run away hills , blankets , food .',
 'No one knows – perhaps freezing death .',
 'I want time look children , see many I find .',
 'Maybe I shall find among dead .',
 'Hear , Chiefs !',
 'I tired ; heart sick sad .',
 'From sun stands I fight forever .']

### 5.Parts of Speech Tagging

In [19]:
paragraph = 'hi! how are you. where from you. '

In [20]:
words = nltk.word_tokenize(paragraph)

In [21]:
tagged_words =nltk.pos_tag(words)

In [22]:
word_tags = []
for tw in tagged_words:
    word_tags.append(tw[0]+"_"+tw[1])
    
tagged_paragraph = ' '.join(word_tags)
tagged_paragraph

'hi_NN !_. how_WRB are_VBP you_PRP ._. where_WRB from_IN you_PRP ._.'

### 6.Named Entity Recognition

In [23]:
paragraph1 = "Fouder of Wavy AI Research Foundation is from Pakistan"

In [24]:
words = nltk.word_tokenize(paragraph1)
words

['Fouder',
 'of',
 'Wavy',
 'AI',
 'Research',
 'Foundation',
 'is',
 'from',
 'Pakistan']

In [25]:
tagged_words = nltk.pos_tag(words)
tagged_words

[('Fouder', 'NN'),
 ('of', 'IN'),
 ('Wavy', 'NNP'),
 ('AI', 'NNP'),
 ('Research', 'NNP'),
 ('Foundation', 'NNP'),
 ('is', 'VBZ'),
 ('from', 'IN'),
 ('Pakistan', 'NNP')]

In [None]:
namedEnt = nltk.ne_chunk(tagged_words)
namedEnt.draw()

### 7.Building a Bags of Word Model

In [33]:
# import libraries
import nltk
import re
import heapq
import numpy as np

In [34]:
paragraph = """Tell General Howard I know his heart. What he told me before, I have it in my heart. I am tired of fighting.
Our Chiefs are killed; Looking Glass is dead, Ta Hool Hool Shute is dead. The old men are all dead. It is the young men who
say yes or no. He who led on the young men is dead. It is cold, and we have no blankets; the little children are freezing 
to death. My people, some of them, have run away to the hills, and have no blankets, no food. No one knows where they are 
– perhaps freezing to death. I want to have time to look for my children, and see how many of them I can find. Maybe I 
shall find them among the dead. Hear me, my Chiefs! I am tired; my heart is sick and sad. From where the sun now stands 
I will fight no more forever."""

In [35]:
dataset= nltk.sent_tokenize(paragraph)

In [48]:
#  Clean the text
for i in range(len(dataset)):
    dataset[i] = dataset[i].lower()
    dataset[i] = re.sub(r'\W',' ', dataset[i])
    dataset[i] = re.sub(r'\s+',' ', dataset[i])

In [49]:
# Creating the histogram
word2count = {}
for data in dataset:
    words = nltk.word_tokenize(data)
    for word in words:
        if word not in word2count.keys():
            word2count[word] = 1
        else:
            word2count[word] += 1

In [50]:
# take a 100 most frequent words from above dictionaries
freq_words = heapq.nlargest(100, word2count, key= word2count.get)

In [51]:
# finally building our BOW model
X = []
for data in dataset:
    vector = []
    for word in freq_words:
        if word in nltk.word_tokenize(data):
            vector.append(1)
        else:
            vector.append(0)
    X.append(vector)

In [68]:
Bag_of_Word_Model = np.asarray(X)
Bag_of_Word_Model

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       [1, 1, 0, ..., 1, 1, 1]])

### 8.Building the TF-IDF Model

In [53]:
# import libraries
import nltk
import re
import heapq
import numpy as np

In [54]:
paragraph = """Tell General Howard I know his heart. What he told me before, I have it in my heart. I am tired of fighting.
Our Chiefs are killed; Looking Glass is dead, Ta Hool Hool Shute is dead. The old men are all dead. It is the young men who
say yes or no. He who led on the young men is dead. It is cold, and we have no blankets; the little children are freezing 
to death. My people, some of them, have run away to the hills, and have no blankets, no food. No one knows where they are 
– perhaps freezing to death. I want to have time to look for my children, and see how many of them I can find. Maybe I 
shall find them among the dead. Hear me, my Chiefs! I am tired; my heart is sick and sad. From where the sun now stands 
I will fight no more forever."""

In [55]:
dataset= nltk.sent_tokenize(paragraph)

In [56]:
#  Clean the text
for i in range(len(dataset)):
    dataset[i] = dataset[i].lower()
    dataset[i] = re.sub(r'\W',' ', dataset[i])
    dataset[i] = re.sub(r'\s+',' ', dataset[i])

In [57]:
# Creating the histogram
word2count = {}
for data in dataset:
    words = nltk.word_tokenize(data)
    for word in words:
        if word not in word2count.keys():
            word2count[word] = 1
        else:
            word2count[word] += 1

In [58]:
# take a 100 most frequent words from above dictionaries
freq_words = heapq.nlargest(100, word2count, key= word2count.get)

In [62]:
# IDF Matrix
word_idfs = {}
for word in  freq_words:
    doc_count = 0
    for data in dataset:
        if word in nltk.word_tokenize(data):
            doc_count += 1
    word_idfs[word] = np.log((len(dataset)/doc_count)+1)

In [63]:
# TF Matrix
tf_matrix = {}
for word in freq_words:
    doc_tf = []
    for data in dataset:
        frequency = 0
        for w in nltk.word_tokenize(data):
            if w== word:
                frequency += 1
        tf_word = frequency/len(nltk.word_tokenize(data))
        doc_tf.append(tf_word)
    tf_matrix[word] = doc_tf

In [69]:
# TF-IDF Calculation
tfidf_matrix = []
for word in tf_matrix.keys():
    tfidf = []
    for value in tf_matrix[word]:
        score = value * word_idfs[word]
        tfidf.append(score)
    tfidf_matrix.append(tfidf)

In [72]:
X = np.asarray(tfidf_matrix)

In [73]:
TF_IDF_Model = np.transpose(X)
TF_IDF_Model

array([[0.16359033, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.10410294, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.22902646, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.12723692, 0.        , 0.15403271, ..., 0.        , 0.        ,
        0.        ],
       [0.09542769, 0.09542769, 0.        , ..., 0.23104906, 0.23104906,
        0.23104906]])

### 9.Building the N-Gram Model

#### 1. N-Gram Modeling - Character Grams

In [13]:
# import library
import random

In [14]:
# data
text= """There are many different types of essay writing and, depending on what you are writing for, the format and approach 
can change. We’ve designed our Essay Writing Center to provide you with key tips and pointers so that you can get started in 
the right direction – no matter if your essay is designed to persuade the college admissions team that you’re the right 
candidate, if you are making your case to win a scholarship, or simply if you are looking for help with your homework There are
many different types of essay writing and, depending on what you are writing for, the format and approach can change. We’ve 
designed our Essay Writing Center to provide you with key tips and pointers so that you can get started in the right direction 
– no matter if your essay is designed to persuade the college admissions team that you’re the right candidate, if you are 
making your case to win a scholarship, or simply if you are looking for help with your homework"""

In [15]:
# creating the n-gram
n = 3
ngrams = {}
for i in range(len(text)-n):
    gram = text[i:i+n]  #text[0:3] = The
    if gram not in ngrams.keys():
        ngrams[gram] = []
    ngrams[gram].append(text[i+n]) #text[0+3] = text[3] = r

In [16]:
# Testing our n-gram model
currentGram = text[0:n]
result = currentGram
for i in range(100):
    if currentGram not in ngrams.keys():
        break
    possibilities = ngrams[currentGram]
    nextItem = possibilities[random.randrange(len(possibilities))]
    result += nextItem
    currentGram = result[len(result)-n: len(result)]
    
print(result)

There looking for, the right 
can change. We’ve designed in a scholarship, or simply is dependidate, if


#### 2. N-Gram Modeling - Word Grams

In [19]:
# import librararies
import random
import nltk

In [20]:
# sample data
text= """There are many different types of essay writing and, depending on what you are writing for, the format and approach 
can change. We’ve designed our Essay Writing Center to provide you with key tips and pointers so that you can get started in 
the right direction – no matter if your essay is designed to persuade the college admissions team that you’re the right 
candidate, if you are making your case to win a scholarship, or simply if you are looking for help with your homework There are
many different types of essay writing and, depending on what you are writing for, the format and approach can change. We’ve 
designed our Essay Writing Center to provide you with key tips and pointers so that you can get started in the right direction 
– no matter if your essay is designed to persuade the college admissions team that you’re the right candidate, if you are 
making your case to win a scholarship, or simply if you are looking for help with your homework"""

In [24]:
# creating the n-gram
n = 3
ngrams = {}
words = nltk.word_tokenize(text)
for i in range(len(words)-n):
    gram = ' '.join(words[i:i+n])
    if gram not in ngrams.keys():
        ngrams[gram] = []
    ngrams[gram].append(words[i+n]) 

In [27]:
# Testing our n-gram model
currentGram = ' '.join(words[0:n])
result = currentGram
for i in range(30):
    if currentGram not in ngrams.keys():
        break
    possibilities = ngrams[currentGram]
    nextItem = possibilities[random.randrange(len(possibilities))]
    result += ' '+nextItem
    rwords = nltk.word_tokenize(result)
    currentGram = ' '.join(rwords[len(rwords)-n:len(rwords)])
    
print(result)

There are many different types of essay writing and , depending on what you are writing for , the format and approach can change . We ’ ve designed our Essay Writing Center


### 10.Latent Semantic Analysis using Scikit Learn and nltk

In [19]:
# import the Libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import nltk

In [4]:
# sample data
dataset = ["The amount of population is increase day by day",
           "The concert was just a great",
          "I love to see Gordon Ramsay cook",
           "Google introducing a new technology",
          "AI Robot are example of AI technology present today",
          "All of us singing in the concert",
          "We have launch compaigns to stop popullation and Global warming"]

In [5]:
# convert into lowercase
dataset = [line.lower() for line in dataset]

In [7]:
# convert into TF-IDF model
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset)

In [8]:
# lSA
lsa = TruncatedSVD(n_components = 4, n_iter = 100)
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=4, n_iter=100,
       random_state=None, tol=0.0)

In [9]:
row1 = lsa.components_[0]

In [18]:
terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_):
    componentTerms = zip(terms, comp)
    sortedTerms = sorted(componentTerms, key=lambda x:x[1], reverse=True)
    sortedTerms = sortedTerms[:10]
    print("\nConcept",i, ":")
    for term in sortedTerms:
        print(term)



Concept 0 :
('the', 0.4259712396390564)
('concert', 0.39998286562925545)
('of', 0.2938250158823831)
('great', 0.24619048735802163)
('just', 0.24619048735802163)
('was', 0.24619048735802163)
('day', 0.23699958677792862)
('all', 0.23566672236585795)
('in', 0.23566672236585795)
('singing', 0.23566672236585795)

Concept 1 :
('technology', 0.4518449046133094)
('ai', 0.3979413369682742)
('google', 0.3453644610279633)
('introducing', 0.3453644610279633)
('new', 0.3453644610279633)
('are', 0.19897066848413705)
('example', 0.19897066848413705)
('present', 0.19897066848413705)
('robot', 0.19897066848413705)
('today', 0.19897066848413705)

Concept 2 :
('to', 0.4157884439670068)
('cook', 0.2835916579351077)
('gordon', 0.2835916579351077)
('love', 0.2835916579351077)
('ramsay', 0.2835916579351077)
('see', 0.2835916579351077)
('and', 0.2173064471129242)
('compaigns', 0.2173064471129242)
('global', 0.2173064471129242)
('have', 0.2173064471129242)

Concept 3 :
('day', 0.5130530624437486)
('amount', 0

In [20]:
# concept word
concept_words = {}
terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_):
    componentTerms = zip(terms, comp)
    sortedTerms = sorted(componentTerms, key=lambda x:x[1], reverse=True)
    sortedTerms = sortedTerms[:10]
    concept_words["Concept "+str(i)] = sortedTerms

In [25]:
#now concept are belong from which categories take the probablities
for key in concept_words.keys():
    sentence_score = []
    for sentence in dataset:
        words= nltk.word_tokenize(sentence)
        score = 0
        for word in words:
            for word_with_score in concept_words[key]:
                if word == word_with_score[0]:
                    score += word_with_score[1]
        sentence_score.append(score)
    print("\n"+key+":")
    for sentence_scores in sentence_score:
        print(sentence_scores)


Concept 0:
1.1937954290772967
1.5645255673423768
0
0
0.2938250158823831
1.8267792882482687
0

Concept 1:
0
0
0
1.4879382876971992
2.242580920970543
0
0

Concept 2:
0
0
1.8337467336425453
0
0
0
1.2850142324187037

Concept 3:
2.4873933898695775
0
0
0
0.5295107205666278
0.17865460887270906
0


### 11.Synonyms and Antonyms using nltk

In [1]:
 # find Synonyms and Antonyms of word
from nltk.corpus import wordnet

synonyms = []
antonyms = []

for syn in wordnet.synsets("good"):
    for s in syn.lemmas():
        synonyms.append(s.name())
        for a in s.antonyms():
            antonyms.append(a.name())
            
print(set(synonyms))
print(set(antonyms))

{'respectable', 'in_force', 'in_effect', 'full', 'secure', 'good', 'commodity', 'beneficial', 'proficient', 'practiced', 'safe', 'undecomposed', 'serious', 'trade_good', 'effective', 'ripe', 'honorable', 'estimable', 'skilful', 'skillful', 'expert', 'salutary', 'adept', 'honest', 'goodness', 'sound', 'unspoilt', 'dependable', 'just', 'upright', 'right', 'near', 'unspoiled', 'thoroughly', 'soundly', 'dear', 'well'}
{'evilness', 'badness', 'bad', 'ill', 'evil'}


### 12.Word Negation Tracking

In [9]:
# import librararies
import nltk
from nltk.corpus import wordnet

In [10]:
# sample data
sentence = "I was not happy with the team's performance"

In [11]:
# word tokenize
words = nltk.word_tokenize(sentence)

In [12]:
# convert our sample data into I was not_happy with the team's performance
new_words = []
temp_word = ""
for word in  words:
    if word == "not":
        temp_word = "not_"
    elif temp_word == "not_":
        word = temp_word + word  # not_happy
        temp_word = ""
    if word != "not":
        new_words.append(word)
sentence = ' '.join(new_words) 
sentence

"I was not_happy with the team 's performance"

In [14]:
# Now convert into some meaning of negation like  I was unhappy with the team's performance
new_words = []
temp_word = ""
for word in  words:
    antonyms = []
    if word == "not":
        temp_word = "not_"
    elif temp_word == "not_":
        for syn in wordnet.synsets(word):
            for s in syn.lemmas():
                for a in s.antonyms():
                    antonyms.append(a.name())
        if len(antonyms) >= 1:
            word = antonyms[0]
        else:
            word = temp_word + word
        temp_word = ""
    if word != "not":
        new_words.append(word)
sentence = ' '.join(new_words) 
sentence

"I was unhappy with the team 's performance"