In [0]:
# Importing necessary library
import pandas as pd
import numpy as np
import nltk
import os
import nltk.corpus

In [0]:
# Short sample paragraph reused by the tokenization examples in this notebook.
text = (
    "I am a good boy. I love dogs very much. "
    "Dogs also love me. I really want a pet dog."
)

In [0]:
# THE FIRST STEP OF NLP IS TOKENISATION

# TOKENISATION BREAKS SENTENCES INTO THEIR BASIC UNITS (TOKENS)

In [0]:
# importing word_tokenize from nltk
from nltk.tokenize import word_tokenize

In [4]:
# Fetch the Punkt tokenizer data used by word_tokenize, in case it is missing.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
# Split the sample text into word and punctuation tokens,
# then show the resulting list as the cell output.
token = word_tokenize(text)
token

['I',
 'am',
 'a',
 'good',
 'boy',
 '.',
 'I',
 'love',
 'dogs',
 'very',
 'much',
 '.',
 'Dogs',
 'also',
 'love',
 'me',
 '.',
 'I',
 'really',
 'want',
 'a',
 'pet',
 'dog',
 '.']

In [6]:
# Count how often each distinct token occurs.
# FreqDist is built directly from the token list.
from nltk.probability import FreqDist
fdist = FreqDist(token)
fdist

FreqDist({'.': 4,
          'Dogs': 1,
          'I': 3,
          'a': 2,
          'also': 1,
          'am': 1,
          'boy': 1,
          'dog': 1,
          'dogs': 1,
          'good': 1,
          'love': 2,
          'me': 1,
          'much': 1,
          'pet': 1,
          'really': 1,
          'very': 1,
          'want': 1})

In [7]:
# The ten most frequent tokens, as (token, count) pairs
fdist1 = fdist.most_common(10)
fdist1

[('.', 4),
 ('I', 3),
 ('a', 2),
 ('love', 2),
 ('am', 1),
 ('good', 1),
 ('boy', 1),
 ('dogs', 1),
 ('very', 1),
 ('much', 1)]

In [0]:
# THE SECOND STEP IS STEMMING

In [0]:
# STEMMING REDUCES WORDS TO THEIR ROOT FORM, E.G. "DOGS" TO "DOG".

# TWO COMMON STEMMING ALGORITHMS ARE USED BELOW:
#1. PORTER STEMMING
#2. LANCASTER STEMMING

In [9]:
# Create a Porter stemmer and try it on a single word
from nltk.stem import PorterStemmer
pst = PorterStemmer()
pst.stem('caring')

'care'

In [10]:
# Testing the Porter stemmer on several forms of the same verb
stm = ["going", "go", "gone"]
for word in stm:
    print(f"{word}:{pst.stem(word)}")

going:go
go:go
gone:gone


In [11]:
# Run the Porter stemmer on the same word in lower case and capitalised
stm = ["dogs", "Dogs"]
for word in stm:
    print(f"{word}:{pst.stem(word)}")

dogs:dog
Dogs:dog


In [12]:
# Create a Lancaster stemmer and apply it to forms of the verb "give"
from nltk.stem import LancasterStemmer
lst = LancasterStemmer()
stm1 = ["giving", "given", "given", "gave"]
for word in stm1:
    print(f"{word}:{lst.stem(word)}")

giving:giv
given:giv
given:giv
gave:gav


In [14]:
# Lancaster stemmer applied to forms of "care"
stm1 = ["cares", "care", "caring"]
for word in stm1:
    print(f"{word}:{lst.stem(word)}")

cares:car
care:car
caring:car


In [0]:
# THE THIRD STEP IS LEMMATIZATION
# LEMMATIZATION ALSO MAPS A WORD TO ITS ROOT FORM, SOMEWHAT SIMILAR TO STEMMING

In [0]:
# Create the WordNet-based lemmatizer used in the examples below
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [18]:
# Fetch the WordNet corpus that the lemmatizer looks words up in
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [22]:
# Lemmatize two sample words and print each next to its lemma
print(f"cares : {lemmatizer.lemmatize('cares')}")
print(f"corpora : {lemmatizer.lemmatize('corpora')}")

cares : care
corpora : corpus


In [0]:
# Build the set of English stop words used to filter tokens later on.
# The stopwords corpus must exist locally before stopwords.words() is called,
# so it is fetched here instead of relying on the download cell that follows
# (quiet=True suppresses the download log).
from nltk.corpus import stopwords
nltk.download('stopwords', quiet=True)
a = set(stopwords.words('english'))

In [24]:
# Fetch the stop-word lists (corpora/stopwords)
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
# Lower-case the sample text; the variable `text` is rebound to the result
text = text.lower()

In [27]:
# Display the text after lower-casing
text

'i am a good boy. i love dogs very much. dogs also love me. i really want a pet dog.'

In [0]:
# Tokenize again; `token` is rebound to the tokens of the current `text`
token = word_tokenize(text)

In [30]:
# Display the current token list
token

['i',
 'am',
 'a',
 'good',
 'boy',
 '.',
 'i',
 'love',
 'dogs',
 'very',
 'much',
 '.',
 'dogs',
 'also',
 'love',
 'me',
 '.',
 'i',
 'really',
 'want',
 'a',
 'pet',
 'dog',
 '.']

In [33]:
# Keep only the tokens that are NOT in the stop-word set `a`.
# The result gets its own name so that it does not overwrite the `stopwords`
# corpus reader imported from nltk.corpus (and because the list holds the
# remaining tokens, not the stop words themselves).
filtered_tokens = [w for w in token if w not in a]
print(filtered_tokens)

['good', 'boy', '.', 'love', 'dogs', 'much', '.', 'dogs', 'also', 'love', '.', 'really', 'want', 'pet', 'dog', '.']


In [37]:
#TAGGING SENTENCES WITH THE APPROPRIATE PARTS OF SPEECH (POS TAGGING)
# nltk.pos_tag needs the averaged perceptron tagger model. Fetch it here so
# the cell also works when the notebook is run top to bottom on a fresh
# kernel (quiet=True suppresses the download log).
nltk.download('averaged_perceptron_tagger', quiet=True)
print(nltk.pos_tag(token))

[('i', 'NN'), ('am', 'VBP'), ('a', 'DT'), ('good', 'JJ'), ('boy', 'NN'), ('.', '.'), ('i', 'JJ'), ('love', 'VBP'), ('dogs', 'NNS'), ('very', 'RB'), ('much', 'RB'), ('.', '.'), ('dogs', 'NNS'), ('also', 'RB'), ('love', 'VBP'), ('me', 'PRP'), ('.', '.'), ('i', 'VB'), ('really', 'RB'), ('want', 'VB'), ('a', 'DT'), ('pet', 'JJ'), ('dog', 'NN'), ('.', '.')]


In [36]:
# Fetch the averaged perceptron model used by nltk.pos_tag
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [0]:
#NAMED ENTITY RECOGNITION

In [0]:
# Sample sentence containing a person, an organisation and a place name
text1 = (
    "My name is Amartya Dutta and I am doing a challenge "
    "at IEEE , California."
)

In [40]:
# Fetch the maxent named-entity chunker model (chunkers/maxent_ne_chunker)
nltk.download("maxent_ne_chunker")

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.


True

In [42]:
# Fetch the "words" corpus (corpora/words)
nltk.download("words")

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [53]:
# ne_chunk is the NLTK function that groups POS-tagged tokens into entities
from nltk import ne_chunk
# Pipeline: raw sentence -> tokens -> (token, POS tag) pairs -> entity tree
token1 = word_tokenize(text1)
tags = nltk.pos_tag(token1)
chunk = ne_chunk(tags)
print(chunk)

(S
  My/PRP$
  name/NN
  is/VBZ
  (PERSON Amartya/NNP Dutta/NNP)
  and/CC
  I/PRP
  am/VBP
  doing/VBG
  a/DT
  challenge/NN
  at/IN
  (ORGANIZATION IEEE/NNP)
  ,/,
  (GPE California/NNP)
  ./.)
