# Ex -1 
Perform the following task using NLTK: Tokenize and tag some text, identify named entities,
display a parse tree, and find the ambiguity of the sentence using parse tree.

# Tokenize the sentence using PunktSentenceTokenizer

In [1]:
#Importing the required modules
import nltk
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import webtext

In [2]:
#Create a String variable to tokenize
raw_text = """Hi! How are you?
This is my first nlp lab
"""

In [3]:
#Tokenizing the raw text
tokens = PunktSentenceTokenizer(raw_text)
tokens = tokens.tokenize(raw_text)

In [4]:
#Printing the Tokens
tokens

['Hi!', 'How are you?', 'This is my first nlp lab']

# Removing Stopwords form the text

In [5]:
#import required modules
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer

In [6]:
#Extracting the English stopwords form the nltk corpus
en_stopwords = set(stopwords.words('english'))
en_stopwords

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [7]:
#Tokenizing the raw text to words using WordPunctTokenizer
tokenizer = WordPunctTokenizer()
words = tokenizer.tokenize(raw_text)
words

['Hi',
 '!',
 'How',
 'are',
 'you',
 '?',
 'This',
 'is',
 'my',
 'first',
 'nlp',
 'lab']

In [8]:
#Removing the stopwords from the 
[word for word in words if word not in en_stopwords]

['Hi', '!', 'How', '?', 'This', 'first', 'nlp', 'lab']

# Stemming & Lemmatization

In [9]:
#Import the required modules
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [10]:
#Creating the Stemmer and Lemmatizer
word_stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [11]:
# Printing the results of stemming 
print("writing =>", word_stemmer.stem('writing'))
print("believes =>", word_stemmer.stem('believes'))

writing => write
believes => believ


In [12]:
# Printing the results using lemmatization
print("writing =",lemmatizer.lemmatize(' writing '))
print("believes =>",lemmatizer.lemmatize(' believes '))

writing =  writing 
believes =>  believes 


# Tagging tokens with its parts of speech: using nltk.pos_tag

In [19]:
#importing modules
import nltk
from nltk import sent_tokenize, word_tokenize, pos_tag, ne_chunk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import gutenberg
from nltk.tree import Tree
from nltk.parse import RecursiveDescentParser
import string


sentences = gutenberg.raw('austen-emma.txt')[:999]
sent_tokens = sent_tokenize(sentences)

word_tokens = []
for sentence in sent_tokens :
    translator = str.maketrans('', '', string.punctuation)
    sentence = sentence.translate(translator)
    word_tokens.append(word_tokenize(sentence))

In [20]:
tags = []
for i, tokens in enumerate(word_tokens) :
    tags += [pos_tag(tokens)] 
    print(f"{i+1}. {tags[-1]}\n")#Named entities recogonization : using nltk.ne_chunk

1. [('Emma', 'NN'), ('by', 'IN'), ('Jane', 'NNP'), ('Austen', 'NNP'), ('1816', 'CD'), ('VOLUME', 'NNP'), ('I', 'PRP'), ('CHAPTER', 'VBP'), ('I', 'PRP'), ('Emma', 'NNP'), ('Woodhouse', 'NNP'), ('handsome', 'VBD'), ('clever', 'NN'), ('and', 'CC'), ('rich', 'JJ'), ('with', 'IN'), ('a', 'DT'), ('comfortable', 'JJ'), ('home', 'NN'), ('and', 'CC'), ('happy', 'JJ'), ('disposition', 'NN'), ('seemed', 'VBD'), ('to', 'TO'), ('unite', 'VB'), ('some', 'DT'), ('of', 'IN'), ('the', 'DT'), ('best', 'JJS'), ('blessings', 'NNS'), ('of', 'IN'), ('existence', 'NN'), ('and', 'CC'), ('had', 'VBD'), ('lived', 'VBN'), ('nearly', 'RB'), ('twentyone', 'CD'), ('years', 'NNS'), ('in', 'IN'), ('the', 'DT'), ('world', 'NN'), ('with', 'IN'), ('very', 'RB'), ('little', 'JJ'), ('to', 'TO'), ('distress', 'VB'), ('or', 'CC'), ('vex', 'VB'), ('her', 'PRP$')]

2. [('She', 'PRP'), ('was', 'VBD'), ('the', 'DT'), ('youngest', 'JJS'), ('of', 'IN'), ('the', 'DT'), ('two', 'CD'), ('daughters', 'NNS'), ('of', 'IN'), ('a', 'DT')

In [22]:
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker.zip.


True

In [25]:
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


True

In [26]:
ne = []
for i, tag in enumerate(tags) :
    ne += [ne_chunk(tag)] 
    for subtree in ne[-1] :
        if isinstance(subtree, Tree) :
            words = [word for word,tag in subtree.leaves()]
            label = subtree.label()
            print(f"ENTITY : {' '.join(words)}, LABEL : {label}")

ENTITY : Emma, LABEL : GPE
ENTITY : Jane Austen, LABEL : PERSON
ENTITY : Emma Woodhouse, LABEL : PERSON
ENTITY : Miss Taylor, LABEL : PERSON
ENTITY : Emma, LABEL : GPE
ENTITY : Miss Taylor, LABEL : PERSON


# Display a parse tree & Find the ambiguity of the sentence using parse tree.

In [27]:
import nltk
sentence = [
   ("a", "DT"),
   ("clever", "JJ"),
   ("fox","NN"),
   ("was","VBP"),
   ("jumping","VBP"),
   ("over","IN"),
   ("the","DT"),
   ("wall","NN")
]
grammar = "NP:{<DT>?<JJ>*<NN>}" 
Reg_parser = nltk.RegexpParser(grammar)
Reg_parser.parse(sentence)
Output = Reg_parser.parse(sentence)
Output.draw()

# Removal of repeating characters

In [28]:
import re
from nltk.corpus import wordnet

class Rep_word_removal(object):
    def __init__(self):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'
    def replace(self, word):
        if wordnet.synsets(word):
            return word
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.replace(repl_word)
        else:
            return repl_word

In [29]:
rep_word = Rep_word_removal()
rep_word.replace ("Hiiiiiiiiiiiiiiiiiiiii")

'Hi'

In [30]:
rep_word.replace("Hellooooooooooooooo")

'Hello'

# Using WordNet

In [31]:
#Importing the required module
from nltk.corpus import wordnet as wn

In [32]:
#Creating Synset instances
syn = wn.synsets('dog')[0]
syn.name()

'dog.n.01'

In [33]:
#define the word
syn.definition()

'a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds'

In [34]:
#Example sentance for the word
syn.examples()

['the dog barked all night']

In [35]:
#Getting Synonyms for words
syn.hypernyms()

[Synset('canine.n.02'), Synset('domestic_animal.n.01')]

In [36]:
#Syn of Syn
syn.hypernyms()[0].hyponyms()

[Synset('bitch.n.04'),
 Synset('dog.n.01'),
 Synset('fox.n.01'),
 Synset('hyena.n.01'),
 Synset('jackal.n.01'),
 Synset('wild_dog.n.01'),
 Synset('wolf.n.01')]

In [37]:
syn.hypernyms()[1].hyponyms()

[Synset('dog.n.01'),
 Synset('domestic_cat.n.01'),
 Synset('feeder.n.01'),
 Synset('head.n.02'),
 Synset('stocker.n.01'),
 Synset('stray.n.01')]

In [38]:
# Root word of the syn
syn.root_hypernyms()

[Synset('entity.n.01')]

In [39]:
#finding lemmas
lemmas = syn.lemmas()
len(lemmas)

3

In [40]:
print(lemmas[0].name())
print(lemmas[1].name())
print(lemmas[2].name())

dog
domestic_dog
Canis_familiaris


In [41]:
#Finding Antonym for a word
syn = wn.synset('bad.n.01')
antonym1 = syn.lemmas()[0].antonyms()[0]
antonym1.name()

'good'

In [42]:
#Define the antonym
antonym1.synset().definition()

'that which is pleasing or valuable or useful'

# The End