# The NLP Grimoire

## Chapter 1 - An Introduction to NLP

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](#)

In [1]:
!pip install polyglot PyICU pycld2 morfessor
!pip install stanza

Collecting polyglot
  Downloading polyglot-16.7.4.tar.gz (126 kB)
[K     |████████████████████████████████| 126 kB 18.0 MB/s 
[?25hCollecting PyICU
  Downloading PyICU-2.8.tar.gz (299 kB)
[K     |████████████████████████████████| 299 kB 43.3 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting pycld2
  Downloading pycld2-0.41.tar.gz (41.4 MB)
[K     |████████████████████████████████| 41.4 MB 1.3 MB/s 
[?25hCollecting morfessor
  Downloading Morfessor-2.0.6-py3-none-any.whl (35 kB)
Building wheels for collected packages: polyglot, PyICU, pycld2
  Building wheel for polyglot (setup.py) ... [?25l[?25hdone
  Created wheel for polyglot: filename=polyglot-16.7.4-py2.py3-none-any.whl size=52577 sha256=58a6afe12a1c4609227fb41c04c0eb763af69b0d15d3034e396fb872d786960c
  Stored in directory: /root/.cache/pip/wheels/09/bc/67/75c9de8e9726460bc0b101ad225ad025cb8c

In [7]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# Lexical Analysis

## Tokenization

In [3]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

text = "This sentence is an example of tokenization. Another example sentence!"
print("Word Tokenization\t", word_tokenize(text))
print("Sentence Tokenization\t", sent_tokenize(text))

Word Tokenization	 ['This', 'sentence', 'is', 'an', 'example', 'of', 'tokenization', '.', 'Another', 'example', 'sentence', '!']
Sentence Tokenization	 ['This sentence is an example of tokenization.', 'Another example sentence!']


## Morphological Analysis

In [4]:
# Example from https://polyglot.readthedocs.io/en/latest/MorphologicalAnalysis.html
!polyglot download morph2.en morph2.ar

from polyglot.downloader import downloader
from polyglot.text import Text, Word

words = ["preprocessing", "processor", "invaluable", "thankful", "crossed"]
for w in words:
  w = Word(w, language="en")
  print("{:<20}{}".format(w, w.morphemes))

[polyglot_data] Downloading package morph2.en to
[polyglot_data]     /root/polyglot_data...
[polyglot_data] Downloading package morph2.ar to
[polyglot_data]     /root/polyglot_data...
preprocessing       ['pre', 'process', 'ing']
processor           ['process', 'or']
invaluable          ['in', 'valuable']
thankful            ['thank', 'ful']
crossed             ['cross', 'ed']


### Stemming

In [5]:
import nltk; nltk.download('punkt')
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
text = "Elephants are truly magnificent creatures"
for word in text.split(" "):
  print(f"{word}\t -> \t{stemmer.stem(word)}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Elephants	 -> 	eleph
are	 -> 	are
truly	 -> 	truli
magnificent	 -> 	magnific
creatures	 -> 	creatur


### Lemmatization

In [8]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
text = "Elephants are truly magnificent creatures"
for word in text.split(" "):
  print(f"{word}\t -> \t{lemmatizer.lemmatize(word)}")

Elephants	 -> 	Elephants
are	 -> 	are
truly	 -> 	truly
magnificent	 -> 	magnificent
creatures	 -> 	creature


# Syntactic Analysis

## Syntactic Parsing

In [9]:
# Example taken from https://www.nltk.org/book/ch08.html

import nltk
groucho_grammar = nltk.CFG.fromstring("""
  S -> NP VP
  PP -> P NP
  NP -> Det N | Det N PP | 'I'
  VP -> V NP | VP PP
  Det -> 'an' | 'my'
  N -> 'elephant' | 'pajamas'
  V -> 'shot'
  P -> 'in'
""")

sentence = ['I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas']
parser = nltk.ChartParser(groucho_grammar)

for tree in parser.parse(sentence):
  print(tree)

(S
  (NP I)
  (VP
    (VP (V shot) (NP (Det an) (N elephant)))
    (PP (P in) (NP (Det my) (N pajamas)))))
(S
  (NP I)
  (VP
    (V shot)
    (NP (Det an) (N elephant) (PP (P in) (NP (Det my) (N pajamas))))))


## Parts of Speech Tagging

In [10]:
import nltk
from nltk.tokenize import word_tokenize
text = word_tokenize("I completed my assignment while he was watching football")
nltk.pos_tag(text)

[('I', 'PRP'),
 ('completed', 'VBD'),
 ('my', 'PRP$'),
 ('assignment', 'NN'),
 ('while', 'IN'),
 ('he', 'PRP'),
 ('was', 'VBD'),
 ('watching', 'VBG'),
 ('football', 'NN')]

# Semantic Analysis

## Word Sense Disambiguation

In [11]:
from nltk.wsd import lesk
from nltk.corpus import wordnet as wn

sentence = "Can you book the flight for me?"
print(lesk(sentence.split(" "), 'book', 'v'))
print("\nAll senses of word 'bank'\n")
for ss in wn.synsets('bank'):
   print(ss, ss.definition())

Synset('reserve.v.04')

All senses of word 'bank'

Synset('bank.n.01') sloping land (especially the slope beside a body of water)
Synset('depository_financial_institution.n.01') a financial institution that accepts deposits and channels the money into lending activities
Synset('bank.n.03') a long ridge or pile
Synset('bank.n.04') an arrangement of similar objects in a row or in tiers
Synset('bank.n.05') a supply or stock held in reserve for future use (especially in emergencies)
Synset('bank.n.06') the funds held by a gambling house or the dealer in some gambling games
Synset('bank.n.07') a slope in the turn of a road or track; the outside is higher than the inside in order to reduce the effects of centrifugal force
Synset('savings_bank.n.02') a container (usually with a slot in the top) for keeping money at home
Synset('bank.n.09') a building in which the business of banking transacted
Synset('bank.n.10') a flight maneuver; aircraft tips laterally about its longitudinal axis (especial

# Pragmatic Analysis

## Co-Reference Resolution

In [13]:
# Example taken from https://github.com/stanfordnlp/stanza/blob/main/demo/Stanza_CoreNLP_Interface.ipynb

import stanza
from stanza.server import CoreNLPClient

corenlp_dir = './corenlp'
stanza.install_corenlp(dir=corenlp_dir)

import os
os.environ["CORENLP_HOME"] = corenlp_dir

with CoreNLPClient(annotators=['coref'], 
                   memory='4G', endpoint='http://localhost:9001', be_quiet=True) as client:
    text = "Barack Obama was born in Hawaii. He was elected president in 2008."
    document = client.annotate(text)

    print("{:30s}\t{}".format("Mention", "Type"))
    for sent in document.sentence:
        for m in sent.mentions:
            print("{:30s}\t{}".format(m.entityMentionText, m.entityType))

2022-01-12 20:50:05 INFO: Installing CoreNLP package into ./corenlp...


Downloading https://huggingface.co/stanfordnlp/CoreNLP/resolve/main/stanford-corenlp-latest.zip:   0%|        …

2022-01-12 20:50:14 INFO: Writing properties to tmp file: corenlp_server-44578798d544460e.props
2022-01-12 20:50:14 INFO: Starting server with command: java -Xmx4G -cp ./corenlp/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9001 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-44578798d544460e.props -annotators coref -preload -outputFormat serialized


Mention                       	Type
Barack Obama                  	PERSON
Hawaii                        	STATE_OR_PROVINCE
president                     	TITLE
2008                          	DATE
He                            	PERSON
