# NLTK experiments # 

In [66]:
import nltk
from nltk import word_tokenize, CFG
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt

In [67]:
# Fetch the NLTK data non-interactively. A bare nltk.download() opens the
# interactive downloader and stalls "Restart Kernel -> Run All" until someone
# picks packages by hand. "book" is the collection that nltk.book relies on.
# NOTE(review): confirm this collection also ships the tokenizer / tagger /
# WordNet data required by the NLTK version installed here.
nltk.download("book", quiet=True)
# NOTE(review): the wildcard import floods the namespace with the book's sample
# texts; it is kept only because later cells may depend on those names.
from nltk.book import *

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


In [68]:
# WordNet corpus reader, aliased `wn` for the lookup below.
from nltk.corpus import wordnet as wn
# Show every synset WordNet returns for "image"; the saved output lists both
# noun (".n.") and verb (".v.") entries, i.e. the word is ambiguous between
# the two parts of speech.
wn.synsets('image')

[Synset('image.n.01'),
 Synset('persona.n.02'),
 Synset('picture.n.01'),
 Synset('prototype.n.01'),
 Synset('trope.n.01'),
 Synset('double.n.03'),
 Synset('image.n.07'),
 Synset('image.n.08'),
 Synset('effigy.n.01'),
 Synset('image.v.01'),
 Synset('visualize.v.01')]

In [69]:
# Load the raw requirements text and echo it for a visual check.
# The encoding is pinned to UTF-8 instead of the platform default: the saved
# output shows the degree/minus symbols as multi-character garbage, which is
# what UTF-8 bytes look like when decoded with a Windows code page.
# The `with` block also closes the file handle, which the bare open() never did.
with open("requirements.txt", encoding="utf-8") as requirements_file:
    raw = requirements_file.read()
print(raw)

R1 The instrument shall image a target at 600 650 km according to IF-1.
R2 The instrument shall image a target with spectral radiance of ABC (*plot) according to IF-1.
R3 The instrument shall accept Command A according to IF-2.
R4 The instrument shall transmit image data according to IF-2 in less than 0.2 s after receiving Command A.
R5 The instrument shall have a resolution better than 1 unit.
R6 The instrument shall have a FOV greater than 2◦
R7 The instrument shall provide telemetry data every 1 s according to IF-2.
R8 The instrument shall accept power according to IF-3.
R9 The instrument shall consume less than 600 W of electrical power.
R10 The instrument shall withstand a mechanical load of 5 g in any direction on IF-4.
R11 The instrument shall fulfill its performance when subjected to a temperature between −10 ◦C and +45 ◦C at IF-4.
R12 The instrument shall have a lifetime of at least 7 years.
Note 1 R10 only applies during launch. All other requirements only apply once 

In [70]:
def ie_preprocess(filename):
    """Read a text file and return its lines tokenized and POS-tagged.

    Every line is lower-cased, split into tokens with ``word_tokenize`` and
    tagged with NLTK's default part-of-speech tagger.

    Args:
        filename: Path of the text file to read. It is decoded as UTF-8 so the
            result does not depend on the platform's default encoding.

    Returns:
        A list with one entry per line of the file, in file order. Lines are
        split on the newline character only, so a file ending with a newline
        yields a final empty entry. Each entry is a list of ``(token, tag)``
        tuples.
    """
    # Context manager: the handle is closed even if reading fails.
    with open(filename, encoding="utf-8") as source:
        raw = source.read()
    tokenized = [word_tokenize(line.lower()) for line in raw.split('\n')]
    # Tag all lines in one call instead of invoking nltk.pos_tag once per line.
    return nltk.pos_tag_sents(tokenized)

Minor corrections to tagging

In [71]:
# Manual (token, tag) overrides applied on top of the automatic POS tags.
#   * Requirement identifiers are forced to "LS" (list-item marker). The file
#     holds R1..R12, so r11 and r12 are listed too; previously the table
#     stopped at r10 and the last two identifiers kept whatever the tagger
#     guessed.
#   * "A" is not a tagger output; it is a custom tag consumed by the AP rule
#     of the chunk grammar further down.
# NOTE(review): the "image" -> "VB" override is unconditional, so the noun in
# "transmit image data" (R4) is also retagged as a verb - confirm whether the
# override should only apply right after a modal such as "shall".
corrections = [
    ("r1", "LS"),
    ("r2", "LS"),
    ("r3", "LS"),
    ("r4", "LS"),
    ("r5", "LS"),
    ("r6", "LS"),
    ("r7", "LS"),
    ("r8", "LS"),
    ("r9", "LS"),
    ("r10", "LS"),
    ("r11", "LS"),
    ("r12", "LS"),
    ("image", "VB"),
    ("according", "A")
]

# Token -> tag lookup; for a duplicated token the last entry wins, which is
# also what the previous scan over the whole list ended up doing.
tag_overrides = dict(corrections)

# Rebuild every tagged line, swapping in the override tag where one exists.
requirements = [
    [(token, tag_overrides.get(token, tag)) for token, tag in tagged_line]
    for tagged_line in ie_preprocess("requirements.txt")
]
requirements

[[('r1', 'LS'),
  ('the', 'DT'),
  ('instrument', 'NN'),
  ('shall', 'MD'),
  ('image', 'VB'),
  ('a', 'DT'),
  ('target', 'NN'),
  ('at', 'IN'),
  ('600', 'CD'),
  ('650', 'CD'),
  ('km', 'NN'),
  ('according', 'A'),
  ('to', 'TO'),
  ('if-1', 'NN'),
  ('.', '.')],
 [('r2', 'LS'),
  ('the', 'DT'),
  ('instrument', 'NN'),
  ('shall', 'MD'),
  ('image', 'VB'),
  ('a', 'DT'),
  ('target', 'NN'),
  ('with', 'IN'),
  ('spectral', 'JJ'),
  ('radiance', 'NN'),
  ('of', 'IN'),
  ('abc', 'NN'),
  ('(', '('),
  ('*', 'NNP'),
  ('plot', 'NN'),
  (')', ')'),
  ('according', 'A'),
  ('to', 'TO'),
  ('if-1', 'NN'),
  ('.', '.')],
 [('r3', 'LS'),
  ('the', 'DT'),
  ('instrument', 'NN'),
  ('shall', 'MD'),
  ('accept', 'VB'),
  ('command', 'NN'),
  ('a', 'DT'),
  ('according', 'A'),
  ('to', 'TO'),
  ('if-2', 'NN'),
  ('.', '.')],
 [('r4', 'LS'),
  ('the', 'DT'),
  ('instrument', 'NN'),
  ('shall', 'MD'),
  ('transmit', 'VB'),
  ('image', 'VB'),
  ('data', 'NNS'),
  ('according', 'A'),
  ('to', 'TO')

Chunking demo

In [80]:
# Chunk grammar handed to nltk.RegexpParser. Each "LABEL:" line starts a
# stage; "{...}" is a chunking pattern and "}...{" a chinking pattern over
# the tag sequence. "A" is the custom tag assigned to "according" by the
# corrections cell.
# NOTE(review): the S stage is listed first, before any NP chunk exists, and
# the printed tree shows no S node other than the root and has the VB tokens
# wrapped as VP - it looks like the S rule never matches. Confirm, and
# consider moving it after the other stages (using <VP>) if a sentence-level
# chunk is wanted.
# NOTE(review): this cell carries execution count 80 while the cells below
# are 73-75, so their saved outputs probably come from an older `result`.
grammar = r"""
S:  {<NP><MD><VB><NP><PP>?}
MD: {<MD>}
NP: {<LS>?<DT>?<NN>} 
    }<VP><AP><PP>{
VP: {<VB>}
AP: {<A><.*>+}
PP: {<IN><.*>+}
    }<VBG|AP>+{
"""

# Fourth line of the requirements file (the "r4" sentence in the tree printed
# below). Despite the name, the items are already (token, tag) pairs.
tokenized_requirements = requirements[3]
cp = nltk.RegexpParser(grammar)
# `result` is the chunk tree; the cells that follow read it.
result = cp.parse(tokenized_requirements)
print(result)

(S
  (NP r4/LS the/DT instrument/NN)
  (MD shall/MD)
  (VP transmit/VB)
  (VP image/VB)
  data/NNS
  (AP
    according/A
    to/TO
    (NP if-2/NN)
    in/IN
    less/JJR
    than/IN
    0.2/CD
    (NP s/NN)
    after/IN
    receiving/VBG
    (NP command/NN)
    a/DT
    ./.))


Parsing the chunked sentence

In [73]:
# First element of the first child of the chunk tree - the leading
# (token, tag) pair of the parsed sentence.
# NOTE(review): the saved output shows 'r2', while the chunking cell above
# parses requirements[3] (the r4 sentence); the output looks stale - re-run
# the notebook top to bottom to confirm.
result[0][0]

('r2', 'LS')

In [74]:
# Gather the tokens tagged 'NN' inside the fourth child of the chunk tree and
# keep the last one; `prep` stays None when that child holds no such token.
nn_tokens = [item[0] for item in result[3] if item[1] == 'NN']
prep = nn_tokens[-1] if nn_tokens else None
prep

'target'

In [75]:
# Naive subject / verb / object extraction from the chunk tree:
#   subject = first token tagged 'NN'
#   verb    = first token tagged 'VB'
#   obj     = second token tagged 'NN'
# The tree is walked through result.leaves(), which yields every (token, tag)
# pair in sentence order regardless of nesting. Iterating "children of
# children" instead breaks on the tree printed above: a top-level leaf such
# as data/NNS gets iterated character by character, and a nested one-element
# subtree such as (NP if-2/NN) raises IndexError on c[1].
# `prep` is reset here as before; it is not filled in by this cell.
subject = verb = obj = prep = None
for leaf in result.leaves():
    print(leaf)
    token, tag = leaf
    if tag == 'NN':
        # The first noun is taken as the subject, the next one as the object.
        if subject is None:
            subject = token
        elif obj is None:
            obj = token
    elif tag == 'VB' and verb is None:
        verb = token

print("=========")
print(subject)
print(verb)
print(obj)

('r2', 'LS')
('the', 'DT')
('instrument', 'NN')
('shall', 'MD')
('image', 'VB')
('a', 'DT')
('target', 'NN')
('with', 'IN')
('spectral', 'JJ')
('radiance', 'NN')
('of', 'IN')
('abc', 'NN')
('(', '(')
('*', 'NNP')
('plot', 'NN')
(')', ')')
('according', 'A')
('to', 'TO')
('if-1', 'NN')
('.', '.')
instrument
image
target
