# 청킹, 문장 구문 분석, 의존성  
 - 청킹은 텍스트에서 짧은 구를 추출하는 과정
 - 덩이짓기(청킹) 은 정보를 의미있는 묶음으로 만드는 것

In [7]:
import nltk
nltk.download('maxent_ne_chunker')
nltk.download('words')
text = "Namsan Botanical Garden is a well known botanical gardenin Seoul, Korea."
sentences = nltk.sent_tokenize(text)
for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    print(words)
    print('-'*40)
    tags = nltk.pos_tag(words)
    print(tags)
    print('-'*40)
    chunks = nltk.ne_chunk(tags)
    print(chunks)

['Namsan', 'Botanical', 'Garden', 'is', 'a', 'well', 'known', 'botanical', 'gardenin', 'Seoul', ',', 'Korea', '.']
----------------------------------------
[('Namsan', 'NNP'), ('Botanical', 'NNP'), ('Garden', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('well', 'RB'), ('known', 'VBN'), ('botanical', 'JJ'), ('gardenin', 'NN'), ('Seoul', 'NNP'), (',', ','), ('Korea', 'NNP'), ('.', '.')]
----------------------------------------
(S
  (PERSON Namsan/NNP)
  (PERSON Botanical/NNP Garden/NNP)
  is/VBZ
  a/DT
  well/RB
  known/VBN
  botanical/JJ
  gardenin/NN
  (GPE Seoul/NNP)
  ,/,
  (GPE Korea/NNP)
  ./.)


[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


# grammar를 이용하여 청크 구현하기

In [8]:
import nltk
nltk.download('punkt')
text = "Ravi is the CEO of a Company. He is very powerful public speaker also."
grammar = '\n'.join([
 'NP: {<DT>*<NNP>}', #DT : 한정사 , NNP : 고유명사 => DT가 0번 이상 출현
 'NP: {<JJ>*<NN>}', #JJ : 형용사 , NN : 명사 => JJ가 0번 이상 출현
 'NP: {<NNP>+}',
])
sentences = nltk.sent_tokenize(text)
for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    tags = nltk.pos_tag(words)
    chunkparser = nltk.RegexpParser(grammar)
    result = chunkparser.parse(tags)
    print(result)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...


(S
  (NP Ravi/NNP)
  is/VBZ
  (NP the/DT CEO/NNP)
  of/IN
  (NP a/DT Company/NNP)
  ./.)
(S
  He/PRP
  is/VBZ
  very/RB
  (NP powerful/JJ public/JJ speaker/NN)
  also/RB
  ./.)


[nltk_data]   Package punkt is already up-to-date!


# Training 청커

In [12]:
import nltk
nltk.download('treebank')
nltk.download('conll2000')
from nltk.corpus import conll2000
from nltk.corpus import treebank_chunk
def mySimpleChunker() :
    grammar = 'NP: {<NNP>+}'
    return nltk.RegexpParser(grammar)
def test_nothing(data) :
    cp = nltk.RegexpParser("")
    print(cp.evaluate(data))

def test_mysimplechunker(data) :
    schunker = mySimpleChunker()
    print(schunker.evaluate(data))

datasets = [
    conll2000.chunked_sents('test.txt', chunk_types=['NP']),
    treebank_chunk.chunked_sents()
]
for dataset in datasets :
    test_nothing(dataset[:50])
    test_mysimplechunker(dataset[:50])

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package conll2000 to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!


ChunkParse score:
    IOB Accuracy:  38.6%%
    Precision:      0.0%%
    Recall:         0.0%%
    F-Measure:      0.0%%
ChunkParse score:
    IOB Accuracy:  48.2%%
    Precision:     71.1%%
    Recall:        17.2%%
    F-Measure:     27.7%%
ChunkParse score:
    IOB Accuracy:  45.0%%
    Precision:      0.0%%
    Recall:         0.0%%
    F-Measure:      0.0%%
ChunkParse score:
    IOB Accuracy:  50.7%%
    Precision:     51.9%%
    Recall:         8.8%%
    F-Measure:     15.1%%


# 회귀  
 - 재귀 하향 파서는 왼쪽에서 오른쪽으로 입력을 읽고 파생트리를 하향식으로 작성하고 전위순호(pre-order)방
노드를 통과시켜는 파서
 - 컴파일러를 작성하는데 사용



In [33]:
import nltk
def RDParserExample(grammar, textlist):
    parser = nltk.parse.RecursiveDescentParser(grammar)
    for text in textlist:
        sentence = nltk.word_tokenize(text)
    for tree in parser.parse(sentence):
        print(tree)
        tree.draw()
grammar = nltk.CFG.fromstring("""
S -> NP VP
NP -> NNP VBZ
VP -> IN NNP | DT NN IN NNP
NNP -> 'Tajmahal' | 'Agra' | 'Bangalore' | 'Karnataka'
VBZ -> 'is'
IN -> 'in' | 'of'
DT -> 'the'
NN -> 'capital'
""")
text = [
 "Tajmahal is in Agra",
 "Bangalore is the capital of Karnataka",
]
RDParserExample(grammar, text)

(S
  (NP (NNP Bangalore) (VBZ is))
  (VP (DT the) (NN capital) (IN of) (NNP Karnataka)))


# 시프트 변환구문  
 - 입력 텍스트에서 첫번째 토큰을 읽고 스택에 넣음
 - 스택의 전체 구문 분석 트리를 읽고 생성규칙을 오른쪽에서 왼쪽으로 읽음으로써 적용 가능한 생성 규칙을
이 과정은 생성규칙이 바닥 날때까지 반복되고, 구문분석이 실패했다는 점을 인정
 - 이 과정은 입력이 소모될 때까지 반복되며, 구문 분석이 성공했다고 말함


In [34]:
import nltk
def SRParserExample(grammar, textlist):
    parser = nltk.parse.ShiftReduceParser(grammar)
    for text in textlist:
        sentence = nltk.word_tokenize(text)
        print(sentence)
        for tree in parser.parse(sentence):
            print(tree)
            tree.draw()
text = [
    "Bangalore is the capital of Karnataka",
    "Tajmahal is in Agra"
]
grammar = nltk.CFG.fromstring("""
S -> NP VP
NP -> NNP VBZ
VP -> IN NNP | DT NN IN NNP
NNP -> 'Tajmahal' | 'Agra' | 'Bangalore' | 'Karnataka'
VBZ -> 'is'
IN -> 'in' | 'of'
DT -> 'the'
NN -> 'capital'
""")

SRParserExample(grammar, text)

['Bangalore', 'is', 'the', 'capital', 'of', 'Karnataka']
['Tajmahal', 'is', 'in', 'Agra']
(S (NP (NNP Tajmahal) (VBZ is)) (VP (IN in) (NNP Agra)))


# 의존 문법과 투사 의존성 구문 분석

In [35]:
import nltk
grammar = nltk.grammar.DependencyGrammar.fromstring("""
'savings' -> 'small'
'yield' -> 'savings'
'gains' -> 'large'
'yield' -> 'gains'
""")
sentence = 'small savings yield large gains'
dp = nltk.parse.ProjectiveDependencyParser(grammar)
for t in sorted(dp.parse(sentence.split())):
    print(t)
    t.draw()

(yield (savings small) (gains large))


In [41]:
for t in dp.parse(sentence.split()):
    print(t)
    t.draw()

(yield (savings small) (gains large))


In [39]:
sentence.split()

['small', 'savings', 'yield', 'large', 'gains']

In [38]:
sorted(sentence.split())

['gains', 'large', 'savings', 'small', 'yield']

# 차트 구문 분석

In [44]:
from nltk.grammar import CFG
from nltk.parse.chart import ChartParser, BU_LC_STRATEGY
grammar = CFG.fromstring("""
S -> T1 T4
T1 -> NNP VBZ
T2 -> DT NN
T3 -> IN NNP
T4 -> T3 | T2 T3
NNP -> 'Tajmahal' | 'Agra' | 'Bangalore' | 'Karnataka'
VBZ -> 'is'
IN -> 'in' | 'of'
DT -> 'the'
NN -> 'capital'
""")
cp = ChartParser(grammar, BU_LC_STRATEGY, trace=True)
sentence = "Bangalore is the capital of Karnataka"
tokens = sentence.split()
chart = cp.chart_parse(tokens)
parses = list(chart.parses(grammar.start()))
print("Total Edges :", len(chart.edges()))
for tree in parses: print(tree)
tree.draw()


|.Bangal.  is  . the  .capita.  of  .Karnat.|
|[------]      .      .      .      .      .| [0:1] 'Bangalore'
|.      [------]      .      .      .      .| [1:2] 'is'
|.      .      [------]      .      .      .| [2:3] 'the'
|.      .      .      [------]      .      .| [3:4] 'capital'
|.      .      .      .      [------]      .| [4:5] 'of'
|.      .      .      .      .      [------]| [5:6] 'Karnataka'
|[------]      .      .      .      .      .| [0:1] NNP -> 'Bangalore' *
|[------>      .      .      .      .      .| [0:1] T1 -> NNP * VBZ
|.      [------]      .      .      .      .| [1:2] VBZ -> 'is' *
|[-------------]      .      .      .      .| [0:2] T1 -> NNP VBZ *
|[------------->      .      .      .      .| [0:2] S  -> T1 * T4
|.      .      [------]      .      .      .| [2:3] DT -> 'the' *
|.      .      [------>      .      .      .| [2:3] T2 -> DT * NN
|.      .      .      [------]      .      .| [3:4] NN -> 'capital' *
|.      .      [-------------]      .      .| [2: