# NER 내장 개체명 인식 기능 사용

In [1]:
import nltk

def sampleNE():
    sent = nltk.corpus.treebank.tagged_sents()[0] #객체 타입 별로 
    print(nltk.ne_chunk(sent))

def sampleNE2():
    sent = nltk.corpus.treebank.tagged_sents()[0] 
    print(nltk.ne_chunk(sent, binary = True)) # binary = True는 sampleNE()와 다르게 타입을 자세히 알려주진 않고 
                                                #객체이다 아니다만 판별
    
if __name__ == '__main__':
    sampleNE()
    sampleNE2()

(S
  (PERSON Pierre/NNP)
  (ORGANIZATION Vinken/NNP)
  ,/,
  61/CD
  years/NNS
  old/JJ
  ,/,
  will/MD
  join/VB
  the/DT
  board/NN
  as/IN
  a/DT
  nonexecutive/JJ
  director/NN
  Nov./NNP
  29/CD
  ./.)
(S
  (NE Pierre/NNP Vinken/NNP)
  ,/,
  61/CD
  years/NNS
  old/JJ
  ,/,
  will/MD
  join/VB
  the/DT
  board/NN
  as/IN
  a/DT
  nonexecutive/JJ
  director/NN
  Nov./NNP
  29/CD
  ./.)


# 클래스 사용 딕셔너리 생성, 반전 사용

In [7]:
import nltk
nltk.download('averaged_perceptron_tagger')
class LearningDictionary():
    def __init__(self, sentence):
        self.words = nltk.word_tokenize(sentence)
        self.tagged = nltk.pos_tag(self.words)
        self.buildDictionary()
        self.buildReverseDictionary()
        
    def buildDictionary(self): #dictionary[단어] = 품사
        self.dictionary = {}
        for (word, pos) in self.tagged:
            self.dictionary[word] = pos
            
    def buildReverseDictionary(self): #dictionary[품사] = 단어목록
        self.rdictionary = {}
        for key in self.dictionary.keys():
            value = self.dictionary[key]
            if value not in self.rdictionary:
                print('-')
                print([key])
                self.rdictionary[value] = [key]
            else:
                self.rdictionary[value].append(key)
    def isWordPresent(self, word): #dictionary에 단어 포함 여부
        return 'Yes' if word in self.dictionary else 'No'
    
    def getPOSForWord(self, word): #단어의 품사 찾기
        return self.dictionary[word] if word in self.dictionary else None
    
    def getWordsForPOS(self, pos): #품사에 해당하는 단어 목록 찾기
        return self.rdictionary[pos] if pos in self.rdictionary else None
    
sentence = "All the flights got delayed due to bad weather"
learning = LearningDictionary(sentence)
words = ["chair", "flights", "delayed", "pencil", "weather"]
pos = ["NN", "VBS", "NNS"]
for word in words: #단어로 품사찾기
    status = learning.isWordPresent(word)
    print("Is '{}' present in dictionary ? : '{}'".format(word, status))
    if status is 'Yes':
        print("\tPOS For '{}' is '{}'".format(word, learning.getPOSForWord(word)))
for pword in pos: #품사로 단어찾기
    print("POS '{}' has '{}' words".format(pword, learning.getWordsForPOS(pword)))

-
['All']
-
['the']
-
['flights']
-
['got']
-
['delayed']
-
['due']
-
['to']
-
['weather']
Is 'chair' present in dictionary ? : 'No'
Is 'flights' present in dictionary ? : 'Yes'
	POS For 'flights' is 'NNS'
Is 'delayed' present in dictionary ? : 'Yes'
	POS For 'delayed' is 'VBN'
Is 'pencil' present in dictionary ? : 'No'
Is 'weather' present in dictionary ? : 'Yes'
	POS For 'weather' is 'NN'
POS 'NN' has '['weather']' words
POS 'VBS' has 'None' words
POS 'NNS' has '['flights']' words


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\15Z970-GA5BK\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# 분류기 사용 문장 분할

In [12]:
import nltk
nltk.download('punkt')
def featureExtractor(words, i): #단어, 다음단어첫글자대문자 여부
    return ({'current-word': words[i], 'next-is-upper': words[i+1][0].isupper()}, words[i+1][0].isupper())
def getFeaturesets(sentence):
    words = nltk.word_tokenize(sentence)
    featuresets = [featureExtractor(words, i) for i in range(1, len(words) - 1) if words[i] == '.'] #구
    print(featuresets)
    return featuresets
def segmentTextAndPrintSentences(data):
    words = nltk.word_tokenize(data)
    for i in range(0, len(words) - 1):
        if words[i] == '.':
            if classifier.classify(featureExtractor(words, i)[0]) == True: #단어, 다음단어첫글자대문자
                print(".")
        else:
            print(words[i], end = ' ')

    else:
        print("{} ".format(words[i]), end= ' ')

print(words[-1])
# https://en.wikipedia.org/wiki/India에서 텍스트 복사
traindata = "India, officially the Republic of India (Bhārat Gaṇarājya),[e] is a country in South Asia. it is the seventh-largest country by area, the second-most populous country (with over 1.2 billion people), and the most populous democracy in the world. It is bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the southeast. It shares land borders with Pakistan to the west;[f] China, Nepal, and Bhutan to the northeast; and Myanmar (Burma) and Bangladesh to the east. In the Indian Ocean, India is in the vicinity of Sri Lanka and the Maldives. India's Andaman and Nicobar Islands share a maritime border with Thailand and Indonesia."
testdata = "The Indian subcontinent was home to the urban Indus Valley Civilisation of the 3rd millennium BCE. In the following millennium, the oldest scriptures associated with Hinduism began to be composed. Social stratification, based on caste, emerged in the first millennium BCE, and Buddhism and Jainism arose. Early political consolidations took place under the Maurya and Gupta empires; the later peninsular Middle Kingdoms influenced cultures as far as southeast Asia. In the medieval era, Judaism, Zoroastrianism, Christianity, and Islam arrived, and Sikhism emerged, all adding to the region's diverse culture. Much of the north fell to the Delhi sultanate; the south was united under the Vijayanagara Empire. The economy expanded in the 17th century in the Mughal Empire. In the mid-18th century, the subcontinent came under British East India Company rule, and in the mid-19th under British crown rule. A nationalist movement emerged in the late 19th century, which later, under Mahatma Gandhi, was noted for nonviolent resistance and led to India's independence in 1947."
traindataset = getFeaturesets(traindata)
classifier = nltk.NaiveBayesClassifier.train(traindataset)
segmentTextAndPrintSentences(testdata)

weather
[({'current-word': '.', 'next-is-upper': False}, False), ({'current-word': '.', 'next-is-upper': True}, True), ({'current-word': '.', 'next-is-upper': True}, True), ({'current-word': '.', 'next-is-upper': True}, True), ({'current-word': '.', 'next-is-upper': True}, True)]
The Indian subcontinent was home to the urban Indus Valley Civilisation of the 3rd millennium BCE .
In the following millennium , the oldest scriptures associated with Hinduism began to be composed .
Social stratification , based on caste , emerged in the first millennium BCE , and Buddhism and Jainism arose .
Early political consolidations took place under the Maurya and Gupta empires ; the later peninsular Middle Kingdoms influenced cultures as far as southeast Asia .
In the medieval era , Judaism , Zoroastrianism , Christianity , and Islam arrived , and Sikhism emerged , all adding to the region 's diverse culture .
Much of the north fell to the Delhi sultanate ; the south was united under the Vijayanagara 

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\15Z970-GA5BK\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
