In [1]:
import nltk

In [2]:
simpleSentence = 'Seoul is the capital of Korea.'
wordsInSentence = nltk.word_tokenize(simpleSentence)
print(wordsInSentence)

['Seoul', 'is', 'the', 'capital', 'of', 'Korea', '.']


In [3]:
partsOfSpeechTags = nltk.pos_tag(wordsInSentence)
print(partsOfSpeechTags)

[('Seoul', 'NNP'), ('is', 'VBZ'), ('the', 'DT'), ('capital', 'NN'), ('of', 'IN'), ('Korea', 'NNP'), ('.', '.')]


In [4]:
import nltk
def learnDefaultTagger(simpleSentence):
    wordsInSentence = nltk.word_tokenize(simpleSentence)
    tagger = nltk.DefaultTagger("NN")
    posEnabledTags = tagger.tag(wordsInSentence)
    print(posEnabledTags)
    
def learnRETagger(simpleSentence):
    customPatterns = [
    (r'.*ing$', 'ADJECTIVE'), # running
    (r'.*ly$', 'ADVERB'), # willingly
    (r'.*ion$', 'NOUN'), # intimation
    (r'(.*ate|.*en|is)$', 'VERB'), # terminate, darken, lighten
    (r'^an$', 'INDEFINITE-ARTICLE'), # terminate
    (r'^(with|on|at)$', 'PREPOSITION'), # on
    (r'^\-?[0-9]+(\.[0-9]+)$', 'NUMBER'), # -1.0, 12345.123
    (r'.*$', None),
    ]
    tagger = nltk.RegexpTagger(customPatterns)
    wordsInSentence = nltk.word_tokenize(simpleSentence)
    posEnabledTags = tagger.tag(wordsInSentence)
    print(posEnabledTags)
    
def learnLookupTagger(simpleSentence):
    mapping = {
    '.': '.', 'place': 'NN', 'on': 'IN',
    'earth': 'NN', 'Reykjavik' : 'NNP', 'is': 'VBZ',
    'an': 'DT', 'amazing': 'JJ'
    }
    tagger = nltk.UnigramTagger(model=mapping)
    wordsInSentence = nltk.word_tokenize(simpleSentence)
    posEnabledTags = tagger.tag(wordsInSentence)
    print(posEnabledTags)

In [5]:
testSentence = 'Reykjavik is an amazing place on earth. I have visited Reykjavik 10 times'

In [6]:
learnDefaultTagger(testSentence)

[('Reykjavik', 'NN'), ('is', 'NN'), ('an', 'NN'), ('amazing', 'NN'), ('place', 'NN'), ('on', 'NN'), ('earth', 'NN'), ('.', 'NN'), ('I', 'NN'), ('have', 'NN'), ('visited', 'NN'), ('Reykjavik', 'NN'), ('10', 'NN'), ('times', 'NN')]


In [7]:
learnRETagger(testSentence)

[('Reykjavik', None), ('is', 'VERB'), ('an', 'INDEFINITE-ARTICLE'), ('amazing', 'ADJECTIVE'), ('place', None), ('on', 'PREPOSITION'), ('earth', None), ('.', None), ('I', None), ('have', None), ('visited', None), ('Reykjavik', None), ('10', None), ('times', None)]


In [8]:
learnLookupTagger(testSentence)

[('Reykjavik', 'NNP'), ('is', 'VBZ'), ('an', 'DT'), ('amazing', 'JJ'), ('place', 'NN'), ('on', 'IN'), ('earth', 'NN'), ('.', '.'), ('I', None), ('have', None), ('visited', None), ('Reykjavik', 'NNP'), ('10', None), ('times', None)]


# 피클로 저장하기  
 - 데이터의 클래스 형태를 그대로 보존한다

In [9]:
import nltk
import pickle
def sampleData():
    return [
    "Bangalore is the capital of Karnataka.",
    "Steve Jobs was the CEO of Apple.",
    "iPhone was Invented by Apple.",
    "Books can be purchased in Market.",
    ]

def buildDictionary():
    dictionary = {}
    for sent in sampleData():
        partsOfSpeechTags = nltk.pos_tag(nltk.word_tokenize(sent))
    for tag in partsOfSpeechTags:
        value = tag[0]
        pos = tag[1]
        dictionary[value] = pos
    return dictionary

def saveMyTagger(tagger, fileName):
    fileHandle = open(fileName, "wb")
    pickle.dump(tagger, fileHandle)
    fileHandle.close()

def saveMyTraining(fileName):
    tagger = nltk.UnigramTagger(model=buildDictionary())
    saveMyTagger(tagger, fileName)

def loadMyTagger(fileName):
    return pickle.load(open(fileName, "rb"))

sentence = 'Iphone is purchased by Steve Jobs in Bangalore Market'
fileName = "myTagger.pickle"
saveMyTraining(fileName)
myTagger = loadMyTagger(fileName)
print(myTagger.tag(nltk.word_tokenize(sentence)))

[('Iphone', None), ('is', None), ('purchased', 'VBN'), ('by', None), ('Steve', None), ('Jobs', None), ('in', 'IN'), ('Bangalore', None), ('Market', 'NNP')]


# 자체 문법 작성(CFG)

In [12]:
import nltk
import string
from nltk.parse.generate import generate

In [21]:
import nltk
import string
from nltk.parse.generate import generate
productions = [
"ROOT -> WORD",
"WORD -> ' '",
"WORD -> NUMBER LETTER",
"WORD -> LETTER NUMBER",
]
digits = list(string.digits)
for digit in digits[:4]:
    productions.append("NUMBER -> '{w}'".format(w=digit))
    
letters = "' | '".join(list(string.ascii_lowercase)[:4])
productions.append("LETTER -> '{w}'".format(w=letters))
grammarString = "\n".join(productions)
grammar = nltk.CFG.fromstring(grammarString)
print(grammar)

# generate : 생성, depth는 탐색하는 층 개수
for sentence in generate(grammar, n=20, depth=5):
    palindrome = "".join(sentence).replace(" ", "")
    print("생성된 단어: {}, 크기: {}".format(palindrome, len(palindrome)))

Grammar with 12 productions (start state = ROOT)
    ROOT -> WORD
    WORD -> ' '
    WORD -> NUMBER LETTER
    WORD -> LETTER NUMBER
    NUMBER -> '0'
    NUMBER -> '1'
    NUMBER -> '2'
    NUMBER -> '3'
    LETTER -> 'a'
    LETTER -> 'b'
    LETTER -> 'c'
    LETTER -> 'd'
생성된 단어: , 크기: 0
생성된 단어: 0a, 크기: 2
생성된 단어: 0b, 크기: 2
생성된 단어: 0c, 크기: 2
생성된 단어: 0d, 크기: 2
생성된 단어: 1a, 크기: 2
생성된 단어: 1b, 크기: 2
생성된 단어: 1c, 크기: 2
생성된 단어: 1d, 크기: 2
생성된 단어: 2a, 크기: 2
생성된 단어: 2b, 크기: 2
생성된 단어: 2c, 크기: 2
생성된 단어: 2d, 크기: 2
생성된 단어: 3a, 크기: 2
생성된 단어: 3b, 크기: 2
생성된 단어: 3c, 크기: 2
생성된 단어: 3d, 크기: 2
생성된 단어: a0, 크기: 2
생성된 단어: a1, 크기: 2
생성된 단어: a2, 크기: 2


In [22]:
string.digits

'0123456789'

In [13]:
string.ascii_lowercase

'abcdefghijklmnopqrstuvwxyz'

In [14]:
letters ="' | '".join(list(string.ascii_lowercase)[:4])
letters

"a' | 'b' | 'c' | 'd"

In [16]:
productions = [
"ROOT -> WORD",
"WORD -> ' '",
"WORD -> NUMBER LETTER",
"WORD -> LETTER NUMBER",
]

In [17]:
productions.append("LETTER -> '{w}'".format(w=letters))
productions

['ROOT -> WORD',
 "WORD -> ' '",
 'WORD -> NUMBER LETTER',
 'WORD -> LETTER NUMBER',
 "LETTER -> 'a' | 'b' | 'c' | 'd'"]

In [18]:
grammarString = "\n".join(productions)
grammarString

"ROOT -> WORD\nWORD -> ' '\nWORD -> NUMBER LETTER\nWORD -> LETTER NUMBER\nLETTER -> 'a' | 'b' | 'c' | 'd'"

In [19]:
grammar = nltk.CFG.fromstring(grammarString)
print(grammar)

Grammar with 8 productions (start state = ROOT)
    ROOT -> WORD
    WORD -> ' '
    WORD -> NUMBER LETTER
    WORD -> LETTER NUMBER
    LETTER -> 'a'
    LETTER -> 'b'
    LETTER -> 'c'
    LETTER -> 'd'


# 확률적 문맥 문법(CFG)  
 - 확률을 0으로 해도 나타난다.(generate함수의 작동원리가 다름)

In [25]:
import nltk
from nltk.parse.generate import generate
productions = [
"ROOT -> WORD [1.0]",
"WORD -> P1 [0.25]",
"WORD -> P1 P2 [0.25]",
"WORD -> P1 P2 P3 [0.25]",
"WORD -> P1 P2 P3 P4 [0.25]",
"P1 -> 'A' [1.0]",
"P2 -> 'B' [0.5]",
"P2 -> 'C' [0.5]",
"P3 -> 'D' [0.3]",
"P3 -> 'E' [0.3]",
"P3 -> 'F' [0.4]",
"P4 -> 'G' [0.9]",
"P4 -> 'H' [0.1]",
]

grammarString = "\n".join(productions)
grammar = nltk.PCFG.fromstring(grammarString)
print(grammar)

for sentence in generate(grammar, n=10, depth=5):
    palindrome = "".join(sentence).replace(" ", "")
    print("문자열 : {}, 크기 : {}".format(palindrome, len(palindrome)))

Grammar with 13 productions (start state = ROOT)
    ROOT -> WORD [1.0]
    WORD -> P1 [0.25]
    WORD -> P1 P2 [0.25]
    WORD -> P1 P2 P3 [0.25]
    WORD -> P1 P2 P3 P4 [0.25]
    P1 -> 'A' [1.0]
    P2 -> 'B' [0.5]
    P2 -> 'C' [0.5]
    P3 -> 'D' [0.3]
    P3 -> 'E' [0.3]
    P3 -> 'F' [0.4]
    P4 -> 'G' [0.9]
    P4 -> 'H' [0.1]
문자열 : A, 크기 : 1
문자열 : AB, 크기 : 2
문자열 : AC, 크기 : 2
문자열 : ABD, 크기 : 3
문자열 : ABE, 크기 : 3
문자열 : ABF, 크기 : 3
문자열 : ACD, 크기 : 3
문자열 : ACE, 크기 : 3
문자열 : ACF, 크기 : 3
문자열 : ABDG, 크기 : 4


# 재귀 CFG 작성

In [33]:
import nltk
import string
from nltk.parse.generate import generate
productions = [
"ROOT -> WORD",
"WORD -> ' '"
]
alphabets = list(string.digits)

for alphabet in alphabets[:1]:
    productions.append("WORD -> '{w}' WORD '{w}'".format(w=alphabet))

grammarString = "\n".join(productions)
grammar = nltk.CFG.fromstring(grammarString)
print(grammar)

for sentence in generate(grammar, n=10, depth=20):
    palindrome = "".join(sentence).replace(" ", "")
    print("Palindrome : {}, Size : {}".format(palindrome, len(palindrome)))

Grammar with 3 productions (start state = ROOT)
    ROOT -> WORD
    WORD -> ' '
    WORD -> '0' WORD '0'
Palindrome : , Size : 0
Palindrome : 00, Size : 2
Palindrome : 0000, Size : 4
Palindrome : 000000, Size : 6
Palindrome : 00000000, Size : 8
Palindrome : 0000000000, Size : 10
Palindrome : 000000000000, Size : 12
Palindrome : 00000000000000, Size : 14
Palindrome : 0000000000000000, Size : 16
Palindrome : 000000000000000000, Size : 18
