# LineTokenizer : 개행문자(/n)를 기준으로 토큰화. 마침표나 문장 단위가 아님
# SpaceTokenizer : 공백을 기준으로 토큰화
# TweetTokenizer : 트위터에 특화한 토큰화. 이모티콘도 하나의 토큰으로 인지
# word_tokenize : 단어 기준 토큰화

In [7]:
import nltk
nltk.download('punkt')
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize

lTokenizer = LineTokenizer();


print("Line tokenizer 출력 : ", lTokenizer.tokenize("My name is Maximus Decimus Meridiu\n 브라브라ㅡ비라ㅡ"))
rawText = "By 11 o'clock on Sunday, the doctor shall open the dispensary :)"
sTokenizer = SpaceTokenizer()
print("Space Tokenizer 출력 :", sTokenizer.tokenize(rawText))
print("Word Tokenizer 출력 : ", word_tokenize(rawText))
tTokenizer = TweetTokenizer()
print("Tweet Tokenizer 출력 : ", tTokenizer.tokenize("This is a cooool #dummysmiley: :="))

Line tokenizer 출력 :  ['My name is Maximus Decimus Meridiu', ' 브라브라ㅡ비라ㅡ']
Space Tokenizer 출력 : ['By', '11', "o'clock", 'on', 'Sunday,', 'the', 'doctor', 'shall', 'open', 'the', 'dispensary', ':)']
Word Tokenizer 출력 :  ['By', '11', "o'clock", 'on', 'Sunday', ',', 'the', 'doctor', 'shall', 'open', 'the', 'dispensary', ':', ')']
Tweet Tokenizer 출력 :  ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':=']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\15Z970-GA5BK\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# 어간추출(Stemming)
# 단어의 어근을 중심으로 규칙에 기반하여 어미를 제거 혹은 변화하여 표준화
# PorterStemmer 에 비교하여 LancasterStemmer의 예문에서 더 많은 제거가 일어났음을 확인할 수 있음 (name -> #na
# 규칙에 기반하여 표준화 하여 새로운 단어도 처리가 가능함 => 규칙에만 일치하면 처리가 가능

In [12]:
from nltk import PorterStemmer, LancasterStemmer, word_tokenize
raw = "My name is Maximus Decimus Meridius, commander of the Armies of the North, Gene"
tokens = word_tokenize(raw)
porter = PorterStemmer()
pStems = [porter.stem(t) for t in tokens]
print(pStems)
lancaster = LancasterStemmer()
lStems = [lancaster.stem(t) for t in tokens]
print(lStems)

['My', 'name', 'is', 'maximu', 'decimu', 'meridiu', ',', 'command', 'of', 'the', 'armi', 'of', 'the', 'north', ',', 'gene']
['my', 'nam', 'is', 'maxim', 'decim', 'meridi', ',', 'command', 'of', 'the', 'army', 'of', 'the', 'nor', ',', 'gen']


In [13]:
import nltk
nltk.download('wordnet')
from nltk import word_tokenize, PorterStemmer, WordNetLemmatizer
raw = "My name is Maximus Decimus Meridius, commander of the armies of the north, Gene"
tokens = word_tokenize(raw)
porter = PorterStemmer()
stems = [porter.stem(t) for t in tokens]
print(stems)
lemmatizer = WordNetLemmatizer()
lemmas = [lemmatizer.lemmatize(t) for t in tokens]
print(lemmas)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\15Z970-GA5BK\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['My', 'name', 'is', 'maximu', 'decimu', 'meridiu', ',', 'command', 'of', 'the', 'armi', 'of', 'the', 'north', ',', 'gene']
['My', 'name', 'is', 'Maximus', 'Decimus', 'Meridius', ',', 'commander', 'of', 'the', 'army', 'of', 'the', 'north', ',', 'Gene']


In [14]:
import nltk
nltk.download('gutenberg')
nltk.download('stopwords')
from nltk.corpus import gutenberg
print(gutenberg.fileids())
gb_words = gutenberg.words('bible-kjv.txt')
words_filtered = [e.lower() for e in gb_words if len(e) >= 3]

stopwords = nltk.corpus.stopwords.words('english')
words = [w for w in words_filtered if w.lower() not in stopwords]
fdist = nltk.FreqDist(words)
fdist2 = nltk.FreqDist(gb_words)
print('Following are the most common 10 words in the bag')
print(fdist2.most_common(10))
print('Following are the most common 10 words in the bag minus the stopwords')
print(fdist.most_common(10))
fdist.plot()

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\15Z970-GA5BK\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\gutenberg.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\15Z970-GA5BK\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']
Following are the most common 10 words in the bag
[(',', 70509), ('the', 62103), (':', 43766), ('and', 38847), ('of', 34480), ('.', 26160), ('to', 13396), ('And', 12846), ('that', 12576), ('in', 12331)]
Following are the most common 10 words in the bag minus the stopwords
[('shall', 9838), ('unto', 8997), ('lord', 7964), ('thou', 5474), ('thy', 4600), ('god', 4472), ('said', 3999), ('thee', 3827), ('upon', 2748), ('man', 2735)]


<Figure size 640x480 with 1 Axes>

In [15]:
from nltk.metrics.distance import edit_distance
def my_edit_distance(str1, str2):
    m= len(str1) + 1
    n= len(str2) + 1

    table = {}
    for i in range(m): table[i,0]=i
    for j in range(n): table[0,j]=j
    for i in range(1, m):
        for j in range(1, n):
            cost = 0 if str1[i - 1] == str2[j - 1] else 1
            table[i,j] = min(table[i, j-1]+1, table[i-1, j]+1, table[i-1, j-1]+cost)
    return table[i,j]
print("Our Algorithm :",my_edit_distance("hand", "and"))
print("NLTK Algorithm :",edit_distance("hand", "and"))

Our Algorithm : 1
NLTK Algorithm : 1


In [16]:
story1 = """In a far away kingdom, there was a river. This river was home to many gold
One day, a homeless bird saw the river. "The water in this river seems so cool and soo
As soon as the bird settled down near the river, the golden swans noticed her. They ca
"I am homeless, brothers. I too will pay the rent. Please give me shelter," the bird p
"I will teach them a lesson!" decided the humiliated bird.
She went to the King and said, "O King! The swans in your river are impolite and unkin
The King was angry with the arrogant swans for having insulted the homeless bird. He o
"Do you think the royal treasury depends upon your golden feathers? You can not decide
The swans shivered with fear on hearing the King. They flew away never to return. The"""
story2 = """Long time ago, there lived a King. He was lazy and liked all the comforts
One day, the King went into the forest to hunt. After having wandered for quite somet
But as soon as he held his bow up, the swan disappeared. And the King heard a voice,
Surprised, the King said, “Please show me the way to heaven." “Do good deeds, serve yo
The selfish King, eager to capture the Swan, tried doing some good deeds in his Kingdo
The King then disguised himself and went out into the street. There he tried helping a
Suddenly, the King heard the golden swan’s voice, “Do good deeds and you will come to
He realized that his people needed him and carrying out his duties was the only way to
"""
story1 = story1.replace(",","").replace("\n","").replace('.','').replace('"','').replace("!","").replace("?", "").casefold()

story2 = story2.replace(",","").replace("\n","").replace('.','').replace('"','').replace("!","").replace("?", "").casefold()

story1_words = story1.split(" ")
print("첫 번째 이야기 단어 :", story1_words)
story2_words = story2.split(" ")
print("두 번째 이야기 단어 :", story2_words)
story1_vocab = set(story1_words)
print("첫 번째 이야기 어휘 :", story1_vocab)
story2_vocab = set(story2_words)
print("두 번째 이야기 어휘 :", story2_vocab)
common_vocab = story1_vocab & story2_vocab
print("공통 어휘 :", common_vocab)

첫 번째 이야기 단어 : ['in', 'a', 'far', 'away', 'kingdom', 'there', 'was', 'a', 'river', 'this', 'river', 'was', 'home', 'to', 'many', 'goldone', 'day', 'a', 'homeless', 'bird', 'saw', 'the', 'river', 'the', 'water', 'in', 'this', 'river', 'seems', 'so', 'cool', 'and', 'sooas', 'soon', 'as', 'the', 'bird', 'settled', 'down', 'near', 'the', 'river', 'the', 'golden', 'swans', 'noticed', 'her', 'they', 'cai', 'am', 'homeless', 'brothers', 'i', 'too', 'will', 'pay', 'the', 'rent', 'please', 'give', 'me', 'shelter', 'the', 'bird', 'pi', 'will', 'teach', 'them', 'a', 'lesson', 'decided', 'the', 'humiliated', 'birdshe', 'went', 'to', 'the', 'king', 'and', 'said', 'o', 'king', 'the', 'swans', 'in', 'your', 'river', 'are', 'impolite', 'and', 'unkinthe', 'king', 'was', 'angry', 'with', 'the', 'arrogant', 'swans', 'for', 'having', 'insulted', 'the', 'homeless', 'bird', 'he', 'odo', 'you', 'think', 'the', 'royal', 'treasury', 'depends', 'upon', 'your', 'golden', 'feathers', 'you', 'can', 'not', 'decideth