In [2]:
import pandas as pandas
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [3]:
text = "A barber is a person. a barber is good person. a barber is huge person. he Knew A Secret! The Secret He Kept is huge secret. Huge secret. His barber kept his word. a barber kept his word. His barber kept his secret. But keeping and keeping such a huge secret to himself was driving the barber crazy. the barber went up a huge mountain."

In [4]:
# sentence tokenization
text = sent_tokenize(text)
print(text)

['A barber is a person.', 'a barber is good person.', 'a barber is huge person.', 'he Knew A Secret!', 'The Secret He Kept is huge secret.', 'Huge secret.', 'His barber kept his word.', 'a barber kept his word.', 'His barber kept his secret.', 'But keeping and keeping such a huge secret to himself was driving the barber crazy.', 'the barber went up a huge mountain.']


# dictionary 

In [5]:
# word tokenization
vocab = {}
sentences = []
stop_words = set(stopwords.words('english'))
for i in text : 
    sentence = word_tokenize(i)
    result = []

    for word in sentence : 
        word = word.lower()
        if word not in stop_words:
            if len(word) > 2 :
                result.append(word)
                if word not in vocab: #counter init
                    vocab[word] = 0 
                vocab[word] += 1
    sentences.append(result)
sentences

[['barber', 'person'],
 ['barber', 'good', 'person'],
 ['barber', 'huge', 'person'],
 ['knew', 'secret'],
 ['secret', 'kept', 'huge', 'secret'],
 ['huge', 'secret'],
 ['barber', 'kept', 'word'],
 ['barber', 'kept', 'word'],
 ['barber', 'kept', 'secret'],
 ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'],
 ['barber', 'went', 'huge', 'mountain']]

In [6]:
vocab

{'barber': 8,
 'person': 3,
 'good': 1,
 'huge': 5,
 'knew': 1,
 'secret': 6,
 'kept': 4,
 'word': 2,
 'keeping': 2,
 'driving': 1,
 'crazy': 1,
 'went': 1,
 'mountain': 1}

In [7]:
vocab_sorted = sorted(vocab.items(), key= lambda x:x[1], reverse = True)
vocab_sorted

[('barber', 8),
 ('secret', 6),
 ('huge', 5),
 ('kept', 4),
 ('person', 3),
 ('word', 2),
 ('keeping', 2),
 ('good', 1),
 ('knew', 1),
 ('driving', 1),
 ('crazy', 1),
 ('went', 1),
 ('mountain', 1)]

In [8]:
#word indexing according to frequency
word_to_index = {}
i = 0 
for (word, frequency) in vocab_sorted:
    if frequency > 1 : 
        i += 1 
        word_to_index[word] = i
word_to_index

{'barber': 1,
 'secret': 2,
 'huge': 3,
 'kept': 4,
 'person': 5,
 'word': 6,
 'keeping': 7}

In [11]:
vocab_size = 5 
words_frequency = [word for word, count in word_to_index.items() if count >= vocab_size + 1]
print(words_frequency)
for w in words_frequency :
    del word_to_index[w]
word_to_index

['word', 'keeping']


{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5}

In [12]:
word_to_index['OOV'] = len(word_to_index)+1

In [13]:
word_to_index

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5, 'OOV': 6}

In [14]:
encoded = []
for sentence in sentences:
    tmp = []
    for word in sentence :
        try : 
            tmp.append(word_to_index[word])
        except KeyError:
            tmp.append(word_to_index['OOV'])
    encoded.append(tmp)
encoded

[[1, 5],
 [1, 6, 5],
 [1, 3, 5],
 [6, 2],
 [2, 4, 3, 2],
 [3, 2],
 [1, 4, 6],
 [1, 4, 6],
 [1, 4, 2],
 [6, 6, 3, 2, 6, 1, 6],
 [1, 6, 3, 6]]

# Counter


In [16]:
from collections import Counter
sentences

[['barber', 'person'],
 ['barber', 'good', 'person'],
 ['barber', 'huge', 'person'],
 ['knew', 'secret'],
 ['secret', 'kept', 'huge', 'secret'],
 ['huge', 'secret'],
 ['barber', 'kept', 'word'],
 ['barber', 'kept', 'word'],
 ['barber', 'kept', 'secret'],
 ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'],
 ['barber', 'went', 'huge', 'mountain']]

In [18]:
words = sum(sentences,[])
words

['barber',
 'person',
 'barber',
 'good',
 'person',
 'barber',
 'huge',
 'person',
 'knew',
 'secret',
 'secret',
 'kept',
 'huge',
 'secret',
 'huge',
 'secret',
 'barber',
 'kept',
 'word',
 'barber',
 'kept',
 'word',
 'barber',
 'kept',
 'secret',
 'keeping',
 'keeping',
 'huge',
 'secret',
 'driving',
 'barber',
 'crazy',
 'barber',
 'went',
 'huge',
 'mountain']

In [20]:
import numpy as np
words = np.hstack(sentences)
words

array(['barber', 'person', 'barber', 'good', 'person', 'barber', 'huge',
       'person', 'knew', 'secret', 'secret', 'kept', 'huge', 'secret',
       'huge', 'secret', 'barber', 'kept', 'word', 'barber', 'kept',
       'word', 'barber', 'kept', 'secret', 'keeping', 'keeping', 'huge',
       'secret', 'driving', 'barber', 'crazy', 'barber', 'went', 'huge',
       'mountain'], dtype='<U8')

In [22]:
vocab = Counter(words)
vocab

Counter({'barber': 8,
         'person': 3,
         'good': 1,
         'huge': 5,
         'knew': 1,
         'secret': 6,
         'kept': 4,
         'word': 2,
         'keeping': 2,
         'driving': 1,
         'crazy': 1,
         'went': 1,
         'mountain': 1})

In [23]:
vocab['barber']

8

In [24]:
vocab_size = 5
vocab = vocab.most_common(vocab_size) # most frequent top 5
vocab

[('barber', 8), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3)]

In [25]:
type(vocab)

list

# NLTK FreqDist

In [26]:
from nltk import FreqDist
import numpy as np

vocab = FreqDist(np.hstack(sentences))
vocab

FreqDist({'barber': 8, 'secret': 6, 'huge': 5, 'kept': 4, 'person': 3, 'word': 2, 'keeping': 2, 'good': 1, 'knew': 1, 'driving': 1, ...})

In [27]:
vocab_size = 5
vocab = vocab.most_common(vocab_size) # 등장 빈도수가 높은 상위 5개의 단어만 저장
vocab

[('barber', 8), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3)]

In [28]:
word_to_index = {word[0] : index + 1 for index, word in enumerate(vocab)}
print(word_to_index)

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5}


In [33]:
testdata = ['a','b','c']
for index, value in enumerate(testdata):
    print("value : {}, index : {}".format(value,index))

value : a, index : 0
value : b, index : 1
value : c, index : 2


In [34]:
from tensorflow.keras.preprocessing.text import Tokenizer


In [35]:
sentences=[['barber', 'person'], ['barber', 'good', 'person'], ['barber', 'huge', 'person'], ['knew', 'secret'], ['secret', 'kept', 'huge', 'secret'], ['huge', 'secret'], ['barber', 'kept', 'word'], ['barber', 'kept', 'word'], ['barber', 'kept', 'secret'], ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'], ['barber', 'went', 'huge', 'mountain']]

In [36]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
tokenizer

<keras_preprocessing.text.Tokenizer at 0x1558c1f90>

In [37]:
tokenizer.word_index

{'barber': 1,
 'secret': 2,
 'huge': 3,
 'kept': 4,
 'person': 5,
 'word': 6,
 'keeping': 7,
 'good': 8,
 'knew': 9,
 'driving': 10,
 'crazy': 11,
 'went': 12,
 'mountain': 13}

In [38]:
tokenizer.word_counts

OrderedDict([('barber', 8),
             ('person', 3),
             ('good', 1),
             ('huge', 5),
             ('knew', 1),
             ('secret', 6),
             ('kept', 4),
             ('word', 2),
             ('keeping', 2),
             ('driving', 1),
             ('crazy', 1),
             ('went', 1),
             ('mountain', 1)])

In [39]:
tokenizer.word_docs

defaultdict(int,
            {'person': 3,
             'barber': 8,
             'good': 1,
             'huge': 5,
             'secret': 5,
             'knew': 1,
             'kept': 4,
             'word': 2,
             'keeping': 1,
             'crazy': 1,
             'driving': 1,
             'mountain': 1,
             'went': 1})

In [40]:
tokenizer.texts_to_sequences(sentences)

[[1, 5],
 [1, 8, 5],
 [1, 3, 5],
 [9, 2],
 [2, 4, 3, 2],
 [3, 2],
 [1, 4, 6],
 [1, 4, 6],
 [1, 4, 2],
 [7, 7, 3, 2, 10, 1, 11],
 [1, 12, 3, 13]]

In [41]:
vocab_size = 5
tokenizer = Tokenizer(num_words = vocab_size + 1) # top 5 (#1~#5)  (#0~#5 if num_words = 5 )
tokenizer.fit_on_texts(sentences)

In [42]:
tokenizer.word_index

{'barber': 1,
 'secret': 2,
 'huge': 3,
 'kept': 4,
 'person': 5,
 'word': 6,
 'keeping': 7,
 'good': 8,
 'knew': 9,
 'driving': 10,
 'crazy': 11,
 'went': 12,
 'mountain': 13}

In [43]:
tokenizer.word_counts

OrderedDict([('barber', 8),
             ('person', 3),
             ('good', 1),
             ('huge', 5),
             ('knew', 1),
             ('secret', 6),
             ('kept', 4),
             ('word', 2),
             ('keeping', 2),
             ('driving', 1),
             ('crazy', 1),
             ('went', 1),
             ('mountain', 1)])

In [44]:
tokenizer.texts_to_sequences(sentences)

[[1, 5],
 [1, 5],
 [1, 3, 5],
 [2],
 [2, 4, 3, 2],
 [3, 2],
 [1, 4],
 [1, 4],
 [1, 4, 2],
 [3, 2, 1],
 [1, 3]]

In [45]:
vocab_size = 5
words_frequency = [word for word,count in tokenizer.word_index.items() if count >= vocab_size + 1] # 인덱스가 5 초과인 단어 제거
for w in words_frequency:
    del tokenizer.word_index[w] # 해당 단어에 대한 인덱스 정보를 삭제
    del tokenizer.word_counts[w] # 해당 단어에 대한 카운트 정보를 삭제
print(tokenizer.word_index)
print(tokenizer.word_counts)
print(tokenizer.texts_to_sequences(sentences))

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5}
OrderedDict([('barber', 8), ('person', 3), ('huge', 5), ('secret', 6), ('kept', 4)])
[[1, 5], [1, 5], [1, 3, 5], [2], [2, 4, 3, 2], [3, 2], [1, 4], [1, 4], [1, 4, 2], [3, 2, 1], [1, 3]]


In [47]:
vocab_size = 5
tokenizer = Tokenizer(num_words = vocab_size + 2, oov_token = 'OOV')
# 빈도수 상위 5개 단어만 사용 (#2~#7), default OOV = #1
tokenizer.fit_on_texts(sentences)

In [48]:
print('index of word OOV : {}'.format(tokenizer.word_index['OOV']))


index of word OOV : 1


In [49]:
print(tokenizer.texts_to_sequences(sentences))

[[2, 6], [2, 1, 6], [2, 4, 6], [1, 3], [3, 5, 4, 3], [4, 3], [2, 5, 1], [2, 5, 1], [2, 5, 3], [1, 1, 4, 3, 1, 2, 1], [2, 1, 4, 1]]


# Padding

In [52]:
max_len = max(len(item) for item in encoded)

In [64]:
for item in encoded:
    while len(item) < max_len :
        item.append(0) #zero padding
padded_np  = np.array(encoded)
padded_np

array([[2, 6, 0, 0, 0, 0, 0],
       [2, 1, 6, 0, 0, 0, 0],
       [2, 4, 6, 0, 0, 0, 0],
       [1, 3, 0, 0, 0, 0, 0],
       [3, 5, 4, 3, 0, 0, 0],
       [4, 3, 0, 0, 0, 0, 0],
       [2, 5, 1, 0, 0, 0, 0],
       [2, 5, 1, 0, 0, 0, 0],
       [2, 5, 3, 0, 0, 0, 0],
       [1, 1, 4, 3, 1, 2, 1],
       [2, 1, 4, 1, 0, 0, 0]])

In [71]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
tokenizer.word_index

{'barber': 1,
 'secret': 2,
 'huge': 3,
 'kept': 4,
 'person': 5,
 'word': 6,
 'keeping': 7,
 'good': 8,
 'knew': 9,
 'driving': 10,
 'crazy': 11,
 'went': 12,
 'mountain': 13}

In [54]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [55]:
encoded = tokenizer.texts_to_sequences(sentences)
encoded

[[2, 6],
 [2, 1, 6],
 [2, 4, 6],
 [1, 3],
 [3, 5, 4, 3],
 [4, 3],
 [2, 5, 1],
 [2, 5, 1],
 [2, 5, 3],
 [1, 1, 4, 3, 1, 2, 1],
 [2, 1, 4, 1]]

In [62]:
padded = pad_sequences(encoded)

In [63]:
(padded == padded_np).all()

False

In [65]:
padded = pad_sequences(encoded, padding='post')
(padded == padded_np).all()

True

In [66]:
padded = pad_sequences(encoded, padding = 'post', maxlen = 5) # front letters cut
padded 

array([[0, 0, 0, 0, 0],
       [6, 0, 0, 0, 0],
       [6, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [4, 3, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [3, 0, 0, 0, 0],
       [4, 3, 1, 2, 1],
       [4, 1, 0, 0, 0]], dtype=int32)

In [73]:
last_value = len(tokenizer.word_index)+1
last_value

14

In [74]:
encoded = tokenizer.texts_to_sequences(sentences)

padded = pad_sequences(encoded, padding = 'post', value = last_value)
padded

array([[ 1,  5, 14, 14, 14, 14, 14],
       [ 1,  8,  5, 14, 14, 14, 14],
       [ 1,  3,  5, 14, 14, 14, 14],
       [ 9,  2, 14, 14, 14, 14, 14],
       [ 2,  4,  3,  2, 14, 14, 14],
       [ 3,  2, 14, 14, 14, 14, 14],
       [ 1,  4,  6, 14, 14, 14, 14],
       [ 1,  4,  6, 14, 14, 14, 14],
       [ 1,  4,  2, 14, 14, 14, 14],
       [ 7,  7,  3,  2, 10,  1, 11],
       [ 1, 12,  3, 13, 14, 14, 14]], dtype=int32)

In [72]:
len(tokenizer.word_index)

13

# one-hot encoding

In [81]:
from konlpy.tag import Okt
okt = Okt()
token = okt.morphs("나는 자연어 처리를 배운다")
token

['나', '는', '자연어', '처리', '를', '배운다']

In [82]:
word2index={}
for vocab in token:
    if vocab not in word2index.keys():
        word2index[vocab] = len(word2index)
word2index

{'나': 0, '는': 1, '자연어': 2, '처리': 3, '를': 4, '배운다': 5}

In [83]:
def one_hot_encoding(word, word2index):
    one_hot_vector = [0]*(len(word2index))
    index = word2index[word]
    one_hot_vector[index] = 1 
    return one_hot_vector

In [84]:
one_hot_encoding("자연어",word2index)

[0, 0, 1, 0, 0, 0]

In [87]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
text = "나랑 짜장면 먹을래 짬뽕 먹을래"

t = Tokenizer()
t.fit_on_texts([text])
t.word_index

{'먹을래': 1, '나랑': 2, '짜장면': 3, '짬뽕': 4}

In [88]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
text = "나랑 짜장면 먹을래 짬뽕 먹을래"

t = Tokenizer()
t.fit_on_texts(text)
t.word_index

{'먹': 1,
 '을': 2,
 '래': 3,
 '나': 4,
 '랑': 5,
 '짜': 6,
 '장': 7,
 '면': 8,
 '짬': 9,
 '뽕': 10}

In [98]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
text = "나랑 짜장면 먹을래 짬뽕 먹을래"

t = Tokenizer()
t.fit_on_sequences([text])
t.word_index

{}

In [99]:
t = Tokenizer()
t.fit_on_texts([text])
t.word_index

{'먹을래': 1, '나랑': 2, '짜장면': 3, '짬뽕': 4}

In [103]:
sub_text = "짬뽕 먹을래"
encoded = t.texts_to_sequences([sub_text])
encoded

[[4, 1]]

In [104]:
encoded[0]

[4, 1]

In [106]:
one_hot = to_categorical(encoded[0])
one_hot


#array([[0., 0., 0., 0., 1.],   # index 4 onehot 
#       [0., 1., 0., 0., 0.]], dtype=float32) # index 2 onehot 

array([[0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.]], dtype=float32)

# Splitting data

In [108]:
X,y = zip(['a',1],['b',2],['c',3])
X, y

(('a', 'b', 'c'), (1, 2, 3))

In [109]:
sequences = [['a',1],['b',2],['c',3]]

In [110]:
X,y = zip(*sequences)
X,y

(('a', 'b', 'c'), (1, 2, 3))

In [111]:
X[0]

'a'

In [112]:
import pandas as pd
values = [['안녕 이 편지는~',1],['꼭 읽어주세요',0],['(광고) 웹 발신',1],['사랑이가',0]]
columns = ['메일본문','스팸유무']

df = pd.DataFrame(values, columns = columns)
df

Unnamed: 0,메일본문,스팸유무
0,안녕 이 편지는~,1
1,꼭 읽어주세요,0
2,(광고) 웹 발신,1
3,사랑이가,0


In [113]:
X=df['메일본문']
y=df['스팸유무']


In [114]:
y

0    1
1    0
2    1
3    0
Name: 스팸유무, dtype: int64

In [115]:
type(y)

pandas.core.series.Series

In [117]:
ar = np.arange(0,16).reshape((4,4))
ar

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [119]:
X=ar[:,:3]
X

array([[ 0,  1,  2],
       [ 4,  5,  6],
       [ 8,  9, 10],
       [12, 13, 14]])

In [120]:
y=ar[:,3]
y

array([ 3,  7, 11, 15])

In [121]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1234)

In [122]:
X,y = np.arange(10).reshape((2,5)),range(5)

In [123]:
X

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9]])

In [124]:
y

range(0, 5)

In [125]:
type(y), list(y)

(range, [0, 1, 2, 3, 4])

In [133]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1234)


In [134]:
X_train

array([[2, 3],
       [4, 5],
       [6, 7]])

In [135]:
X_test

array([[8, 9],
       [0, 1]])

In [136]:
y_train

[1, 2, 3]

In [137]:
y_test

[4, 0]

In [138]:
X, y = np.arange(0,24).reshape((12,2)), range(12)

In [139]:
X

array([[ 0,  1],
       [ 2,  3],
       [ 4,  5],
       [ 6,  7],
       [ 8,  9],
       [10, 11],
       [12, 13],
       [14, 15],
       [16, 17],
       [18, 19],
       [20, 21],
       [22, 23]])

In [140]:
y

range(0, 12)

In [141]:
list(y)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

In [143]:
n_of_train = int(len(X) * 0.8)
n_of_test = int(len(X)- n_of_train)
n_of_train,n_of_test

(9, 3)

In [144]:
X_test = X[n_of_train:]
y_test = y[n_of_train:]
X_train = X[:n_of_train]
y_train = y[:n_of_train]

In [145]:
X_test

array([[18, 19],
       [20, 21],
       [22, 23]])

In [146]:
list(y_test)

[9, 10, 11]

In [147]:
!pip install git+https://github.com/haven-jeon/PyKoSpacing.git


Collecting git+https://github.com/haven-jeon/PyKoSpacing.git
  Cloning https://github.com/haven-jeon/PyKoSpacing.git to /private/var/folders/vz/251qtffn4433kmccnjk90_7h0000gn/T/pip-req-build-njkracry
  Running command git clone -q https://github.com/haven-jeon/PyKoSpacing.git /private/var/folders/vz/251qtffn4433kmccnjk90_7h0000gn/T/pip-req-build-njkracry
Collecting tensorflow==2.4.0
  Downloading tensorflow-2.4.0-cp37-cp37m-macosx_10_11_x86_64.whl (175.4 MB)
[K     |████████████████████████████████| 175.4 MB 46 kB/s 
Collecting argparse>=1.4.0
  Downloading argparse-1.4.0-py2.py3-none-any.whl (23 kB)
Collecting wheel~=0.35
  Using cached wheel-0.36.2-py2.py3-none-any.whl (35 kB)
Collecting opt-einsum~=3.3.0
  Using cached opt_einsum-3.3.0-py3-none-any.whl (65 kB)
Collecting absl-py~=0.10
  Downloading absl_py-0.12.0-py3-none-any.whl (129 kB)
[K     |████████████████████████████████| 129 kB 25.8 MB/s 
[?25hCollecting six~=1.15.0
  Using cached six-1.15.0-py2.py3-none-any.whl (10 kB)


In [148]:
sent = '김철수는 극중 두 인격의 사나이 이광수 역을 맡았다. 철수는 한국 유일의 태권도 전승자를 가리는 결전의 날을 앞두고 10년간 함께 훈련한 사형인 유연재(김광수 분)를 찾으러 속세로 내려온 인물이다.'
new_sent = sent.replace(" ",'')
new_sent

'김철수는극중두인격의사나이이광수역을맡았다.철수는한국유일의태권도전승자를가리는결전의날을앞두고10년간함께훈련한사형인유연재(김광수분)를찾으러속세로내려온인물이다.'

In [156]:
from pykospacing import spacing
kospacing_sent = spacing(new_sent)

sent == kospacing_sent

ModuleNotFoundError: No module named 'pykospacing'

In [None]:
!pip install git+https://github.com/ssut/py-hanspell.git


In [152]:
from hanspell import spell_checker

sent = "맞춤법 틀리면 외 않되? 쓰고싶은대로쓰면돼지 "
spelled_sent = spell_checker.check(sent)

hanspell_sent = spelled_sent.checked
hanspell_sent

'맞춤법 틀리면 왜 안돼? 쓰고 싶은 대로 쓰면 되지'

In [153]:
spelled_sent

Checked(result=True, original='맞춤법 틀리면 외 않되? 쓰고싶은대로쓰면돼지 ', checked='맞춤법 틀리면 왜 안돼? 쓰고 싶은 대로 쓰면 되지', errors=2, words=OrderedDict([('맞춤법', 0), ('틀리면', 0), ('왜', 1), ('안돼?', 1), ('쓰고', 1), ('싶은', 1), ('대로', 1), ('쓰면', 1), ('되지', 1)]), time=0.22724509239196777)

In [None]:
!pip install soynlp

In [157]:
from konlpy.tag import Okt
tokenizer = Okt()
tokenizer.morphs('빅스 최애돌 기부요정 등극')

['빅스', '최애', '돌', '기부', '요정', '등', '극']

In [158]:
import urllib.request
from soynlp import DoublespaceLineCorpus
from soynlp.word import WordExtractor

In [159]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/lovit/soynlp/master/tutorials/2016-10-20.txt", filename="2016-10-20.txt")

('2016-10-20.txt', <http.client.HTTPMessage at 0x17bb4ae10>)

In [160]:
corpus = DoublespaceLineCorpus("2016-10-20.txt")
len(corpus)


30091

In [162]:
i = 0 
for document in corpus : 
    if len(document) > 0:
        print(document)
        i += 1
    if i == 3 :
        break

19  1990  52 1 22
오패산터널 총격전 용의자 검거 서울 연합뉴스 경찰 관계자들이 19일 오후 서울 강북구 오패산 터널 인근에서 사제 총기를 발사해 경찰을 살해한 용의자 성모씨를 검거하고 있다 성씨는 검거 당시 서바이벌 게임에서 쓰는 방탄조끼에 헬멧까지 착용한 상태였다 독자제공 영상 캡처 연합뉴스  서울 연합뉴스 김은경 기자 사제 총기로 경찰을 살해한 범인 성모 46 씨는 주도면밀했다  경찰에 따르면 성씨는 19일 오후 강북경찰서 인근 부동산 업소 밖에서 부동산업자 이모 67 씨가 나오기를 기다렸다 이씨와는 평소에도 말다툼을 자주 한 것으로 알려졌다  이씨가 나와 걷기 시작하자 성씨는 따라가면서 미리 준비해온 사제 총기를 이씨에게 발사했다 총알이 빗나가면서 이씨는 도망갔다 그 빗나간 총알은 지나가던 행인 71 씨의 배를 스쳤다  성씨는 강북서 인근 치킨집까지 이씨 뒤를 쫓으며 실랑이하다 쓰러뜨린 후 총기와 함께 가져온 망치로 이씨 머리를 때렸다  이 과정에서 오후 6시 20분께 강북구 번동 길 위에서 사람들이 싸우고 있다 총소리가 났다 는 등의 신고가 여러건 들어왔다  5분 후에 성씨의 전자발찌가 훼손됐다는 신고가 보호관찰소 시스템을 통해 들어왔다 성범죄자로 전자발찌를 차고 있던 성씨는 부엌칼로 직접 자신의 발찌를 끊었다  용의자 소지 사제총기 2정 서울 연합뉴스 임헌정 기자 서울 시내에서 폭행 용의자가 현장 조사를 벌이던 경찰관에게 사제총기를 발사해 경찰관이 숨졌다 19일 오후 6시28분 강북구 번동에서 둔기로 맞았다 는 폭행 피해 신고가 접수돼 현장에서 조사하던 강북경찰서 번동파출소 소속 김모 54 경위가 폭행 용의자 성모 45 씨가 쏜 사제총기에 맞고 쓰러진 뒤 병원에 옮겨졌으나 숨졌다 사진은 용의자가 소지한 사제총기  신고를 받고 번동파출소에서 김창호 54 경위 등 경찰들이 오후 6시 29분께 현장으로 출동했다 성씨는 그사이 부동산 앞에 놓아뒀던 가방을 챙겨 오패산 쪽으로 도망간 후였다  김 경위는 오패산 터널 입구 오른쪽의 급경사에서 성씨에

In [163]:
word_extractor = WordExtractor()
word_extractor.train(corpus)
word_score_table = word_extractor.extract()

training was done. used memory 1.372 Gb
all cohesion probabilities was computed. # words = 223348
all branching entropies was computed # words = 361598
all accessor variety was computed # words = 361598


In [164]:
#cohesion probability
word_score_table["반포한"].cohesion_forward

0.08838002913645132

In [165]:
word_score_table["반포한강"].cohesion_forward

0.19841268168224552

In [166]:
word_score_table["반포한강공"].cohesion_forward

0.2972877884078849

In [169]:
word_score_table["반포한강공원"].cohesion_forward

0.37891487632839754

In [182]:
word_score_table["반포한강공원으로"].cohesion_forward

0.343533249143631

In [183]:
#branching entropy
word_score_table["디스"].right_branching_entropy

1.6371694761537934

In [184]:
word_score_table["디스플"].right_branching_entropy

-0.0

In [186]:
word_score_table["디"].right_branching_entropy

2.68517802819071

In [187]:
word_score_table["디스플레이"].right_branching_entropy

3.1400392861792916

In [189]:
from soynlp.tokenizer import LTokenizer

scores = {word:score.cohesion_forward for word, score in word_score_table.items()}

In [193]:
scores['덕']

0

In [194]:
l_tokenizer = LTokenizer(scores = scores)
l_tokenizer.tokenize("국제 사회의 등불이 되어 사람을 살리자",flatten = False)

[('국제', ''), ('사회의', ''), ('등불이', ''), ('되어', ''), ('사람', '을'), ('살리자', '')]

In [195]:
l_tokenizer.tokenize("국제 사회의 등불이 되어 사람을 살려내자",flatten = False)

[('국제', ''), ('사회의', ''), ('등불이', ''), ('되어', ''), ('사람', '을'), ('살려내자', '')]

In [197]:
l_tokenizer.tokenize("국제 사회의 등불이 되어 사람을 살려내자",flatten = True)

['국제', '사회의', '등불이', '되어', '사람', '을', '살려내자']

In [198]:
from soynlp.tokenizer import MaxScoreTokenizer

maxscore_tokenizer = MaxScoreTokenizer(scores = scores)
maxscore_tokenizer.tokenize("국제사회의등불이되어사람을살려내자")

['국제사회의', '등불이', '되어', '사람', '을살려내자']

In [200]:
from soynlp.normalizer import *
emoticon_normalize('앜ㅋㅋㅋㅋㅋ존재뮤ㅠㅠㅠㅠ',num_repeats=2)

'아ㅋㅋ재뮤ㅠㅠ'

In [207]:
repeat_normalize('와하하ㅏ아아아아대박이네대박이야대박이야000대박대박대대대와하하하하하핫',num_repeats=2)

'와하하ㅏ아아대박이네대박이야대박이야000대박대박대대대와하하핫'

In [211]:
!pip install customized_konlpy


Collecting customized_konlpy
  Using cached customized_konlpy-0.0.64-py3-none-any.whl (881 kB)
Installing collected packages: customized-konlpy
Successfully installed customized-konlpy-0.0.64


In [221]:
from ckonlpy.tag import Twitter

twitter = Twitter()

twitter.morphs('수진이는 사무실로 갔다. 은경이는 안 갔다')

['수진', '이', '는', '사무실', '로', '갔다', '.', '은', '경이', '는', '안', '갔다']

In [222]:
twitter.add_dictionary('은경이','Noun')

In [223]:
twitter.morphs('수진이는 사무실로 갔다. 은경이는 안 갔다')

['수진', '이', '는', '사무실', '로', '갔다', '.', '은경이', '는', '안', '갔다']