# Word Embedding - Count-based
1. Bag of Words (BOW) 구현하기
2. TF-IDF 구현하기
3. Scikit-learn을 이용한 vectorizer
4. Scikit-learn naive bayes classification

In [12]:
# Install konlpy for Korean tokenization (the apt steps refresh the package
# index and install g++ and the OpenJDK 8 JDK before the pip install).
!apt-get update
!apt-get install g++ openjdk-8-jdk
!pip3 install konlpy

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:3 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Hit:4 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:5 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Ign:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:8 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:9 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:10 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:11 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Get:12 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]


In [13]:
# Library import
from konlpy.tag import Komoran
import re
import pandas as pd
import math

## 01 Bag of Words (BOW)

In [28]:
corpus = "코로나 백신 어서 맞아야 할텐데 하지만 백신 구하기 어려워 코로나 끝났으면."

In [14]:
# Create the Komoran tagger instance used for tokenization below.
komoran = Komoran()

In [29]:
# Remove periods before tokenizing.
# The pattern is a raw string so that "\." reaches the regex engine as an
# escaped literal dot instead of being an invalid string escape sequence.
token = re.sub(r"\.", "", corpus)

In [30]:
token

'코로나 백신 어서 맞아야 할텐데 하지만 백신 구하기 어려워 코로나 끝났으면'

In [31]:
token = komoran.morphs(token)

In [32]:
token

['코로나',
 '백신',
 '어서',
 '맞',
 '아야',
 '하',
 'ㄹ텐데',
 '하지만',
 '백신',
 '구하',
 '기',
 '어렵',
 '어',
 '코로나',
 '끝나',
 '았',
 '으면']

In [19]:
word2index = {}
bow = []

In [20]:
# Walk the tokenized list and build word2index (token -> index).
# bow[i] holds the number of occurrences of the token whose index is i.
for voca in token:
    if voca not in word2index:
        # A new token gets the next free index, which equals the current
        # vocabulary size, so its count slot goes at the end of bow.
        word2index[voca] = len(word2index)
        bow.append(1)
    else:
        bow[word2index[voca]] += 1

In [21]:
print(word2index)

{'코로나': 0, '백신': 1, '어서': 2, '맞': 3, '아야': 4, '하': 5, 'ㄹ텐데': 6, '하지만': 7, '구하': 8, '기': 9, '어렵': 10, '어': 11, '끝나': 12, '았': 13, '으면': 14}


In [22]:
bow

[2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

## 02 TF-IDF

In [None]:
documents = ["차가운 아메리카노 먹고 싶다", "따뜻한 아메리카노", "아메리카노 아메리카노 주세요", "카페라떼 주세요", "차가운 카페라떼 먹고 싶다"]

In [None]:
# These documents contain no periods, so tokenize on single spaces only.
# A set removes duplicates in one pass (no repeated list scans), and
# sorted() returns the distinct words as an ordered list.
vocab = sorted({word for docu in documents for word in docu.split(" ")})

In [None]:
vocab

['따뜻한', '먹고', '싶다', '아메리카노', '주세요', '차가운', '카페라떼']

In [None]:
N = len(documents)

In [None]:
# term frequency
def tf(t, d):
    """Return how many times term ``t`` occurs as a whole token in ``d``.

    Args:
        t: The term to count.
        d: The document: either a whitespace-separated string or an
            already-tokenized list of terms.

    Returns:
        The number of tokens in ``d`` that are equal to ``t``.
    """
    # Count whole tokens. Calling str.count on the raw document would also
    # match ``t`` inside longer words (e.g. "cafe" inside "cafeteria") and
    # inflate the frequency.
    tokens = d.split() if isinstance(d, str) else d
    return tokens.count(t)

# inverse document frequency
def idf(t, docs=None):
    """Return the smoothed inverse document frequency of term ``t``.

    The value is ``log(n / (df + 1))`` where ``n`` is the number of
    documents and ``df`` is the number of documents that contain ``t`` as a
    whole token.

    Args:
        t: The term to score.
        docs: Optional corpus, a sequence of whitespace-separated strings or
            token lists. Defaults to the module-level ``documents``.

    Returns:
        The IDF as a float. Because of the ``+ 1`` in the denominator the
        result is negative when ``t`` occurs in every document.

    Raises:
        ValueError: If the corpus is empty (logarithm of zero).
    """
    if docs is None:
        docs = documents

    df = 0
    for docu in docs:
        # Match whole tokens; a substring test on the raw string would also
        # count documents where ``t`` only appears inside a longer word.
        tokens = docu.split() if isinstance(docu, str) else docu
        if t in tokens:
            df += 1

    # Take the document count from the corpus actually scanned so it can
    # never drift out of sync with a separately maintained counter.
    return math.log(len(docs) / (df + 1))

# tf-idf
def tf_idf(t, d):
    """Return the TF-IDF weight of term ``t`` in document ``d``.

    The weight is the product of ``tf(t, d)`` and ``idf(t)``.
    """
    term_frequency = tf(t, d)
    inverse_document_frequency = idf(t)
    return term_frequency * inverse_document_frequency

In [None]:
# Term-frequency matrix: one row per document, one column per vocabulary
# term (columns follow the order of vocab).
result = [[tf(t, d) for t in vocab] for d in documents]

tf_ = pd.DataFrame(result, columns=vocab)

In [None]:
tf_

Unnamed: 0,따뜻한,먹고,싶다,아메리카노,주세요,차가운,카페라떼
0,0,1,1,1,0,1,0
1,1,0,0,1,0,0,0
2,0,0,0,2,1,0,0
3,0,0,0,0,1,0,1
4,0,1,1,0,0,1,1


In [None]:
# IDF of every vocabulary term, stored as a single "idf" column indexed by
# the term itself.
result = [idf(t) for t in vocab]

idf_ = pd.DataFrame(result, index=vocab, columns=["idf"])

In [None]:
idf_

Unnamed: 0,idf
따뜻한,0.916291
먹고,0.510826
싶다,0.510826
아메리카노,0.223144
주세요,0.510826
차가운,0.510826
카페라떼,0.510826


In [None]:
# TF-IDF matrix: one row per document, one column per vocabulary term
# (columns follow the order of vocab).
result = [[tf_idf(t, d) for t in vocab] for d in documents]

tfidf_ = pd.DataFrame(result, columns=vocab)

In [None]:
tfidf_

Unnamed: 0,따뜻한,먹고,싶다,아메리카노,주세요,차가운,카페라떼
0,0.0,0.510826,0.510826,0.223144,0.0,0.510826,0.0
1,0.916291,0.0,0.0,0.223144,0.0,0.0,0.0
2,0.0,0.0,0.0,0.446287,0.510826,0.0,0.0
3,0.0,0.0,0.0,0.0,0.510826,0.0,0.510826
4,0.0,0.510826,0.510826,0.0,0.0,0.510826,0.510826


## 03 Scikit-learn 사용

### 03-1 Scikit-learn을 사용한 BOW vectorizer

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

In [36]:
corpus = ["코로나 백신 어서 맞아야 할텐데 하지만 백신 구하기 어려워 코로나 끝났으면."]

In [37]:
vector = CountVectorizer()

In [38]:
vector.fit(corpus)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [39]:
vector.transform(corpus).toarray()

array([[1, 1, 1, 2, 1, 1, 2, 1, 1]])

In [40]:
vector.vocabulary_

{'구하기': 0,
 '끝났으면': 1,
 '맞아야': 2,
 '백신': 3,
 '어려워': 4,
 '어서': 5,
 '코로나': 6,
 '하지만': 7,
 '할텐데': 8}

### 03-2 Scikit-learn을 이용한 TF-IDF vectorizer

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [42]:
documents = ["차가운 아메리카노 먹고 싶다", "따뜻한 아메리카노", "아메리카노 아메리카노 주세요", "카페라떼 주세요", "차가운 카페라떼 먹고 싶다"]

In [43]:
tfidfv = TfidfVectorizer().fit(documents)

In [44]:
tfidfv_array = tfidfv.transform(documents).toarray()

In [47]:
# These documents contain no periods, so tokenize on single spaces only.
# A set removes duplicates in one pass (no repeated list scans), and
# sorted() returns the distinct words as an ordered list.
vocab = sorted({word for docu in documents for word in docu.split(" ")})

In [48]:
# Label the columns with the vectorizer's own vocabulary, ordered by column
# index, rather than with a separately built word list. TfidfVectorizer's
# default analyzer lowercases text and ignores single-character tokens, so a
# whitespace-split vocabulary is not guaranteed to line up with the matrix.
pd.DataFrame(tfidfv_array, columns=sorted(tfidfv.vocabulary_, key=tfidfv.vocabulary_.get))

Unnamed: 0,따뜻한,먹고,싶다,아메리카노,주세요,차가운,카페라떼
0,0.0,0.520646,0.520646,0.432183,0.0,0.520646,0.0
1,0.830881,0.0,0.0,0.556451,0.0,0.0,0.0
2,0.0,0.0,0.0,0.856606,0.515971,0.0,0.0
3,0.0,0.0,0.0,0.0,0.707107,0.0,0.707107
4,0.0,0.5,0.5,0.0,0.0,0.5,0.5


In [49]:
tfidfv.vocabulary_

{'따뜻한': 0, '먹고': 1, '싶다': 2, '아메리카노': 3, '주세요': 4, '차가운': 5, '카페라떼': 6}