## 자연어 벡터화

### 1. CountVectorizer
단어들의 카운트(출현 빈도(frequency))로 여러 문서들을 벡터화

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
# Toy corpus: four short Korean sentences, one document per list entry.
text_data = [
    '나는 배가 고프다',
    '내일 점심 뭐먹지',
    '내일 공부 해야겠다',
    '점심 먹고 공부 해야지',
]
# Vectorizer created with default settings; it is fitted in a later cell.
cvect = CountVectorizer()

In [7]:
# Learn the vocabulary from the corpus and build the document-term count
# matrix (one row per document, one column per vocabulary term).
output = cvect.fit_transform(text_data)
# Convert to a dense array so the individual counts can be inspected.
output.toarray()

array([[1, 0, 1, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 1, 0, 1, 0, 0],
       [0, 1, 0, 1, 0, 0, 0, 0, 1, 0],
       [0, 1, 0, 0, 1, 0, 0, 1, 0, 1]], dtype=int64)

In [15]:
# Mapping from each learned term to its column index in the count matrix.
cvect.vocabulary_

{'나는': 2,
 '배가': 6,
 '고프다': 0,
 '내일': 3,
 '점심': 7,
 '뭐먹지': 5,
 '공부': 1,
 '해야겠다': 8,
 '먹고': 4,
 '해야지': 9}

In [14]:
# Iterating the dict yields its keys, so this lists the terms in sorted order;
# for this fit that order coincides with the column indices in vocabulary_.
sorted(cvect.vocabulary_)

['고프다', '공부', '나는', '내일', '먹고', '뭐먹지', '배가', '점심', '해야겠다', '해야지']

In [18]:
import pandas as pd
# Show the count matrix as a table whose columns are labelled with the terms.
# The terms are ordered by their index in vocabulary_, so every label is tied
# to the matrix column it actually describes instead of relying on the
# alphabetical order of the keys happening to match the column order.
df = pd.DataFrame(
    output.toarray(),
    columns=sorted(cvect.vocabulary_, key=cvect.vocabulary_.get),
)
display(df)

Unnamed: 0,고프다,공부,나는,내일,먹고,뭐먹지,배가,점심,해야겠다,해야지
0,1,0,1,0,0,0,1,0,0,0
1,0,0,0,1,0,1,0,1,0,0
2,0,1,0,1,0,0,0,0,1,0
3,0,1,0,0,1,0,0,1,0,1


In [19]:
# Vectorize the strings again using the already-fitted vocabulary
# (transform only; the vocabulary is not re-learned here).
output2 = cvect.transform(text_data)
# Dense view of the result for inspection.
output2.toarray()

array([[1, 0, 1, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 1, 0, 1, 0, 0],
       [0, 1, 0, 1, 0, 0, 0, 0, 1, 0],
       [0, 1, 0, 0, 1, 0, 0, 1, 0, 1]], dtype=int64)

### 2. TfidfVectorizer
TF-IDF라는 값을 사용하여 CountVectorizer의 단점을 보완<br>
먼저 해당 단어의 TF를 구하고, 이후 전체 문장에서 DF를 구한 후, 해당 값에 역수를 취해준 IDF를 만들어 TF에 곱해준다<br>
단어가 아예 등장하지 않는다면 0이고, 그 이외에는 실수값이 크다면 그 단어가 보다 가치 있는 특징이라는 것이고, 작다면 그다지 가치가 없다고 판단할 수 있다.

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Same four-sentence corpus as in the CountVectorizer section, redefined so
# this section can be run on its own.
text_data = [
    '나는 배가 고프다',
    '내일 점심 뭐먹지',
    '내일 공부 해야겠다',
    '점심 먹고 공부 해야지',
]
# TF-IDF vectorizer created with default settings; fitted in a later cell.
tfidf = TfidfVectorizer()

In [31]:
# Fit the TF-IDF vectorizer on the corpus and compute the weighted matrix.
# Note: this rebinds `output`, which earlier held the CountVectorizer result.
output = tfidf.fit_transform(text_data)

In [32]:
# Dense view of the TF-IDF weights: one row per document, one column per term.
output.toarray()

array([[0.57735027, 0.        , 0.57735027, 0.        , 0.        ,
        0.        , 0.57735027, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.52640543, 0.        ,
        0.66767854, 0.        , 0.52640543, 0.        , 0.        ],
       [0.        , 0.52640543, 0.        , 0.52640543, 0.        ,
        0.        , 0.        , 0.        , 0.66767854, 0.        ],
       [0.        , 0.43779123, 0.        , 0.        , 0.55528266,
        0.        , 0.        , 0.43779123, 0.        , 0.55528266]])

In [35]:
# Term-to-column-index mapping learned by the TF-IDF vectorizer.
tfidf.vocabulary_

{'나는': 2,
 '배가': 6,
 '고프다': 0,
 '내일': 3,
 '점심': 7,
 '뭐먹지': 5,
 '공부': 1,
 '해야겠다': 8,
 '먹고': 4,
 '해야지': 9}

In [36]:
# Show the TF-IDF matrix as a table whose columns are labelled with the terms.
# The terms are ordered by their index in vocabulary_, so every label is tied
# to the matrix column it actually describes instead of relying on the
# alphabetical order of the keys happening to match the column order.
df2 = pd.DataFrame(
    output.toarray(),
    columns=sorted(tfidf.vocabulary_, key=tfidf.vocabulary_.get),
)
display(df2)

Unnamed: 0,고프다,공부,나는,내일,먹고,뭐먹지,배가,점심,해야겠다,해야지
0,0.57735,0.0,0.57735,0.0,0.0,0.0,0.57735,0.0,0.0,0.0
1,0.0,0.0,0.0,0.526405,0.0,0.667679,0.0,0.526405,0.0,0.0
2,0.0,0.526405,0.0,0.526405,0.0,0.0,0.0,0.0,0.667679,0.0
3,0.0,0.437791,0.0,0.0,0.555283,0.0,0.0,0.437791,0.0,0.555283


## 모델 학습

linear/logistic/pipeline