#### Vectorizer 클래스 사용법
1. 클래스 객체 생성
2. 말뭉치를 넣고 fit 메서드 실행
3. vocabulary_ 속성에 단어장이 자동 생성됨
4. transform 메서드로 다른 문서를 BOW 인코딩
5. BOW 인코딩 결과는 Sparse 행렬로 만들어지므로 toarray 메서드로 보통 행렬로 변환

#### DictVectorizer

In [22]:
# 현실적으로 쓸 일이 없다
from sklearn.feature_extraction import DictVectorizer
v = DictVectorizer(sparse=False)
D = [{'A': 1, 'B': 2}, {'B': 3, 'C': 1}]
X = v.fit_transform(D)
X

array([[1., 2., 0.],
       [0., 3., 1.]])

In [23]:
v.feature_names_

['A', 'B', 'C']

In [24]:
# fit_transform 이전에 설정된 corpus에 없는 단어는 무효처리
v.transform({'C': 4, 'D': 3})

array([[0., 0., 4.]])

#### CountVectorizer

##### step 1. 말뭉치 만들기

In [25]:
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
    'The last document?',
]

##### step 2. 인코더 객체 생성

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer()

##### step 3. 말뭉치 학습 및 단어장 생성

In [27]:
vect.fit(corpus)
vect.vocabulary_

{'this': 9,
 'is': 3,
 'the': 7,
 'first': 2,
 'document': 1,
 'second': 6,
 'and': 0,
 'third': 8,
 'one': 5,
 'last': 4}

##### step 4. 문장을 BOW 인코딩

In [28]:
vect.transform(['This is the second document.']).toarray()

array([[0, 1, 0, 1, 0, 0, 1, 1, 0, 1]], dtype=int64)

In [29]:
# 말뭉치에서 학습하지 못한 단어는 무시된다.
vect.transform(['Something completely new.']).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [30]:
vect.transform(corpus).toarray()

array([[0, 1, 1, 1, 0, 0, 0, 1, 0, 1],
       [0, 1, 0, 1, 0, 0, 2, 1, 0, 1],
       [1, 0, 0, 0, 0, 1, 0, 1, 1, 0],
       [0, 1, 1, 1, 0, 0, 0, 1, 0, 1],
       [0, 1, 0, 0, 1, 0, 0, 1, 0, 0]], dtype=int64)

##### step 5. 불용어 사용

In [31]:
vect = CountVectorizer(stop_words=['and', 'is', 'the', 'this']).fit(corpus)
vect.vocabulary_

{'first': 1, 'document': 0, 'second': 4, 'third': 5, 'one': 3, 'last': 2}

In [32]:
vect = CountVectorizer(stop_words='english').fit(corpus)
vect.vocabulary_

{'document': 0, 'second': 1}

#### step 6. 빈도수 적용

In [33]:
vect = CountVectorizer(max_df=4, min_df=2).fit(corpus)
vect.vocabulary_, vect.stop_words_

({'this': 3, 'is': 2, 'first': 1, 'document': 0},
 {'and', 'last', 'one', 'second', 'the', 'third'})

In [34]:
vect.transform(corpus).toarray().sum(axis=0)

array([4, 2, 3, 3], dtype=int64)

##### step 7. N그램 적용

In [35]:
# 단어장 생성에 사용할 토큰의 크기를 결정 (unigram, bigram, trigram, etc.)
vect = CountVectorizer(ngram_range=(2, 2)).fit(corpus)
vect.vocabulary_

{'this is': 12,
 'is the': 2,
 'the first': 7,
 'first document': 1,
 'the second': 9,
 'second second': 6,
 'second document': 5,
 'and the': 0,
 'the third': 10,
 'third one': 11,
 'is this': 3,
 'this the': 13,
 'the last': 8,
 'last document': 4}

In [36]:
vect = CountVectorizer(ngram_range=(1, 2), token_pattern='t\w+').fit(corpus)
vect.vocabulary_

{'this': 3, 'the': 0, 'this the': 4, 'third': 2, 'the third': 1}

##### 토큰

In [37]:
vect = CountVectorizer(analyzer='char').fit(corpus)
vect.vocabulary_

{'t': 16,
 'h': 8,
 'i': 9,
 's': 15,
 ' ': 0,
 'e': 6,
 'f': 7,
 'r': 14,
 'd': 5,
 'o': 13,
 'c': 4,
 'u': 17,
 'm': 11,
 'n': 12,
 '.': 1,
 'a': 3,
 '?': 2,
 'l': 10}

In [38]:
vect = CountVectorizer(token_pattern='t\w+').fit(corpus)
vect.vocabulary_

{'this': 2, 'the': 0, 'third': 1}

In [39]:
import nltk

vect = CountVectorizer(tokenizer=nltk.word_tokenize).fit(corpus)
vect.vocabulary_

{'this': 11,
 'is': 5,
 'the': 9,
 'first': 4,
 'document': 3,
 '.': 0,
 'second': 8,
 'and': 2,
 'third': 10,
 'one': 7,
 '?': 1,
 'last': 6}

#### TfidfVectorizer

In [45]:
# 비율에 중요도를 곱해놓은 것
from sklearn.feature_extraction.text import TfidfVectorizer

tfidv = TfidfVectorizer().fit(corpus)
tfidv.transform(corpus).toarray()

array([[0.        , 0.38947624, 0.55775063, 0.4629834 , 0.        ,
        0.        , 0.        , 0.32941651, 0.        , 0.4629834 ],
       [0.        , 0.24151532, 0.        , 0.28709733, 0.        ,
        0.        , 0.85737594, 0.20427211, 0.        , 0.28709733],
       [0.55666851, 0.        , 0.        , 0.        , 0.        ,
        0.55666851, 0.        , 0.26525553, 0.55666851, 0.        ],
       [0.        , 0.38947624, 0.55775063, 0.4629834 , 0.        ,
        0.        , 0.        , 0.32941651, 0.        , 0.4629834 ],
       [0.        , 0.45333103, 0.        , 0.        , 0.80465933,
        0.        , 0.        , 0.38342448, 0.        , 0.        ]])

#### 해시 트릭 (Hashing Trick)
- HashingVectorizer를 사용하면 해시 함수(Hash function)를 사용
- 단어에 대한 인덳 번호를 수식으로 생성 (문자를 입력하면 숫자가 출력되기 때문에 암호화에 자주 사용됨)
- 장점: 사전 메모리가 없고 실행 시간을 줄일 수 있다 (단어장의 길이가 길어져도 연산 속도가 빠르다)
- 단점: 가끔 단어의 충돌이 발생할 수 있다 (e.g. 'boy' = 3, 'girl' = 3) -> 이는 파라미터 조정으로 해결한다

In [47]:
from sklearn.datasets import fetch_20newsgroups
twenty = fetch_20newsgroups()
len(twenty.data)

11314

In [48]:
%time CountVectorizer().fit(twenty.data).transform(twenty.data)

Wall time: 11.1 s


<11314x130107 sparse matrix of type '<class 'numpy.int64'>'
	with 1787565 stored elements in Compressed Sparse Row format>

In [49]:
from sklearn.feature_extraction.text import HashingVectorizer
hv = HashingVectorizer(n_features=300000)

In [50]:
%time hv.transform(twenty.data)

Wall time: 4.07 s


<11314x300000 sparse matrix of type '<class 'numpy.float64'>'
	with 1786336 stored elements in Compressed Sparse Row format>

### Gensim 패키지
- Bag of Words 인코딩
- TF-IDF 인코딩
- 토픽 모델링

#### Gensim의 BOW 인코딩 기능
- Dictionary 클래스 이용
    - token2id 속성으로 사전 저장
    - doc2bow 메서드로 BOW 인코딩
- TfidModel 클래스를 이용하면 TF-IDF 인코딩도 가능

##### step 1. 말뭉치 만들기

In [53]:
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
    'The last document?',
]

##### step 2. 토큰 리스트 생성

In [56]:
# split 메서드도 사용 가능하다
token_list = [[text for text in doc.split()] for doc in corpus]
token_list

[['This', 'is', 'the', 'first', 'document.'],
 ['This', 'is', 'the', 'second', 'second', 'document.'],
 ['And', 'the', 'third', 'one.'],
 ['Is', 'this', 'the', 'first', 'document?'],
 ['The', 'last', 'document?']]

##### step 3. Dictionary 객체 생성

In [58]:
from gensim.corpora import Dictionary

dictionary = Dictionary(token_list)
dictionary.token2id

{'This': 0,
 'document.': 1,
 'first': 2,
 'is': 3,
 'the': 4,
 'second': 5,
 'And': 6,
 'one.': 7,
 'third': 8,
 'Is': 9,
 'document?': 10,
 'this': 11,
 'The': 12,
 'last': 13}

##### step 4. BOW 인코딩

In [60]:
term_matrix = [dictionary.doc2bow(token) for token in token_list]
term_matrix

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)],
 [(0, 1), (1, 1), (3, 1), (4, 1), (5, 2)],
 [(4, 1), (6, 1), (7, 1), (8, 1)],
 [(2, 1), (4, 1), (9, 1), (10, 1), (11, 1)],
 [(10, 1), (12, 1), (13, 1)]]

##### step 5. TF-IDF 인코딩

In [62]:
from gensim.models import TfidfModel

tfidf = TfidfModel(term_matrix)

for doc in tfidf[term_matrix]:
    print('doc:')
    for k, v in doc:
        print(k, v)

doc:
0 0.49633406058198626
1 0.49633406058198626
2 0.49633406058198626
3 0.49633406058198626
4 0.12087183801361165
doc:
0 0.25482305694621393
1 0.25482305694621393
3 0.25482305694621393
4 0.0620568558708622
5 0.8951785160431313
doc:
4 0.07979258234193365
6 0.5755093812740171
7 0.5755093812740171
8 0.5755093812740171
doc:
2 0.3485847413542797
4 0.08489056411237639
9 0.6122789185961829
10 0.3485847413542797
11 0.6122789185961829
doc:
10 0.37344696513776354
12 0.6559486886294514
13 0.6559486886294514


#### 토픽 모델링
- 토픽
    - 문서를 구성하는 단어의 비율 (frequency distribution)
    - 문서 집합의 추상적인 주제

##### step 1. 텍스트 데이터 다운로드

In [63]:
newsgroups = fetch_20newsgroups(categories=['comp.graphics', 'rec.sport.baseball', 'sci.med'])

##### step 2. 명사 추출

In [66]:
%%time
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

tagged_list = [pos_tag(word_tokenize(doc)) for doc in newsgroups.data]
nouns_list = [[t[0] for t in doc if t[1].startswith('N')] for doc in tagged_list]

Wall time: 57.2 s


##### step 3. 표제어 추출

In [67]:
from nltk.stem import WordNetLemmatizer

lm = WordNetLemmatizer()

nouns_list = [[lm.lemmatize(w, pos='n') for w in doc] for doc in nouns_list]

##### step 4. 불용어 제거

In [68]:
import re
token_list = [[text.lower() for text in doc] for doc in nouns_list]
token_list = [[re.sub(r'[^A-Za-z]+', '', word) for word in doc] for doc in token_list]

In [69]:
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
stop_words += ['', 'subject', 'article', 'line', 'year', 'month', 'address', 'keyword', 'msg']

token_list = [[word for word in doc if (word not in stop_words) and (2 < len(word) < 10)] for doc in token_list]

##### step 5. 토픽 모델링

In [72]:
from gensim import corpora

dictionary = corpora.Dictionary(token_list)
doc_term_matrix = [dictionary.doc2bow(tokens) for tokens in token_list]

In [73]:
%%time
from gensim.models.ldamodel import LdaModel

model = LdaModel(corpus=doc_term_matrix, id2word=dictionary, num_topics=3)

Wall time: 5.88 s


In [74]:
model.print_topics()

[(0,
  '0.010*"image" + 0.009*"lines" + 0.005*"program" + 0.004*"game" + 0.004*"system" + 0.004*"point" + 0.003*"file" + 0.003*"software" + 0.003*"time" + 0.003*"people"'),
 (1,
  '0.014*"lines" + 0.006*"file" + 0.006*"image" + 0.006*"time" + 0.005*"game" + 0.004*"banks" + 0.004*"geb" + 0.004*"format" + 0.004*"computer" + 0.003*"team"'),
 (2,
  '0.013*"lines" + 0.005*"time" + 0.005*"people" + 0.004*"anyone" + 0.004*"food" + 0.004*"problem" + 0.004*"game" + 0.003*"team" + 0.003*"thing" + 0.003*"day"')]

##### step 6. 토픽 시각화

In [76]:
import pyLDAvis
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(model, doc_term_matrix, dictionary)
vis