## 텍스트 분석 - BOW(Bag of Words)

## 1. CountVectorizer (https://wikidocs.net/22650)

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
# Single-document corpus used for the first bag-of-words example.
corpus = ['you know I want your love. because I love you.']

In [5]:
# Build a bag-of-words model: fit() learns the vocabulary from the corpus,
# then transform() turns the corpus into a matrix of per-word counts.
count_vect = CountVectorizer()
count_vect.fit(corpus)
output = count_vect.transform(corpus)
output.toarray() # records how many times each vocabulary word occurs in the corpus
# Result: array([[1, 1, 2, 1, 2, 1]], dtype=int64)

array([[1, 1, 2, 1, 2, 1]], dtype=int64)

In [6]:
count_vect.vocabulary_ # maps each word to its column index in the count matrix (the values are indices, not frequencies)
# 'I' is missing from the vocabulary: the default token_pattern only keeps tokens of two or more characters.
# Result: {'you': 4, 'know': 1, 'want': 3, 'your': 5, 'love': 2, 'because': 0}

{'you': 4, 'know': 1, 'want': 3, 'your': 5, 'love': 2, 'because': 0}

### 불용어를 제거한 BoW

In [7]:
# Remove stop words using a hand-written list.
# "It's" still contributes the token 'it', because 'it' is not in this custom list.
text=["Family is not an important thing. It's everything."]
cvect = CountVectorizer(stop_words=["the", "a", "an", "is", "not"])
print(cvect.fit_transform(text).toarray())
print(cvect.vocabulary_)

[[1 1 1 1 1]]
{'family': 1, 'important': 2, 'thing': 4, 'it': 3, 'everything': 0}


In [9]:
# Use the English stop word list that ships with scikit-learn.
text=["Family is not an important thing. It's everything."]
cvect = CountVectorizer(stop_words='english') # 'it' and 'everything' are also treated as stop words here
# Besides stop_words, other parameters include max_df, min_df, max_features, ngram_range, analyzer, token_pattern, tokenizer
print(cvect.fit_transform(text).toarray())
print(cvect.vocabulary_)

[[1 1 1]]
{'family': 0, 'important': 1, 'thing': 2}


In [11]:
# Use the English stop word list provided by NLTK.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be available locally
# (e.g. via nltk.download('stopwords')) — confirm for this environment.
from nltk.corpus import stopwords
sw = stopwords.words('english')
# Number of entries in the NLTK list (179 in the recorded run).
len(sw)

179

In [13]:
# Same sentence, now filtered with the NLTK stop word list.
# In the recorded output 'everything' survives here, unlike with scikit-learn's built-in 'english' list.
text=["Family is not an important thing. It's everything."]
cvect = CountVectorizer(stop_words=sw)
print(cvect.fit_transform(text).toarray())
print(cvect.vocabulary_)

[[1 1 1 1]]
{'family': 1, 'important': 2, 'thing': 3, 'everything': 0}


In [16]:
# Convert a column index back into the word it stands for.
def get_word(index, voca):
    """Return the first word in ``voca`` whose index equals ``index``.

    ``voca`` is a word -> index mapping such as ``CountVectorizer.vocabulary_``.
    Returns ``None`` when no word carries that index.
    """
    matches = (word for word, position in voca.items() if position == index)
    return next(matches, None)

In [17]:
# Look up which word sits at column index 3 of the most recently fitted vocabulary.
get_word(3, cvect.vocabulary_)

'thing'

### N-gram (BoW의 단점을 극복한 방법): 다음 단어 예측이 가능하고 정확도는 높아지지만, 학습 속도는 떨어짐

In [18]:
# Baseline with default settings (unigrams only): 'is' occurs twice, every other word once.
text=["Machine learning is fun and is not boring"]
cvect = CountVectorizer()
print(cvect.fit_transform(text).toarray())
print(cvect.vocabulary_)

[[1 1 1 2 1 1 1]]
{'machine': 5, 'learning': 4, 'is': 3, 'fun': 2, 'and': 0, 'not': 6, 'boring': 1}


In [19]:
# ngram_range=(1,2): the vocabulary now holds both unigrams and bigrams
# (14 entries in the recorded output, up from 7 with unigrams alone).
cvect = CountVectorizer(ngram_range=(1,2))
print(cvect.fit_transform(text).toarray())
print(cvect.vocabulary_)

[[1 1 1 1 1 2 1 1 1 1 1 1 1 1]]
{'machine': 10, 'learning': 8, 'is': 5, 'fun': 3, 'and': 0, 'not': 12, 'boring': 2, 'machine learning': 11, 'learning is': 9, 'is fun': 6, 'fun and': 4, 'and is': 1, 'is not': 7, 'not boring': 13}


In [20]:
# ngram_range=(1,3): unigrams, bigrams and trigrams (20 entries in the recorded output).
cvect = CountVectorizer(ngram_range=(1,3))
# Warning: when the data is split into train and test sets, do not call fit_transform on each split —
# each fit builds its own vocabulary, so the column indices would differ between the two.
print(cvect.fit_transform(text).toarray())
print(cvect.vocabulary_)

[[1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1]]
{'machine': 15, 'learning': 12, 'is': 7, 'fun': 4, 'and': 0, 'not': 18, 'boring': 3, 'machine learning': 16, 'learning is': 13, 'is fun': 8, 'fun and': 5, 'and is': 1, 'is not': 10, 'not boring': 19, 'machine learning is': 17, 'learning is fun': 14, 'is fun and': 9, 'fun and is': 6, 'and is not': 2, 'is not boring': 11}


In [21]:
# N-grams combined with stop word removal.
# In the recorded output the bigrams join the surviving words (e.g. 'learning fun', 'fun boring'),
# i.e. words that were not adjacent in the original sentence.
cvect = CountVectorizer(ngram_range=(1,2), stop_words='english') 
print(cvect.fit_transform(text).toarray())
print(cvect.vocabulary_)

[[1 1 1 1 1 1 1]]
{'machine': 5, 'learning': 3, 'fun': 1, 'boring': 0, 'machine learning': 6, 'learning fun': 4, 'fun boring': 2}


### CountVectorizer의 파라미터

In [22]:
# Show the parameters of the current vectorizer (the n-gram + stop-word one configured above).
cvect.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 2),
 'preprocessor': None,
 'stop_words': 'english',
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

## 2. TF-IDF Vectorizer (https://wikidocs.net/31698)
단어에 가중치를 반영한다. 한 문서 안에서 빈도수가 높은 단어는 중요한 단어로 보지만,  
전체 문서에 걸쳐 너무 많이 나오는 단어는 관습적으로 쓰이는 단어로 보고 가중치를 낮춘다.

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Three short documents used to compare raw counts with TF-IDF weights.
corpus = [
    'you know I want your love',
    'I like you',
    'what should I do '   
]
# Raw counts first, as a baseline: 'you' (column 7) appears in the first two documents.
count_vect = CountVectorizer()
print(count_vect.fit_transform(corpus).toarray())
print(count_vect.vocabulary_)

[[0 1 0 1 0 1 0 1 1]
 [0 0 1 0 0 0 0 1 0]
 [1 0 0 0 1 0 1 0 0]]
{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}


In [24]:
tfidf_vect = TfidfVectorizer()
print(tfidf_vect.fit_transform(corpus).toarray())
print(tfidf_vect.vocabulary_)
# Usage is the same as CountVectorizer, but the values differ: 'you', which recurs across
# documents, gets a lower weight than the words that appear in only one document.

[[0.         0.46735098 0.         0.46735098 0.         0.46735098
  0.         0.35543247 0.46735098]
 [0.         0.         0.79596054 0.         0.         0.
  0.         0.60534851 0.        ]
 [0.57735027 0.         0.         0.         0.57735027 0.
  0.57735027 0.         0.        ]]
{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}


In [25]:
tfidf_vect.get_params() # the parameter list includes ngram_range, so n-grams can be used here as well

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': None,
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}