# Bag of Words

### 1. Count Vectorizer

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
corpus = ['you know I want your love. because I love you']

In [6]:
count_vect = CountVectorizer()
output = count_vect.fit_transform(corpus)
output.toarray()

array([[1, 1, 2, 1, 2, 1]])

In [8]:
count_vect.vocabulary_

{'you': 4, 'know': 1, 'want': 3, 'your': 5, 'love': 2, 'because': 0}

- 불용어를 제거한 BoW

In [16]:
#1) 자체 제거
text = ["Family i snot an important thing. It's everything."]
cvect = CountVectorizer(stop_words=['the','a','an','is','not'])
output = cvect.fit_transform(text)
print(output.toarray)

  (0, 1)	1
  (0, 4)	1
  (0, 2)	1
  (0, 5)	1
  (0, 3)	1
  (0, 0)	1


In [18]:
print(output.toarray())

[[1 1 1 1 1 1]]


In [19]:
print(cvect.vocabulary_)

{'family': 1, 'snot': 4, 'important': 2, 'thing': 5, 'it': 3, 'everything': 0}


- Scikit-Learn 에서 제공하는 불용어 사용

In [23]:
text = ["Family i snot an important thing. It's everything."]
cvect = CountVectorizer(stop_words=['english'])
output = cvect.fit_transform(text)
print(output.toarray())
print(cvect.vocabulary_)

[[1 1 1 1 1 1 1]]
{'family': 2, 'snot': 5, 'an': 0, 'important': 3, 'thing': 6, 'it': 4, 'everything': 1}


In [25]:
# NLTK에서 제공하는 불용어 사용
from nltk.corpus import stopwords
sw = stopwords.words('english')
len(sw)

179

- N gram

In [26]:
text = ["Machine learning is fun and is not boring."]
cvect = CountVectorizer()
print(cvect.fit_transform(text).toarray())
print(cvect.vocabulary_)

[[1 1 1 2 1 1 1]]
{'machine': 5, 'learning': 4, 'is': 3, 'fun': 2, 'and': 0, 'not': 6, 'boring': 1}


In [31]:
# unigram ~ bigram
cvect2 = CountVectorizer(ngram_range=(1,2))
print(cvect2.fit_transform(text).toarray())
print(cvect2.vocabulary_)

[[1 1 1 1 1 2 1 1 1 1 1 1 1 1]]
{'machine': 10, 'learning': 8, 'is': 5, 'fun': 3, 'and': 0, 'not': 12, 'boring': 2, 'machine learning': 11, 'learning is': 9, 'is fun': 6, 'fun and': 4, 'and is': 1, 'is not': 7, 'not boring': 13}


In [33]:
# unigram ~ bigram
cvect3 = CountVectorizer(ngram_range=(1,3), stop_words='english')
print(cvect3.fit_transform(text).toarray())
print(cvect3.vocabulary_)

[[1 1 1 1 1 1 1 1 1]]
{'machine': 6, 'learning': 3, 'fun': 1, 'boring': 0, 'machine learning': 7, 'learning fun': 4, 'fun boring': 2, 'machine learning fun': 8, 'learning fun boring': 5}


In [35]:
cvect.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'preprocessor': None,
 'stop_words': None,
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

### 2. TF-IDF Vectorizer

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [37]:
corpus = [
    "you know I want your love",
    "I like you",
    "what should I do"
]

In [38]:
cvect = CountVectorizer()
print(cvect.fit_transform(corpus).toarray())
print(cvect.vocabulary_)

[[0 1 0 1 0 1 0 1 1]
 [0 0 1 0 0 0 0 1 0]
 [1 0 0 0 1 0 1 0 0]]
{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}


In [42]:
tvect = TfidfVectorizer()
tvect_result = tvect.fit_transform(corpus)
tvect_result.toarray()

array([[0.        , 0.46735098, 0.        , 0.46735098, 0.        ,
        0.46735098, 0.        , 0.35543247, 0.46735098],
       [0.        , 0.        , 0.79596054, 0.        , 0.        ,
        0.        , 0.        , 0.60534851, 0.        ],
       [0.57735027, 0.        , 0.        , 0.        , 0.57735027,
        0.        , 0.57735027, 0.        , 0.        ]])

In [43]:
tvect.vocabulary_

{'you': 7,
 'know': 1,
 'want': 5,
 'your': 8,
 'love': 3,
 'like': 2,
 'what': 6,
 'should': 4,
 'do': 0}

In [44]:
tvect.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': None,
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}