<a href="https://colab.research.google.com/github/ASIF-Mahmud1/Exploration/blob/text-classifier/CountVectorizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CountVectorizer Usage Examples

In [None]:
# Blog https://kavita-ganesan.com/how-to-use-countvectorizer/#Resources

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Warm Up Example

In [None]:

# Two short sample documents used by the warm-up example below.
doc=["One Cent, Two Cents, Old Cent, New Cent: All About Money", "All goes good, when everything is good"]


In [None]:
# The constructor only takes configuration (keyword arguments); the corpus is
# passed to fit_transform(), not to CountVectorizer() itself.
cv = CountVectorizer()
# Learn the vocabulary from the documents and return the sparse document-term count matrix.
count_vector=cv.fit_transform(doc)

In [None]:
# show resulting vocabulary; the numbers are not counts, they are the position in the sparse vector.
cv.vocabulary_
# cv.get_feature_names_out()  # alternative: the terms as an array ordered by column index

{'about': 0,
 'all': 1,
 'cent': 2,
 'cents': 3,
 'everything': 4,
 'goes': 5,
 'good': 6,
 'is': 7,
 'money': 8,
 'new': 9,
 'old': 10,
 'one': 11,
 'two': 12,
 'when': 13}

In [None]:
# matrix shape: 2 documents, 14 unique words
count_vector.shape

(2, 14)

In [None]:
# any words eliminated internally? -- nope
cv.stop_words_

set()

# CountVectorizer With More Data

## Plain and Simple

In [None]:
# Sample corpus: five book titles, each ending with the series name
# "Cat in the Hat's Learning Library" in parentheses.
# NOTE(review): the first title has no closing ")" unlike the other four -- looks
# like a typo; left unchanged here so the recorded outputs below stay valid.
cat_in_the_hat_docs=[
      "One Cent, Two Cents, Old Cent, New Cent: All About Money (Cat in the Hat's Learning Library",
      "Inside Your Outside: All About the Human Body (Cat in the Hat's Learning Library)",
      "Oh, The Things You Can Do That Are Good for You: All About Staying Healthy (Cat in the Hat's Learning Library)",
      "On Beyond Bugs: All About Insects (Cat in the Hat's Learning Library)",
      "There's No Place Like Space: All About Our Solar System (Cat in the Hat's Learning Library)" 
     ]


In [None]:
# Default settings; the corpus is given to fit_transform(), not to the constructor.
cv = CountVectorizer()
count_vector=cv.fit_transform(cat_in_the_hat_docs)

In [None]:
# show resulting vocabulary; the numbers are not counts, they are the position in the sparse vector.
cv.vocabulary_

{'about': 0,
 'all': 1,
 'are': 2,
 'beyond': 3,
 'body': 4,
 'bugs': 5,
 'can': 6,
 'cat': 7,
 'cent': 8,
 'cents': 9,
 'do': 10,
 'for': 11,
 'good': 12,
 'hat': 13,
 'healthy': 14,
 'human': 15,
 'in': 16,
 'insects': 17,
 'inside': 18,
 'learning': 19,
 'library': 20,
 'like': 21,
 'money': 22,
 'new': 23,
 'no': 24,
 'oh': 25,
 'old': 26,
 'on': 27,
 'one': 28,
 'our': 29,
 'outside': 30,
 'place': 31,
 'solar': 32,
 'space': 33,
 'staying': 34,
 'system': 35,
 'that': 36,
 'the': 37,
 'there': 38,
 'things': 39,
 'two': 40,
 'you': 41,
 'your': 42}

In [None]:
#shape of count vector: 5 docs (book titles) and 43 unique words
count_vector.shape

(5, 43)

In [None]:
#any stop words?
cv.stop_words_

set()

## CountVectorizer With Custom StopWords

In [None]:
# Drop a hand-picked list of stop words; the corpus is given to fit_transform(),
# not to the constructor.
cv = CountVectorizer(stop_words=["all","in","the","is","and"])
count_vector=cv.fit_transform(cat_in_the_hat_docs)
count_vector.shape

(5, 40)

In [None]:
#any stop words that we explicitly specified?
cv.stop_words # cv.stop_words gives you the stop words that you explicitly specified 

['all', 'in', 'the', 'is', 'and']

In [None]:
#any stop words internally stopped by countvectorizer?
cv.stop_words_  # cv.stop_words_  gives you the stop words that CountVectorizer inferred from your min_df and max_df settings as well as those that were cut off during feature selection

set()

## CountVectorizer With Predefined StopWords

In [None]:
# Use scikit-learn's built-in English stop word list; the corpus is given to
# fit_transform(), not to the constructor.
cv = CountVectorizer(stop_words="english")
count_vector=cv.fit_transform(cat_in_the_hat_docs)

In [None]:
# the shape should be smaller
count_vector.shape

(5, 24)

In [None]:
#any stop words that we explicitly specified?
cv.stop_words

'english'

In [None]:
#any stop words internally stopped by countvectorizer?
cv.stop_words_

set()

In [None]:
# much smaller vocabulary with stopwords applied
cv.vocabulary_

{'body': 0,
 'bugs': 1,
 'cat': 2,
 'cent': 3,
 'cents': 4,
 'good': 5,
 'hat': 6,
 'healthy': 7,
 'human': 8,
 'insects': 9,
 'inside': 10,
 'learning': 11,
 'library': 12,
 'like': 13,
 'money': 14,
 'new': 15,
 'oh': 16,
 'old': 17,
 'outside': 18,
 'place': 19,
 'solar': 20,
 'space': 21,
 'staying': 22,
 'things': 23}

## CountVectorizer with MIN_DF as StopWords

In [None]:
# min_df ignores terms whose document frequency (the number of documents that
# contain the term) is below the threshold; it can be a proportion or an absolute count.
# Here: ignore terms that appeared in fewer than 2 documents.
# The corpus is given to fit_transform(), not to the constructor.
cv = CountVectorizer(min_df=2)
count_vector=cv.fit_transform(cat_in_the_hat_docs)

In [None]:
#any stop words internally stopped by countvectorizer?
cv.stop_words_

{'are',
 'beyond',
 'body',
 'bugs',
 'can',
 'cent',
 'cents',
 'do',
 'for',
 'good',
 'healthy',
 'human',
 'insects',
 'inside',
 'like',
 'money',
 'new',
 'no',
 'oh',
 'old',
 'on',
 'one',
 'our',
 'outside',
 'place',
 'solar',
 'space',
 'staying',
 'system',
 'that',
 'there',
 'things',
 'two',
 'you',
 'your'}

In [None]:
count_vector.shape

(5, 8)

In [None]:
# use proportion here. Ignore terms that occurred in less than 25% of the documents
# (the corpus is given to fit_transform(), not to the constructor)
cv = CountVectorizer(min_df=0.25)
count_vector=cv.fit_transform(cat_in_the_hat_docs)

In [None]:
count_vector.shape

(5, 8)

In [None]:
cv.vocabulary_

{'about': 0,
 'all': 1,
 'cat': 2,
 'hat': 3,
 'in': 4,
 'learning': 5,
 'library': 6,
 'the': 7}

In [None]:
#any stop words internally stopped by countvectorizer?
cv.stop_words_

{'are',
 'beyond',
 'body',
 'bugs',
 'can',
 'cent',
 'cents',
 'do',
 'for',
 'good',
 'healthy',
 'human',
 'insects',
 'inside',
 'like',
 'money',
 'new',
 'no',
 'oh',
 'old',
 'on',
 'one',
 'our',
 'outside',
 'place',
 'solar',
 'space',
 'staying',
 'system',
 'that',
 'there',
 'things',
 'two',
 'you',
 'your'}

## CountVectorizer with MAX_DF as StopWords

In [None]:
# ignore terms that appeared in more than n documents (can be proportion or absolute counts)
# use proportion here: terms found in more than 50% of the documents are dropped
# (the corpus is given to fit_transform(), not to the constructor)
cv = CountVectorizer(max_df=0.50)
count_vector=cv.fit_transform(cat_in_the_hat_docs)

In [None]:
cv.vocabulary_

{'are': 0,
 'beyond': 1,
 'body': 2,
 'bugs': 3,
 'can': 4,
 'cent': 5,
 'cents': 6,
 'do': 7,
 'for': 8,
 'good': 9,
 'healthy': 10,
 'human': 11,
 'insects': 12,
 'inside': 13,
 'like': 14,
 'money': 15,
 'new': 16,
 'no': 17,
 'oh': 18,
 'old': 19,
 'on': 20,
 'one': 21,
 'our': 22,
 'outside': 23,
 'place': 24,
 'solar': 25,
 'space': 26,
 'staying': 27,
 'system': 28,
 'that': 29,
 'there': 30,
 'things': 31,
 'two': 32,
 'you': 33,
 'your': 34}

In [None]:
cv.stop_words_

{'about', 'all', 'cat', 'hat', 'in', 'learning', 'library', 'the'}

In [None]:
# ignore terms that appeared in more than n documents (can be proportion or absolute counts)
# use absolute values here - suitable when you know number of documents ahead of time and are dealing with only a handful
# (the corpus is given to fit_transform(), not to the constructor)
cv = CountVectorizer(max_df=4)
count_vector=cv.fit_transform(cat_in_the_hat_docs)

In [None]:
cv.stop_words_

{'about', 'all', 'cat', 'hat', 'in', 'learning', 'library', 'the'}

## Custom Preprocessing 

In [None]:
import re
import nltk
import pandas as pd
from nltk.stem import PorterStemmer

# init stemmer
porter_stemmer=PorterStemmer()

def my_cool_preprocessor(text):
    """Lowercase, de-punctuate, normalize connector words and stem ``text``.

    Steps, in order: lowercase the text; replace every non-word character
    with a space; replace each of in/the/all/for/and/on that has whitespace
    on both sides with ``_connector_``; split on whitespace; Porter-stem
    each piece; join the stems with single spaces.

    Note: the connector pattern consumes the whitespace around a match, so a
    connector at the very start or end of the text, or one that directly
    follows another replaced connector, is left as-is.
    """
    cleaned = re.sub("\\W", " ", text.lower())  # remove special chars
    cleaned = re.sub("\\s+(in|the|all|for|and|on)\\s+", " _connector_ ", cleaned)  # normalize certain words

    # stem words
    stems = []
    for token in re.split("\\s+", cleaned):
        stems.append(porter_stemmer.stem(word=token))
    return ' '.join(stems)

# Plug the custom preprocessor into the vectorizer; the corpus is given to
# fit_transform(), not to the constructor.
cv = CountVectorizer(preprocessor=my_cool_preprocessor)
count_vector=cv.fit_transform(cat_in_the_hat_docs)

In [None]:
cv.vocabulary_

{'_connector_': 0,
 'about': 1,
 'are': 2,
 'beyond': 3,
 'bodi': 4,
 'bug': 5,
 'can': 6,
 'cat': 7,
 'cent': 8,
 'do': 9,
 'good': 10,
 'hat': 11,
 'healthi': 12,
 'human': 13,
 'insect': 14,
 'insid': 15,
 'learn': 16,
 'librari': 17,
 'like': 18,
 'money': 19,
 'new': 20,
 'no': 21,
 'oh': 22,
 'old': 23,
 'on': 24,
 'one': 25,
 'our': 26,
 'outsid': 27,
 'place': 28,
 'solar': 29,
 'space': 30,
 'stay': 31,
 'system': 32,
 'that': 33,
 'the': 34,
 'there': 35,
 'thing': 36,
 'two': 37,
 'you': 38,
 'your': 39}

In [None]:
# Re-fit with the custom preprocessor; the corpus is given to fit_transform(),
# not to the constructor.
cv = CountVectorizer(preprocessor=my_cool_preprocessor)
count_vector=cv.fit_transform(cat_in_the_hat_docs)

## Working With N-Grams

In [None]:
#only bigrams, word level
# (the corpus is given to fit_transform(), not to the constructor)
cv = CountVectorizer(ngram_range=(2,2),preprocessor=my_cool_preprocessor)
count_vector=cv.fit_transform(cat_in_the_hat_docs)

In [None]:
cv.vocabulary_

{'one cent': 35,
 'cent two': 19,
 'two cent': 47,
 'cent old': 18,
 'old cent': 33,
 'cent new': 17,
 'new cent': 30,
 'cent _connector_': 16,
 '_connector_ about': 0,
 'about money': 7,
 'money cat': 29,
 'cat _connector_': 15,
 '_connector_ the': 2,
 'the hat': 44,
 'hat learn': 22,
 'learn librari': 27,
 'insid your': 26,
 'your outsid': 50,
 'outsid _connector_': 37,
 'about _connector_': 5,
 '_connector_ human': 1,
 'human bodi': 24,
 'bodi cat': 12,
 'oh _connector_': 32,
 '_connector_ thing': 3,
 'thing you': 46,
 'you can': 49,
 'can do': 14,
 'do that': 20,
 'that are': 43,
 'are good': 10,
 'good _connector_': 21,
 '_connector_ you': 4,
 'you _connector_': 48,
 'about stay': 9,
 'stay healthi': 41,
 'healthi cat': 23,
 'on beyond': 34,
 'beyond bug': 11,
 'bug _connector_': 13,
 'about insect': 6,
 'insect cat': 25,
 'there no': 45,
 'no place': 31,
 'place like': 38,
 'like space': 28,
 'space _connector_': 40,
 'about our': 8,
 'our solar': 36,
 'solar system': 39,
 'syste

In [None]:
count_vector.shape

(5, 51)

In [None]:
#only bigrams and unigrams
# (the corpus is given to fit_transform(), not to the constructor)
cv = CountVectorizer(ngram_range=(1,2),preprocessor=my_cool_preprocessor)
count_vector=cv.fit_transform(cat_in_the_hat_docs)

In [None]:
cv.vocabulary_

{'_connector_': 0,
 '_connector_ about': 1,
 '_connector_ human': 2,
 '_connector_ the': 3,
 '_connector_ thing': 4,
 '_connector_ you': 5,
 'about': 6,
 'about _connector_': 7,
 'about insect': 8,
 'about money': 9,
 'about our': 10,
 'about stay': 11,
 'are': 12,
 'are good': 13,
 'beyond': 14,
 'beyond bug': 15,
 'bodi': 16,
 'bodi cat': 17,
 'bug': 18,
 'bug _connector_': 19,
 'can': 20,
 'can do': 21,
 'cat': 22,
 'cat _connector_': 23,
 'cent': 24,
 'cent _connector_': 25,
 'cent new': 26,
 'cent old': 27,
 'cent two': 28,
 'do': 29,
 'do that': 30,
 'good': 31,
 'good _connector_': 32,
 'hat': 33,
 'hat learn': 34,
 'healthi': 35,
 'healthi cat': 36,
 'human': 37,
 'human bodi': 38,
 'insect': 39,
 'insect cat': 40,
 'insid': 41,
 'insid your': 42,
 'learn': 43,
 'learn librari': 44,
 'librari': 45,
 'like': 46,
 'like space': 47,
 'money': 48,
 'money cat': 49,
 'new': 50,
 'new cent': 51,
 'no': 52,
 'no place': 53,
 'oh': 54,
 'oh _connector_': 55,
 'old': 56,
 'old cent': 57

In [None]:
count_vector.shape

(5, 91)

## Working With Character N-Grams

In [None]:
#only character level bigrams
# (the corpus is given to fit_transform(), not to the constructor)
cv = CountVectorizer(ngram_range=(2,2),preprocessor=my_cool_preprocessor,analyzer='char_wb')
count_vector=cv.fit_transform(cat_in_the_hat_docs)

In [None]:
cv.vocabulary_

{' _': 0,
 ' a': 1,
 ' b': 2,
 ' c': 3,
 ' d': 4,
 ' g': 5,
 ' h': 6,
 ' i': 7,
 ' l': 8,
 ' m': 9,
 ' n': 10,
 ' o': 11,
 ' p': 12,
 ' s': 13,
 ' t': 14,
 ' y': 15,
 '_ ': 16,
 '_c': 17,
 'ab': 18,
 'ac': 19,
 'al': 20,
 'an': 21,
 'ar': 22,
 'at': 23,
 'ay': 24,
 'be': 25,
 'bo': 26,
 'br': 27,
 'bu': 28,
 'ca': 29,
 'ce': 30,
 'co': 31,
 'ct': 32,
 'd ': 33,
 'di': 34,
 'do': 35,
 'e ': 36,
 'ea': 37,
 'ec': 38,
 'em': 39,
 'en': 40,
 'er': 41,
 'ew': 42,
 'ey': 43,
 'g ': 44,
 'go': 45,
 'h ': 46,
 'ha': 47,
 'he': 48,
 'hi': 49,
 'hu': 50,
 'i ': 51,
 'ib': 52,
 'id': 53,
 'ik': 54,
 'in': 55,
 'ke': 56,
 'la': 57,
 'ld': 58,
 'le': 59,
 'li': 60,
 'lt': 61,
 'm ': 62,
 'ma': 63,
 'mo': 64,
 'n ': 65,
 'nd': 66,
 'ne': 67,
 'ng': 68,
 'nn': 69,
 'no': 70,
 'ns': 71,
 'nt': 72,
 'o ': 73,
 'od': 74,
 'oh': 75,
 'ol': 76,
 'on': 77,
 'oo': 78,
 'or': 79,
 'ou': 80,
 'pa': 81,
 'pl': 82,
 'r ': 83,
 'r_': 84,
 'ra': 85,
 're': 86,
 'ri': 87,
 'rn': 88,
 's ': 89,
 'se': 90,
 'si': 91

In [None]:
count_vector.shape

(5, 113)

## Limiting Vocabulary Size

In [None]:
#only bigrams and unigrams, limit to vocab size of 10
# (the corpus is given to fit_transform(), not to the constructor)
cv = CountVectorizer(ngram_range=(1,2),preprocessor=my_cool_preprocessor,analyzer='word',max_features=10)
count_vector=cv.fit_transform(cat_in_the_hat_docs)

In [None]:
count_vector.shape

(5, 10)

In [None]:
cv.vocabulary_

{'_connector_': 0,
 'cat': 1,
 'cat _connector_': 2,
 'hat': 3,
 'hat learn': 4,
 'learn': 5,
 'learn librari': 6,
 'librari': 7,
 'the': 8,
 'the hat': 9}

## Extracting Counts of Words / N-Grams

In [None]:
def sort_coo(coo_matrix):
    """Return ``(column, value)`` pairs of ``coo_matrix`` in descending order.

    Pairs are built from the matrix's ``col`` and ``data`` attributes and
    ordered by value first, then by column index, both descending.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Return the first ``topn`` n-grams together with their counts.

    Args:
        feature_names: Sequence mapping a column index to its n-gram string.
        sorted_items: Sequence of ``(column_index, count)`` pairs, already in
            the desired order (e.g. the output of ``sort_coo``).
        topn: Maximum number of pairs to return.

    Returns:
        A list of ``(n_gram, count)`` tuples in the same order as
        ``sorted_items``, truncated to at most ``topn`` entries.
    """
    # Slicing keeps only the leading topn pairs and preserves their order.
    return [(feature_names[idx], count) for idx, count in sorted_items[:topn]]

In [None]:
# Unigrams + bigrams over the preprocessed titles, keeping at most 100 features.
# The corpus is given to fit_transform(), not to the constructor.
cv = CountVectorizer(ngram_range=(1,2),preprocessor=my_cool_preprocessor,max_features=100)
count_vector=cv.fit_transform(cat_in_the_hat_docs)

#sort the counts of first book title by descending order of counts
sorted_items=sort_coo(count_vector[0].tocoo())

#Get feature names (words/n-grams). It is sorted by position in sparse matrix
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
feature_names=cv.get_feature_names_out()
n_grams=extract_topn_from_vector(feature_names,sorted_items,10)
n_grams

[('cent', 4),
 ('_connector_', 2),
 ('two cent', 1),
 ('two', 1),
 ('the hat', 1),
 ('the', 1),
 ('one cent', 1),
 ('one', 1),
 ('old cent', 1),
 ('old', 1)]

## Binary Values Instead of Counts

In [None]:
# binary=True records presence/absence (1/0) of each n-gram instead of its count.
# The corpus is given to fit_transform(), not to the constructor.
cv = CountVectorizer(ngram_range=(1,2),binary=True)
count_vector=cv.fit_transform(cat_in_the_hat_docs)
# Show the non-zero entries of the third title's row.
print(count_vector[2])

  (0, 35)	1
  (0, 74)	1
  (0, 4)	1
  (0, 90)	1
  (0, 29)	1
  (0, 31)	1
  (0, 9)	1
  (0, 78)	1
  (0, 27)	1
  (0, 17)	1
  (0, 91)	1
  (0, 86)	1
  (0, 82)	1
  (0, 56)	1
  (0, 34)	1
  (0, 73)	1
  (0, 28)	1
  (0, 30)	1
  (0, 8)	1
  (0, 77)	1
  (0, 26)	1
  (0, 16)	1
  (0, 89)	1
  (0, 85)	1
  (0, 55)	1
  (0, 45)	1
  (0, 33)	1
  (0, 80)	1
  (0, 39)	1
  (0, 19)	1
  (0, 7)	1
  (0, 46)	1
  (0, 44)	1
  (0, 32)	1
  (0, 79)	1
  (0, 38)	1
  (0, 18)	1
  (0, 0)	1
  (0, 6)	1


## Custom Tokenizer

In [None]:
import re

def my_tokenizer(text):
    """Split ``text`` into word tokens and single-character punctuation tokens.

    Every non-word character is padded with spaces so that it becomes a
    token of its own, then the text is split on runs of whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        A list of non-empty tokens; an empty list for empty or
        whitespace-only input.
    """
    # Pad each non-word character so punctuation is separated from words.
    padded = re.sub("(\\W)", " \\1 ", text)
    # re.split() yields '' when the padded text starts or ends with whitespace
    # (e.g. after a trailing ")"); drop those so '' never becomes a vocabulary term.
    return [token for token in re.split("\\s+", padded) if token]
    

# Use the custom tokenizer in place of the default token pattern; the corpus is
# given to fit_transform(), not to the constructor.
cv = CountVectorizer(tokenizer=my_tokenizer)
count_vector=cv.fit_transform(cat_in_the_hat_docs)
print(cv.vocabulary_)

{'one': 34, 'cent': 14, ',': 4, 'two': 47, 'cents': 15, 'old': 32, 'new': 29, ':': 5, 'all': 7, 'about': 6, 'money': 28, '(': 2, 'cat': 13, 'in': 22, 'the': 44, 'hat': 19, "'": 1, 's': 38, 'learning': 25, 'library': 26, 'inside': 24, 'your': 49, 'outside': 36, 'human': 21, 'body': 10, ')': 3, '': 0, 'oh': 31, 'things': 46, 'you': 48, 'can': 12, 'do': 16, 'that': 43, 'are': 8, 'good': 18, 'for': 17, 'staying': 41, 'healthy': 20, 'on': 33, 'beyond': 9, 'bugs': 11, 'insects': 23, 'there': 45, 'no': 30, 'place': 37, 'like': 27, 'space': 40, 'our': 35, 'solar': 39, 'system': 42}
