In [1]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Toy corpus of six short reviews (three positive, three negative) used by the
# vectorizer examples in this notebook.
rev_corpus = [
    "I loved the “Squid Game” good series.",
    "good show, loved it!",
    "worst series ever! Pathetic!",
    "don't waste your time, worst storyline, and direction.",
    "love it! love it! good direction",
    "waste and pathetic",
]

#### CountVectorizer with the default "Bag of Words" model (unigrams, n = 1)

In [3]:
# Word-level (unigram) bag-of-words vectorizer; stop_words="english" drops
# scikit-learn's built-in English stop words before counting.
c_vectorizer = CountVectorizer(stop_words="english")

In [4]:
# Learn the vocabulary from the corpus and return the document-term matrix
# (sparse: one row per review, one column per vocabulary term).
dt_matrix = c_vectorizer.fit_transform(rev_corpus)
dt_matrix

<6x13 sparse matrix of type '<class 'numpy.int64'>'
	with 21 stored elements in Compressed Sparse Row format>

In [5]:
# Learned vocabulary: maps each term to its column index in dt_matrix.
# Displayed in insertion order (order of first appearance), not sorted by index.
dd = c_vectorizer.vocabulary_
dd

{'loved': 5,
 'squid': 8,
 'game': 2,
 'good': 3,
 'series': 7,
 'worst': 12,
 'pathetic': 6,
 'don': 1,
 'waste': 11,
 'time': 10,
 'storyline': 9,
 'direction': 0,
 'love': 4}

In [6]:
# Vocabulary terms ordered by column index.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement (already used by the later cells).
# `.tolist()` turns the returned NumPy array into a plain list of str, which is
# what the old method returned.
c_vectorizer.get_feature_names_out().tolist()



['direction',
 'don',
 'game',
 'good',
 'love',
 'loved',
 'pathetic',
 'series',
 'squid',
 'storyline',
 'time',
 'waste',
 'worst']

In [7]:
# Densify the sparse matrix to inspect raw counts: rows are the reviews in
# rev_corpus order, columns follow the vocabulary's column indices.
dt_matrix.toarray()

array([[0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1],
       [1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
       [1, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0]], dtype=int64)

In [8]:
# Bigram-only bag of words: ngram_range=(2, 2) keeps two-word sequences
# (formed after English stop words are removed) and no single words.
c_vectorizer2 = CountVectorizer(
    ngram_range=(2, 2),
    stop_words="english",
)
dt_matrix2 = c_vectorizer2.fit_transform(rev_corpus)

In [9]:
# Bigram vocabulary: maps each two-word term to its column index in dt_matrix2.
dd2 = c_vectorizer2.vocabulary_
dd2

{'loved squid': 7,
 'squid game': 9,
 'game good': 1,
 'good series': 4,
 'good loved': 3,
 'worst series': 14,
 'series pathetic': 8,
 'don waste': 0,
 'waste time': 13,
 'time worst': 11,
 'worst storyline': 15,
 'storyline direction': 10,
 'love love': 6,
 'love good': 5,
 'good direction': 2,
 'waste pathetic': 12}

In [10]:
# Bigram terms as a NumPy array, ordered by column index.
c_vectorizer2.get_feature_names_out()

array(['don waste', 'game good', 'good direction', 'good loved',
       'good series', 'love good', 'love love', 'loved squid',
       'series pathetic', 'squid game', 'storyline direction',
       'time worst', 'waste pathetic', 'waste time', 'worst series',
       'worst storyline'], dtype=object)

In [11]:
# Look up a single word in the bigram vocabulary. Only two-word terms were
# learned, so "squid" is absent and .get() returns None (the cell shows no output).
c_vectorizer2.vocabulary_.get("squid")

In [12]:
# Character-level bigrams: analyzer="char_wb" builds n-grams from characters
# inside word boundaries, padding each word with a space (hence terms such as
# ' b' and 'l ' in the vocabulary).
# `stop_words` is intentionally not passed: it only applies when
# analyzer == "word", so it had no effect on the vocabulary here and only made
# scikit-learn emit a "will not be used" UserWarning.
# NOTE(review): this rebinds `dt_matrix2`, replacing the word-bigram matrix
# built earlier; the name is kept so any later cell reading it is unaffected.
ngram_vectorizer = CountVectorizer(analyzer="char_wb", ngram_range=(2, 2))
dt_matrix2 = ngram_vectorizer.fit_transform(["Beauuutiful", "Beautiful"])

In [13]:
# Character-bigram vocabulary: maps each two-character term to its column index.
dd3 = ngram_vectorizer.vocabulary_
dd3

{' b': 0,
 'be': 2,
 'ea': 3,
 'au': 1,
 'uu': 10,
 'ut': 9,
 'ti': 7,
 'if': 5,
 'fu': 4,
 'ul': 8,
 'l ': 6}

In [14]:
# Character-bigram terms as a NumPy array, ordered by column index.
ngram_vectorizer.get_feature_names_out()

array([' b', 'au', 'be', 'ea', 'fu', 'if', 'l ', 'ti', 'ul', 'ut', 'uu'],
      dtype=object)