In [None]:
# default_exp feature_extraction.text

# feature_extraction.text

> Basic feature extraction techniques for text.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
from collections import Counter, defaultdict
from scipy.sparse import coo_matrix
import numpy as np

In [None]:
# Toy corpus: every sample is an already-tokenised document (a list of terms).
x_train = [
    ["a", "b", "a"],
    ["a", "b"],
    ["c", "b"],
    ["d", "b"],
]
x_test = [
    ["a", "b"],
    ["a"],
    ["c", "b", "b"],
    ["c"],
]
# One class label per training sample, aligned with x_train by position.
y_train = [1, 1, 2, 3]

In [None]:
#export
class CountVectorizer:
    """Bag-of-words vectorizer for pre-tokenised documents.

    `fit` learns the vocabulary (and label statistics) from labelled training
    samples; `transform` turns samples into a sparse term-count matrix whose
    columns follow the order of `self.vocab`.

    Args:
        store_class_vocab (bool): when true, the word counts of every class
            are kept in `self.store_class_vocab` as a dict mapping
            label -> word counts. When false the attribute is `None`.
    """

    def __init__(self, store_class_vocab = False):
        # Always define the attribute so later code can test it explicitly
        # instead of depending on an AttributeError being swallowed.
        self.store_class_vocab = {} if store_class_vocab else None

    def _calculate_stats(self, y_train):
        """Calculates basic stats: labels, label frequencies and the label distribution.

        Sets `self.labels` (unique labels), `self.labels_freq` (count of each
        label) and `self.distribution` (relative frequency of each label).
        """
        self.labels, self.labels_freq = np.unique(y_train, return_counts=True)
        total_freq = np.sum(self.labels_freq)
        self.distribution = self.labels_freq / total_freq

    def _get_vocab(self, x_train, y_train):
        """Builds the vocabulary and stores the frequency of every word type.

        Sets `self.vocab` (tuple of word types) and `self.vocab_freq` (tuple of
        their total counts, aligned with `self.vocab`). Requires `self.labels`
        to have been set by `_calculate_stats`.
        """
        vocab = Counter()
        for label in self.labels:
            vocab += self._word_to_count_map(x_train, y_train, label)

        if vocab:
            self.vocab, self.vocab_freq = zip(*vocab.items())
        else:
            # zip(*...) over an empty mapping yields nothing to unpack.
            self.vocab, self.vocab_freq = (), ()

    def _word_to_count_map(self, x_train, y_train, label):
        """Counts the word types that occur in the samples of one class.

        Args:
            x_train (nested list): list of list containing samples.
            y_train (list): labels for training samples.
            label: the class whose samples are counted.

        Returns:
            Counter mapping each word type of that class to its frequency.
        """
        word_to_count = defaultdict(int)
        for index, sample in enumerate(x_train):
            if y_train[index] == label:
                for term in sample:
                    word_to_count[term] += 1

        # Optionally keep the per-class counts. An explicit check replaces the
        # former try/finally-return, which silently discarded *every* exception.
        class_vocab = getattr(self, "store_class_vocab", None)
        if class_vocab is not None:
            class_vocab[label] = word_to_count

        return Counter(word_to_count)

    def fit(self, x_train, y_train):
        """Calculates the statistics necessary to build the Bag of Words model.

        Args:
            x_train (nested list): list of list containing samples.
            y_train (list): labels for training samples.
        """
        # Start from a fresh dict so classes from a previous fit do not linger.
        if getattr(self, "store_class_vocab", None) is not None:
            self.store_class_vocab = {}

        self._calculate_stats(y_train)
        self._get_vocab(x_train, y_train)

    def transform(self, X):
        """Make Bag of Words vectors.

        Terms that are not part of the fitted vocabulary are ignored.

        Args:
            X (nested list): list of list containing samples.

        Returns:
            sparse coordinate matrix of shape (len(X), len(vocab)) holding the
            count of every vocabulary term in every sample.
        """
        # Hash lookup instead of tuple.index: constant time per term and no
        # ValueError for out-of-vocabulary terms.
        column_of = {term: column for column, term in enumerate(self.vocab)}

        rows = []
        columns = []
        data = []
        n_samples = 0

        for sample_index, sample in enumerate(X):
            n_samples = sample_index + 1
            for term, term_freq in Counter(sample).items():
                vocab_index = column_of.get(term)
                if vocab_index is None:
                    continue  # unseen during fit
                columns.append(vocab_index)
                rows.append(sample_index)
                data.append(term_freq)

        # Pass the shape explicitly: otherwise scipy infers it from the largest
        # index present, dropping trailing all-zero rows/columns and failing on
        # empty input.
        return coo_matrix(
            (data, (rows, columns)),
            shape=(n_samples, len(self.vocab)),
            dtype=int,
        )


In [None]:
# Keep the per-class word counts so they can be inspected after fitting.
cv = CountVectorizer(store_class_vocab=True)

In [None]:
# cv._calculate_stats(y_train)

# cv.labels, cv.labels_freq, cv.distribution

# cv._word_to_count_map(x_train,y_train,  3)

# cv._get_vocab( x_train, y_train)

# cv.vocab, cv.vocab_freq

# cv.store_compressed_bow

# cv.vocab ,cv.vocab_freq

In [None]:
# Learn the label statistics and the vocabulary from the training samples.
cv.fit(x_train, y_train)

In [None]:
# Vectorise both splits with the fitted vocabulary, then convert COO -> CSR.
train_counts = cv.transform(x_train)
test_counts = cv.transform(x_test)
a = train_counts.tocsr()
b = test_counts.tocsr()

In [None]:
# Densify both matrices for inspection: one row per sample, columns indexed by cv.vocab position.
a.toarray(), b.toarray()

(array([[2, 1, 0, 0],
        [1, 1, 0, 0],
        [0, 1, 1, 0],
        [0, 1, 0, 1]], dtype=int64),
 array([[1, 1, 0],
        [1, 0, 0],
        [0, 2, 1],
        [0, 0, 1]], dtype=int64))

In [None]:
# NOTE(review): this multiplies two document-term matrices directly, so it only runs
# when b's row count equals a's column count. Presumably a.dot(b.T) (dot products
# between train and test samples) was intended — confirm.
a.dot(b).toarray()

array([[3, 2, 0],
       [2, 1, 0],
       [1, 2, 1],
       [1, 0, 1]], dtype=int64)

In [None]:
# Per-class word counts recorded during fit (available because store_class_vocab=True).
cv.store_class_vocab

{1: defaultdict(int, {'a': 3, 'b': 2}),
 2: defaultdict(int, {'c': 1, 'b': 1}),
 3: defaultdict(int, {'d': 1, 'b': 1})}

In [None]:
# Import only the exporter rather than star-importing the whole module,
# so the notebook namespace is not polluted by unrelated names.
from nbdev.export import notebook2script
notebook2script()

Converted 00_core.ipynb.
Converted 01_count_vectorizer.ipynb.
Converted index.ipynb.
Converted main.ipynb.
