# 3. Feature Extraction Module

This module implements various feature extraction techniques like Bag of Words, TF-IDF, and word embeddings.

In [10]:
import numpy as np
from gensim.models import Word2Vec, FastText
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base import BaseEstimator
from scipy.sparse import csr_matrix
from typing import List, Tuple, Dict

This function create_bag_of_words takes a list of preprocessed text documents as input and uses the scikit-learn library 
to create a bag of words representation of the corpus. 
The function first creates an instance of the sklearn.feature_extraction.text.CountVectorizer, 
which is an implementation of the bag of words model.

The function then fits the vectorizer to the input corpus and transforms the corpus into a bag of words matrix using 
the vectorizer.fit_transform() method. fit_transform() produces the matrix in the Compressed Sparse Row (CSR) format (csr_matrix) 
for efficient storage and computation; the function converts it to a dense NumPy array with toarray() before returning it.

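Finally, the function returns a tuple containing the list of features *vectorizer.get_feature_names_out().tolist()* that correspond to the columns of the bag of words matrix and the dense bag of words matrix *bag_of_words_matrix.toarray()*.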



In [11]:
def create_bag_of_words(corpus: List[str], ngram_size=1, min_df=1) -> Tuple[List, np.ndarray]:
    """
    Build a dense bag-of-words count matrix for a text corpus.

    :param corpus: List[str], a list of preprocessed text documents
    :param ngram_size: upper bound of the n-gram range (the lower bound is fixed at 1);
                       default = 1 meaning single words only
    :param min_df: ignore words that have a document frequency strictly lower than the given threshold, default = 1
                   if the value is between 0 and 0.999, it ignores the words with relative document frequency
                   strictly lower than the threshold
    :return: Tuple[List, np.ndarray], the list of feature names (one per matrix column)
             and the bag of words matrix converted to a dense array
    """
    # The vectorizer is local to this call; only its vocabulary and the counts are returned.
    count_vectorizer = CountVectorizer(min_df=min_df, ngram_range=(1, ngram_size))

    # Learn the vocabulary and count term occurrences in a single pass over the corpus.
    sparse_counts = count_vectorizer.fit_transform(corpus)

    feature_names = count_vectorizer.get_feature_names_out().tolist()
    return feature_names, sparse_counts.toarray()



Helper functions for summarizing bag_of_words and tfidf matrices.

In [None]:
def word_is_nterm(word: str):
    """Return True when *word* contains a space, i.e. it is a multi-word (n-gram) term."""
    return word.find(' ') != -1

def sum_count_documents_containing_each_word(word_list: List, bow_matrix: np.ndarray) -> Dict[str, int]:
    """
    Count, for every word, the number of documents that contain it.

    :param word_list: List, the words corresponding to the matrix columns, in column order
    :param bow_matrix: matrix whose columns correspond to word_list; a cell greater
                       than 0 counts as "the document (row) contains the word"
    :return: Dict[str, int], each word mapped to the number of rows with a positive
             value in that word's column
    """
    # number of documents containing each word
    document_counts = (bow_matrix > 0).sum(axis=0)
    # A reduction over an np.matrix stays 2-D (shape 1 x n_words). Flatten it so that
    # zip() pairs every word with its own count instead of pairing only the first
    # word with the whole row.
    if isinstance(document_counts, np.matrix):
        document_counts = document_counts.tolist()[0]
    return dict(zip(word_list, document_counts))

def sum_count_each_word_in_all_documents(word_list: List, any_matrix: np.ndarray) -> Dict[str, int]:
    """
    Sum every matrix column, giving the total value of each word over all documents.

    :param word_list: List, the words corresponding to the matrix columns, in column order
    :param any_matrix: matrix (np.ndarray or np.matrix) whose columns correspond to word_list
    :return: Dict[str, int], each word mapped to the sum of its column
    """
    # number of occurrences of each word in all documents; sum of matrix columns
    column_totals = any_matrix.sum(axis=0)
    # np.matrix reductions stay 2-D (1 x n_words); take the single row as a flat list.
    # isinstance (rather than an exact type comparison) also covers np.matrix subclasses.
    if isinstance(column_totals, np.matrix):
        column_totals = column_totals.tolist()[0]
    return dict(zip(word_list, column_totals))

def max_tfidf_each_word_in_all_documents(word_list: List, tfidf_matrix: np.ndarray) -> Dict[str, int]:
    """
    Take the maximum of every matrix column, giving the highest value of each word
    over all documents.

    :param word_list: List, the words corresponding to the matrix columns, in column order
    :param tfidf_matrix: matrix (np.ndarray or np.matrix) whose columns correspond to word_list
    :return: Dict mapping each word to the maximum element of its column
    """
    # max tfidf of each word in all documents; max element found in each matrix column
    column_maxima = tfidf_matrix.max(axis=0)
    # np.matrix reductions stay 2-D (1 x n_words); take the single row as a flat list.
    # isinstance (rather than an exact type comparison) also covers np.matrix subclasses.
    if isinstance(column_maxima, np.matrix):
        column_maxima = column_maxima.tolist()[0]
    return dict(zip(word_list, column_maxima))

def sum_count_all_words_in_each_document(ids_list: List, any_matrix: np.ndarray) -> Dict[str, int]:
    """
    Sum every matrix row, giving the total value of all words in each document.

    :param ids_list: List, the document identifiers corresponding to the matrix rows, in row order
    :param any_matrix: matrix (np.ndarray or np.matrix) whose rows correspond to ids_list
    :return: Dict mapping each document identifier to the sum of its row
    """
    # number of words in each document; sum of matrix rows
    row_totals = any_matrix.sum(axis=1)
    # np.matrix reductions stay 2-D (n_docs x 1); transpose to a single row and flatten.
    # isinstance (rather than an exact type comparison) also covers np.matrix subclasses.
    if isinstance(row_totals, np.matrix):
        row_totals = row_totals.transpose().tolist()[0]
    return dict(zip(ids_list, row_totals))

def max_tfidf_all_words_in_each_document(ids_list: List, tfidf_matrix: np.ndarray) -> Dict[str, int]:
    """
    Take the maximum of every matrix row, giving the highest word value in each document.

    :param ids_list: List, the document identifiers corresponding to the matrix rows, in row order
    :param tfidf_matrix: matrix (np.ndarray or np.matrix) whose rows correspond to ids_list
    :return: Dict mapping each document identifier to the maximum element of its row
    """
    # max tfidf of words in each document; max element found in each matrix row
    row_maxima = tfidf_matrix.max(axis=1)
    # np.matrix reductions stay 2-D (n_docs x 1); transpose to a single row and flatten.
    # isinstance (rather than an exact type comparison) also covers np.matrix subclasses.
    if isinstance(row_maxima, np.matrix):
        row_maxima = row_maxima.transpose().tolist()[0]
    return dict(zip(ids_list, row_maxima))



Helper function for extracting sub-matrix from a given matrix (bag_of_words and tfidf).


In [None]:
def filter_matrix(ids_list: List, word_list: List, any_matrix: np.ndarray, filter_rows: List, filter_columns: List) -> Tuple[List, List, np.ndarray]:
    """
    Extract a sub-matrix together with the matching row ids and column words.

    The order given in filter_rows and filter_columns is preserved in all three results.

    Parameters:
    - ids_list (list): Identifiers of the rows of any_matrix, in row order.
    - word_list (list): Words of the columns of any_matrix, in column order.
    - any_matrix (numpy.ndarray): The original matrix.
    - filter_rows (list): List of row indices to include in the sub-matrix.
    - filter_columns (list): List of column indices to include in the sub-matrix.

    Returns:
    - list: The ids of the selected rows.
    - list: The words of the selected columns.
    - numpy.ndarray: The sub-matrix.
    """
    selected_ids = [ids_list[row] for row in filter_rows]
    selected_words = [word_list[column] for column in filter_columns]
    # np.ix_ builds an open mesh so rows and columns are selected as a cross product.
    index_grid = np.ix_(filter_rows, filter_columns)
    return selected_ids, selected_words, any_matrix[index_grid]

def filter_matrix_columns(word_list: List, any_matrix: np.ndarray, filter_rows: List, filter_columns: List) -> Tuple[List, np.ndarray]:
    """
    Extract a sub-matrix together with the words of the selected columns.

    Despite the name, the rows are filtered as well (by filter_rows); only the
    row identifiers are not tracked. The order given in filter_rows and
    filter_columns is preserved in the results.

    Parameters:
    - word_list (list): Words of the columns of any_matrix, in column order.
    - any_matrix (numpy.ndarray): The original matrix.
    - filter_rows (list): List of row indices to include in the sub-matrix.
    - filter_columns (list): List of column indices to include in the sub-matrix.

    Returns:
    - list: The words of the selected columns.
    - numpy.ndarray: The sub-matrix.
    """
    selected_words = [word_list[column] for column in filter_columns]
    # np.ix_ builds an open mesh so rows and columns are selected as a cross product.
    index_grid = np.ix_(filter_rows, filter_columns)
    return selected_words, any_matrix[index_grid]


This function create_tfidf takes a list of preprocessed text documents as input and uses the scikit-learn library 
to create a Term Frequency-Inverse Document Frequency (TF-IDF) representation of the corpus. 
The function first creates an instance of the sklearn.feature_extraction.text.TfidfVectorizer, 
which is an implementation of the TF-IDF model.

The function then fits the vectorizer to the input corpus and transforms the corpus into a TF-IDF matrix using 
the vectorizer.fit_transform() method. fit_transform() produces the matrix in the Compressed Sparse Row (CSR) format (csr_matrix) 
for efficient storage and computation; the function converts it to a dense matrix with todense() before returning it.

Finally, the function returns a tuple containing the list of features *TfidfVectorizer.get_feature_names_out().tolist()* that correspond to the columns of the TF-IDF matrix and the TF-IDF matrix *tfidf_matrix.todense()*.



In [12]:
def create_tfidf(corpus: List[str], ngram_size=1, min_df=1) -> Tuple[List, np.ndarray]:
    """
    Build a dense TF-IDF matrix for a text corpus.

    :param corpus: List[str], a list of preprocessed text documents
    :param ngram_size: upper bound of the n-gram range (the lower bound is fixed at 1);
                       default = 1 meaning single words only
    :param min_df: ignore words that have a document frequency strictly lower than the given threshold, default = 1
                   if the value is between 0 and 0.999, it ignores the words with relative document frequency
                   strictly lower than the threshold
    :return: Tuple[List, Matrix], the list of feature names (one per matrix column) and
             the tf-idf matrix; the matrix is produced by todense(), i.e. it is an
             np.matrix rather than a plain np.ndarray
    """
    # The vectorizer is local to this call; only its vocabulary and the weights are returned.
    tfidf_vectorizer = TfidfVectorizer(min_df=min_df, ngram_range=(1, ngram_size))

    # Learn the vocabulary and compute the TF-IDF weights in a single pass over the corpus.
    sparse_weights = tfidf_vectorizer.fit_transform(corpus)

    feature_names = tfidf_vectorizer.get_feature_names_out().tolist()
    return feature_names, sparse_weights.todense()



In [None]:
def cluster_matrix(ids_list: List, word_list: List, any_matrix: np.ndarray, cluster_rows: List) -> np.ndarray:
    """
    Select the square sub-matrix of any_matrix indexed by cluster_rows on both axes.

    NOTE(review): the original description of this function says it should build an
    aggregate matrix with one row per cluster, where cluster_rows holds the cluster
    index (0, 1, 2, ...) of each document. The code does not aggregate anything: it
    uses the values in cluster_rows as row AND column indices. This looks
    unfinished - confirm the intended aggregation before relying on the result.

    Parameters:
    - ids_list (list): Indexed by every value in cluster_rows; the looked-up ids are not returned.
    - word_list (list): Not used.
    - any_matrix (numpy.ndarray): The original matrix.
    - cluster_rows (list): Integer indices used for both the row and the column selection.

    Returns:
    - numpy.ndarray: Sub-matrix of shape (len(cluster_rows), len(cluster_rows)).
    """
    # The looked-up ids are discarded; the lookup is kept so that a value in
    # cluster_rows that is out of range for ids_list still raises as before.
    _looked_up_ids = [ids_list[position] for position in cluster_rows]
    index_grid = np.ix_(cluster_rows, cluster_rows)
    return any_matrix[index_grid]

This function create_word_embeddings takes a list of tokenized text documents and an optional embedding method 
(either 'word2vec' or 'fasttext', defaulting to 'word2vec') as input, and uses the gensim library to create 
word embeddings for the documents. The function first trains a word embedding model using the specified method, 
creating either a Word2Vec or FastText model with a vector size of 100, a window size of 5, a minimum word count of 1, 
and 4 worker threads for parallelization.

The function then initializes an empty matrix with the shape (len(tokens_list), model.vector_size) to store the document embeddings. 
For each document in the input tokens_list, the function calculates the document embedding by averaging the word embeddings 
of each token in the document. This is done by first retrieving the word embeddings for each token using the trained 
model's model.wv[token] attribute, then calculating the mean of these embeddings along axis 0.

Finally, the function returns a tuple containing the word embedding model (either a Word2Vec or FastText instance) 
and the document embeddings matrix as a NumPy array.



In [13]:
def create_word_embeddings(tokens_list: List[List[str]], embedding_method: str = 'word2vec') -> Tuple[BaseEstimator, np.ndarray]:
    """
    Create word embeddings for a list of tokenized documents using the specified embedding method.

    Each document embedding is the mean of the embeddings of its tokens. A document
    with no tokens keeps an all-zero embedding instead of becoming a row of NaN.

    :param tokens_list: List[List[str]], a list of tokenized text documents
    :param embedding_method: str, the embedding method to use, either 'word2vec' or 'fasttext' (default: 'word2vec')
    :return: Tuple[model, np.ndarray], the trained gensim model (Word2Vec or FastText) and the
             document embeddings matrix of shape (len(tokens_list), model.vector_size)
    :raises ValueError: if embedding_method is neither 'word2vec' nor 'fasttext'
    """
    # Train the word embedding model based on the specified method
    if embedding_method == 'word2vec':
        model = Word2Vec(tokens_list, vector_size=100, window=5, min_count=1, workers=4)
    elif embedding_method == 'fasttext':
        model = FastText(tokens_list, vector_size=100, window=5, min_count=1, workers=4)
    else:
        raise ValueError("Invalid embedding method. Please use either 'word2vec' or 'fasttext'.")

    # Start from zeros so that documents without tokens have a well-defined embedding
    document_embeddings = np.zeros((len(tokens_list), model.vector_size))

    # Calculate the document embeddings by averaging the word embeddings of each token
    for i, tokens in enumerate(tokens_list):
        if len(tokens) == 0:
            # The mean over an empty array is NaN (with a RuntimeWarning), which would
            # fill the whole row with NaN; keep the zero row instead.
            continue
        token_embeddings = np.array([model.wv[token] for token in tokens])
        document_embeddings[i] = token_embeddings.mean(axis=0)

    return model, document_embeddings


In [16]:
from collections import defaultdict

This function create_bag_of_words_deprecated takes a list of preprocessed text documents as input and creates a bag of words representation of the corpus without using the scikit-learn or scipy packages. The function first defines a helper function tokenize_and_count that takes a text document as input, tokenizes it by splitting on whitespace, and counts the occurrences of each token using a defaultdict.

The function then initializes an empty dictionary word_to_index to map words to their index, and an empty list bag_of_words_list to store the bag of words representation of each document. For each document in the input corpus, the function tokenizes and counts the occurrences of words using the tokenize_and_count helper function, and appends the resulting dictionary to the bag_of_words_list.

For each unique word in the word count dictionary, the function checks if it is already in the word_to_index dictionary. If not, the word is added to the dictionary with a new index, which is the current length of the dictionary.

Finally, the function returns a tuple containing the word_to_index dictionary and the bag_of_words_list.

In [17]:
def create_bag_of_words_deprecated(corpus: List[str]) -> Tuple[Dict[str, int], List[Dict[str, int]]]:
    """
    Build a bag of words representation of a text corpus with plain Python only
    (no scikit-learn or scipy).

    :param corpus: List[str], a list of preprocessed text documents
    :return: Tuple[Dict[str, int], List[Dict[str, int]]], a dictionary mapping each word to
             an index assigned in order of first appearance, and a list with one
             word -> occurrence count dictionary per document
    """
    vocabulary_index = {}
    per_document_counts = []

    for text in corpus:
        # Tokenize on whitespace and tally the occurrences of every token.
        counts = defaultdict(int)
        for term in text.split():
            counts[term] += 1
        per_document_counts.append(counts)

        # Unseen words get the next free index, i.e. the current vocabulary size.
        for term in counts:
            vocabulary_index.setdefault(term, len(vocabulary_index))

    return vocabulary_index, per_document_counts
