In [12]:
"""
    Top2Vec module.

    Source code adapted from https://github.com/ddangelov/Top2Vec and https://github.com/MaartenGr/BERTopic
"""
import re

import umap
import hdbscan

import logging

import numpy as np
import pandas as pd

from typing import Union, List, Tuple

from sklearn.cluster import dbscan
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sentence_transformers import SentenceTransformer

In [13]:
# Declare constants
NAME = "top2vec"

# Set seed for reproducibility purposes
SEED = 0

# Initialize Stemmer
STEMMER = PorterStemmer()

# Get stopwords and remove punctutaions from them
STOP_WORDS = [re.sub(r"[^a-z]", "", stopword) for stopword in stopwords.words("english")]

In [14]:
# Setup logger
logger = logging.getLogger(NAME)
logger.setLevel(logging.DEBUG)

sh = logging.StreamHandler()
sh.setFormatter(logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s"))
logger.addHandler(sh)

In [15]:
def process_sentence(sentence):
    """
    """
    
    # Convert to lowercase
    sentence = sentence.lower()
    
    # Remove non-alphabetic characters
    sentence = re.sub(r"[^a-z ]", "", sentence)
    
    # Remove stopwords
    tokens = [token for token in sentence.split() if token not in STOP_WORDS]
    
    # Perform stemming
    tokens = [STEMMER.stem(word) for word in tokens]
    
    # Construct bigrams
    bigrams = ["_".join(tokens[i:i+2]) for i in range(len(tokens)-1)]
    
    # Return tokens
    return tokens + bigrams

In [91]:
class Top2Vec:
    """
    """
    def __init__(self,
                 embedding_model: str = "all-MiniLM-L6-v2",
                 umap_model: umap.UMAP = None,
                 hdbscan_model: hdbscan.HDBSCAN = None,
                 vectorizer_model: TfidfVectorizer = None,
                 seed: int = SEED,
                 logger: logging.Logger = logger,
                ):
        
        # Validate logger
        if not isinstance(logger, logging.Logger):
            raise TypeError("logger needs to be an instance of a logging.Logger object.")
        
        # Load embedding model
        logger.info(f"Loading {embedding_model} model.") 
        
        try:
            self.embedding_model = SentenceTransformer(embedding_model)
        
        except:
            raise ValueError("Please select a valid SentenceTransformers model.")
            
        logger.info(f"Loaded {embedding_model} model successfully.")
            
        self.seed = seed
        self.results = None
        
        # UMAP
        self.umap_model = umap_model or umap.UMAP(n_neighbors = 15,
                                                  n_components = 5,
                                                  metric = "cosine",
                                                  random_state = self.seed)
        
        # Set seed for HDBSCAN
        np.random.seed(self.seed)
        
        # HDBSCAN
        self.hdbscan_model = hdbscan_model or hdbscan.HDBSCAN(min_cluster_size = 15, # To experiment with other values
                                                              metric = "euclidean",
                                                              cluster_selection_method = "eom")
        
        # Vectorizer
        self.vectorizer_model = vectorizer_model or TfidfVectorizer(analyzer = process_sentence)
        self.vectorizer_model.build_analyzer()
        
        
    def fit(self, documents: Union[List[str], pd.Series]):
        """
        """
        # Validate documents
        if not (isinstance(documents, list) or isinstance(documents, pd.Series)):
            raise TypeError("documents need to a list or pandas series of strings.")
            
        if not all(isinstance(document, str) for document in documents):
            raise TypeError("documents need to a list or pandas series of strings.")
        
        columns = ["document"]
        if isinstance(documents, list):
            self.results = pd.DataFrame(documents, columns = columns)
        
        if isinstance(documents, pd.Series):
            self.results = documents.to_frame(name = columns[0])
        
        # Obtain document embeddings
        logger.info("Obtaining document embeddings.")
        self.document_embeddings = self.embedding_model.encode(documents,
                                                               convert_to_numpy = True,
                                                               normalize_embeddings = True)
        
        # Obtain umap embeddings
        logger.info("Creating lower dimension document embeddings.")
        umap_embeddings = self.umap_model.fit(self.document_embeddings).embedding_
        
        # Obtain hdbscan clusters
        logger.info("Finding dense areas of documents.")
        clusters = self.hdbscan_model.fit(umap_embeddings)
        
        # Create topic vectors
        logger.info("Finding topics.")
        self.create_topic_vectors(clusters.labels_)
        
        # Deduplicate topics
        self.deduplicate_topics()
        
        # Assign topic to documents
        self.doc_top, self.doc_dist = self.calculate_documents_topic()
        
        # Calculate topic_sizes
        self.topic_sizes = self.calculate_topic_sizes()
        
        # Re-order topics
        self.reorder_topics()
        
        # Append clustering results to dataframe
        self.results["topic"], self.results["score"] = self.doc_top, self.doc_dist
        
        # Sort results by topic and score
        self.results.sort_values(
            by = ["topic", "score"], ascending = [True, False], inplace = True)
        
        self.results.reset_index(drop = True, inplace = True)
        
    
    def create_topic_vectors(self, cluster_labels: np.ndarray) -> None:
        """
            Method to calculate the topic vectors based on the arithmetic mean of all the 
            document embeddings in the same dense cluster.

            Args
            ----------
            cluster_labels: np.ndarray
                    cluster assigned to each document based on HDBSCAN algorithm.

            Returns
            ----------
            None
        """
        unique_labels = set(cluster_labels)
        if -1 in unique_labels:
              unique_labels.remove(-1)

        self.topic_vectors = self.l2_normalize(
            np.vstack([self.document_embeddings[np.where(cluster_labels == label)[0]]
                       .mean(axis = 0) for label in unique_labels]))
            
            
    def deduplicate_topics(self) -> None:
        """
            Method to merge duplicate topics.

            Returns
            ----------
            None
        """
        _, labels = dbscan(X = self.topic_vectors,
                           eps = 0.1,
                           min_samples = 2,
                           metric = "cosine")

        duplicate_clusters = set(labels)

        if len(duplicate_clusters) > 1 or -1 not in duplicate_clusters:
            
            # Unique topics
            unique_topics = self.topic_vectors[np.where(labels == -1)[0]]

            if -1 in duplicate_clusters:
                duplicate_clusters.remove(-1)
                
            # Merge duplicate topics
            for unique_label in duplicate_clusters:
                unique_topics = np.vstack(
                    [unique_topics, self.l2_normalize(self.topic_vectors[np.where(labels == unique_label)[0]]
                                                      .mean(axis = 0))])
            self.topic_vectors = unique_topics
            
            
    def calculate_documents_topic(self, batch_size: int = 64) -> Tuple[np.ndarray, np.ndarray]:
        """
            Method to compute the topic and score of each document.

            Args
            ----------
            batch_size: int (Optional, default 64)
                    number of documents passed to the model per iteration.

            Returns
            ----------
            (document_topics, document_scores): tuple of a pair of np.ndarray
                    the topic assigned to and score of each document. 
        """
        doc_top, doc_dist = [], []
        for start_index in range(0, len(self.document_embeddings), batch_size):
            res = np.inner(self.document_embeddings[start_index: start_index + batch_size], 
                           self.topic_vectors)
            doc_top.extend(np.argmax(res, axis = 1))
            doc_dist.extend(np.max(res, axis = 1))
    
        return np.array(doc_top), np.array(doc_dist)
    
    
    def calculate_topic_sizes(self) -> pd.Series:
        """
            Method to calculate the topic sizes.

            Returns
            ----------
            topic_sizes: pd.Series
                    number of documents belonging to each topic.
        """
        return pd.Series(self.doc_top).value_counts() \
                                      .to_frame(name = "count") \
                                      .reset_index() \
                                      .rename(columns = {"index": "topic"})


    def reorder_topics(self) -> None:
        """
            Method to sort the topics in descending order based on topic size.

            Returns
            ----------
            None
        """
        self.topic_vectors = self.topic_vectors[self.topic_sizes.index]
        old2new = dict(zip(self.topic_sizes.index, range(self.topic_sizes.index.shape[0])))
        self.doc_top = np.array([old2new[i] for i in self.doc_top])
        self.topic_sizes.reset_index(drop=True, inplace=True)
        
        
    def get_results(self) -> pd.DataFrame:
        """
        """
        return self.results
        
        
    def get_summary(self, top_n_documents: int = 10) -> pd.DataFrame:
        """
        """
        return self.results.groupby("topic").head(top_n_documents).reset_index(drop=True)
    
    
    def get_top_n_terms(self, top_n_terms: int = 15):
        """
        """
        # Compute document term matrix
        document_term_matrix = self.vectorizer_model.fit_transform(
            self.results.groupby("topic", as_index = False).agg({"document": " ".join}).document
        ).toarray()

        # Get vocabulary
        vocab = self.vectorizer_model.get_feature_names()

        # Generate the top n words per topic
        return pd.DataFrame(
            [(doc, vocab[word], document_term_matrix[doc][word]) 
            for doc in docs_per_topic["topic"] 
            for word in document_term_matrix.argsort(axis=1)[:, -top_n_terms:][doc][::-1]],
            columns = ["topic", "term", "score"])
            
        
    @staticmethod
    def l2_normalize(vectors: np.ndarray) -> np.ndarray:
        """
            Method to scale input vectors individually to unit l2 norm (vector length).

            Args
            ----------
            vectors: np.ndarray
                    the data to normalize.

            Returns
            ----------
            normalized vectors: np.ndarray
                    normalized input vectors.
        """
        if vectors.ndim == 2:
            return normalize(vectors)
        return normalize(vectors.reshape(1, -1))[0]

In [17]:
news_data = pd.read_csv("news_data.csv")
news_data.head()

Unnamed: 0,altid,title,content
0,sa1a70ab8ef5,Davenport hits out at Wimbledon,World number one Lindsay Davenport has critic...
1,ta497aea0e36,Camera phones are 'must-haves',Four times more mobiles with cameras in them ...
2,ta0f0fa26a93,US top of supercomputing charts,The US has pushed Japan off the top of the su...
3,ba23aaa4f4bb,Trial begins of Spain's top banker,"The trial of Emilio Botin, the chairman of Sp..."
4,baa126aeb946,Safety alert as GM recalls cars,The world's biggest carmaker General Motors (...


In [19]:
# Split content by sentences
from nltk.tokenize import sent_tokenize
news_data["sentences"] = news_data.content.apply(sent_tokenize)

In [20]:
# Store each sentence in its own row
news_data_sentence = news_data[["title", "sentences"]].explode(column = "sentences", ignore_index = True)

In [82]:
docs_per_topic = topic_model.results.groupby("topic", as_index = False).agg({"document": " ".join})
topic_col = "topic"

In [92]:
topic_model = Top2Vec()

2021-12-04 23:49:35,640 - top2vec - INFO - Loading all-MiniLM-L6-v2 model.
2021-12-04 23:49:52,234 - top2vec - INFO - Loaded all-MiniLM-L6-v2 model successfully.


In [93]:
topic_model.fit(news_data_sentence.sentences)

2021-12-04 23:49:52,252 - top2vec - INFO - Obtaining document embeddings.
2021-12-04 23:50:00,718 - top2vec - INFO - Creating lower dimension document embeddings.
2021-12-04 23:50:03,923 - top2vec - INFO - Finding dense areas of documents.
2021-12-04 23:50:03,945 - top2vec - INFO - Finding topics.


In [94]:
topic_model.get_top_n_terms()



Unnamed: 0,topic,term,score
0,0,lift,0.204609
1,0,speed,0.186008
2,0,xbox,0.167408
3,0,tower,0.150987
4,0,allard,0.129418
...,...,...,...
190,12,search_email,0.077325
191,12,googl,0.077325
192,12,user_search,0.077325
193,12,web_search,0.077325


In [95]:
topic_model.get_results()

Unnamed: 0,document,topic,score
0,Xbox frontman J. Allard said the console looke...,0,0.686321
1,Some details of the Xbox's performance and wha...,0,0.612559
2,The Blue Gene/L machine that will be completed...,0,0.595026
3,"The previous top machine, Japan's NEC Earth Si...",0,0.593119
4,"Since the first supercomputer, the Cray-1, was...",0,0.591385
...,...,...,...
1134,It is designed to work alongside Microsoft's O...,12,0.421039
1135,These have included giving people easier ways ...,12,0.396545
1136,The select committee found that those responsi...,12,0.325824
1137,"Around 10,000 people visited the campaign's we...",12,0.216941


In [96]:
topic_model.get_summary()

Unnamed: 0,document,topic,score
0,Xbox frontman J. Allard said the console looke...,0,0.686321
1,Some details of the Xbox's performance and wha...,0,0.612559
2,The Blue Gene/L machine that will be completed...,0,0.595026
3,"The previous top machine, Japan's NEC Earth Si...",0,0.593119
4,"Since the first supercomputer, the Cray-1, was...",0,0.591385
...,...,...,...
125,The firm is following in the footsteps of Micr...,12,0.699218
126,The desktop search technology has been license...,12,0.690468
127,"""Desktop search is just one of many features p...",12,0.682206
128,"""We are all getting more and more files on our...",12,0.638044
