In [2]:
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from bertopic import BERTopic
import pandas as pd
import numpy as np
import os
import string

In [None]:
# Mount Google Drive (Colab only) so that files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

In [4]:
# Read the journal dataset and discard every row without an abstract
dirty_df = (
    pd.read_csv('/content/drive/MyDrive/BestJournals.csv', index_col=0)
    .dropna(subset=["Abstract"])
)

In [5]:
# Remove duplicate entries as well as Journals that have the title pasted in the abstract

def cleaning_abs(dirty_df):
    """
    Drop rows that share their Title, Abstract or Name with the next row.

    Only directly adjacent rows are compared: a row is kept when its
    "Title", "Abstract" AND "Name" all differ from those of the row below it.
    The last row has no successor and is therefore always kept (the previous
    loop over ``range(len(dirty_df) - 1)`` silently dropped it).

    Parameters
    ----------
    dirty_df : pandas.DataFrame
        Frame with at least the columns "Title", "Abstract" and "Name".
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        The kept rows, in their original order, with a fresh 0..n-1 index.
    """
    if len(dirty_df) == 0:
        return dirty_df.reset_index(drop=True)

    # Work on a positional index so duplicated index labels cannot interfere.
    compared = dirty_df[["Title", "Abstract", "Name"]].reset_index(drop=True)
    successor = compared.shift(-1)

    # Missing values compare as "different", exactly like the scalar `!=` did.
    keep = (compared != successor).all(axis=1)
    keep.iloc[-1] = True

    return dirty_df[keep.to_numpy()].reset_index(drop=True)

def remove_title_from_abs(df):
    """
    Strip special characters from the abstracts and drop title-only abstracts.

    Every abstract has the characters in ``exclist`` removed and surrounding
    whitespace stripped. Rows whose cleaned abstract starts with the same 30
    characters as the title (i.e. the title was pasted into the abstract) are
    then removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns "Title" and "Abstract". It is not modified;
        the cleaning is done on a copy.

    Returns
    -------
    pandas.DataFrame
        Cleaned rows with the same columns as ``df`` and a fresh 0..n-1 index.
    """
    if len(df) == 0:
        return df.reset_index(drop=True)

    # Raw string: the earlier non-raw literal relied on the invalid escape "\]".
    exclist = r'#$%*+/<=>@[\]^_`{|}~'
    table = str.maketrans('', '', exclist)

    # reset_index returns a new frame, so the caller's data stays untouched and
    # all rows (including the last one) are processed positionally.
    cleaned = df.reset_index(drop=True)
    cleaned["Abstract"] = cleaned["Abstract"].str.translate(table).str.strip()

    title_start = cleaned["Title"].str[:30]
    abstract_start = cleaned["Abstract"].str[:30]
    differs = title_start != abstract_start

    return cleaned[differs.to_numpy()].reset_index(drop=True)

# Run both cleaning passes, keep the abstracts as model input and persist the cleaned frame
df = remove_title_from_abs(cleaning_abs(dirty_df))
docs = df["Abstract"].tolist()

df.to_csv('processed_df.csv')

In [6]:
# Number of abstracts per (Year, Journal) pair
journal_counts = (
    df.groupby(['Year', 'Journal'])
    .size()
    .reset_index(name='count')
)

# Years as rows, journals as columns; combinations without abstracts become 0
journal_pivot = (
    journal_counts
    .pivot(index='Year', columns='Journal', values='count')
    .fillna(0)
    .astype(int)
)

# Row totals first, then a final 'Total' row that also sums the 'Total' column
journal_pivot['Total'] = journal_pivot.sum(axis=1)
journal_pivot.loc['Total'] = journal_pivot.sum()
journal_pivot.to_excel('journal_pivot.xlsx')

In [7]:
# Defines a custom vectorizer class to remove n-gram stop words

class CustomVectorizer(CountVectorizer):
    """
    CountVectorizer that removes multi-word "stop grams" before tokenizing.

    Parameters
    ----------
    stop_grams : list of str, optional
        Phrases that are deleted from every document before it is tokenized.
    **opts
        Forwarded unchanged to ``CountVectorizer.__init__``.
    """

    # Class-level fallback; every instance receives its own value in __init__.
    stop_grams = []

    def __init__(self, stop_grams = None, **opts):
        super().__init__(**opts)
        # A fresh list per instance instead of a shared mutable default.
        self.stop_grams = stop_grams if stop_grams is not None else []

    def remove_ngrams(self, doc, stop_grams = None):
        """
        Return ``doc`` with every stop gram removed (exact substring match).

        ``stop_grams`` defaults to ``self.stop_grams``; the analyzer passes a
        lower-cased copy so that the phrases match the lower-cased document.
        """
        grams = self.stop_grams if stop_grams is None else stop_grams
        for stop_gram in grams:
            doc = doc.replace(stop_gram, "")
        return doc

    # overwrite the build_analyzer method, allowing one to
    # create a custom analyzer for the vectorizer
    def build_analyzer(self):
        """
        Build the callable that turns one raw document into its n-gram tokens.

        Steps: lower-case and preprocess, remove stop grams, tokenize, then let
        CountVectorizer's ``_word_ngrams`` drop stop words and build n-grams.
        """
        # get_stop_words() returns None when no stop words are configured.
        # The text is lower-cased below, so the stop words are lower-cased too;
        # a frozenset keeps the membership test in _word_ngrams cheap.
        configured_stop_words = self.get_stop_words() or ()
        stop_words = frozenset(word.lower() for word in configured_stop_words)

        # Lower-case and de-duplicate the phrases; longest first so that a short
        # phrase cannot break up a longer phrase that contains it.
        stop_grams = sorted(
            {gram.lower() for gram in self.stop_grams if gram},
            key=lambda gram: (-len(gram), gram),
        )

        preprocessor = self.build_preprocessor()
        tokenizer = self.build_tokenizer()
        remove_ngrams = self.remove_ngrams

        # create the analyzer that will be returned by this method
        def analyser(doc):

            # apply the preprocessing step to the lower-cased document
            doc_clean = preprocessor(doc.lower())

            # remove phrase stopwords from the PREPROCESSED text
            # (previously the raw `doc` was passed, discarding the step above)
            doc_clean = remove_ngrams(doc_clean, stop_grams)

            # tokenize using default tokenizer
            tokens = tokenizer(doc_clean)

            # use CountVectorizer's _word_ngrams built in method
            # to remove stop words and extract n-grams
            return self._word_ngrams(tokens, stop_words)

        return analyser

In [None]:
# Remove stop words

import nltk
nltk.download('stopwords')

# Start from nltk's English stop-word list ...
stop_words = stopwords.words('english')

# ... and extend it with additional custom single-token stop words.
custom_sw = ['mathrsfs', 'amsmath', 'shrink', 'citation', 
             'volume', 'journal', 'Issue', 'Volume',
             'EarlyView', 'The', 'This', 'We', 'In', 'Journal', 'Philosophical', 'Studies', 'JW', 
             'Page', 'Australasian', '90', '2012', 'March', 'article', 'abstract', '737', 'any055',
             'analys', '10', 'Analysis', 'Wittgensteins', 'Tractatus', 'No', 'available', 'Book']

stop_words.extend(custom_sw)

# Multi-word phrases handed to CustomVectorizer as `stop_grams`.
# The list contains several repeated entries (e.g. 'doi 10', '1093 analys').
stop_grams = ['available citation', 'citation abstract',
              'citation abstract', 'abstract available',
              'citation analysis', 'doi 10', '53 doi',
              'usa philosophical', '1093 analys', 'australasian journal',
              'doi 10', 'research 104', '2022 philosophy',
              'australasian philosophy', 'philosophy 89',
              'philosophy 90', 'Philosophical Studies', '1573 0883', 'Australasian Journal', 
              'Philosophy 89', 'Philosophy 90', 'original publication', 'review article', 'Analysis 78', 'analys any055',
              'article abstract', 'citation Analysis', '1093 analys', '10 1093', 'Book review', 
              'Phenomenological Research', 'Philosophy Phenomenological', 'Research Philosophy', '2022 Philosophy',
              'phenomenological research', 'Page 676', 'available citation', 'No available', 
              'citation No', 'citation Book', 'Analysis 78', '1093 analys', 'Type Article', 'philosophy and phenomenological research',
               'Philosophy and Phenomenological Research', 'Philosophy 104', '2011 Philosophy', 'no available', 'available No', 'available Book',
               'Book available']


# Unigrams and bigrams, with both the stop words and the stop grams applied.
vectorizer_model = CustomVectorizer(ngram_range=(1, 2), stop_words=list(stop_words), stop_grams = stop_grams)

In [None]:
# Sentence-transformers model used to embed the abstracts
embedding_model = SentenceTransformer('all-mpnet-base-v2')

In [None]:
# Embed every abstract once and save the resulting array to 'embeddings.npy'
embeddings = embedding_model.encode(docs, show_progress_bar=True)
np.save('embeddings.npy', embeddings)

In [11]:
# UMAP dimensionality reduction passed to BERTopic (cosine metric, fixed random_state)
umap_model = UMAP(n_neighbors=8, n_components=4, min_dist=0.0, metric='cosine', random_state=42)

In [12]:
# HDBSCAN clustering passed to BERTopic.
# NOTE(review): gen_min_span_tree=True is presumably needed for the
# `relative_validity_` score read in a later cell — confirm against the hdbscan docs.
hdbscan_model = HDBSCAN(min_cluster_size=33, min_samples=31,
                        gen_min_span_tree=True,
                        metric='euclidean',
                        prediction_data=True,
                        cluster_selection_method='eom')

In [13]:
# Assemble the topic model from the custom UMAP, HDBSCAN, embedding and vectorizer components.
# NOTE(review): `diversity` may not be accepted by every BERTopic release — confirm
# against the installed version.
model_bert = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    top_n_words=15,
    language='english',
    calculate_probabilities=True,
    verbose=True,
    diversity=0.35
)

In [14]:
# Fit the topic model on the abstracts, passing the precomputed embeddings
topics, probs = model_bert.fit_transform(docs, embeddings)

2023-03-08 09:40:04,987 - BERTopic - Reduced dimensionality
2023-03-08 09:40:11,381 - BERTopic - Clustered reduced embeddings


In [None]:
# Save the fitted model as "topic_model" (relative path) without the embedding model
model_bert.save("topic_model", save_embedding_model=False)

Note: the model must be loaded in the same Python environment as the one in which it was saved.

In [None]:
# Loading a model on CPU only hardware
# Additionally, define the CustomVectorizer again
# NOTE(review): `select_backend` comes from a private BERTopic module (`_utils`).
# NOTE(review): the model is loaded from the Drive path below, while the save cell
# writes to the relative path "topic_model" — confirm the file was copied there.

from bertopic.backend._utils import select_backend
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer('all-mpnet-base-v2')
model = select_backend(embedding_model)
model_bert = BERTopic.load("/content/drive/MyDrive/topic_model", embedding_model = model)

In [None]:
# Get a clustering metric for model comparison
# For in-depth information refer to: https://www.dbs.ifi.lmu.de/~zimek/publications/SDM2014/DBCV.pdf 

model_bert.hdbscan_model.relative_validity_

In [None]:
# Get all topics created by the model 
model_bert.get_topics()

In [None]:
# Get topic sizes
model_bert.topic_sizes_

In [None]:
# Get similar topics to given word
similar_topics, similarity = model_bert.find_topics("statistics", top_n = 3)

most_similar = similar_topics[0]
print("Most Similar Topic Info: \n{}".format(model_bert.get_topic(most_similar)))
print("Similarity Score: {}".format(similarity[0]))

Visualizations:

In [None]:
# Topic per class (journal): the class of each document is the journal it appeared in
classes = list(df["Journal"])
topics_per_class = model_bert.topics_per_class(docs, classes=classes)

11it [01:15,  6.91s/it]


In [None]:
# Document cluster map
model_bert.visualize_documents(docs, embeddings=embeddings)

In [None]:
# Similarity matrix
model_bert.visualize_heatmap()

In [None]:
# Topics over time: the publication year of each document is used as its timestamp
timestamps = df["Year"].to_list()
topics_over_time = model_bert.topics_over_time(docs, timestamps, global_tuning=True, evolution_tuning=True)
# Plot only the 10 topics selected by top_n_topics
model_bert.visualize_topics_over_time(topics_over_time, top_n_topics=10, )

In [None]:
# Intertopic Distance Map
model_bert.visualize_topics()

Parameter fine-tuning for UMAP and HDBSCAN.
The negative DBCV score of the HDBSCAN clustering was used as the cost function to minimize.

In [None]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, partial, space_eval

In [None]:
import umap
import hdbscan

def generate_clusters(message_embeddings,
                      n_neighbors,
                      n_components, 
                      min_cluster_size,
                      min_samples,
                      random_state = None):
    """
    Reduce the embeddings with UMAP, cluster the reduced points with HDBSCAN
    and return the fitted HDBSCAN object.
    """

    # Step 1: project the embeddings down to `n_components` dimensions
    reducer = umap.UMAP(n_neighbors=n_neighbors,
                        n_components=n_components,
                        min_dist=0.0,
                        metric='cosine',
                        random_state=random_state)
    reduced_embeddings = reducer.fit_transform(message_embeddings)

    # Step 2: cluster the low-dimensional points
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size,
                                min_samples=min_samples,
                                metric='euclidean',
                                gen_min_span_tree=True,
                                cluster_selection_method='eom')

    return clusterer.fit(reduced_embeddings)

In [None]:
# Background: in HDBSCAN's soft-clustering approach points are not assigned
# cluster labels but a vector of probabilities, one entry per cluster found,
# where the ith entry is the probability that the point belongs to the ith
# cluster. The cost below does NOT use those probabilities: it is computed from
# the hard labels and the relative validity score, so `prob_threshold` has no
# effect and is only kept so that existing calls keep working.
def score_clusters(clusters, prob_threshold = 0.05):
    """
    Return the label count and the cost of a fitted HDBSCAN clustering.

    Parameters
    ----------
    clusters : object exposing ``labels_`` and ``relative_validity_``
        The fitted clusterer returned by ``generate_clusters``.
    prob_threshold : float, optional
        Unused; accepted for backward compatibility.

    Returns
    -------
    label_count : int
        Number of distinct values in ``clusters.labels_``. HDBSCAN's noise
        label (-1), when present, is counted as well.
    cost : float
        ``-clusters.relative_validity_``, so that a lower cost is better.
    """
    label_count = len(np.unique(clusters.labels_))
    cost = -clusters.relative_validity_

    return label_count, cost

In [None]:
from tqdm import trange
import sklearn
import random

def random_search(embeddings, space, num_evals):
    """
    Evaluate `num_evals` randomly drawn hyperparameter combinations from
    `space` and return one summary row per run, sorted by ascending cost.
    """

    # Drawn in this fixed order for every run
    param_names = ('n_neighbors', 'n_components', 'min_cluster_size', 'min_samples')
    rows = []

    for run_id in trange(num_evals):
        sampled = {name: random.choice(space[name]) for name in param_names}

        clusters = generate_clusters(embeddings, random_state = 42, **sampled)
        label_count, cost = score_clusters(clusters, prob_threshold = 0.05)

        rows.append([run_id, *sampled.values(), label_count, cost])

    summary_columns = ['run_id', *param_names, 'label_count', 'cost']
    summary = pd.DataFrame(rows, columns=summary_columns)

    return summary.sort_values(by='cost')

In [None]:
def objective(params, embeddings, label_lower, label_upper):
    """
    Loss minimized by hyperopt: the clustering cost, plus a fixed penalty when
    the number of labels lies outside [label_lower, label_upper].
    """

    cluster_keys = ('n_neighbors', 'n_components', 'min_cluster_size',
                    'min_samples', 'random_state')
    clusters = generate_clusters(embeddings,
                                 **{key: params[key] for key in cluster_keys})

    label_count, cost = score_clusters(clusters, prob_threshold = 0.05)

    # A flat 0.15 is added to the cost when the label count is out of range
    within_range = label_lower <= label_count <= label_upper
    penalty = 0 if within_range else 0.15

    return {'loss': cost + penalty, 'label_count': label_count, 'status': STATUS_OK}

In [None]:
def bayesian_search(embeddings, space, label_lower, label_upper, max_evals=100):
    """
    Run a Bayesian (TPE) search over the hyperopt space `space` to minimize
    `objective`, print the best parameters, and return
    (best parameters, clustering refitted with them, hyperopt trials).
    """

    trials = Trials()

    # Bind everything except the sampled parameters
    bound_objective = partial(objective,
                              embeddings=embeddings,
                              label_lower=label_lower,
                              label_upper=label_upper)

    best_choice = fmin(bound_objective,
                       space=space,
                       algo=tpe.suggest,
                       max_evals=max_evals,
                       trials=trials)

    # fmin's result is translated back into actual parameter values
    best_params = space_eval(space, best_choice)
    print('best:')
    print(best_params)
    print(f"label count: {trials.best_trial['result']['label_count']}")

    # Refit once with the winning configuration
    cluster_keys = ('n_neighbors', 'n_components', 'min_cluster_size',
                    'min_samples', 'random_state')
    best_clusters = generate_clusters(embeddings,
                                      **{key: best_params[key] for key in cluster_keys})

    return best_params, best_clusters, trials

In [None]:
# Search space handed to bayesian_search: each hp.choice picks one value from
# the given integer range; random_state is held constant at 42.
hspace = {
    "n_neighbors": hp.choice("n_neighbors",range(4,15)),
    "n_components": hp.choice("n_components", range(2,6)),
    "min_cluster_size": hp.choice("min_cluster_size", range(25,45)),
    "min_samples": hp.choice("min_samples", range(10,35)),
    "random_state": 42
}

# label_lower / label_upper: assumed lower/upper limit for the number of labels;
# results outside this range are penalized in `objective`.
# max_evals: number of parameter combinations evaluated by the search.
label_lower = 70
label_upper = 130
max_evals = 300

In [None]:
# Run the search on the saved document embeddings with the settings defined above
best_params, best_cluster_use, trials_use = bayesian_search(embeddings,
                                                           space = hspace,
                                                           label_lower = label_lower,
                                                           label_upper = label_upper,
                                                           max_evals = max_evals)

Evaluation:

In [None]:
# Get coherence scores

from bertopic import BERTopic
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

# Refit a model with the same components so that `topics` matches `docs`
model_bert = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    top_n_words=15,
    language='english',
    calculate_probabilities=True,
    verbose=True,
    diversity=0.35
)

topics, _ = model_bert.fit_transform(docs,embeddings)

# Preprocess Documents: concatenate all documents of a topic into one text
documents = pd.DataFrame({"Document": docs,
                          "ID": range(len(docs)),
                          "Topic": topics})
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = model_bert._preprocess_text(documents_per_topic.Document.values)

# Extract vectorizer and analyzer from BERTopic
vectorizer = model_bert.vectorizer_model
analyzer = vectorizer.build_analyzer()

# Extract features for Topic Coherence evaluation
words = vectorizer.get_feature_names_out()
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]

# Use the topic ids that were actually assigned, skipping the outlier topic -1.
# (range(len(set(topics)) - 1) silently dropped the last real topic whenever
# no document was assigned to -1.)
topic_ids = sorted(topic for topic in set(topics) if topic != -1)
topic_words = [[word for word, _ in model_bert.get_topic(topic)]
               for topic in topic_ids]

# Evaluate
coherence_model = CoherenceModel(topics=topic_words, 
                                 texts=tokens, 
                                 corpus=corpus,
                                 dictionary=dictionary, 
                                 coherence='c_v') # can also use 'c_uci', 'c_npmi', 'u_mass'

coherence = coherence_model.get_coherence()

coherence

In [None]:
# Topic Diversity 

def proportion_unique_words(model_bert, topk=15):
    """
    Compute the proportion of unique words among the top words of all topics.

    Parameters
    ----------
    model_bert: fitted BERTopic model (must provide ``get_topics()``)
    topk: top k words of each topic on which the topic diversity is computed

    Returns
    -------
    puw: number of distinct top-k words divided by (topk * number of topics)

    Raises
    ------
    ValueError: if the model has no topic besides the outlier topic -1
    Exception: if any topic has fewer than ``topk`` words
    """

    # Iterate over the topic ids that actually exist and skip the outlier
    # topic -1, instead of assuming the ids are -1, 0, 1, ..., n-2.
    topics = [[word for word, _ in word_scores]
              for topic_id, word_scores in model_bert.get_topics().items()
              if topic_id != -1]

    if not topics:
        raise ValueError('The model contains no topics to evaluate')

    # Every topic (not only the first one) must provide topk words
    if topk > min(len(topic) for topic in topics):
        raise Exception('Words in topics are less than '+str(topk))

    unique_words = set()
    for topic in topics:
        unique_words.update(topic[:topk])

    return len(unique_words) / (topk * len(topics))


puw = proportion_unique_words(model_bert)

puw

In [None]:
# Topic Diversity 

from itertools import combinations

def pairwise_jaccard_diversity(model_bert, topk=15):
    '''
    Compute the average pairwise Jaccard distance between the topics.

    Parameters
    ----------
    model_bert: fitted BERTopic model (must provide ``get_topics()``)
    topk: top k words of each topic on which the topic diversity is computed

    Returns
    -------
    pjd: average pairwise jaccard distance

    Raises
    ------
    ValueError: if fewer than two topics (besides the outlier topic -1) exist
    '''

    # Use the topic ids that actually exist and skip the outlier topic -1;
    # only the first `topk` words of every topic take part in the comparison.
    topic_word_sets = [{word for word, _ in word_scores[:topk]}
                       for topic_id, word_scores in model_bert.get_topics().items()
                       if topic_id != -1]

    if len(topic_word_sets) < 2:
        raise ValueError('At least two topics are needed to compute a pairwise distance')

    dist = 0
    count = 0
    for words_a, words_b in combinations(topic_word_sets, 2):
        union_size = len(words_a | words_b)
        # Two empty topics have no words in common to compare; treat them as identical
        overlap = len(words_a & words_b) / union_size if union_size else 1
        dist = dist + (1 - overlap)
        count = count + 1
    return dist/count

pjd = pairwise_jaccard_diversity(model_bert)

pjd