In [57]:
from bertopic import BERTopic
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.decomposition import PCA
from nltk.corpus import stopwords
import pickle
import pandas as pd
from bertopic.vectorizers import ClassTfidfTransformer
import plotly.io as pio


import openai

from bertopic.representation import KeyBERTInspired

from transformers import pipeline
from bertopic.representation import TextGeneration
from bertopic.representation import PartOfSpeech
from bertopic.representation import MaximalMarginalRelevance
from bertopic.representation import OpenAI

In [2]:
# Use the 'iframe' renderer as the default for Plotly figures.
pio.renderers.default = "iframe"

In [3]:
# These functions are only used to save and restore the embeddings
def save(x, file_name):
    """Pickle ``x`` into ``file_name`` using the highest available protocol.

    Args:
        x: Object to serialize.
        file_name: Path of the file to write. It is opened in binary write
            mode, so any existing content is replaced.
    """
    with open(file_name, mode='wb') as out_file:
        pickle.dump(x, out_file, protocol=pickle.HIGHEST_PROTOCOL)

def restore(file_name):
    """Read ``file_name`` in binary mode and return the object unpickled from it.

    Note: unpickling can execute arbitrary code, so only call this on files
    that come from a trusted source.
    """
    with open(file_name, mode='rb') as in_file:
        return pickle.load(in_file)

In [4]:
# Load the abstracts table; fields in this file are separated by ';'.
abstracts_path = '../data/biology_abstracts.csv'
df = pd.read_table(abstracts_path, sep=';')

Prepare data

In [5]:
# Build one document per row from title + abstract.
# Missing values are filled *before* concatenating: `title + ' ' + abstract`
# evaluates to NaN whenever either side is NaN, which previously turned the
# whole document into '' even when the other field was present.
# NOTE(review): this changes the text of rows with a missing title or abstract,
# so embeddings cached from the old text should be regenerated.
df['text'] = (
    df.title.fillna('').astype(str) + ' ' + df.abstract.fillna('').astype(str)
).str.strip()  # strip the separator left over when one side is empty

In [6]:
# Drop duplicates: keep one row per OST_BK value (the last occurrence) and
# renumber the index from 0.
text_df = df.drop_duplicates(subset='OST_BK', keep='last').reset_index(drop=True)

In [7]:
# umap_model = UMAP(n_neighbors=15, n_components=3, min_dist=0.0, metric='cosine', low_memory = True, n_jobs=32)
# Sentence-embedding model (alternative noted by the author: "paraphrase-MiniLM-L3-v2").
EMBEDDING_MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"
sentence_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

def rescale(x, inplace=False, scale=10000):
    """Rescale an embedding so optimization will not have convergence issues.

    Every value is divided by ``np.std(x[:, 0]) * scale``, so the first column
    of the result has a standard deviation of ``1 / scale``.

    Args:
        x: 2-D array-like (it is indexed as ``x[:, 0]``).
        inplace: If True, ``x`` is divided in place and must therefore already
            be a floating-point NumPy array. If False (default), a copy is
            rescaled and ``x`` is left untouched.
        scale: Multiplier applied to the first-column standard deviation.
            Defaults to 10000, the value that used to be hard-coded.

    Returns:
        The rescaled array (``x`` itself when ``inplace`` is True).

    Raises:
        ValueError: If the first column's standard deviation is zero or not
            finite, which would otherwise fill the result with NaN/inf.
    """
    if not inplace:
        x = np.array(x, copy=True)
        # In-place division raises on integer arrays, so promote non-float copies.
        if not np.issubdtype(x.dtype, np.inexact):
            x = x.astype(float)

    first_col_std = np.std(x[:, 0])
    # Checked before dividing so an in-place input is never corrupted.
    if not np.isfinite(first_col_std) or first_col_std == 0:
        raise ValueError(
            "cannot rescale: first column has zero or non-finite standard deviation"
        )

    x /= first_col_std * scale

    return x

In [8]:
# %%time
# embeddings = sentence_model.encode(text_df.text.values, show_progress_bar=False)
# save(embeddings, '../results/embeddings_1.p')

In [9]:
# Load the cached document embeddings. When the cache file does not exist yet
# (e.g. on a first run), compute them with the sentence model and write the
# cache so that later runs can simply restore it.
EMBEDDINGS_PATH = '../results/embeddings_1.p'
try:
    embeddings = restore(EMBEDDINGS_PATH)
except FileNotFoundError:
    embeddings = sentence_model.encode(text_df.text.values, show_progress_bar=False)
    save(embeddings, EMBEDDINGS_PATH)

In [10]:
# Display the shape of the loaded embeddings as a quick sanity check.
embeddings.shape

(34797, 384)

In [11]:
# Project the embeddings onto 5 principal components and rescale them; the
# result is passed to UMAP as `init`, so n_components here has to match UMAP's
# n_components.
# A fixed random_state keeps the projection reproducible when scikit-learn
# selects its randomized SVD solver.
pca_embeddings = rescale(PCA(n_components=5, random_state=1234).fit_transform(embeddings))

In [12]:
# Display the shape of the PCA-reduced, rescaled embeddings.
pca_embeddings.shape

(34797, 5)

## Basic elements

In [13]:
# Dimensionality reduction: UMAP initialised with the rescaled PCA projection.
umap_kwargs = dict(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    init=pca_embeddings,
    random_state=1234,
    n_jobs=4,
    low_memory=False,
)
umap_model = UMAP(**umap_kwargs)

# Stop words: the NLTK English, Spanish and French lists plus the token 'elsevier'.
sw = [*stopwords.words(['english', 'spanish', 'french']), 'elsevier']

# Unigram bag-of-words vectorizer using the stop-word list above.
vectorizer_model = CountVectorizer(
    ngram_range=(1, 1),
    stop_words=sw,
    max_df=1.0,
    min_df=0.001,
)

# Setting HDBSCAN model
hdbscan_kwargs = dict(
    min_cluster_size=100,
    min_samples=1,
    metric='euclidean',
    cluster_selection_epsilon=0.05,
    cluster_selection_method='leaf',
    prediction_data=True,
    core_dist_n_jobs=4,
    memory='tmp/',
)
hdbscan_model = HDBSCAN(**hdbscan_kwargs)

# Class-based TF-IDF transformer with its default settings.
ctfidf_model = ClassTfidfTransformer()

## Representation models

In [14]:
# The main representation of a topic: KeyBERTInspired with its default settings.
# It is registered under the "Main" key of `representation_models`.
main_representation = KeyBERTInspired()

In [17]:
# Additional ways of representing a topic: a list of two representation models,
# KeyBERTInspired (top 30 words) followed by MaximalMarginalRelevance.
keybert_top30 = KeyBERTInspired(top_n_words=30)
mmr_diverse = MaximalMarginalRelevance(diversity=.5)
aspect_model1 = [keybert_top30, mmr_diverse]

### LLM


In [19]:
# Create your representation model: a FLAN-T5 text2text pipeline wrapped in
# BERTopic's TextGeneration, prompted with the topic keywords.
prompt = (
    "I have a topic described by the following keywords: [KEYWORDS]. "
    "Based on the previous keywords, what is this topic about?"
)
generator = pipeline('text2text-generation', model='google/flan-t5-base')
flan_model = TextGeneration(generator, prompt=prompt)

In [58]:
# Keep the API key out of the notebook: when no api_key argument is given, the
# OpenAI client reads it from the OPENAI_API_KEY environment variable (and
# raises if the variable is not set), instead of using an inline placeholder.
client = openai.OpenAI()


In [69]:
# Create your representation model
# Three labelling prompts; [KEYWORDS] is the placeholder for the topic keywords.
short_name_prompt = """
I have a topic that is described by the following keywords: [KEYWORDS]
Based on the previous keywords, please give me a single word that can accurately represent the topic
"""
long_name_prompt = """
I have a topic that is described by the following keywords: [KEYWORDS]
Based on the previous keywords, please give me a short name of no more than 3 words that can accurately represent the topic
"""
title_prompt = """
I have a topic that is described by the following keywords: [KEYWORDS]
Based on the previous keywords, please give me a title that can describe the topic
"""


def _openai_representation(model, prompt):
    """Build an OpenAI representation model with the settings shared by every variant."""
    return OpenAI(client, model=model, chat=True, prompt=prompt, nr_docs=50, delay_in_seconds=1)


# Naming: 4m = gpt-4o-mini, 4o = gpt-4o;
# snp / lnp / tp = short-name / long-name / title prompt.
openai4m_snp = _openai_representation("gpt-4o-mini", short_name_prompt)
openai4m_lnp = _openai_representation("gpt-4o-mini", long_name_prompt)
openai4m_tp = _openai_representation("gpt-4o-mini", title_prompt)

openai4o_snp = _openai_representation("gpt-4o", short_name_prompt)
openai4o_lnp = _openai_representation("gpt-4o", long_name_prompt)
openai4o_tp = _openai_representation("gpt-4o", title_prompt)



In [74]:
# Add all models together to be run in a single `fit`.
# The position in this list determines the "AspectN" key (starting at 1).
aspect_models = [
    flan_model,
    openai4m_snp,
    openai4m_lnp,
    openai4m_tp,
    openai4o_snp,
    openai4o_lnp,
    openai4o_tp,
]
representation_models = {
    "Main": main_representation,
    **{f"Aspect{idx}": model for idx, model in enumerate(aspect_models, start=1)},
}


# Model fit

In [76]:
# NOTE(review): `representation_model` (singular) is not passed to BERTopic
# below, which receives the `representation_models` dict; it looks like a
# leftover -- confirm no other cell uses it before removing.
representation_model = KeyBERTInspired()

# Assemble the topic model from the components configured in the cells above.
topic_model = BERTopic(
    verbose=True,
    low_memory=True,
    calculate_probabilities=False,
    embedding_model=sentence_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_models,
)


In [77]:
%%time
topics, probabilities  = topic_model.fit_transform(text_df.text.values, embeddings)

2024-08-01 14:40:41,408 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-08-01 14:40:56,226 - BERTopic - Dimensionality - Completed ✓
2024-08-01 14:40:56,227 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-08-01 14:40:56,365 - BERTopic - Cluster - Completed ✓
2024-08-01 14:40:56,369 - BERTopic - Representation - Extracting topics from clusters using representation models.
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 105/105 [00:09<00:00, 10.89it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 105/105 [02:57<00:00,  1.69s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 105/105 [02:58<00:00,  1.70s/it]
100%|███████████████

CPU times: user 1min 26s, sys: 12.8 s, total: 1min 38s
Wall time: 19min 40s


In [87]:
topic_info = topic_model.get_topic_info()


def _first_label(labels):
    """Return the first entry of ``labels``, or '' when ``labels`` is empty/falsy."""
    return labels[0] if labels else ''


# For every aspect column keep only the first entry of each cell and strip
# single and double quotes from it.
for aspect_col in [f'Aspect{i}' for i in range(1, 8)]:
    first_labels = topic_info[aspect_col].apply(_first_label)
    topic_info[aspect_col] = first_labels.str.replace("'", "").str.replace('"', '')


In [88]:
# Replace the generic aspect column names with the name of the representation
# model variable that produced each column.
aspect_column_names = {
    "Aspect1": "flan_model",
    "Aspect2": "openai4m_snp",
    "Aspect3": "openai4m_lnp",
    "Aspect4": "openai4m_tp",
    "Aspect5": "openai4o_snp",
    "Aspect6": "openai4o_lnp",
    "Aspect7": "openai4o_tp",
}
topic_info = topic_info.rename(columns=aspect_column_names)

In [93]:
# Keep only the columns to export, in the order they should appear.
export_columns = [
    'Topic',
    'Count',
    'Name',
    'Representative_Docs',
    'Representation',
    'flan_model',
    'openai4m_snp',
    'openai4m_lnp',
    'openai4m_tp',
    'openai4o_snp',
    'openai4o_lnp',
    'openai4o_tp',
]
topic_info = topic_info[export_columns]

In [95]:
# topic_model.save('../results/topic_model/basic_model')
# Write the per-topic summary table to an Excel file, without the index column.
topic_info_path = '../results/topic_model/topic_info.xlsx'
topic_info.to_excel(topic_info_path, index=False)