In [34]:
from bertopic import BERTopic
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.decomposition import PCA
from nltk.corpus import stopwords
import pickle
import pandas as pd
from bertopic.vectorizers import ClassTfidfTransformer
import plotly.io as pio

from tqdm._tqdm_notebook import tqdm

import openai

from bertopic.representation import KeyBERTInspired

from transformers import pipeline
from bertopic.representation import TextGeneration
from bertopic.representation import PartOfSpeech
from bertopic.representation import MaximalMarginalRelevance
from bertopic.representation import OpenAI

import glob
from sklearn.metrics.pairwise import cosine_similarity

In [29]:
# this functions are only to save the embedding
def save(x, file_name):
    with open(file_name, 'wb') as handle:
        pickle.dump(x, handle, protocol=pickle.HIGHEST_PROTOCOL)

def restore(file_name):
    with open(file_name, 'rb') as handle:
        x = pickle.load(handle)
    return x

In [3]:
df = pd.read_table('../data/biology_abstracts.csv',sep=';')

Prepare data

In [4]:
df['text'] = df.title+ ' ' + df.abstract
df['text'] = df.text.fillna('')

In [5]:
text_df = df[-df.OST_BK.duplicated(keep='last')].reset_index(drop=True) #drop duplicates

In [28]:
# umap_model = UMAP(n_neighbors=15, n_components=3, min_dist=0.0, metric='cosine', low_memory = True, n_jobs=32)
sentence_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2") #"paraphrase-MiniLM-L3-v2"

def rescale(x, inplace=False):
    """ Rescale an embedding so optimization will not have convergence issues.
    """
    if not inplace:
        x = np.array(x, copy=True)

    x /= np.std(x[:, 0]) * 10000

    return x

In [7]:
# %%time
# embeddings = sentence_model.encode(text_df.text.values, show_progress_bar=False)
# save(embeddings, '../results/embeddings_1.p')

In [8]:
embeddings = restore('../results/embeddings_1.p')

In [9]:
embeddings.shape

(34797, 384)

In [10]:
pca_embeddings = rescale(PCA(n_components=5).fit_transform(embeddings))

In [11]:
pca_embeddings.shape

(34797, 5)

## basic elements

In [12]:
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    init=pca_embeddings,
    random_state=1234, n_jobs=4, low_memory=False)    
sw=stopwords.words(['english','spanish','french']) + ['elsevier']
vectorizer_model = CountVectorizer(ngram_range=(1,1), stop_words=sw,max_df=1.0, min_df=0.001)
# Setting HDBSCAN model
hdbscan_model = HDBSCAN(min_cluster_size=100,min_samples=1, metric='euclidean',cluster_selection_epsilon=0.05, 
                        cluster_selection_method='leaf', prediction_data=True,core_dist_n_jobs=4,memory='tmp/') #, min_samples=1
ctfidf_model  = ClassTfidfTransformer()

## Representation models

In [13]:
# The main representation of a topic
main_representation = KeyBERTInspired()

### LLM


In [18]:
# client = openai.OpenAI(api_key="")


In [19]:
# Create your representation model
short_name_prompt = """
I have a corpus of Biology with 100 topics. I have a topic that is described by the following keywords: [KEYWORDS]
Based on the information above, extract a short topic label of a signle word that can accurately represent the topic, in the following format:
topic: <topic label>
"""
long_name_prompt = """
I have a corpus of Biology with 100 topics. I have a topic that is described by the following keywords: [KEYWORDS]
Based on the information above, extract a short topic label between one and three words that can accurately represent the topic, in the following format:
topic: <topic label>
"""
# title_prompt = """
# I have a topic that is described by the following keywords: [KEYWORDS]
# Based on the previous keywords, please give me a title that can describe the topic
# """

generator = pipeline('text2text-generation', model='google/flan-t5-base')

flan_snp = TextGeneration(generator,prompt=short_name_prompt)
flan_lnp = TextGeneration(generator,prompt=long_name_prompt)

openai4m_snp = OpenAI(client, model="gpt-4o-mini", chat=True, prompt=short_name_prompt, nr_docs=50, delay_in_seconds=1)
openai4m_lnp = OpenAI(client, model="gpt-4o-mini", chat=True, prompt=long_name_prompt, nr_docs=50, delay_in_seconds=1)
# openai4m_tp = OpenAI(client, model="gpt-4o-mini", chat=True, prompt=title_prompt, nr_docs=50, delay_in_seconds=1)

openai4o_snp = OpenAI(client, model="gpt-4o", chat=True, prompt=short_name_prompt, nr_docs=50, delay_in_seconds=1)
openai4o_lnp = OpenAI(client, model="gpt-4o", chat=True, prompt=long_name_prompt, nr_docs=50, delay_in_seconds=1)
# openai4o_tp = OpenAI(client, model="gpt-4o", chat=True, prompt=title_prompt, nr_docs=50, delay_in_seconds=1)



In [20]:
# Add all models together to be run in a single `fit`
representation_models = {
   "Main": main_representation,
    "Aspect1": flan_snp,
    "Aspect2": flan_lnp,
    "Aspect3": openai4m_snp,
    "Aspect4": openai4m_lnp,
    "Aspect5": openai4o_snp,
    "Aspect6": openai4o_lnp
}


#  Model FIT

In [21]:

topic_model = BERTopic(verbose=True,embedding_model=sentence_model, low_memory=True, calculate_probabilities=False, 
                       vectorizer_model=vectorizer_model, 
                       hdbscan_model=hdbscan_model,
                       umap_model=umap_model,
                       ctfidf_model=ctfidf_model,
                       representation_model=representation_models)


In [43]:
%%time
topics, probabilities  = topic_model.fit_transform(text_df.text.values, embeddings)

2024-08-05 14:37:57,298 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-08-05 14:38:11,765 - BERTopic - Dimensionality - Completed ✓
2024-08-05 14:38:11,766 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-08-05 14:38:11,902 - BERTopic - Cluster - Completed ✓
2024-08-05 14:38:11,906 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-08-05 14:38:13,822 - BERTopic - Representation - Completed ✓


CPU times: user 20 s, sys: 86.2 ms, total: 20.1 s
Wall time: 17.9 s


In [34]:
topic_info= topic_model.get_topic_info()

topic_info['Aspect1'] = topic_info['Aspect1'].apply(lambda x: x[0] if x else '').str.replace("'", "").str.replace('"', '')
topic_info['Aspect2'] = topic_info['Aspect2'].apply(lambda x: x[0] if x else '').str.replace("'", "").str.replace('"', '')
topic_info['Aspect3'] = topic_info['Aspect3'].apply(lambda x: x[0] if x else '').str.replace("'", "").str.replace('"', '')
topic_info['Aspect4'] = topic_info['Aspect4'].apply(lambda x: x[0] if x else '').str.replace("'", "").str.replace('"', '')
topic_info['Aspect5'] = topic_info['Aspect5'].apply(lambda x: x[0] if x else '').str.replace("'", "").str.replace('"', '')
topic_info['Aspect6'] = topic_info['Aspect6'].apply(lambda x: x[0] if x else '').str.replace("'", "").str.replace('"', '')


In [36]:
topic_info= topic_info.rename(columns={
    "Aspect1": "flan_snp",
    "Aspect2": "flan_lnp",
    "Aspect3": "openai4m_snp",
    "Aspect4": "openai4m_lnp",
    "Aspect5": "openai4o_snp",
    "Aspect6": "openai4o_lnp"})

In [37]:
topic_info=topic_info[['Topic', 'Count', 'Name','Representative_Docs', 'Representation', 'flan_snp',
       'openai4m_snp', 'openai4o_snp', 'flan_lnp', 'openai4m_lnp',
       'openai4o_lnp']]

In [38]:
#$topic_model.save('../results/topic_model/basic_model')
topic_info.to_excel('../results/topic_model/topic_info.xlsx',index=False)

## Stability

In [22]:
#base model
topic_model = BERTopic(verbose=True,embedding_model=sentence_model, low_memory=True, calculate_probabilities=False, 
                       vectorizer_model=vectorizer_model, 
                       hdbscan_model=hdbscan_model,
                       umap_model=umap_model,
                       ctfidf_model=ctfidf_model)

In [23]:
def get_topic_info_clean(topic_model):
    topic_info= topic_model.get_topic_info()
    
    topic_info['Aspect1'] = topic_info['Aspect1'].apply(lambda x: x[0] if x else '').str.replace("'", "").str.replace('"', '')
    topic_info['Aspect2'] = topic_info['Aspect2'].apply(lambda x: x[0] if x else '').str.replace("'", "").str.replace('"', '')
    topic_info['Aspect3'] = topic_info['Aspect3'].apply(lambda x: x[0] if x else '').str.replace("'", "").str.replace('"', '')
    topic_info['Aspect4'] = topic_info['Aspect4'].apply(lambda x: x[0] if x else '').str.replace("'", "").str.replace('"', '')
    topic_info['Aspect5'] = topic_info['Aspect5'].apply(lambda x: x[0] if x else '').str.replace("'", "").str.replace('"', '')
    topic_info['Aspect6'] = topic_info['Aspect6'].apply(lambda x: x[0] if x else '').str.replace("'", "").str.replace('"', '')
    
    topic_info= topic_info.rename(columns={
        "Aspect1": "flan_snp",
        "Aspect2": "flan_lnp",
        "Aspect3": "openai4m_snp",
        "Aspect4": "openai4m_lnp",
        "Aspect5": "openai4o_snp",
        "Aspect6": "openai4o_lnp"})
    
    topic_info=topic_info[['Topic', 'Count', 'Name','Representative_Docs', 'Representation', 'flan_snp',
           'openai4m_snp', 'openai4o_snp', 'flan_lnp', 'openai4m_lnp',
           'openai4o_lnp']]
    return topic_info


In [24]:
# Add all models together to be run in a single `fit`
representation_models = {
   "Main": main_representation,
    "Aspect1": flan_snp,
    "Aspect2": flan_lnp,
    "Aspect3": openai4m_snp,
    "Aspect4": openai4m_lnp,
    "Aspect5": openai4o_snp,
    "Aspect6": openai4o_lnp
}


In [25]:
%%time
topics, probabilities  = topic_model.fit_transform(text_df.text.values, embeddings)

2024-08-05 21:37:48,091 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-08-05 21:38:10,891 - BERTopic - Dimensionality - Completed ✓
2024-08-05 21:38:10,893 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-08-05 21:38:12,567 - BERTopic - Cluster - Completed ✓
2024-08-05 21:38:12,571 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-08-05 21:38:14,586 - BERTopic - Representation - Completed ✓


CPU times: user 28.2 s, sys: 565 ms, total: 28.8 s
Wall time: 28.1 s


In [69]:
topic_model.save('../results/topic_model/basic_model')



In [None]:
#the connection got lost. I run everything on 1_topic_model_iteration.py
# ti_list = []

# for i in tqdm(range(4,21)):
#     topic_model.load('../results/topic_model/basic_model') # I ensure I have the same basic model
#     topic_model.update_topics(text_df.text.values,representation_model=representation_models)
#     topic_info = get_topic_info_clean(topic_model)
#     topic_info['iteration'] = i
#     ti_list.append(topic_info)

In [None]:
first_iterations = pd.read_excel('../results/topic_model/topic_info_iteration_1-3.xlsx')

iter_df = pd.concat((pd.read_csv(file) for file in glob.glob('../results/topic_model/topic_info_iteration_*')), ignore_index=True)

iter_df = pd.concat([first_iterations,iter_df])

In [17]:
iter_df = iter_df.sort_values(["Topic","iteration"]).reset_index(drop=True)

In [18]:
iter_df.to_csv('../results/topic_model/topic_info_all_iterations.csv',index=False)

## cosine similarity

In [26]:
iter_df_long= pd.melt(iter_df,
                      id_vars=['Topic','Name', 'iteration'],
                      value_vars = ['flan_snp', 'openai4m_snp', 'openai4o_snp', 'flan_lnp', 'openai4m_lnp', 'openai4o_lnp'],
                      value_name='label',var_name='model')

In [53]:
iter_df_long

Unnamed: 0,Topic,Name,iteration,model,label
0,-1,-1_gene_genes_genetic_species,1,flan_snp,biology
1,-1,-1_gene_genes_genetic_species,2,flan_snp,biology
2,-1,-1_gene_genes_genetic_species,3,flan_snp,biology
3,-1,-1_genes_genetic_gene_species,4,flan_snp,biology
4,-1,-1_genes_genetic_gene_species,5,flan_snp,biology
...,...,...,...,...,...
12613,103,103_bacterial_bacteria_caulobacter_proteins,19,openai4o_lnp,Bacterial Protein Mechanisms
12614,103,103_bacterial_bacteria_caulobacter_proteins,20,openai4o_lnp,Bacterial Protein Mechanisms
12615,104,104_bacterial_bacteria_caulobacter_proteins,1,openai4o_lnp,Bacterial Protein Mechanisms
12616,104,104_bacterial_bacteria_caulobacter_proteins,2,openai4o_lnp,Bacterial Proteins Mechanism


In [30]:
label_embeddings = sentence_model.encode(iter_df_long.label.values, show_progress_bar=False)

In [32]:
label_embeddings.shape

(12618, 384)

In [43]:
cosine_sim_matrix = cosine_similarity(label_embeddings)

In [66]:
# Create a dictionary to map (Topic, model) to their indices
topic_model_indices = iter_df_long.groupby(['Topic', 'model']).apply(lambda g: g.index.tolist()).to_dict()

# Function to calculate average distance
def average_distance(indices1, indices2, cosine_sim_matrix):
    distances = []
    for i in indices1:
        for j in indices2:
            distances.append(cosine_sim_matrix[i, j])
    return np.mean(distances)

# Compute average distances for each Topic&model combination
results = []
for (topic, model1), indices1 in topic_model_indices.items():
    for (topic2, model2), indices2 in topic_model_indices.items():
        if topic == topic2:  # Ensure Topic1=Topic2 and models are different
            avg_dist = average_distance(indices1, indices2, cosine_sim_matrix)
            results.append({
                'Topic': topic,
                'Model1': model1,
                'Model2': model2,
                'AverageSimilarity': avg_dist
            })

# Convert results to a DataFrame
similarities_df = pd.DataFrame(results)

  topic_model_indices = iter_df_long.groupby(['Topic', 'model']).apply(lambda g: g.index.tolist()).to_dict()


In [67]:
similarities_df

Unnamed: 0,Topic,Model1,Model2,AverageSimilarity
0,-1,flan_lnp,flan_lnp,0.907734
1,-1,flan_lnp,flan_snp,0.945726
2,-1,flan_lnp,openai4m_lnp,0.569667
3,-1,flan_lnp,openai4m_snp,0.687096
4,-1,flan_lnp,openai4o_lnp,0.524416
...,...,...,...,...
3811,104,openai4o_snp,flan_snp,0.676948
3812,104,openai4o_snp,openai4m_lnp,0.737886
3813,104,openai4o_snp,openai4m_snp,0.699164
3814,104,openai4o_snp,openai4o_lnp,0.766909


In [69]:
# Calculate average distance for Model1 and Model2
average_similarity = (similarities_df
    .groupby(['Model1', 'Model2'])
    .agg({'AverageSimilarity': 'mean'})
    .reset_index())

In [71]:
similarities_df.to_csv('../results/topic_model/iterations_topic_similarity.csv',index=False)
average_similarity.to_csv('../results/topic_model/iterations_average_similarity.csv',index=False)