<a href="https://colab.research.google.com/github/CHANDMX20/ML/blob/main/ML_BERTopic_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud,STOPWORDS

In [2]:
data = pd.read_csv("articles.csv")
data.head()

Unnamed: 0,PMID,ArticleTitle,PubDate,AuthorList,Abstract
0,12297610,Problem substance use among depressed patients...,2002.0,"Roeloffs, Carol A; Wells, Kenneth B; Ziedonis,...",This study identifies characteristics associat...
1,14652336,Racial and ethnic disparities in emergency dep...,2003.0,"Tamayo-Sarver, Joshua H; Hinze, Susan W; Cydul...",We examined racial and ethnic disparities in a...
2,16055770,Developments in the epidemiology of drug use a...,2005.0,"Compton, Wilson M; Thomas, Yonette F; Conway, ...",The past 30 years of research on the epidemiol...
3,16843611,"Motives, diversion and routes of administratio...",2007.0,"McCabe, Sean Esteban; Cranford, James A; Boyd,...",The main objectives of this study were to asse...
4,17169712,Opioid analgesics and rates of fatal drug pois...,2006.0,"Paulozzi, Leonard J; Ryan, George W",To determine whether the variability in rate o...


In [3]:
#convert to lowercase
data['Abstract'] = data['Abstract'].str.lower()

# Specify the column and words to remove
column_to_clean = 'Abstract'
words_to_remove = ['prescription', 'opioids', 'use', 'opioid', 'patient', 'patients', 'tongue', 'sticking', 'year', 'hydrocodone', 'oxycodone', 'morphine', 'hydromorphone', 'fentanyl', 'methadone', 'heroin', 'methamphetamine', 'marijuana', 'overdose', 'overdoses', 'drug', 'drugs', 'gabapentin']

# Function to remove words from a text
def remove_words(text):
    for word in words_to_remove:
        text = text.replace(word, '')
    return text

# Apply the function to the specified column
data[column_to_clean] = data[column_to_clean].apply(remove_words)


In [5]:
import warnings
warnings.filterwarnings("ignore")

In [6]:
#!pip install bertopic




In [7]:
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

from gensim.models.word2vec import Word2Vec
import gensim.downloader as api

from sklearn.decomposition import PCA

from sklearn.cluster import KMeans

In [55]:
# Step 1 - Extract embeddings
embedding_model = Word2Vec

# Step 2 - Reduce dimensionality
#umap_model = UMAP(n_neighbors=10, n_components=5, min_dist=0.0, metric='cosine')
dim_model = PCA(n_components=5)

# Step 3 - Cluster reduced embeddings
#hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
cluster_model = KMeans(n_clusters=50)


# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words="english",  min_df=20)

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
representation_model = KeyBERTInspired()

In [56]:
# All steps together
topic_model = BERTopic(
  embedding_model=embedding_model,          # Step 1 - Extract embeddings
  umap_model=dim_model,                    # Step 2 - Reduce dimensionality
  hdbscan_model=cluster_model,              # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,        # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,                # Step 5 - Extract topic words
  representation_model=representation_model # Step 6 - (Optional) Fine-tune topic represenations
)

In [57]:
abstract_topics, _ = topic_model.fit_transform(data.Abstract)

In [58]:
freq = topic_model.get_topic_info()
print("Number of topics: {}".format(len(freq)))
freq.head()

Number of topics: 50


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,10,0_prevalence_health_risk_gender,"[prevalence, health, risk, gender, medical, as...",[visits to the emergency department (ed) for ...
1,1,10,1_medical_health_risk_studies,"[medical, health, risk, studies, population, i...",[with governments' increasing efforts to curb ...
2,2,9,2_gender_prevalence_health_age,"[gender, prevalence, health, age, substance, s...",[despite the fact that important gender differ...
3,3,9,3_prevalence_health_medical_treatment,"[prevalence, health, medical, treatment, studi...","[as a population, non-medical rs are not wel..."
4,4,8,4_pain_prevalence_medical_health,"[pain, prevalence, medical, health, risk, exam...","[pain is common in s with liver disease, diffi..."


In [59]:
a_topic = freq.iloc[1]["Topic"] # Select the 1st topic
topic_model.get_topic(a_topic) # Show the words and their c-TF-IDF scores

[('medical', 0.34440944),
 ('health', 0.27049887),
 ('risk', 0.23243418),
 ('studies', 0.20162158),
 ('population', 0.17369765),
 ('increased', 0.15764628),
 ('data', 0.15293862),
 ('high', 0.14587002),
 ('examined', 0.13739884),
 ('association', 0.13227025)]

In [60]:
topic_model.visualize_barchart(top_n_topics=20)


In [61]:
topic_model.visualize_topics()


In [62]:
topic_model.visualize_term_rank()

In [63]:
topic_model.visualize_term_rank(log_scale=True)

In [64]:
topic_model.visualize_hierarchy(top_n_topics=20)

In [65]:
# Select most 3 similar topics
similar_topics, similarity = topic_model.find_topics("sex differences", top_n = 5)
similar_topics

[36, 10, 39, 17, 2]

In [66]:
most_similar = similar_topics[0]
print("Most Similar Topic Info: \n{}".format(topic_model.get_topic(most_similar)))
print("Similarity Score: {}".format(similarity[1]))

Most Similar Topic Info: 
[('prevalence', 0.44528183), ('gender', 0.3976789), ('ethnicity', 0.33330154), ('differences', 0.30673528), ('age', 0.28443837), ('race', 0.28165483), ('sex', 0.25833255), ('medical', 0.250045), ('reported', 0.20881397), ('non', 0.19768846)]
Similarity Score: 0.5604375004768372


In [67]:
topic_model.visualize_heatmap(n_clusters=10, top_n_topics=50)