In [1]:
! pip install -q BERTopic

In [2]:
from bertopic.representation import KeyBERTInspired
from bertopic import BERTopic
import pandas as pd

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
file = "/content/drive/MyDrive/project/data.csv"

In [5]:
df = pd.read_csv(file, index_col='abstract_id')
df = df.sample(frac=0.01)

In [6]:
df.shape

(1907, 1)

In [7]:
df.head(5)

Unnamed: 0_level_0,abstract_text
abstract_id,Unnamed: 1_level_1
24466245,Plasmodium falciparum malaria is treated with ...
23130655,"Metformin ( MF ) ( 1,1-dimethylbiguanide HCl )..."
18366264,proton pump inhibitors ( PPIs ) block the H + ...
7974947,The aim of this study was to determine the eff...
15303633,To demonstrate that 5 days of treatment with a...


In [8]:
#Most frequent terms as stop words
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
import pandas as pd

documents = df['abstract_text'].tolist()

vectorizer = CountVectorizer(stop_words=list(ENGLISH_STOP_WORDS))
X = vectorizer.fit_transform(documents).toarray()
terms = vectorizer.get_feature_names_out()

document_lengths = X.sum(axis=1)
normalized_tf = X / document_lengths[:, None]


sum_normalized_tf = normalized_tf.sum(axis=0)


tf_df = pd.DataFrame({'term': terms, 'normalized_tf': sum_normalized_tf})


sorted_tf_df = tf_df.sort_values(by='normalized_tf', ascending=False)

N = 20
top_N_terms = sorted_tf_df.head(N)['term'].tolist()


X_percentage = 0.85
term_document_counts = (X > 0).sum(axis=0)
terms_to_remove = [term for term, count in zip(terms, term_document_counts) if count / len(documents) > X_percentage]


custom_stop_words = set(top_N_terms).union(set(terms_to_remove))

custom_stop_words

{'10',
 '12',
 'clinical',
 'compared',
 'control',
 'group',
 'groups',
 'intervention',
 'mean',
 'mg',
 'months',
 'patients',
 'placebo',
 'randomized',
 'significant',
 'significantly',
 'study',
 'treatment',
 'trial',
 'vs'}

In [9]:
vectorizer_model = CountVectorizer(ngram_range=(1, 1), stop_words=list(custom_stop_words))
representation_model = KeyBERTInspired()

topic_model = BERTopic(vectorizer_model=vectorizer_model, representation_model=representation_model)

topics, probs = topic_model.fit_transform(df['abstract_text'])




In [10]:
topic_model.visualize_topics()

In [11]:
topic_model.visualize_barchart()

In [12]:
topic_model.visualize_term_rank()

In [13]:
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel


topics = topic_model.get_topics()

texts = [text.split() for text in df['abstract_text']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]


gensim_topics = {key: [word[0] for word in value] for key, value in topics.items()}
lda_topics = list(gensim_topics.values())


cm = CoherenceModel(topics=lda_topics, texts=texts, dictionary=dictionary, coherence='c_v')
coherence = cm.get_coherence()

print("Average topic coherence:", coherence)


Average topic coherence: 0.48833935516211824


In [14]:
topic_model.get_topics()

{-1: [('health', 0.2122725),
  ('diet', 0.17533791),
  ('healthy', 0.16912192),
  ('results', 0.14456216),
  ('baseline', 0.13980891),
  ('surgery', 0.1396915),
  ('test', 0.13542545),
  ('serum', 0.13075037),
  ('participants', 0.11425336),
  ('differences', 0.107005306)],
 0: [('chemotherapy', 0.55313295),
  ('cisplatin', 0.40645534),
  ('radiotherapy', 0.39375353),
  ('cancer', 0.3382009),
  ('carcinoma', 0.31546038),
  ('tumors', 0.29032543),
  ('tumor', 0.29031295),
  ('doxorubicin', 0.26492035),
  ('prognostic', 0.2610368),
  ('efficacy', 0.25008392)],
 1: [('analgesia', 0.648559),
  ('analgesic', 0.5897387),
  ('anesthetic', 0.5578097),
  ('anesthesia', 0.5446675),
  ('ropivacaine', 0.5430727),
  ('anaesthesia', 0.53632426),
  ('bupivacaine', 0.52388483),
  ('epidural', 0.43912232),
  ('postoperative', 0.41918176),
  ('morphine', 0.3867536)],
 2: [('lumbar', 0.4177866),
  ('physiotherapy', 0.41605738),
  ('exercise', 0.3669563),
  ('disability', 0.35390192),
  ('spinal', 0.34447

In [15]:
document = topic_model.get_document_info(df['abstract_text'])

In [16]:
document.head()

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,Plasmodium falciparum malaria is treated with ...,-1,-1_health_diet_healthy_results,"[health, diet, healthy, results, baseline, sur...",[Perioperative myocardial ischemia is the sing...,health - diet - healthy - results - baseline -...,0.0,False
1,"Metformin ( MF ) ( 1,1-dimethylbiguanide HCl )...",-1,-1_health_diet_healthy_results,"[health, diet, healthy, results, baseline, sur...",[Perioperative myocardial ischemia is the sing...,health - diet - healthy - results - baseline -...,0.0,False
2,proton pump inhibitors ( PPIs ) block the H + ...,17,17_pylori_ulcer_gastritis_lansoprazole,"[pylori, ulcer, gastritis, lansoprazole, gastr...",[Acid hyposecretion may enhance Helicobacter p...,pylori - ulcer - gastritis - lansoprazole - ga...,0.621003,False
3,The aim of this study was to determine the eff...,47,47_sildenafil_erectile_erection_impotence,"[sildenafil, erectile, erection, impotence, in...",[Men with erectile dysfunction ( ED ) often ha...,sildenafil - erectile - erection - impotence -...,1.0,False
4,To demonstrate that 5 days of treatment with a...,5,5_antibiotics_antibiotic_ciprofloxacin_infections,"[antibiotics, antibiotic, ciprofloxacin, infec...",[Viral respiratory infections are common world...,antibiotics - antibiotic - ciprofloxacin - inf...,0.742317,False


In [17]:
document_and_proba = document[['Document', 'Topic', 'Probability']]

In [18]:
document_and_proba

Unnamed: 0,Document,Topic,Probability
0,Plasmodium falciparum malaria is treated with ...,-1,0.000000
1,"Metformin ( MF ) ( 1,1-dimethylbiguanide HCl )...",-1,0.000000
2,proton pump inhibitors ( PPIs ) block the H + ...,17,0.621003
3,The aim of this study was to determine the eff...,47,1.000000
4,To demonstrate that 5 days of treatment with a...,5,0.742317
...,...,...,...
1902,Acute mucositis is a dose-limiting toxicity of...,0,1.000000
1903,Nausea and vomiting can occur in Parkinson 's ...,-1,0.000000
1904,Disturbances in rest-activity rhythm are promi...,22,0.960780
1905,Three local anesthetics are commonly used for ...,1,1.000000


In [19]:
topics = topic_model.get_topics()

topic_df = pd.DataFrame({topic_id: [word for word, _ in words] for topic_id, words in topics.items()})

col_rename = {topic_id: f"{topic_id}" for topic_id in topic_df.columns}
topic_df.rename(columns=col_rename, inplace=True)


In [20]:
topic_df

Unnamed: 0,-1,0,1,2,3,4,5,6,7,8,...,43,44,45,46,47,48,49,50,51,52
0,health,chemotherapy,analgesia,lumbar,intraocular,estrogen,antibiotics,antidepressant,antihypertensive,respiratory,...,stroke,dietary,hepatitis,nicotine,sildenafil,supplementation,remifentanil,metabolic,vaccines,migraine
1,diet,cisplatin,analgesic,physiotherapy,cataract,endometrial,antibiotic,depressive,hypertension,endexpiratory,...,infarction,cholesterol,hcv,smoking,erectile,micronutrients,anaesthesia,glucose,vaccine,rizatriptan
2,healthy,radiotherapy,anesthetic,exercise,retinal,hormone,ciprofloxacin,depression,angiotensinconverting,intubation,...,ischemic,diet,antiviral,tobacco,erection,iron,anesthesia,insulin,immunization,headache
3,results,cancer,anesthesia,disability,glaucoma,endometriosis,infections,fluoxetine,hypertensive,ventilator,...,ischemia,lipids,ribavirin,smoker,impotence,ferritin,remifentanilketamine,diet,vaccination,sumatriptan
4,baseline,carcinoma,ropivacaine,spinal,macular,menstrual,moxifloxacin,paroxetine,angiotensin,ventilation,...,thrombolysis,lipid,ribavirintreated,smokers,intercourse,vitamin,lidocaine,hyperinsulinaemia,immunogenicity,headaches
5,surgery,tumors,anaesthesia,exercises,ocular,estradiol,antimicrobial,psychotherapy,systolic,ventilated,...,hemorrhage,lipoprotein,hbv,cigarettes,ejaculation,anemia,propofollidocaineketamine,obese,pneumococcal,zolmitriptan
6,test,tumor,bupivacaine,spine,corneal,uterine,infection,anxiety,diastolic,airway,...,aneurysm,fatty,hepatic,quit,erections,anaemia,propofol,adipose,immunogenic,50mg
7,serum,doxorubicin,epidural,health,intravitreal,ovarian,azithromycin,therapy,betaadrenergic,lung,...,cerebral,fats,hsv2,smoked,sexual,zinc,intubation,obesity,antigen,tablets
8,participants,prognostic,postoperative,postural,acuity,postmenopausal,levofloxacin,medication,metoprolol,pulmonary,...,intracerebral,fat,interferon,smokeless,testosterone,haemoglobin,relaxants,glycaemic,antigens,10mg
9,differences,efficacy,morphine,pain,postoperative,contraceptive,vancomycin,depressed,inhibitor,oxygenation,...,aneurysmal,cardiovascular,liver,relapse,arousal,deficiency,ketamine,appetite,hepatitis,100mg


In [21]:
document_and_proba['Topic_name'] = document_and_proba['Topic'].apply(lambda topic_id: topics.get(int(topic_id), [])[0][0]) #for top word (topic main name)
#document_and_proba['Topic_name'] = document_and_proba['Topic'].apply(lambda topic_id: ', '.join([word for word, _ in topics.get(int(topic_id), [])])) #for all words with respect to the topic


In [22]:
doc_with_abstract_id_prob_and_topic_name = document_and_proba.set_index(df.index)

In [23]:
doc_with_abstract_id_prob_and_topic_name.sample(20)

Unnamed: 0_level_0,Document,Topic,Probability,Topic_name
abstract_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
26118143,To evaluate blood pressure ( BP ) control util...,-1,0.0,health
10748957,Conventional management of partial thickness f...,40,0.914818,ulcer
24045792,The mother is an important mediator to the inf...,12,0.508482,neonatal
25624368,We tested the hypothesis that observationally ...,37,1.0,hypercholesterolemia
23241989,"DTaP-IPV-Hib-HepB , an investigational hexaval...",51,1.0,vaccines
19809292,Morbidly obese patients show impaired pulmonar...,8,1.0,respiratory
25448628,The National Institute for Health and Care Exc...,-1,0.0,health
16636217,"Adipose-derived cytokines , including tumor ne...",-1,0.0,health
18696099,"N-chlorotaurine ( NCT ) , an endogenous mild a...",3,0.639594,intraocular
18835506,Cardiac surgery provokes an inflammatory respo...,18,1.0,thrombosis


In [25]:
topic_model.get_topic_freq()

Unnamed: 0,Topic,Count
0,-1,392
20,0,96
24,1,91
6,2,81
14,3,79
40,4,57
3,5,47
10,6,45
21,7,44
11,8,42
