# **Topic Modelling for Glioma, Leukaemia, and Retinoblastoma**








In [1]:

!pip install session-info
!pip install pycaret[full] --quiet
!pip install bertopic --quiet
!pip install --upgrade numpy
!pip install bertopic[visualization] --quiet
!pip install session-info
 


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.8.0 requires tf-estimator-nightly==2.8.0.dev2021122109, which is not installed.
tensorflow 2.8.0 requires numpy>=1.20, but you have numpy 1.19.5 which is incompatible.
hdbscan 0.8.28 requires numpy>=1.20, but you have numpy 1.19.5 which is incompatible.
google-colab 1.0.0 requires ipykernel~=4.10, but you have ipykernel 6.13.0 which is incompatible.
google-colab 1.0.0 requires ipython~=5.5.0, but you have ipython 7.32.0 which is incompatible.
google-colab 1.0.0 requires requests~=2.23.0, but you have requests 2.27.1 which is incompatible.
google-colab 1.0.0 requires tornado~=5.1.0; python_version >= "3.0", but you have tornado 6.1 which is incompatible.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.
bertopic 0.9.4 requires numpy>=1.20.0, but you have n

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pycaret.nlp import *
from bertopic import BERTopic
import session_info


from wordcloud import WordCloud

In [3]:
glioma = pd.read_csv('/content/glioma.csv')
leukaemia = pd.read_csv('/content/leukaemia.csv')
retino = pd.read_csv('/content/retinoblastoma.csv')

In [4]:
glioma.head()

Unnamed: 0,abstract
0,Surgeons have considered extending the resecti...
1,Following the identification of key molecular ...
2,To evaluate whether amide proton transfer (APT...
3,Diffusion-weighted imaging (DWI) plays an impo...
4,MRI is the standard imaging modality used for ...


In [5]:
leukaemia.head()

Unnamed: 0,abstract
0,Acute lymphoid leukaemia (ALL) is the commones...
1,Previous studies on the putative role of aller...
2,Although inferior outcomes of children with Do...
3,Treatment for adults with acute lymphoblastic ...
4,Acute Lymphoblastic Leukemia (ALL) is a neopla...


In [6]:
retino.head()

Unnamed: 0,abstract
0,To report three-decade changes of clinical cha...
1,"Retinoblastoma, also known as ocular cancer, u..."
2,"Retinoblastoma, often referred to as eye cance..."
3,We and others have shown that aberrant activat...
4,The bulb of Eleutherine bulbosa (Mill.) Urb. i...


In [7]:
print('glioma dataset: ', glioma.shape)
print('leukaemia dataset: ', leukaemia.shape)
print('retino dataset: ', retino.shape)



glioma dataset:  (4778, 1)
leukaemia dataset:  (4413, 1)
retino dataset:  (4418, 1)


In [8]:
print('glioma dataset null values: ', glioma.isnull().values.any())
print('leukaeamia dataset null values: ', leukaemia.isnull().values.any())
print('glioma dataset null values: ', retino.isnull().values.any())


glioma dataset null values:  True
leukaeamia dataset null values:  True
glioma dataset null values:  True


In [9]:
glioma.dropna(axis = 0, inplace = True)
leukaemia.dropna(axis = 0, inplace = True)
retino.dropna(axis = 0, inplace = True)

In [10]:
print('glioma dataset null values after drop: ', glioma.isnull().values.any())
print('leukaeamia dataset null values after drop: ', leukaemia.isnull().values.any())
print('glioma dataset null values after drop: ', retino.isnull().values.any())

glioma dataset null values after drop:  False
leukaeamia dataset null values after drop:  False
glioma dataset null values after drop:  False


# **Glioma Model**

In [11]:
glioma.head(10)

Unnamed: 0,abstract
0,Surgeons have considered extending the resecti...
1,Following the identification of key molecular ...
2,To evaluate whether amide proton transfer (APT...
3,Diffusion-weighted imaging (DWI) plays an impo...
4,MRI is the standard imaging modality used for ...
5,The BRAFV600E point mutation plays a key role ...
6,Grading of brain gliomas is of clinical import...
7,Anti-inflammatory effect of vitamin D (VD) cou...
8,Newly emerged molecular markers in gliomas pro...
9,"According to the stem cell theory, two neuroge..."


In [12]:
glioma_experiment = setup(glioma,
                          target = 'abstract',
                          session_id = 451)

Description,Value
session_id,451
Documents,4777
Vocab Size,13749
Custom Stopwords,False


In [13]:
models()

Unnamed: 0_level_0,Name,Reference
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
lda,Latent Dirichlet Allocation,gensim/models/ldamodel
lsi,Latent Semantic Indexing,gensim/models/lsimodel
hdp,Hierarchical Dirichlet Process,gensim/models/hdpmodel
rp,Random Projections,gensim/models/rpmodel
nmf,Non-Negative Matrix Factorization,sklearn.decomposition.NMF


In [14]:
rp_model = create_model('rp')

In [15]:
print(rp_model)

RpModel(num_terms=13749, num_topics=4)


In [16]:

# Assign model results
rp_results = assign_model(rp_model)
rp_results.head()

Unnamed: 0,abstract,Topic_0,Topic_1,Topic_2,Topic_3,Dominant_Topic
0,surgeon consider extend resection margin well ...,8.5,-1.5,-1.5,6.5,Topic 0
1,follow identification key molecular alteration...,-13.0,1.0,11.0,2.0,Topic 2
2,evaluate use characterize pediatric patient pr...,-12.0,-29.0,11.0,-3.0,Topic 2
3,image assessment however derive parameter mode...,-0.5,-12.5,7.5,-1.5,Topic 2
4,modality use diagnosis treatment plan post tre...,-1.5,-5.5,-10.5,0.5,Topic 3


In [17]:
#Will identify the top 20 topics of the dataset
dataset = get_topics(rp_results, 'abstract', model = 'rp', num_topics = 20)

Description,Value
session_id,3738
Documents,4778
Vocab Size,11298
Custom Stopwords,False


In [18]:
dataset.fillna(rp_results.mean())

Unnamed: 0,abstract,Topic_0,Topic_1,Topic_2,Topic_3,Dominant_Topic,Topic_0.1,Topic_1.1,Topic_2.1,Topic_3.1,...,Topic_11,Topic_12,Topic_13,Topic_14,Topic_15,Topic_16,Topic_17,Topic_18,Topic_19,Dominant_Topic.1
0,surgeon consider extend resection margin outco...,8.5,-1.5,-1.5,6.5,Topic 0,0.223607,2.906888,-0.670820,-2.459675,...,2.012461,1.118034,-1.118034,-1.118034,6.484597,2.236066e-01,2.012461,-0.223607,-2.459675,Topic 8
1,follow identification molecular alteration pro...,-13.0,1.0,11.0,2.0,Topic 2,-0.894427,0.447214,1.341641,-4.024922,...,2.683282,0.894427,-2.683282,-1.341641,7.155417,1.788854e+00,-0.447214,-0.447213,1.788854,Topic 15
2,evaluate use characterize pediatric patient pr...,-12.0,-29.0,11.0,-3.0,Topic 2,-3.577709,2.236068,5.813777,1.341640,...,3.130495,2.683282,4.024922,-5.366564,9.838698,4.472132e-01,-1.788854,-10.733125,-4.472137,Topic 8
3,image assessment however derive parameter mode...,-0.5,-12.5,7.5,-1.5,Topic 2,-5.590170,6.484597,5.142956,1.565247,...,1.118034,-0.223607,5.590170,-4.695743,6.931810,6.708202e-01,2.906888,-6.931810,-3.354103,Topic 15
4,modality use diagnosis treatment plan post tre...,-1.5,-5.5,-10.5,0.5,Topic 3,-1.118034,2.012461,4.248529,-2.906889,...,0.670820,2.012461,-2.012461,-0.670821,7.379025,2.906888e+00,5.142957,-2.012461,0.223607,Topic 15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4773,image show effective tool analysis discriminat...,-5.0,-15.0,4.0,-19.0,Topic 2,0.670820,-3.354102,-2.012461,-7.379024,...,3.801315,3.354102,1.118033,2.012461,6.931812,9.615093e+00,6.037383,-8.720662,-6.931812,Topic 16
4774,clinical significance decline free reference r...,1.5,1.5,4.5,-5.5,Topic 2,-0.670820,6.037382,0.223607,-0.223607,...,-2.012461,-0.223607,-2.012461,4.695741,-1.565248,2.012461e+00,2.459675,-2.459675,-0.670820,Topic 1
4775,function include language calculation self mot...,4.5,-2.5,-11.5,3.5,Topic 0,1.788854,0.894427,0.894427,-3.130495,...,1.788854,1.788854,-1.341641,-0.894427,-0.894427,-2.980232e-08,0.894427,2.236068,-0.447213,Topic 10
4776,characterize aggressive local growth pattern p...,3.5,8.5,-11.5,15.5,Topic 3,2.012461,-2.906889,-3.801316,1.565248,...,1.118034,0.670820,-2.012461,2.906888,-2.012461,-2.906888e+00,0.223606,3.801316,0.670820,Topic 6


# **Topic Modelling with BerTopic**

In [19]:



# create model 
 
model = BERTopic(verbose=True)
 
#convert to list 
docs = glioma['abstract'].to_list()
 
topics, probabilities = model.fit_transform(docs)

Batches:   0%|          | 0/150 [00:00<?, ?it/s]

2022-04-16 12:30:17,980 - BERTopic - Transformed documents to Embeddings
2022-04-16 12:30:44,398 - BERTopic - Reduced dimensionality with UMAP
2022-04-16 12:30:44,625 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [20]:
#Select Top 50 Topics
model.get_topic_freq().head(50)
model.get_topic(50)


[('kurtosis', 0.07455368524034046),
 ('dki', 0.05792902485537294),
 ('diffusion', 0.04431173340490156),
 ('mk', 0.03558022221593483),
 ('metrics', 0.03037837645110476),
 ('parameters', 0.022258190361098863),
 ('fa', 0.02169953399126811),
 ('mean', 0.020643893485974334),
 ('diffusional', 0.01767338648471042),
 ('md', 0.01742137998136389)]

In [21]:
#Frequency Visualization
model.visualize_topics()

In [22]:
#Barchart Visualization
model.visualize_barchart()

In [23]:
#Topic Similarity
model.visualize_heatmap()

In [24]:
#Predict New Topics of any document...just add your dataframe when needed
#topics, probs = model.transform()

# **Leukaemia Model**

In [25]:
leukaemia.head(10)

Unnamed: 0,abstract
0,Acute lymphoid leukaemia (ALL) is the commones...
1,Previous studies on the putative role of aller...
2,Although inferior outcomes of children with Do...
3,Treatment for adults with acute lymphoblastic ...
4,Acute Lymphoblastic Leukemia (ALL) is a neopla...
5,Acute lymphoblastic leukaemia develops in both...
6,Gender-specific differences in survival by cli...
7,The prognostic value of minimal residual disea...
8,T-cell acute lymphoblastic leukemia is a relat...
9,To investigate the clinical characteristics of...


In [26]:
leukaemia_experiment = setup(leukaemia,
                          target = 'abstract',
                          session_id = 451)

Description,Value
session_id,451
Documents,4409
Vocab Size,12703
Custom Stopwords,False


In [27]:
# create model 
 
model = BERTopic(verbose=True)
 
#convert to list 
docs = leukaemia['abstract'].to_list()
 
topics, probabilities = model.fit_transform(docs)

Batches:   0%|          | 0/138 [00:00<?, ?it/s]

2022-04-16 12:32:29,327 - BERTopic - Transformed documents to Embeddings
2022-04-16 12:32:42,591 - BERTopic - Reduced dimensionality with UMAP
2022-04-16 12:32:42,809 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [28]:
#Select Top 50 Topics
model.get_topic_freq().head(50)
model.get_topic(50)

[('osteonecrosis', 0.05371599616967559),
 ('on', 0.03866459156267535),
 ('symptomatic', 0.02504233145039871),
 ('mri', 0.021849417539431766),
 ('head', 0.019889738113691928),
 ('femoral', 0.018153634717289487),
 ('joints', 0.01784240567775489),
 ('children', 0.015397408392352563),
 ('incidence', 0.015267962655697269),
 ('hips', 0.014988535849477399)]

In [29]:
#Frequency Visualization
model.visualize_topics()

In [30]:
#Barchart Visualization
model.visualize_barchart()

In [31]:
#Topic Similarity
model.visualize_heatmap()

In [32]:
#Predict New Topics of any document...just add your dataframe when needed
#topics, probs = model.transform()

# **Retinoblastoma Model**

In [33]:
retino.head(10)

Unnamed: 0,abstract
0,To report three-decade changes of clinical cha...
1,"Retinoblastoma, also known as ocular cancer, u..."
2,"Retinoblastoma, often referred to as eye cance..."
3,We and others have shown that aberrant activat...
4,The bulb of Eleutherine bulbosa (Mill.) Urb. i...
5,This study attempted to estimate the impact of...
6,To determine the risk of patients with an earl...
7,This case report presents two patients affecte...
8,Intra-arterial chemotherapy (IAC) represents a...
9,Long non-coding RNAs (lncRNAs) have been shown...


In [34]:
retino_experiment = setup(leukaemia,
                          target = 'abstract',
                          session_id = 451)

Description,Value
session_id,451
Documents,4409
Vocab Size,12703
Custom Stopwords,False


In [35]:
# create model 
 
model = BERTopic(verbose=True)
 
#convert to list 
docs = leukaemia['abstract'].to_list()
 
topics, probabilities = model.fit_transform(docs)

Batches:   0%|          | 0/138 [00:00<?, ?it/s]

2022-04-16 12:34:21,990 - BERTopic - Transformed documents to Embeddings
2022-04-16 12:34:35,149 - BERTopic - Reduced dimensionality with UMAP
2022-04-16 12:34:35,364 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [36]:
#Select Top 50 Topics
model.get_topic_freq().head(50)
model.get_topic(50)

[('allogeneic', 0.023173336999210224),
 ('transplantation', 0.020857061988340923),
 ('os', 0.01931048844939952),
 ('patients', 0.01795845118438476),
 ('survival', 0.015701775516315807),
 ('adult', 0.015192230106864383),
 ('hsct', 0.015055667379819766),
 ('skeletal', 0.01417938157566772),
 ('sall', 0.014054698364659782),
 ('remission', 0.013529175322521481)]

In [37]:
#Frequency Visualization
model.visualize_topics()

In [38]:
#Barchart Visualization
model.visualize_barchart()

In [39]:
#Topic Similarity
model.visualize_heatmap()

In [41]:
session_info.show(html=False)


-----
bertopic            0.9.4
gensim              3.6.0
matplotlib          3.5.1
numpy               1.21.6
pandas              1.3.5
plotly              5.5.0
pycaret             2.3.10
seaborn             0.11.2
session_info        1.0.0
wordcloud           1.5.0
-----
IPython             7.32.0
jupyter_client      7.2.2
jupyter_core        4.9.2
notebook            5.3.1
-----
Python 3.7.13 (default, Mar 16 2022, 17:37:17) [GCC 7.5.0]
Linux-5.4.144+-x86_64-with-Ubuntu-18.04-bionic
-----
Session information updated at 2022-04-16 12:39
