### Installing Required Libraries

In [None]:
# Library needed to read the PDF file containing conference proceedings
!pip install PyPDF2
!pip install bertopic

# Library needed for text pre-processing
!pip install nltk

!pip install wordcloud
!pip install matplotlib

!pip install bertopic[flair]
!pip install bertopic[gensim]
!pip install bertopic[spacy]
!pip install bertopic[use]

### Importing Required Libraries

In [None]:
# Importing function to read PDF file contents
from PyPDF2 import PdfFileReader
from bertopic import BERTopic


# Importing functions needed to handle regular expressions, tokenize text (sentence & word), remove stopwords and punctuation, and lemmatize (text pre-processing)
import re
import nltk
# Download only the NLTK resources this notebook uses instead of the complete
# 'all' collection: tokenizer models for sent_tokenize/word_tokenize, the
# stop-word lists, and the WordNet data used by WordNetLemmatizer.
# NOTE(review): both 'punkt' and 'punkt_tab' are requested because the tokenizer
# resource name differs between NLTK releases — confirm against the installed version.
nltk.download(['punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'])
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import string
from nltk.stem.wordnet import WordNetLemmatizer

# Importing Functions for Generating Word Cloud
from wordcloud import WordCloud


### Reading the Input File (PDF File) and Extracting Text

In [None]:
import io

filename = 'INFORMS_merged.pdf'

# Read the whole PDF into memory inside a `with` block so the operating-system
# file handle is closed immediately. The reader is used again in later cells,
# so it is given an in-memory stream that stays valid after the file is closed.
with open(filename, 'rb') as pdf_file:
    pdfFileObj = io.BytesIO(pdf_file.read())

# strict=False is passed to the reader exactly as in the original call.
pdfReader = PdfFileReader(pdfFileObj, strict=False)

In [None]:
# Getting the number of pages in the file; this value bounds the page loop used when the text is extracted
num_pages = pdfReader.numPages 
print(num_pages)

1437


In [None]:
# Extracting the Text
# Read every page of the PDF in order and collect the extracted text of each one.
# The pieces are joined once at the end rather than growing a single string page
# by page inside a manually counted while-loop.
page_texts = [
    pdfReader.getPage(page_index).extractText()
    for page_index in range(num_pages)
]

# The two-space prefix keeps `text` identical to the result of page-by-page
# concatenation onto a buffer that starts as two spaces.
text = '  ' + ''.join(page_texts)

### Text Preprocessing

In [None]:
# Performing Sentence Tokenization: the extracted text is split into a list,
# with one entry per sentence found by sent_tokenize
tokens= sent_tokenize(text)
# Display the container type of the result (a list)
type(tokens)
#len(tokens)

list

In [None]:
# Lower-casing every sentence produced by the sentence tokenizer
lower_case_tokens = list(map(str.lower, tokens))

# Report how many sentences there are
print(len(lower_case_tokens))

40707


In [None]:
# Removing stopwords
stop = set(stopwords.words('english'))

# Extracting Tokens Without Stopwords
# Each element of lower_case_tokens is a whole sentence, so it has to be split
# into words before the membership test. Comparing a full sentence against `stop`
# could only ever drop a sentence that is itself exactly one stopword.
sentences_as_words = (
    [word for word in word_tokenize(sentence) if word not in stop]
    for sentence in lower_case_tokens
)

# Re-join the kept words so that every document remains a single string, and
# skip sentences that consisted of stopwords only (they would be empty).
tokens_without_stopwords = [' '.join(words) for words in sentences_as_words if words]

In [None]:
lemmatizer = WordNetLemmatizer()

# Terms that are excluded from every document after lemmatization
common_word = ['informs', 'session', 'annual', 'chair']

# Each element of tokens_without_stopwords is a sentence-level string, so both
# the lemmatizer and the common-word filter are applied word by word. Applying
# them to the whole sentence would compare the full sentence against single words.
lemma = []
for sentence in tokens_without_stopwords:
    kept_words = [
        base_form
        for base_form in (lemmatizer.lemmatize(word) for word in word_tokenize(sentence))
        if base_form not in common_word
    ]
    # Keep one string per document; drop documents left with no words at all.
    if kept_words:
        lemma.append(' '.join(kept_words))

### Topic Modeling - Training the Topic Model

In [None]:
# Training the Model
# BERTopic is created with no arguments and fitted on the pre-processed documents in `lemma`;
# fit_transform returns two values, stored here as `topics` and `probs`.
# NOTE(review): no random seed is set in this notebook, so the resulting topics may differ
# between runs — confirm whether reproducible topics are required.
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(lemma)

Downloading: 100%|██████████| 1.18k/1.18k [00:00<00:00, 1.35MB/s]
Downloading: 100%|██████████| 190/190 [00:00<00:00, 206kB/s]
Downloading: 100%|██████████| 10.6k/10.6k [00:00<00:00, 5.05MB/s]
Downloading: 100%|██████████| 612/612 [00:00<00:00, 376kB/s]
Downloading: 100%|██████████| 116/116 [00:00<00:00, 97.1kB/s]
Downloading: 100%|██████████| 39.3k/39.3k [00:00<00:00, 23.8MB/s]
Downloading: 100%|██████████| 90.9M/90.9M [00:01<00:00, 75.4MB/s]
Downloading: 100%|██████████| 53.0/53.0 [00:00<00:00, 36.5kB/s]
Downloading: 100%|██████████| 112/112 [00:00<00:00, 76.2kB/s]
Downloading: 100%|██████████| 466k/466k [00:00<00:00, 64.5MB/s]
Downloading: 100%|██████████| 350/350 [00:00<00:00, 284kB/s]
Downloading: 100%|██████████| 13.2k/13.2k [00:00<00:00, 7.43MB/s]
Downloading: 100%|██████████| 232k/232k [00:00<00:00, 33.1MB/s]
Downloading: 100%|██████████| 349/349 [00:00<00:00, 205kB/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disablin

### Extraction of Topics From The Model

In [None]:
# Viewing the Topics
# After fitting, list the generated topics in a table with one row per topic.
# Topic -1 collects the outliers (documents that couldn't be clustered) and should typically be ignored.
# "Count" refers to the number of documents per topic.
# "Name" comprises the topic id and the words that best represent the topic's cluster.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,19660,-1_the_and_to_of
1,0,649,0_convex_nonconvex_gradient_dual
2,1,377,1_games_game_nash_equilibrium
3,2,371,2_network_networks_nodes_node
4,3,329,3_charging_electric_ev_evs
...,...,...,...
450,449,10,449_qubo_maximumcut_nary_unconstrained
451,450,10,450_unit_nursepatient_remote_patients
452,451,10,451_tokyo_japan_nonhubs_hubandspoke
453,452,10,452_lead_mance_perfor_indicator


In [None]:
# Get the top terms for topic 0; the result is a list of (term, score) pairs
topic_model.get_topic(0)

[('convex', 0.01723190687758933),
 ('nonconvex', 0.012823895173328381),
 ('gradient', 0.010201583711493148),
 ('dual', 0.008350246141604768),
 ('proximal', 0.008293549249546193),
 ('primaldual', 0.008048482705853848),
 ('convergence', 0.007596578764003389),
 ('problems', 0.007582393830395507),
 ('optimization', 0.007464187800123472),
 ('descent', 0.007100409552700825)]

### Visualization of Topics

In [None]:
# Visualizing topics on the intertopic distance map
topic_model.visualize_topics()

In [None]:
# Visualizing topics as bar charts of their top 10 constituent keywords (n_words=10),
# drawn with width=280 and height=330
# NOTE(review): top_n_topics is not passed here, so the library default decides how many
# topics are drawn — confirm this matches the intent.
topic_model.visualize_barchart(width=280, height=330,n_words=10)

In [None]:
# Visualizing the topic hierarchy, limited to 50 topics (top_n_topics=50)
topic_model.visualize_hierarchy(top_n_topics=50)

In [None]:
# Visualizing bar charts for the top 10 topics (top_n_topics=10)
topic_model.visualize_barchart(top_n_topics=10)

In [None]:
# Visualizing topic similarity as a heatmap (to see which topics are related),
# requested with n_clusters=20 and a 1000 x 1000 figure size
topic_model.visualize_heatmap(n_clusters=20, width=1000, height=1000)

In [None]:
# Visualizing term score decline (term rank plot)
topic_model.visualize_term_rank()

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=69f4f169-8014-4ed3-8b68-7cae4197e215' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>