### Installing Required Libraries

In [None]:
!pip install pandas

# Library needed to read the PDF file containing conference proceedings
#!pip install textract
!pip install PyPDF2

# Library required for topic modeling
!pip install bertopic

# Library needed for text pre-processing
!pip install nltk

### Importing Required Libraries

In [None]:
import pandas as pd

# Importing Packages
#import textract

# Importing function to read PDF file contents
from PyPDF2 import PdfFileReader
from PyPDF2 import PdfReader

# Importing packages for topic modelling and removing stop words
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

import nltk
# Only the Punkt sentence-tokenizer data is needed by sent_tokenize/word_tokenize;
# downloading 'all' would fetch every NLTK corpus and model.
# Newer NLTK releases look for 'punkt_tab', older ones for 'punkt', so fetch both.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)
from nltk.tokenize import word_tokenize, sent_tokenize

In [19]:
# Allow up to 400 rows to be rendered when a DataFrame is displayed
pd.set_option("display.max_rows", 400)

### Reading the Input File (PDF File) and Extracting Text

In [20]:
# Read the conference-proceedings PDF and collect the text of every page in `text`.
# PdfReader / .pages / extract_text() are used instead of PdfFileReader / numPages /
# getPage() / extractText(), which PyPDF2 3.0 removed.

filename = 'INFORMS_2022.pdf'

# A PDF is a binary file, so it must be opened in 'rb' mode; the context manager
# closes the handle once all pages have been read.
with open(filename, 'rb') as pdfFileObj:
    # strict=False keeps PyPDF2 lenient about correctable problems in the PDF
    pdfReader = PdfReader(pdfFileObj, strict=False)

    num_pages = len(pdfReader.pages)
    print(num_pages)

    # Extract the text page by page while the file is still open
    page_texts = [pageObj.extract_text() or '' for pageObj in pdfReader.pages]

# Join once at the end instead of growing a string with += inside a loop
text = ''.join(page_texts)

1072


### Text Preprocessing - Sentence Tokenization and Stop-Word Removal

In [21]:
# Split the extracted text into sentences; each sentence is later passed to the
# topic model as one document.
tokens = sent_tokenize(text)

# Vectorizer configured to ignore English stop words. It is handed to the topic
# model in the training step; the sentences in `tokens` themselves are not modified.
vectorizer_model = CountVectorizer(stop_words="english")

### Topic Modeling - Training the Topic Model

In [22]:
# Configure the topic model:
#   vectorizer_model        - the stop-word-aware CountVectorizer built during preprocessing
#   nr_topics="auto"        - automatically reduce the number of generated topics
#   calculate_probabilities - compute, for every document, the probabilities of all topics
# NOTE(review): no random seed is set, so topic ids and counts may differ between runs -
# confirm whether reproducible output is required.
topic_model = BERTopic(
    vectorizer_model=vectorizer_model,
    nr_topics="auto",
    calculate_probabilities=True,
)

# Fit on the tokenized sentences.
#   topics - one-to-one mapping of each input to its modeled topic (cluster)
#   probs  - topic probabilities for each input
topics, probs = topic_model.fit_transform(tokens)

In [None]:
# There is one topic id and one probability row per input sentence, so print only
# the first few instead of dumping the complete results into the notebook output.
print(topics[:10], probs[:10])

### Extraction of Topics From The Model

In [24]:
# Table of topic ids ("Topic") with the number of documents assigned to each ("Count");
# topic -1 collects the outliers (documents that couldn't be clustered)
topic_model.get_topic_freq()

Unnamed: 0,Topic,Count
0,-1,11209
1,0,7955
2,1,167
3,2,157
4,3,150
5,4,146
6,5,137
7,6,137
8,7,132
9,8,130


In [25]:
# Viewing the topics
# After generating topics and their probabilities, we can inspect the topics that were generated.
# Topic -1 refers to all outliers (documents that couldn't be clustered) and should typically be ignored.
# "Count" is the number of documents per topic.
# "Name" combines the topic id with the words that best represent the topic's cluster.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,11209,-1_university_model_data_learning
1,0,7955,0_university_session_contact_problem
2,1,167,1_integer_mixedinteger_programming_mixed
3,2,157,2_privacy_differential_private_privacypreserving
4,3,150,3_blockchain_cryptocurrency_cryptocurrencies_m...
5,4,146,4_crowdfunding_donors_fundraising_donation
6,5,137,5_quantum_qubo_computing_annealing
7,6,137,6_fairness_fair_notions_measures
8,7,132,7_drone_drones_delivery_truck
9,8,130,8_bandit_bandits_contextual_multiarmed


In [26]:
# Top terms of topic 0, returned as (word, score) pairs
topic_model.get_topic(0)

[('university', 0.0072460775010985065),
 ('session', 0.006575720406948091),
 ('contact', 0.005764535929597535),
 ('problem', 0.00530293632243474),
 ('chair', 0.005236868976896552),
 ('optimization', 0.00509061894460121),
 ('model', 0.004972923962020461),
 ('study', 0.004742708602807538),
 ('data', 0.004734583959547473),
 ('network', 0.004521650513822912)]

### Visualization of Topics

In [27]:
# Visualizing the topics on an intertopic distance map
topic_model.visualize_topics()

In [28]:
# Bar charts of the top 10 keywords for every topic found by the model.
# The number of topics is read from the fitted model (outlier topic -1 excluded)
# rather than hard-coded, because it changes whenever the model is retrained.
topic_freq = topic_model.get_topic_freq()
n_topics = int((topic_freq["Topic"] != -1).sum())
topic_model.visualize_barchart(width=280, height=330, n_words=10, top_n_topics=n_topics)


In [29]:
# Visualizing the topic hierarchy.
# No top_n_topics limit is passed, so the number of topics shown is left to the library default.
topic_model.visualize_hierarchy()

In [30]:
# Visualizing topic similarity (determining which topics are related).
# Called without n_clusters/width/height, so the library defaults are used.
topic_model.visualize_heatmap()

In [31]:
# Visualize the distribution of topic probabilities for a single document.
# probs[0] is the probability row of the first document (sentence) passed to fit_transform.
topic_model.visualize_distribution(probs[0])

In [32]:
# Visualize how term scores decline with term rank within the topics
topic_model.visualize_term_rank()

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=69f4f169-8014-4ed3-8b68-7cae4197e215' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>