In [None]:
from pathlib import Path

import pandas as pd

# Load the per-issue predictions; later cells read the 'predictions', 'text',
# 'url', 'id' and 'repo' columns from this frame.
predictions_csv = '../../../results/csv/catiss/catiss_predictions.csv'
predicted_issues = pd.read_csv(predictions_csv)

In [None]:
# Add the directory three levels up to the import path so that the `utils`
# package imported below can be resolved (presumably the repository root —
# confirm). NOTE(review): the relative path is resolved against the kernel's
# working directory, so this assumes the notebook is run from its own folder.
import sys
sys.path.append('../../../')
from utils.preprocessors.diamantopoulos_preprocessor import diamantopoulos_preprocess

In [12]:
# question is defined as a label of 2
QUESTION_LABEL = 2

# Take an explicit copy of the filtered rows: a column is added to
# question_issues further down, and writing into a boolean-mask slice of
# predicted_issues would otherwise raise pandas' SettingWithCopyWarning.
question_issues = predicted_issues[predicted_issues['predictions'] == QUESTION_LABEL].copy()

In [13]:
# Preview the first rows of the question subset as a quick sanity check.
question_issues.head()

Unnamed: 0,text,predictions,id,repo,host,url,state,created_at
4,How to find execution path with real data? Hel...,2,626971967,soot-oss/soot,Github,https://github.com/soot-oss/soot/issues/1362,open,2020-05-29 03:31:13+00:00
18,Suppress linting on iOS? I see in http://fbinf...,2,247892795,facebook/infer,Github,https://github.com/facebook/infer/issues/709,open,2017-08-04 03:23:33+00:00
25,Task spotbugsMain NO-SOURCE I have gradle 8.5/...,2,2081412654,spotbugs/spotbugs,Github,https://github.com/spotbugs/spotbugs/issues/2816,closed,2024-01-15 07:31:23+00:00
30,[java] UnusedPrivateField cannot override igno...,2,730237419,pmd/pmd,Github,https://github.com/pmd/pmd/issues/2876,closed,2020-10-27 08:40:12+00:00
35,Detecting internal accessor methods I am tryin...,2,142332172,facebook/infer,Github,https://github.com/facebook/infer/issues/313,closed,2016-03-21 12:23:34+00:00


In [14]:
# cluster all of the question issues

# preprocess

# perform diamantopoulos preprocessing on the text of every question issue
# (iterating the column directly avoids building a Series per row with iterrows)
preprocessed_issues = [
    diamantopoulos_preprocess(text) for text in question_issues['text']
]

# assign() returns a new DataFrame instead of writing into the filtered slice of
# predicted_issues, so no SettingWithCopyWarning is raised and re-running this
# cell yields the same result.
question_issues = question_issues.assign(preprocessed=preprocessed_issues)

In [15]:
# bertopic
from bertopic import BERTopic
from umap import UMAP
from bertopic.representation import KeyBERTInspired, PartOfSpeech, MaximalMarginalRelevance

# Dimensionality reduction applied to the document embeddings before clustering.
umap_settings = {
    'n_neighbors': 15,
    'n_components': 5,
    'min_dist': 0.0,
    'spread': 1.0,
    'metric': "cosine",   # distances between embeddings use cosine distance
    'random_state': 42,   # fixed seed so repeated runs give the same reduction
}
umap_model = UMAP(**umap_settings)

# Additional topic representations, keyed by the name of the aspect they produce.
representation_models = {
    'KeyBERTInspired': KeyBERTInspired(),
    'PartOfSpeech': PartOfSpeech(),
    'MaximalMarginalRelevance': MaximalMarginalRelevance()
}

topic_model = BERTopic(
    embedding_model='all-MiniLM-L6-v2',
    umap_model=umap_model,
    representation_model=representation_models,
    n_gram_range=(1, 2),
    min_topic_size=20,
    # the probabilities returned by fit_transform are passed to the
    # outlier-reduction step in a later cell
    calculate_probabilities=True,
    verbose=True
)

# Join each preprocessed entry back into one space-separated string per issue
# (presumably each entry is a list of tokens — confirm against the preprocessor).
strings = [' '.join(tokens) for tokens in question_issues['preprocessed']]

topics, probs = topic_model.fit_transform(strings)

2025-06-03 14:56:04,059 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 88/88 [00:35<00:00,  2.51it/s]
2025-06-03 14:56:45,463 - BERTopic - Embedding - Completed ✓
2025-06-03 14:56:45,465 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-03 14:57:25,082 - BERTopic - Dimensionality - Completed ✓
2025-06-03 14:57:25,095 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-03 14:57:25,427 - BERTopic - Cluster - Completed ✓
2025-06-03 14:57:25,452 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-06-03 14:58:11,925 - BERTopic - Representation - Completed ✓


In [16]:
# reduce outliers

# Reassign outlier documents using the topic probabilities computed during
# fit_transform (strategy='probabilities').
new_topics = topic_model.reduce_outliers(strings, topics, probabilities=probs, strategy='probabilities')

# Rebuild the topic representations for the new assignment. update_topics()
# takes its own representation_model argument (default None), so the models
# configured at fit time are passed again; otherwise the KeyBERTInspired /
# PartOfSpeech / MaximalMarginalRelevance representations are not recomputed
# for the reassigned topics.
topic_model.update_topics(
    strings,
    topics=new_topics,
    representation_model=representation_models,
)



In [None]:
# generate csv so that can perform manual review

manual_review = topic_model.get_topic_info()
manual_review = manual_review.drop(columns=['Representative_Docs', 'Name'])
# share of all question issues assigned to each topic, as a fraction (not multiplied by 100)
manual_review['percentage'] = manual_review['Count'] / len(question_issues)

# get topic for every document
document_topics = topic_model.get_document_info(strings)
# attach each issue's URL by position: `strings` was built from question_issues
# in row order, so the two sequences line up one-to-one
document_topics['url'] = question_issues['url'].tolist()

# for each topic, get the 5 highest probability documents
def get_top_documents_url(topic):
    """Return the rows of document_topics for `topic` with the 5 largest 'Probability' values.

    The result is a DataFrame (fewer than 5 rows if the topic has fewer
    documents); callers read its 'url' column.
    """
    return document_topics[document_topics['Topic'] == topic].nlargest(5, 'Probability')

# one newline-separated list of URLs per topic, in the same order as manual_review
top_documents = ['\n'.join(get_top_documents_url(topic)['url']) for topic in manual_review['Topic']]
manual_review['top_documents'] = top_documents

# create a column for the topic name that is empty
manual_review['Topic Label (to be filled in by reviewer)'] = ''

# to_csv does not create missing directories, so make sure the target folder exists
manual_review_csv = Path('../../../results/csv/feature_topic_modeling/questions/questions_manual_review.csv')
manual_review_csv.parent.mkdir(parents=True, exist_ok=True)
manual_review.to_csv(manual_review_csv, index=False)

In [None]:
# save document topics

# assign() builds a new frame from the column subset instead of writing into a
# slice of document_topics, which would raise SettingWithCopyWarning. The id and
# repo values are attached by position, matching the row order of question_issues.
document_topics_df = document_topics[['Topic', 'Probability', 'url']].assign(
    id=question_issues['id'].tolist(),
    repo=question_issues['repo'].tolist(),
)

# to_csv does not create missing directories, so make sure the target folder exists
document_topics_csv = Path('../../../results/csv/feature_topic_modeling/questions/questions_document_topics.csv')
document_topics_csv.parent.mkdir(parents=True, exist_ok=True)
document_topics_df.to_csv(document_topics_csv, index=False)