In [None]:
import sys
import pandas as pd

sys.path.append('../../../')
from utils.preprocessors.diamantopoulos_preprocessor import diamantopoulos_preprocess
from utils.dataloader import get_issues

# Load the issue dataset through the project's dataloader helper.
# NOTE(review): the returned object is indexed like a pandas DataFrame by the
# cells below; its exact schema is not visible from this notebook.
issues = get_issues()

In [None]:
# Keep only the issues whose category is "enhancement".
# NOTE(review): an earlier comment here read "enhancement is 1" — presumably the
# numeric label used for this class elsewhere; confirm against the dataset.
#
# .copy() gives this notebook its own frame: a later cell adds a column to
# `enhancement_issues`, and doing that on a filtered slice of `issues`
# triggers pandas' SettingWithCopyWarning.
enhancement_issues = issues[issues['prop:category'] == "enhancement"].copy()

In [18]:
# Preview the first rows of the filtered frame.
# NOTE(review): the saved output of this cell does not show the columns that
# other cells read ('prop:category', 'title', 'body', 'html_url'), so it looks
# stale — re-run the notebook top to bottom before trusting it.
enhancement_issues.head()

Unnamed: 0,text,predictions,id,repo,host,url,state,created_at
14,Switch from Travis to GitHub Actions We recent...,1,625216324,facebook/infer,Github,https://github.com/facebook/infer/issues/1270,closed,2020-05-26 21:39:31+00:00
22,PeepholeOptimizer usage is awkward The ```soot...,1,692295444,soot-oss/soot,Github,https://github.com/soot-oss/soot/issues/1459,open,2020-09-03 20:11:11+00:00
23,Missing asymmetric visibility checks ### Featu...,1,2763707356,phpstan/phpstan,Github,https://github.com/phpstan/phpstan/issues/12347,open,2024-12-30 21:17:12+00:00
27,[docker] add patch cli to the docker image ###...,1,2461670584,phpstan/phpstan,Github,https://github.com/phpstan/phpstan/issues/11499,closed,2024-08-12 18:47:39+00:00
31,Read @mixin above interfaces ### Feature reque...,1,3016848122,phpstan/phpstan,Github,https://github.com/phpstan/phpstan/issues/12922,open,2025-04-24 10:51:35+00:00


In [None]:
# Preprocess the enhancement issues ahead of clustering.
#
# Each issue is reduced to a single text (title, plus the body when one is
# present) and run through the Diamantopoulos preprocessing helper.
preprocessed_issues = [
    diamantopoulos_preprocess(title + ' ' + body if pd.notna(body) else title)
    for title, body in zip(enhancement_issues['title'], enhancement_issues['body'])
]

# Store the preprocessed form next to the original issue data.
enhancement_issues['preprocessed'] = preprocessed_issues

In [None]:
# bertopic
from bertopic import BERTopic
from umap import UMAP
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance

# Dimensionality reduction applied before clustering; cosine distance is used
# and the seed is pinned via random_state.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    spread=1.0,
    metric="cosine",
    random_state=42,
)

# Additional topic representations, keyed by the name each one is reported under.
representation_models = dict(
    KeyBERTInspired=KeyBERTInspired(),
    MaximalMarginalRelevance=MaximalMarginalRelevance(),
)

topic_model = BERTopic(
    embedding_model='all-MiniLM-L6-v2',
    umap_model=umap_model,
    representation_model=representation_models,
    n_gram_range=(1, 2),
    min_topic_size=20,
    calculate_probabilities=True,
    verbose=True,
)

# One space-joined string per issue.
# assumes every 'preprocessed' entry is a sequence of string tokens — TODO confirm
strings = [' '.join(tokens) for tokens in enhancement_issues['preprocessed']]

topics, probs = topic_model.fit_transform(strings)

2025-06-03 14:56:09,777 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 98/98 [00:35<00:00,  2.78it/s]
2025-06-03 14:56:50,100 - BERTopic - Embedding - Completed ✓
2025-06-03 14:56:50,103 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-03 14:57:31,618 - BERTopic - Dimensionality - Completed ✓
2025-06-03 14:57:31,650 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-03 14:57:32,172 - BERTopic - Cluster - Completed ✓
2025-06-03 14:57:32,204 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-06-03 14:58:13,891 - BERTopic - Representation - Completed ✓


In [21]:
# Reduce outliers: reassign outlier documents using the topic probabilities
# produced by fit_transform.
new_topics = topic_model.reduce_outliers(
    strings,
    topics,
    probabilities=probs,
    strategy='probabilities',
)

# Refresh the topic representations for the new assignment.
# representation_model is passed explicitly because update_topics does not
# reuse the one given to the BERTopic constructor; without it the refreshed
# topics are described by plain c-TF-IDF only and the KeyBERTInspired /
# MaximalMarginalRelevance representations are not recomputed.
# NOTE(review): confirm against the installed BERTopic version.
topic_model.update_topics(
    strings,
    topics=new_topics,
    representation_model=representation_models,
)



In [None]:
# Export a per-topic summary sheet for manual review.

# Topic overview without the representative-docs and name columns, plus each
# topic's share of the enhancement issues (stored as a fraction of the total).
manual_review = topic_model.get_topic_info().drop(columns=['Representative_Docs', 'Name'])
manual_review['percentage'] = manual_review['Count'] / len(enhancement_issues)

# Per-document topic assignment, annotated with the issue URL.
document_topics = topic_model.get_document_info(strings)
document_topics['html_url'] = enhancement_issues['html_url'].tolist()


def get_top_documents_url(topic):
    """Return the rows of `document_topics` for `topic` with the 5 largest 'Probability' values."""
    in_topic = document_topics['Topic'] == topic
    return document_topics.loc[in_topic].nlargest(5, 'Probability')


# One newline-separated list of URLs per topic, in the same order as the summary rows.
top_documents = []
for topic in manual_review['Topic']:
    top_documents.append('\n'.join(get_top_documents_url(topic)['html_url']))
manual_review['top_documents'] = top_documents

# Blank column for the reviewer to fill in.
manual_review['Topic Label (to be filled in by reviewer)'] = ''

manual_review.to_csv(
    '../../../results/csv/topic_modeling/manual_review/enhancements_manual_review.csv',
    index=False,
)

In [None]:
# Save the per-document topic assignments.
# .copy() makes the column subset an independent frame, so adding 'id' and
# 'repo' below does not trigger pandas' SettingWithCopyWarning and never
# touches `document_topics`.
document_topics_df = document_topics[['Topic', 'Probability', 'html_url']].copy()

# Identifying columns, aligned by position with the documents passed to the model.
document_topics_df['id'] = enhancement_issues['id'].tolist()
document_topics_df['repo'] = enhancement_issues['repo'].tolist()

document_topics_df.to_csv('../../../results/csv/topic_modeling/clusters/enhancement_document_topics.csv', index=False)