In [1]:
import pandas as pd
import sys

sys.path.append('../../../')

from utils.dataloader import get_issues

issues = get_issues()

In [2]:
import sys
sys.path.append('../../../')
from utils.preprocessors.diamantopoulos_preprocessor import diamantopoulos_preprocess

In [None]:
# bug is 0

bug_issues = issues[issues['prop:category'] == "bug"]

In [4]:
bug_issues.head(20)

Unnamed: 0,text,predictions,id,repo,host,url,state,created_at
0,syntax/lint error depending on php version # F...,0,956670809,phpstan/phpstan,Github,https://github.com/phpstan/phpstan/issues/5397,closed,2021-07-30 11:43:13+00:00
1,False positive about a missing return ```php\r...,0,630047976,phpstan/phpstan,Github,https://github.com/phpstan/phpstan/issues/3390,closed,2020-06-03 14:32:03+00:00
2,Certain `TypeInferenceTestCase` assert types t...,0,2195682164,phpstan/phpstan,Github,https://github.com/phpstan/phpstan/issues/10757,closed,2024-03-19 18:21:50+00:00
3,Returning generic static in interface causes i...,0,2432540714,phpstan/phpstan,Github,https://github.com/phpstan/phpstan/issues/11398,closed,2024-07-26 16:07:53+00:00
5,php 8.1 deprecated messages # Bug report\r\n\r...,0,1124660995,phpstan/phpstan,Github,https://github.com/phpstan/phpstan/issues/6565,closed,2022-02-04 22:08:38+00:00
6,array_pop output should not be null given a fu...,0,392238724,phpstan/phpstan,Github,https://github.com/phpstan/phpstan/issues/1722,closed,2018-12-18 16:36:25+00:00
7,[core] Inconsistent boolean value representati...,0,341929117,pmd/pmd,Github,https://github.com/pmd/pmd/issues/1244,closed,2018-07-17 13:52:21+00:00
8,No error when __callStatic is not static ### B...,0,2215792811,phpstan/phpstan,Github,https://github.com/phpstan/phpstan/issues/10809,open,2024-03-29 18:13:23+00:00
9,Unsoundness in flow sensitive alias algorithm ...,0,1325137599,secure-software-engineering/FlowDroid,Github,https://github.com/secure-software-engineering...,open,2022-08-02 00:38:44+00:00
10,ExcludeBaseLineBug files configuration failed....,0,1756295695,spotbugs/spotbugs,Github,https://github.com/spotbugs/spotbugs/issues/2450,closed,2023-06-14 08:00:17+00:00


In [None]:
preprocessed_issues = []
for index, issue in bug_issues.iterrows():
    preprocessed_issues.append(diamantopoulos_preprocess(issue['title'] + ' ' + issue['body'] if pd.notna(issue['body']) else issue['title']))

bug_issues['preprocessed'] = preprocessed_issues

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bug_issues['preprocessed'] = preprocessed_issues


In [None]:
# bertopic
from bertopic import BERTopic
from umap import UMAP
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance

umap_model = UMAP(
    random_state=42,
    metric="cosine",  # or "cosine" for cosine similarity
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    spread=1.0,
)

representation_models = {
    'KeyBERTInspired': KeyBERTInspired(),
    'MaximalMarginalRelevance': MaximalMarginalRelevance()
}

topic_model = BERTopic(
    embedding_model='all-MiniLM-L6-v2',
    representation_model=representation_models,
    n_gram_range=(1, 2),
    min_topic_size=20,
    calculate_probabilities=True,
    umap_model=umap_model,
    verbose=True
)

strings = bug_issues['preprocessed'].apply(lambda x: ' '.join(x)).tolist()

topics, probs = topic_model.fit_transform(strings)


  from .autonotebook import tqdm as notebook_tqdm
Matplotlib created a temporary cache directory at /var/folders/19/z43k9_0d3859509_dksqnd7w0000gn/T/matplotlib-bp6wqsk3 because the default path (/Users/ericzhang/.matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.
Matplotlib is building the font cache; this may take a moment.
2025-06-26 13:28:06,092 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 417/417 [00:46<00:00,  9.06it/s]
2025-06-26 13:28:55,634 - BERTopic - Embedding - Completed ✓
2025-06-26 13:28:55,636 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2025-06-26 13:29:14,145 - BERTopic - Dimensionality - Completed ✓
2025-06-26 13:29:14,146 - B

In [7]:
# reduce outliers

new_topics = topic_model.reduce_outliers(strings, topics, probabilities=probs, strategy='probabilities')
topic_model.update_topics(strings, topics=new_topics)



In [None]:
# generate csv so that can perform manual review

manual_review = topic_model.get_topic_info()
manual_review = manual_review.drop(columns=['Representative_Docs', 'Name'])
manual_review['percentage'] = manual_review['Count'] / len(bug_issues)

# get topic for every document
document_topics = topic_model.get_document_info(strings)
document_topics['html_url'] = bug_issues['html_url'].tolist()

# for each topic, get the 5 highest probability documents
def get_top_documents_url(topic):
    return document_topics[document_topics['Topic'] == topic].nlargest(5, 'Probability')
top_documents = ['\n'.join(get_top_documents_url(topic)['html_url']) for topic in manual_review['Topic']]
manual_review['top_documents'] = top_documents

# create a column for the topic name that is empty
manual_review['Topic Label (to be filled in by reviewer)'] = ''

manual_review.to_csv('../../../results/csv/topic_modeling/manual_review/bugs_manual_review.csv', index=False)

In [None]:
# save document topics
document_topics_df = document_topics[['Topic', 'Probability', 'html_url']]
document_topics_df['id'] = bug_issues['id'].tolist()
document_topics_df['repo'] = bug_issues['repo'].tolist()

import os
os.makedirs('../../../results/csv/topic_modeling/clusters', exist_ok=True)

document_topics_df.to_csv('../../../results/csv/topic_modeling/clusters/bugs_document_topics.csv', index=False)