# Issue Report Classification: Few Shot Learning

### NLBSE 2024

In [None]:
%pip install pandas emoji re string json

## Import the requisite libraries

In [2]:
# Importing libraries
import pandas as pd
import emoji
import re
import string
import json

# Loading data from CSV files
test_data = pd.read_csv("./data/issues/issues_test.csv")
train_data = pd.read_csv("./data/issues/issues_train.csv")

## Data Visualization

In [33]:
test_data

Unnamed: 0,repo,created_at,label,title,body
0,facebook/react,2023-08-02 02:26:00,bug,Bug: [18.3.0-canary] renderToString hoists som...,<!--\r\n Please provide a clear and concise d...
1,facebook/react,2023-07-17 22:43:05,bug,[DevTools Bug]: Chrome extension gets disconne...,### Website or app\r\n\r\nhttps://react.dev/\r...
2,facebook/react,2023-07-13 19:01:47,bug,[DevTools Bug]: Deprecated __REACT_DEVTOOLS_GL...,### Website or app\n\nN/A\n\n### Repro steps\n...
3,facebook/react,2023-06-07 17:26:43,bug,[DevTools Bug]: React devtools stuck at Loadin...,### Website or app\n\ncorporate project (priva...
4,facebook/react,2023-05-31 15:17:41,bug,Bug: Radio button onChange not called in curre...,<!--\r\n Please provide a clear and concise d...
...,...,...,...,...,...
1495,opencv/opencv,2022-01-22 11:52:21,feature,Task: GCC 12 support,Support compilation with GCC 12 and fix tests\...
1496,opencv/opencv,2022-01-16 19:27:55,feature,AudioIO: add dnn speech recognition sample on C++,### Pull Request Readiness Checklist\r\n\r\nSe...
1497,opencv/opencv,2022-01-14 22:05:58,feature,Use modern OpenVINO package interface,"* new cmake options: `WITH_OPENVINO`, `OPENCV_..."
1498,opencv/opencv,2022-01-12 09:14:41,feature,TiffEncoder write support more depth type,**Merge with extra**: https://github.com/openc...


In [34]:
train_data

Unnamed: 0,repo,created_at,label,title,body
0,facebook/react,2023-08-26 06:33:37,bug,"[DevTools Bug] Cannot add node ""1"" because a n...",### Website or app\n\nPrivate repo cannot give...
1,facebook/react,2023-07-28 05:16:12,bug,[DevTools Bug]: Devtools extension build faili...,### Website or app\n\nN/A\n\n### Repro steps\n...
2,facebook/react,2023-07-13 21:58:31,bug,[DevTools Bug]: Deprecated __REACT_DEVTOOLS_GL...,### Website or app\n\nhttps://github.com/open-...
3,facebook/react,2023-06-14 02:31:20,bug,"[DevTools Bug] Cannot remove node ""0"" because ...",### Website or app\n\nlocal\n\n### Repro steps...
4,facebook/react,2023-06-03 11:29:44,bug,"[DevTools Bug] Cannot remove node ""103"" becaus...",### Website or app\n\nlocalhost\n\n### Repro s...
...,...,...,...,...,...
1495,opencv/opencv,2022-01-24 10:48:13,feature,core: FP denormals support,relates #21046\r\n\r\n- support x86 SSE FTZ+DA...
1496,opencv/opencv,2022-01-20 12:40:55,feature,feature: submodule or a class scope for export...,All classes are registered in the scope that c...
1497,opencv/opencv,2022-01-15 02:39:22,feature,Reading BigTiff images,**Merge with extra: https://github.com/opencv/...
1498,opencv/opencv,2022-01-14 15:37:53,feature,Add general broadcasting layer,Performance details(broadcasting 1x1 to 16x204...


## Data Preprocessing
### Data Cleaning: Method 1  
Within this notebook, we employ two distinct data cleaning methodologies. This tailored approach is followed given variations among the repositories, each showing a more favorable outcome in response to one or the other cleaning methods.

In [32]:
# Initialize counters for text cleaning
cleaned_count = 0
original_count = 0

# Text cleaning function
def clean_text(text):
    global cleaned_count, original_count

    if not isinstance(text, str):
        original_count += 1
        return text

    # Remove double quotation marks
    text = text.replace('"', '')

    # Remove text starting with "DevTools" and ending with "(automated)"
    text = re.sub(r'DevTools.*?\(automated\)', '', text)

    # Lowercasing should be one of the first steps to ensure uniformity
    text = text.lower()

    # Remove emojis
    text = emoji.demojize(text)

    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove special characters and punctuation
    text = re.sub(f"[{re.escape(string.punctuation)}]", '', text)

    # Remove '#' characters
    text = text.replace("#", "")

    # Remove consecutive whitespaces and replace with a single space
    text = re.sub(r'\s+', ' ', text)

    # Split the text into words
    words = text.split()

    # Remove words that are over 20 characters
    words = [word for word in words if len(word) <= 20]

    # Join the remaining words back into cleaned text
    cleaned_text = ' '.join(words)

    cleaned_count += 1
    return cleaned_text

test_data['body'] = test_data['body'].apply(clean_text)
test_data['title'] = test_data['title'].apply(clean_text)


print(f"Cleaned {cleaned_count} times.")
print(f"Returned original text {original_count} times.")

train_data['body'] = train_data['body'].apply(clean_text)
train_data['title'] = train_data['title'].apply(clean_text)


print(f"Cleaned {cleaned_count} times.")
print(f"Returned original text {original_count} times.")

Cleaned 2998 times.
Returned original text 2 times.
Cleaned 5998 times.
Returned original text 2 times.


## Data Division  

Subsequently, we partitioned our dataset into five smaller dataframes, ensuring an exclusive handling of each project. This segregation was executed on both the training and testing datasets.

In [36]:
test_data_facebook = test_data[: 300]
test_data_tensorflow = test_data[300: 600]
test_data_microsoft = test_data[600: 900]
test_data_bitcoin = test_data[900: 1200]
test_data_opencv= test_data[1200: 1500]


test_data_tensorflow

Unnamed: 0,repo,created_at,label,title,body
300,tensorflow/tensorflow,2023-09-05 05:58:09,question,tensorflow lite in play services issue not my ...,system information android device information ...
301,tensorflow/tensorflow,2023-09-02 18:05:04,question,what is generatevocab func,you referenced in this to generatevocabpy if i...
302,tensorflow/tensorflow,2023-08-24 19:52:10,question,error starting tensorflow in python,issue type others have you reproduced the bug ...
303,tensorflow/tensorflow,2023-08-11 18:32:31,question,tensorflow profiler running into oom issue on gpu,issue type support have you reproduced the bug...
304,tensorflow/tensorflow,2023-08-10 04:03:47,question,tensorflow profiler running into oom issue on gpu,issue type support have you reproduced the bug...
...,...,...,...,...,...
595,tensorflow/tensorflow,2023-06-18 22:44:17,bug,uncaught exception in zmqstream callback when ...,click to expand issue type bug have you reprod...
596,tensorflow/tensorflow,2023-06-13 19:41:41,bug,w could not load dynamic library libcudnnso8 d...,click to expand issue type bug have you reprod...
597,tensorflow/tensorflow,2023-06-12 18:07:28,bug,unexpected failure when preparing tensor alloc...,i have converted my densenet121 model to model...
598,tensorflow/tensorflow,2023-06-12 10:32:27,bug,documentation bug：the description of padding,click to expand issue type documentation bug h...


In [37]:
train_data_facebook = train_data[: 300]
train_data_tensorflow = train_data[300: 600]
train_data_microsoft = train_data[600: 900]
train_data_bitcoin = train_data[900: 1200]
train_data_opencv= train_data[1200: 1500]


train_data_tensorflow

Unnamed: 0,repo,created_at,label,title,body
300,tensorflow/tensorflow,2023-09-07 05:31:36,question,add suppport for vedv,please go to stack overflow for help and suppo...
301,tensorflow/tensorflow,2023-09-03 17:53:40,question,cant run bertvocabfromdataset without typeerro...,issue type support have you reproduced the bug...
302,tensorflow/tensorflow,2023-09-01 16:34:57,question,float16 mixed precision training,issue type bug have you reproduced the bug wit...
303,tensorflow/tensorflow,2023-08-19 06:53:28,question,ckpt to tflite,how can i convert ckpt file to tf lite while i...
304,tensorflow/tensorflow,2023-08-11 14:02:25,question,when converting tensorflow model to tflite mod...,i took a pretrained model ssd mobilenet 320x32...
...,...,...,...,...,...
595,tensorflow/tensorflow,2023-06-19 05:35:02,bug,tensorboard histogram onehot operation causing...,click to expand issue type bug have you reprod...
596,tensorflow/tensorflow,2023-06-16 12:20:44,bug,tftestgpudevicename leads to soft lockup and u...,click to expand issue type bug have you reprod...
597,tensorflow/tensorflow,2023-06-12 21:17:01,bug,tfdatadatasetmap does not support randomization,click to expand issue type bug have you reprod...
598,tensorflow/tensorflow,2023-06-12 10:35:48,bug,functional bug：could not interpret serialized ...,click to expand issue type bug have you reprod...


## Fine-Tuning  
We fine-tuned ChatGPT-3.5-Turbo using the training data, aiming to achieve superior performance compared to the standard approach of invoking the OpenAI API GPT-4 model.

In [3]:
# Invoking the API
from openai import OpenAI
client = OpenAI(api_key = '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')

## Facebook repository dataset fine-tuning process
### Data Transformation  
Prior to beginning the fine-tuning process, our initial step involves transforming our dataframe into a JSON line file format. This formatted file will serve as the prompt input for the fine-tuning process. Each prompt will encapsulate the title and body details of every pull request. Our anticipated outcome from the fine-tuned model will be the corresponding label for each PR, distinguishing between bug reports, questions, or feature requests.

In [39]:
# Open the file in write mode
with open('data/conversationaldata/conversational_data_facebook.jsonl', 'w', encoding='utf-8') as f:
    # Iterate over the rows in the DataFrame
    for index, row in train_data_facebook.iterrows():
        # Create the user message by formatting the prompt with the title and body
        user_message = f"Classify, IN ONLY 1 WORD, the following GitHub issue as 'feature', 'bug', or 'question' based on its title and body:\n{row['title']}\n{row['body']}"
        
        # Create the assistant message by taking the label
        assistant_message = row['label']
        
        # Construct the conversation object
        conversation_object = {
            "messages": [
                {"role": "system", "content": "GitHub Issue Report Classifier"},
                {"role": "user", "content": user_message},
                {"role": "assistant", "content": assistant_message}
            ]
        }
        
        # Write the conversation object to one line in the file
        f.write(json.dumps(conversation_object, ensure_ascii=False) + '\n')

## Training file  
With our JSON line file generated, it now serves as the foundational conversation input for our fine-tuned model. We're prepared to upload this training file to the OpenAI API to initiate the training process.

In [40]:
## Uplopading a training file
client.files.create(
  file=open("data/conversationaldata/conversational_data_facebook.jsonl", "rb"),
  purpose="fine-tune"
)

FileObject(id='file-0Aub147vzoJf7tQGxAYXo2eF', bytes=363360, created_at=1701751545, filename='conversational_data_facebook.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

## Model creation  
At last, the stage is set to create the model, designated with the suffix 'repo-prissueclassifier.'

In [41]:
## Creating a fine-tuned model
client.fine_tuning.jobs.create(
  training_file="file-Mz8izUzswJ5M3H1C8S2aO6X0", 
  model="gpt-3.5-turbo",
  suffix= "fb-issueclassifier"
)

FineTuningJob(id='ftjob-LG8EargXmVTFrGEWBYmF841X', created_at=1701751546, error=None, fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0613', object='fine_tuning.job', organization_id='org-CpaRU3Zq9ePCCtbhezmcbgrg', result_files=[], status='validating_files', trained_tokens=None, training_file='file-Mz8izUzswJ5M3H1C8S2aO6X0', validation_file=None)

In [38]:
# Retrieving the state of a fine-tune
facebook_ft_model = client.fine_tuning.jobs.retrieve('ftjob-vaUUKFSIdXojPVI8tQW55pYo').fine_tuned_model
print(facebook_ft_model)

ft:gpt-3.5-turbo-0613:gcucst440:fb-issueclassifier:8LLGMnAI


In [43]:
# List up to 20 events from a fine-tuning job to track progress
client.fine_tuning.jobs.list_events(fine_tuning_job_id="ftjob-vaUUKFSIdXojPVI8tQW55pYo", limit=20)

SyncCursorPage[FineTuningJobEvent](data=[FineTuningJobEvent(id='ftevent-zI5hlvpWRqcoWPOnljUR8Sek', created_at=1700097317, level='info', message='The job has successfully completed', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-JArfDoB8stVXxToxlxK4zcmA', created_at=1700097314, level='info', message='New fine-tuned model created: ft:gpt-3.5-turbo-0613:gcucst440:fb-issueclassifier:8LLGMnAI', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-gHbOAzR3ehC6h0HVtkartknk', created_at=1700097284, level='info', message='Step 891/900: training loss=0.00', object='fine_tuning.job.event', data={'step': 891, 'train_loss': 6.35782896551973e-07, 'train_mean_token_accuracy': 1.0}, type='metrics'), FineTuningJobEvent(id='ftevent-ff2sVNjwO1uv5buZ7F4ossR4', created_at=1700097264, level='info', message='Step 881/900: training loss=0.00', object='fine_tuning.job.event', data={'step': 881, 'train_loss': 6.35782896551973e-07, 't

## Tensorflow repository dataset fine-tuning process

In [44]:
# Open the file in write mode
with open('data/conversationaldata/conversational_data_tensorflow.jsonl', 'w', encoding='utf-8') as f:
    # Iterate over the rows in the DataFrame
    for index, row in train_data_tensorflow.iterrows():
        # Create the user message by formatting the prompt with the title and body
        user_message = f"Classify, IN ONLY 1 WORD, the following GitHub issue as 'feature', 'bug', or 'question' based on its title and body:\n{row['title']}\n{row['body']}"
        
        # Create the assistant message by taking the label
        assistant_message = row['label']
        
        # Construct the conversation object
        conversation_object = {
            "messages": [
                {"role": "system", "content": "GitHub Issue Report Classifier"},
                {"role": "user", "content": user_message},
                {"role": "assistant", "content": assistant_message}
            ]
        }
        
        # Write the conversation object to one line in the file
        f.write(json.dumps(conversation_object, ensure_ascii=False) + '\n')

In [45]:
client.files.create(
  file=open("data/conversationaldata/conversational_data_tensorflow.jsonl", "rb"),
  purpose="fine-tune"
)

FileObject(id='file-otqI5x1eZvI9ifBIdhE1buie', bytes=551677, created_at=1701751548, filename='conversational_data_tensorflow.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [46]:
## Creating a fine-tuned model
client.fine_tuning.jobs.create(
  training_file="file-poTXDVFaDaPWgI4OWkzKtHCY", 
  model="gpt-3.5-turbo",
  suffix= "tf-issueclassifier"
)

FineTuningJob(id='ftjob-JPF1YSBlCiLAdOHc3egX9XjI', created_at=1701751548, error=None, fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0613', object='fine_tuning.job', organization_id='org-CpaRU3Zq9ePCCtbhezmcbgrg', result_files=[], status='validating_files', trained_tokens=None, training_file='file-poTXDVFaDaPWgI4OWkzKtHCY', validation_file=None)

In [47]:
# Retrieving the state of a fine-tune
tensorflow_ft_model = client.fine_tuning.jobs.retrieve('ftjob-8h3QKrmrGHz2MpazYLrzdYVv').fine_tuned_model
print(tensorflow_ft_model)

ft:gpt-3.5-turbo-0613:gcucst440:tf-issueclassifier:8LLGZuRu


In [48]:
# List up to 20 events from a fine-tuning job to track progress
client.fine_tuning.jobs.list_events(fine_tuning_job_id="ftjob-8h3QKrmrGHz2MpazYLrzdYVv", limit=20)

SyncCursorPage[FineTuningJobEvent](data=[FineTuningJobEvent(id='ftevent-57ybtL8Qwb7ruf2rYL2VZ5C5', created_at=1700097330, level='info', message='The job has successfully completed', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-l9v52fXBFRG85VPTNaqsZT8z', created_at=1700097327, level='info', message='New fine-tuned model created: ft:gpt-3.5-turbo-0613:gcucst440:tf-issueclassifier:8LLGZuRu', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-YjiyQNFpnRdzXnl78mJlDHkG', created_at=1700097306, level='info', message='Step 891/900: training loss=0.00', object='fine_tuning.job.event', data={'step': 891, 'train_loss': 6.35782896551973e-07, 'train_mean_token_accuracy': 1.0}, type='metrics'), FineTuningJobEvent(id='ftevent-yrMZCwEzDYNA51JIsLN5X8fH', created_at=1700097286, level='info', message='Step 881/900: training loss=0.00', object='fine_tuning.job.event', data={'step': 881, 'train_loss': 6.35782896551973e-07, 't

## Microsoft repository dataset fine-tuning process

In [49]:
# Open the file in write mode
with open('data/conversationaldata/conversational_data_microsoft.jsonl', 'w', encoding='utf-8') as f:
    # Iterate over the rows in the DataFrame
    for index, row in train_data_microsoft.iterrows():
        # Create the user message by formatting the prompt with the title and body
        user_message = f"Classify, IN ONLY 1 WORD, the following GitHub issue as 'feature', 'bug', or 'question' based on its title and body:\n{row['title']}\n{row['body']}"
        
        # Create the assistant message by taking the label
        assistant_message = row['label']
        
        # Construct the conversation object
        conversation_object = {
            "messages": [
                {"role": "system", "content": "GitHub Issue Report Classifier"},
                {"role": "user", "content": user_message},
                {"role": "assistant", "content": assistant_message}
            ]
        }
        
        # Write the conversation object to one line in the file
        f.write(json.dumps(conversation_object, ensure_ascii=False) + '\n')

In [50]:
client.files.create(
  file=open("data/conversationaldata/conversational_data_microsoft.jsonl", "rb"),
  purpose="fine-tune"
)

FileObject(id='file-SPe4gJF5xHVBw1bfley7BddS', bytes=300117, created_at=1701751551, filename='conversational_data_microsoft.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [21]:
## Creating a fine-tuned model
client.fine_tuning.jobs.create(
  training_file="file-o2pr8FckpXuZhncbVF9Fykzl", 
  model="gpt-3.5-turbo",
  suffix= "ms-issueclassifier"
)

FineTuningJob(id='ftjob-fDSapt9Eh3trHScN7SOW5wIn', created_at=1701829792, error=None, fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0613', object='fine_tuning.job', organization_id='org-CpaRU3Zq9ePCCtbhezmcbgrg', result_files=[], status='validating_files', trained_tokens=None, training_file='file-o2pr8FckpXuZhncbVF9Fykzl', validation_file=None)

In [23]:
# Retrieving the state of a fine-tune
microsoft_ft_model = client.fine_tuning.jobs.retrieve('ftjob-h7Wun2RE4F7PpKAYmxz9FwBz').fine_tuned_model
print(microsoft_ft_model)

ft:gpt-3.5-turbo-0613:gcucst440:ms-issueclassifier:8LLFl5QI


In [None]:
# List up to 20 events from a fine-tuning job to track progress
client.fine_tuning.jobs.list_events(fine_tuning_job_id="ftjob-h7Wun2RE4F7PpKAYmxz9FwBz", limit=20)

SyncCursorPage[FineTuningJobEvent](data=[FineTuningJobEvent(id='ftevent-RkP7u5W0WaGtFdB5Cupvwhrm', created_at=1700089786, level='info', message='Files validated, moving job to queued state', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-UU7EzTjLQd2odPATxNHeFqyN', created_at=1700089692, level='info', message='Validating training file: file-o2pr8FckpXuZhncbVF9Fykzl', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-ZAMgyCsgfvWGv2tNc0tKHT9I', created_at=1700089692, level='info', message='Created fine-tuning job: ftjob-h7Wun2RE4F7PpKAYmxz9FwBz', object='fine_tuning.job.event', data={}, type='message')], object='list', has_more=False)

## Bitcoin repository dataset fine-tuning process

In [None]:
# Open the file in write mode
with open('data/conversationaldata/conversational_data_bitcoin.jsonl', 'w', encoding='utf-8') as f:
    # Iterate over the rows in the DataFrame
    for index, row in train_data_bitcoin.iterrows():
        # Create the user message by formatting the prompt with the title and body
        user_message = f"Classify, IN ONLY 1 WORD, the following GitHub issue as 'feature', 'bug', or 'question' based on its title and body:\n{row['title']}\n{row['body']}"
        
        # Create the assistant message by taking the label
        assistant_message = row['label']
        
        # Construct the conversation object
        conversation_object = {
            "messages": [
                {"role": "system", "content": "GitHub Issue Report Classifier"},
                {"role": "user", "content": user_message},
                {"role": "assistant", "content": assistant_message}
            ]
        }
        
        # Write the conversation object to one line in the file
        f.write(json.dumps(conversation_object, ensure_ascii=False) + '\n')

In [None]:
client.files.create(
  file=open("data/conversationaldata/conversational_data_bitcoin.jsonl", "rb"),
  purpose="fine-tune"
)

FileObject(id='file-jDVrdHpKeEy23ttxa5KCfk7O', bytes=548310, created_at=1700103758, filename='conversational_data_bitcoin.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [None]:
## Creating a fine-tuned model
client.fine_tuning.jobs.create(
  training_file="file-jDVrdHpKeEy23ttxa5KCfk7O", 
  model="gpt-3.5-turbo",
  suffix= "bc-issueclassifier"
)

FineTuningJob(id='ftjob-dBpnVHyaN48iF0SC5QMwzCCn', created_at=1700103775, error=None, fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0613', object='fine_tuning.job', organization_id='org-CpaRU3Zq9ePCCtbhezmcbgrg', result_files=[], status='validating_files', trained_tokens=None, training_file='file-jDVrdHpKeEy23ttxa5KCfk7O', validation_file=None)

In [None]:
# Retrieving the state of a fine-tune
bitcoin_ft_model = client.fine_tuning.jobs.retrieve('ftjob-dBpnVHyaN48iF0SC5QMwzCCn').fine_tuned_model
print(bitcoin_ft_model)

ft:gpt-3.5-turbo-0613:gcucst440:bc-issueclassifier:8LOOptG5


In [None]:
# List up to 20 events from a fine-tuning job to track progress
client.fine_tuning.jobs.list_events(fine_tuning_job_id="ftjob-h7Wun2RE4F7PpKAYmxz9FwBz", limit=20)

SyncCursorPage[FineTuningJobEvent](data=[FineTuningJobEvent(id='ftevent-y0CHJxm90FkTN982TdX9pZDk', created_at=1700097279, level='info', message='The job has successfully completed', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-agbV8jSMlE348SkUOvo0FvaT', created_at=1700097277, level='info', message='New fine-tuned model created: ft:gpt-3.5-turbo-0613:gcucst440:ms-issueclassifier:8LLFl5QI', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-RJCgddNnsTZCnoUr7Tufkq5b', created_at=1700097260, level='info', message='Step 891/900: training loss=0.00', object='fine_tuning.job.event', data={'step': 891, 'train_loss': 6.35782896551973e-07, 'train_mean_token_accuracy': 1.0}, type='metrics'), FineTuningJobEvent(id='ftevent-nG1LeucNX1JNVnEHPOSbURNU', created_at=1700097246, level='info', message='Step 881/900: training loss=0.00', object='fine_tuning.job.event', data={'step': 881, 'train_loss': 6.35782896551973e-07, 't

## OpenCV repository dataset fine-tuning process

In [None]:
# Open the file in write mode
with open('data/conversationaldata/conversational_data_opencv.jsonl', 'w', encoding='utf-8') as f:
    # Iterate over the rows in the DataFrame
    for index, row in train_data_opencv.iterrows():
        # Create the user message by formatting the prompt with the title and body
        user_message = f"Classify, IN ONLY 1 WORD, the following GitHub issue as 'feature', 'bug', or 'question' based on its title and body:\n{row['title']}\n{row['body']}"
        
        # Create the assistant message by taking the label
        assistant_message = row['label']
        
        # Construct the conversation object
        conversation_object = {
            "messages": [
                {"role": "system", "content": "GitHub Issue Report Classifier"},
                {"role": "user", "content": user_message},
                {"role": "assistant", "content": assistant_message}
            ]
        }
        
        # Write the conversation object to one line in the file
        f.write(json.dumps(conversation_object, ensure_ascii=False) + '\n')

In [None]:
client.files.create(
  file=open("data/conversationaldata/conversational_data_opencv.jsonl", "rb"),
  purpose="fine-tune"
)

FileObject(id='file-31FW3RwGmDhMZnIEtc95Oj5t', bytes=621942, created_at=1700104001, filename='conversational_data_opencv.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [None]:
## Creating a fine-tuned model
client.fine_tuning.jobs.create(
  training_file="file-31FW3RwGmDhMZnIEtc95Oj5t", 
  model="gpt-3.5-turbo",
  suffix= "cv-issueclassifier"
)

FineTuningJob(id='ftjob-XdaiNJQtq10dbFhMPdDfKk4t', created_at=1700104041, error=None, fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0613', object='fine_tuning.job', organization_id='org-CpaRU3Zq9ePCCtbhezmcbgrg', result_files=[], status='validating_files', trained_tokens=None, training_file='file-31FW3RwGmDhMZnIEtc95Oj5t', validation_file=None)

In [None]:
# Retrieving the state of a fine-tune
opencv_ft_model = client.fine_tuning.jobs.retrieve('ftjob-XdaiNJQtq10dbFhMPdDfKk4t').fine_tuned_model
print(opencv_ft_model)

ft:gpt-3.5-turbo-0613:gcucst440:cv-issueclassifier:8LOdV6zV


In [None]:
# List up to 20 events from a fine-tuning job to track progress
client.fine_tuning.jobs.list_events(fine_tuning_job_id="ftjob-XdaiNJQtq10dbFhMPdDfKk4t", limit=20)

SyncCursorPage[FineTuningJobEvent](data=[FineTuningJobEvent(id='ftevent-TJ8kaSf4iSFxWzcMXoKKUolj', created_at=1700104053, level='warn', message='File file-31FW3RwGmDhMZnIEtc95Oj5t contains examples greater than the supported context size for model `gpt-3.5-turbo-0613` (4096 tokens)', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-ZMhKqizMlPfeRW252IeIJGMK', created_at=1700104041, level='info', message='Validating training file: file-31FW3RwGmDhMZnIEtc95Oj5t', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-oG81YfwZcFnSSJFZ6Vm9JXDw', created_at=1700104041, level='info', message='Created fine-tuning job: ftjob-XdaiNJQtq10dbFhMPdDfKk4t', object='fine_tuning.job.event', data={}, type='message')], object='list', has_more=False)

## Fine-tuning results  
The successful fine-tuning of all models was completed using the default of 3 epochs. The process spanned approximately 5 hours; however, variations in processing time might occur due to queue dynamics at any given moment.

## Utilizing Fine-tuned model  
Next, another API from OpenAI is used to invoke the fine-tuned model and assess its performance on the testing dataset.

In [14]:
import openai
import time
import pandas as pd
import re
import concurrent.futures
import tiktoken
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, classification_report

# Replace 'open-ai-key' with your actual OpenAI API key
openai.api_key = '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'

# max_token here should be one since 'bug', 'feature', and 'question' are one token long. This might change for future versions of the model and api but you can check the value on the
def query_chatgpt(prompt, model, temperature=0.0,  max_tokens=1, max_retries=5):
    """
    Function to query ChatGPT-4 with a given prompt, with retries for timeouts.

    :param prompt: Prompt string to send to ChatGPT-2.5
    :param model: The model to use, default is ChatGPT-3.5
    :param max_tokens: Maximum number of tokens to generate
    :param max_retries: Maximum number of retries for timeout
    :return: Response from ChatGPT-3.5 or None if all retries fail
    """
    attempt = 0
    max_content_tokens = 3999
    encoding = tiktoken.get_encoding("cl100k_base")
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

    # Function to truncate the message and avoid passing the limit of 4k tokens per gpt-3.5 fine-tuned model limitations
    def truncate_message(message, max_length):
        tokens = encoding.encode(message)
        if len(tokens) > max_length:
            truncated_tokens = tokens[:max_length]
            message = encoding.decode(truncated_tokens)
        return message

    # Truncate the prompt if necessary
    prompt = truncate_message(prompt, max_content_tokens)

    while attempt < max_retries:
        with concurrent.futures.ThreadPoolExecutor() as executor:
            future = executor.submit(
                openai.chat.completions.create,
                model=model,
                messages=[{"role": "system", "content": "GitHub Issue Report Classifier"}, {"role": "user", "content": prompt}],
                max_tokens=max_tokens,
                temperature=temperature
            )
            try:
                response = future.result(timeout=5)  # 5 seconds timeout
                return response.choices[0].message.content
            except concurrent.futures.TimeoutError:
                print(f"Attempt {attempt + 1}/{max_retries} - Request timed out. Retrying...")
            except Exception as e:
                print(f"Attempt {attempt + 1}/{max_retries} - An error occurred: {e}")
            finally:
                attempt += 1

    print("Failed to get a response after several retries.")
    return None
    
labels = ['feature', 'bug', 'question']

## Facebook React Repo Testing  
The function defined above is being called, passing the specific model for each repository and testing it with the testing dataset. It's essential to note the setup of a timer to comply with the "token per minute" limitations on the API. Additionally, the results of each iteration are printed for tracking and improvement purposes.

In [None]:
y_true_fb = []
y_pred_fb = []

iterations = len(test_data_facebook)

# Now let's loop through the test data and classify the GitHub issues
for i in range(iterations):
    correct_label = test_data_facebook.iloc[i]['label'].lower()
    description = f"{test_data_facebook.iloc[i]['title']} \n {test_data_facebook.iloc[i]['body']}"
    print(f"Correct PR type: {correct_label}")
    
    prompt = f"Classify, IN ONLY 1 WORD, the following GitHub issue as 'feature', 'bug', or 'question' based on its title and body:\n{description}"
    response = query_chatgpt(prompt, facebook_ft_model)
    
    if response is None:
        print("Failed to get a response after several retries. Skipping this item.")
        continue  # Skip this iteration and move to the next one
    
    # Clean the response to keep only letters (and optionally numbers)
    predicted_label = re.sub(r'[^A-Za-z]+', '', response).lower().strip()
    print(f"Predicted PR type: {predicted_label}")
    
    # Append to lists for evaluation
    y_true_fb.append(correct_label)
    y_pred_fb.append(predicted_label)
    time.sleep(6)  # Wait for 6 seconds before retrying

Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted P

## Calculating the results  
Once all testing data undergoes evaluation using the corresponding fine-tuned models, we'll leverage the two arrays generated—representing the true labels and predicted labels—to conduct result assessments.

For tracking purposes, a CSV file has been generated for each result.

In [None]:
# Calculate weighted average F1-score, precision, and recall
f1_fb = f1_score(y_true_fb, y_pred_fb, labels=labels, average='weighted')
precision_fb = precision_score(y_true_fb, y_pred_fb, labels=labels, average='weighted')
recall_fb = recall_score(y_true_fb, y_pred_fb, labels=labels, average='weighted')

# Calculate confusion matrix
cm_fb = confusion_matrix(y_true_fb, y_pred_fb, labels=labels)

cm_df_fb = pd.DataFrame(cm_fb, index=labels, columns=labels)

# Calculate TP, FP, FN, TN
results_fb = {}
for i, label in enumerate(labels):
    results_fb[label] = {'TP': cm_fb[i, i]}
    results_fb[label]['FP'] = cm_fb[:, i].sum() - cm_fb[i, i]
    results_fb[label]['FN'] = cm_fb[i, :].sum() - cm_fb[i, i]
    results_fb[label]['TN'] = cm_fb.sum() - (results_fb[label]['TP'] + results_fb[label]['FP'] + results_fb[label]['FN'])

# Print results_fb
for label, metrics in results_fb.items():
    print(f"{label}: {metrics}")

# Save results_fb to CSV
results_fb_df = pd.DataFrame(results_fb).T
results_fb_df['F1-score'] = f1_fb
results_fb_df['Recall'] = recall_fb
results_fb_df['Precision'] = precision_fb

results_fb_df.to_csv('metrics/confusion_matrix_fb.csv')

print(f"Precision = {precision_fb}")
print(f"Recall = {recall_fb}")
print(f"F1-score = {f1_fb}")

feature: {'TP': 89, 'FP': 15, 'FN': 11, 'TN': 185}
bug: {'TP': 95, 'FP': 19, 'FN': 5, 'TN': 181}
question: {'TP': 74, 'FP': 8, 'FN': 26, 'TN': 192}
Precision = 0.8638471961642693
Recall = 0.86
F1-score = 0.8578621000281252


### F1-Score: 85.79%

## Evaluating the metrics  
Below, the metrics for each label are presented to facilitate a more precise evaluation.

In [None]:
# Create a classification report
report_fb = classification_report(y_true_fb, y_pred_fb, labels=['bug', 'feature', 'question'], target_names=['bug', 'feature', 'question'], zero_division=0, output_dict=True)

# Convert the report to a DataFrame
report_df_fb = pd.DataFrame(report_fb).transpose()

# Print the classification report
print(report_df_fb)

              precision  recall  f1-score  support
bug            0.833333    0.95  0.887850   100.00
feature        0.855769    0.89  0.872549   100.00
question       0.902439    0.74  0.813187   100.00
accuracy       0.860000    0.86  0.860000     0.86
macro avg      0.863847    0.86  0.857862   300.00
weighted avg   0.863847    0.86  0.857862   300.00


## Tensorflow Repo Testing

In [None]:
y_true_tf = []
y_pred_tf = []

iterations = len(test_data_tensorflow)

# Now let's loop through the test data and classify the GitHub pull requests
for i in range(iterations):
    correct_label = test_data_tensorflow.iloc[i]['label'].lower()
    description = f"{test_data_tensorflow.iloc[i]['title']} \n {test_data_tensorflow.iloc[i]['body']}"
    print(f"Correct PR type: {correct_label}")
    
    prompt = f"Classify, IN ONLY 1 WORD, the following GitHub pull request as 'feature', 'bug', or 'question' based on its title and body:\n{description}"
    response = query_chatgpt(prompt, tensorflow_ft_model)
    
    if response is None:
        print("Failed to get a response after several retries. Skipping this item.")
        continue  # Skip this iteration and move to the next one
    
    # Clean the response to keep only letters (and optionally numbers)
    predicted_label = re.sub(r'[^A-Za-z]+', '', response).lower().strip()
    print(f"Predicted PR type: {predicted_label}")
    
    # Append to lists for evaluation
    y_true_tf.append(correct_label)
    y_pred_tf.append(predicted_label)
    time.sleep(6)  # Wait for 6 seconds before retrying

Correct PR type: question
Predicted PR type: question
Correct PR type: question
Predicted PR type: question
Correct PR type: question
Predicted PR type: question
Correct PR type: question
Predicted PR type: question
Correct PR type: question
Predicted PR type: question
Correct PR type: question
Predicted PR type: question
Correct PR type: question
Predicted PR type: feature
Correct PR type: question
Predicted PR type: question
Correct PR type: question
Predicted PR type: question
Correct PR type: question
Predicted PR type: question
Correct PR type: question
Predicted PR type: question
Correct PR type: question
Predicted PR type: question
Correct PR type: question
Predicted PR type: question
Correct PR type: question
Predicted PR type: question
Correct PR type: question
Predicted PR type: question
Correct PR type: question
Predicted PR type: question
Correct PR type: question
Predicted PR type: question
Correct PR type: question
Predicted PR type: question
Correct PR type: question
Pre

In [None]:
# Calculate weighted average F1-score, precision, and recall
f1_tf = f1_score(y_true_tf, y_pred_tf, labels=labels, average='weighted')
precision_tf = precision_score(y_true_tf, y_pred_tf, labels=labels, average='weighted')
recall_tf = recall_score(y_true_tf, y_pred_tf, labels=labels, average='weighted')

# Calculate confusion matrix
cm_tf = confusion_matrix(y_true_tf, y_pred_tf, labels=labels)

cm_df_tf = pd.DataFrame(cm_tf, index=labels, columns=labels)

# Calculate TP, FP, FN, TN
results_tf = {}
for i, label in enumerate(labels):
    results_tf[label] = {'TP': cm_tf[i, i]}
    results_tf[label]['FP'] = cm_tf[:, i].sum() - cm_tf[i, i]
    results_tf[label]['FN'] = cm_tf[i, :].sum() - cm_tf[i, i]
    results_tf[label]['TN'] = cm_tf.sum() - (results_tf[label]['TP'] + results_tf[label]['FP'] + results_tf[label]['FN'])

# Print results_tf
for label, metrics in results_tf.items():
    print(f"{label}: {metrics}")

# Save results_tf to CSV
results_tf_df = pd.DataFrame(results_tf).T
results_tf_df['F1-score'] = f1_tf
results_tf_df['Recall'] = recall_tf
results_tf_df['Precision'] = precision_tf

results_tf_df.to_csv('metrics/confusion_matrix_tf.csv')

print(f"Precision = {precision_tf}")
print(f"Recall = {recall_tf}")
print(f"F1-score = {f1_tf}")

feature: {'TP': 81, 'FP': 6, 'FN': 19, 'TN': 194}
bug: {'TP': 88, 'FP': 8, 'FN': 12, 'TN': 192}
question: {'TP': 89, 'FP': 28, 'FN': 11, 'TN': 172}
Precision = 0.869461636703016
Recall = 0.86
F1-score = 0.861515280599043


### F1-score: 86.15%

In [None]:
# Create a classification report
report_tf = classification_report(y_true_tf, y_pred_tf, labels=['bug', 'feature', 'question'], target_names=['bug', 'feature', 'question'], zero_division=0, output_dict=True)

# Convert the report to a DataFrame
report_df_tf = pd.DataFrame(report_tf).transpose()

# Print the classification report
print(report_df_tf)

              precision  recall  f1-score  support
bug            0.916667    0.88  0.897959   100.00
feature        0.931034    0.81  0.866310   100.00
question       0.760684    0.89  0.820276   100.00
accuracy       0.860000    0.86  0.860000     0.86
macro avg      0.869462    0.86  0.861515   300.00
weighted avg   0.869462    0.86  0.861515   300.00


## Microsoft Repo Testing

In [None]:
y_true_ms = []
y_pred_ms = []

iterations = len(test_data_microsoft)

# Now let's loop through the test data and classify the GitHub pull requests
for i in range(iterations):
    correct_label = test_data_microsoft.iloc[i]['label'].lower()
    description = f"{test_data_microsoft.iloc[i]['title']}\n{test_data_microsoft.iloc[i]['body']}"
    print(f"Correct PR type: {correct_label}")
    
    prompt = f"Classify, IN ONLY 1 WORD, the following GitHub pull request as 'feature', 'bug', or 'question' based on its title and body:\n{description}"
    response = query_chatgpt(prompt, microsoft_ft_model)
    
    if response is None:
        print("Failed to get a response after several retries. Skipping this item.")
        continue  # Skip this iteration and move to the next one
    
    # Clean the response to keep only letters (and optionally numbers)
    predicted_label = re.sub(r'[^A-Za-z]+', '', response).lower().strip()
    print(f"Predicted PR type: {predicted_label}")
    
    # Append to lists for evaluation
    y_true_ms.append(correct_label)
    y_pred_ms.append(predicted_label)
    time.sleep(6)  # Wait for 6 seconds before retrying

Correct PR type: bug
Predicted PR type: feature
Correct PR type: bug
Predicted PR type: feature
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: question
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: question
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: question
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: question
Correct PR type: bug
Predicted PR type: bug
Corr

In [None]:
# Calculate weighted average F1-score, precision, and recall
f1_ms = f1_score(y_true_ms, y_pred_ms, labels=labels, average='weighted')
precision_ms = precision_score(y_true_ms, y_pred_ms, labels=labels, average='weighted')
recall_ms = recall_score(y_true_ms, y_pred_ms, labels=labels, average='weighted')

# Calculate confusion matrix
cm_ms = confusion_matrix(y_true_ms, y_pred_ms, labels=labels)

cm_df_ms = pd.DataFrame(cm_ms, index=labels, columns=labels)

# Calculate TP, FP, FN, TN
results_ms = {}
for i, label in enumerate(labels):
    results_ms[label] = {'TP': cm_ms[i, i]}
    results_ms[label]['FP'] = cm_ms[:, i].sum() - cm_ms[i, i]
    results_ms[label]['FN'] = cm_ms[i, :].sum() - cm_ms[i, i]
    results_ms[label]['TN'] = cm_ms.sum() - (results_ms[label]['TP'] + results_ms[label]['FP'] + results_ms[label]['FN'])

# Print results_ms
for label, metrics in results_ms.items():
    print(f"{label}: {metrics}")

# Save results_ms to CSV
results_ms_df = pd.DataFrame(results_ms).T
results_ms_df['F1-score'] = f1_ms
results_ms_df['Recall'] = recall_ms
results_ms_df['Precision'] = precision_ms


results_ms_df.to_csv('metrics/confusion_matrix_ms.csv')

print(f"Precision = {precision_ms}")
print(f"Recall = {recall_ms}")
print(f"F1-score = {f1_ms}")

feature: {'TP': 84, 'FP': 21, 'FN': 16, 'TN': 179}
bug: {'TP': 76, 'FP': 13, 'FN': 24, 'TN': 187}
question: {'TP': 80, 'FP': 26, 'FN': 20, 'TN': 174}
Precision = 0.802883188467246
Recall = 0.8
F1-score = 0.8001480094936562


### F1-score: 80.01%

In [None]:
# Create a classification report
report_ms = classification_report(y_true_ms, y_pred_ms, labels=['bug', 'feature', 'question'], target_names=['bug', 'feature', 'question'], zero_division=0, output_dict=True)

# Convert the report to a DataFrame
report_df_ms = pd.DataFrame(report_ms).transpose()

# Print the classification report
print(report_df_ms)

              precision  recall  f1-score  support
bug            0.853933    0.76  0.804233    100.0
feature        0.800000    0.84  0.819512    100.0
question       0.754717    0.80  0.776699    100.0
accuracy       0.800000    0.80  0.800000      0.8
macro avg      0.802883    0.80  0.800148    300.0
weighted avg   0.802883    0.80  0.800148    300.0


## Bitcoin Repo Testing

In [None]:
y_true_bc = []
y_pred_bc = []

iterations = len(test_data_bitcoin)

# Now let's loop through the test data and classify the GitHub pull requests
for i in range(iterations):
    correct_label = test_data_bitcoin.iloc[i]['label'].lower()
    description = f"{test_data_bitcoin.iloc[i]['title']}\n{test_data_bitcoin.iloc[i]['body']}"
    print(f"Correct PR type: {correct_label}")
    
    prompt = f"Classify, IN ONLY 1 WORD, the following GitHub pull request as 'feature', 'bug', or 'question' based on its title and body:\n{description}"
    response = query_chatgpt(prompt, bitcoin_ft_model)
    
    if response is None:
        print("Failed to get a response after several retries. Skipping this item.")
        continue  # Skip this iteration and move to the next one
    
    # Clean the response to keep only letters (and optionally numbers)
    predicted_label = re.sub(r'[^A-Za-z]+', '', response).lower().strip()
    print(f"Predicted PR type: {predicted_label}")
    
    # Append to lists for evaluation
    y_true_bc.append(correct_label)
    y_pred_bc.append(predicted_label)
    time.sleep(6)  # Wait for 6 seconds before retrying


Correct PR type: feature
Predicted PR type: feature
Correct PR type: feature
Predicted PR type: feature
Correct PR type: feature
Predicted PR type: feature
Correct PR type: feature
Predicted PR type: feature
Correct PR type: feature
Predicted PR type: feature
Correct PR type: feature
Predicted PR type: feature
Correct PR type: feature
Predicted PR type: feature
Correct PR type: feature
Predicted PR type: bug
Correct PR type: feature
Predicted PR type: feature
Correct PR type: feature
Predicted PR type: feature
Correct PR type: feature
Predicted PR type: feature
Correct PR type: feature
Predicted PR type: feature
Correct PR type: feature
Predicted PR type: feature
Correct PR type: feature
Predicted PR type: feature
Correct PR type: feature
Predicted PR type: feature
Correct PR type: feature
Predicted PR type: feature
Correct PR type: feature
Predicted PR type: feature
Correct PR type: feature
Predicted PR type: feature
Correct PR type: feature
Predicted PR type: feature
Correct PR type:

In [None]:
# Calculate weighted average F1-score, precision, and recall
f1_bc = f1_score(y_true_bc, y_pred_bc, labels=labels, average='weighted')
precision_bc = precision_score(y_true_bc, y_pred_bc, labels=labels, average='weighted')
recall_bc = recall_score(y_true_bc, y_pred_bc, labels=labels, average='weighted')

# Calculate confusion matrix
cm_bc = confusion_matrix(y_true_bc, y_pred_bc, labels=labels)

cm_df_bc = pd.DataFrame(cm_bc, index=labels, columns=labels)

# Calculate TP, FP, FN, TN
results_bc = {}
for i, label in enumerate(labels):
    results_bc[label] = {'TP': cm_bc[i, i]}
    results_bc[label]['FP'] = cm_bc[:, i].sum() - cm_bc[i, i]
    results_bc[label]['FN'] = cm_bc[i, :].sum() - cm_bc[i, i]
    results_bc[label]['TN'] = cm_bc.sum() - (results_bc[label]['TP'] + results_bc[label]['FP'] + results_bc[label]['FN'])

# Print results_bc
for label, metrics in results_bc.items():
    print(f"{label}: {metrics}")

# Save results_bc to CSV
results_bc_df = pd.DataFrame(results_bc).T
results_bc_df['F1-score'] = f1_bc
results_bc_df['Recall'] = recall_bc
results_bc_df['Precision'] = precision_bc

results_bc_df.to_csv('metrics/confusion_matrix_bc.csv')

print(f"Precision = {precision_bc}")
print(f"Recall = {recall_bc}")
print(f"F1-score = {f1_bc}")

feature: {'TP': 89, 'FP': 18, 'FN': 11, 'TN': 182}
bug: {'TP': 80, 'FP': 29, 'FN': 20, 'TN': 171}
question: {'TP': 62, 'FP': 22, 'FN': 38, 'TN': 178}
Precision = 0.7679386310527527
Recall = 0.77
F1-score = 0.7664555547850743


### F1-Score: 76.65%

In [None]:
# Create a classification report
report_bc = classification_report(y_true_bc, y_pred_bc, labels=['bug', 'feature', 'question'], target_names=['bug', 'feature', 'question'], zero_division=0, output_dict=True)

# Convert the report to a DataFrame
report_df_bc = pd.DataFrame(report_bc).transpose()

# Print the classification report
print(report_df_bc)

              precision  recall  f1-score  support
bug            0.733945    0.80  0.765550   100.00
feature        0.831776    0.89  0.859903   100.00
question       0.738095    0.62  0.673913   100.00
accuracy       0.770000    0.77  0.770000     0.77
macro avg      0.767939    0.77  0.766456   300.00
weighted avg   0.767939    0.77  0.766456   300.00


## OpenCV Repo Testing

In [None]:
y_true_oc = []
y_pred_oc = []

iterations = len(test_data_opencv)

# Now let's loop through the test data and classify the GitHub pull requests
for i in range(iterations):
    correct_label = test_data_opencv.iloc[i]['label'].lower()
    description = f"{test_data_opencv.iloc[i]['title']}\n{test_data_opencv.iloc[i]['body']}"
    print(f"Correct PR type: {correct_label}")
    
    prompt = f"Classify, IN ONLY 1 WORD, the following GitHub pull request as 'feature', 'bug', or 'question' based on its title and body:\n{description}"
    response = query_chatgpt(prompt, opencv_ft_model)
    
    if response is None:
        print("Failed to get a response after several retries. Skipping this item.")
        continue  # Skip this iteration and move to the next one
    
    # Clean the response to keep only letters (and optionally numbers)
    predicted_label = re.sub(r'[^A-Za-z]+', '', response).lower().strip()
    print(f"Predicted PR type: {predicted_label}")
    
    # Append to lists for evaluation
    y_true_oc.append(correct_label)
    y_pred_oc.append(predicted_label)
    time.sleep(6)  # Wait for 6 seconds before retrying

Correct PR type: bug
Predicted PR type: question
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: feature
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: question
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: b

In [None]:
# Calculate weighted average F1-score, precision, and recall
f1_oc = f1_score(y_true_oc, y_pred_oc, labels=labels, average='weighted')
precision_oc = precision_score(y_true_oc, y_pred_oc, labels=labels, average='weighted')
recall_oc = recall_score(y_true_oc, y_pred_oc, labels=labels, average='weighted')

# Calculate confusion matrix
cm_oc = confusion_matrix(y_true_oc, y_pred_oc, labels=labels)

cm_df_oc = pd.DataFrame(cm_oc, index=labels, columns=labels)

# Calculate TP, FP, FN, TN
results_oc = {}
for i, label in enumerate(labels):
    results_oc[label] = {'TP': cm_oc[i, i]}
    results_oc[label]['FP'] = cm_oc[:, i].sum() - cm_oc[i, i]
    results_oc[label]['FN'] = cm_oc[i, :].sum() - cm_oc[i, i]
    results_oc[label]['TN'] = cm_oc.sum() - (results_oc[label]['TP'] + results_oc[label]['FP'] + results_oc[label]['FN'])

# Print results_oc
for label, metrics in results_oc.items():
    print(f"{label}: {metrics}")

# Save results_oc to CSV
results_oc_df = pd.DataFrame(results_oc).T
results_oc_df['F1-score'] = f1_oc
results_oc_df['Recall'] = recall_oc
results_oc_df['Precision'] = precision_oc

results_oc_df.to_csv('metrics/confusion_matrix_oc.csv')

print(f"Precision = {precision_oc}")
print(f"Recall = {recall_oc}")
print(f"F1-score = {f1_oc}")

feature: {'TP': 80, 'FP': 14, 'FN': 20, 'TN': 186}
bug: {'TP': 82, 'FP': 28, 'FN': 18, 'TN': 172}
question: {'TP': 81, 'FP': 15, 'FN': 19, 'TN': 185}
Precision = 0.8134227917472598
Recall = 0.81
F1-score = 0.8107417537461722


### F1-score: 81.07%


In [None]:
# Create a classification report
report_oc = classification_report(y_true_oc, y_pred_oc, labels=['bug', 'feature', 'question'], target_names=['bug', 'feature', 'question'], zero_division=0, output_dict=True)

# Convert the report to a DataFrame
report_df_oc = pd.DataFrame(report_oc).transpose()

# Print the classification report
print(report_df_oc)

              precision  recall  f1-score  support
bug            0.745455    0.82  0.780952   100.00
feature        0.851064    0.80  0.824742   100.00
question       0.843750    0.81  0.826531   100.00
accuracy       0.810000    0.81  0.810000     0.81
macro avg      0.813423    0.81  0.810742   300.00
weighted avg   0.813423    0.81  0.810742   300.00


## Overall results  
With results gathered for all repositories tested against their respective trained models, we're poised to consolidate the confusion matrix data and derive the overall metrics.

In [None]:
from functools import reduce

# Filenames of your CSV files
csv_files = ['metrics/confusion_matrix_fb.csv', 'metrics/confusion_matrix_tf.csv', 'metrics/confusion_matrix_ms.csv', 'metrics/confusion_matrix_bc.csv', 'metrics/confusion_matrix_oc.csv']

# Initialize empty DataFrame to store combined confusion matrix
combined_confusion_matrix = pd.DataFrame(columns=['feature', 'bug', 'question'])

# Loop through each file and accumulate confusion matrix
for file in csv_files:
    df = pd.read_csv(file, usecols=['TP', 'FP', 'FN', 'TN'])
    combined_confusion_matrix = combined_confusion_matrix.add(df, fill_value=0)

# Calculate the sum of each column to use for metrics calculation
sum_tp = combined_confusion_matrix['TP'].sum()
sum_fp = combined_confusion_matrix['FP'].sum()
sum_fn = combined_confusion_matrix['FN'].sum()
sum_tn = combined_confusion_matrix['TN'].sum()

# Assuming binary classification for simplicity, calculate metrics
precision = sum_tp / (sum_tp + sum_fp)
recall = sum_tp / (sum_tp + sum_fn)
f1 = 2 * (precision * recall) / (precision + recall)

# Save the combined confusion matrix to a new CSV file
combined_confusion_matrix.to_csv('metrics/combined_confusion_matrix.csv', index=False)

# Print the calculated metrics
print(f'Combined Precision: {precision}')
print(f'Combined Recall: {recall}')
print(f'Combined F1-Score: {f1}')

Combined Precision: 0.82
Combined Recall: 0.82
Combined F1-Score: 0.82


## Data Cleaning: Method 2  
Upon analysis, opportunities for enhancement in our cleaning method surfaced, leading to the implementation of a new cleaning function.  
In the revised cleaning method (Method 2), emphasis was placed on stripping markdown text while adopting a strategy of replacing certain text elements to uphold the intended meaning.

In [57]:
# Function to convert Markdown to plain text
def strip_markdown(text):
    # Remove Markdown links
    text = re.sub(r'\[([^\]]*)\]\([^\)]*\)', r'\1', text)
    
    # Remove Markdown emphasis (* or _)
    text = re.sub(r'(\*|_)(.*?)\1', r'\2', text)
    
    # Remove Markdown inline code (`)
    text = re.sub(r'`([^`]+)`', r'\1', text)
    
    # Remove Markdown headers (##, ###, etc.)
    text = re.sub(r'#+\s*(.*?)\n', r'\1\n', text)
    
    # Remove other Markdown elements as needed
    
    return text

# Initialize counters for text cleaning
cleaned_count = 0
original_count = 0

def clean_text(text):
    global cleaned_count, original_count

    if not isinstance(text, str):
        original_count += 1
        return text

######################################
#        Standardize The Text        #
######################################

    # Lowercasing should be one of the first steps to ensure uniformity
    text = text.lower()

######################################
#         Remove Characters          #
######################################

    # Remove emojis, special characters, and punctuation
    text = emoji.demojize(text)
    text = re.sub(f"[{re.escape(string.punctuation)}]", '', text)

######################################
#         Remove/Replace Text        #
######################################

    # Remove specific phrases "Website or app" and "local react development"
    text = text.replace("website or app", "")
    text = text.replace("local react development", "")

    # Replace URLs, HTML tags, user mentions, and markdown image references
    text = re.sub(r'https?://\S+|www\.\S+', '<URL>', text)
    text = re.sub(r'<.*?>', '<HTML_TAG>', text)
    text = re.sub(r'@\w+', '<USER>', text)
    text = re.sub(r'!\[image\]\(.*?\)', '<IMAGE>', text)

    # Remove text starting with "DevTools" and ending with "(automated)"
    text = re.sub(r'DevTools.*?\(automated\)', '', text)



        # Strip markdown formatting
    text = strip_markdown(text)

######################################
#        Tidy Up Whitespaces         #
######################################

    # Remove consecutive whitespaces and replace with a single space
    text = re.sub(r'\s+', ' ', text)

######################################
#            Final Things            #
######################################

    # Tokenize the text into words
    words = text.split()

    # Remove words that are over 20 characters
    words = [word for word in words if len(word) <= 20]

    # Join the remaining words back into cleaned text
    cleaned_text = ' '.join(words)

    cleaned_count += 1
    return cleaned_text

# Applying clean_text function to test and train data
test_data['body'] = test_data['body'].apply(clean_text)
test_data['title'] = test_data['title'].apply(clean_text)

train_data['body'] = train_data['body'].apply(clean_text)
train_data['title'] = train_data['title'].apply(clean_text)

# Displaying cleaning statistics
print(f"Cleaned {cleaned_count} times.")
print(f"Returned original text {original_count} times.")

Cleaned 5998 times.
Returned original text 2 times.


In [54]:
test_data_facebook = test_data[: 300]
test_data_tensorflow = test_data[300: 600]
test_data_microsoft = test_data[600: 900]
test_data_bitcoin = test_data[900: 1200]
test_data_opencv= test_data[1200: 1500]


test_data_tensorflow

Unnamed: 0,repo,created_at,label,title,body
300,tensorflow/tensorflow,2023-09-05 05:58:09,question,tensorflow lite in play services issue not my ...,system information android device information ...
301,tensorflow/tensorflow,2023-09-02 18:05:04,question,what is generatevocab func,you referenced in this to if i understand corr...
302,tensorflow/tensorflow,2023-08-24 19:52:10,question,error starting tensorflow in python,issue type others have you reproduced the bug ...
303,tensorflow/tensorflow,2023-08-11 18:32:31,question,tensorflow profiler running into oom issue on gpu,issue type support have you reproduced the bug...
304,tensorflow/tensorflow,2023-08-10 04:03:47,question,tensorflow profiler running into oom issue on gpu,issue type support have you reproduced the bug...
...,...,...,...,...,...
595,tensorflow/tensorflow,2023-06-18 22:44:17,bug,uncaught exception in zmqstream callback when ...,detailssummaryclick to expandsummary issue typ...
596,tensorflow/tensorflow,2023-06-13 19:41:41,bug,w could not load dynamic library libcudnnso8 d...,detailssummaryclick to expandsummary issue typ...
597,tensorflow/tensorflow,2023-06-12 18:07:28,bug,unexpected failure when preparing tensor alloc...,i have converted my densenet121 model to model...
598,tensorflow/tensorflow,2023-06-12 10:32:27,bug,documentation bug：the description of padding,detailssummaryclick to expandsummary issue typ...


In [55]:
train_data_facebook = train_data[: 300]
train_data_tensorflow = train_data[300: 600]
train_data_microsoft = train_data[600: 900]
train_data_bitcoin = train_data[900: 1200]
train_data_opencv= train_data[1200: 1500]


train_data_tensorflow

Unnamed: 0,repo,created_at,label,title,body
300,tensorflow/tensorflow,2023-09-07 05:31:36,question,add suppport for vedv,please go to stack overflow for help and suppo...
301,tensorflow/tensorflow,2023-09-03 17:53:40,question,cant run bertvocabfromdataset without typeerro...,issue type support have you reproduced the bug...
302,tensorflow/tensorflow,2023-09-01 16:34:57,question,float16 mixed precision training,issue type bug have you reproduced the bug wit...
303,tensorflow/tensorflow,2023-08-19 06:53:28,question,ckpt to tflite,how can i convert ckpt file to tf lite while i...
304,tensorflow/tensorflow,2023-08-11 14:02:25,question,when converting tensorflow model to tflite mod...,i took a pretrained model ssd mobilenet 320x32...
...,...,...,...,...,...
595,tensorflow/tensorflow,2023-06-19 05:35:02,bug,tensorboard histogram onehot operation causing...,detailssummaryclick to expandsummary issue typ...
596,tensorflow/tensorflow,2023-06-16 12:20:44,bug,tftestgpudevicename leads to soft lockup and u...,detailssummaryclick to expandsummary issue typ...
597,tensorflow/tensorflow,2023-06-12 21:17:01,bug,tfdatadatasetmap does not support randomization,detailssummaryclick to expandsummary issue typ...
598,tensorflow/tensorflow,2023-06-12 10:35:48,bug,functional bug：could not interpret serialized ...,detailssummaryclick to expandsummary issue typ...


## Improved models  
Upon analyzing the step metrics of the fine-tuned models, it became evident that certain models, specifically those associated with the TensorFlow, Microsoft, and OpenCV repositories, exhibited training_loss figures indicating potential for improvement.

Considering this insight, we opted to develop new fine-tuned models, augmenting the epochs and integrating the enhanced cleaning method for these specific repositories.

## Tensorflow 2nd model
New cleaning and 10 epochs

In [None]:
import tiktoken

max_content_tokens = 3999
encoding = tiktoken.get_encoding("cl100k_base")
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

# Function to truncate the message and avoid passing the limit of 4k tokens per gpt-3.5 fine-tuned model limitations
def truncate_message(message, max_length):
    tokens = encoding.encode(message)
    if len(tokens) > max_length:
        truncated_tokens = tokens[:max_length]
        message = encoding.decode(truncated_tokens)
    return message

# Open the file in write mode
with open('data/conversationaldata/conversational_data_tensorflow_new.jsonl', 'w', encoding='utf-8') as f:
    # Iterate over the rows in the DataFrame
    for index, row in train_data_tensorflow.iterrows():
        # Create the user message by formatting the prompt with the title and body
        user_message = f"Classify, IN ONLY 1 WORD, the following GitHub issue as 'feature', 'bug', or 'question' based on its title and body:\n{row['title']}\n{row['body']}"
        
        # Truncate the prompt if necessary
        user_message = truncate_message(user_message, max_content_tokens)
        # Create the assistant message by taking the label
        assistant_message = row['label']
        
        # Construct the conversation object
        conversation_object = {
            "messages": [
                {"role": "system", "content": "GitHub Issue Report Classifier"},
                {"role": "user", "content": user_message},
                {"role": "assistant", "content": assistant_message}
            ]
        }
        
        # Write the conversation object to one line in the file
        f.write(json.dumps(conversation_object, ensure_ascii=False) + '\n')

In [None]:
client.files.create(
  file=open("data/conversationaldata/conversational_data_tensorflow_new.jsonl", "rb"),
  purpose="fine-tune"
)

FileObject(id='file-BJCK5M6m756cgb0G6peUXb91', bytes=547089, created_at=1701744292, filename='conversational_data_tensorflow_new.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [None]:
## Creating a fine-tuned model
client.fine_tuning.jobs.create(
  training_file="file-BJCK5M6m756cgb0G6peUXb91", 
  model="gpt-3.5-turbo",
  suffix= "tf-issueclassifier",
  hyperparameters={"n_epochs": 10}
)

FineTuningJob(id='ftjob-AbcuR8vh5M8kQDIV8hhLoXpe', created_at=1701744304, error=None, fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=10, batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0613', object='fine_tuning.job', organization_id='org-CpaRU3Zq9ePCCtbhezmcbgrg', result_files=[], status='validating_files', trained_tokens=None, training_file='file-BJCK5M6m756cgb0G6peUXb91', validation_file=None)

In [None]:
# Retrieving the state of a fine-tune
tensorflow_ft_model_new = client.fine_tuning.jobs.retrieve('ftjob-AbcuR8vh5M8kQDIV8hhLoXpe').fine_tuned_model
print(tensorflow_ft_model_new)

ft:gpt-3.5-turbo-0613:gcucst440:tf-issueclassifier:8SGhlOsl


In [None]:
# List up to 20 events from a fine-tuning job to track progress
client.fine_tuning.jobs.list_events(fine_tuning_job_id="ftjob-AbcuR8vh5M8kQDIV8hhLoXpe", limit=20)

SyncCursorPage[FineTuningJobEvent](data=[FineTuningJobEvent(id='ftevent-ZCUD3gwzt8wE28XJvFqRMQ8a', created_at=1701748093, level='info', message='The job has successfully completed', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-Hyf9n5MhisMIAl0GT4VOtfk5', created_at=1701748090, level='info', message='New fine-tuned model created: ft:gpt-3.5-turbo-0613:gcucst440:tf-issueclassifier:8SGhlOsl', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-QSAzXICyO3swQNZPBjUUYuAu', created_at=1701747839, level='info', message='Step 1401/1500: training loss=0.00', object='fine_tuning.job.event', data={'step': 1401, 'train_loss': 6.35782896551973e-07, 'train_mean_token_accuracy': 1.0}, type='metrics'), FineTuningJobEvent(id='ftevent-Tp65vZhe34zzUv5iSMPtFTUd', created_at=1701747592, level='info', message='Step 1301/1500: training loss=0.00', object='fine_tuning.job.event', data={'step': 1301, 'train_loss': 6.35782896551973e-

In [None]:
y_true_tf_new = []
y_pred_tf_new = []

iterations = len(test_data_tensorflow)

# Now let's loop through the test data and classify the GitHub pull requests
for i in range(iterations):
    correct_label = test_data_tensorflow.iloc[i]['label'].lower()
    description = f"{test_data_tensorflow.iloc[i]['title']} \n {test_data_tensorflow.iloc[i]['body']}"
    print(f"Correct PR type: {correct_label}")
    
    prompt = f"Classify, IN ONLY 1 WORD, the following GitHub pull request as 'feature', 'bug', or 'question' based on its title and body:\n{description}"
    response = query_chatgpt(prompt, tensorflow_ft_model_new)
    
    if response is None:
        print("Failed to get a response after several retries. Skipping this item.")
        continue  # Skip this iteration and move to the next one
    
    # Clean the response to keep only letters (and optionally numbers)
    predicted_label = re.sub(r'[^A-Za-z]+', '', response).lower().strip()
    print(f"Predicted PR type: {predicted_label}")
    
    # Append to lists for evaluation
    y_true_tf_new.append(correct_label)
    y_pred_tf_new.append(predicted_label)
    time.sleep(6)  # Wait for 6 seconds before retrying

Correct PR type: question
Predicted PR type: question
Correct PR type: question
Predicted PR type: question
Correct PR type: question
Predicted PR type: question
Correct PR type: question
Predicted PR type: question
Correct PR type: question
Predicted PR type: question
Correct PR type: question
Predicted PR type: question
Correct PR type: question
Predicted PR type: question
Correct PR type: question
Predicted PR type: question
Correct PR type: question
Predicted PR type: question
Correct PR type: question
Predicted PR type: question
Correct PR type: question
Predicted PR type: question
Correct PR type: question
Predicted PR type: question
Correct PR type: question
Predicted PR type: question
Correct PR type: question
Predicted PR type: question
Correct PR type: question
Predicted PR type: question
Correct PR type: question
Predicted PR type: question
Correct PR type: question
Predicted PR type: question
Correct PR type: question
Predicted PR type: question
Correct PR type: question
Pr

In [52]:
# Calculate weighted average F1-score, precision, and recall
f1_tf_new = f1_score(y_true_tf_new, y_pred_tf_new, labels=labels, average='weighted')
precision_tf_new = precision_score(y_true_tf_new, y_pred_tf_new, labels=labels, average='weighted')
recall_tf_new = recall_score(y_true_tf_new, y_pred_tf_new, labels=labels, average='weighted')

# Calculate confusion matrix
cm_tf_new = confusion_matrix(y_true_tf_new, y_pred_tf_new, labels=labels)

cm_df_tf_new = pd.DataFrame(cm_tf_new, index=labels, columns=labels)

# Calculate TP, FP, FN, TN
results_tf_new = {}
for i, label in enumerate(labels):
    results_tf_new[label] = {'TP': cm_tf_new[i, i]}
    results_tf_new[label]['FP'] = cm_tf_new[:, i].sum() - cm_tf_new[i, i]
    results_tf_new[label]['FN'] = cm_tf_new[i, :].sum() - cm_tf_new[i, i]
    results_tf_new[label]['TN'] = cm_tf_new.sum() - (results_tf_new[label]['TP'] + results_tf_new[label]['FP'] + results_tf_new[label]['FN'])

# Print results_tf_new
for label, metrics in results_tf_new.items():
    print(f"{label}: {metrics}")

# Save results_tf_new to CSV
results_tf_new_df = pd.DataFrame(results_tf_new).T
results_tf_new_df['F1-score'] = f1_tf_new
results_tf_new_df['Recall'] = recall_tf_new
results_tf_new_df['Precision'] = precision_tf_new

results_tf_new_df.to_csv('metrics/confusion_matrix_tf_new.csv')

print(f"Precision = {precision_tf_new}")
print(f"Recall = {recall_tf_new}")
print(f"F1-score = {f1_tf_new}")

feature: {'TP': 82, 'FP': 6, 'FN': 18, 'TN': 194}
bug: {'TP': 88, 'FP': 9, 'FN': 12, 'TN': 191}
question: {'TP': 91, 'FP': 24, 'FN': 9, 'TN': 176}
Precision = 0.8767796748298766
Recall = 0.87
F1-score = 0.8707510228891061


In [53]:
# Create a classification report
report_tf_new = classification_report(y_true_tf_new, y_pred_tf_new, labels=['bug', 'feature', 'question'], target_names=['bug', 'feature', 'question'], zero_division=0, output_dict=True)

# Convert the report to a DataFrame
report_df_tf_new = pd.DataFrame(report_tf_new).transpose()

# Print the classification report
print(report_df_tf_new)

              precision  recall  f1-score  support
bug            0.907216    0.88  0.893401   100.00
feature        0.931818    0.82  0.872340   100.00
question       0.791304    0.91  0.846512   100.00
accuracy       0.870000    0.87  0.870000     0.87
macro avg      0.876780    0.87  0.870751   300.00
weighted avg   0.876780    0.87  0.870751   300.00


## OpenCV New Model
New cleaning and 6 epochs
Employing a new cleaning method alongside 6 epochs, we chose this iteration as our experimentation with 10 epochs indicated it to be excessive for our TensorFlow model.

In [58]:
# Open the file in write mode
with open('data/conversationaldata/conversational_data_opencv_new.jsonl', 'w', encoding='utf-8') as f:
    # Iterate over the rows in the DataFrame
    for index, row in train_data_opencv.iterrows():
        # Create the user message by formatting the prompt with the title and body
        user_message = f"Classify, IN ONLY 1 WORD, the following GitHub issue as 'feature', 'bug', or 'question' based on its title and body:\n{row['title']}\n{row['body']}"
        
        # Truncate the prompt if necessary
        user_message = truncate_message(user_message, max_content_tokens)
        # Create the assistant message by taking the label
        assistant_message = row['label']
        
        # Construct the conversation object
        conversation_object = {
            "messages": [
                {"role": "system", "content": "GitHub Issue Report Classifier"},
                {"role": "user", "content": user_message},
                {"role": "assistant", "content": assistant_message}
            ]
        }
        
        # Write the conversation object to one line in the file
        f.write(json.dumps(conversation_object, ensure_ascii=False) + '\n')

In [59]:
client.files.create(
  file=open("data/conversationaldata/conversational_data_opencv_new.jsonl", "rb"),
  purpose="fine-tune"
)

FileObject(id='file-BXCWSF32dn7asPO4Nfxqyhq5', bytes=605037, created_at=1701752486, filename='conversational_data_opencv_new.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [60]:
## Creating a fine-tuned model
client.fine_tuning.jobs.create(
  training_file="file-BXCWSF32dn7asPO4Nfxqyhq5", 
  model="gpt-3.5-turbo",
  suffix= "oc-issueclassifier",
  hyperparameters={"n_epochs": 6}
)

FineTuningJob(id='ftjob-rNIHMHUIy97BY1eDQDttqEZy', created_at=1701752539, error=None, fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=6, batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0613', object='fine_tuning.job', organization_id='org-CpaRU3Zq9ePCCtbhezmcbgrg', result_files=[], status='validating_files', trained_tokens=None, training_file='file-BXCWSF32dn7asPO4Nfxqyhq5', validation_file=None)

In [88]:
# Retrieving the state of a fine-tune
opencv_ft_model_new = client.fine_tuning.jobs.retrieve('ftjob-rNIHMHUIy97BY1eDQDttqEZy').fine_tuned_model
print(opencv_ft_model_new)

ft:gpt-3.5-turbo-0613:gcucst440:oc-issueclassifier:8SJ2ph8V


In [72]:
# List up to 20 events from a fine-tuning job to track progress
client.fine_tuning.jobs.list_events(fine_tuning_job_id="ftjob-rNIHMHUIy97BY1eDQDttqEZy", limit=20)

SyncCursorPage[FineTuningJobEvent](data=[FineTuningJobEvent(id='ftevent-3gPPqvCgj8ZVrP2diHc7gQmq', created_at=1701757087, level='info', message='The job has successfully completed', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-jUjfql7Umt1Kb5MJyI4OfnrN', created_at=1701757084, level='info', message='New fine-tuned model created: ft:gpt-3.5-turbo-0613:gcucst440:oc-issueclassifier:8SJ2ph8V', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-BHsj8jOKPrw6nvXbLGCfYt4w', created_at=1701756823, level='info', message='Step 1701/1800: training loss=0.00', object='fine_tuning.job.event', data={'step': 1701, 'train_loss': 6.35782896551973e-07, 'train_mean_token_accuracy': 1.0}, type='metrics'), FineTuningJobEvent(id='ftevent-7Te7WV3HBwUUSheVNrzMRJDl', created_at=1701756575, level='info', message='Step 1601/1800: training loss=0.00', object='fine_tuning.job.event', data={'step': 1601, 'train_loss': 6.35782896551973e-

In [73]:
y_true_oc_new = []
y_pred_oc_new = []

iterations = len(test_data_opencv)

# Now let's loop through the test data and classify the GitHub pull requests
for i in range(iterations):
    correct_label = test_data_opencv.iloc[i]['label'].lower()
    description = f"{test_data_opencv.iloc[i]['title']}\n{test_data_opencv.iloc[i]['body']}"
    print(f"Correct PR type: {correct_label}")
    
    prompt = f"Classify, IN ONLY 1 WORD, the following GitHub pull request as 'feature', 'bug', or 'question' based on its title and body:\n{description}"
    response = query_chatgpt(prompt, opencv_ft_model_new)
    
    if response is None:
        print("Failed to get a response after several retries. Skipping this item.")
        continue  # Skip this iteration and move to the next one
    
    # Clean the response to keep only letters (and optionally numbers)
    predicted_label = re.sub(r'[^A-Za-z]+', '', response).lower().strip()
    print(f"Predicted PR type: {predicted_label}")
    
    # Append to lists for evaluation
    y_true_oc_new.append(correct_label)
    y_pred_oc_new.append(predicted_label)
    time.sleep(6)  # Wait for 6 seconds before retrying

Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: feature
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: question
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Pr

In [74]:
# Calculate weighted average F1-score, precision, and recall
f1_oc_new = f1_score(y_true_oc_new, y_pred_oc_new, labels=labels, average='weighted')
precision_oc_new = precision_score(y_true_oc_new, y_pred_oc_new, labels=labels, average='weighted')
recall_oc_new = recall_score(y_true_oc_new, y_pred_oc_new, labels=labels, average='weighted')

# Calculate confusion matrix
cm_oc_new = confusion_matrix(y_true_oc_new, y_pred_oc_new, labels=labels)

cm_df_oc_new = pd.DataFrame(cm_oc_new, index=labels, columns=labels)

# Calculate TP, FP, FN, TN
results_oc_new = {}
for i, label in enumerate(labels):
    results_oc_new[label] = {'TP': cm_oc_new[i, i]}
    results_oc_new[label]['FP'] = cm_oc_new[:, i].sum() - cm_oc_new[i, i]
    results_oc_new[label]['FN'] = cm_oc_new[i, :].sum() - cm_oc_new[i, i]
    results_oc_new[label]['TN'] = cm_oc_new.sum() - (results_oc_new[label]['TP'] + results_oc_new[label]['FP'] + results_oc_new[label]['FN'])

# Print results_oc_new
for label, metrics in results_oc_new.items():
    print(f"{label}: {metrics}")

# Save results_oc_new to CSV
results_oc_new_df = pd.DataFrame(results_oc_new).T
results_oc_new_df['F1-score'] = f1_oc_new
results_oc_new_df['Recall'] = recall_oc_new
results_oc_new_df['Precision'] = precision_oc_new

results_oc_new_df.to_csv('metrics/confusion_matrix_oc_new.csv')

print(f"Precision = {precision_oc_new}")
print(f"Recall = {recall_oc_new}")
print(f"F1-score = {f1_oc_new}")

feature: {'TP': 80, 'FP': 8, 'FN': 20, 'TN': 192}
bug: {'TP': 86, 'FP': 32, 'FN': 14, 'TN': 168}
question: {'TP': 81, 'FP': 13, 'FN': 19, 'TN': 187}
Precision = 0.8332021986908391
Recall = 0.8233333333333334
F1-score = 0.8250354006223534


In [75]:
# Create a classification report
report_oc_new = classification_report(y_true_oc_new, y_pred_oc_new, labels=['bug', 'feature', 'question'], target_names=['bug', 'feature', 'question'], zero_division=0, output_dict=True)

# Convert the report to a DataFrame
report_df_oc_new = pd.DataFrame(report_oc_new).transpose()

# Print the classification report
print(report_df_oc_new)

              precision    recall  f1-score     support
bug            0.728814  0.860000  0.788991  100.000000
feature        0.909091  0.800000  0.851064  100.000000
question       0.861702  0.810000  0.835052  100.000000
accuracy       0.823333  0.823333  0.823333    0.823333
macro avg      0.833202  0.823333  0.825035  300.000000
weighted avg   0.833202  0.823333  0.825035  300.000000


## Microsoft new model
Old cleaning method and 6 epochs

In [24]:
## Creating a fine-tuned model
client.fine_tuning.jobs.create(
  training_file="file-o2pr8FckpXuZhncbVF9Fykzl", ## Using same file with old cleaning method
  model=microsoft_ft_model, ## Using old model as the base model
  suffix= "ms-issueclassifier",
  hyperparameters={"n_epochs": 3}
)

FineTuningJob(id='ftjob-ZRVLAfBawJPXM5OB06FPLSur', created_at=1701829890, error=None, fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=3, batch_size='auto', learning_rate_multiplier='auto'), model='ft:gpt-3.5-turbo-0613:gcucst440:ms-issueclassifier:8LLFl5QI', object='fine_tuning.job', organization_id='org-CpaRU3Zq9ePCCtbhezmcbgrg', result_files=[], status='validating_files', trained_tokens=None, training_file='file-o2pr8FckpXuZhncbVF9Fykzl', validation_file=None)

In [31]:
# Retrieving the state of a fine-tune
microsoft_ft_model_new = client.fine_tuning.jobs.retrieve('ftjob-ZRVLAfBawJPXM5OB06FPLSur').fine_tuned_model
print(microsoft_ft_model_new)

ft:gpt-3.5-turbo-0613:gcucst440:ms-issueclassifier:8ScXRY4K


In [30]:
# List up to 20 events from a fine-tuning job to track progress
client.fine_tuning.jobs.list_events(fine_tuning_job_id="ftjob-ZRVLAfBawJPXM5OB06FPLSur", limit=20)

SyncCursorPage[FineTuningJobEvent](data=[FineTuningJobEvent(id='ftevent-MzqWg9M8LLkaQWF0bgvcqWqN', created_at=1701832021, level='info', message='The job has successfully completed', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-Mk5DH2psX60hlkZZspLqiHcn', created_at=1701832017, level='info', message='New fine-tuned model created: ft:gpt-3.5-turbo-0613:gcucst440:ms-issueclassifier:8ScXRY4K', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-Rp7izWe9o5gWribnDMdMVYWk', created_at=1701831990, level='info', message='Step 891/900: training loss=0.00', object='fine_tuning.job.event', data={'step': 891, 'train_loss': 6.35782896551973e-07, 'train_mean_token_accuracy': 1.0}, type='metrics'), FineTuningJobEvent(id='ftevent-4sgDHQ6fN0rJTgQhm1WUGRBD', created_at=1701831969, level='info', message='Step 881/900: training loss=0.00', object='fine_tuning.job.event', data={'step': 881, 'train_loss': 6.35782896551973e-07, 't

In [33]:
y_true_ms_new = []
y_pred_ms_new = []

iterations = len(test_data_microsoft)

# Now let's loop through the test data and classify the GitHub pull requests
for i in range(iterations):
    correct_label = test_data_microsoft.iloc[i]['label'].lower()
    description = f"{test_data_microsoft.iloc[i]['title']}\n{test_data_microsoft.iloc[i]['body']}"
    print(f"Correct PR type: {correct_label}")
    
    prompt = f"Classify, IN ONLY 1 WORD, the following GitHub pull request as 'feature', 'bug', or 'question' based on its title and body:\n{description}"
    response = query_chatgpt(prompt, microsoft_ft_model_new)
    
    if response is None:
        print("Failed to get a response after several retries. Skipping this item.")
        continue  # Skip this iteration and move to the next one
    
    # Clean the response to keep only letters (and optionally numbers)
    predicted_label = re.sub(r'[^A-Za-z]+', '', response).lower().strip()
    print(f"Predicted PR type: {predicted_label}")
    
    # Append to lists for evaluation
    y_true_ms_new.append(correct_label)
    y_pred_ms_new.append(predicted_label)
    time.sleep(6)  # Wait for 6 seconds before retrying

Correct PR type: bug
Predicted PR type: feature
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: feature
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Pre

In [34]:
# Calculate weighted average F1-score, precision, and recall
f1_ms_new = f1_score(y_true_ms_new, y_pred_ms_new, labels=labels, average='weighted')
precision_ms_new = precision_score(y_true_ms_new, y_pred_ms_new, labels=labels, average='weighted')
recall_ms_new = recall_score(y_true_ms_new, y_pred_ms_new, labels=labels, average='weighted')

# Calculate confusion matrix
cm_ms_new = confusion_matrix(y_true_ms_new, y_pred_ms_new, labels=labels)

cm_df_ms_new = pd.DataFrame(cm_ms_new, index=labels, columns=labels)

# Calculate TP, FP, FN, TN
results_ms_new = {}
for i, label in enumerate(labels):
    results_ms_new[label] = {'TP': cm_ms_new[i, i]}
    results_ms_new[label]['FP'] = cm_ms_new[:, i].sum() - cm_ms_new[i, i]
    results_ms_new[label]['FN'] = cm_ms_new[i, :].sum() - cm_ms_new[i, i]
    results_ms_new[label]['TN'] = cm_ms_new.sum() - (results_ms_new[label]['TP'] + results_ms_new[label]['FP'] + results_ms_new[label]['FN'])

# Print results_ms_new
for label, metrics in results_ms_new.items():
    print(f"{label}: {metrics}")

# Save results_ms_new to CSV
results_ms_new_df = pd.DataFrame(results_ms_new).T
results_ms_new_df['F1-score'] = f1_ms_new
results_ms_new_df['Recall'] = recall_ms_new
results_ms_new_df['Precision'] = precision_ms_new


results_ms_new_df.to_csv('metrics/confusion_matrix_ms_new.csv')

print(f"Precision = {precision_ms_new}")
print(f"Recall = {recall_ms_new}")
print(f"F1-score = {f1_ms_new}")

feature: {'TP': 87, 'FP': 20, 'FN': 13, 'TN': 180}
bug: {'TP': 80, 'FP': 14, 'FN': 20, 'TN': 186}
question: {'TP': 79, 'FP': 20, 'FN': 21, 'TN': 180}
Precision = 0.8207092466388549
Recall = 0.82
F1-score = 0.8197639424774652


In [35]:
# Create a classification report
report_ms_new = classification_report(y_true_ms_new, y_pred_ms_new, labels=['bug', 'feature', 'question'], target_names=['bug', 'feature', 'question'], zero_division=0, output_dict=True)

# Convert the report to a DataFrame
report_df_ms_new = pd.DataFrame(report_ms_new).transpose()

# Print the classification report
print(report_df_ms_new)

              precision  recall  f1-score  support
bug            0.851064    0.80  0.824742   100.00
feature        0.813084    0.87  0.840580   100.00
question       0.797980    0.79  0.793970   100.00
accuracy       0.820000    0.82  0.820000     0.82
macro avg      0.820709    0.82  0.819764   300.00
weighted avg   0.820709    0.82  0.819764   300.00


## Overall Results

In [60]:
from functools import reduce
import pandas as pd

# Filenames of your CSV files
csv_files = ['metrics/confusion_matrix_fb.csv', 'metrics/confusion_matrix_tf_new.csv', 
             'metrics/confusion_matrix_ms_new.csv', 'metrics/confusion_matrix_bc.csv', 
             'metrics/confusion_matrix_oc_new.csv']

# Initialize empty DataFrame to store combined confusion matrix
combined_confusion_matrix = pd.DataFrame(columns=['feature', 'bug', 'question'])

# Loop through each file and accumulate confusion matrix
for file in csv_files:
    df = pd.read_csv(file, usecols=['TP', 'FP', 'FN', 'TN'])
    combined_confusion_matrix = combined_confusion_matrix.add(df, fill_value=0)

bug_row = combined_confusion_matrix.iloc[0]
feature_row = combined_confusion_matrix.iloc[1]
question_row = combined_confusion_matrix.iloc[2]

# Function to calculate precision, recall, and F1 score
def calculate_metrics(row):
    tp = row['TP']
    fp = row['FP']
    fn = row['FN']
    tn = row['TN']

    precision = tp / (tp + fp) if tp + fp != 0 else 0
    recall = tp / (tp + fn) if tp + fn != 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall != 0 else 0

    return precision, recall, f1

# Calculating metrics for each category
bug_precision, bug_recall, bug_f1 = calculate_metrics(bug_row)
feature_precision, feature_recall, feature_f1 = calculate_metrics(feature_row)
question_precision, question_recall, question_f1 = calculate_metrics(question_row)

# Calculating overall metrics
overall_tp = combined_confusion_matrix['TP'].sum()
overall_fp = combined_confusion_matrix['FP'].sum()
overall_fn = combined_confusion_matrix['FN'].sum()
overall_tn = combined_confusion_matrix['TN'].sum()

overall_precision, overall_recall, overall_f1 = calculate_metrics({
    'TP': overall_tp, 'FP': overall_fp, 'FN': overall_fn, 'TN': overall_tn
})

# Formatting the results
formatted_metrics = {
    "Bug": {"Precision": "{:.4f}".format(bug_precision), 
            "Recall": "{:.4f}".format(bug_recall), 
            "F1-Score": "{:.4f}".format(bug_f1)},
    "Feature": {"Precision": "{:.4f}".format(feature_precision), 
                "Recall": "{:.4f}".format(feature_recall), 
                "F1-Score": "{:.4f}".format(feature_f1)},
    "Question": {"Precision": "{:.4f}".format(question_precision), 
                 "Recall": "{:.4f}".format(question_recall), 
                 "F1-Score": "{:.4f}".format(question_f1)},
    "Overall": {"Precision": "{:.4f}".format(overall_precision), 
                "Recall": "{:.4f}".format(overall_recall), 
                "F1-Score": "{:.4f}".format(overall_f1)}
}

formatted_metrics


{'Bug': {'Precision': '0.8644', 'Recall': '0.8540', 'F1-Score': '0.8592'},
 'Feature': {'Precision': '0.8064', 'Recall': '0.8580', 'F1-Score': '0.8314'},
 'Question': {'Precision': '0.8165', 'Recall': '0.7740', 'F1-Score': '0.7947'},
 'Overall': {'Precision': '0.8287', 'Recall': '0.8287', 'F1-Score': '0.8287'}}