# LLM Issue Domain Classification

# Installs

In [31]:
%pip install pandas emoji openai tiktoken scikit-learn openai

^C
Note: you may need to restart the kernel to use updated packages.


# Library Imports

In [2]:
# Importing libraries
import pandas as pd
import emoji
import re
import string
import json
import ast
import openai


# Data Import

In [3]:
df = pd.read_csv("DATA/linked_data.csv")

df.head()

Unnamed: 0,Issue #,PR #,Pull Request,issue text,issue description,created_at,closed_at,userlogin,author_name,most_recent_commit,...,Utility-Performance Tools,Utility-Diagnostic Utilities,Utility-Backup Tools,Test,Test-Unit Testing,Test-Integration Testing,Test-Performance Testing,Test-Security Testing,Test-Usability Testing,Test-Regression Testing
0,11281,11285,True,Position (width) of Preview inside entry edito...,### JabRef version\r\n\r\nLatest development b...,2024-05-08T19:44:13Z,2024-05-08T20:44:29Z,Siedlerchr,Siedlerchr,ce830d233d3b4d03b8fcf25b506e51d25e8ce541,...,0,0,0,0,0,0,0,0,0,0
1,11269,11271,True,Selected group loses focus after paste doi,JabRef 5.14--2024-04-30--4f87740\r\nWindows 11...,2024-05-01T11:21:50Z,2024-05-02T09:39:47Z,Siedlerchr,Siedlerchr,3cb6756682e53ebd7dba092ee52beae95befa450,...,0,0,0,0,0,0,0,0,0,0
2,11267,11268,True,"Keep search string across library shows ""No re...",### JabRef version\n\nLatest development branc...,2024-04-30T05:00:17Z,2024-04-30T06:14:38Z,LoayGhreeb,Loay Ghreeb,ad4e88609852069c745183cb99bcd7c9eaf4ac63,...,0,0,0,0,0,0,0,0,0,0
3,11254,11255,True,Drag and dropping an entry from one library to...,### JabRef version\n\n5.13 (latest release)\n\...,2024-04-28T18:35:47Z,2024-04-30T12:33:54Z,HoussemNasri,HoussemNasri,9f259c9c22e570ab33b2f45d608add1e064b1391,...,0,0,0,0,0,0,0,0,0,0
4,11198,11195,True,InaccessibleObjectException,### JabRef version\n\nLatest development branc...,2024-04-15T11:04:15Z,2024-04-15T21:08:09Z,koppor,Carl Christian Snethlage,cff1f640734f38506ce9bf453e651252538187d0,...,0,0,0,0,0,0,0,0,0,0


# Preprocessing

Preprocessing steps involve cleaning the issue title and body text. On first approach I am using the approach found in the folowing project, https://github.com/G4BE-334/NLBSE-issue-report-classification/blob/main/issueclassificationgpt.ipynb 
This is doing similar work in labeling github issues and provides a way to remove noise commonly found in github text (html tags, links, etc.)

## Filter Out Rare and Common Domains

Filtering out domains based on a threshold allows us to prioritize/emphasize domains that are common in a repository with common domains (present in more than 90% of issues) representing skills that are required for the repo as a whole and rare domains (present in less than 10% of issues) representing skills that are rarely required in the repo.

In [65]:
df = pd.read_csv("DATA/linked_data.csv")

df.head()
domains = df.columns[15:]
columns_to_drop = []
occurrence_dictionary = {}
for domain in domains:
    column_values = df[domain].tolist()
    occurrence = column_values.count(1)
    lower_bound = int(len(df) * 0.40)
    upper_bound = int(len(df) * 0.80)
    
    if occurrence < lower_bound or occurrence > upper_bound:
        columns_to_drop.append(domain)
    else:
        occurrence_dictionary[domain] = occurrence

df = df.drop(columns=columns_to_drop)

print(len(df.columns[15:]))
df.columns[15:]

22


Index(['Computer Graphics', 'Data Structure-Tree Structures',
       'Data Structure-Data Sorting', 'Data Structure-Search Algorithms',
       'Data Structure-Data Manipulation', 'Databases',
       'Databases-Query Execution',
       'Software Development and IT Operations-Version Control',
       'Software Development and IT Operations-Monitoring and Logging',
       'Error Handling', 'Event Handling',
       'Event Handling-Event Driven Processing', 'Input-Output', 'Logic',
       'Language-Standard Libraries', 'Parser', 'Parser-Data Conversion',
       'Parser-Validation', 'User Interface-Layout Design',
       'User Interface-Interaction Design', 'User Interface-Accessibility',
       'User Interface-User Feedback'],
      dtype='object')

In [66]:
print(occurrence_dictionary)

{'Computer Graphics': 596, 'Data Structure-Tree Structures': 641, 'Data Structure-Data Sorting': 510, 'Data Structure-Search Algorithms': 569, 'Data Structure-Data Manipulation': 636, 'Databases': 664, 'Databases-Query Execution': 402, 'Software Development and IT Operations-Version Control': 774, 'Software Development and IT Operations-Monitoring and Logging': 653, 'Error Handling': 765, 'Event Handling': 447, 'Event Handling-Event Driven Processing': 400, 'Input-Output': 487, 'Logic': 659, 'Language-Standard Libraries': 752, 'Parser': 714, 'Parser-Data Conversion': 407, 'Parser-Validation': 451, 'User Interface-Layout Design': 643, 'User Interface-Interaction Design': 785, 'User Interface-Accessibility': 528, 'User Interface-User Feedback': 721}


In [67]:
def sort_dict_by_values(d, reverse=True):
    # Sort the dictionary by its values
    sorted_dict = dict(sorted(d.items(), key=lambda item: item[1], reverse=reverse))
    return sorted_dict
    
def get_top_domains(n, d, df):
    counter = 1
    columns_to_drop = []
    for key, value in d.items():
        if counter > n:
            columns_to_drop.append(key)
        counter += 1
    df = df.drop(columns=columns_to_drop)
    return df
occurrence_dictionary = sort_dict_by_values(occurrence_dictionary)
print(occurrence_dictionary)



{'User Interface-Interaction Design': 785, 'Software Development and IT Operations-Version Control': 774, 'Error Handling': 765, 'Language-Standard Libraries': 752, 'User Interface-User Feedback': 721, 'Parser': 714, 'Databases': 664, 'Logic': 659, 'Software Development and IT Operations-Monitoring and Logging': 653, 'User Interface-Layout Design': 643, 'Data Structure-Tree Structures': 641, 'Data Structure-Data Manipulation': 636, 'Computer Graphics': 596, 'Data Structure-Search Algorithms': 569, 'User Interface-Accessibility': 528, 'Data Structure-Data Sorting': 510, 'Input-Output': 487, 'Parser-Validation': 451, 'Event Handling': 447, 'Parser-Data Conversion': 407, 'Databases-Query Execution': 402, 'Event Handling-Event Driven Processing': 400}


In [68]:
num_of_domains = 15

df = get_top_domains(num_of_domains, occurrence_dictionary, df)
df.columns[15:]

Index(['Computer Graphics', 'Data Structure-Tree Structures',
       'Data Structure-Search Algorithms', 'Data Structure-Data Manipulation',
       'Databases', 'Software Development and IT Operations-Version Control',
       'Software Development and IT Operations-Monitoring and Logging',
       'Error Handling', 'Logic', 'Language-Standard Libraries', 'Parser',
       'User Interface-Layout Design', 'User Interface-Interaction Design',
       'User Interface-Accessibility', 'User Interface-User Feedback'],
      dtype='object')

## Drop NaN Values

In [32]:
print(len(df))
df.dropna(subset=['issue description'], inplace=True)
print(len(df))

997
997


## Clean Issue Text

In [33]:
cleaned_count = 0
original_count = 0

# Text cleaning function
def clean_text(text):
    global cleaned_count, original_count

    if not isinstance(text, str):
        original_count += 1
        return text

    # Remove double quotation marks
    text = text.replace('"', '')

    # Remove text starting with "DevTools" and ending with "(automated)"
    text = re.sub(r'DevTools.*?\(automated\)', '', text)

    # Lowercasing should be one of the first steps to ensure uniformity
    text = text.lower()

    # Remove emojis
    text = emoji.demojize(text)

    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove special characters and punctuation
    text = re.sub(f"[{re.escape(string.punctuation)}]", '', text)

    # Remove '#' characters
    text = text.replace("#", "")

    # Remove consecutive whitespaces and replace with a single space
    text = re.sub(r'\s+', ' ', text)

    # Split the text into words
    words = text.split()

    # Remove words that are over 20 characters
    words = [word for word in words if len(word) <= 20]

    # Join the remaining words back into cleaned text
    cleaned_text = ' '.join(words)

    cleaned_count += 1
    return cleaned_text

df['issue text'] = df['issue text'].apply(clean_text)
df['issue description'] = df['issue description'].apply(clean_text)
df.head()

Unnamed: 0,Issue #,PR #,Pull Request,issue text,issue description,created_at,closed_at,userlogin,author_name,most_recent_commit,...,User Interface-Accessibility,User Interface-Animation,User Interface-User Feedback,Utility,Utility-Data Conversion,Utility-System Tools,Utility-Automation Scripts,Utility-Diagnostic Utilities,Test,Test-Unit Testing
0,11281,11285,True,position width of preview inside entry editor ...,jabref version latest development branch build...,2024-05-08T19:44:13Z,2024-05-08T20:44:29Z,Siedlerchr,Siedlerchr,ce830d233d3b4d03b8fcf25b506e51d25e8ce541,...,0,0,1,0,0,0,0,0,0,0
1,11269,11271,True,selected group loses focus after paste doi,jabref 514202404304f87740 windows 11 100 amd64...,2024-05-01T11:21:50Z,2024-05-02T09:39:47Z,Siedlerchr,Siedlerchr,3cb6756682e53ebd7dba092ee52beae95befa450,...,0,1,1,0,0,0,0,0,0,0
2,11267,11268,True,keep search string across library shows no res...,jabref version latest development branch build...,2024-04-30T05:00:17Z,2024-04-30T06:14:38Z,LoayGhreeb,Loay Ghreeb,ad4e88609852069c745183cb99bcd7c9eaf4ac63,...,1,0,1,0,0,0,0,0,0,0
3,11254,11255,True,drag and dropping an entry from one library to...,jabref version 513 latest release operating sy...,2024-04-28T18:35:47Z,2024-04-30T12:33:54Z,HoussemNasri,HoussemNasri,9f259c9c22e570ab33b2f45d608add1e064b1391,...,0,0,1,0,0,0,0,0,0,0
4,11198,11195,True,,jabref version latest development branch build...,2024-04-15T11:04:15Z,2024-04-15T21:08:09Z,koppor,Carl Christian Snethlage,cff1f640734f38506ce9bf453e651252538187d0,...,1,0,1,0,0,0,0,0,0,0


# Data Transformation

Generate messages based on our dataset that will be used to fine-tune the gpt model, ensuring that the messages are clear in what we want the model to do.

## Generate System Message

Load in domains and reformat the json for clarity, based on past runs this format ensures that the fine tuned model returns its response in json format making it easier interpret the results and ensures that each domain is being acknowledged by the model.

In [69]:
with open("Domains.json", 'r') as file:
    dictionary = json.load(file)


In [73]:
formatted_domains = {}
gpt_output = {}

# reformat domains to increase clarity for gpt model and create dictionary with only domains/subdomains (to serve as expected gpt output)
for key, value in dictionary.items():
    if key in df.columns:
        formatted_domains[key] = 'Domain'
        gpt_output[key] = 0
    # iterate through each subdomain in list and add to dictionary
    for i in range(len(value)):
        subdomain, description = list(value[i].items())[0]
        if subdomain in df.columns:
            formatted_domains[subdomain] = description
            gpt_output[subdomain] = 0



# convert to string to pass to gpt model
domains_string = str(formatted_domains)
print(len(gpt_output.keys()))
print(gpt_output.keys())

15
dict_keys(['Computer Graphics', 'Data Structure-Tree Structures', 'Data Structure-Search Algorithms', 'Data Structure-Data Manipulation', 'Databases', 'Software Development and IT Operations-Version Control', 'Software Development and IT Operations-Monitoring and Logging', 'Error Handling', 'Logic', 'Language-Standard Libraries', 'Parser', 'User Interface-Layout Design', 'User Interface-Interaction Design', 'User Interface-Accessibility', 'User Interface-User Feedback'])


## Data Split

Initial run has a small testing set to minimize OpenAI call costs

In [42]:
# Randomly sample 70% of the rows for the first DataFrame
training_df = df.sample(frac=0.70, random_state=1)

# The remaining 30% of the rows for the second DataFrame
testing_df = df.drop(training_df.index)
print(len(training_df))
print(len(testing_df))
training_df.head()
len(df)


698
299


997

## Generate GPT Messages

Messages will then be stored in a json as this is a suitable format for training the gpt model. System messages are constant and consist of the context behind the domains and subdomains. User messages are variable and consist of the task as well as the issue's title and description. Assistant messages are also variable and consist of every domain with a 1 or 0 attached in json format.

In [29]:

# Open the file in write mode
with open('gpt_messages.jsonl', 'w', encoding='utf-8') as f:
    assistant_message = gpt_output
    # Iterate over the rows in the DataFrame
    for index, row in training_df.iterrows():
        # Create the user message by formatting the prompt with the title and body
        user_message = (
        f"Classify a GitHub issue by indicating whether each domain and subdomain is relevant to the issue based on its title: [{row['issue text']}] "
        f"and body: [{row['issue description']}]. Ensure that every domain/subdomain is accounted for, and its relevance is indicated with a 1 (relevant) or a 0 (not relevant)."
        )

        # logic to update assistant message with values in df
        for column in df.columns:
            if column in gpt_output:
                if row[column] > 0:
                    assistant_message[column] = 1
                else:
                    assistant_message[column] = 0

        
        # Construct the conversation object
        conversation_object = {
            "messages": [
                {"role": "system", "content": "Refer to these domains and subdomains when classifying " + domains_string},
                {"role": "user", "content": user_message},
                {"role": "assistant", "content": str(assistant_message)}
                ]
        }
        
        # Write the conversation object to one line in the file
        f.write(json.dumps(conversation_object, ensure_ascii=False) + '\n')

# Model Fine Tuning

## Setup

Create Client using OpenAI API key

In [9]:
# Invoking the API
from openai import OpenAI
client = OpenAI(api_key = 'API-Key')

## Training File Upload

Upload training file

In [31]:
## Uploading a training file
domain_classifier_training_file = client.files.create(
  file=open("gpt_messages.jsonl", "rb"),
  purpose="fine-tune"
)
domain_classifier_training_file

FileObject(id='file-QHqqCkqyTijjbJ8ILihy0ZtV', bytes=6833630, created_at=1718127311, filename='gpt_messages.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

## Create Fine Tuning Job

In [32]:
print(domain_classifier_training_file.id)

file-QHqqCkqyTijjbJ8ILihy0ZtV


In [34]:
## Creating a fine-tuned model
ft_job_dc = client.fine_tuning.jobs.create(
  training_file=domain_classifier_training_file.id, 
  model="gpt-3.5-turbo",
  suffix= "domain_classifier"
)

ft_job_dc

FineTuningJob(id='ftjob-N6Gagn9cBFjAEFboWArMimES', created_at=1718129292, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-RQmLagMyfsDY9gy4UMq97uCI', result_files=[], seed=1083576369, status='validating_files', trained_tokens=None, training_file='file-QHqqCkqyTijjbJ8ILihy0ZtV', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix='domain_classifier')

## Check Status

In [68]:
# Retrieving the state of a fine-tune
issue_classifier = client.fine_tuning.jobs.retrieve(ft_job_dc.id).fine_tuned_model
print(issue_classifier)

ft:gpt-3.5-turbo-0125:northern-arizona-university-nau:domain-classifier:9Z1Ry2K4


In [67]:
# You can track the progress of your fine-tuning job by listing the lastest events. 
client.fine_tuning.jobs.list_events(fine_tuning_job_id=ft_job_dc.id, limit=20)

SyncCursorPage[FineTuningJobEvent](data=[FineTuningJobEvent(id='ftevent-djvSJ4fKzn8Pk1eeCRAJmN1c', created_at=1718134091, level='info', message='The job has successfully completed', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-1hriGuZ2za9iX3JC6x3OFlHs', created_at=1718134083, level='info', message='New fine-tuned model created: ft:gpt-3.5-turbo-0125:northern-arizona-university-nau:domain-classifier:9Z1Ry2K4', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-D7MYJsGgXg8nOh9sLvsEP63F', created_at=1718134083, level='info', message='Checkpoint created at step 1396 with Snapshot ID: ft:gpt-3.5-turbo-0125:northern-arizona-university-nau:domain-classifier:9Z1RynEG:ckpt-step-1396', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-BVMoD3P0QD4CWpI8MLrPLJH3', created_at=1718134083, level='info', message='Checkpoint created at step 698 with Snapshot ID: ft:gpt-3.5-turbo-0125:

# Testing Fine Tuned Model

## Get Testing Data Responses

In [43]:
response_columns = testing_df.columns[16:]
response_columns = response_columns.insert(0, 'Issue #')

response_df = pd.DataFrame(columns=response_columns)
response_df.columns

Index(['Issue #', 'Big Data-Data Processing', 'Computer Graphics',
       'Computer Graphics-Image Rendering', 'Computer Graphics-Animation',
       'Computer Graphics-Modeling', 'Computer Graphics-Texture Mapping',
       'Computer Graphics-Graphics Optimization', 'Data Structure',
       'Data Structure-Linear Structures',
       ...
       'User Interface-Accessibility', 'User Interface-Animation',
       'User Interface-User Feedback', 'Utility', 'Utility-Data Conversion',
       'Utility-System Tools', 'Utility-Automation Scripts',
       'Utility-Diagnostic Utilities', 'Test', 'Test-Unit Testing'],
      dtype='object', length=106)

In [47]:
import concurrent.futures
import openai

def query_chatGPT(user_message, system_message, max_retries=5):
    attempt = 0

    # attempt to query model
    while attempt < max_retries:
        with concurrent.futures.ThreadPoolExecutor() as executor:
            future = executor.submit(
                client.chat.completions.create,
                model=issue_classifier,
                messages=[
                    {"role": "user", "content": user_message}
                ]
            )
            try:
                response = future.result()
                return response.choices[0].message.content
            except Exception as e:
                print(f"Attempt {attempt + 1}/{max_retries} - An error occurred: {e}")
            finally:
                attempt += 1

    print("Failed to get a response after several retries.")
    return None

In [48]:
all_responses = {}
for index, row in testing_df.iterrows():

    # create user and system messages
    user_message = (
    f"Classify a GitHub issue by indicating up to THREE domains and subdomains that are relevant to the issue based on its title: [{row['issue text']}] "
    f"and body: [{row['issue description']}]. Prioritize positive precision by marking an issue with a 1 only when VERY CERTAIN a domain is relevant to the issue text. Ensure that you only provide three domains and refer to ONLY THESE domains and subdomains when classifying: {domains_string}."
    f"\n\nImportant: only provide the name of the domains in list format."
    )
    system_message = "Refer to these domains and subdomains when classifying " + domains_string

    # query fine tuned model
    response = query_chatGPT(user_message, system_message)
    all_responses[row['Issue #']] = response
    print("Issue #"+str(row['Issue #'])+" complete")

with open('gpt_predictions.json', 'w') as json_file:
    json.dump(all_responses, json_file, indent=4)

Issue #11171 complete
Issue #11135 complete
Issue #11085 complete
Issue #10996 complete
Issue #10993 complete
Issue #10985 complete
Issue #10970 complete
Issue #10966 complete
Issue #10959 complete
Issue #10948 complete
Issue #10929 complete
Issue #10872 complete
Issue #10764 complete
Issue #10677 complete
Issue #10660 complete
Issue #10589 complete
Issue #10584 complete
Issue #10507 complete
Issue #10499 complete
Issue #10452 complete
Issue #10431 complete
Issue #10424 complete
Issue #10415 complete
Issue #10404 complete
Issue #10348 complete
Issue #10336 complete
Issue #10248 complete
Issue #10149 complete
Issue #10081 complete
Issue #10017 complete
Issue #9881 complete
Issue #9863 complete
Issue #9826 complete
Issue #9803 complete
Issue #9778 complete
Issue #9754 complete
Issue #9742 complete
Issue #9708 complete
Issue #9699 complete
Issue #9609 complete
Issue #9606 complete
Issue #9576 complete
Issue #9540 complete
Issue #9492 complete
Issue #9473 complete
Issue #9467 complete
Issu

KeyboardInterrupt: 

## Calculate Testing Metrics

In [93]:
with open('gpt_predictions.json', 'r') as file:
    json_content = file.read()
    responses = json.loads(json_content)

domains = training_df.columns[15:]

scores = {}

for domain in domains:
    scores[domain] = [0, 0, 0, 0]

print(scores)
# Array formatted as [FP, TP, FN, TN]

{'Big Data': [0, 0, 0, 0], 'Big Data-Data Processing': [0, 0, 0, 0], 'Computer Graphics': [0, 0, 0, 0], 'Computer Graphics-Image Rendering': [0, 0, 0, 0], 'Computer Graphics-Animation': [0, 0, 0, 0], 'Computer Graphics-Modeling': [0, 0, 0, 0], 'Computer Graphics-Texture Mapping': [0, 0, 0, 0], 'Computer Graphics-Graphics Optimization': [0, 0, 0, 0], 'Data Structure': [0, 0, 0, 0], 'Data Structure-Linear Structures': [0, 0, 0, 0], 'Data Structure-Tree Structures': [0, 0, 0, 0], 'Data Structure-Graph Structures': [0, 0, 0, 0], 'Data Structure-Data Sorting': [0, 0, 0, 0], 'Data Structure-Search Algorithms': [0, 0, 0, 0], 'Data Structure-Data Manipulation': [0, 0, 0, 0], 'Databases': [0, 0, 0, 0], 'Databases-Query Execution': [0, 0, 0, 0], 'Databases-Transaction Management': [0, 0, 0, 0], 'Databases-Schema Design': [0, 0, 0, 0], 'Databases-Database Security': [0, 0, 0, 0], 'Databases-Backup and Recovery': [0, 0, 0, 0], 'Databases-Database Optimization': [0, 0, 0, 0], 'Software Development 

### Get FP, TP, FN, TN

In [94]:
# Response as list
columns = testing_df.columns[15:]
for index, row in testing_df.iterrows():
    if str(row['Issue #']) in responses:
        try:
            curr_response = ast.literal_eval(responses[str(row['Issue #'])])
            print(row['Issue #'])
            for domain in columns:
                if domain in curr_response:
                    pred_y = 1
                    true_y = row[domain]
                else:
                    pred_y = 0
                
                # false positive
                if true_y == 0 and pred_y == 1:
                    scores[domain][0] += 1

                # true positive
                elif true_y == 1 and pred_y == 1:
                    scores[domain][1] += 1

                # false negative
                elif true_y == 1 and pred_y == 0:
                    scores[domain][2] += 1

                #true negative
                elif true_y == 0 and pred_y == 0:
                    scores[domain][3] += 1
        except ValueError:
            # Handle the case where the string is not properly formatted
            print("Issue #" + str(row["Issue #"]) + " response not in json format")
    else:
        print("Issue #" + str(row['Issue #']) + " Issue not in response json")



11171
11135
11085
10996
10993
10985
10970
10966
10959
10948
10929
10872
10764
10677
10660
10589
10584
10507
10499
10452
10431
10424
10415
10404
10348
10336
10248
10149
10081
10017
9881
9863
9826
9803
9778
9754
9742
9708
9699
9609
9606
9576
9540
9492
9473
9467
9432
9386
9367
9361
9334
9267
9221
9198
9169
9157
9147
9087
9066
9064
9056
9053
9017
8933
8928
8849
8832
8797
8788
8745
8655
8654
8653
8652
8540
8525
8510
8468
8466
8444
8420
8417
8396
8390
8372
8322
8310
8276
8265
8230
8198
8169
8133
8127
8115
8107
8104
8094
8087
8071
8055
8024
8011
7982
7961
7854
7719
7716
7590
7554
7420
7411
7382
7356
7354
7346
7291
7264
7205
7177
7114
7108
7058
7019
6967
6869
6851
6848
6819
6796
6787
6753
6730
6696
6692
6639
6638
6625
6624
6601
6591
6589
6588
6536
6527
6515
6487
6465
6456
6453
6421
6405
6383
6307
6303
6297
6266
6177
6170
6146
6109
6099
6091
6078
6017
5935
5919
5905
5897
5891
5861
5846
5833
5815
5753
5701
5664
5653
5546
5537
5458
5447
5374
5334
5333
5275
5261
5220
5198
5194
5164
5109
5028
5019


In [95]:
print(scores)

{'Big Data': [0, 0, 221, 78], 'Big Data-Data Processing': [0, 0, 221, 78], 'Computer Graphics': [2, 7, 216, 74], 'Computer Graphics-Image Rendering': [0, 0, 223, 76], 'Computer Graphics-Animation': [0, 0, 223, 76], 'Computer Graphics-Modeling': [0, 0, 223, 76], 'Computer Graphics-Texture Mapping': [0, 0, 223, 76], 'Computer Graphics-Graphics Optimization': [0, 0, 223, 76], 'Data Structure': [0, 0, 223, 76], 'Data Structure-Linear Structures': [0, 0, 223, 76], 'Data Structure-Tree Structures': [2, 21, 208, 68], 'Data Structure-Graph Structures': [0, 0, 229, 70], 'Data Structure-Data Sorting': [0, 0, 229, 70], 'Data Structure-Search Algorithms': [1, 3, 225, 70], 'Data Structure-Data Manipulation': [27, 51, 173, 48], 'Databases': [7, 8, 216, 68], 'Databases-Query Execution': [0, 0, 224, 75], 'Databases-Transaction Management': [0, 0, 224, 75], 'Databases-Schema Design': [0, 0, 224, 75], 'Databases-Database Security': [0, 0, 224, 75], 'Databases-Backup and Recovery': [0, 0, 224, 75], 'Data

### Calculate Metrics For Each Class

In [60]:
# Data Format: [FP, TP, FN, TN]

performance_columns = ['Domain', 'Accuracy', 'Precision (1)', 'Recall (1)', 'F1 (1)', 'Precision (0)', 'Recall (0)', 'F1 (0)', 'Confusion Matrix']

performance_df = pd.DataFrame(columns=performance_columns)
sum_list = [0, 0, 0 ,0]
for key, value in scores.items():
    sum_list[0] += value[0]
    sum_list[1] += value[1]
    sum_list[2] += value[2]
    sum_list[3] += value[3]
    
    accuracy = (value[1] + value[3]) / sum(value)
    accuracy = float("{:.2f}".format(accuracy))

    # calculate metrics for positive cases
    positive_precision = (value[1]) / (value[1] + value[0]) if (value[1] + value[0]) > 0 else 1
    positive_precision = float("{:.2f}".format(positive_precision))

    positive_recall = (value[1]) / (value[1] + value[2]) if (value[1] + value[2]) > 0 else 1
    positive_recall = float("{:.2f}".format(positive_recall))

    positive_F1 = 2 * (((positive_recall * positive_recall)) / (positive_precision + positive_recall)) if (positive_precision + positive_recall) > 0 else 1
    positive_F1 = float("{:.2f}".format(positive_F1))

    # calculate metrics for negative cases
    negative_precision = (value[3]) / (value[2] + value[3]) if (value[2] + value[3]) > 0 else 1
    negative_precision = float("{:.2f}".format(negative_precision))

    negative_recall = (value[3]) / (value[3] + value[0]) if (value[3] + value[0]) > 0 else 1
    negative_recall = float("{:.2f}".format(negative_recall))

    negative_F1 = 2 * (((negative_precision * negative_recall)) / (negative_precision + negative_recall)) if (negative_precision + negative_recall) > 0 else 1
    negative_F1 = float("{:.2f}".format(negative_F1))
    
    confusion_matrix = "0[{}  {}]\n1[{}  {}]\n   0     1".format(value[3], value[0], value[2], value[1])
    new_row = [key, accuracy, positive_precision, positive_recall, positive_F1, negative_precision, negative_recall, negative_F1, confusion_matrix]
    performance_df.loc[len(performance_df)] = new_row

In [61]:
performance_df.to_csv('linked_issue_performance.csv', index=False)

### Compute Micro and Macro Metrics

In [62]:
print(sum_list)

[31429, 12527, 19, 438]


In [63]:
# Positive Micro
positive_micro_precision = (sum_list[1]) / (sum_list[0] + sum_list[1])
positive_micro_recall = (sum_list[1]) / (sum_list[0] + sum_list[2])
positive_micro_F1 = 2 * ((positive_micro_precision * positive_micro_recall) / (positive_micro_precision + positive_micro_recall))

print('Micro positive precision:', positive_micro_precision)
print('Micro positive recall:', positive_micro_recall)
print('Micro positive F1:', positive_micro_F1)

# Negative Micro 
negative_micro_precision = (sum_list[3])/(sum_list[2] + sum_list[3])
negative_micro_recall = (sum_list[3])/(sum_list[3] + sum_list[0])
negative_micro_F1 = 2 * ((negative_micro_precision * negative_micro_recall) / (negative_micro_precision + negative_micro_recall))

print('Micro negative precision:', negative_micro_precision)
print('Micro negative recall:', negative_micro_recall)
print('Micro negative F1:', negative_micro_F1)

# Micro Accuracy
micro_accuracy = (sum_list[1] + sum_list[3]) / sum(sum_list)
print('micro accuracy:', micro_accuracy)

Micro positive precision: 0.284989534989535
Micro positive recall: 0.39834011701857036
Micro positive F1: 0.332263540395735
Micro negative precision: 0.9584245076586433
Micro negative recall: 0.013744626102237424
Micro negative F1: 0.027100606360598932
micro accuracy: 0.29191903271564634


In [64]:
# positive
positive_macro_precision = performance_df['Precision (1)'].mean()
positive_macro_recall = performance_df['Recall (1)'].mean()
positive_macro_F1 = 2 * (positive_macro_precision * positive_macro_recall) / (positive_macro_precision + positive_macro_recall)

print('Positive Precision:', positive_macro_precision)
print('Positive Recall:', positive_macro_recall)
print('Positive F1:', positive_macro_F1)

# negative
negative_macro_precision = performance_df['Precision (0)'].mean()
negative_macro_recall = performance_df['Recall (0)'].mean()
negative_macro_F1 = 2 * (negative_macro_precision * negative_macro_recall) / (negative_macro_precision + negative_macro_recall)

print('Negative Precision:', negative_macro_precision)
print('Negative Recall:', negative_macro_recall)
print('Negative F1:', negative_macro_F1)

Positive Precision: 0.28273584905660376
Positive Recall: 0.9901886792452831
Positive F1: 0.4398718552876474
Negative Precision: 0.8395283018867924
Negative Recall: 0.008584905660377359
Negative F1: 0.016996012340755977
