# LLM Issue Domain Classification

# Installs

In [31]:
%pip install pandas emoji openai tiktoken scikit-learn openai

^C
Note: you may need to restart the kernel to use updated packages.


# Library Imports

In [37]:
# Importing libraries
import pandas as pd
import emoji
import re
import string
import json
import ast


# Data Import

In [38]:
df = pd.read_csv("DATA/pr_data_binary.csv")

df.head()

Unnamed: 0,PR #,Pull Request,issue text,issue description,created_at,closed_at,userlogin,author_name,most_recent_commit,filename,...,Utility-Performance Tools,Utility-Diagnostic Utilities,Utility-Backup Tools,Test,Test-Unit Testing,Test-Integration Testing,Test-Performance Testing,Test-Security Testing,Test-Usability Testing,Test-Regression Testing
0,1,True,New Sorting/Export preferences,"This will add a new ""File Sorting"" Tab to the ...",2014-03-12T09:26:29Z,2014-03-12T11:38:01Z,olenz,Olaf Lenz,6db780e96882aca9fa3170afd13a7965dfb61a69,src/java/net/sf/jabref/BibtexFields.java,...,0,0,0,0,0,0,0,0,0,0
1,2,True,Basic gradle integration,This adds basic gradle integration. The projec...,2014-03-12T17:35:16Z,2014-03-12T18:29:22Z,simonharrer,Simon Harrer,a66468793a5dcba41f0f9307f8d762baecc7f331,src/extensions/SimpleCsvImporter.java,...,0,0,0,0,0,0,0,0,0,0
2,7,True,I have implemented 2 features requested in Ti...,``` The user can now specify an arbitrary numb...,2014-03-23T04:53:18Z,2014-03-23T16:03:12Z,noravanq,noravanq,233203728c3168172527970b53b311be27f75474,src/main/java/net/sf/jabref/gui/MainTableSelec...,...,0,1,0,0,0,0,0,0,0,0
3,8,True,PDF-file metadata: Privacy Filtering all metadata,This pull-request pertains to the addition of ...,2014-04-23T12:58:24Z,2014-04-23T21:23:03Z,adaerr,Adrian Daerr,2360cb74d2e8d88e1bc705db36e885e44607642f,src/main/java/net/sf/jabref/util/XMPUtil.java,...,0,0,0,0,0,0,0,0,0,0
4,9,True,Support FindFullText with ACS DOIs,Adds a FullTextFinder implementation to transf...,2014-05-06T06:36:03Z,2014-05-20T12:53:43Z,ansell,Peter Ansell,57cbe70e1be14f7b8f8754e05f6b722ed72ce887,src/test/java/net/sf/jabref/external/AcsPdfTes...,...,0,0,0,1,0,0,0,0,0,0


# Preprocessing

Preprocessing steps involve cleaning the issue title and body text. On first approach I am using the approach found in the folowing project, https://github.com/G4BE-334/NLBSE-issue-report-classification/blob/main/issueclassificationgpt.ipynb 
This is doing similar work in labeling github issues and provides a way to remove noise commonly found in github text (html tags, links, etc.)

In [39]:
print(len(df))
df.dropna(subset=['issue description'], inplace=True)
print(len(df))

504
419


In [40]:
cleaned_count = 0
original_count = 0

# Text cleaning function
def clean_text(text):
    global cleaned_count, original_count

    if not isinstance(text, str):
        original_count += 1
        return text

    # Remove double quotation marks
    text = text.replace('"', '')

    # Remove text starting with "DevTools" and ending with "(automated)"
    text = re.sub(r'DevTools.*?\(automated\)', '', text)

    # Lowercasing should be one of the first steps to ensure uniformity
    text = text.lower()

    # Remove emojis
    text = emoji.demojize(text)

    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove special characters and punctuation
    text = re.sub(f"[{re.escape(string.punctuation)}]", '', text)

    # Remove '#' characters
    text = text.replace("#", "")

    # Remove consecutive whitespaces and replace with a single space
    text = re.sub(r'\s+', ' ', text)

    # Split the text into words
    words = text.split()

    # Remove words that are over 20 characters
    words = [word for word in words if len(word) <= 20]

    # Join the remaining words back into cleaned text
    cleaned_text = ' '.join(words)

    cleaned_count += 1
    return cleaned_text

df['issue text'] = df['issue text'].apply(clean_text)
df['issue description'] = df['issue description'].apply(clean_text)
df.head()

Unnamed: 0,PR #,Pull Request,issue text,issue description,created_at,closed_at,userlogin,author_name,most_recent_commit,filename,...,Utility-Performance Tools,Utility-Diagnostic Utilities,Utility-Backup Tools,Test,Test-Unit Testing,Test-Integration Testing,Test-Performance Testing,Test-Security Testing,Test-Usability Testing,Test-Regression Testing
0,1,True,new sortingexport preferences,this will add a new file sorting tab to the pr...,2014-03-12T09:26:29Z,2014-03-12T11:38:01Z,olenz,Olaf Lenz,6db780e96882aca9fa3170afd13a7965dfb61a69,src/java/net/sf/jabref/BibtexFields.java,...,0,0,0,0,0,0,0,0,0,0
1,2,True,basic gradle integration,this adds basic gradle integration the project...,2014-03-12T17:35:16Z,2014-03-12T18:29:22Z,simonharrer,Simon Harrer,a66468793a5dcba41f0f9307f8d762baecc7f331,src/extensions/SimpleCsvImporter.java,...,0,0,0,0,0,0,0,0,0,0
2,7,True,i have implemented 2 features requested in tic...,the user can now specify an arbitrary number o...,2014-03-23T04:53:18Z,2014-03-23T16:03:12Z,noravanq,noravanq,233203728c3168172527970b53b311be27f75474,src/main/java/net/sf/jabref/gui/MainTableSelec...,...,0,1,0,0,0,0,0,0,0,0
3,8,True,pdffile metadata privacy filtering all metadata,this pullrequest pertains to the addition of m...,2014-04-23T12:58:24Z,2014-04-23T21:23:03Z,adaerr,Adrian Daerr,2360cb74d2e8d88e1bc705db36e885e44607642f,src/main/java/net/sf/jabref/util/XMPUtil.java,...,0,0,0,0,0,0,0,0,0,0
4,9,True,support findfulltext with acs dois,adds a fulltextfinder implementation to transf...,2014-05-06T06:36:03Z,2014-05-20T12:53:43Z,ansell,Peter Ansell,57cbe70e1be14f7b8f8754e05f6b722ed72ce887,src/test/java/net/sf/jabref/external/AcsPdfTes...,...,0,0,0,1,0,0,0,0,0,0


# Data Transformation

Generate messages based on our dataset that will be used to fine-tune the gpt model, ensuring that the messages are clear in what we want the model to do.

## Filter Out Rare and Common Domains

Filtering out domains based on a threshold allows us to prioritize/emphasize domains that are common in a repository with common domains (present in more than 90% of issues) representing skills that are required for the repo as a whole and rare domains (present in less than 10% of issues) representing skills that are rarely required in the repo.

In [41]:
domains = df.columns[15:]
columns_to_drop = []
for domain in domains:
    column_values = df[domain].tolist()
    occurrence = column_values.count(1)
    lower_bound = int(len(df) * 0.1)
    upper_bound = int(len(df) * 0.9)
    
    if occurrence < lower_bound or occurrence > upper_bound:
        columns_to_drop.append(domain)

df = df.drop(columns=columns_to_drop)
    
df.columns

Index(['PR #', 'Pull Request', 'issue text', 'issue description', 'created_at',
       'closed_at', 'userlogin', 'author_name', 'most_recent_commit',
       'filename', 'file_commit', 'api', 'function_name', 'api_domain',
       'subdomain', 'Computer Graphics', 'Computer Graphics-Image Rendering',
       'Computer Graphics-Modeling', 'Computer Graphics-Texture Mapping',
       'Computer Graphics-Graphics Optimization',
       'Data Structure-Linear Structures', 'Data Structure-Tree Structures',
       'Data Structure-Data Sorting', 'Data Structure-Search Algorithms',
       'Data Structure-Data Manipulation', 'Databases',
       'Databases-Query Execution', 'Databases-Transaction Management',
       'Databases-Schema Design', 'Databases-Database Security',
       'Databases-Backup and Recovery', 'Databases-Database Optimization',
       'Software Development and IT Operations-Automated Testing',
       'Software Development and IT Operations-Configuration Management',
       'Software

### Generate System Message

Load in domains and reformat the json for clarity, based on past runs this format ensures that the fine tuned model returns its response in json format making it easier interpret the results and ensures that each domain is being acknowledged by the model.

In [42]:
with open("Domains.json", 'r') as file:
    dictionary = json.load(file)


In [43]:
formatted_domains = {}
gpt_output = {}

# reformat domains to increase clarity for gpt model and create dictionary with only domains/subdomains (to serve as expected gpt output)
for key, value in dictionary.items():
    gpt_output[key] = 0

    if key in df.columns:
        formatted_domains[key] = 'Domain'
        gpt_output[key] = 0
    # iterate through each subdomain in list and add to dictionary
    for i in range(len(value)):
        subdomain, description = list(value[i].items())[0]
        if subdomain in df.columns:
            formatted_domains[subdomain] = description
            gpt_output[subdomain] = 0



# convert to string to pass to gpt model
domains_string = str(formatted_domains)
print(domains_string)

{'Computer Graphics': 'Domain', 'Computer Graphics-Image Rendering': 'Converts data from a model into a visual format that can be viewed as images or animations.', 'Computer Graphics-Modeling': 'The creation of 3D objects within a scene.', 'Computer Graphics-Texture Mapping': 'Applies a surface texture to a 3D model to give it more realism.', 'Computer Graphics-Graphics Optimization': 'Enhances the performance and quality of the graphical output.', 'Data Structure-Linear Structures': 'Manages data elements arranged in a linear sequence, such as arrays and lists.', 'Data Structure-Tree Structures': 'Handles hierarchical data structures where each node contains one or more child nodes.', 'Data Structure-Data Sorting': 'Involves arranging data into a defined order based on their attributes.', 'Data Structure-Search Algorithms': 'Methods used to locate specific data among a collection of data.', 'Data Structure-Data Manipulation': 'The process of changing data to make it more organized and

## Data Split

Initial run has a small testing set to minimize OpenAI call costs

In [44]:
# Randomly sample 70% of the rows for the first DataFrame
training_df = df.sample(frac=0.12, random_state=1)

# The remaining 30% of the rows for the second DataFrame
testing_df = df.drop(training_df.index)
print(len(training_df))
training_df.head()


50


Unnamed: 0,PR #,Pull Request,issue text,issue description,created_at,closed_at,userlogin,author_name,most_recent_commit,filename,...,Parser-Validation,Search,User Interface,User Interface-Layout Design,User Interface-Interaction Design,User Interface-Accessibility,User Interface-User Feedback,Utility,Utility-Automation Scripts,Utility-Diagnostic Utilities
462,928,True,when changing the type of an entry the editor ...,regression bug introduced by jabref v32 is fin...,2016-03-10T13:35:44Z,2016-03-10T16:33:49Z,stefan-kolb,Stefan Kolb,23fcbc2cfe125376cfa8771b81b9355f8cbd443d,src/main/java/net/sf/jabref/gui/actions/Change...,...,1,1,1,1,1,1,1,0,0,0
176,343,True,wip replaced deprecated glazedlistsapi with cu...,i replaced the deprecated calls to the glazedl...,2015-11-15T14:35:37Z,2015-11-27T09:33:14Z,oscargus,Oscar Gustafsson,7732b00072f09edea8d2aaef7b78ed8a0d935334,src/main/java/net/sf/jabref/gui/FetcherPreview...,...,1,1,1,1,1,1,1,0,0,0
17,23,True,fixed bug 949,an ioexception is caught when the user tries t...,2014-10-20T19:33:08Z,2014-10-21T09:40:26Z,rmmsilva,rmmsilva,02899e7dc1076352a4774a86f636b8dd77f20e90,src/main/java/net/sf/jabref/imports/ImportCust...,...,0,0,1,1,1,0,0,0,0,0
192,377,True,removed some random unused stuff,removed unused variables methods and classes t...,2015-11-20T21:06:09Z,2015-11-21T15:03:00Z,oscargus,Oscar Gustafsson,3282596a7edd49c964cf8fa815549c4da6027de8,src/main/java/net/sf/jabref/external/Synchroni...,...,1,1,1,1,1,1,1,0,0,0
4,9,True,support findfulltext with acs dois,adds a fulltextfinder implementation to transf...,2014-05-06T06:36:03Z,2014-05-20T12:53:43Z,ansell,Peter Ansell,57cbe70e1be14f7b8f8754e05f6b722ed72ce887,src/test/java/net/sf/jabref/external/AcsPdfTes...,...,0,0,0,0,0,0,0,0,0,0


### Generate GPT Messages

Messages will then be stored in a json as this is a suitable format for training the gpt model. System messages are constant and consist of the context behind the domains and subdomains. User messages are variable and consist of the task as well as the issue's title and description. Assistant messages are also variable and consist of every domain with a 1 or 0 attached in json format.

In [46]:

# Open the file in write mode
with open('gpt_messages.jsonl', 'w', encoding='utf-8') as f:
    assistant_message = gpt_output
    # Iterate over the rows in the DataFrame
    for index, row in training_df.iterrows():
        # Create the user message by formatting the prompt with the title and body
        user_message = (
        f"Classify a GitHub issue by indicating whether each domain and subdomain is relevant to the issue based on its title: [{row['issue text']}] "
        f"and body: [{row['issue description']}]. Ensure that every domain/subdomain is accounted for, and its relevance is indicated with a 1 (relevant) or a 0 (not relevant)."
        )

        # logic to update assistant message with values in df
        for column in df.columns:
            if column in gpt_output:
                if row[column] > 0:
                    assistant_message[column] = 1
                else:
                    assistant_message[column] = 0

        
        # Construct the conversation object
        conversation_object = {
            "messages": [
                {"role": "system", "content": "Refer to these domains and subdomains when classifying " + domains_string},
                {"role": "user", "content": user_message},
                {"role": "assistant", "content": str(assistant_message)}
                ]
        }
        
        # Write the conversation object to one line in the file
        f.write(json.dumps(conversation_object, ensure_ascii=False) + '\n')

# Model Fine Tuning

## Setup

Create Client using OpenAI API key

In [11]:
# Invoking the API
from openai import OpenAI
client = OpenAI(api_key = 'API-KEY')

## Training File Upload

Upload training file

In [17]:
## Uploading a training file
domain_classifier_training_file = client.files.create(
  file=open("gpt_messages.jsonl", "rb"),
  purpose="fine-tune"
)
domain_classifier_training_file

FileObject(id='file-eQ8scV2nzKMNaRTGWSCNAdVf', bytes=1571336, created_at=1716906573, filename='gpt_messages.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

## Create Fine Tuning Job

In [18]:
## Creating a fine-tuned model
ft_job_dc = client.fine_tuning.jobs.create(
  training_file=domain_classifier_training_file.id, 
  model="gpt-3.5-turbo",
  suffix= "issue_classifier"
)

ft_job_dc

FineTuningJob(id='ftjob-a7zaH5caCHqnUZgJishMzbqD', created_at=1716906785, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-RQmLagMyfsDY9gy4UMq97uCI', result_files=[], seed=246686278, status='validating_files', trained_tokens=None, training_file='file-eQ8scV2nzKMNaRTGWSCNAdVf', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix='issue_classifier')

In [15]:
# Retrieving the state of a fine-tune
issue_classifier = client.fine_tuning.jobs.retrieve(ft_job_dc.id).fine_tuned_model
print(issue_classifier)

NameError: name 'ft_job_dc' is not defined

In [34]:
# You can track the progress of your fine-tuning job by listing the lastest events. 
client.fine_tuning.jobs.list_events(fine_tuning_job_id=ft_job_dc.id, limit=20)

SyncCursorPage[FineTuningJobEvent](data=[FineTuningJobEvent(id='ftevent-SWxdNcFFLt91UON3CUhRQBN8', created_at=1716907348, level='info', message='The job has successfully completed', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-TZqbKLml52NnTxH6g6CcEV31', created_at=1716907346, level='info', message='New fine-tuned model created: ft:gpt-3.5-turbo-0125:northern-arizona-university-nau:issue-classifier:9TsJt9e2', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-FOYG4xm2KxPz8s6YaQMD3EGz', created_at=1716907346, level='info', message='Checkpoint created at step 100 with Snapshot ID: ft:gpt-3.5-turbo-0125:northern-arizona-university-nau:issue-classifier:9TsJteOI:ckpt-step-100', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-XncIm13m5hvK6WbJBoUOGbcF', created_at=1716907346, level='info', message='Checkpoint created at step 50 with Snapshot ID: ft:gpt-3.5-turbo-0125:north

# Testing Fine Tuned Model

## Get Testing Data Responses

In [13]:
response_columns = testing_df.columns[15:]
response_columns = response_columns.insert(0, 'PR #')

response_df = pd.DataFrame(columns=response_columns)
response_df.columns

Index(['PR #', 'Application', 'Application-Integration',
       'Application-Plugin Management', 'Application-User Customization',
       'Application-App Configuration', 'Application-Version Control',
       'Application-Compatibility Checks', 'Application Performance Manager',
       'Application Performance Manager-Performance Monitoring',
       ...
       'Utility-Performance Tools', 'Utility-Diagnostic Utilities',
       'Utility-Backup Tools', 'Test', 'Test-Unit Testing',
       'Test-Integration Testing', 'Test-Performance Testing',
       'Test-Security Testing', 'Test-Usability Testing',
       'Test-Regression Testing'],
      dtype='object', length=218)

In [14]:
import concurrent.futures
import openai

def query_chatGPT(user_message, system_message, max_retries=5):
    attempt = 0

    # attempt to query model
    while attempt < max_retries:
        with concurrent.futures.ThreadPoolExecutor() as executor:
            future = executor.submit(
                client.chat.completions.create,
                model="ft:gpt-3.5-turbo-0125:northern-arizona-university-nau:issue-classifier:9TsJt9e2",
                messages=[
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": user_message}
                ]
            )
            try:
                response = future.result()
                return response.choices[0].message.content
            except Exception as e:
                print(f"Attempt {attempt + 1}/{max_retries} - An error occurred: {e}")
            finally:
                attempt += 1

    print("Failed to get a response after several retries.")
    return None

In [24]:
responses = {}
for index, row in testing_df.iterrows():

    # create user and system messages
    user_message = (
        f"Classify a GitHub issue by indicating whether each domain and subdomain is relevant to the issue based on its title: [{row['issue text']}] "
        f"and body: [{row['issue description']}]. Ensure that every domain/subdomain is accounted for, and its relevance is indicated with a 1 (relevant) or a 0 (not relevant)."
        )
    system_message = "Refer to these domains and subdomains when classifying " + domains_string

    # query fine tuned model
    response = query_chatGPT(user_message, system_message)
    all_responses[row['PR #']] = response
    print("PR #"+str(row['PR #'])+" complete")

with open('GPT_Responses.json', 'w') as json_file:
    json.dump(all_responses, json_file, indent=4)

PR #1 complete
PR #2 complete
PR #7 complete
PR #8 complete
PR #11 complete
PR #12 complete
PR #13 complete
PR #14 complete
PR #15 complete
PR #16 complete
PR #17 complete
PR #18 complete
PR #19 complete
PR #21 complete
PR #22 complete
PR #24 complete
PR #25 complete
PR #26 complete
PR #27 complete
PR #28 complete
PR #30 complete
PR #32 complete
PR #33 complete
PR #34 complete
PR #35 complete
PR #36 complete
PR #38 complete
PR #39 complete
PR #42 complete
PR #43 complete
PR #44 complete
PR #45 complete
PR #46 complete
PR #47 complete
PR #48 complete
PR #50 complete
PR #53 complete
PR #54 complete
PR #55 complete
PR #59 complete
PR #61 complete
PR #62 complete
PR #63 complete
PR #66 complete
PR #68 complete
PR #71 complete
PR #72 complete
PR #74 complete
PR #75 complete
PR #77 complete
PR #81 complete
PR #82 complete
PR #84 complete
PR #85 complete
PR #86 complete
PR #89 complete
PR #90 complete
PR #93 complete
PR #94 complete
PR #96 complete
PR #99 complete
PR #100 complete
PR #105 com

KeyboardInterrupt: 

In [15]:
with open('GPT_Responses.json', 'w') as json_file:
    json.dump(all_responses, json_file, indent=4)

NameError: name 'all_responses' is not defined

## Calculate Testing Metrics

## Setup

In [9]:
with open('GPT_Responses.json', 'r') as file:
    json_content = file.read()
    responses = json.loads(json_content)

domains = testing_df.columns[15:]

scores = {}

for domain in domains:
    scores[domain] = [0, 0, 0, 0]

print(scores)

# Array formatted as [FP, TP, FN, TN]

{'Application': [0, 0, 0, 0], 'Application-Integration': [0, 0, 0, 0], 'Application-Plugin Management': [0, 0, 0, 0], 'Application-User Customization': [0, 0, 0, 0], 'Application-App Configuration': [0, 0, 0, 0], 'Application-Version Control': [0, 0, 0, 0], 'Application-Compatibility Checks': [0, 0, 0, 0], 'Application Performance Manager': [0, 0, 0, 0], 'Application Performance Manager-Performance Monitoring': [0, 0, 0, 0], 'Application Performance Manager-Resource Allocation': [0, 0, 0, 0], 'Application Performance Manager-Error Detection': [0, 0, 0, 0], 'Application Performance Manager-Load Balancing': [0, 0, 0, 0], 'Application Performance Manager-Traffic Management': [0, 0, 0, 0], 'Application Performance Manager-Diagnostic Tools': [0, 0, 0, 0], 'Big Data': [0, 0, 0, 0], 'Big Data-Data Processing': [0, 0, 0, 0], 'Big Data-Data Storage': [0, 0, 0, 0], 'Big Data-Data Analysis': [0, 0, 0, 0], 'Big Data-Real-Time Processing': [0, 0, 0, 0], 'Big Data-Batch Processing': [0, 0, 0, 0], 'B

# Get FP, TP, FN, TN

In [12]:
for index, row in testing_df.iterrows():
    if str(row['PR #']) in responses:
        try:
            curr_response = ast.literal_eval(responses[str(row['PR #'])])
            print(row['PR #'])
            for domain in domains:
                true_y = row[domain]
                pred_y = curr_response[domain]

                # false positive
                if true_y == 0 and pred_y == 1:
                    scores[domain][0] += 1

                # true positive
                elif true_y == 1 and pred_y == 1:
                    scores[domain][1] += 1

                # false negative
                elif true_y == 1 and pred_y == 0:
                    scores[domain][2] += 1

                #true negative
                elif true_y == 0 and pred_y == 0:
                    scores[domain][3] += 1
        except ValueError:
            # Handle the case where the string is not properly formatted
            print("PR #" + str(row["PR #"]) + " response not in json format")
    else:
        print("PR #" + str(row['PR #']) + " PR not in response json")



1
2
7
8
11
12
13
14
15
16
17
18
19
21
22
24
25
26
27
28
30
32
33
34
35
36
38
39
42
43
44
45
46
47
48
50
53
54
55
59
61
62
63
66
68
71
72
74
75
77
81
82
84
85
86
89
90
93
94
96
99
100
105
108
109
PR #127 PR not in response json
137
140
143
146
147
150
162
164
165
166
172
175
176
182
183
187
192
193
195
196
197
198
200
204
205
206
211
217
218
219
222
225
227
228
230
232
235
236
237
241
242
246
248
252
253
255
256
259
261
264
266
267
272
273
274
276
283
286
287
291
292
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
319
320
322
329
331
332
334
335
336
340
342
347
351
354
356
357
359
361
370
371
373
374
375
379
380
381
387
390
391
392
394
396
401
406
407
418
422
424
427
429
433
PR #440 PR not in response json
449
450
452
453
456
459
460
464
465
468
469
474
475
476
478
484
485
486
488
491
500
502
507
515
517
519
530
536
541
542
546
547
551
553
554
557
566
567
568
572
575
580
581
582
583
584
585
594
595
596
597
600
601
602
603
605
606
626
PR #633 PR not in response json
PR #641 P

In [13]:
print(scores)

{'Application': [0, 0, 0, 240], 'Application-Integration': [0, 0, 0, 240], 'Application-Plugin Management': [0, 0, 0, 240], 'Application-User Customization': [0, 0, 0, 240], 'Application-App Configuration': [0, 0, 0, 240], 'Application-Version Control': [0, 0, 0, 240], 'Application-Compatibility Checks': [0, 0, 0, 240], 'Application Performance Manager': [3, 0, 7, 230], 'Application Performance Manager-Performance Monitoring': [0, 0, 0, 240], 'Application Performance Manager-Resource Allocation': [0, 0, 0, 240], 'Application Performance Manager-Error Detection': [0, 0, 0, 240], 'Application Performance Manager-Load Balancing': [0, 0, 0, 240], 'Application Performance Manager-Traffic Management': [0, 0, 0, 240], 'Application Performance Manager-Diagnostic Tools': [0, 0, 0, 240], 'Big Data': [0, 0, 0, 240], 'Big Data-Data Processing': [0, 0, 0, 240], 'Big Data-Data Storage': [0, 0, 0, 240], 'Big Data-Data Analysis': [0, 0, 0, 240], 'Big Data-Real-Time Processing': [0, 0, 0, 240], 'Big Da

# Calculate Metrics For Each Class

In [56]:
# Data Format: [FP, TP, FN, TN]

performance_columns = ['Domain', 'Accuracy', 'Precision (1)', 'Recall (1)', 'F1 (1)', 'Precision (0)', 'Recall (0)', 'F1 (0)', 'Confusion Matrix']

performance_df = pd.DataFrame(columns=performance_columns)
sum_list = [0, 0, 0 ,0]
for key, value in scores.items():
    sum_list[0] += value[0]
    sum_list[1] += value[1]
    sum_list[2] += value[2]
    sum_list[3] += value[3]
    
    accuracy = (value[1] + value[3]) / sum(value)
    accuracy = float("{:.2f}".format(accuracy))

    # calculate metrics for positive cases
    positive_precision = (value[1]) / (value[1] + value[0]) if (value[1] + value[0]) > 0 else 1
    positive_precision = float("{:.2f}".format(positive_precision))

    positive_recall = (value[1]) / (value[1] + value[2]) if (value[1] + value[2]) > 0 else 1
    positive_recall = float("{:.2f}".format(positive_recall))

    positive_F1 = 2 * (((positive_recall * positive_recall)) / (positive_precision + positive_recall)) if (positive_precision + positive_recall) > 0 else 1
    positive_F1 = float("{:.2f}".format(positive_F1))

    # calculate metrics for negative cases
    negative_precision = (value[3]) / (value[2] + value[3]) if (value[2] + value[3]) > 0 else 1
    negative_precision = float("{:.2f}".format(negative_precision))

    negative_recall = (value[3]) / (value[3] + value[0]) if (value[3] + value[0]) > 0 else 1
    negative_recall = float("{:.2f}".format(negative_recall))

    negative_F1 = 2 * (((negative_precision * negative_recall)) / (negative_precision + negative_recall)) if (negative_precision + negative_recall) > 0 else 1
    negative_F1 = float("{:.2f}".format(negative_F1))
    
    confusion_matrix = "0[{}  {}]\n1[{}  {}]\n   0     1".format(value[3], value[0], value[2], value[1])
    new_row = [key, accuracy, positive_precision, positive_recall, positive_F1, negative_precision, negative_recall, negative_F1, confusion_matrix]
    performance_df.loc[len(performance_df)] = new_row

In [58]:
performance_df.to_csv('performance.csv', index=False)

## Compute Micro and Macro Metrics

In [59]:
print(sum_list)

[1699, 3171, 3354, 43851]


In [65]:
# Positive Micro
positive_micro_precision = (sum_list[1]) / (sum_list[0] + sum_list[1])
positive_micro_recall = (sum_list[1]) / (sum_list[0] + sum_list[2])
positive_micro_F1 = 2 * ((positive_micro_precision * positive_micro_recall) / (positive_micro_precision + positive_micro_recall))

print('Micro positive precision:', positive_micro_precision)
print('Micro positive recall:', positive_micro_recall)
print('Micro positive F1:', positive_micro_F1)

# Negative Micro 
negative_micro_precision = (sum_list[3])/(sum_list[2] + sum_list[3])
negative_micro_recall = (sum_list[3])/(sum_list[3] + sum_list[0])
negative_micro_F1 = 2 * ((negative_micro_precision * negative_micro_recall) / (negative_micro_precision + negative_micro_recall))

print('Micro negative precision:', negative_micro_precision)
print('Micro negative recall:', negative_micro_recall)
print('Micro negative F1:', negative_micro_F1)

# Micro Accuracy
micro_accuracy = (sum_list[1] + sum_list[3]) / sum(sum_list)
print('micro accuracy:', micro_accuracy)

Micro positive precision: 0.651129363449692
Micro positive recall: 0.6275479912923017
Micro positive F1: 0.6391212334979341
Micro negative precision: 0.928948204639339
Micro negative recall: 0.9627003293084523
Micro negative F1: 0.9455231523907067
micro accuracy: 0.902966874699952


In [67]:
# Macro
positive_macro_precision = performance_df['Precision (1)'].mean()
positive_macro_recall = performance_df['Recall (1)'].mean()
positive_macro_F1 = 2 * (positive_macro_precision * positive_macro_recall) / (positive_macro_precision + positive_macro_recall)

print('Positive Precision:', positive_macro_precision)
print('Positive Recall:', positive_macro_recall)
print('Positive F1:', positive_macro_F1)

# Macro
negative_macro_precision = performance_df['Precision (0)'].mean()
negative_macro_recall = performance_df['Recall (0)'].mean()
negative_macro_F1 = 2 * (negative_macro_precision * negative_macro_recall) / (negative_macro_precision + negative_macro_recall)

print('Negative Precision:', negative_macro_precision)
print('Negative Recall:', negative_macro_recall)
print('Negative F1:', negative_macro_F1)

Positive Precision: 0.759032258064516
Positive Recall: 0.7272811059907833
Positive F1: 0.7428175423541495
Negative Precision: 0.8815668202764978
Negative Recall: 0.9150230414746544
Negative F1: 0.8979834188380516
