# RAW labels

For each commit, find whether it was unrelated or not.

## Files

In [1]:
import os
 
# Inputs: labels from the previous study and the newly labelled commits.
RAW_COMMIT_CSV = os.path.join("data", "previous-study", "raw-commit-labels.csv")
NEW_COMMITS = os.path.join("data", "update-previous-study", "new-commits-dataset.json")

# Every intermediate artefact written by this notebook lives in one directory.
_PROCESSED_DIR = os.path.join("data", "process-labeled-commits")

# Intermediate files, listed in the order the notebook produces them.
RAW_COMMIT = os.path.join(_PROCESSED_DIR, "raw-commit-labels.json")
NON_FORKED_RAW_COMMIT = os.path.join(_PROCESSED_DIR, "non-fork-commit-labels.json")
DISTINCT_RAW_COMMIT = os.path.join(_PROCESSED_DIR, "distinct-commit-labels.json")
REMAPPED_RAW_COMMIT = os.path.join(_PROCESSED_DIR, "remapped-commit-labels.json")

# Final output: previous-study labels combined with the new commits.
FULL_COMMIT_LABELS = os.path.join(_PROCESSED_DIR, "full-commit-labels.json")


## Parsing to json + initial fork filter

In [2]:
import csv
import json

# Parse the labelled-commit CSV into a list of {"hash": ..., "codes": [...]} records.
# newline="" is what the csv module asks for when a file is handed to a reader.
with open(RAW_COMMIT_CSV, "r", newline="") as csv_file:
    raw_commit_labels = [
        {
            # The hash is whatever follows "/commit/" in the commit link.
            "hash": row["Commit Link"].split("/commit/")[1],
            # "Labels" is a comma-separated list; strip padding around each label.
            "codes": [label.strip() for label in row["Labels"].split(",")],
        }
        for row in csv.DictReader(csv_file)
    ]

with open(RAW_COMMIT, "w") as outfile:
    json.dump(raw_commit_labels, outfile)

# Keep a commit only when none of its codes mentions "fork".
# Filtering per commit (rather than per code) yields each commit at most once
# and drops commits that carry a fork code alongside other codes.
non_forked = [
    commit
    for commit in raw_commit_labels
    if not any("fork" in code for code in commit["codes"])
]

with open(NON_FORKED_RAW_COMMIT, "w") as outfile:
    json.dump(non_forked, outfile)
    

## Find unique commits

Also reports duplicate commits whose labels differ.

In [3]:
# Load the fork-filtered commits; the context manager closes the file handle.
with open(NON_FORKED_RAW_COMMIT) as infile:
    nuggets = json.load(infile)

# hash -> position of that hash's first occurrence in commit_list
hash_hist = {}
# first-seen record for every distinct hash, in input order
commit_list = []

for nugget in nuggets:
    commit_hash = nugget["hash"]
    first_index = hash_hist.get(commit_hash)

    if first_index is None:
        # First time this hash is seen: remember where it is stored.
        hash_hist[commit_hash] = len(commit_list)
        commit_list.append(nugget)
    elif commit_list[first_index] != nugget:
        # Same hash seen again with a different record: report both label
        # sets. The first-seen record is the one that is kept.
        print(commit_hash, "1:", commit_list[first_index]["codes"], "2:", nugget["codes"])

with open(DISTINCT_RAW_COMMIT, "w") as outfile:
    json.dump(commit_list, outfile)

b8c57d20558b2a82f0bdedbfbb16211b299583e7 1: ['other-false_positive'] 2: ['other-addition-unknown']
a07d0f5ae5219ebda9696978e49a6621ffe30a02 1: ['other-false_positive'] 2: ['other-addition-unknown']
be1245d8634025277ba79a4155ee88d7eaffcdfb 1: ['cost-saving-instance'] 2: ['cost-awareness-instance']


## Manually reassign labels to make consistent

| commit | codes left commit | codes right commit | result codes |
| --- | --- | --- | --- |
| b8c57d20558b2a82f0bdedbfbb16211b299583e7 | ['other-false_positive'] | ['other-addition-unknown'] | ['other-false_positive'] |
| a07d0f5ae5219ebda9696978e49a6621ffe30a02  | ['other-false_positive'] | ['other-addition-unknown'] | ['other-false_positive'] |
| be1245d8634025277ba79a4155ee88d7eaffcdfb  | ['cost-saving-instance'] | ['cost-awareness-instance'] | ['cost-saving-instance'] |

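These have been manually altered in the DISTINCT_RAW_COMMIT file. Note that re-running the cell above rewrites that file; because it keeps the first-seen entry for each hash, the regenerated labels match the result codes in the table.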

## Catalog labels

Optional: this cell only prints how often each label occurs and can be skipped.

In [4]:
# Load the de-duplicated commits; the context manager closes the file handle.
with open(DISTINCT_RAW_COMMIT) as infile:
    nuggets = json.load(infile)

# label -> number of commits' code lists it appears in (counted per occurrence)
label_hist = {}

for nugget in nuggets:
    for label in nugget["codes"]:
        # Missing labels start from 0, so no separate first-seen branch is needed.
        label_hist[label] = label_hist.get(label, 0) + 1

print(label_hist)

{'cost-saving-unknown': 47, 'other-addition-billing': 188, 'other-false_positive': 588, 'oither-false_positive': 1, 'cost-awareness-networking-NAT': 11, 'cost-saving-area': 12, 'other-addition-module': 121, 'cost-awareness-storage': 17, 'cost-saving-billing_mode': 13, 'cost-saving-cluster': 11, 'cost-awareness-instance': 48, 'other-addition-vars': 69, 'cost-awareness-billing_mode': 16, 'cost-saving-instance': 119, 'other-saving-billing_mode': 1, 'other-addition-cost_calculator': 54, 'other-addition-payload': 73, 'other-performance-payload': 5, 'other-refactor-payload': 35, 'other-bug_fix-modules': 1, 'other-bugfix-modules': 1, 'cost-saving-provider': 19, 'cost-awareness-policy': 8, 'cost-awareness-provider': 7, 'cost-increase-provider': 2, 'other-refactor-billing': 49, 'cost-awareness-unknown': 35, 'cost-saving-storage': 60, 'cost-saving-feature': 44, 'cost-saving-networking-NAT': 38, 'other-saving-provider': 2, 'other-performace-unknown': 2, 'cost-awareness-alert': 45, 'other-addition

## Mark unrelated labels

In [7]:
# Load the de-duplicated commits; the context manager closes the file handle.
with open(DISTINCT_RAW_COMMIT) as infile:
    nuggets = json.load(infile)

# Substrings that identify an "other" label. "oither" covers the misspelled
# 'oither-false_positive' label that shows up in the label catalog; without it
# a commit carrying only that label would not be marked unrelated.
other_markers = ("other", "oither")

# hash -> original codes, or ["unrelated"] when every code is an "other" label
output = {}

for nugget in nuggets:
    codes = nugget["codes"]
    only_other = all(
        any(marker in code for marker in other_markers) for code in codes
    )
    # An empty code list also counts as unrelated (all() of nothing is True),
    # matching the count-based comparison this replaces.
    output[nugget["hash"]] = ["unrelated"] if only_other else codes

with open(REMAPPED_RAW_COMMIT, "w") as outfile:
    json.dump(output, outfile)

## Combine with updated commits

In [14]:
# Load the remapped previous-study labels (hash -> codes); the context
# managers close both input files once they have been parsed.
with open(REMAPPED_RAW_COMMIT) as previous_commits_file:
    previous_commits = json.load(previous_commits_file)

with open(NEW_COMMITS) as new_commits_file:
    new_commits = json.load(new_commits_file)

for commit in new_commits:
    # Element 6 of the "/"-split URL is used as the hash.
    # NOTE(review): presumably URLs look like https://host/owner/repo/commit/<hash>
    # — confirm against the new-commits dataset.
    hash_ = commit["url"].split("/")[6]
    # A new commit overrides any previous-study entry with the same hash.
    previous_commits[hash_] = commit["codes"]

with open(FULL_COMMIT_LABELS, "w") as outfile:
    json.dump(previous_commits, outfile)