In [5]:
import os
import pickle
import pandas as pd
from tqdm import tqdm

In [6]:
# Identifier used as the sub-directory name under "data/" for the generated files.
PROJECT = "la"

# Raw input tables, read with pandas.read_csv by run_la_data_generator().
COMMIT_METADATA_CSV = "raw-data/commit_metadata.csv"
COMMIT_CODE_CSV = "raw-data/commit_code.csv"

# Which pre-computed train/test split to use: "by_time" or "cross_project".
TRAIN_TEST_SPLIT_MODE = "by_time"

# Directory holding train_ids.txt / test_ids.txt for the selected split mode.
TRAIN_TEST_SPLIT_DIR = "raw-data/split_data/" + TRAIN_TEST_SPLIT_MODE
# Destination directory for the pickled feature tables.
OUTPUT_DIR = "/".join(("data", PROJECT, TRAIN_TEST_SPLIT_MODE))

In [7]:
def run_la_data_generator():
    """Build the train/test feature tables from the raw CSVs and pickle them.

    Reads ``COMMIT_METADATA_CSV`` and ``COMMIT_CODE_CSV``, hands both frames to
    ``get_la_data`` (which builds the per-commit table and splits it), then
    writes ``features_train.pkl`` and ``features_test.pkl`` into ``OUTPUT_DIR``,
    creating that directory if needed.
    """
    commit_metadata_df = pd.read_csv(COMMIT_METADATA_CSV)
    commit_code_df = pd.read_csv(COMMIT_CODE_CSV)
    print(f"Loaded commit data from CSV files [Size: {len(commit_metadata_df)}]")

    la_train_df, la_test_df = get_la_data(commit_metadata_df, commit_code_df)

    print(f"Dumping PKL files to {OUTPUT_DIR}")
    # exist_ok=True replaces the separate exists() check, which could race with
    # another process creating the directory between the check and makedirs().
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    la_train_df.to_pickle(os.path.join(OUTPUT_DIR, "features_train.pkl"))
    la_test_df.to_pickle(os.path.join(OUTPUT_DIR, "features_test.pkl"))
    
def _count_lines(text):
    """Return the number of lines in ``text``; non-string cells (e.g. NaN) count as 0."""
    return len(text.splitlines()) if isinstance(text, str) else 0


def get_la_data(commit_metadata_df, commit_code_df):
    """Build the per-commit lines-added / lines-deleted table and split it.

    Parameters
    ----------
    commit_metadata_df : pandas.DataFrame
        Must contain ``commit_id`` and ``vul_label`` columns; one output row is
        produced per row of this frame, in the same order.
    commit_code_df : pandas.DataFrame
        Must contain a ``commit_id`` column. The columns at positions 1 and 2
        are read as the added and removed code text respectively.
        NOTE(review): the column names at those positions are not visible
        here -- confirm against the CSV header.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Training and testing frames as returned by ``split_data``, with columns
        ``commit_hash``, ``is_buggy_commit``, ``la`` and ``ld``.
    """
    print("Building LAPredict data format")

    # Aggregate line counts in a single pass over the code table instead of
    # re-filtering the whole frame once per commit.
    # Columns 1 and 2 are addressed with .iloc because integer keys on a
    # label-indexed row are deprecated as positional access in pandas.
    line_totals = {}
    code_rows = zip(
        commit_code_df["commit_id"].tolist(),
        commit_code_df.iloc[:, 1].tolist(),
        commit_code_df.iloc[:, 2].tolist(),
    )
    for code_commit_id, added_code, removed_code in code_rows:
        if pd.isna(code_commit_id):
            # A missing id never compared equal to any commit in the original
            # equality filter, so such rows contribute to no commit.
            continue
        added_total, removed_total = line_totals.get(code_commit_id, (0, 0))
        line_totals[code_commit_id] = (
            added_total + _count_lines(added_code),
            removed_total + _count_lines(removed_code),
        )

    # Collect plain records and build the frame once; appending rows through
    # .loc re-allocates the frame on every insertion.
    records = []
    for commit_id, vul_label in tqdm(commit_metadata_df[["commit_id", "vul_label"]].values):
        # Commits with no rows in the code table get zero added/removed lines.
        added_code_num, removed_code_num = line_totals.get(commit_id, (0, 0))
        records.append((commit_id, vul_label, added_code_num, removed_code_num))

    la_df = pd.DataFrame(records, columns=['commit_hash', 'is_buggy_commit', 'la', 'ld'])

    training_data_df, testing_data_df = split_data(la_df)

    return training_data_df, testing_data_df

def _read_id_list(path):
    """Return the whitespace-stripped, non-blank lines of the text file at ``path``."""
    # The context manager closes the handle deterministically.
    with open(path) as id_file:
        stripped_lines = (line.strip() for line in id_file)
        return [commit_id for commit_id in stripped_lines if commit_id]


def split_data(input_data_df):
    """Split ``input_data_df`` into train/test frames using the id list files.

    Reads ``train_ids.txt`` and ``test_ids.txt`` from ``TRAIN_TEST_SPLIT_DIR``
    (one commit hash per line). Each returned frame is ordered like its id
    file and keeps ``commit_hash`` as a regular column.

    Parameters
    ----------
    input_data_df : pandas.DataFrame
        Must contain a ``commit_hash`` column.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)``.

    Raises
    ------
    AssertionError
        If the number of distinct ids across both files differs from the
        number of rows in ``input_data_df``.
    KeyError
        If a listed id is absent from the corresponding partition.
    """
    print(f"Splitting training & testing data from size {len(input_data_df)}")
    train_ids = _read_id_list(os.path.join(TRAIN_TEST_SPLIT_DIR, "train_ids.txt"))
    test_ids = _read_id_list(os.path.join(TRAIN_TEST_SPLIT_DIR, "test_ids.txt"))
    assert len(set(train_ids + test_ids)) == len(input_data_df)

    # A boolean mask (rather than unpacking groupby groups) still yields two
    # frames when one side of the split is empty.
    in_train = input_data_df['commit_hash'].isin(train_ids)
    train_df = input_data_df[in_train]
    test_df = input_data_df[~in_train]

    # Re-order each partition to follow the order given in its id file.
    train_df = train_df.set_index('commit_hash').loc[train_ids].reset_index()
    test_df = test_df.set_index('commit_hash').loc[test_ids].reset_index()
    return train_df, test_df
# Run the whole pipeline: load the CSVs, build the feature table, split it,
# and write the train/test PKL files.
run_la_data_generator()

Loaded commit data from CSV files [Size: 20274]
Building LAPredict data format


100%|██████████| 20274/20274 [01:26<00:00, 233.82it/s]

Splitting training & testing data from size 20274
Dumping PKL files to data/la/by_time





In [8]:
# Sanity check: reload the training features written by run_la_data_generator().
# pd.read_pickle opens and closes the file itself, so no handle is left open
# (the previous pickle.load(open(...)) never closed it).
# NOTE: unpickling executes arbitrary code -- only load files you produced yourself.
df = pd.read_pickle(f'data/{PROJECT}/{TRAIN_TEST_SPLIT_MODE}/features_train.pkl')
print(df)

                                    commit_hash  is_buggy_commit   la   ld
0      2b1702456dab2e8de6f74e8e4e03aac87571aafd                1  470   83
1      04906bd5de2f220bf100b605dad37b4a1d9a91a6                0    1    0
2      2864e767053317538feafa815046fff89e5a16be                0   77   43
3      795b859eee96c700e8f3c3fe68e6a9a39d95797c                1  940  431
4      c0f8b0470cbc3707993db02a81f4356e294adcf1                1   30   45
...                                         ...              ...  ...  ...
16214  98da63b3f5f5a277c5c3a16860db9a9f6741e54c                0    1    1
16215  5d996b56499f00f80b02a41bab3d6b7349e36e9d                0   10    0
16216  a84d610b372c63e8a48a9ed7c038a2954097512c                0    4    1
16217  23f3f92361a3db53e595de33cfd5440f53bee220                0    5    5
16218  7bc4f0846f5e15dad5a54490290241243b5a4416                0    1    0

[16219 rows x 4 columns]
