In [11]:
import os
import pickle
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [12]:
PROJECT = "jitfine"
COMMIT_METADATA_CSV = "raw-data/commit_metadata.csv"
METRICS_CSV = "raw-data/commit_expert_features_14_metrics.csv"
COMMIT_CODE_CSV = "raw-data/commit_code.csv"
TRAIN_TEST_SPLIT_MODE = "by_time"  # by_time/cross_project

TRAIN_TEST_SPLIT_DIR = f"raw-data/split_data/{TRAIN_TEST_SPLIT_MODE}"
OUTPUT_DIR = f"data/{PROJECT}/{TRAIN_TEST_SPLIT_MODE}"

In [13]:
def run_jit_data_generator():
    commit_metadata_df = pd.read_csv(COMMIT_METADATA_CSV)

    commit_expert_feature_df = pd.read_csv(METRICS_CSV)
    commit_expert_feature_df.columns = [col_name.lower() for col_name in
                                        commit_expert_feature_df.columns.values.tolist()]
    commit_expert_feature_df.rename(columns={'commit_id': 'commit_hash'}, inplace=True)
    commit_expert_feature_df = commit_expert_feature_df[
        ['index', 'commit_hash', 'la', 'ld', 'nf', 'ns', 'nd', 'entropy',
         'ndev', 'lt', 'nuc', 'age', 'exp', 'rexp', 'sexp', 'fix']]
    commit_code_df = pd.read_csv(COMMIT_CODE_CSV)

    print(f"Loaded commit data from CSV files [Size: {len(commit_metadata_df)}]")

    code_change_data_objs, expert_feature_data_dfs = get_cc2vec_data(commit_metadata_df, commit_expert_feature_df,
                                                                     commit_code_df)

    ef_train_df, ef_val_df, ef_test_df = expert_feature_data_dfs
    cc_train_obj, cc_val_obj, cc_test_obj = code_change_data_objs

    print(f"Dumping PKL files to {OUTPUT_DIR}")

    # dump Expert Features with Pandas Dataframe pickles
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
    ef_train_df.to_pickle(os.path.join(OUTPUT_DIR, "features_train.pkl"))
    ef_val_df.to_pickle(os.path.join(OUTPUT_DIR, "features_valid.pkl"))
    ef_test_df.to_pickle(os.path.join(OUTPUT_DIR, "features_test.pkl"))

    # dump Code Changes with Python pickles
    with open(os.path.join(OUTPUT_DIR, "changes_train.pkl"), 'wb') as f:
        pickle.dump(cc_train_obj, f)
    with open(os.path.join(OUTPUT_DIR, "changes_valid.pkl"), 'wb') as f:
        pickle.dump(cc_val_obj, f)
    with open(os.path.join(OUTPUT_DIR, "changes_test.pkl"), 'wb') as f:
        pickle.dump(cc_test_obj, f)


def get_cc2vec_data(commit_metadata_df, commit_expert_feature_df, commit_code_df):
    print(f"Building JITFine data format")
    ids, labels, msgs, codes = [], [], [], []

    for commit_id, vul_label, commit_msg in tqdm(
            commit_metadata_df[["commit_id", "vul_label", "commit_message"]].values):
        label = vul_label

        if not isinstance(commit_msg, str):
            commit_msg = ""
        msg = split_sentence(commit_msg)
        msg = ' '.join(msg.split(' ')).lower()

        commit_code_detail = commit_code_df[commit_code_df["commit_id"] == commit_id]
        files_changed_code = {"added_code": set(), "removed_code": set()}
        for diff_file in commit_code_detail.iterrows():
            added_code = diff_file[1][1]
            removed_code = diff_file[1][2]

            if isinstance(removed_code, str):
                for line in removed_code.splitlines():
                    removed_line = line.strip()
                    removed_line = ' '.join(split_sentence(removed_line).split())
                    files_changed_code["removed_code"].add(removed_line)
            if isinstance(added_code, str):
                for line in added_code.splitlines():
                    added_line = line.strip()
                    added_line = ' '.join(split_sentence(added_line).split())
                    files_changed_code["added_code"].add(added_line)

        ids.append(commit_id)
        labels.append(label)
        msgs.append(msg)
        codes.append(files_changed_code)

    code_data = [ids, labels, msgs, codes]

    code_data_splits, expert_feature_df_splits = split_data(code_data, commit_expert_feature_df)

    return code_data_splits, expert_feature_df_splits


def split_data(code_data, commit_expert_feature_df):
    print(f"Splitting training & testing data from size {len(code_data[0])}")
    data_id_to_index_dict = {c_id: i for i, c_id in enumerate(code_data[0])}
    train_ids = [l.strip() for l in open(os.path.join(TRAIN_TEST_SPLIT_DIR, "train_ids.txt")).readlines()]
    test_ids = [l.strip() for l in open(os.path.join(TRAIN_TEST_SPLIT_DIR, "test_ids.txt")).readlines()]
    # assert len(set(train_ids + test_ids)) == len(code_data[0])
    train_ids = list(set(train_ids) & set(code_data[0]))
    test_ids = list(set(test_ids) & set(code_data[0]))
    training_data, val_data, testing_data = [[], [], [], []], [[], [], [], []], [[], [], [], []]
    ef_train_df, ef_test_df = [x for _, x in commit_expert_feature_df.groupby(
        ~commit_expert_feature_df['commit_hash'].isin(train_ids))]

    train_ids, val_ids = train_test_split(train_ids, test_size=0.20, shuffle=False)
    ef_train_df, ef_val_df = [x for _, x in ef_train_df.groupby(~ef_train_df['commit_hash'].isin(train_ids))]

    ef_train_df = ef_train_df.set_index('commit_hash').loc[train_ids].reset_index()
    ef_val_df = ef_val_df.set_index('commit_hash').loc[val_ids].reset_index()
    ef_test_df = ef_test_df.set_index('commit_hash').loc[test_ids].reset_index()

    for _id in train_ids:
        index = data_id_to_index_dict[_id]
        training_data[0].append(code_data[0][index])
        training_data[1].append(code_data[1][index])
        training_data[2].append(code_data[2][index])
        training_data[3].append(code_data[3][index])

    for _id in val_ids:
        index = data_id_to_index_dict[_id]
        val_data[0].append(code_data[0][index])
        val_data[1].append(code_data[1][index])
        val_data[2].append(code_data[2][index])
        val_data[3].append(code_data[3][index])

    for _id in test_ids:
        index = data_id_to_index_dict[_id]
        testing_data[0].append(code_data[0][index])
        testing_data[1].append(code_data[1][index])
        testing_data[2].append(code_data[2][index])
        testing_data[3].append(code_data[3][index])

    return (training_data, val_data, testing_data), (ef_train_df, ef_val_df, ef_test_df)


def split_sentence(sentence):
    sentence = sentence.replace('.', ' . ').replace('_', ' ').replace('@', ' @ ') \
        .replace('-', ' - ').replace('~', ' ~ ').replace('%', ' % ').replace('^', ' ^ ') \
        .replace('&', ' & ').replace('*', ' * ').replace('(', ' ( ').replace(')', ' ) ') \
        .replace('+', ' + ').replace('=', ' = ').replace('{', ' { ').replace('}', ' } ') \
        .replace('|', ' | ').replace('\\', ' \ ').replace('[', ' [ ').replace(']', ' ] ') \
        .replace(':', ' : ').replace(';', ' ; ').replace(',', ' , ').replace('<', ' < ') \
        .replace('>', ' > ').replace('?', ' ? ').replace('/', ' / ')
    sentence = ' '.join(sentence.split())
    return sentence

run_jit_data_generator()

Loaded commit data from CSV files [Size: 20274]
Building JITFine data format


100%|██████████| 20274/20274 [01:13<00:00, 275.95it/s]


Splitting training & testing data from size 20274
Dumping PKL files to data/jitfine/by_time


In [14]:
df = pickle.load(open(f'data/{PROJECT}/{TRAIN_TEST_SPLIT_MODE}/features_train.pkl', 'rb'))
print(df)

                                    commit_hash  index   la  ld  nf  ns  nd  \
0      d208d1eba3799c58fd6d3602d31de3e686f14aec   7584    4  10   1   1   1   
1      5d7743019b327b3333947f5e96ca6289654c4aa7   2606  415   1   2   2   2   
2      54a6c11b20bb635ac5bb5d9369782bf00d0c7e19   5396    8   0   1   1   1   
3      444bc908611ccaf4512dc37c33ac3b54d873a62b   4790   16   8   1   1   1   
4      1a6245a5b0b4e8d822c739b403fc67c8a7bc8d12   3233  103  47   1   1   1   
...                                         ...    ...  ...  ..  ..  ..  ..   
12970  12abac8bb78c494597d740e7f9afd202f63180a3     42   58  34   1   1   1   
12971  079d0b7f1eedcc634c371fe05b617fdc55c8b762   1393   91  86   2   2   2   
12972  57ec555e8ef3c5ef1d77d48dc7cc868e56ddadc9   3658   17   2   1   1   1   
12973  cb077b7aa319caf4a11e811df93b1c2b86fff954   4744    0   2   1   1   1   
12974  2d83f323d63332e5ecaa481d8f9301c0ea92b6ba   1915    9  10   1   1   1   

        entropy  ndev     lt  nuc     age   exp    