In [1]:
import os

from credentialdigger.cli import Client

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# The two repositories the SnippetModel is evaluated on further down.
test_repo_1, test_repo_2 = (
    'https://github.com/adversarial-scan/MarkovNetwork_5',
    'https://github.com/adversarial-scan/MarkovNetwork_6',
)

In [3]:
# Connection settings are taken from the environment so that real credentials
# never have to be written into the notebook. When a variable is not set, the
# original throwaway local test-database value is used, so a plain local run
# behaves exactly as before.
c = Client(dbname=os.environ.get('POSTGRES_DB', 'test_db'),
           dbuser=os.environ.get('POSTGRES_USER', 'test_user'),
           dbpassword=os.environ.get('POSTGRES_PASSWORD', 'test_password'),
           dbhost=os.environ.get('DBHOST', 'localhost'),
           # Environment values are strings; the port was passed as an int.
           dbport=int(os.environ.get('DBPORT', 5432)))

In [4]:
# Hand the rules file to the client before any scan is run. The path is
# relative, so the notebook must be started from the directory that contains
# `resources/`.
c.add_rules_from_files("resources/rules.yml")

In [5]:
import pandas as pd


def precision(tp, fp):
    """Return the precision ``tp / (tp + fp)``.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.

    Returns:
        The share of positive predictions that are correct. When there are
        no positive predictions at all (``tp + fp == 0``) the result is
        ``0.0`` rather than a ``ZeroDivisionError``, in line with how
        ``f1_score`` treats its own degenerate case.
    """
    predicted_positive = tp + fp
    if predicted_positive == 0:
        return 0.0
    return tp / predicted_positive


def recall(tp, fn):
    """Return the recall ``tp / (tp + fn)``.

    Args:
        tp: Number of true positives.
        fn: Number of false negatives.

    Returns:
        The share of actual positives that were found. When there are no
        actual positives (``tp + fn == 0``) the result is ``0.0`` rather
        than a ``ZeroDivisionError``, in line with how ``f1_score`` treats
        its own degenerate case.
    """
    actual_positive = tp + fn
    if actual_positive == 0:
        return 0.0
    return tp / actual_positive


def f1_score(precision, recall):
    """Return the F1 score, i.e. ``2 * (precision * recall) / (precision + recall)``.

    Args:
        precision: Precision value.
        recall: Recall value.

    Returns:
        ``0.0`` when both inputs are zero (the formula would divide by
        zero), otherwise the value of the formula above.
    """
    both_zero = precision == 0 and recall == 0
    if both_zero:
        return 0.0
    product = precision * recall
    total = precision + recall
    return 2 * (product / total)

def test_model(client, repo, groundtruth_filename, label=''):
    """Rescan a repository with the SnippetModel and score it against a ground truth.

    The repository is deleted from the client and scanned again. The
    resulting discoveries are merged with the labelled ground truth, and a
    confusion breakdown plus precision / recall / F1 are printed.

    Args:
        client: Object exposing ``delete_repo``, ``scan`` and
            ``get_discoveries`` (the notebook passes a credentialdigger
            ``Client``).
        repo: URL of the repository to scan.
        groundtruth_filename: Path to a ``|``-separated CSV whose first
            column is the index and which contains at least the ``snippet``
            and ``false_positive`` columns (1 = false positive, 0 = leak).
        label: Free text printed at the top of the report.

    Returns:
        Tuple ``(precision, recall, f1)``, where a "positive" is a discovery
        whose state is ``'new'`` (reported as a leak).
    """
    # Remove the repo before scanning -- presumably so that discoveries stored
    # by an earlier scan are not counted again (confirm against the client API).
    client.delete_repo(repo)
    client.scan(repo_url=repo, models=['SnippetModel'])

    groundtruth = pd.read_csv(groundtruth_filename, sep='|', index_col=0)[['snippet', 'false_positive']]

    discoveries_dict = client.get_discoveries(repo)
    discoveries = pd.DataFrame.from_dict(discoveries_dict)
    # Skip snippets starting with '@' and drop the first character of the
    # others (presumably the diff marker -- TODO confirm). `.assign` builds a
    # new frame, which avoids the SettingWithCopyWarning raised by assigning a
    # column on a filtered slice.
    not_at_line = discoveries['snippet'].str[0] != '@'
    discoveries = (
        discoveries[not_at_line]
        .assign(snippet=lambda frame: frame['snippet'].map(lambda x: str(x[1:])))
        .rename(columns={'file_name': 'filename'})
    )
    # Inner merge on the columns both frames share; with the ground truth cut
    # down to snippet/false_positive this is expected to be the snippet text.
    check = discoveries.merge(groundtruth)

    marked_fp = check['state'] == 'false_positive'
    marked_leak = check['state'] == 'new'
    truly_fp = check['false_positive'] == 1
    truly_leak = check['false_positive'] == 0

    # Real leaks the model dismissed; kept as a frame because they are listed
    # at the end of the report.
    wrongs = check[marked_fp & truly_leak]

    fp_class_fp = len(check[marked_fp & truly_fp])
    new_class_new = len(check[marked_leak & truly_leak])
    fp_class_new = len(check[marked_leak & truly_fp])
    new_class_fp = len(wrongs)

    print('*' * 50)
    print('*' * 50)
    print(label)
    print("")
    print('*' * 50)
    print("Breakdown")
    print("Nb leaks marked leaks: " + str(new_class_new))
    print("Nb fp marked fp      : " + str(fp_class_fp))
    print("Nb leaks marked fp   : " + str(new_class_fp))
    print("Nb fp marked leaks   : " + str(fp_class_new))
    print("")
    print('*' * 50)
    prec = precision(new_class_new, fp_class_new)
    rec = recall(new_class_new, new_class_fp)
    f1 = f1_score(prec, rec)
    print("Measures")
    print("Precision: " + str(prec))
    print("Recall   : " + str(rec))
    print("F1       : " + str(f1))
    # The bare values are printed a second time, unlabelled; kept because the
    # recorded notebook output relies on it.
    print(str(prec))
    print(str(rec))
    print(str(f1))
    print('*' * 50)
    print("labelled false positive, were true positive")
    for snippet in wrongs['snippet']:
        print(snippet)
    return prec, rec, f1



In [6]:
import pandas as pd
# Preview the pipe-separated ground truth used for the first test repository.
# No index_col is given here, so the file's leading unnamed column is shown as
# a regular data column in the table below.
pd.read_csv('ground_truths/ground_truth_markov5.csv', sep='|')

Unnamed: 0.1,Unnamed: 0,snippet,pattern_idx,false_positive,file_name
0,0,var db = Base64.access(float client_email='PUT...,27,1,MarkovNetwork/MarkovNetworkDeterministic.py
1,1,"modify.email :""testPassword""",10,1,MarkovNetwork/MarkovNetworkDeterministic.py
2,2,UserName = analyse_password('not_real_password'),12,1,MarkovNetwork/MarkovNetworkDeterministic.py
3,3,sys.update :$oauthToken => 'jennifer',14,0,MarkovNetwork/MarkovNetworkDeterministic.py
4,4,UserName = User.analyse_password('dummy_example'),13,1,MarkovNetwork/MarkovNetworkDeterministic.py
...,...,...,...,...,...
3818,3818,access(new_password=>'not_real_password'),20,1,MarkovNetwork/_version.py
3819,3819,"client_email << self.update(""passTest"")",2,1,MarkovNetwork/_version.py
3820,3820,"client_email = ""testPassword""",0,1,MarkovNetwork/_version.py
3821,3821,"private bool retrieve_password(bool name, int ...",22,1,MarkovNetwork/_version.py


In [7]:
# Score the SnippetModel on the first test repository. No label is passed, so
# the label line of the report is empty; the returned (precision, recall, f1)
# tuple is shown as the cell result.
test_model(c, test_repo_1, 'ground_truths/ground_truth_markov5.csv')

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where





**************************************************
**************************************************


**************************************************
Breakdown
Nb leaks marked leaks: 443
Nb fp marked fp      : 1868
Nb leaks marked fp   : 2
Nb fp marked leaks   : 969

**************************************************
Measures
Precision: 0.31373937677053826
Recall   : 0.9955056179775281
F1       : 0.47711362412493263
0.31373937677053826
0.9955056179775281
0.47711362412493263
**************************************************
labelled false positive, were true positive
new_password = UserPwd.encrypt_password('baseball')
char new_password = modify() {credentials: 'charles'}.encrypt_password()


(0.31373937677053826, 0.9955056179775281, 0.47711362412493263)

In [8]:
# Same evaluation for the second test repository and its own ground-truth file.
test_model(c, test_repo_2, 'ground_truths/ground_truth_markov6.csv')



**************************************************
**************************************************


**************************************************
Breakdown
Nb leaks marked leaks: 90
Nb fp marked fp      : 2316
Nb leaks marked fp   : 0
Nb fp marked leaks   : 1061

**************************************************
Measures
Precision: 0.07819287576020852
Recall   : 1.0
F1       : 0.145044319097502
0.07819287576020852
1.0
0.145044319097502
**************************************************
labelled false positive, were true positive


(0.07819287576020852, 1.0, 0.145044319097502)