In [4]:
import pandas
import os
import datetime
import json
import binascii
import hashlib

In [5]:
def find_active_scan_files(active_scan_path):
    """Collect the ``certs.txt`` file of every ``b_*`` folder of an active scan.

    Only direct sub-directories of ``active_scan_path`` whose name starts with
    ``b_`` are searched. The number of directory entries and the name of every
    entry are printed as progress output.

    Args:
        active_scan_path: Directory that holds the ``b_*`` folders. The path
            may be given with or without a trailing separator.

    Returns:
        list[str]: Paths to the ``certs.txt`` files that were found, ordered
        by folder name.
    """
    files_names = []
    # Sorted so the processing order does not depend on the file system.
    file_dirs = sorted(os.listdir(active_scan_path))
    print(len(file_dirs))
    for folder in file_dirs:
        print(folder)
        if not folder.startswith('b_'):
            continue
        # os.path.join inserts the separator only when it is missing, so the
        # folder path and the file path are always built consistently.
        folder_path = os.path.join(active_scan_path, folder)
        if not os.path.isdir(folder_path):
            # A plain file that happens to be called ``b_*`` cannot be listed.
            continue
        if 'certs.txt' in os.listdir(folder_path):
            files_names.append(os.path.join(folder_path, 'certs.txt'))
    return files_names


def get_ccadb_hashes(filePath="../datasets/tls_scans/ccadb/cert_hashes_ccadb.txt"):
    """Load the CCADB certificate hashes from a text file, one per line.

    Args:
        filePath: Path of the text file to read.

    Returns:
        dict: Every line of the file (trailing whitespace removed) as a key,
        mapped to ``None``. The dict is used for fast membership tests.
    """
    with open(filePath, 'rt') as hash_file:
        return dict.fromkeys(entry.rstrip() for entry in hash_file)


def hash_cert(pem):
    """Compute the SHA-1 digest of the decoded body of a PEM certificate.

    The newline characters and the ``BEGIN``/``END CERTIFICATE`` marker lines
    are removed, the remaining base64 text is padded with ``=`` to a multiple
    of four characters, decoded, and hashed.

    Args:
        pem: Certificate in PEM text form.

    Returns:
        tuple[str, str]: The hexadecimal SHA-1 digest of the decoded bytes and
        the padded base64 string that was decoded.
    """
    body = pem
    for token in ('\n', '-----BEGIN CERTIFICATE-----', '-----END CERTIFICATE-----'):
        body = body.replace(token, '')
    # -len % 4 is the number of '=' needed to reach a multiple of four.
    body += "=" * (-len(body) % 4)
    raw_bytes = binascii.a2b_base64(body)
    return hashlib.sha1(raw_bytes).hexdigest(), body


def parse_date(value):
    """Convert a date/time string to the UTC timestamp of its calendar day.

    Only the ``YYYY-MM-DD`` part in front of the first ``T`` (if the string
    contains one) or the first space is parsed; the time of day is discarded.

    Args:
        value: String starting with a ``YYYY-MM-DD`` date.

    Returns:
        float: POSIX timestamp of midnight UTC on that date.
    """
    separator = 'T' if 'T' in value else ' '
    day_text = value.split(separator)[0]
    day = datetime.datetime.strptime(day_text, '%Y-%m-%d')
    return day.replace(tzinfo=datetime.timezone.utc).timestamp()


def is_expired(not_after, not_before, snapshot_date_timestamp):
    """Tell whether the snapshot date lies outside a certificate's validity.

    One week of tolerance is applied on both ends of the validity period, so
    the result is also ``True`` for certificates that are not yet valid.

    Args:
        not_after: End of the validity period as a timestamp in seconds.
        not_before: Start of the validity period as a timestamp in seconds.
        snapshot_date_timestamp: Timestamp of the snapshot in seconds.

    Returns:
        bool: ``False`` when the snapshot falls inside the widened validity
        window, ``True`` otherwise.
    """
    tolerance = 7 * 24 * 60 * 60  # one week in seconds
    inside_window = (not_before - tolerance) <= snapshot_date_timestamp <= (not_after + tolerance)
    return not inside_window


def _is_end_entity(cert):
    """Return True if ``cert`` is a non-CA, non-self-signed cert with DNS names."""
    # ``== False`` is intentional: only an explicit false value qualifies, a
    # ``None`` value must not be treated as "not a CA" / "not self-signed".
    return (
        'basic_constraints' in cert
        and 'is_ca' in cert['basic_constraints']
        and cert['basic_constraints']['is_ca'] == False  # noqa: E712
        and 'dns_names' in cert
        and len(cert['dns_names']) > 0
        and 'is_self_signed' in cert
        and cert['is_self_signed'] == False  # noqa: E712
    )


def validate_certificate(cert_list, ccadb_hashes, snapshot_timestamp):
    """Validate a certificate chain and return its end-entity certificate.

    The chain is rejected (``None``) when
      * it is empty,
      * its last certificate carries a ``pem`` whose hash is not contained in
        ``ccadb_hashes``, or
      * any certificate that has both ``not_before`` and ``not_after`` is
        outside its validity window at ``snapshot_timestamp``.

    Args:
        cert_list: List of certificate dicts. The last element is treated as
            the root certificate (assumes the chain is ordered towards the
            root -- TODO confirm against the scan format).
        ccadb_hashes: Container of accepted root certificate hashes.
        snapshot_timestamp: Timestamp (seconds) the validity is checked at.

    Returns:
        dict | None: The last certificate in the chain that is not a CA, is
        not self-signed and lists at least one DNS name; ``None`` if the chain
        is rejected or contains no such certificate.
    """
    # An empty chain has no root to check and no end-entity certificate;
    # indexing it with [-1] would raise IndexError.
    if not cert_list:
        return None

    # Check for root certificate in ccadb.
    # NOTE(review): a root entry without a 'pem' field skips this check
    # entirely -- confirm that this is intended.
    root_cert = cert_list[-1]
    if 'pem' in root_cert:
        root_hash, _ = hash_cert(root_cert['pem'])
        if root_hash not in ccadb_hashes:
            return None

    ee_cert = None
    for cert in cert_list:
        if 'not_before' in cert and 'not_after' in cert:
            if is_expired(parse_date(cert['not_after']), parse_date(cert['not_before']), snapshot_timestamp):
                return None

        if _is_end_entity(cert):
            ee_cert = cert
    return ee_cert


def active_scan_preprocessing(file_names, ccadb_hashes, snapshot_timestamp, result_path):
    """Extract the valid end-entity certificates from active-scan result files.

    Every input file is read line by line; each line is expected to hold one
    JSON object that maps an IP address to its scan result. For every result
    with a non-empty ``certificates`` chain that passes
    ``validate_certificate``, one line ``{ip: end_entity_cert}`` is appended
    to ``ee_certs.txt`` inside ``result_path``.

    Args:
        file_names: Paths of the scan files to process.
        ccadb_hashes: Accepted root certificate hashes, forwarded to
            ``validate_certificate``.
        snapshot_timestamp: Timestamp (seconds) of the scan snapshot, forwarded
            to ``validate_certificate``.
        result_path: Directory the ``ee_certs.txt`` output file is written to
            (with or without a trailing separator).
    """
    with open(os.path.join(result_path, "ee_certs.txt"), 'wt') as write_file:
        for i, file_name in enumerate(file_names):
            print(f"Processing file {i} out of {len(file_names)}:\t {file_name}")
            with open(file_name, "rt") as current_file:
                for line in current_file:
                    try:
                        data = json.loads(line.strip())
                    except json.JSONDecodeError:
                        # Best effort: blank or malformed lines are skipped.
                        continue
                    if not isinstance(data, dict):
                        continue

                    # Each parsed line is handled here, inside the line loop;
                    # otherwise only the last record of a file would be
                    # processed and an empty file would reuse stale data.
                    for ip, scan_result in data.items():
                        if not isinstance(scan_result, dict):
                            continue
                        certificates = scan_result.get('certificates')
                        if not certificates:
                            # Missing or empty chain: nothing to validate.
                            continue
                        ee_cert = validate_certificate(certificates, ccadb_hashes, snapshot_timestamp)
                        if ee_cert is not None:
                            json_format = json.dumps({ip: ee_cert})
                            write_file.write(f"{json_format}\n")
                    
            
            
    

In [6]:
def main(datatype="active"):
    """Run the certificate preprocessing for the scan snapshot of 5-10-2020.

    The CCADB hashes are always loaded; the scan files are only located and
    processed when ``datatype`` is ``"active"``.

    Args:
        datatype: Kind of dataset to process. Only ``"active"`` triggers work.
    """
    output_dir = "./results/"

    # ccadb SSL/TLS resources: https://www.ccadb.org/resources pem of root certificates
    trusted_hashes = get_ccadb_hashes("../datasets/tls_scans/ccadb/cert_hashes_ccadb.txt")

    snapshot_day = datetime.datetime.strptime("5-10-2020", '%d-%m-%Y')
    snapshot_ts = snapshot_day.replace(tzinfo=datetime.timezone.utc).timestamp()

    if datatype != "active":
        return

    scan_files = find_active_scan_files("../Dataset-ignore/active_scan/")
    active_scan_preprocessing(scan_files, trusted_hashes, snapshot_ts, output_dir)
    
    
# Run the preprocessing with the default datatype ("active").
main()

6
b_994
b_995
b_996
b_997
b_998
b_999
Processing file 0 out of 6:	 ../Dataset-ignore/active_scan/b_994/certs.txt
Processing file 1 out of 6:	 ../Dataset-ignore/active_scan/b_995/certs.txt
Processing file 2 out of 6:	 ../Dataset-ignore/active_scan/b_996/certs.txt
Processing file 3 out of 6:	 ../Dataset-ignore/active_scan/b_997/certs.txt
Processing file 4 out of 6:	 ../Dataset-ignore/active_scan/b_998/certs.txt
Processing file 5 out of 6:	 ../Dataset-ignore/active_scan/b_999/certs.txt


In [7]:
ccadb_path = "../datasets/tls_scans/ccadb/cert_hashes_ccadb.txt"

# Sanity check: print how many lines the CCADB hash file contains.
counter = 0
with open(ccadb_path, "rt") as file:
    for counter, line in enumerate(file, start=1):
        pass
print(counter)

6568
