In [1]:
import pandas
import os
import datetime
import json
import binascii
import hashlib

In [7]:
def find_active_scan_files(active_scan_path):
    """Collect the ``certs.txt`` files found directly inside ``b_*`` folders.

    Every entry of *active_scan_path* whose name starts with ``b_`` and that
    is a directory is inspected (non-recursively) for a file named exactly
    ``certs.txt``.

    Args:
        active_scan_path: Root directory of the scan, with or without a
            trailing path separator.

    Returns:
        A list of paths to the ``certs.txt`` files, in ``os.listdir`` order.

    Raises:
        OSError: If *active_scan_path* cannot be listed.
    """
    files_names = []
    file_dirs = os.listdir(active_scan_path)
    print(len(file_dirs))
    for folder in file_dirs:
        print(folder)
        if not folder.startswith('b_'):
            continue
        # Build paths with os.path.join so the result is correct whether or
        # not the caller supplied a trailing separator.
        folder_path = os.path.join(active_scan_path, folder)
        if not os.path.isdir(folder_path):
            # A plain file named "b_..." cannot be listed; skip it.
            continue
        if 'certs.txt' in os.listdir(folder_path):
            files_names.append(os.path.join(folder_path, 'certs.txt'))
    return files_names


def get_ccadb_hashes(filePath="../datasets/tls_scans/ccadb/cert_hashes_ccadb.txt"):
    """Load the hash list stored one entry per line in *filePath*.

    Trailing whitespace (including the newline) is stripped from every line.
    The entries become the keys of the returned dict so that membership tests
    are constant-time; every value is ``None``.

    Args:
        filePath: Text file containing one hash per line.

    Returns:
        A dict mapping each stripped line to ``None``.
    """
    with open(filePath, 'rt') as hash_file:
        return dict.fromkeys(entry.rstrip() for entry in hash_file)


def hash_cert(pem):
    """Compute the SHA-1 fingerprint of a PEM-encoded certificate.

    Newlines and the BEGIN/END CERTIFICATE markers are removed, the remaining
    base64 text is padded with ``=`` to a multiple of four characters, and the
    decoded bytes are hashed.

    Args:
        pem: Certificate text, with or without the PEM armor lines.

    Returns:
        A ``(hex_digest, base64_body)`` tuple, where ``base64_body`` is the
        stripped and padded base64 string that was decoded.
    """
    body = pem
    # Same removal order as always: newlines first, so armor markers that were
    # wrapped across lines are re-joined before being stripped.
    for token in ('\n', '-----BEGIN CERTIFICATE-----', '-----END CERTIFICATE-----'):
        body = body.replace(token, '')

    missing_padding = -len(body) % 4
    padded = body + "=" * missing_padding

    digest = hashlib.sha1(binascii.a2b_base64(padded)).hexdigest()
    return digest, padded


def parse_date(value):
    if 'T' in value:
        return datetime.datetime.strptime(value.split('T')[0], '%Y-%m-%d').replace(tzinfo=datetime.timezone.utc).timestamp()
    else:
        return datetime.datetime.strptime(value.split(' ')[0], '%Y-%m-%d').replace(tzinfo=datetime.timezone.utc).timestamp()


def is_expired(not_after, not_before, snapshot_date_timestamp):
    """Tell whether the snapshot time lies outside a certificate's validity.

    The validity window is widened by one week on both sides. Despite the
    name, a certificate that is not yet valid at the snapshot time is also
    reported as expired.

    Args:
        not_after: End of validity as a POSIX timestamp.
        not_before: Start of validity as a POSIX timestamp.
        snapshot_date_timestamp: Moment of the scan as a POSIX timestamp.

    Returns:
        ``False`` when the snapshot falls inside
        ``[not_before - 1 week, not_after + 1 week]``, ``True`` otherwise.
    """
    grace = 7 * 24 * 60 * 60
    earliest = not_before - grace
    latest = not_after + grace
    return not (earliest <= snapshot_date_timestamp <= latest)


def _is_end_entity(cert):
    """Return True for a non-CA, non-self-signed cert carrying DNS names."""
    constraints = cert.get('basic_constraints')
    if not constraints or constraints.get('is_ca') is not False:
        return False
    if not cert.get('dns_names'):
        return False
    return cert.get('is_self_signed') is False


def validate_certificate(cert_list, ccadb_hashes, snapshot_timestamp):
    """Validate a certificate chain and return its end-entity certificate.

    The chain is rejected (``None``) when:
      * it is empty,
      * its last element has a ``pem`` whose SHA-1 hash is not in
        *ccadb_hashes*, or
      * any element with both ``not_before`` and ``not_after`` is outside
        its validity window at *snapshot_timestamp* (see ``is_expired``).

    NOTE(review): the last element is treated as the root certificate;
    presumably chains are ordered leaf-first. Confirm against the scanner
    output. A last element without a ``pem`` key skips the root check.

    Args:
        cert_list: List of certificate dicts as decoded from the scan JSON.
        ccadb_hashes: Container of trusted root hashes (hex SHA-1 strings).
        snapshot_timestamp: POSIX timestamp of the scan.

    Returns:
        The last certificate in the chain that is not a CA, is not
        self-signed and has at least one DNS name, or ``None`` if the chain
        is rejected or contains no such certificate.
    """
    # An empty chain has no root to check and no end-entity to return.
    if not cert_list:
        return None

    # Check for root certificate in ccadb
    root = cert_list[-1]
    if 'pem' in root:
        root_hash, _ = hash_cert(root['pem'])
        if root_hash not in ccadb_hashes:
            return None

    ee_cert = None
    for cert in cert_list:
        if 'not_before' in cert and 'not_after' in cert:
            if is_expired(parse_date(cert['not_after']), parse_date(cert['not_before']), snapshot_timestamp):
                return None

        if _is_end_entity(cert):
            ee_cert = cert
    return ee_cert


def active_scan_preprocessing(file_names, ccadb_hashes, snapshot_timestamp, result_path):
    """Extract the valid end-entity certificates from the scan files.

    Each input file is read line by line; every line is expected to be a
    JSON object mapping an IP to a record. For every record that has a
    ``certificates`` list, the chain is passed to ``validate_certificate``
    and, if an end-entity certificate is returned, ``{ip: cert}`` is written
    as one JSON line to ``<result_path>ee_certs.txt``.

    Lines that are not valid JSON, or that do not decode to an object, are
    skipped.

    Args:
        file_names: Paths of the scan files to process.
        ccadb_hashes: Trusted root hashes, forwarded to the validator.
        snapshot_timestamp: POSIX timestamp of the scan, forwarded as well.
        result_path: Prefix prepended verbatim to ``ee_certs.txt`` (normally
            a directory path ending in a separator).
    """
    output_path = result_path + "ee_certs.txt"
    output_dir = os.path.dirname(output_path)
    if output_dir:
        # Avoid failing after the setup work just because the folder is missing.
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, 'wt') as write_file:
        for i, file_name in enumerate(file_names):
            print(f"Processing file {i} out of {len(file_names)}:\t {file_name}")
            with open(file_name, "rt") as current_file:
                for line in current_file:
                    try:
                        data = json.loads(line.strip())
                    except json.JSONDecodeError:
                        continue
                    if not isinstance(data, dict):
                        continue

                    # Handle every parsed line, not just the last one of the
                    # file: this loop must live inside the line loop.
                    for ip, record in data.items():
                        if not isinstance(record, dict) or "certificates" not in record:
                            continue
                        ee_cert = validate_certificate(record['certificates'], ccadb_hashes, snapshot_timestamp)
                        if ee_cert is not None:
                            json_format = json.dumps({ip: ee_cert})
                            write_file.write(f"{json_format}\n")
                    
            
            
    

In [8]:
def main(datatype="active"):
    """Run the certificate preprocessing pipeline for one dataset type.

    The CCADB hash list is loaded and the snapshot timestamp (5 October 2020,
    UTC) is computed regardless of *datatype*. Only the ``"active"`` datatype
    triggers any further work: locating the scan files under the hard-coded
    scan directory and writing the filtered end-entity certificates to
    ``./results/``.

    Args:
        datatype: Kind of dataset to process; anything other than
            ``"active"`` results in no processing.
    """
    output_dir = "./results/"
    trusted_roots = get_ccadb_hashes("../datasets/tls_scans/ccadb/cert_hashes_ccadb.txt")

    snapshot_day = datetime.datetime.strptime("5-10-2020", '%d-%m-%Y')
    snapshot_epoch = snapshot_day.replace(tzinfo=datetime.timezone.utc).timestamp()

    # ccadb SSL/TLS resources: https://www.ccadb.org/resources pem of root certificates
    if datatype != "active":
        return

    scan_root = "/media/gerben/Seagate/Study/hacking_lab/tls_scans/active/"
    cert_files = find_active_scan_files(scan_root)
    active_scan_preprocessing(cert_files, trusted_roots, snapshot_epoch, output_dir)
    
    
# Notebook entry point: run the pipeline with the default "active" datatype.
main()

4361
b_1
b_10
b_11
b_12
b_13
b_100
b_101
b_102
b_103
b_104
b_105
b_106
b_107
b_108
b_109
b_110
b_111
b_112
b_113
b_114
b_115
b_116
b_117
b_118
b_119
b_120
b_121
b_122
b_123
b_124
b_125
b_126
b_127
b_128
b_129
b_130
b_131
b_132
b_1000
b_1001
b_1002
b_1003
b_1004
b_1005
b_1006
b_1007
b_1008
b_1009
b_1010
b_1011
b_1012
b_1013
b_1014
b_1015
b_1016
b_1017
b_1018
b_1019
b_1020
b_1021
b_1022
b_1023
b_1024
b_1025
b_1026
b_1027
b_1028
b_1029
b_1030
b_1031
b_1032
b_1033
b_1034
b_1035
b_1036
b_1037
b_1038
b_1039
b_1040
b_1041
b_1042
b_1043
b_1044
b_1045
b_1046
b_1047
b_1048
b_1049
b_1050
b_1051
b_1052
b_1053
b_1054
b_1055
b_1056
b_1057
b_1058
b_1059
b_1060
b_1061
b_1062
b_1063
b_1064
b_1065
b_1066
b_1067
b_1068
b_1069
b_1070
b_1071
b_1072
b_1073
b_1074
b_1075
b_1076
b_1077
b_1078
b_1079
b_1080
b_1081
b_1082
b_1083
b_1084
b_1085
b_1086
b_1087
b_1088
b_1089
b_1090
b_1091
b_1092
b_1093
b_1094
b_1095
b_1096
b_1097
b_1098
b_1099
b_1100
b_1101
b_1102
b_1103
b_1104
b_1105
b_1106
b_1107
b_1108
b_1109
b_1