In [1]:
from OpenSSL import crypto
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import statistics as stat
import os

In [2]:
def get_existing_certs(distinct_urls,folder):
    """Return the URLs that have a non-empty dump file inside ``folder``.

    Parameters
    ----------
    distinct_urls : iterable
        Candidate file names (one per URL); each is looked up as
        ``folder + '/' + url``.
    folder : str
        Directory that holds one dump file per URL.

    Returns
    -------
    list
        The entries of ``distinct_urls``, in input order, whose file could be
        opened and whose first line is not empty.

    Notes
    -----
    This is a best-effort filter: entries that are not strings, files that are
    missing or unreadable, and files that cannot be decoded as text are
    skipped silently rather than raising.
    """
    successes = []
    data_path = folder+'/'
    for url in distinct_urls:
        # Non-string entries (e.g. NaN from a CSV with blank cells) cannot name a file.
        if not isinstance(url, str):
            continue
        try:
            # The context manager closes the handle even when reading fails.
            with open(data_path+url) as f:
                if f.readline() != '':
                    successes.append(url)
        except (OSError, ValueError):
            # OSError: missing/unreadable file or a directory.
            # ValueError: undecodable bytes (UnicodeDecodeError) or an invalid path.
            continue
    return successes

In [3]:
# Load the URL dataset and pull out its 'Domain' column as the list of URLs to process.
path = "url_datasets/benign_unpopular20000.csv"
df = pd.read_csv(path)
urls = df['Domain']

In [4]:
# Mean of the dataset's 'Index' column (presumably a popularity rank — TODO confirm).
stat.mean(df['Index'])

601545.2731

In [5]:
# Keep only the URLs that have a non-empty dump file under data_path,
# then show how many survived.
data_path = 'benign_data'
successes = get_existing_certs(urls,data_path)
len(successes)

14047

In [6]:
# Parse one certificate per dump file listed in `successes`.
#
# Produces:
#   certs              - parsed X509 objects, in `successes` order
#   creation_times     - os.path.getctime() of each parsed file (parallel to certs)
#   found_url_indices  - boolean mask over `successes`: True where a PEM block was found
#   missing_cert_urls  - URLs whose dump contains no PEM certificate block
missing_cert_urls = []
found_url_indices = np.zeros(len(successes),dtype=bool)
certs = []
creation_times = []
for i,url in enumerate(successes):
    # Same directory the later verification cell reads from.
    cert_file = data_path+'/'+url

    # Read the dump once; the context manager closes the handle.
    with open(cert_file) as f:
        dump = f.read()

    # Every line is checked, including the first one.
    if "-----BEGIN CERTIFICATE-----" in dump:
        found_url_indices[i] = True
        certs.append(crypto.load_certificate(crypto.FILETYPE_PEM, dump))
        # NOTE(review): on Unix getctime() is the last metadata-change time, not the
        # creation time — confirm it is an acceptable proxy for "time collected".
        creation_times.append(os.path.getctime(cert_file))
    else:
        # Record the URL itself (not a previously parsed certificate object).
        missing_cert_urls.append(url)
len(missing_cert_urls)

583

In [7]:
# Start an empty frame; its columns are filled in by the next cell.
final_df = pd.DataFrame()

# Subject and issuer name objects, one per parsed certificate (same order as `certs`).
subjects = [cert.get_subject() for cert in certs]
issuers = [cert.get_issuer() for cert in certs]

# NOTE(review): has_expired() presumably compares against the clock at run time,
# so this column depends on when the notebook is executed — confirm.
expiries = [cert.has_expired() for cert in certs]

In [8]:
# Subject / issuer attributes and validity window, one row per parsed certificate.
final_df['CN'] = [s.CN for s in subjects]
final_df['O'] = [s.O for s in subjects]
final_df['C'] = [s.C for s in subjects]
final_df['businessCategory'] = [s.businessCategory for s in subjects]
final_df['serialNumber'] = [s.serialNumber for s in subjects]
# NOTE(review): the column key 'juristiction' is misspelled; it is kept unchanged
# because it becomes a header of the exported CSV — rename only with its consumers.
final_df['juristiction'] = [s.jurisdictionLocalityName for s in subjects]
final_df['issuer_CN'] = [i.CN for i in issuers]
final_df['issuer_O'] = [i.O for i in issuers]
final_df['expired'] = expiries
# NOTE(review): these values are displayed as bytes (b'...Z') in the frame.
final_df['notBefore'] = [c.get_notBefore() for c in certs]
final_df['notAfter'] = [c.get_notAfter() for c in certs]

# Collect the first "Verify return code:" line of every dump that yielded a
# certificate, so `verifications` stays aligned with the rows of final_df.
verify_marker = "Verify return code:"
verifications = []
for i,url in enumerate(successes):
    if not found_url_indices[i]:
        continue
    first_hit = None
    # The context manager closes the handle even if reading fails.
    with open(data_path+'/'+url) as f:
        for line in f:
            if verify_marker in line:
                # Only the first occurrence is recorded; later ones are ignored.
                first_hit = line
                break
    if first_hit is None:
        # Placeholder for dumps without a verification line; parses to 'None' below.
        first_hit = 'Verify return code: None (none)'
    verifications.append(first_hit)

# Take the token right after the marker (e.g. '0' from "Verify return code: 0 (ok)"),
# independent of how far the line is indented.
new_verifications = [v.split(verify_marker, 1)[1].split()[0] for v in verifications]
final_df['verCode'] = new_verifications
final_df['time_collected'] = creation_times

In [9]:
final_df

Unnamed: 0,CN,O,C,businessCategory,serialNumber,juristiction,issuer_CN,issuer_O,expired,notBefore,notAfter,verCode,time_collected
0,hauntedrooms.com,,,,,,R3,Let's Encrypt,False,b'20210401184030Z',b'20210630184030Z',0,1.620576e+09
1,bigsound.org.au,,,,,,R3,Let's Encrypt,False,b'20210428164644Z',b'20210727164644Z',0,1.620576e+09
2,*.realytics.io,,,,,,GeoTrust RSA CA 2018,DigiCert Inc,False,b'20200408000000Z',b'20220531120000Z',0,1.620576e+09
3,*.munich-business-school.de,,,,,,Sectigo RSA Domain Validation Secure Server CA,Sectigo Limited,False,b'20200702000000Z',b'20220702235959Z',0,1.620576e+09
4,*.gridserver.com,,,,,,Go Daddy Secure Certificate Authority - G2,"GoDaddy.com, Inc.",False,b'20210107184337Z',b'20220208184337Z',0,1.620576e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13459,herzkind.de,,,,,,R3,Let's Encrypt,False,b'20210323090139Z',b'20210621090139Z',0,1.620586e+09
13460,nahq.org,,,,,,R3,Let's Encrypt,False,b'20210313152258Z',b'20210611152258Z',0,1.620586e+09
13461,runyixs.com,,,,,,TrustAsia TLS RSA CA,"TrustAsia Technologies, Inc.",False,b'20201010000000Z',b'20211010120000Z',0,1.620586e+09
13462,*.food.de,,,,,,Starfield Secure Certificate Authority - G2,"Starfield Technologies, Inc.",False,b'20200327131959Z',b'20220426131959Z',0,1.620586e+09


In [10]:
# Keep a single row per subject CN; the surviving rows keep their original index labels.
final_df = final_df.drop_duplicates(subset=['CN'])
final_df

Unnamed: 0,CN,O,C,businessCategory,serialNumber,juristiction,issuer_CN,issuer_O,expired,notBefore,notAfter,verCode,time_collected
0,hauntedrooms.com,,,,,,R3,Let's Encrypt,False,b'20210401184030Z',b'20210630184030Z',0,1.620576e+09
1,bigsound.org.au,,,,,,R3,Let's Encrypt,False,b'20210428164644Z',b'20210727164644Z',0,1.620576e+09
2,*.realytics.io,,,,,,GeoTrust RSA CA 2018,DigiCert Inc,False,b'20200408000000Z',b'20220531120000Z',0,1.620576e+09
3,*.munich-business-school.de,,,,,,Sectigo RSA Domain Validation Secure Server CA,Sectigo Limited,False,b'20200702000000Z',b'20220702235959Z',0,1.620576e+09
4,*.gridserver.com,,,,,,Go Daddy Secure Certificate Authority - G2,"GoDaddy.com, Inc.",False,b'20210107184337Z',b'20220208184337Z',0,1.620576e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13459,herzkind.de,,,,,,R3,Let's Encrypt,False,b'20210323090139Z',b'20210621090139Z',0,1.620586e+09
13460,nahq.org,,,,,,R3,Let's Encrypt,False,b'20210313152258Z',b'20210611152258Z',0,1.620586e+09
13461,runyixs.com,,,,,,TrustAsia TLS RSA CA,"TrustAsia Technologies, Inc.",False,b'20201010000000Z',b'20211010120000Z',0,1.620586e+09
13462,*.food.de,,,,,,Starfield Secure Certificate Authority - G2,"Starfield Technologies, Inc.",False,b'20200327131959Z',b'20220426131959Z',0,1.620586e+09


In [20]:
# Persist the de-duplicated certificate table; the index is written as the first,
# unnamed CSV column.
final_df.to_csv('final_certificate_datasets/benign_unpopular.csv')

In [12]:
# Display the verification codes cast to str (the result is not assigned back).
final_df['verCode'].astype(str)

0        0
1        0
2        0
3        0
4        0
        ..
13459    0
13460    0
13461    0
13462    0
13463    0
Name: verCode, Length: 10086, dtype: object