In [1]:
from OpenSSL import crypto
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import os
import time

In [2]:
def get_existing_certs(distinct_urls,folder):
    successes = []
    data_path = folder+'/'
    for url in distinct_urls:
        try:
            f = open(data_path+url)    
            if f.readline() != '':
                successes.append(url)
        except:
            pass
    return successes

In [3]:
base_path = "url_datasets/phishtank-"
days = ['16-04','18-04','19-04','20-04','21-04','22-04','23-04','24-04','30-04','01-05','03-05','04-05','05-05','07-05','08-05']
data_path = 'phish_data-'

In [4]:
cert_lists = []
missing_lists = []
vercode_lists = []
time_lists = []
for day in days:
    print(day)
    
    df = pd.read_csv(base_path+day+'.csv')
    urls = df['url']
    new_urls = [u.split("/")[2] for u in urls]
    distinct_urls = list(set(new_urls))
    
    
    successes = get_existing_certs(distinct_urls,data_path+day)
    missing_cert_urls = []
    found_url_indices = np.zeros(len(successes),dtype=bool)
    certs = []
    creation_times = []
    for i,url in enumerate(successes):
        cert_file = data_path+day+'/'+url
        f = open(cert_file)
        line = f.readline()
        found_cert = False
        while line != '':
            line = f.readline()
            if "-----BEGIN CERTIFICATE-----" in line:
                found_cert=True
                line = ''
        f.close()
        if found_cert:
            found_url_indices[i] = True
            cert = crypto.load_certificate(crypto.FILETYPE_PEM, open(cert_file).read())
            certs.append(cert)
            
            t = os.path.getctime(cert_file)
            creation_times.append(t)
        else:
            missing_cert_urls.append(url)
            
    verifications = []   
    for i,url in enumerate(successes):
        if found_url_indices[i]:
            f = open(data_path+day+'/'+url)
            line = f.readline()
            verified=False
            while line != '':
                if "Verify return code:" in line:
                    if verified:
                        #print(url)
                        pass
                    else:
                        verifications.append(line)
                        verified=True
                line = f.readline()
            if not verified:
                #print("FAILED: "+url)
                verifications.append('Verify return code: None (none)')
            f.close()  
            
    new_verifications = [v[5:].split(' ')[3] for v in verifications]
    print("certs found: {} | empty cert sessions: {}".format(len(certs),len(missing_cert_urls)))
    cert_lists.append(certs)
    time_lists.append(creation_times)
    missing_lists.append(missing_cert_urls)
    vercode_lists.append(new_verifications)

16-04
certs found: 3189 | empty cert sessions: 1287
18-04
certs found: 3901 | empty cert sessions: 444
19-04
certs found: 4304 | empty cert sessions: 475
20-04
certs found: 4466 | empty cert sessions: 480
21-04
certs found: 4323 | empty cert sessions: 384
22-04
certs found: 3323 | empty cert sessions: 306
23-04
certs found: 1684 | empty cert sessions: 153
24-04
certs found: 4295 | empty cert sessions: 389
30-04
certs found: 4428 | empty cert sessions: 214
01-05
certs found: 3345 | empty cert sessions: 155
03-05
certs found: 0 | empty cert sessions: 0
04-05
certs found: 4490 | empty cert sessions: 243
05-05
certs found: 4290 | empty cert sessions: 245
07-05
certs found: 5088 | empty cert sessions: 486
08-05
certs found: 386 | empty cert sessions: 44


In [5]:
dfs = []
for n,day in enumerate(days):
    final_df = pd.DataFrame()
    certs = cert_lists[n]
    subjects = [cert.get_subject() for cert in certs]
    issuers = [cert.get_issuer() for cert in certs]

    expiries = [cert.has_expired() for cert in certs]
    
    final_df['CN'] = [s.CN for s in subjects]
    final_df['O'] = [s.O for s in subjects]
    final_df['C'] = [s.C for s in subjects]
    final_df['businessCategory'] = [s.businessCategory for s in subjects]
    final_df['serialNumber'] = [s.serialNumber for s in subjects]
    final_df['juristiction'] = [s.jurisdictionLocalityName for s in subjects]
    final_df['issuer_CN'] = [i.CN for i in issuers]
    final_df['issuer_O'] = [i.O for i in issuers]
    final_df['expired'] = expiries
    final_df['notBefore'] = [c.get_notBefore() for c in certs]
    final_df['notAfter'] = [c.get_notAfter() for c in certs]
    dfs.append(final_df)

    final_df['verCode'] = vercode_lists[n]
    final_df['time_collected'] = time_lists[n]

In [6]:
time_lists[0]

[1620479589.395142,
 1620479589.7509096,
 1620479589.7394621,
 1620479589.26887,
 1620479588.9401681,
 1620479588.9312944,
 1620479589.691454,
 1620479588.74588,
 1620479589.4107633,
 1620479588.9740677,
 1620479588.8878686,
 1620479589.6416245,
 1620479589.2392857,
 1620479588.702799,
 1620479589.1261206,
 1620479588.8882115,
 1620479588.8144853,
 1620479588.6357346,
 1620479588.8323026,
 1620479588.6250563,
 1620479589.44076,
 1620479589.0859678,
 1620479589.5538735,
 1620479588.767139,
 1620479588.794571,
 1620479588.952266,
 1620479589.2543437,
 1620479588.8309817,
 1620479588.7344956,
 1620479589.5475037,
 1620479589.3087754,
 1620479588.8915408,
 1620479589.3692517,
 1620479589.263505,
 1620479589.1094682,
 1620479589.0768688,
 1620479588.7377079,
 1620479589.5697815,
 1620479589.7018657,
 1620479589.4735763,
 1620479589.272356,
 1620479589.4514954,
 1620479588.6415913,
 1620479588.64324,
 1620479588.9172647,
 1620479588.7371633,
 1620479589.1203206,
 1620479588.5894976,
 1620479

In [7]:
final_df = pd.concat(dfs)
final_df

Unnamed: 0,CN,O,C,businessCategory,serialNumber,juristiction,issuer_CN,issuer_O,expired,notBefore,notAfter,verCode,time_collected
0,www.abcnike.shop,,,,,,R3,Let's Encrypt,0.0,b'20210326015631Z',b'20210624015631Z',0,1.620480e+09
1,*.bangarvideo.xyz,,,,,,Let's Encrypt Authority X3,Let's Encrypt,1.0,b'20201018134251Z',b'20210116134251Z',10,1.620480e+09
2,*.google.com,Google LLC,US,,,,GTS CA 1O1,Google Trust Services,0.0,b'20210316192807Z',b'20210608192806Z',0,1.620480e+09
3,*.hostgator.com.br,,,,,,Sectigo RSA Domain Validation Secure Server CA,Sectigo Limited,0.0,b'20200908000000Z',b'20210908235959Z',0,1.620480e+09
4,*.mailjet.com,Mailjet SAS,FR,,,,DigiCert SHA2 Secure Server CA,DigiCert Inc,0.0,b'20200302000000Z',b'20210514120000Z',0,1.620480e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...
381,blog.secure.runescape.com-wsvs.top,,,,,,R3,Let's Encrypt,0.0,b'20210507092316Z',b'20210805092316Z',0,1.620494e+09
382,*.inmotionhosting.com,,,,,,Sectigo RSA Domain Validation Secure Server CA,Sectigo Limited,0.0,b'20191029000000Z',b'20211028235959Z',0,1.620495e+09
383,sni.cloudflaressl.com,"Cloudflare, Inc.",US,,,,Cloudflare Inc ECC CA-3,"Cloudflare, Inc.",0.0,b'20200816000000Z',b'20210816120000Z',0,1.620495e+09
384,*.mobileglobal.ae,,,,,,R3,Let's Encrypt,0.0,b'20210504194540Z',b'20210802194540Z',0,1.620495e+09


In [11]:
final_distinct_df = final_df.drop_duplicates(subset=['CN'],inplace=False)
final_distinct_df

Unnamed: 0,CN,O,C,businessCategory,serialNumber,juristiction,issuer_CN,issuer_O,expired,notBefore,notAfter,verCode,time_collected
0,www.abcnike.shop,,,,,,R3,Let's Encrypt,0.0,b'20210326015631Z',b'20210624015631Z',0,1.620480e+09
1,*.bangarvideo.xyz,,,,,,Let's Encrypt Authority X3,Let's Encrypt,1.0,b'20201018134251Z',b'20210116134251Z',10,1.620480e+09
2,*.google.com,Google LLC,US,,,,GTS CA 1O1,Google Trust Services,0.0,b'20210316192807Z',b'20210608192806Z',0,1.620480e+09
3,*.hostgator.com.br,,,,,,Sectigo RSA Domain Validation Secure Server CA,Sectigo Limited,0.0,b'20200908000000Z',b'20210908235959Z',0,1.620480e+09
4,*.mailjet.com,Mailjet SAS,FR,,,,DigiCert SHA2 Secure Server CA,DigiCert Inc,0.0,b'20200302000000Z',b'20210514120000Z',0,1.620480e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...
368,signaturetn.com,,,,,,"cPanel, Inc. Certification Authority","cPanel, Inc.",0.0,b'20210508000000Z',b'20210806235959Z',0,1.620495e+09
372,cpanel.catalyx.biz,,,,,,R3,Let's Encrypt,0.0,b'20210310192820Z',b'20210608192820Z',0,1.620495e+09
375,freeler.de,,,,,,Starfield Secure Certificate Authority - G2,"Starfield Technologies, Inc.",0.0,b'20200704160310Z',b'20220730080815Z',0,1.620495e+09
377,nwcadvocates.com,,,,,,"cPanel, Inc. Certification Authority","cPanel, Inc.",0.0,b'20210330000000Z',b'20210628235959Z',0,1.620495e+09


In [12]:
final_distinct_df.to_csv('final_certificate_datasets/phish_dataset.csv')

In [15]:
final_df

Unnamed: 0,CN,O,C,businessCategory,serialNumber,juristiction,issuer_CN,issuer_O,expired,notBefore,notAfter,verCode,time_collected
0,www.abcnike.shop,,,,,,R3,Let's Encrypt,0.0,b'20210326015631Z',b'20210624015631Z',0,1.620480e+09
1,*.bangarvideo.xyz,,,,,,Let's Encrypt Authority X3,Let's Encrypt,1.0,b'20201018134251Z',b'20210116134251Z',10,1.620480e+09
2,*.google.com,Google LLC,US,,,,GTS CA 1O1,Google Trust Services,0.0,b'20210316192807Z',b'20210608192806Z',0,1.620480e+09
3,*.hostgator.com.br,,,,,,Sectigo RSA Domain Validation Secure Server CA,Sectigo Limited,0.0,b'20200908000000Z',b'20210908235959Z',0,1.620480e+09
4,*.mailjet.com,Mailjet SAS,FR,,,,DigiCert SHA2 Secure Server CA,DigiCert Inc,0.0,b'20200302000000Z',b'20210514120000Z',0,1.620480e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...
381,blog.secure.runescape.com-wsvs.top,,,,,,R3,Let's Encrypt,0.0,b'20210507092316Z',b'20210805092316Z',0,1.620494e+09
382,*.inmotionhosting.com,,,,,,Sectigo RSA Domain Validation Secure Server CA,Sectigo Limited,0.0,b'20191029000000Z',b'20211028235959Z',0,1.620495e+09
383,sni.cloudflaressl.com,"Cloudflare, Inc.",US,,,,Cloudflare Inc ECC CA-3,"Cloudflare, Inc.",0.0,b'20200816000000Z',b'20210816120000Z',0,1.620495e+09
384,*.mobileglobal.ae,,,,,,R3,Let's Encrypt,0.0,b'20210504194540Z',b'20210802194540Z',0,1.620495e+09
