In [4]:
from OpenSSL import crypto
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import os
import time

In [5]:
def normalise_counter(ctr):
    total = sum(ctr.values())
    for key in ctr.keys():
        ctr[key] /=total
    return ctr

In [6]:
def get_existing_certs(distinct_urls,folder):
    successes = []
    data_path = folder+'/'
    for url in distinct_urls:
        try:
            f = open(data_path+url)    
            if f.readline() != '':
                successes.append(url)
        except:
            pass
    return successes

In [7]:
phish_df = pd.read_csv('final_certificate_datasets/phish_dataset.csv')
benign_df = pd.read_csv('final_certificate_datasets/benign_certs.csv')

phish_total = len(phish_df)
benign_total = len(benign_df)

In [8]:
base_path = "url_datasets/phishtank-"
days = ['16-04','18-04','19-04','20-04','21-04','22-04','23-04','24-04','30-04','01-05','03-05','04-05','05-05','07-05','08-05']
data_path = 'phish_data-'

In [11]:
def get_domain_mapping(raw_urls):
    new_urls = [u.split("/")[2] for u in raw_urls]
    return new_urls
def get_existing_raw_url_certs(raw_urls,successes,targets):
    new_urls = get_domain_mapping(raw_urls)

    url_dict = dict()
    for i,u in enumerate(raw_urls):
            if new_urls[i] in successes:
                if new_urls[i] in url_dict.keys():
                    url_dict[new_urls[i]].append([u,targets[i]])
                else:
                    url_dict[new_urls[i]] = [[u,targets[i]]]
                
    return url_dict

In [12]:
cert_lists = []
missing_lists = []
vercode_lists = []
time_lists = []
url_lists = []
for day in days:
    print(day)
    
    df = pd.read_csv(base_path+day+'.csv')
    urls = df['url']
    targets = df['target']
    new_urls = [u.split("/")[2] for u in urls]
    distinct_urls = list(set(new_urls))
    
    
    successes = get_existing_certs(distinct_urls,data_path+day)
    url_dict = get_existing_raw_url_certs(urls,successes,targets)

    missing_cert_urls = []
    found_url_indices = np.zeros(len(successes),dtype=bool)
    certs = []
    creation_times = []
    for i,url in enumerate(successes):
        cert_file = data_path+day+'/'+url
        f = open(cert_file)
        line = f.readline()
        found_cert = False
        while line != '':
            line = f.readline()
            if "-----BEGIN CERTIFICATE-----" in line:
                found_cert=True
                line = ''
        f.close()
        if found_cert:
            found_url_indices[i] = True
            cert = crypto.load_certificate(crypto.FILETYPE_PEM, open(cert_file).read())
            certs.append(cert)
            url_lists.append(url_dict[url])
            t = os.path.getctime(cert_file)
            creation_times.append(t)
        else:
            missing_cert_urls.append(url)
            
    verifications = []   
    for i,url in enumerate(successes):
        if found_url_indices[i]:
            f = open(data_path+day+'/'+url)
            line = f.readline()
            verified=False
            while line != '':
                if "Verify return code:" in line:
                    if verified:
                        #print(url)
                        pass
                    else:
                        verifications.append(line)
                        verified=True
                line = f.readline()
            if not verified:
                #print("FAILED: "+url)
                verifications.append('Verify return code: None (none)')
            f.close()  
            
    new_verifications = [v[5:].split(' ')[3] for v in verifications]
    print("certs found: {} | empty cert sessions: {}".format(len(certs),len(missing_cert_urls)))
    cert_lists.append(certs)
    time_lists.append(creation_times)
    missing_lists.append(missing_cert_urls)
    vercode_lists.append(new_verifications)

16-04
certs found: 3189 | empty cert sessions: 1287
18-04
certs found: 3901 | empty cert sessions: 444
19-04
certs found: 4304 | empty cert sessions: 475
20-04
certs found: 4466 | empty cert sessions: 480
21-04
certs found: 4323 | empty cert sessions: 384
22-04
certs found: 3323 | empty cert sessions: 306
23-04
certs found: 1684 | empty cert sessions: 153
24-04
certs found: 4295 | empty cert sessions: 389
30-04
certs found: 4428 | empty cert sessions: 214
01-05
certs found: 3345 | empty cert sessions: 155
03-05
certs found: 0 | empty cert sessions: 0
04-05
certs found: 4490 | empty cert sessions: 243
05-05
certs found: 4290 | empty cert sessions: 245
07-05
certs found: 5088 | empty cert sessions: 486
08-05
certs found: 386 | empty cert sessions: 44


In [13]:
full_list = []
for s in url_lists:
    full_list+=s
len(full_list)

93194

In [15]:
len(url_lists)

51512

In [19]:
dfs = []
i = 0
for n,day in enumerate(days):
    final_df = pd.DataFrame()
    certs = cert_lists[n]
    
    phish_urls = []
    new_certs = []
    new_vers = []
    new_times = []
    new_targets = []
    for z,cert in enumerate(certs):
        curr_urls = url_lists[i]
        for j,u in enumerate(curr_urls):
            new_certs.append(cert)
            new_vers.append(vercode_lists[n][z])
            new_times.append(time_lists[n][z])
            new_targets.append(u[1])
            phish_urls.append(u[0])
        i+=1
    certs = new_certs
    subjects = [cert.get_subject() for cert in certs]
    issuers = [cert.get_issuer() for cert in certs]

    expiries = [cert.has_expired() for cert in certs]
    final_df['url'] = phish_urls
    final_df['CN'] = [s.CN for s in subjects]
    final_df['O'] = [s.O for s in subjects]
    final_df['C'] = [s.C for s in subjects]
    final_df['businessCategory'] = [s.businessCategory for s in subjects]
    final_df['serialNumber'] = [s.serialNumber for s in subjects]
    final_df['juristiction'] = [s.jurisdictionLocalityName for s in subjects]
    final_df['issuer_CN'] = [i.CN for i in issuers]
    final_df['issuer_O'] = [i.O for i in issuers]
    final_df['expired'] = expiries
    final_df['notBefore'] = [c.get_notBefore() for c in certs]
    final_df['notAfter'] = [c.get_notAfter() for c in certs]
    dfs.append(final_df)

    final_df['verCode'] = new_vers
    final_df['time_collected'] = new_times
    final_df['target'] = new_targets

In [21]:
full_cert_list = []
for s in cert_lists:
    full_cert_list+=s
len(full_cert_list)

51512

In [20]:
final_df = pd.concat(dfs)
final_df

Unnamed: 0,url,CN,O,C,businessCategory,serialNumber,juristiction,issuer_CN,issuer_O,expired,notBefore,notAfter,verCode,time_collected,target
0,http://sillyabba.com/abut/Adobe1/Adobe1/login....,*.hostgator.com,,,,,,Sectigo RSA Domain Validation Secure Server CA,Sectigo Limited,0.0,b'20200903000000Z',b'20210903235959Z',0,1.620480e+09,Other
1,https://www.cybersolution.eu/,*.aruba.it,Aruba S.p.A.,IT,,,,Actalis Organization Validated Server CA G3,Actalis S.p.A.,0.0,b'20200723123537Z',b'20220723123537Z',0,1.620480e+09,Other
2,https://dopeydog.co.nz/wp-includes/index.html?...,Plesk,Odin,US,,,,Plesk,Odin,1.0,b'20160428040009Z',b'20170428040009Z',10,1.620480e+09,Other
3,http://unrecognisedrequestedpayee.com/lloyds/L...,raa.namecheap.com,,,,,,Sectigo RSA Domain Validation Secure Server CA,Sectigo Limited,0.0,b'20201111000000Z',b'20211129235959Z',0,1.620480e+09,Other
4,https://paypal.me.holdpaystatic.shop/connexion,*.web-hosting.com,,,,,,Sectigo RSA Domain Validation Secure Server CA,Sectigo Limited,0.0,b'20200507000000Z',b'20220405235959Z',0,1.620480e+09,PayPal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
568,https://f96ccf42fd231af595cc84c480a379bd-dot-g...,*.appspot.com,Google LLC,US,,,,GTS CA 1O1,Google Trust Services,0.0,b'20210413101017Z',b'20210706101016Z',0,1.620495e+09,Other
569,https://neuralfluency.com/control/outlook.offi...,neuralfluency.com,,,,,,"cPanel, Inc. Certification Authority","cPanel, Inc.",0.0,b'20210505000000Z',b'20210803235959Z',0,1.620495e+09,Other
570,https://neuralfluency.com/control/outlook.offi...,neuralfluency.com,,,,,,"cPanel, Inc. Certification Authority","cPanel, Inc.",0.0,b'20210505000000Z',b'20210803235959Z',0,1.620495e+09,Other
571,https://nwcadvocates.com/cssdocsign/shoppng/u....,nwcadvocates.com,,,,,,"cPanel, Inc. Certification Authority","cPanel, Inc.",0.0,b'20210330000000Z',b'20210628235959Z',0,1.620495e+09,Other


In [28]:
final_distinct_df = final_df.drop_duplicates(subset=['url'],inplace=False)
final_distinct_df

Unnamed: 0,url,CN,O,C,businessCategory,serialNumber,juristiction,issuer_CN,issuer_O,expired,notBefore,notAfter,verCode,time_collected,target
0,http://sillyabba.com/abut/Adobe1/Adobe1/login....,*.hostgator.com,,,,,,Sectigo RSA Domain Validation Secure Server CA,Sectigo Limited,0.0,b'20200903000000Z',b'20210903235959Z',0,1.620480e+09,Other
1,https://www.cybersolution.eu/,*.aruba.it,Aruba S.p.A.,IT,,,,Actalis Organization Validated Server CA G3,Actalis S.p.A.,0.0,b'20200723123537Z',b'20220723123537Z',0,1.620480e+09,Other
2,https://dopeydog.co.nz/wp-includes/index.html?...,Plesk,Odin,US,,,,Plesk,Odin,1.0,b'20160428040009Z',b'20170428040009Z',10,1.620480e+09,Other
3,http://unrecognisedrequestedpayee.com/lloyds/L...,raa.namecheap.com,,,,,,Sectigo RSA Domain Validation Secure Server CA,Sectigo Limited,0.0,b'20201111000000Z',b'20211129235959Z',0,1.620480e+09,Other
4,https://paypal.me.holdpaystatic.shop/connexion,*.web-hosting.com,,,,,,Sectigo RSA Domain Validation Secure Server CA,Sectigo Limited,0.0,b'20200507000000Z',b'20220405235959Z',0,1.620480e+09,PayPal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
561,http://u11073366bg.ha004.t.justns.ru/société_g...,*.justhost.ru,,,,,,Sectigo RSA Domain Validation Secure Server CA,Sectigo Limited,0.0,b'20201108000000Z',b'20211108235959Z',21,1.620495e+09,Other
564,http://vivianegibert.com/Adpadpsecurity/adp,vivianegibert.com,,,,,,R3,Let's Encrypt,0.0,b'20210508153722Z',b'20210806153722Z',0,1.620494e+09,Other
565,http://fbtimeline-1jhlo4ji.villalarnia.org/con...,villalarnia.org,,,,,,R3,Let's Encrypt,0.0,b'20210326064357Z',b'20210624064357Z',0,1.620495e+09,Other
566,https://fbtimeline-1jhlo4ji.villalarnia.org/co...,villalarnia.org,,,,,,R3,Let's Encrypt,0.0,b'20210326064357Z',b'20210624064357Z',0,1.620495e+09,Other


In [29]:
final_distinct_df.to_csv('final_certificate_datasets/phish_dataset_urls.csv')

In [27]:
Counter(final_distinct_df.drop_duplicates(subset=['CN'])['target']).most_common(15)

[('Other', 3133),
 ('eBay, Inc.', 47),
 ('Microsoft', 46),
 ('Internal Revenue Service', 46),
 ('PayPal', 44),
 ('Facebook', 29),
 ('Capitec Bank', 24),
 ('Amazon.com', 23),
 ('ABSA Bank', 22),
 ('Vodafone', 16),
 ('ING Direct', 14),
 ('Rakuten', 14),
 ('PKO Polish Bank', 13),
 ('HSBC Group', 11),
 ('RuneScape', 10)]