In [19]:
from OpenSSL import crypto
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import os
import time

In [20]:
def normalise_counter(ctr):
    """Return a copy of ``ctr`` with every count divided by the total count.

    Parameters
    ----------
    ctr : collections.Counter
        Mapping from key to numeric count. Any dict-like object offering
        ``values()``, ``items()`` and ``copy()`` works as well.

    Returns
    -------
    A new mapping of the same type and key order whose values sum to 1
    (up to floating-point rounding). ``ctr`` itself is not modified, so the
    raw counts stay available and calling this twice on the same input is
    harmless.

    Raises
    ------
    ZeroDivisionError
        If ``ctr`` has entries but its plain-Python counts sum to zero.
    """
    total = sum(ctr.values())
    normalised = ctr.copy()
    # Walk the original's items and write back with the very same key
    # objects: a NaN key can only be found again by identity, not equality.
    for key, count in ctr.items():
        normalised[key] = count / total
    return normalised

In [21]:
# Load the certificate datasets. The two benign sets keep only the first row
# seen for each common name (CN).
phish_df = pd.read_csv('final_certificate_datasets/phish_dataset.csv')
benign_df = (
    pd.read_csv('final_certificate_datasets/benign_certs.csv')
    .drop_duplicates(subset=['CN'])
)
phish_df2 = pd.read_csv('final_certificate_datasets/phish_harsh_subset.csv')
phish_df3 = pd.read_csv('final_certificate_datasets/phish_dataset_urls.csv')
benign_df2 = (
    pd.read_csv('final_certificate_datasets/benign_unpopular.csv')
    .drop_duplicates(subset=['CN'])
    # Store the verification code as text in this frame.
    .assign(verCode=lambda frame: frame['verCode'].astype(str))
)

# Parallel lists: labels[i] is the display name of all_dfs[i].
all_dfs = [phish_df, phish_df2, phish_df3, benign_df, benign_df2]
labels = ['phish', 'phish subset', 'phish all urls', 'benign', 'benign unpopular']

In [22]:
def experiment(df, valid_only=True):
    """Count how often each value of the ``'O'`` column occurs in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``'O'`` column and, when ``valid_only`` is true, a
        ``'verCode'`` column.
    valid_only : bool, default True
        When true, only rows whose ``verCode`` reads ``'0'`` once converted
        to a string are counted.

    Returns
    -------
    collections.Counter
        Occurrences of every ``'O'`` value; missing values are counted too.
    """
    rows = df
    if valid_only:
        # Compare as text so integer and string codes are treated alike.
        code_is_zero = df['verCode'].astype(str) == '0'
        rows = df.loc[code_is_zero]
    return Counter(rows['O'])
    

In [23]:
# Build one Counter of 'O' values per dataset, in the same order as labels.
ctrs = []
for label, df in zip(labels, all_dfs):
    print()
    print(label)
    ctrs.append(experiment(df))


phish

phish subset

phish all urls

benign

benign unpopular


In [24]:
# Ten most frequent 'O' values for dataset 0 ('phish').
z = 0
label, counts = labels[z], ctrs[z]
print(label)
counts.most_common(10)

phish


[(nan, 4342),
 ('Cloudflare, Inc.', 25),
 ('Google LLC', 16),
 ('Microsoft Corporation', 14),
 ('Amazon.com, Inc.', 8),
 ('Fastly, Inc.', 4),
 ('International Business Machines Corporation', 4),
 ('Yandex LLC', 3),
 ('Bitly, Inc.', 3),
 ('Mailjet SAS', 2)]

In [25]:
# Ten most frequent 'O' values for dataset 1 ('phish subset').
z = 1
label, counts = labels[z], ctrs[z]
print(label)
counts.most_common(10)

phish subset


[(nan, 1507),
 ('Cloudflare, Inc.', 2),
 ('Beijing Sogou Technology Development Co., Ltd.', 1),
 ('Preduzece za telekomunikacije Telekom Srbija a.d., Beograd', 1),
 ('PayPal Pte Ltd', 1),
 ('Register S.p.A.', 1),
 ('Perum LPPNPI', 1),
 ('Facebook, Inc.', 1),
 ('I. Municipalidad de Punta Arenas', 1),
 ('DATLAS S.R.L.', 1)]

In [26]:
# Ten most frequent 'O' values for dataset 2 ('phish all urls').
z = 2
label, counts = labels[z], ctrs[z]
print(label)
counts.most_common(10)

phish all urls


[(nan, 8517),
 ('Google LLC', 2158),
 ('Cloudflare, Inc.', 971),
 ('Microsoft Corporation', 350),
 ('Bitly, Inc.', 157),
 ('Mailjet SAS', 117),
 ('Fastly, Inc.', 104),
 ('Amazon.com, Inc.', 36),
 ('Aruba S.p.A.', 35),
 ('Incapsula Inc', 22)]

In [27]:
# Ten most frequent 'O' values for dataset 3 ('benign').
z = 3
label, counts = labels[z], ctrs[z]
print(label)
counts.most_common(10)

benign


[(nan, 18162),
 ('Cloudflare, Inc.', 862),
 ('Google LLC', 245),
 ('Fastly, Inc.', 41),
 ('Microsoft Corporation', 29),
 ('CBS Interactive Inc.', 20),
 ('Oracle Corporation', 19),
 ('Informationstechnikzentrum Bund (ITZBund)', 19),
 ('Apple Inc.', 18),
 ('Oath Inc', 18)]

In [28]:
# Ten most frequent 'O' values for dataset 4 ('benign unpopular').
z = 4
label, counts = labels[z], ctrs[z]
print(label)
counts.most_common(10)

benign unpopular


[(nan, 8454),
 ('Cloudflare, Inc.', 134),
 ('Fasthosts Internet Ltd.', 6),
 ('Planeta Innovacion SA', 4),
 ('Oath Inc', 4),
 ('Fastly, Inc.', 3),
 ('Acquia Inc', 3),
 ('Google LLC', 2),
 ('Solera Holdings, INC.', 2),
 ('GitHub, Inc.', 2)]

In [29]:
# Turn every per-dataset counter into relative frequencies.
new_ctrs = list(map(normalise_counter, ctrs))

In [17]:
# For each dataset, print the share of rows whose 'O' value is present,
# i.e. one minus the share held by the missing-value (NaN) key.
# The NaN share is looked up explicitly instead of assuming NaN is the most
# common key, so the figure stays right when some organisation outnumbers
# the missing values or when no value is missing at all.
# NOTE(review): presumably this is the intended statistic - confirm.
for c in new_ctrs:
    # NaN is the only key that is not equal to itself.
    missing_share = sum(share for org, share in c.items() if org != org)
    print(1 - missing_share)

0.040441988950276286
0.007899934167215239
0.3320523880479962
0.32788098586337056
0.10226186683657212


In [18]:
# Show the dataset names in the order used by all_dfs.
print(labels)

['phish', 'phish subset', 'phish all urls', 'benign', 'benign unpopular']


In [30]:
# Recount every dataset, then flatten the counters' keys into one list.
# A key that occurs in several datasets appears once per dataset.
all_ctrs = [experiment(df) for df in all_dfs]
all_keys = [key for ctr in all_ctrs for key in ctr]

In [31]:
# Display every collected 'O' value (the full list, repeats included).
all_keys

[nan,
 'Google LLC',
 'Mailjet SAS',
 'Physician Billing Associates, Inc.',
 'Microsoft Corporation',
 'ServiceNow, Inc.',
 'Fastly, Inc',
 'Squarespace, Inc.',
 'Loopia Aktiebolag',
 'Amazon.com, Inc.',
 'DigitalOcean, LLC',
 'Aruba S.p.A.',
 'Yandex LLC',
 'Fastly, Inc.',
 'Register.it S.p.A.',
 'Facebook, Inc.',
 'Netlify, Inc',
 'International Business Machines Corporation',
 'TimeWeb Company Limited',
 'FLYWHEEL',
 'ZixCorp Systems, Inc.',
 'GitHub, Inc.',
 'New Jersey Education Association',
 'iXL Hosting B.V.',
 'Bitly, Inc.',
 'MAIN DEPARTMENT OF GOVERNMENT SERVICES IN MYKOLAIV OBLAST',
 'Beijing Sogou Technology Development Co., Ltd.',
 'Preduzece za telekomunikacije Telekom Srbija a.d., Beograd',
 'NINETY DEGREE STONE PVT. LTD.',
 'SurveyLegend AB',
 'Evernote Corporation',
 'Fundacao INATEL',
 'Trend Micro Inc.',
 'Twitter, Inc.',
 'Oracle Corporation',
 'Japan Post Bank Co., Ltd.',
 'STRIKINGLY, INC',
 'Criteo SA',
 'TOTAL DIRECT ENERGIE  SA',
 'Dropbox, Inc',
 'Tokio Marin

In [32]:
# Relative frequency of each 'O' value for dataset 1 ('phish subset').
# Read from new_ctrs, the normalised counters, rather than ctrs, so the
# shares shown do not depend on ctrs having been modified in place.
z = 1
print(labels[z])
new_ctrs[z]

phish subset


Counter({nan: 0.9921000658327848,
         'Beijing Sogou Technology Development Co., Ltd.': 0.0006583278472679394,
         'Preduzece za telekomunikacije Telekom Srbija a.d., Beograd': 0.0006583278472679394,
         'PayPal Pte Ltd': 0.0006583278472679394,
         'Register S.p.A.': 0.0006583278472679394,
         'Perum LPPNPI': 0.0006583278472679394,
         'Facebook, Inc.': 0.0006583278472679394,
         'I. Municipalidad de Punta Arenas': 0.0006583278472679394,
         'Cloudflare, Inc.': 0.0013166556945358788,
         'DATLAS S.R.L.': 0.0006583278472679394,
         'PT. United Dico Citas': 0.0006583278472679394,
         'EFT Group S.A.': 0.0006583278472679394})

In [34]:
# Rows of the phish subset whose 'O' value is present (any verCode).
has_org = phish_df2['O'].notna()
phish_df2.loc[has_org]

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,CN,O,C,businessCategory,serialNumber,juristiction,issuer_CN,issuer_O,expired,notBefore,notAfter,verCode,time_collected
62,259,558,Xochicalco,SomeOrganization,--,,,,Xochicalco,SomeOrganization,1.0,b'20181114193524Z',b'20191114193524Z',21,1620480000.0
67,292,646,*.hosting-services.net.au,VentraIP Group (Australia) Pty Ltd,AU,,,,"Trustwave Organization Validation SHA256 CA, L...","Trustwave Holdings, Inc.",1.0,b'20150318214808Z',b'20171028034808Z',10,1620480000.0
115,456,1123,nginx,nginx,RU,,,,nginx,nginx,1.0,b'20190311155857Z',b'20200310155857Z',10,1620480000.0
126,509,1289,*.sogou.com,"Beijing Sogou Technology Development Co., Ltd.",CN,,,,GlobalSign RSA OV SSL CA 2018,GlobalSign nv-sa,0.0,b'20200428033050Z',b'20210722052741Z',0,1620480000.0
127,522,1326,*.oblaci.rs,Preduzece za telekomunikacije Telekom Srbija a...,RS,,,,GlobalSign RSA OV SSL CA 2018,GlobalSign nv-sa,0.0,b'20200109124109Z',b'20220217160641Z',0,1620480000.0
136,547,1393,banned.ssl.vhost,AfterMarket.pl,PL,,,,banned.ssl.vhost,AfterMarket.pl,0.0,b'20200313171119Z',b'20300311171119Z',18,1620480000.0
138,555,1407,ns100968.ip-147-135-15.us,OVH,--,,,,ns100968.ip-147-135-15.us,OVH,0.0,b'20180424022800Z',b'20450908022800Z',18,1620480000.0
174,696,1824,com-linweb226,Webhosting,BE,,,,com-linweb226,Webhosting,0.0,b'20200110074621Z',b'20220320074621Z',18,1620480000.0
181,717,1895,celeo.net,Celeonet SAS,FR,,,,celeo.net,Celeonet SAS,0.0,b'20170503104616Z',b'21170409104616Z',18,1620480000.0
211,792,2125,78.108.89.240,XX,XX,,,,78.108.89.240,XX,0.0,b'20100423153113Z',b'20231231153113Z',18,1620480000.0


In [39]:
# Benign rows whose 'O' value is the literal 'SomeOrganization'.
df1 = benign_df
is_placeholder_org = df1['O'] == 'SomeOrganization'
df1.loc[is_placeholder_org]

Unnamed: 0.1,Unnamed: 0,CN,O,C,businessCategory,serialNumber,juristiction,issuer_CN,issuer_O,expired,notBefore,notAfter,verCode,time_collected
6686,6686,ip-172-31-63-226,SomeOrganization,--,,,,ip-172-31-63-226,SomeOrganization,True,b'20170519191617Z',b'20180519191617Z',10,1620129000.0
8155,8155,inquilab.com,SomeOrganization,--,,,,inquilab.com,SomeOrganization,False,b'20210218075734Z',b'20220218075734Z',21,1620130000.0
8296,8296,hst-new.choopa.com,SomeOrganization,--,,,,hst-new.choopa.com,SomeOrganization,True,b'20100515071627Z',b'20110515071627Z',10,1620139000.0
8740,8740,baptistaoktatas.hosting.adatpark.hu,SomeOrganization,--,,,,baptistaoktatas.hosting.adatpark.hu,SomeOrganization,True,b'20180405084912Z',b'20190405084912Z',21,1620382000.0
11958,11958,ip-172-29-139-178.aws1.disney,SomeOrganization,--,,,,ip-172-29-139-178.aws1.disney,SomeOrganization,True,b'20151130002958Z',b'20161129002958Z',21,1620128000.0
17109,17109,ip-172-30-0-91,SomeOrganization,--,,,,ip-172-30-0-91,SomeOrganization,True,b'20190103154656Z',b'20200103154656Z',10,1620129000.0
17156,17156,centos6.localdomain,SomeOrganization,--,,,,centos6.localdomain,SomeOrganization,True,b'20140604071848Z',b'20150604071848Z',10,1620141000.0
20365,20365,cl1.itnewsinfo.com,SomeOrganization,--,,,,cl1.itnewsinfo.com,SomeOrganization,True,b'20180504143918Z',b'20190504143918Z',21,1620146000.0
21824,21824,db.rmo.org,SomeOrganization,--,,,,db.rmo.org,SomeOrganization,True,b'20171003194654Z',b'20181003194654Z',21,1620133000.0
23296,23296,ip-172-31-1-148,SomeOrganization,--,,,,ip-172-31-1-148,SomeOrganization,True,b'20170917061032Z',b'20180917061032Z',10,1620136000.0
