In [1]:
from OpenSSL import crypto
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import os
import time

In [2]:
def normalise_counter(ctr):
    """Convert raw counts into relative frequencies.

    Args:
        ctr: a ``Counter`` (or any mapping) of key -> count.

    Returns:
        A new ``Counter`` mapping each key to ``count / total``, where
        ``total`` is the sum of all counts. The input is not modified, so
        the caller's raw counts remain available after the call. An empty
        input yields an empty ``Counter``.

    Raises:
        ZeroDivisionError: if ``ctr`` has keys but its counts sum to zero.
    """
    total = sum(ctr.values())
    # Build a fresh Counter rather than dividing in place: the in-place
    # version silently turned the caller's counts into fractions.
    return Counter({key: count / total for key, count in ctr.items()})

In [3]:
# Phishing certificate datasets, loaded as-is.
phish_df = pd.read_csv('final_certificate_datasets/phish_dataset.csv')
phish_df2 = pd.read_csv('final_certificate_datasets/phish_harsh_subset.csv')
phish_df3 = pd.read_csv('final_certificate_datasets/phish_dataset_urls.csv')

# Benign certificate datasets, keeping only the first row for each CN value.
benign_df = pd.read_csv('final_certificate_datasets/benign_certs.csv').drop_duplicates(subset=['CN'])
benign_df2 = pd.read_csv('final_certificate_datasets/benign_unpopular.csv').drop_duplicates(subset=['CN'])
# Store verCode as strings in the unpopular-benign frame.
benign_df2['verCode'] = benign_df2['verCode'].astype(str)
#benign_df3 = benign_df[:500]

# Display label paired with its frame; the two parallel lists below are
# what the later cells iterate over.
named_dfs = [
    ('phish', phish_df),
    ('phish subset', phish_df2),
    ('phish all urls', phish_df3),
    ('benign', benign_df),
    ('benign unpopular', benign_df2),
]
all_dfs = [frame for _, frame in named_dfs]
labels = [name for name, _ in named_dfs]

In [4]:
def experiment(df,valid_only=True):
    """Tally the ``businessCategory`` column of ``df``.

    Args:
        df: DataFrame with ``businessCategory`` and ``verCode`` columns.
        valid_only: when true, count only rows whose ``verCode``, compared
            as a string, equals ``'0'``.

    Returns:
        A ``Counter`` of ``businessCategory`` values; it is also printed.
    """
    rows = df
    if valid_only:
        is_valid = df['verCode'].astype(str) == '0'
        rows = df.loc[is_valid]
    category_counts = Counter(rows['businessCategory'])
    print(category_counts)
    return category_counts

In [5]:
# Run the experiment on every dataset, keeping each resulting Counter and
# collecting every businessCategory key that appears in any of them.
all_keys = []
all_ctrs = []
for df in all_dfs:
    ctr = experiment(df)
    # The original line ended in a stray '.', which is a SyntaxError.
    all_keys.extend(ctr.keys())
    all_ctrs.append(ctr)

Counter({nan: 4511, 'Private Organization': 14})
Counter({nan: 1517, 'Private Organization': 2})
Counter({nan: 12583, 'Private Organization': 168})
Counter({nan: 25926, 'Private Organization': 880, 'Government Entity': 203, 'Non-Commercial Entity': 11, 'Business Entity': 2})
Counter({nan: 9268, 'Private Organization': 135, 'Government Entity': 13, 'Business Entity': 1})


In [6]:
# Reduce the collected keys to the distinct values and display them.
all_keys = set().union(all_keys)
all_keys

{'Business Entity',
 'Government Entity',
 'Non-Commercial Entity',
 'Private Organization',
 nan}

In [27]:
# One row per dataset, one column per businessCategory value.
# The missing-value key (NaN) is filtered out explicitly: set iteration order
# is arbitrary, so skipping "the first element" could drop a real category
# and keep NaN as a column. Sorting makes the column order reproducible.
categories = sorted(key for key in all_keys if pd.notna(key))

new_df = pd.DataFrame({'Dataset': labels})
for code in categories:
    # A Counter returns 0 for keys it has never seen.
    results = [ctr[code] for ctr in all_ctrs]
    print(results)
    new_df[code] = results

[14, 2, 168, 880, 135]
[0, 0, 0, 11, 0]
[0, 0, 0, 2, 1]
[0, 0, 0, 203, 13]


In [29]:
# Display the per-dataset businessCategory count table.
new_df

Unnamed: 0,Dataset,Private Organization,Non-Commercial Entity,Business Entity,Government Entity
0,phish,14,0,0,0
1,phish subset,2,0,0,0
2,phish all urls,168,0,0,0
3,benign,880,11,2,203
4,benign unpopular,135,0,1,13


In [17]:
# NOTE(review): this cell raised KeyError in the recorded run; it uses the
# dtype class pd.UInt16Dtype as a column label, and new_df has no such column.
# The intent is unclear; if the goal was to pick the numeric count columns,
# new_df.select_dtypes(include='number') may be what was meant. Confirm or
# delete this cell so the notebook survives Restart & Run All.
new_df[pd.UInt16Dtype]

KeyError: <NA>

In [14]:
# Print each dataset's label followed by its businessCategory tally.
for label, df in zip(labels, all_dfs):
    print()
    print(label)
    experiment(df)


phish
Counter({nan: 4511, 'Private Organization': 14})

phish subset
Counter({nan: 1517, 'Private Organization': 2})

phish all urls
Counter({nan: 12583, 'Private Organization': 168})

benign
Counter({nan: 25926, 'Private Organization': 880, 'Government Entity': 203, 'Non-Commercial Entity': 11, 'Business Entity': 2})

benign unpopular
Counter({nan: 9268, 'Private Organization': 135, 'Government Entity': 13, 'Business Entity': 1})


In [4]:
# Relative frequency of each businessCategory value over all rows of phish_df.
ctr = normalise_counter(Counter(phish_df['businessCategory']))
ctr.most_common(15)

[(nan, 0.9967735430530349), ('Private Organization', 0.003226456946965114)]

In [6]:
# Raw businessCategory counts over all rows of phish_df (no verCode filter).
Counter(phish_df['businessCategory'])

Counter({nan: 4943, 'Private Organization': 16})

In [7]:
# Raw serialNumber counts over all rows of phish_df; NaN is the missing-value key.
Counter(phish_df['serialNumber'])

Counter({nan: 4942,
         '800684408': 1,
         '26302266': 1,
         '6896854': 1,
         'B109476': 2,
         '200509725E': 1,
         '4627013': 1,
         'HRB 140616': 1,
         '3014267': 2,
         '692502000': 1,
         '00986729': 1,
         '03602670': 1,
         '1978093': 1,
         '5568191810': 1,
         'JF/MZQVo6bFSsChimsKNKVjiCKqptJ0s': 1,
         'R17247303': 1})

### benign:

In [10]:
# Raw businessCategory counts over all rows of benign_df (no verCode filter).
Counter(benign_df['businessCategory'])

Counter({nan: 39695,
         'Private Organization': 1049,
         'Non-Commercial Entity': 15,
         'Government Entity': 225,
         'Business Entity': 5})

In [33]:
# Relative frequency of each businessCategory value over all rows of benign_df.
ctr = normalise_counter(Counter(benign_df['businessCategory']))
ctr.most_common(15)

[(nan, 0.9684305545390227),
 ('Private Organization', 0.025592232062260606),
 ('Government Entity', 0.005489277611066384),
 ('Non-Commercial Entity', 0.0003659518407377589),
 ('Business Entity', 0.0001219839469125863)]

In [38]:
# How many times larger is the share of 'Private Organization' certificates
# in benign_df than in phish_df? (Both shares are small to begin with.)
# The shares are computed from the frames rather than pasted from earlier
# cell outputs, so the ratio cannot go stale when the data changes.
benign_private_share = (benign_df['businessCategory'] == 'Private Organization').mean()
phish_private_share = (phish_df['businessCategory'] == 'Private Organization').mean()
benign_private_share / phish_private_share

7.9319924247968965

In [39]:
# None of the phishing datasets contain a 'Government Entity' certificate in the counts above.

In [25]:
# Relative frequency of each serialNumber value over all rows of benign_df.
ctr = normalise_counter(Counter(benign_df['serialNumber']))
ctr.most_common(15)

[(nan, 0.9668935568079241),
 ('R17247303', 0.0023176949913391396),
 ('Government Entity', 0.0022932982019566225),
 ('C0806592', 0.0003659518407377589),
 ('2110234', 0.00021957110444265535),
 ('Government Entities', 0.00021957110444265535),
 ('05022287', 0.00021957110444265535),
 ('1945-12-27', 0.0001707775256776208),
 ('3014267', 0.0001219839469125863),
 ('03230061', 0.0001219839469125863),
 ('CG0073', 9.758715753006904e-05),
 ('2157877', 9.758715753006904e-05),
 ('2154254', 9.758715753006904e-05),
 ('HRB 30000', 9.758715753006904e-05),
 ('05747339', 9.758715753006904e-05)]

In [29]:
# Count of the most common serialNumber value minus the number of rows.
# NOTE(review): this equals minus the number of rows that have a serial number
# only when NaN is the most common value, as it is in the recorded output.
Counter(benign_df['serialNumber']).most_common(1)[0][1] - len(benign_df)

-1357

In [31]:
# Rows of benign_df with no serialNumber, then how many fewer rows that is
# than the whole frame.
serial_missing = benign_df['serialNumber'].isna()
no_serial = benign_df[serial_missing]
len(no_serial) - len(benign_df)

-1357

In [32]:
# businessCategory counts among the rows that have no serialNumber.
Counter(no_serial['businessCategory'])

Counter({nan: 39632})

In [40]:
#top benign
codes = benign_df['businessCategory']
b200 = codes[:200]
b5000 = codes[100:5000]

In [41]:
# Relative frequency of each businessCategory value within b200.
ctr = normalise_counter(Counter(b200))
ctr.most_common(15)

[(nan, 0.965), ('Private Organization', 0.035)]

In [42]:
# Relative frequency of each businessCategory value within b5000.
ctr = normalise_counter(Counter(b5000))
ctr

Counter({nan: 0.9579591836734694,
         'Private Organization': 0.03,
         'Non-Commercial Entity': 0.0014285714285714286,
         'Government Entity': 0.010612244897959184})