In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

In [2]:
dataset_list = ['16-04','18-04','19-04','20-04','21-04','22-04','23-04','24-04','30-04','01-05','03-05','04-05','05-05','07-05']

In [3]:
datasets = [dict() for i in dataset_list]

In [5]:
for i,e in enumerate(dataset_list):
    datasets[i]['path'] = "url_datasets/phishtank-"+e+".csv"

    df = pd.read_csv(datasets[i]['path'])
    urls = df['url']
    new_urls = [u.split("/")[2] for u in urls]
    distinct_urls = list(set(new_urls))
    datasets[i]['distinct_urls'] = distinct_urls
    print(len(distinct_urls))

5872
5762
5765
5636
5573
5721
5506


## show how many unique URLs added and removed between each day

In [12]:
for i,e in enumerate(dataset_list[:-1]):
    new_novel = np.ones(len(datasets[i+1]['distinct_urls']),dtype=bool)
    old_novel = np.ones(len(datasets[i]['distinct_urls']),dtype=bool)

    for url in datasets[i]['distinct_urls']:
        for j,z in enumerate(datasets[i+1]['distinct_urls']):
            if url == z:
                new_novel[j] = False

    for url in datasets[i+1]['distinct_urls']:
        for j,z in enumerate(datasets[i]['distinct_urls']):
            if url == z:
                old_novel[j] = False
    datasets[i]['old_novel'] = old_novel
    datasets[i+1]['new_novel'] = new_novel

### how many got taken down next day/dataset

In [15]:
for i,e in enumerate(dataset_list[:-1]):
    print("{} vs {}:".format(e,dataset_list[i+1]))
    print(Counter(datasets[i]['old_novel']))

16-04 vs 18-04:
Counter({False: 5299, True: 573})
18-04 vs 20-04:
Counter({False: 5274, True: 488})
20-04 vs 21-04:
Counter({False: 5420, True: 345})
21-04 vs 22-04:
Counter({False: 5311, True: 325})
22-04 vs 23-04:
Counter({False: 5257, True: 316})
23-04 vs 24-04:
Counter({False: 5458, True: 263})


### how many new ones got verified

In [18]:
for i,e in enumerate(dataset_list[:-1]):
    print("{} vs {}:".format(dataset_list[i+1],e))
    print(Counter(datasets[i+1]['new_novel']))

18-04 vs 16-04:
Counter({False: 5299, True: 463})
20-04 vs 18-04:
Counter({False: 5274, True: 491})
21-04 vs 20-04:
Counter({False: 5420, True: 216})
22-04 vs 21-04:
Counter({False: 5311, True: 262})
23-04 vs 22-04:
Counter({False: 5257, True: 464})
24-04 vs 23-04:
Counter({False: 5458, True: 48})


# how many certificates got fetched

In [34]:
for i,e in enumerate(datasets):

    is_empty = np.ones(len(e['distinct_urls']),dtype=bool)
    for j,url in enumerate(e['distinct_urls']):
        path = 'phish_data-'+dataset_list[i]+'/'
        try:
            f = open(path+url)    
            if f.readline() != '':
                is_empty[j] = False
        except:
            pass
    datasets[i]['is_empty'] = is_empty

In [46]:
for i,e in enumerate(datasets):
    print(dataset_list[i])
    print(Counter(e['is_empty']))

16-04
Counter({False: 4476, True: 1396})
18-04
Counter({False: 4345, True: 1417})
20-04
Counter({False: 4946, True: 819})
21-04
Counter({False: 4707, True: 929})
22-04
Counter({False: 3629, True: 1944})
23-04
Counter({True: 3884, False: 1837})
24-04
Counter({False: 4684, True: 822})


## failure rate for fresh websites

In [39]:
for i,e in enumerate(datasets[1:]):
    print(dataset_list[i])
    print(Counter(e['new_novel']))
    print(Counter(np.logical_and(e['is_empty'],e['new_novel'])))

16-04
Counter({False: 5299, True: 463})
Counter({False: 5455, True: 307})
18-04
Counter({False: 5274, True: 491})
Counter({False: 5615, True: 150})
20-04
Counter({False: 5420, True: 216})
Counter({False: 5571, True: 65})
21-04
Counter({False: 5311, True: 262})
Counter({False: 5457, True: 116})
22-04
Counter({False: 5257, True: 464})
Counter({False: 5357, True: 364})
23-04
Counter({False: 5458, True: 48})
Counter({False: 5485, True: 21})


## compare with next day

In [24]:
df2 = pd.read_csv("url_datasets/phishtank-18-04.csv")
urls2 = df2['url']
new_urls2 = [u.split("/")[2] for u in urls2]
distinct_urls2 = list(set(new_urls2))

In [26]:
new_novel = np.ones(len(distinct_urls2),dtype=bool)
old_novel = np.ones(len(distinct_urls),dtype=bool)


for url in new_urls:
    for i,e in enumerate(distinct_urls2):
        if url == e:
            new_novel[i] = False
            
for url in new_urls2:
    for i,e in enumerate(distinct_urls):
        if url == e:
            old_novel[i] = False

In [27]:
Counter(new_novel)

Counter({False: 5522, True: 240})

In [28]:
Counter(old_novel)

Counter({False: 5522, True: 322})