In [1]:
import pandas as pd
import numpy as np
from tls_paper_funcs import tls_12_appdata_filtering, remove_empty, nst_detection_and_removal_after_filtering

This notebook shows the processing that was used to perform certificate size inference and application behavior extraction on the DoH dataset.  
Unlike the other notebooks, it has not been converted to a parallel version and is therefore not very efficient.

In [2]:
# Load the DoH flow dataset from its CSV export (parsed with pandas' Python engine)
# and show it.
doh_csv_path = 'data/doh/dataset_patchedjoy2.csv'
doh_df = pd.read_csv(doh_csv_path, engine='python')
doh_df

Unnamed: 0,c_ip,c_port,s_ip,s_port,ip_proto,start_time,c_tls_version,s_tls_version,c_supported_versions,s_supported_versions,...,tls_tp_12,tls_tp_13,tls_tp_14,tls_tp_15,tls_tp_16,tls_tp_17,tls_tp_18,tls_tp_19,sni,file
0,192.168.20.191,57501,172.217.164.205,443,6,1.579027e+09,5,5,0a1a1a0304030303020301,304,...,23,23,23,23,23,23,23,-1,accounts.google.com,a_00000
1,192.168.20.191,57504,23.4.44.172,443,6,1.579027e+09,5,5,0ababa0304030303020301,304,...,23,-1,-1,-1,-1,-1,-1,-1,udn.com,a_00000
2,192.168.20.191,57507,172.217.1.3,443,6,1.579027e+09,5,5,0a1a1a0304030303020301,304,...,23,23,23,23,-1,-1,-1,-1,www.gstatic.com,a_00000
3,192.168.20.191,57509,172.217.1.10,443,6,1.579027e+09,5,5,0a2a2a0304030303020301,304,...,23,23,23,23,23,23,-1,-1,www.googleapis.com,a_00000
4,192.168.20.191,57510,72.247.168.197,443,6,1.579027e+09,5,5,0a3a3a0304030303020301,304,...,23,-1,-1,-1,-1,-1,-1,-1,s.udn.com.tw,a_00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
824633,192.168.20.111,34654,52.33.55.70,443,6,1.578335e+09,5,5,80304030303020301,-1,...,21,21,-1,-1,-1,-1,-1,-1,shavar.services.mozilla.com,k_00001
824634,192.168.20.111,36168,8.8.8.8,443,6,1.578335e+09,5,5,80304030303020301,304,...,23,23,23,23,23,23,23,23,dns.google,k_00001
824635,192.168.20.111,34658,52.33.55.70,443,6,1.578335e+09,5,5,80304030303020301,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,shavar.services.mozilla.com,k_00001
824636,192.168.20.111,36172,8.8.8.8,443,6,1.578335e+09,5,5,80304030303020301,304,...,23,23,23,23,23,23,23,23,dns.google,k_00001


In [3]:
# How often each value occurs in the last direction slot (column 'tls_dir_19').
doh_df['tls_dir_19'].value_counts()

tls_dir_19
-1    586529
 1    176330
 0     61737
-2        42
Name: count, dtype: int64

In [4]:
# Share of rows whose 'tls_dir_19' value is -1, computed from the data itself
# instead of from counts copied by hand out of the previous cell's output
# (hand-copied numbers silently go stale when the dataset changes).
# NOTE(review): -1 is presumably the "slot not used" filler — confirm.
float((doh_df['tls_dir_19'] == -1).mean())

0.7112563330819098

In [5]:
# Flag (c2 = 1) every row whose 'file' value starts with 'e', 'f' or 'g';
# all other rows keep c2 = 0.
capture_file = doh_df['file']
prefix_match = (capture_file.str.startswith('e') |
                capture_file.str.startswith('f') |
                capture_file.str.startswith('g'))
doh_df['c2'] = 0
doh_df.loc[prefix_match, 'c2'] = 1

In [6]:
# Every flow starts out labelled 'TLS 1.2'; flows matching all three conditions
# below are relabelled 'TLS 1.3'.
doh_df['tls_version_guess'] = 'TLS 1.2'

# 1) client and server record-level version fields are both 5
both_version_5 = (doh_df['c_tls_version'] == 5) & (doh_df['s_tls_version'] == 5)

# 2) the client sent a non-empty supported_versions value
client_versions = doh_df['c_supported_versions']
client_sent_versions = (client_versions != '-1') & (client_versions != '')

# 3) the server's supported_versions is one of the accepted codes
#    (presumably TLS 1.3 and draft/vendor variants — confirm). The column can hold
#    either strings or integers, hence the comparison against both '304' and 304.
server_versions = doh_df['s_supported_versions']
server_chose_13 = ((server_versions == '7f17') | (server_versions == 'fb1a') |
                   (server_versions == '304') | (server_versions == 304))

doh_df.loc[both_version_5 & client_sent_versions & server_chose_13, 'tls_version_guess'] = 'TLS 1.3'

# Summary counts.
is_tls12 = doh_df['tls_version_guess'] == 'TLS 1.2'
is_tls13 = doh_df['tls_version_guess'] == 'TLS 1.3'
print("Total:",len(doh_df))
print("TLS 1.2: {}, TLS 1.3: {}".format(is_tls12.sum(), is_tls13.sum()))
print("TLS 1.3 no resumptions:", (is_tls13 & (doh_df['s_psk'] == -1)).sum())

Total: 824638
TLS 1.2: 431973, TLS 1.3: 392665
TLS 1.3 no resumptions: 345410


In [7]:
# Row counts per c2 flag, then the TLS version breakdown of the c2 == 1 rows.
c2_counts = doh_df['c2'].value_counts()
c2_version_guess = doh_df.loc[doh_df['c2'] == 1, 'tls_version_guess']
display(c2_counts)
display(c2_version_guess.value_counts())

c2
0    677378
1    147260
Name: count, dtype: int64

tls_version_guess
TLS 1.3    143448
TLS 1.2      3812
Name: count, dtype: int64

In [8]:
# Certificate size guessing is intentionally not performed here:
# the input files already contain the certificate size guesses.

# from tls_paper_funcs import cert_size_inference_simple_diagram
# tls13samples = list(doh_df[(doh_df['tls_version_guess'] == 'TLS 1.3') & (doh_df['s_psk'] == -1)].index)
# for i in tls13samples:
#    doh_df.loc[i,'cert_size_guess_simple'] = cert_size_inference_simple_diagram(doh_df.loc[i])
#    print(i, end='\r')
# doh_df.to_csv('data/doh/joy-tls-versions-certinf-joypatched.csv', index=False)

# pd.read_csv('data/doh/joy-tls-versions-certinf-joypatched.csv')

In [9]:
# Graph for certificate size distribution:
# _ = doh_df[doh_df['cert_size_guess_simple'] > 0]['cert_size_guess_simple'].hist(bins=100, figsize=(16,6))

In [10]:
# Index labels of TLS 1.3, non-resumed (s_psk == -1) flows that contain at least
# one unexpected record-type value: anything above 23, or strictly between 1 and 20.
tls_tps = [f'tls_tp_{slot}' for slot in range(20)]
tp_values = doh_df[tls_tps]
unexpected_tp = (tp_values > 23) | ((tp_values > 1) & (tp_values < 20))
tls13_no_resumption = (doh_df['tls_version_guess'] == 'TLS 1.3') & (doh_df['s_psk'] == -1)
bad_rows = doh_df.index[(unexpected_tp.any(axis=1) & tls13_no_resumption).to_numpy()]

In [11]:
# There are rows with errors that should be removed.
# `bad_rows` holds index *labels* (it was taken from `.index`), so the rows are
# selected with label-based `.loc`. Positional `.iloc` only returns the same rows
# while the frame still has its default 0..n-1 index.
doh_df.loc[bad_rows]

Unnamed: 0,c_ip,c_port,s_ip,s_port,ip_proto,start_time,c_tls_version,s_tls_version,c_supported_versions,s_supported_versions,...,tls_tp_14,tls_tp_15,tls_tp_16,tls_tp_17,tls_tp_18,tls_tp_19,sni,file,c2,tls_version_guess
118,192.168.20.191,57664,220.181.90.52,443,6,1.579027e+09,5,5,0a7a7a0304030303020301,304,...,-1,-1,-1,-1,-1,-1,sohu.com,a_00000,0,TLS 1.3
5316,192.168.20.191,49845,103.138.128.65,443,6,1.579031e+09,5,5,0afafa0304030303020301,304,...,-1,-1,-1,-1,-1,-1,accounts.zoho.com.au,a_00000,0,TLS 1.3
7117,192.168.20.191,53074,184.84.243.41,443,6,1.579032e+09,5,5,0a2a2a0304030303020301,304,...,185,24,43,27,164,168,sf-tb-sg.ibytedtos.com,a_00001,0,TLS 1.3
7119,192.168.20.191,53081,184.84.243.51,443,6,1.579032e+09,5,5,0a8a8a0304030303020301,304,...,-1,-1,-1,-1,-1,-1,s16.tiktokcdn.com,a_00001,0,TLS 1.3
9898,192.168.20.191,57772,157.240.2.25,443,6,1.579034e+09,5,5,0a5a5a0304030303020301,304,...,163,152,143,-1,-1,-1,connect.facebook.net,a_00001,0,TLS 1.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
801491,192.168.20.111,50284,104.123.193.100,443,6,1.576816e+09,5,5,80304030303020301,304,...,23,23,23,109,76,247,players.brightcove.net,k_00001,0,TLS 1.3
801877,192.168.20.111,40326,74.119.119.131,443,6,1.576816e+09,5,5,80304030303020301,304,...,23,23,157,23,23,-1,static.criteo.net,k_00001,0,TLS 1.3
802216,192.168.20.111,60704,104.25.143.101,443,6,1.576816e+09,5,5,80304030303020301,304,...,-1,-1,-1,-1,-1,-1,images-cache.alwatanvoice.com,k_00001,0,TLS 1.3
802279,192.168.20.111,45572,35.211.99.204,443,6,1.576816e+09,5,5,80304030303020301,304,...,-1,-1,-1,-1,-1,-1,bsw.digitru.st,k_00001,0,TLS 1.3


In [12]:
# Build the filtered TLS 1.3 table: take the non-resumed TLS 1.3 flows, drop the
# rows flagged in bad_rows, run the application-data filtering helper on them and
# cast the result to int. The flow-identifying columns are then copied over
# positionally (via .values), so the helper's output must keep the row order.
id_cols = ['c_ip', 'c_port', 's_ip', 's_port', 'ip_proto', 'start_time', 'sni', 'file', 'c2']
tls13_no_resumption = (doh_df['tls_version_guess'] == 'TLS 1.3') & (doh_df['s_psk'] == -1)
tls13_clean = doh_df[tls13_no_resumption].drop(bad_rows)

# Snapshot the identifier values before handing the frame to the helper.
id_values = tls13_clean[id_cols].values

doh13_filtered = tls_12_appdata_filtering(tls13_clean).astype(int)
doh13_filtered[id_cols] = id_values
doh13_filtered

Unnamed: 0,tls_b_0,tls_b_1,tls_b_2,tls_b_3,tls_b_4,tls_b_5,tls_b_6,tls_b_7,tls_b_8,tls_b_9,...,tls_dir_19,c_ip,c_port,s_ip,s_port,ip_proto,start_time,sni,file,c2
0,2907,53,81,302,27,497,57,26,26,740,...,-1,192.168.20.191,57501,172.217.164.205,443,6,1579026596.703039,accounts.google.com,a_00000,0
1,46,3367,281,69,69,266,266,34,19,-1,...,-1,192.168.20.191,57504,23.4.44.172,443,6,1579026597.537284,udn.com,a_00000,0
2,3643,53,81,260,497,57,26,363,1425,1025,...,-1,192.168.20.191,57507,172.217.1.3,443,6,1579026597.900412,www.gstatic.com,a_00000,0
3,2929,53,81,285,28,497,57,26,26,350,...,-1,192.168.20.191,57509,172.217.1.10,443,6,1579026598.017493,www.googleapis.com,a_00000,0
4,46,3372,281,69,69,266,266,34,19,-1,...,-1,192.168.20.191,57510,72.247.168.197,443,6,1579026598.161586,s.udn.com.tw,a_00000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
824518,32,2664,95,69,69,165,198,88,52,88,...,-1,192.168.20.111,49616,9.9.9.9,443,6,1578327212.387145,,k_00001,0
824552,32,2664,97,69,69,165,198,90,52,90,...,-1,192.168.20.111,49684,9.9.9.9,443,6,1578329171.610747,,k_00001,0
824582,46,2575,281,69,69,165,247,282,282,65,...,-1,192.168.20.111,49418,23.33.57.61,443,6,1578331412.484167,getpocket.cdn.mozilla.net,k_00001,0
824604,32,2664,95,69,69,165,198,90,52,90,...,-1,192.168.20.111,49788,9.9.9.9,443,6,1578332771.933971,,k_00001,0


In [13]:
# NOTE(review): remove_empty is imported from tls_paper_funcs and its body is not
# visible here. Presumably it removes empty rows from doh13_filtered in place and
# returns how many were removed — confirm against the helper.
remove_empty(doh13_filtered, True)

50


In [14]:
# For every remaining flow, run the NST detection/removal helper and store the
# result as one fixed-width row: 20 tls_b slots followed by 20 tls_dir slots,
# each right-padded with -1. Rows are kept as (index label, values) pairs.
tls13samples = list(doh13_filtered.index)
print(len(tls13samples))

tls13doh_appdata_filtered = []
for count, i in enumerate(tls13samples):
    b_vals, dir_vals = nst_detection_and_removal_after_filtering(doh13_filtered.loc[i])
    b_padding = [-1] * (20 - len(b_vals))
    dir_padding = [-1] * (20 - len(dir_vals))
    padded_row = np.hstack([b_vals, b_padding, dir_vals, dir_padding])
    tls13doh_appdata_filtered.append((i, padded_row))
    # crude progress indicator, overwritten in place
    print(count, end='\r')

342881
342880

In [15]:
# Columns carried along to identify each flow (endpoints, protocol, start time,
# SNI, source capture file and the c2 flag).
ident_columns = [
    'c_ip', 'c_port',
    's_ip', 's_port',
    'ip_proto', 'start_time',
    'sni', 'file', 'c2',
]

In [16]:
# Column names for the 20 tls_b slots and the 20 tls_dir slots.
tls_bs = [f'tls_b_{slot}' for slot in range(20)]
tls_dir = [f'tls_dir_{slot}' for slot in range(20)]

# Stack the per-flow rows into one frame, indexed by the original row labels.
row_labels = [label for label, _ in tls13doh_appdata_filtered]
row_values = [values for _, values in tls13doh_appdata_filtered]
doh_tlsdata13 = pd.DataFrame(np.vstack(row_values),
                             index=row_labels,
                             columns=tls_bs + tls_dir)

In [17]:
# Attach the flow-identifying columns. The assignment is aligned on the row index,
# which doh_tlsdata13 shares with doh13_filtered (its index labels were taken from
# doh13_filtered.index).
doh_tlsdata13[ident_columns] = doh13_filtered[ident_columns]

In [18]:
# Drop flows whose 20 tls_b slots are all -1, and report how many were dropped.
all_slots_unset = doh_tlsdata13[tls_bs].eq(-1).all(axis=1)
to_remove13 = doh_tlsdata13.index[all_slots_unset.to_numpy()]
print(len(to_remove13))
doh_tlsdata13.drop(index=to_remove13, inplace=True)

24036


In [19]:
# Show the resulting table: padded tls_b / tls_dir slots plus the identifier columns.
doh_tlsdata13

Unnamed: 0,tls_b_0,tls_b_1,tls_b_2,tls_b_3,tls_b_4,tls_b_5,tls_b_6,tls_b_7,tls_b_8,tls_b_9,...,tls_dir_19,c_ip,c_port,s_ip,s_port,ip_proto,start_time,sni,file,c2
0,81,302,27,57,26,26,740,64,288,64,...,-1,192.168.20.191,57501,172.217.164.205,443,6,1579026596.703039,accounts.google.com,a_00000,0
1,266,266,34,-1,-1,-1,-1,-1,-1,-1,...,-1,192.168.20.191,57504,23.4.44.172,443,6,1579026597.537284,udn.com,a_00000,0
2,81,260,57,26,363,1425,1025,34,34,-1,...,-1,192.168.20.191,57507,172.217.1.3,443,6,1579026597.900412,www.gstatic.com,a_00000,0
3,81,285,28,57,26,26,350,336,202,34,...,-1,192.168.20.191,57509,172.217.1.10,443,6,1579026598.017493,www.googleapis.com,a_00000,0
4,266,266,34,-1,-1,-1,-1,-1,-1,-1,...,-1,192.168.20.191,57510,72.247.168.197,443,6,1579026598.161586,s.udn.com.tw,a_00000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
824518,165,198,88,52,88,60,26,288,56,-1,...,-1,192.168.20.111,49616,9.9.9.9,443,6,1578327212.387145,,k_00001,0
824552,165,198,90,52,90,60,26,282,233,-1,...,-1,192.168.20.111,49684,9.9.9.9,443,6,1578329171.610747,,k_00001,0
824582,165,247,65,26,1041,1041,1041,1041,1041,-1,...,-1,192.168.20.111,49418,23.33.57.61,443,6,1578331412.484167,getpocket.cdn.mozilla.net,k_00001,0
824604,165,198,90,52,90,60,26,290,241,-1,...,-1,192.168.20.111,49788,9.9.9.9,443,6,1578332771.933971,,k_00001,0


In [None]:
# doh_tlsdata13.to_csv('data/doh/tls13-behav.csv')