In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score, make_scorer

# Decision tree with cross validation:
from sklearn.tree import DecisionTreeClassifier as dtc # tree algorithm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedGroupKFold

from functools import reduce

In [2]:
tls_tps = ['tls_tp_'+str(x) for x in range(20)]
tls_bs = ['tls_b_'+str(x) for x in range(20)]
tls_dir = ['tls_dir_'+str(x) for x in range(20)]
joy_indices_tls = tls_bs + tls_dir

### Loading data:

In [3]:
labeled_mta = pd.read_csv('processed-datasets/labeled_mta.csv', low_memory=False)
# MTA files range from 4 to 2345

In [4]:
tranco_joy = pd.read_csv('processed-datasets/tranco-dataset.csv', low_memory=False)
tranco_joy['c2'] = 0

# There are 2000 tranco files. We will assign an ID from 10000 to 11999
# so that they don't overlap with MTA.
tranco_joy['file_cat'] = tranco_joy['file'].astype('category')
tranco_joy['file'] = tranco_joy['file_cat'].cat.codes
tranco_joy['file'] = tranco_joy['file'] + 10000
tranco_joy['file']

0         10000
1         10000
2         10000
3         10000
4         10000
          ...  
505933    11999
505934    11999
505935    11999
505936    11999
505937    11999
Name: file, Length: 505938, dtype: int16

In [5]:
metasploit_joy = pd.read_csv('processed-datasets/ms-full.csv')
metasploit_joy['c2'] = 1
metasploit_joy['source'] = 'MS'

# For metasploit there are 44 combinations of (command, tlsversion, certificate).
# These will be used as a file id. From 20000 onwards to avoid overlap.
metasploit_joy['file'] = metasploit_joy.groupby(['command', 'tlsver', 'certver']).ngroup() + 20000

In [6]:
full_df_with_resumes = pd.concat([labeled_mta, tranco_joy], ignore_index=True)
full_df_with_resumes = shuffle(full_df_with_resumes, random_state=42)

full_df = full_df_with_resumes[(full_df_with_resumes['session_resumed'] == False)|(full_df_with_resumes['s_psk'] == -1)].reset_index(drop=True)

In [7]:
full_df_12 = full_df[full_df['tls_version_guess'] == 'TLS 1.2'].copy()
full_df_12.reset_index(drop=True, inplace=True)

In [8]:
# Take ~20% of Tranco and MTA for final testing. These should be taken considering groups.
# This can pottentially make 5 folds, and the experiment could be repeated.
# Worth doing, or just do this single experiment?

# X and Y composed only of MTA and Tranco 1.2.
# StratifiedGroupKFold to take 20% out for final testing.
x = full_df_12.loc[:,joy_indices_tls].values.astype(np.float64)
y = full_df_12['c2'].values
groups = full_df_12['file'].values

In [9]:
sgkf = StratifiedGroupKFold(n_splits=5)
# Take 1 fold/split, as our traing/test split:
train_index, test_index = next(sgkf.split(x, y, groups))
# groups = groups[train_index]

In [10]:
# Metasploit 1.2 Short -- take 50% of files for testing only.
# Out of the reamining 50%, take only 50% of each file for testing too.
# Only 50% of 50% is used for training/validation in StratifiedGroupKFold fashion.
# Metasploit 1.3 is all used for testing. Never for training -- We assume we don't have its groundtruth.

# These test files won't be used for training/validation at all:
# (can we catch an MS deployment we haven't seen before?)
tls12long_and_tls13_samples = (metasploit_joy['tlsver'] == 'tls13')|((metasploit_joy['tlsver'] == 'tls12') & (metasploit_joy['certver'] == 'longcert'))
tls12long_and_tls13_files = metasploit_joy[tls12long_and_tls13_samples]['file'].unique()

np.random.seed(42)
metasploit_test_files = np.random.choice(
    metasploit_joy[~metasploit_joy['file'].isin(tls12long_and_tls13_files)]['file'].unique(),
    metasploit_joy[~metasploit_joy['file'].isin(tls12long_and_tls13_files)]['file'].nunique()//2,
    replace=False
)

all_test_files = np.append(metasploit_test_files, tls12long_and_tls13_files)

In [11]:
# Out of the remaining files, we will also take 50% of traffic for testing
# (can we effectively catch traffic from deployments we have seen before?)
metasploit_train_samples, metasploit_test_samples_short12 = train_test_split(
    metasploit_joy[~metasploit_joy['file'].isin(all_test_files)].index,
    test_size=0.9,
    random_state=42,
)

metasploit_test_samples = np.append(metasploit_joy[metasploit_joy['file'].isin(all_test_files)].index, metasploit_test_samples_short12)

In [12]:
metasploit_joy[~metasploit_joy['file'].isin(all_test_files)]['certver'].value_counts()

certver
shortcert    73146
Name: count, dtype: int64

In [13]:
# Now we combine MS 1.2 short (train), with the previous training samples obtained from the StratifiedGroupKFold.
# These are used with Cross Validation to train RFCs and DecTrees. CV should be sgkf as well.
# Using CV we will choose the best parameters (tree depth, number of estimators, whatever...),
# and then use those parameters to train a "final model", that we will apply to the "test set".

train_df = shuffle(pd.concat([
    metasploit_joy.loc[metasploit_train_samples],
    full_df_12.loc[train_index]
], ignore_index=True), random_state=42).reset_index(drop=True)

display(train_df['source'].value_counts())

x_train = train_df.loc[:,joy_indices_tls].values.astype(np.float64)
y_train = train_df['c2'].values
groups_train = train_df['file'].values

source
tranco    225758
MTA        28988
MS          7314
Name: count, dtype: int64

In [14]:
# For the test set, we should display results separated by (sub)datasets:
# Tranco 1.2/1.3 (TNRs); MTA (TPR, TNR); MS 1.2/1.3 short/long (TPRs); DoH? (TNR)
tranco_test = {
    12: full_df.loc[test_index][(full_df.loc[test_index, 'source'] == 'tranco') & (full_df.loc[test_index, 'tls_version_guess'] == 'TLS 1.2')].loc[:,joy_indices_tls].values.astype(np.float64),
    13: tranco_joy.loc[tranco_joy['tls_version_guess'] == 'TLS 1.3', joy_indices_tls].values.astype(np.float64),
}
mta_test = full_df.loc[test_index][full_df.loc[test_index, 'source'] == 'MTA'].loc[:,joy_indices_tls].values.astype(np.float64)
mta_test_y = full_df.loc[test_index][full_df.loc[test_index, 'source'] == 'MTA'].loc[:,'c2'].values

ms_test = {
    12: {"short": metasploit_joy.loc[metasploit_test_samples].loc[(metasploit_joy.loc[metasploit_test_samples]['tlsver'] == 'tls12') & (metasploit_joy.loc[metasploit_test_samples]['certver'] == 'shortcert'), joy_indices_tls].values.astype(np.float64),
         "long":  metasploit_joy.loc[metasploit_test_samples].loc[(metasploit_joy.loc[metasploit_test_samples]['tlsver'] == 'tls12') & (metasploit_joy.loc[metasploit_test_samples]['certver'] == 'longcert'), joy_indices_tls].values.astype(np.float64)},
    13: {"short": metasploit_joy.loc[(metasploit_joy['tlsver'] == 'tls13') & (metasploit_joy['certver'] == 'shortcert'), joy_indices_tls].values.astype(np.float64),
         "long":  metasploit_joy.loc[(metasploit_joy['tlsver'] == 'tls13') & (metasploit_joy['certver'] == 'longcert'), joy_indices_tls].values.astype(np.float64)}
}

## Training

In [15]:
scoring = {"AUC": "roc_auc",
           "Accuracy": "accuracy",
           "Balanced Accracy": "balanced_accuracy",
           "Recall": "recall",
           "Neg Recall": make_scorer(recall_score, pos_label=0),
           "F1": "f1"}

sgkf = StratifiedGroupKFold(n_splits=10)

In [16]:
dtc_classifier = dtc(random_state=42)
parameters = {'max_depth':list(range(1,20))+[30,40,50,60], 
              'criterion': ['gini', 'entropy'], 
              'splitter': ['best', 'random'], 
              'max_features': [None, 'sqrt', 'log2'],
              'class_weight': [None, 'balanced']}
clf = GridSearchCV(dtc_classifier, parameters, n_jobs=-1, scoring=scoring, refit='F1', cv=sgkf)

In [17]:
# Skip this if not performing grid search again.
# clf.fit(x_train, y_train, groups=groups_train)
# print(clf.best_score_, clf.best_params_)
# clf.cv_results_
# 0.966298722793853 {'class_weight': None, 'criterion': 'gini', 'max_depth': 16, 'max_features': None, 'splitter': 'best'}

In [29]:
# Skip this if not performing grid search again.
# Now that we have the "best parameters", print some metrics:
tree_model = clf.best_estimator_
scores_dtc = cross_validate(clf.best_estimator_, x_train, y_train, groups=groups_train, cv=sgkf, n_jobs=-1, scoring=scoring, return_estimator=True)
display(scores_dtc)

# Print the average feature importance for all CV tree classifiers:
feature_importances = []
for idx,estimator in enumerate(scores_dtc['estimator']):
    feature_importances.append(pd.DataFrame(estimator.feature_importances_,
                                            index = joy_indices_tls,
                                            columns=['importance'+str(idx)]).sort_values('importance'+str(idx), ascending=False))

all_feature_importances = reduce(lambda left,right: pd.merge(left,right,how='outer',
                                                             left_index=True,
                                                             right_index=True), feature_importances)
display(all_feature_importances.mean(axis=1).sort_values(ascending=False).head(10))

{'fit_time': array([1.78152275, 1.57804203, 1.73124933, 1.98099375, 1.66869235,
        1.55350852, 1.80737543, 1.62125802, 1.71402121, 1.52401829]),
 'score_time': array([0.04558277, 0.04478908, 0.04452682, 0.04931736, 0.04899573,
        0.04472017, 0.04792953, 0.044801  , 0.04540658, 0.05142808]),
 'estimator': [DecisionTreeClassifier(max_depth=16, random_state=42),
  DecisionTreeClassifier(max_depth=16, random_state=42),
  DecisionTreeClassifier(max_depth=16, random_state=42),
  DecisionTreeClassifier(max_depth=16, random_state=42),
  DecisionTreeClassifier(max_depth=16, random_state=42),
  DecisionTreeClassifier(max_depth=16, random_state=42),
  DecisionTreeClassifier(max_depth=16, random_state=42),
  DecisionTreeClassifier(max_depth=16, random_state=42),
  DecisionTreeClassifier(max_depth=16, random_state=42),
  DecisionTreeClassifier(max_depth=16, random_state=42)],
 'test_AUC': array([0.98497039, 0.96679266, 0.99411598, 0.9775256 , 0.99313818,
        0.998679  , 0.99457532, 0.

tls_b_0     0.501670
tls_b_2     0.277984
tls_b_6     0.122010
tls_b_1     0.042179
tls_b_4     0.014376
tls_b_7     0.010433
tls_b_11    0.007931
tls_b_10    0.004608
tls_b_12    0.003297
tls_b_9     0.002802
dtype: float64

In [None]:
# Skip this if not performing grid search again.
# We will take the estimator with the best performance, train on all train data, apply to test data:
tree_model = clf.best_estimator_
tree_model.fit(x_train, y_train)

In [18]:
# Instead of doing grid search again, use previous parameters:
tree_model = dtc(random_state=42, class_weight=None, criterion='gini', max_depth=16, max_features=None, splitter='best')
tree_model.fit(x_train, y_train)

## Evaluation on the test set

In [19]:
doh_df = pd.read_csv('data/doh/joy-tls-versions-certinf-joypatched.csv')

  doh_df = pd.read_csv('data/doh/joy-tls-versions-certinf-joypatched.csv')


In [20]:
bad_rows = doh_df[( (doh_df[tls_tps]>23).any(axis=1) |\
                    (((doh_df[tls_tps]<20)&(doh_df[tls_tps]>1)).any(axis=1)) \
                  ) & (doh_df['tls_version_guess'] == 'TLS 1.3') & (doh_df['s_psk'] == -1)].index
print(len(bad_rows))
doh_df.drop(bad_rows, inplace=True)

2479


In [21]:
doh_df['c2'] = 0
doh_df.loc[doh_df['file'].str.startswith('e') |
           doh_df['file'].str.startswith('f') |
           doh_df['file'].str.startswith('g'), 'c2'] = 1

In [22]:
doh_df_all_benign = doh_df.loc[(doh_df['tls_version_guess'] == 'TLS 1.3') & (doh_df['s_psk'] == -1) & (doh_df['c2'] == 0), joy_indices_tls].values.astype(np.float64)
doh_df_all_malicious = doh_df.loc[(doh_df['tls_version_guess'] == 'TLS 1.3') & (doh_df['s_psk'] == -1) & (doh_df['c2'] == 1), joy_indices_tls].values.astype(np.float64)

In [23]:
print(" & ".join('{:.2f}'.format(x) for x in [ sum(tree_model.predict(mta_test)[np.where(mta_test_y==0)] == 0) / sum(mta_test_y==0) *100,
                   sum(tree_model.predict(mta_test)[np.where(mta_test_y)]) / sum(mta_test_y) *100,
                   sum(tree_model.predict(tranco_test[12])==0)/len(tranco_test[12]) * 100,
                   sum(tree_model.predict(tranco_test[13])==0)/len(tranco_test[13]) * 100,
                   sum(tree_model.predict(ms_test[12]["short"])==1)/len(ms_test[12]["short"]) * 100,
                   sum(tree_model.predict(ms_test[12]["long"])==1)/len(ms_test[12]["long"]) * 100,
                   sum(tree_model.predict(ms_test[13]["short"])==1)/len(ms_test[13]["short"]) * 100,
                   sum(tree_model.predict(ms_test[13]["long"])==1)/len(ms_test[13]["long"]) * 100,
                   sum(tree_model.predict(doh_df_all_benign)==0)/len(doh_df_all_benign) * 100]))

96.87 & 98.82 & 100.00 & 99.98 & 100.00 & 0.00 & 0.00 & 0.00 & 99.98


In [24]:
# Motivating Example:
np.random.seed(42)

me_tranco12 = tree_model.predict(tranco_test[12][np.random.choice(range(tranco_test[12].shape[0]),10000,replace=False)])
me_tranco13 = tree_model.predict(tranco_test[13][np.random.choice(range(tranco_test[13].shape[0]),10000,replace=False)])
me_ms12 = tree_model.predict(ms_test[12]["short"][np.random.choice(range(ms_test[12]["short"].shape[0]),75,replace=False)])
me_ms13short = tree_model.predict(ms_test[13]["short"][np.random.choice(range(ms_test[13]["short"].shape[0]),75,replace=False)])
me_ms13long = tree_model.predict(ms_test[13]["long"][np.random.choice(range(ms_test[13]["long"].shape[0]),50,replace=False)])

print(" & ".join('{:.2f}'.format(x) for x in [
    (sum(me_tranco12 == 0) / len(me_tranco12) * 100),
    (sum(me_tranco13 == 0) / len(me_tranco13) * 100),
    (sum(me_ms12 == 1) / len(me_ms12) * 100),
    (sum(me_ms13short == 1) / len(me_ms13short) * 100),
    (sum(me_ms13long == 1) / len(me_ms13long) * 100)
]))

100.00 & 99.98 & 100.00 & 0.00 & 0.00


## RFs

In [25]:
rfc = RandomForestClassifier(random_state=42, n_jobs=-1)
parameters_rf = {'n_estimators':[40, 50, 60, 70],
                 'max_depth': [5, 10, 15, 20, 25, None],
                 'max_features': [None, 'sqrt', 'log2'],
                 'class_weight': [None, 'balanced']
                }
clf_rf = GridSearchCV(rfc, parameters_rf, n_jobs=-1, scoring=scoring, refit='F1', cv=sgkf)

In [26]:
# Skip this if not performing grid search again.
# clf_rf.fit(x_train, y_train, groups=groups_train)
# print(clf_rf.best_score_, clf_rf.best_params_)
# 0.9688526568042404 {'class_weight': None, 'max_depth': 15, 'max_features': None, 'n_estimators': 70}

In [78]:
# Skip this if not performing grid search again.
# Now that we have the "best parameters", print some metrics:
scores_rf = cross_validate(clf_rf.best_estimator_, x_train, y_train, groups=groups_train, cv=sgkf, n_jobs=-1, scoring=scoring, return_estimator=True)
display(scores_rf)

# Print the average feature importance for all CV tree classifiers:
feature_importances_rf = []
for idx,estimator in enumerate(scores_rf['estimator']):
    feature_importances_rf.append(pd.DataFrame(estimator.feature_importances_,
                                            index = joy_indices_tls,
                                            columns=['importance'+str(idx)]).sort_values('importance'+str(idx), ascending=False))

{'fit_time': array([124.47887111, 119.66531515, 127.79431891, 116.8562007 ,
        119.31770992, 125.28497672, 126.11348939, 125.77922726,
         29.61426902,  25.91545391]),
 'score_time': array([0.58903694, 0.75628281, 0.65951753, 0.71559596, 0.70435691,
        0.60523915, 0.58164692, 0.59670067, 0.13466477, 0.24692106]),
 'estimator': [RandomForestClassifier(max_depth=15, max_features=None, n_estimators=70,
                         n_jobs=-1, random_state=42),
  RandomForestClassifier(max_depth=15, max_features=None, n_estimators=70,
                         n_jobs=-1, random_state=42),
  RandomForestClassifier(max_depth=15, max_features=None, n_estimators=70,
                         n_jobs=-1, random_state=42),
  RandomForestClassifier(max_depth=15, max_features=None, n_estimators=70,
                         n_jobs=-1, random_state=42),
  RandomForestClassifier(max_depth=15, max_features=None, n_estimators=70,
                         n_jobs=-1, random_state=42),
  RandomFore

In [79]:
# Skip this if not performing grid search again.
all_feature_importances_rf = reduce(lambda left,right: pd.merge(left,right,how='outer',
                                                             left_index=True,
                                                             right_index=True), feature_importances_rf)
display(all_feature_importances_rf.mean(axis=1).sort_values(ascending=False).head(10))

tls_b_0     0.501127
tls_b_2     0.277565
tls_b_6     0.122021
tls_b_1     0.043212
tls_b_4     0.014349
tls_b_7     0.008647
tls_b_11    0.008100
tls_b_10    0.004797
tls_b_9     0.003126
tls_b_12    0.002687
dtype: float64

In [None]:
# Skip this if not performing grid search again.
# We will take the estimator with the best performance, train on all train data, apply to test data:
rf_model = clf_rf.best_estimator_
rf_model.fit(x_train, y_train)

In [27]:
rf_model = RandomForestClassifier(random_state=42, n_jobs=-1, class_weight=None, max_depth=15, max_features=None, n_estimators=70)
rf_model.fit(x_train, y_train)

In [28]:
print(" & ".join('{:.2f}'.format(x) for x in [ 
                   sum(rf_model.predict(mta_test)[np.where(mta_test_y==0)] == 0) / sum(mta_test_y==0) *100,
                   sum(rf_model.predict(mta_test)[np.where(mta_test_y)]) / sum(mta_test_y) *100,
                   sum(rf_model.predict(tranco_test[12])==0)/len(tranco_test[12]) * 100,
                   sum(rf_model.predict(tranco_test[13])==0)/len(tranco_test[13]) * 100,
                   sum(rf_model.predict(ms_test[12]["short"])==1)/len(ms_test[12]["short"]) * 100,
                   sum(rf_model.predict(ms_test[12]["long"])==1)/len(ms_test[12]["long"]) * 100,
                   sum(rf_model.predict(ms_test[13]["short"])==1)/len(ms_test[13]["short"]) * 100,
                   sum(rf_model.predict(ms_test[13]["long"])==1)/len(ms_test[13]["long"]) * 100,
                   sum(rf_model.predict(doh_df_all_benign)==0)/len(doh_df_all_benign) * 100]))

96.68 & 98.92 & 100.00 & 100.00 & 100.00 & 0.00 & 0.00 & 0.00 & 99.98


## Motivating Example:

In [29]:
np.random.seed(42)

me_tranco12 = rf_model.predict(tranco_test[12][np.random.choice(range(tranco_test[12].shape[0]),10000,replace=False)])
me_tranco13 = rf_model.predict(tranco_test[13][np.random.choice(range(tranco_test[13].shape[0]),10000,replace=False)])
me_ms12 = rf_model.predict(ms_test[12]["short"][np.random.choice(range(ms_test[12]["short"].shape[0]),75,replace=False)])
me_ms13short = rf_model.predict(ms_test[13]["short"][np.random.choice(range(ms_test[13]["short"].shape[0]),75,replace=False)])
me_ms13long = rf_model.predict(ms_test[13]["long"][np.random.choice(range(ms_test[13]["long"].shape[0]),50,replace=False)])

print(" & ".join('{:.2f}'.format(x) for x in [
    (sum(me_tranco12 == 0) / len(me_tranco12) * 100),
    (sum(me_tranco13 == 0) / len(me_tranco13) * 100),
    (sum(me_ms12 == 1) / len(me_ms12) * 100),
    (sum(me_ms13short == 1) / len(me_ms13short) * 100),
    (sum(me_ms13long == 1) / len(me_ms13long) * 100)
]))

100.00 & 100.00 & 100.00 & 0.00 & 0.00


## Load UCNet
This part was not used. It was just meant to confirm that C-Basic's results are artificially "good" on UCNet.

In [None]:
realnet_all = pd.read_csv('data/ucnet/all.csv')

In [None]:
realnet_all = realnet_all[realnet_all['first_hs_tp'].astype(str) == '1'].reset_index(drop=True)
tls_dir = ['tls_dir_'+str(a) for a in range(20)]
realnet_all.drop(realnet_all[realnet_all[tls_dir].isin([1,-1]).all(axis=1)].index, inplace=True)
realnet_all.drop(realnet_all[realnet_all[tls_dir].isin([0,-1]).all(axis=1)].index, inplace=True)

In [None]:
realnet_all['tls_version_guess'] = 'TLS 1.2'
realnet_all.loc[(realnet_all['c_tls_version']==5)&(realnet_all['s_tls_version']==5)&
       (realnet_all['c_supported_versions']!='-1')&(realnet_all['c_supported_versions']!='')&
       ((realnet_all['s_supported_versions'] == '7f17')|(realnet_all['s_supported_versions'] == 'fb1a')|
       (realnet_all['s_supported_versions'] == '304')|(realnet_all['s_supported_versions'] == 304)|(realnet_all['s_supported_versions'] == '0304')), 'tls_version_guess'] = 'TLS 1.3'

In [None]:
realnet_all_test = {
    12: realnet_all.loc[realnet_all['tls_version_guess'] == 'TLS 1.2', joy_indices_tls].values.astype(np.float64),
    13: realnet_all.loc[realnet_all['tls_version_guess'] == 'TLS 1.3', joy_indices_tls].values.astype(np.float64),
}

In [92]:
(sum(tree_model.predict(realnet_all_test[12])==0) / len(realnet_all_test[12]) *100,
 sum(rf_model.predict(realnet_all_test[12])==0) / len(realnet_all_test[12]) * 100)

(99.91765014267135, 99.92751737081711)

In [93]:
(sum(tree_model.predict(realnet_all_test[13])==0) / len(realnet_all_test[13]) *100,
 sum(rf_model.predict(realnet_all_test[13])==0) / len(realnet_all_test[13]) * 100)

(99.87851732262291, 99.93281744751667)