In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score, make_scorer

# Decision tree with cross validation:
from sklearn.tree import DecisionTreeClassifier as dtc # tree algorithm
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import StratifiedGroupKFold
from functools import reduce

In [2]:
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [3]:
tls_tps = ['tls_tp_'+str(x) for x in range(20)]
tls_bs = ['tls_b_'+str(x) for x in range(20)]
tls_dir = ['tls_dir_'+str(x) for x in range(20)]

joy_indices_tls = tls_bs + tls_dir
joy_indices_tp = tls_bs + tls_dir + tls_tps

In [4]:
tls_bs_half = ['tls_b_'+str(x) for x in range(10)]
tls_dir_half = ['tls_dir_'+str(x) for x in range(10)]
joy_indices_tls_half = tls_bs_half + tls_dir_half

In [5]:
ms_tlsdata = pd.read_csv('processed-datasets/ms-tls12-data.csv', index_col=0)
ms13_filtered = pd.read_csv('processed-datasets/ms-tls13-data.csv', index_col=0)
ms_tlsdata13 = pd.read_csv('processed-datasets/ms-tls13-behav.csv', index_col=0)
for df in [ms_tlsdata, ms13_filtered, ms_tlsdata13]:
    df['source'] = 'MS'
    df['c2'] = 1

In [6]:
# For metasploit there are 44 combinations of (command, tlsversion, certificate).
# These will be used as a file id. From 20000 onwards to avoid overlap.
ms_tlsdata['file'] = ms_tlsdata.groupby(['command', 'certver']).ngroup() + 20000

In [7]:
tranco_tlsdata12 = pd.read_csv('processed-datasets/tranco-tls12-data.csv', index_col=0)
tranco_13filtered = pd.read_csv('processed-datasets/tranco-tls13-data.csv', index_col=0)
tranco_13behav = pd.read_csv('processed-datasets/tranco-tls13-behav-new.csv', index_col=0)

for df in [tranco_tlsdata12, tranco_13filtered, tranco_13behav]:
    df['source'] = 'tranco'
    df['c2'] = 0
tranco_tlsdata12['tranco_type'] = 'data12'; tranco_13filtered['tranco_type'] = 'data13'; tranco_13behav['tranco_type'] = 'behav13'

In [8]:
# There are 2000 tranco files. We will assign an ID from 10000 to 11999
# so that they don't overlap with MTA.
tranco_tlsdata12['file_cat'] = tranco_tlsdata12['file'].astype('category')
tranco_tlsdata12['file'] = tranco_tlsdata12['file_cat'].cat.codes
tranco_tlsdata12['file'] = tranco_tlsdata12['file'] + 10000
tranco_tlsdata12['file']

0         10000
1         10000
3         10000
4         10000
5         10000
          ...  
505209    11510
505321    11510
505413    11510
505535    11510
505902    11510
Name: file, Length: 200678, dtype: int16

In [9]:
mta_tlsdata = pd.read_csv('processed-datasets/mta_tlsdata.csv')

  mta_tlsdata = pd.read_csv('processed-datasets/mta_tlsdata.csv')


In [10]:
doh_behav = pd.read_csv('data/doh/tls13-behav.csv', index_col=0)
doh_behav

Unnamed: 0,tls_b_0,tls_b_1,tls_b_2,tls_b_3,tls_b_4,tls_b_5,tls_b_6,tls_b_7,tls_b_8,tls_b_9,tls_b_10,tls_b_11,tls_b_12,tls_b_13,tls_b_14,tls_b_15,tls_b_16,tls_b_17,tls_b_18,tls_b_19,tls_dir_0,tls_dir_1,tls_dir_2,tls_dir_3,tls_dir_4,tls_dir_5,tls_dir_6,tls_dir_7,tls_dir_8,tls_dir_9,tls_dir_10,tls_dir_11,tls_dir_12,tls_dir_13,tls_dir_14,tls_dir_15,tls_dir_16,tls_dir_17,tls_dir_18,tls_dir_19
0,81,302,27,57,26,26,740,64,288,64,34,34,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,1,1,0,1,1,1,1,1,0,-1,-1,-1,-1,-1,-1,-1,-1
1,266,266,34,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,1,1,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,81,260,57,26,363,1425,1025,34,34,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,1,0,1,1,1,1,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,81,285,28,57,26,26,350,336,202,34,34,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,1,0,1,1,1,1,1,0,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,266,266,34,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,1,1,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176170,339,105,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
176171,343,1436,1436,1436,1436,1436,126,1436,131,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,1,1,1,1,1,1,1,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
176172,322,1241,306,1436,1436,1436,1436,1436,129,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,1,0,1,1,1,1,1,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
176173,165,262,57,26,26,30,418,1413,1413,1413,1413,1413,1413,-1,-1,-1,-1,-1,-1,-1,0,0,1,1,0,0,1,1,1,1,1,1,1,-1,-1,-1,-1,-1,-1,-1


In [11]:
full_df_tlsdata = shuffle(pd.concat([tranco_tlsdata12, mta_tlsdata], ignore_index=True), random_state=42)
#full_df = full_df_with_resumes[(full_df_with_resumes['session_resumed'] == False)|(full_df_with_resumes['s_psk'] == -1)].reset_index(drop=True)

In [12]:
# Take ~20% of Tranco and MTA for final testing. These should be taken considering groups.
# This can pottentially make 5 folds, and the experiment could be repeated.
# Worth doing, or just do this single experiment?

# X and Y composed only of MTA and Tranco 1.2.
# StratifiedGroupKFold to take 20% out for final testing.
x = full_df_tlsdata.loc[:,joy_indices_tls].values.astype(np.float64)
y = full_df_tlsdata['c2'].values
groups = full_df_tlsdata['file'].values

In [13]:
sgkf = StratifiedGroupKFold(n_splits=5)
# Take 1 fold/split, as our traing/test split:
train_index, test_index = next(sgkf.split(x, y, groups))

In [14]:
# Metasploit 1.2 Short -- take 50% of files for testing only.
# Out of the reamining 50%, take only 50% of each file for testing too.
# Only 50% of 50% is used for training/validation in StratifiedGroupKFold fashion.
# Metasploit 1.3 is all used for testing. Never for training -- We assume we don't have its groundtruth.

# These test files won't be used for training/validation at all:
# (can we catch an MS deployment we haven't seen before?)
tls12long = (ms_tlsdata['certver'] == 'longcert')
tls12long_files = ms_tlsdata[tls12long]['file'].unique()

np.random.seed(42)
metasploit_test_files = np.random.choice(
    ms_tlsdata[~ms_tlsdata['file'].isin(tls12long_files)]['file'].unique(),
    ms_tlsdata[~ms_tlsdata['file'].isin(tls12long_files)]['file'].nunique()//2,
    replace=False
)

all_test_files = np.append(metasploit_test_files, tls12long_files)

In [15]:
# Out of the remaining files, we will also take 50% of traffic for testing
# (can we effectively catch traffic from deployments we have seen before?)
metasploit_train_samples, metasploit_test_samples_short12 = train_test_split(
    ms_tlsdata[~ms_tlsdata['file'].isin(all_test_files)].index,
    test_size=0.9,
    random_state=42,
)

metasploit_test_samples = np.append(ms_tlsdata[ms_tlsdata['file'].isin(all_test_files)].index, metasploit_test_samples_short12)

In [16]:
# Now we combine MS 1.2 short (train), with the previous training samples obtained from the StratifiedGroupKFold.
# These are used with Cross Validation to train RFCs and DecTrees. CV should be sgkf as well.
# Using CV we will choose the best parameters (tree depth, number of estimators, whatever...),
# and then use those parameters to train a "final model", that we will apply to the "test set".

train_df = shuffle(pd.concat([
    ms_tlsdata.loc[metasploit_train_samples],
    full_df_tlsdata.loc[train_index]
], ignore_index=True), random_state=42).reset_index(drop=True)

display(train_df['source'].value_counts())

x_train = train_df.loc[:,joy_indices_tls].values.astype(np.float64)
y_train = train_df['c2'].values
groups_train = train_df['file'].values

source
tranco    160934
MTA        26974
MS          7314
Name: count, dtype: int64

In [17]:
tranco_test_half = {
    '12': full_df_tlsdata.loc[test_index][(full_df_tlsdata.loc[test_index, 'source'] == 'tranco')].loc[:,joy_indices_tls_half].values.astype(np.float64),
    '13filter': tranco_13filtered.loc[:, joy_indices_tls_half].values.astype(np.float64),
    '13behav': tranco_13behav.loc[:, joy_indices_tls_half].values.astype(np.float64),
}
mta_test_half = full_df_tlsdata.loc[test_index][full_df_tlsdata.loc[test_index, 'source'] == 'MTA'].loc[:,joy_indices_tls_half].values.astype(np.float64)
mta_test_y = full_df_tlsdata.loc[test_index][full_df_tlsdata.loc[test_index, 'source'] == 'MTA'].loc[:,'c2'].values
ms_test_half = {
    '12': {"short": ms_tlsdata.loc[metasploit_test_samples].loc[(ms_tlsdata.loc[metasploit_test_samples]['certver'] == 'shortcert'), joy_indices_tls_half].values.astype(np.float64),
           "long":  ms_tlsdata.loc[metasploit_test_samples].loc[(ms_tlsdata.loc[metasploit_test_samples]['certver'] == 'longcert'), joy_indices_tls_half].values.astype(np.float64)},
    '13filter': {"short": ms13_filtered.loc[(ms13_filtered['certver'] == 'shortcert'), joy_indices_tls_half].values.astype(np.float64),
                 "long":  ms13_filtered.loc[(ms13_filtered['certver'] == 'longcert'), joy_indices_tls_half].values.astype(np.float64)},
    '13behav': {"short": ms_tlsdata13.loc[(ms_tlsdata13['certver'] == 'shortcert'), joy_indices_tls_half].values.astype(np.float64),
                 "long":  ms_tlsdata13.loc[(ms_tlsdata13['certver'] == 'longcert'), joy_indices_tls_half].values.astype(np.float64)}
}

In [18]:
doh_test_half = doh_behav.loc[:, joy_indices_tls_half].values.astype(np.float64)
x_train_half = train_df.loc[:,joy_indices_tls_half].values.astype(np.float64)

In [19]:
# Add some error to the training to see what happens.
# 10 sizes, 10 directions. But we only add errror to directions.
error_mask = np.array(x_train_half != -1).astype(int) * np.concatenate((np.ones((x_train_half.shape[0],10)), np.zeros((x_train_half.shape[0],10))), axis=1)
rng = np.random.default_rng(42)
error_vals = rng.noncentral_chisquare(3, 10, error_mask.shape).astype(int)-10
error_to_add = error_mask * error_vals

In [20]:
x_train_half_err = x_train_half + error_to_add
x_train_half_err_noerr, y_train_err_noerr, groups_err_noerr = shuffle(np.concatenate([x_train_half_err, x_train_half]), np.concatenate([y_train, y_train]), np.concatenate([groups_train, groups_train]), random_state=42)

## Training

In [21]:
scoring = {"AUC": "roc_auc",
           "Accuracy": "accuracy",
           "Balanced Accracy": "balanced_accuracy",
           "Recall": "recall",
           "Neg Recall": make_scorer(recall_score, pos_label=0),
           "F1": "f1"}

sgkf = StratifiedGroupKFold(n_splits=10)

In [22]:
dtc_classifier = dtc(criterion = 'gini')
parameters = {'max_depth':list(range(1,20))+[30,40,50,60], 
              'criterion': ['gini', 'entropy'], 
              'splitter': ['best', 'random'],
              'class_weight': [None, 'balanced']}
clf = GridSearchCV(dtc_classifier, parameters, n_jobs=-1, scoring=scoring, refit='F1', cv=sgkf)

In [28]:
# Skip this if not performing grid search again!
clf2 = GridSearchCV(dtc_classifier, parameters, n_jobs=-1, scoring=scoring, refit='F1', cv=sgkf)
clf2.fit(x_train_half_err_noerr, y_train_err_noerr, groups=groups_err_noerr)
print(clf2.best_score_, clf2.best_params_)

0.9218531864086035 {'class_weight': None, 'criterion': 'entropy', 'max_depth': 16, 'splitter': 'best'}


In [31]:
# Skip this if not performing grid search again!
# Now that we have the "best parameters", print some metrics:
scores_dtc2 = cross_validate(clf2.best_estimator_, x_train_half_err_noerr, y_train_err_noerr, groups=groups_err_noerr, cv=sgkf, n_jobs=-1, scoring=scoring, return_estimator=True)
display(scores_dtc2)

# Print the average feature importance for all CV tree classifiers:
feature_importances2 = []
for idx,estimator in enumerate(scores_dtc2['estimator']):
    feature_importances2.append(pd.DataFrame(estimator.feature_importances_,
                                            index = joy_indices_tls_half,
                                            columns=['importance'+str(idx)]).sort_values('importance'+str(idx), ascending=False))

all_feature_importances2 = reduce(lambda left,right: pd.merge(left,right,how='outer',
                                                             left_index=True,
                                                             right_index=True), feature_importances2)
display(all_feature_importances2.mean(axis=1).sort_values(ascending=False).head(10))

{'fit_time': array([1.36347556, 1.32792354, 1.37320709, 1.4528327 , 1.32286787,
        1.34773111, 1.34274888, 1.33639812, 1.293612  , 1.22569013]),
 'score_time': array([0.0684824 , 0.06711674, 0.0736413 , 0.07245111, 0.06796002,
        0.06784558, 0.07728744, 0.06751323, 0.06774068, 0.06564355]),
 'estimator': [DecisionTreeClassifier(criterion='entropy', max_depth=16),
  DecisionTreeClassifier(criterion='entropy', max_depth=16),
  DecisionTreeClassifier(criterion='entropy', max_depth=16),
  DecisionTreeClassifier(criterion='entropy', max_depth=16),
  DecisionTreeClassifier(criterion='entropy', max_depth=16),
  DecisionTreeClassifier(criterion='entropy', max_depth=16),
  DecisionTreeClassifier(criterion='entropy', max_depth=16),
  DecisionTreeClassifier(criterion='entropy', max_depth=16),
  DecisionTreeClassifier(criterion='entropy', max_depth=16),
  DecisionTreeClassifier(criterion='entropy', max_depth=16)],
 'test_AUC': array([0.96044099, 0.93035622, 0.98734648, 0.97946041, 0.9779

tls_b_0      0.704533
tls_b_1      0.156534
tls_dir_1    0.049971
tls_b_2      0.033264
tls_b_3      0.016351
tls_dir_2    0.009563
tls_dir_0    0.009339
tls_dir_3    0.006706
tls_b_5      0.002509
tls_b_4      0.002362
dtype: float64

In [25]:
# 0.9218531864086035 {'class_weight': None, 'criterion': 'entropy', 'max_depth': 16, 'splitter': 'best'}
clf2 = dtc(class_weight=None, criterion='entropy', max_depth=16, splitter='best')
clf2.fit(x_train_half_err_noerr, y_train_err_noerr)

In [29]:
print(" & ".join('{:.2f}'.format(x) for x in [ 
                   sum(clf2.predict(mta_test_half)[np.where(mta_test_y==0)] == 0) / sum(mta_test_y==0) *100,
                   sum(clf2.predict(mta_test_half)[np.where(mta_test_y)]) / sum(mta_test_y) *100,
                   sum(clf2.predict(tranco_test_half['12'])==0)/len(tranco_test_half['12']) * 100,
                   sum(clf2.predict(ms_test_half['12']["short"])==1)/len(ms_test_half['12']["short"]) * 100,
                   sum(clf2.predict(ms_test_half['12']["long"])==1)/len(ms_test_half['12']["long"]) * 100
]))

93.90 & 95.64 & 99.93 & 98.97 & 98.85


In [34]:
print(" & ".join('{:.2f}'.format(x) for x in [ 
                   sum(clf2.predict(mta_test_half)[np.where(mta_test_y==0)] == 0) / sum(mta_test_y==0) *100,
                   sum(clf2.predict(mta_test_half)[np.where(mta_test_y)]) / sum(mta_test_y) *100,
                   sum(clf2.predict(tranco_test_half['12'])==0)/len(tranco_test_half['12']) * 100,
                   sum(clf2.predict(ms_test_half['12']["short"])==1)/len(ms_test_half['12']["short"]) * 100,
                   sum(clf2.predict(ms_test_half['12']["long"])==1)/len(ms_test_half['12']["long"]) * 100
]))

93.90 & 95.64 & 99.93 & 98.97 & 98.85


In [31]:
print(" & ".join('{:.2f}'.format(x) for x in [ 
                   sum(clf2.predict(tranco_test_half['13filter'])==0)/len(tranco_test_half['13filter']) * 100,
                   sum(clf2.predict(ms_test_half['13filter']["short"])==1)/len(ms_test_half['13filter']["short"]) * 100,
                   sum(clf2.predict(ms_test_half['13filter']["long"])==1)/len(ms_test_half['13filter']["long"]) * 100
]))

89.90 & 0.00 & 0.00


In [32]:
print(" & ".join('{:.2f}'.format(x) for x in [ 
                   sum(clf2.predict(tranco_test_half['13behav'])==0)/len(tranco_test_half['13behav']) * 100,
                   sum(clf2.predict(ms_test_half['13behav']["short"])==1)/len(ms_test_half['13behav']["short"]) * 100,
                   sum(clf2.predict(ms_test_half['13behav']["long"])==1)/len(ms_test_half['13behav']["long"]) * 100,
                   sum(clf2.predict(doh_test_half)==0)/len(doh_test_half) * 100
]))

98.64 & 93.93 & 93.54 & 97.27


In [33]:
# Motivating Example:
np.random.seed(42)

me_tranco12 = clf2.predict(tranco_test_half['12'][np.random.choice(range(tranco_test_half['12'].shape[0]),10000,replace=False)])
me_tranco13 = clf2.predict(tranco_test_half['13behav'][np.random.choice(range(tranco_test_half['13behav'].shape[0]),10000,replace=False)])
me_ms12 = clf2.predict(ms_test_half['12']["short"][np.random.choice(range(ms_test_half['12']["short"].shape[0]),75,replace=False)])
me_ms13short = clf2.predict(ms_test_half['13behav']["short"][np.random.choice(range(ms_test_half['13behav']["short"].shape[0]),75,replace=False)])
me_ms13long = clf2.predict(ms_test_half['13behav']["long"][np.random.choice(range(ms_test_half['13behav']["long"].shape[0]),50,replace=False)])

print(" & ".join('{:.2f}'.format(x) for x in [
    ((sum(me_tranco12 == 0)+sum(me_tranco13 == 0)) / (len(me_tranco12)+len(me_tranco13)) * 100),
    ((sum(me_ms12 == 1)+sum(me_ms13short == 1)+sum(me_ms13long == 1)) / (len(me_ms12)+len(me_ms13short)+len(me_ms13long)) * 100)
]))

99.30 & 93.50


## What about RFs?

In [34]:
# Skip this if not performing grid search again!
rfc = RandomForestClassifier(random_state=42)
parameters_rf = {'n_estimators':[40, 50, 60, 70],
                 'max_depth': [5, 10, 15, 20, 25, None],
                 'max_features': [None, 'sqrt', 'log2'],
                 'class_weight': [None, 'balanced']
                }
#parameters_rf = {'n_estimators':[40,50,60,70], 'max_depth': [5, 10, 20, None]}
clf_rf = GridSearchCV(rfc, parameters_rf, n_jobs=-1, scoring=scoring, refit='F1', cv=sgkf)

In [40]:
# Skip this if not performing grid search again!
clf_rf.fit(x_train_half_err_noerr, y_train_err_noerr, groups=groups_err_noerr)
print(clf_rf.best_score_, clf_rf.best_params_)

0.9334827339968618 {'class_weight': 'balanced', 'max_depth': 20, 'max_features': 'sqrt', 'n_estimators': 70}


In [41]:
# Skip this if not performing grid search again!
scores_rf = cross_validate(clf_rf.best_estimator_, x_train, y_train, groups=groups_train, cv=sgkf, n_jobs=-1, scoring=scoring, return_estimator=True)
display(scores_rf)

{'fit_time': array([7.08787179, 7.39554024, 7.49931502, 7.33064198, 7.16093993,
        7.32096338, 7.15544152, 6.94084287, 7.40243268, 7.14355063]),
 'score_time': array([0.21103644, 0.21999192, 0.22428608, 0.21192789, 0.21284366,
        0.22582054, 0.2346437 , 0.20498586, 0.20584226, 0.21194911]),
 'estimator': [RandomForestClassifier(class_weight='balanced', max_depth=20, n_estimators=70,
                         random_state=42),
  RandomForestClassifier(class_weight='balanced', max_depth=20, n_estimators=70,
                         random_state=42),
  RandomForestClassifier(class_weight='balanced', max_depth=20, n_estimators=70,
                         random_state=42),
  RandomForestClassifier(class_weight='balanced', max_depth=20, n_estimators=70,
                         random_state=42),
  RandomForestClassifier(class_weight='balanced', max_depth=20, n_estimators=70,
                         random_state=42),
  RandomForestClassifier(class_weight='balanced', max_depth=20, n

In [38]:
# Skip this if not performing grid search again!
# Print the average feature importance for all CV tree classifiers:
feature_importances = []
for idx,estimator in enumerate(scores_rf['estimator']):
    feature_importances.append(pd.DataFrame(estimator.feature_importances_,
                                            index = joy_indices_tls,
                                            columns=['importance'+str(idx)]).sort_values('importance'+str(idx), ascending=False))

all_feature_importances = reduce(lambda left,right: pd.merge(left,right,how='outer',
                                                             left_index=True,
                                                             right_index=True), feature_importances)
display(all_feature_importances.mean(axis=1).sort_values(ascending=False).head(10))

tls_b_0      0.371215
tls_b_1      0.187003
tls_b_2      0.099558
tls_dir_0    0.084633
tls_b_3      0.049187
tls_dir_3    0.037631
tls_dir_1    0.035464
tls_b_4      0.032881
tls_dir_2    0.025541
tls_b_5      0.018170
dtype: float64

In [34]:
# 0.9334827339968618 {'class_weight': 'balanced', 'max_depth': 20, 'max_features': 'sqrt', 'n_estimators': 70}
clf_rf = RandomForestClassifier(random_state=42, class_weight='balanced', max_depth=20, max_features='sqrt', n_estimators=70)
clf_rf.fit(x_train_half_err_noerr, y_train_err_noerr)

In [35]:
print(" & ".join('{:.2f}'.format(x) for x in [ 
                   sum(clf_rf.predict(mta_test_half)[np.where(mta_test_y==0)] == 0) / sum(mta_test_y==0) *100,
                   sum(clf_rf.predict(mta_test_half)[np.where(mta_test_y)]) / sum(mta_test_y) *100,
                   sum(clf_rf.predict(tranco_test_half['12'])==0)/len(tranco_test_half['12']) * 100,
                   sum(clf_rf.predict(ms_test_half['12']["short"])==1)/len(ms_test_half['12']["short"]) * 100,
                   sum(clf_rf.predict(ms_test_half['12']["long"])==1)/len(ms_test_half['12']["long"]) * 100
]))

91.58 & 97.59 & 99.85 & 99.16 & 99.03


In [36]:
print(" & ".join('{:.2f}'.format(x) for x in [ 
                   sum(clf_rf.predict(tranco_test_half['13filter'])==0)/len(tranco_test_half['13filter']) * 100,
                   sum(clf_rf.predict(ms_test_half['13filter']["short"])==1)/len(ms_test_half['13filter']["short"]) * 100,
                   sum(clf_rf.predict(ms_test_half['13filter']["long"])==1)/len(ms_test_half['13filter']["long"]) * 100
]))

100.00 & 0.00 & 0.00


In [37]:
print(" & ".join('{:.2f}'.format(x) for x in [ 
                   sum(clf_rf.predict(tranco_test_half['13behav'])==0)/len(tranco_test_half['13behav']) * 100,
                   sum(clf_rf.predict(ms_test_half['13behav']["short"])==1)/len(ms_test_half['13behav']["short"]) * 100,
                   sum(clf_rf.predict(ms_test_half['13behav']["long"])==1)/len(ms_test_half['13behav']["long"]) * 100,
                   sum(clf_rf.predict(doh_test_half)==0)/len(doh_test_half) * 100
]))

99.78 & 93.87 & 93.21 & 99.42


In [38]:
# Motivating Example:
np.random.seed(42)

me_tranco12 = clf_rf.predict(tranco_test_half['12'][np.random.choice(range(tranco_test_half['12'].shape[0]),10000,replace=False)])
me_tranco13 = clf_rf.predict(tranco_test_half['13behav'][np.random.choice(range(tranco_test_half['13behav'].shape[0]),10000,replace=False)])
me_ms12 = clf_rf.predict(ms_test_half['12']["short"][np.random.choice(range(ms_test_half['12']["short"].shape[0]),75,replace=False)])
me_ms13short = clf_rf.predict(ms_test_half['13behav']["short"][np.random.choice(range(ms_test_half['13behav']["short"].shape[0]),75,replace=False)])
me_ms13long = clf_rf.predict(ms_test_half['13behav']["long"][np.random.choice(range(ms_test_half['13behav']["long"].shape[0]),50,replace=False)])

print(" & ".join('{:.2f}'.format(x) for x in [
    ((sum(me_tranco12 == 0)+sum(me_tranco13 == 0)) / (len(me_tranco12)+len(me_tranco13)) * 100),
    ((sum(me_ms12 == 1)+sum(me_ms13short == 1)+sum(me_ms13long == 1)) / (len(me_ms12)+len(me_ms13short)+len(me_ms13long)) * 100)
]))

99.83 & 93.50


## UCNet

In [None]:
realnet_all12 = pd.read_csv('data/ucnet/all_data12.csv', index_col=0)
realnet_all13filter = pd.read_csv('data/ucnet/all_data13.csv', index_col=0)

In [None]:
realnet_all13behav = pd.read_csv('data/ucnet/all_behav13_new.csv', index_col=0)

realnet_test = {
    '12': realnet_all12.loc[:,joy_indices_tls_half].values.astype(np.float64),
    '13filter': realnet_all13filter.loc[:, joy_indices_tls_half].values.astype(np.float64),
    '13behav': realnet_all13behav.loc[:, joy_indices_tls_half].values.astype(np.float64),
}

In [82]:
print(sum(clf2.predict(realnet_test['12'])==0) / len(realnet_test['12']),
      sum(clf_rf.predict(realnet_test['12'])==0) / len(realnet_test['12']))

print(sum(clf2.predict(realnet_test['13filter'])==0) / len(realnet_test['13filter']),
      sum(clf_rf.predict(realnet_test['13filter'])==0) / len(realnet_test['13filter']))

print(sum(clf2.predict(realnet_test['13behav'])==0) / len(realnet_test['13behav']),
      sum(clf_rf.predict(realnet_test['13behav'])==0) / len(realnet_test['13behav']))

0.9039330321872493 0.9569316872825084
0.8782679340038034 0.9999989584481572
0.9228153058268326 0.9634656606143835
