# Encrypted Echoes

Note that the implementation of the data preprocessing follows the paper: Using TLS Fingerprints for OS Identification in Encrypted Traffic. Refer to the following link for implementation details: <a href="https://ieeexplore.ieee.org/document/9110319">link</a>. 

In summary, we treat each feature of the data as a binary one-hot-encoded vector in which order is preserved. Each encoded feature then represents the absence or presence of a specific encryption type or suite at a given position. We then train XGBoost on the encoded dataset. 

## File Requirements: 
Please note that the "flows_anonymized" fileset must be placed in the same directory as this notebook; it contains the raw data that is parsed and then fed to XGBoost. For the moment, the model uses only a subset of the data (the first 15 ground-truth files) because of the size of the flows_anonymized dataset. To get a better sense of accuracy on the entire dataset, the workload will need to be moved to the cloud. Access the fileset here: <a href="https://zenodo.org/records/3461771">dataset</a>

In [222]:
import joblib
import glob

import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    precision_recall_fscore_support,
    confusion_matrix,
)


In [None]:
# Only the first MAX_FILES ground-truth files (in sorted order) are loaded so
# the notebook stays runnable on a single machine; raise it to use more data.
MAX_FILES = 15
files = sorted(glob.glob("flows_anonymized/*_ground_truth.csv"))[:MAX_FILES]

# Fail early with an actionable message instead of letting pd.concat raise
# "No objects to concatenate" when the dataset directory is missing.
if not files:
    raise FileNotFoundError(
        "No '*_ground_truth.csv' files found under 'flows_anonymized/'. "
        "Place the flows_anonymized fileset in the same directory as this notebook."
    )

KEEP_COLS = [
    "TLS Client Version",
    "Client Cipher Suites",
    "TLS Extension Types",
    "TLS Extension Lengths",
    "TLS Elliptic Curves",
    "Ground Truth OS",
]

# dtype=str keeps every field as the literal text found in the CSV. Without it
# pandas infers a dtype per file, so a hex column whose values all happen to
# look numeric in one file would be parsed as numbers (losing leading zeros)
# and later break the hex tokenisation.
df_list = [
    pd.read_csv(path, usecols=KEEP_COLS, dtype=str, low_memory=False)
    for path in files
]

tls_df = pd.concat(df_list, ignore_index=True)
print("Combined shape:", tls_df.shape)
print(tls_df.columns)

Combined shape: (2033850, 6)
Index(['TLS Client Version', 'Client Cipher Suites', 'TLS Extension Types',
       'TLS Extension Lengths', 'TLS Elliptic Curves', 'Ground Truth OS'],
      dtype='object')


In [209]:
# Keep only flows where every column is populated, then renumber the rows
# 0..n-1 so the frame has a clean positional index for the later steps.
rows_complete = tls_df.notna().all(axis=1)
tls_df = tls_df.loc[rows_complete].reset_index(drop=True)
tls_df

Unnamed: 0,TLS Client Version,Client Cipher Suites,TLS Extension Types,TLS Extension Lengths,TLS Elliptic Curves,Ground Truth OS
0,TLSv1.2,4a4a0113021303132bc02fc02cc030c0,0a0a0000170001ff0a000b00230005000d00120033002d...,00001700000001000a000200c0000500140000002b0002...,6a6a1d00170018000000000000000000,Mac OS
1,TLSv1.2,dada0113021303132bc02fc02cc030c0,1a1a0000170001ff0a000b002300100005000d00120033...,00001800000001000a00020000000e000500140000002b...,caca1d00170018000000000000000000,Android
2,TLSv1.2,2bc02cc0a9cc2fc030c0a8cc09c00ac0,01ff0000170023000d00050010000b000a00ffffffffff...,0100160000000000100005000b0002000800ffffffffff...,1d001700180000000000000000000000,Android
3,TLSv1.2,6a6a0113021303132bc02fc02cc030c0,baba0000170001ff0a000b002300100005000d00120033...,00002200000001000a000200c0000e000500140000002b...,1a1a1d00170018000000000000000000,Mac OS
4,TLSv1.2,2bc02cc0a9cc2fc030c0a8cc09c00ac0,01ff000017000d0005000b000a00ffffffffffffffffff...,01001b0000001000050002000800ffffffffffffffffff...,1d001700180000000000000000000000,Android
...,...,...,...,...,...,...
1240762,SSLv2.0,00000000000000000000000000000000,ffffffffffffffffffffffffffffffffffffffffffffff...,ffffffffffffffffffffffffffffffffffffffffffffff...,00000000000000000000000000000000,Windows
1240763,SSLv2.0,00000000000000000000000000000000,ffffffffffffffffffffffffffffffffffffffffffffff...,ffffffffffffffffffffffffffffffffffffffffffffff...,00000000000000000000000000000000,Windows
1240764,SSLv2.0,00000000000000000000000000000000,ffffffffffffffffffffffffffffffffffffffffffffff...,ffffffffffffffffffffffffffffffffffffffffffffff...,00000000000000000000000000000000,Windows
1240765,SSLv2.0,00000000000000000000000000000000,ffffffffffffffffffffffffffffffffffffffffffffff...,ffffffffffffffffffffffffffffffffffffffffffffff...,00000000000000000000000000000000,Windows


In [210]:
def parse_hex_list(raw_str: str) -> list[str]:
    """Tokenise a hex-encoded field into lower-case 4-character chunks.

    Parameters
    ----------
    raw_str : str
        Text holding hexadecimal digits. Any character that is not a hex
        digit is discarded before chunking. A missing value (as judged by
        ``pd.isna``) is accepted as well.

    Returns
    -------
    list[str]
        Consecutive 4-character slices of the cleaned text, lower-cased.
        The final slice is shorter when the number of hex digits is not a
        multiple of four. Missing input yields an empty list.
    """
    if pd.isna(raw_str):
        return []

    allowed = frozenset("0123456789abcdefABCDEF")
    digits = [ch.lower() for ch in raw_str if ch in allowed]

    tokens = []
    for start in range(0, len(digits), 4):
        tokens.append("".join(digits[start : start + 4]))
    return tokens


# Derived list column -> raw hex column it is tokenised from.
# Insertion order matches the order in which the columns are added.
hex_column_sources = {
    "cipher_list": "Client Cipher Suites",
    "group_list": "TLS Elliptic Curves",
    "ext_id_list": "TLS Extension Types",
    "ext_len_list": "TLS Extension Lengths",
}

for list_col, raw_col in hex_column_sources.items():
    tls_df[list_col] = tls_df[raw_col].apply(parse_hex_list)

In [None]:
def to_slots(series, k, prefix):
    """Expand a Series of token lists into ``k`` fixed positional columns.

    Each list is truncated to its first ``k`` entries, or right-padded with
    the literal ``"MISSING"`` when it is shorter, so that every row has
    exactly ``k`` values.

    Parameters
    ----------
    series : pd.Series
        Series whose elements are lists of tokens.
    k : int
        Number of positional slots (columns) to produce.
    prefix : str
        Column-name prefix; columns are named ``{prefix}_pos0`` through
        ``{prefix}_pos{k-1}``.

    Returns
    -------
    pd.DataFrame
        One row per element of ``series`` and ``k`` columns, carrying the
        same index as ``series``.
    """
    padding = ["MISSING"] * k
    rows = [(list(tokens) + padding)[:k] for tokens in series]
    # Carry the caller's index through: the result is concatenated column-wise
    # with other frames, and pd.concat(axis=1) aligns on index labels. A fresh
    # 0..n-1 index would silently misalign unless the input was already
    # numbered that way.
    return pd.DataFrame(
        rows,
        columns=[f"{prefix}_pos{i}" for i in range(k)],
        index=series.index,
    )


K_CIPHER = 8  # slots kept per flow for cipher-suite tokens (first 8)
K_GROUP = 8  # slots kept per flow for elliptic-curve / supported-group tokens (first 8)
K_EXT = 100  # slots kept per flow for extension-type tokens (first 100)
K_EXLEN = 100  # slots kept per flow for extension-length tokens (first 100)

In [None]:
# (token-list column, number of slots, output column prefix) for each
# positional feature family, in the order the columns appear in X_raw.
slot_specs = [
    ("cipher_list", K_CIPHER, "cipher"),
    ("group_list", K_GROUP, "group"),
    ("ext_id_list", K_EXT, "extid"),
    ("ext_len_list", K_EXLEN, "extlen"),
]

feature_frames = [
    to_slots(tls_df[list_col], width, prefix)
    for list_col, width, prefix in slot_specs
]
# The client version is used as-is and becomes the last feature column.
feature_frames.append(tls_df[["TLS Client Version"]])

X_raw = pd.concat(feature_frames, axis=1)

# Classification target: the ground-truth operating system of each flow.
y = tls_df["Ground Truth OS"]

In [None]:
# One-hot encode every column of X_raw. handle_unknown="ignore" lets the
# fitted encoder accept categories at transform time that it did not see
# during fit instead of raising.
onehot = OneHotEncoder(handle_unknown="ignore")
pre = ColumnTransformer(
    transformers=[("oh", onehot, X_raw.columns)],
    sparse_threshold=0.3,
)

# Expected to come back sparse, since the output is returned as a sparse
# matrix whenever its density is below sparse_threshold.
X_encoded = pre.fit_transform(X_raw)

# Persist the fitted transformer so the same encoding can be re-applied later.
joblib.dump(pre, "tls_onehot_encoder.joblib")

['tls_onehot_encoder.joblib']

In [None]:
# Map OS names to integer codes; os_labels[i] is the name for code i.
y_int, os_labels = pd.factorize(y)

# 70/30 split, stratified on the label so both parts keep the class mix;
# the fixed seed makes the split repeatable.
split_parts = train_test_split(
    X_encoded,
    y_int,
    test_size=0.30,
    random_state=42,
    stratify=y_int,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
import xgboost as xgb

# Hyper-parameters gathered in one place so they are easy to scan and tune.
xgb_params = dict(
    objective="multi:softprob",
    num_class=len(os_labels),
    tree_method="exact",
    max_depth=8,
    n_estimators=400,
    learning_rate=0.05,
)
xgb_clf = xgb.XGBClassifier(**xgb_params)

# Train on the training split, then predict integer class codes for the
# held-out split.
xgb_clf.fit(X_train, y_train)
y_pred = xgb_clf.predict(X_test)

# Overall accuracy on the held-out split.
test_accuracy = accuracy_score(y_test, y_pred)
print("XGBoost accuracy:", test_accuracy)

# Per-class precision / recall / F1, labelled with the OS names.
report_text = classification_report(y_test, y_pred, target_names=os_labels)
print(report_text)

# Rows are true classes and columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion matrix\n", cm)

# Macro averages weight every class equally regardless of its support.
prec, rec, f1, _ = precision_recall_fscore_support(
    y_test, y_pred, average="macro"
)
print(f"Macro‑avg  precision={prec:.4f}  recall={rec:.4f}  f1={f1:.4f}")


XGBoost accuracy: 0.8514927558424742
               precision    recall  f1-score   support

       Mac OS       0.83      0.47      0.60     37242
      Android       0.85      0.96      0.91    183063
      Windows       0.84      0.76      0.80     76857
          iOS       0.85      0.86      0.86     73866
Windows Phone       0.99      0.73      0.84      1203

     accuracy                           0.85    372231
    macro avg       0.87      0.76      0.80    372231
 weighted avg       0.85      0.85      0.84    372231


Confusion matrix
 [[ 17485   5154   4714   9889      0]
 [   199 176295   5741    828      0]
 [   556  17566  58603    122     10]
 [  2744   7237    198  63687      0]
 [     0      1    320      0    882]]
Macro‑avg  precision=0.8747  recall=0.7581  f1=0.8014
