In [4]:
import warnings


warnings.filterwarnings(action="ignore")


import os
import numpy as np
import pandas as pd
from typing import Tuple, List, Dict, Any


SEGMENT_LEN = 100
TRAIN_HEADER_COUNT = 5000
TEST_HEADER_COUNT = 10000

TRAIN_SEGMENT_COUNT = TRAIN_HEADER_COUNT // SEGMENT_LEN
TEST_SEGMENT_COUNT = TEST_HEADER_COUNT // SEGMENT_LEN


# 90 of the test segments are genuine (i.e., benign) and 10 segments are entered bya masquerader (randomly sorted).

In [17]:
#   load answer for classifcation
gt_df = pd.read_csv("challengeToFill.csv", index_col=0)
gt_df.T[TRAIN_SEGMENT_COUNT:]

Unnamed: 0,User0,User1,User2,User3,User4,User5,User6,User7,User8,User9,...,User30,User31,User32,User33,User34,User35,User36,User37,User38,User39
5000-5100,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,,,,,,,,,,
5100-5200,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
5200-5300,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,,,,,,,,,,
5300-5400,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
5400-5500,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14500-14600,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
14600-14700,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
14700-14800,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
14800-14900,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


In [5]:
def user_list_to_df(user_id: str, user_data_list: List[str]):
    df_user = pd.DataFrame({
        "cmd": user_data_list
    })
    df_user["user"] = user_id
    df_user["split"] = "train"
    df_user["segment_id"] = np.repeat(range(0, int(len(user_data_list) / SEGMENT_LEN)), SEGMENT_LEN)
    df_user["cmd"] = df_user["cmd"].astype("category")
    return df_user


def load_user_data(user_id: str, file_path: os.PathLike) -> Tuple[pd.DataFrame, pd.DataFrame]:
    with open(file_path, "r") as fp:
        user_data = fp.readlines()
        user_data = [s.strip() for s in user_data]

    # get training data:
    train_user_data = user_data[:TRAIN_HEADER_COUNT]
    test_user_data = user_data[TRAIN_HEADER_COUNT:]

    # convert to dataframes
    train_segments = user_list_to_df(user_id, train_user_data)
    test_segments = user_list_to_df(user_id, test_user_data)

    return train_segments, test_segments


In [6]:
''' load all user data '''

user_id = "User0"
user_file_path = os.path.join("data", user_id)

df_user0_train, df_user0_test = load_user_data(user_id, user_file_path)

''' all commands '''

user_cmd_set_train = set(df_user0_train["cmd"].unique())
user_cmd_set_test = set(df_user0_test["cmd"].unique())

user_cmd_set = user_cmd_set_test.union(user_cmd_set_train)

In [7]:
user_cmd_set_test.difference(user_cmd_set_train)

{'MediaMai',
 'as1',
 'bc',
 'cfe',
 'convert',
 'deroff',
 'df',
 'driver',
 'du',
 'enscript',
 'ex',
 'finger',
 'frm',
 'hpost',
 'imake',
 'jot',
 'ld_',
 'mail',
 'make',
 'postprin',
 'ppost',
 'ps',
 'ps2pdf',
 'rcp',
 'rsh',
 'scheme',
 'sgihelp',
 'sleep',
 'sort',
 'spell',
 'tcsh',
 'tput',
 'ugen',
 'unzip',
 'uudecode',
 'uuencode',
 'whereis',
 'who',
 'whois',
 'xdvi',
 'xdvi.rea',
 'xemacs-1',
 'xmkmf',
 'xpaint'}

In [8]:
cmd_map_code = {c: i for i, c in enumerate(user_cmd_set)}

df_user0_train["cmd_code"] = df_user0_train["cmd"].map(cmd_map_code).astype(int)
df_user0_test["cmd_code"] = df_user0_test["cmd"].map(cmd_map_code).astype(int)

In [9]:
def build_segment_features(user_data_df: pd.DataFrame, segment_id: int) -> Dict[str, Any]:

    '''
    build features per segment:
        1.  V - cmd that has been used the most
        2.  X - cmd that has been used the least
        3.  X - count of cmd that has been used the most
        4.  V - len of longest sequence of same cmds
        5.  X - has zip/compression cmds
        6.  X - has encryption cmds
        7.  X - has networking cmds
        8.  V - count distinct of cmds
        9.  V - first cmd
        10. V - last cmd
        11. X - has command that user never used in training
    '''

    segment_features = {}

    segment_df = user_data_df[user_data_df["segment_id"] == segment_id]
    segment_cmd_value_counts = segment_df["cmd"].value_counts().to_dict()
    segment_df["cmd_count"] = segment_df["cmd"].map(segment_cmd_value_counts)

    # features
    segment_features["cmd_most_used"] = next(iter(segment_cmd_value_counts))
    segment_features["first_cmd"] = segment_df["cmd"].iloc[0]
    segment_features["last_cmd"] = segment_df["cmd"].iloc[SEGMENT_LEN - 1]
    segment_features["unique_cmds"] = len(segment_df["cmd"].unique())

    # longest subsequence of same commands
    s = segment_df["cmd_code"].diff().astype(bool)
    segment_features["longest_same_cmd_sequence"] = (~s).cumsum()[s].value_counts().max()  # TODO - there is a bug here, getting 100s

    return segment_features



In [10]:
train_segment_features_list = []
test_segment_features_list = []

for i in range(TRAIN_SEGMENT_COUNT):
    train_segment_features_list.append(build_segment_features(df_user0_train, i))

for i in range(TEST_SEGMENT_COUNT - 1):
    test_segment_features_list.append(build_segment_features(df_user0_test, i))

In [11]:
segment_df_train = pd.DataFrame.from_records(train_segment_features_list)
segment_df_test = pd.DataFrame.from_records(test_segment_features_list)

In [12]:
pd.get_dummies(segment_df_train, columns=["first_cmd", "last_cmd"], prefix="is" ,dtype=float)

Unnamed: 0,cmd_most_used,unique_cmds,longest_same_cmd_sequence,is_.xsessio,is_awk,is_cat,is_chmod,is_date,is_find,is_generic,...,is_post,is_sed,is_sendmail,is_sh,is_touch,is_troff,is_uname,is_which,is_whoami,is_xgvis
0,sh,18,8,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,uname,18,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,ksh,29,28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,sh,24,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,sed,24,11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,ls,34,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6,sh,22,13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7,sh,22,17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,sh,15,17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,sh,13,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
