In [132]:
import warnings


# Suppress every warning for the rest of the kernel session.
# NOTE(review): a blanket "ignore" also hides pandas/numpy deprecation and
# chained-assignment warnings - consider narrowing it to specific categories.
warnings.filterwarnings(action="ignore")


import os
import numpy as np
import pandas as pd
from typing import Tuple, List, Dict, Any

# Number of consecutive commands that make up one segment.
SEGMENT_LEN = 100
# The first TRAIN_HEADER_COUNT lines of a user file are treated as training data.
TRAIN_HEADER_COUNT = 5_000
# Expected number of test commands per user file; only used to derive
# TEST_SEGMENT_COUNT below.
TEST_HEADER_COUNT = 10_000

# Segment counts implied by the sizes above.
TRAIN_SEGMENT_COUNT = TRAIN_HEADER_COUNT // SEGMENT_LEN
TEST_SEGMENT_COUNT = TEST_HEADER_COUNT // SEGMENT_LEN

In [42]:
def user_list_to_df(user_id: str, user_data_list: List[str], split: str = "train") -> pd.DataFrame:
    """
    Turn a flat list of commands into a per-command DataFrame.

    Args:
        user_id: identifier stored in the "user" column of every row.
        user_data_list: commands in the order they were issued.
        split: label stored in the "split" column of every row
            (defaults to "train", the value that used to be hard-coded).

    Returns:
        DataFrame with one row per command and the columns
        "cmd" (category dtype), "user", "split" and "segment_id".
        "segment_id" numbers consecutive chunks of SEGMENT_LEN commands
        starting at 0; a trailing chunk shorter than SEGMENT_LEN gets its
        own id instead of causing a length-mismatch error.
    """
    df_user = pd.DataFrame({
        "cmd": user_data_list
    })
    df_user["user"] = user_id
    df_user["split"] = split
    # Row position // SEGMENT_LEN always yields exactly one id per row, whatever the
    # input length; np.repeat over whole segments only fits exact multiples.
    df_user["segment_id"] = np.arange(len(user_data_list)) // SEGMENT_LEN
    df_user["cmd"] = df_user["cmd"].astype("category")
    return df_user


def load_user_data(user_id: str, file_path: os.PathLike) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Read one user's command file and split it into train and test frames.

    The file is read line by line (one command per line, surrounding
    whitespace stripped). The first TRAIN_HEADER_COUNT lines become the
    training frame, every remaining line goes to the test frame.

    Args:
        user_id: identifier stored in the "user" column of both frames.
        file_path: path of the command file to read.

    Returns:
        (train_segments, test_segments) as built by user_list_to_df, with the
        "split" column set to "train" and "test" respectively.
    """
    with open(file_path, "r") as fp:
        user_data = [line.strip() for line in fp]

    # leading lines are the training data, the remainder is the test data
    train_user_data = user_data[:TRAIN_HEADER_COUNT]
    test_user_data = user_data[TRAIN_HEADER_COUNT:]

    # convert to dataframes
    train_segments = user_list_to_df(user_id, train_user_data)
    test_segments = user_list_to_df(user_id, test_user_data)
    # user_list_to_df tags the rows it builds as "train"; relabel the test
    # rows so the "split" column is not misleading.
    test_segments["split"] = "test"

    return train_segments, test_segments


In [44]:
# Load the command history of a single user (User0) from data/<user_id>.
user_id = "User0"
user_file_path = os.path.join("data", user_id)

df_user0_train, df_user0_test = load_user_data(user_id, user_file_path)

# Distinct commands seen in the train split, the test split, and in either.
user_cmd_set_train = set(df_user0_train["cmd"].unique())
user_cmd_set_test = set(df_user0_test["cmd"].unique())

user_cmd_set = user_cmd_set_test | user_cmd_set_train

In [52]:
# Commands that appear only in the test split (never seen during training).
user_cmd_set_test - user_cmd_set_train

{'MediaMai',
 'as1',
 'bc',
 'cfe',
 'convert',
 'deroff',
 'df',
 'driver',
 'du',
 'enscript',
 'ex',
 'finger',
 'frm',
 'hpost',
 'imake',
 'jot',
 'ld_',
 'mail',
 'make',
 'postprin',
 'ppost',
 'ps',
 'ps2pdf',
 'rcp',
 'rsh',
 'scheme',
 'sgihelp',
 'sleep',
 'sort',
 'spell',
 'tcsh',
 'tput',
 'ugen',
 'unzip',
 'uudecode',
 'uuencode',
 'whereis',
 'who',
 'whois',
 'xdvi',
 'xdvi.rea',
 'xemacs-1',
 'xmkmf',
 'xpaint'}

In [87]:
# Give every command an integer code. The commands are sorted first so the
# mapping is reproducible: iterating the raw set orders strings differently
# from one kernel session to the next (string hashes are randomised per process).
cmd_map_code = {cmd: code for code, cmd in enumerate(sorted(user_cmd_set))}

df_user0_train["cmd_code"] = df_user0_train["cmd"].map(cmd_map_code).astype(int)
df_user0_test["cmd_code"] = df_user0_test["cmd"].map(cmd_map_code).astype(int)

In [122]:
def build_segment_features(user_data_df: pd.DataFrame, segment_id: int) -> Dict[str, Any]:
    '''
    Build the feature dict for one segment of a user's command history.

    Feature checklist (V = implemented, X = not implemented yet):
        1.  V - cmd that has been used the most
        2.  X - cmd that has been used the least
        3.  X - count of cmd that has been used the most
        4.  V - len of longest sequence of same cmds
        5.  X - has zip/compression cmds
        6.  X - has encryption cmds
        7.  X - has networking cmds
        8.  V - count distinct of cmds
        9.  V - first cmd
        10. V - last cmd

    Args:
        user_data_df: frame with at least the columns "segment_id", "cmd"
            and "cmd_code"; it is only read, never modified.
        segment_id: value of "segment_id" selecting the rows to summarise.

    Returns:
        Dict with the keys "cmd_most_used", "first_cmd", "last_cmd",
        "unique_cmds" and "longest_same_cmd_sequence".

    Raises:
        IndexError: if no row of user_data_df carries the requested segment_id.
    '''
    segment_df = user_data_df[user_data_df["segment_id"] == segment_id]
    segment_cmds = segment_df["cmd"]

    # value_counts() sorts by descending frequency, so its first index entry
    # is the most frequently used command of the segment.
    cmd_counts = segment_cmds.value_counts()

    segment_features = {
        "cmd_most_used": cmd_counts.index[0],
        "first_cmd": segment_cmds.iloc[0],
        # iloc[-1] is the segment's last row whatever the segment length is
        "last_cmd": segment_cmds.iloc[-1],
        "unique_cmds": segment_cmds.nunique(dropna=False),
    }

    # Longest run of identical consecutive commands:
    # a row starts a new run when its code differs from the previous row's
    # (the first row always does, because it is compared against NaN);
    # the running sum of those starts gives every row the id of its run,
    # and the most frequent run id belongs to the longest run.
    cmd_codes = segment_df["cmd_code"]
    run_starts = cmd_codes.ne(cmd_codes.shift())
    run_ids = run_starts.cumsum()
    segment_features["longest_same_cmd_sequence"] = int(run_ids.value_counts().max())

    return segment_features



In [133]:
# Build one feature record per segment, in ascending segment order.
# The segment ids are read from the frames themselves rather than from a
# hard-coded count: the previous `range(TEST_SEGMENT_COUNT - 1)` silently
# skipped the last test segment whenever the file held the expected number.
train_segment_features_list = [
    build_segment_features(df_user0_train, int(segment_id))
    for segment_id in sorted(df_user0_train["segment_id"].unique())
]

test_segment_features_list = [
    build_segment_features(df_user0_test, int(segment_id))
    for segment_id in sorted(df_user0_test["segment_id"].unique())
]

In [134]:
# One row per segment, one column per feature, for each split.
df_train, df_test = map(
    pd.DataFrame.from_records,
    (train_segment_features_list, test_segment_features_list),
)