In [20]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
import warnings

warnings.filterwarnings(action="ignore")

import os
import numpy as np
import pandas as pd
from typing import Tuple, List, Dict, Any
from constants import *



# 90 of the test segments are genuine (i.e., benign) and 10 segments are entered bya masquerader (randomly sorted).

In [22]:
#   load answer for classifcation
gt_df = pd.read_csv("challengeToFill.csv", index_col=0)
gt_df.T[TRAIN_SEGMENT_COUNT:]



dev_set_df = gt_df.T.iloc[TRAIN_SEGMENT_COUNT:, :DEV_USERS_COUNT]

In [23]:
def evaluate_user_results(pred_series: pd.Series, true_series: pd.Series):
    ''' evaluate the classification result of a single user '''
    assert len(pred_series) == len(true_series)
    assert pred_series.name == true_series.name

    equal_series = pred_series == true_series
    return equal_series.mean()

In [24]:
def user_list_to_df(user_id: str, user_data_list: List[str]):
    df_user = pd.DataFrame({
        "cmd": user_data_list
    })
    df_user["user"] = user_id
    df_user["split"] = "train"
    df_user["segment_id"] = np.repeat(range(0, int(len(user_data_list) / SEGMENT_LEN)), SEGMENT_LEN)
    df_user["cmd"] = df_user["cmd"].astype("category")
    return df_user


def load_user_data(user_id: str, file_path: os.PathLike) -> Tuple[pd.DataFrame, pd.DataFrame]:
    with open(file_path, "r") as fp:
        user_data = fp.readlines()
        user_data = [s.strip() for s in user_data]

    # get training data:
    train_user_data = user_data[:TRAIN_HEADER_COUNT]
    test_user_data = user_data[TRAIN_HEADER_COUNT:]

    # convert to dataframes
    train_segments = user_list_to_df(user_id, train_user_data)
    test_segments = user_list_to_df(user_id, test_user_data)

    return train_segments, test_segments


In [25]:
''' load all user data '''

user_id = "User5"
user_file_path = os.path.join("data", user_id)

df_user0_train, df_user0_test = load_user_data(user_id, user_file_path)
anomaly_ground_truth = gt_df.T[user_id].values.astype(int)[TRAIN_SEGMENT_COUNT:]# test set only

''' all commands '''

user_cmd_set_train = set(df_user0_train["cmd"].unique())
user_cmd_set_test = set(df_user0_test["cmd"].unique())

user_cmd_set = user_cmd_set_test.union(user_cmd_set_train)

In [26]:
# cmds that appear in the test and not in train
user_cmd_set_not_in_train = user_cmd_set_test.difference(user_cmd_set_train)

In [27]:
cmd_map_code = {c: i for i, c in enumerate(user_cmd_set)}
MAP_CODE_LEN = len(cmd_map_code)

df_user0_train["cmd_code"] = df_user0_train["cmd"].map(cmd_map_code).astype(int)
df_user0_test["cmd_code"] = df_user0_test["cmd"].map(cmd_map_code).astype(int)

In [28]:
from features.segment import build_segment_features


train_segment_features_list = []
test_segment_features_list = []

for i in range(TRAIN_SEGMENT_COUNT):
    train_segment_features_list.append(build_segment_features(df_user0_train, user_cmd_set_not_in_train ,i))

for i in range(TEST_SEGMENT_COUNT):
    test_segment_features_list.append(build_segment_features(df_user0_test, user_cmd_set_not_in_train ,i))

In [29]:
segment_df_train = pd.DataFrame.from_records(train_segment_features_list)
segment_df_test = pd.DataFrame.from_records(test_segment_features_list)

In [30]:
# categorize_commands
segment_df_train["cmd_most_used_code"] = pd.Categorical(segment_df_train["cmd_most_used"].astype("category"),
                                                        categories=user_cmd_set)
segment_df_train["first_cmd_code"] = pd.Categorical(segment_df_train["first_cmd"], categories=user_cmd_set)
segment_df_train["last_cmd_code"] = pd.Categorical(segment_df_train["last_cmd"], categories=user_cmd_set)

segment_df_test["cmd_most_used_code"] = pd.Categorical(segment_df_test["cmd_most_used"], categories=user_cmd_set)
segment_df_test["first_cmd_code"] = pd.Categorical(segment_df_test["first_cmd"], categories=user_cmd_set)
segment_df_test["last_cmd_code"] = pd.Categorical(segment_df_test["last_cmd"], categories=user_cmd_set)

In [72]:
import matplotlib.pyplot as plt


corr = pd.concat([pd.DataFrame({"anomaly": anomaly_ground_truth}), segment_df_test.iloc[:, 3:-3]], axis=1).corr()


In [73]:
corr

Unnamed: 0,anomaly,unique_cmds,longest_same_cmd_sequence,cmd_not_in_train_count,single_chars_cmd_count,two_chars_cmds_count,three_chars_cmds_count,four_chars_cmds_count,ends_with_dot_cmds_count,has_dot_in_middle
anomaly,1.0,-0.056223,0.140882,-0.386507,,,,,,
unique_cmds,-0.056223,1.0,-0.158038,0.146715,,,,,,
longest_same_cmd_sequence,0.140882,-0.158038,1.0,0.175519,,,,,,
cmd_not_in_train_count,-0.386507,0.146715,0.175519,1.0,,,,,,
single_chars_cmd_count,,,,,,,,,,
two_chars_cmds_count,,,,,,,,,,
three_chars_cmds_count,,,,,,,,,,
four_chars_cmds_count,,,,,,,,,,
ends_with_dot_cmds_count,,,,,,,,,,
has_dot_in_middle,,,,,,,,,,


In [31]:
train_feature_df = pd.get_dummies(segment_df_train, columns=["cmd_most_used_code", "first_cmd_code", "last_cmd_code"],
                                  prefix="is", dtype=float)
dev_feature_df = pd.get_dummies(segment_df_test, columns=["cmd_most_used_code", "first_cmd_code", "last_cmd_code"],
                                prefix="is", dtype=float)


# remove non numeric features
train_feature_df = train_feature_df.iloc[:, 3:]
dev_feature_df = dev_feature_df.iloc[:, 3:]

In [32]:
''' first outlier detection algo '''
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import precision_score, recall_score
from sklearn.ensemble import IsolationForest
from sklearn.cluster import DBSCAN


anomaly_ground_truth = gt_df.T[user_id].values.astype(int)[TRAIN_SEGMENT_COUNT:]# test set only
anomaly_ground_truth = np.array([1 if x == 0 else -1 for x in anomaly_ground_truth])

detector = LocalOutlierFactor(novelty=True, leaf_size=5)

detector.fit(train_feature_df)

predicted_anomalies = detector.predict(dev_feature_df)

In [33]:
from umap import UMAP

reducer = UMAP()
S_train_low_dim = reducer.fit_transform(train_feature_df)
S_dev_low_dim = reducer.transform(dev_feature_df)

In [43]:
detector = LocalOutlierFactor(novelty=True)

detector.fit(S_train_low_dim)

low_dim_predicted_anomalies = detector.predict(S_dev_low_dim)

In [44]:
from eval.metrics import detection_metrics


print("all bengin score")
high_dim_score = detection_metrics(np.ones_like(anomaly_ground_truth), anomaly_ground_truth)
print(high_dim_score)


print("all fraud score")
high_dim_score = detection_metrics(-1 * np.ones_like(anomaly_ground_truth), anomaly_ground_truth)
print(high_dim_score)

print("high dim")
high_dim_score = detection_metrics(predicted_anomalies, anomaly_ground_truth)
print(high_dim_score)

print("low dim")
low_dim_score = detection_metrics(low_dim_predicted_anomalies, anomaly_ground_truth)
print(low_dim_score)

all bengin score
{'detection_score': 0.0, 'precison': '1.000', 'recall': '0.900'}
all fraud score
{'detection_score': 1.0, 'precison': '0.000', 'recall': '0.000'}
high dim
{'detection_score': 0.0, 'precison': '0.844', 'recall': '0.884'}
low dim
{'detection_score': 0.0, 'precison': '1.000', 'recall': '0.900'}


In [39]:
low_dim_predicted_anomalies

array([-1, -1, -1,  1,  1, -1, -1, -1, -1, -1, -1,  1, -1,  1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1,
        1, -1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1])

In [40]:

import plotly.express as px

viz_df_train = pd.DataFrame({
    "x": S_train_low_dim[:, 0],
    "y": S_train_low_dim[:, 1],
    "user": TRAIN_SEGMENT_COUNT * [user_id],
    "set": TRAIN_SEGMENT_COUNT * ["train"]
})

viz_df_test = pd.DataFrame({
    "x": S_dev_low_dim[:, 0],
    "y": S_dev_low_dim[:, 1],
    "user": TEST_SEGMENT_COUNT * [user_id],
    "set": TEST_SEGMENT_COUNT * ["test"]
})


viz_df = pd.concat([viz_df_train, viz_df_test], axis=0)
viz_df["anomaly"] = gt_df.T[user_id].values
viz_df["anomaly"] = viz_df["anomaly"].astype(int)


In [None]:
# import plotly.io as pio
# pio.renderers.default='notebook'
#
px.scatter(viz_df, x="x", y="y", color="anomaly")