In [1]:
%load_ext autoreload
%autoreload 2

In [28]:
import warnings

warnings.filterwarnings(action="ignore")

import os
import tqdm
import numpy as np
import pandas as pd

from typing import Tuple, List, Dict, Any
from constants import *
from features.user import User


# 90 of the test segments are genuine (i.e., benign) and 10 segments are entered bya masquerader (randomly sorted).

In [3]:
#   load answer for classifcation
gt_df = pd.read_csv("challengeToFill.csv", index_col=0).T[TRAIN_SEGMENT_COUNT:].reset_index(drop=True)




# dev_set_df = gt_df.T.iloc[TRAIN_SEGMENT_COUNT:, :DEV_USERS_COUNT]

In [31]:
user_list = [f"User{i}" for i in range(10)]
submission_user_list = [f"User{i}" for i in range(10, 40)]
data_path = [f"data/{uid}" for uid in user_list]
submission_data_path = [f"data/{uid}" for uid in submission_user_list]

user_data = {}

for user_id, user_data_path in tqdm.tqdm(zip(user_list, data_path), total=len(user_list)):
    #   load user data
    user_data[user_id] = User(user_id, user_data_path, gt_df[user_id])

    # create segment features
    user_data[user_id].build_segment_features()


100%|██████████| 10/10 [03:01<00:00, 18.12s/it]


In [5]:
import matplotlib.pyplot as plt

all_test_segments = [user_data[user_id].segment_df_test.select_dtypes("number") for user_id in user_list]
all_test_segments_gt = [user_data[user_id].user_anomaly_gt for user_id in user_list]

all_test_segments = pd.concat(all_test_segments).reset_index(drop=True)
all_test_segments["anomaly"] = pd.concat(all_test_segments_gt).reset_index(drop=True)

all_test_segments["noise"] = np.random.rand(len(all_test_segments))


corr = all_test_segments.corr()

In [32]:
#   get feature correlations to anomaly

TOP_K = 50

corr["anomaly"].sort_values(ascending=False)

top_k_features = corr["anomaly"].sort_values(ascending=False)[1:TOP_K].index.tolist()

In [23]:
top_k_features

['cmds_not_in_train',
 'not_in_train_has_all_lowercase_cmds',
 'not_in_train_has_mail_cmds',
 'not_in_train_has_uppercase_cmds',
 'not_in_train_three_chars_cmds',
 'not_in_train_four_chars_cmds',
 'not_in_train_has_numerics_cmds',
 'not_in_train_has_dot_in_middle_cmds',
 'not_in_train_two_chars_cmds',
 'tfidf_mediamai',
 'tfidf_maple',
 'tfidf_sy',
 'tfidf_sendmail',
 'not_in_train_has_coding_cmds',
 'not_in_train_single_chars_cmds',
 'tfidf_tty',
 'tfidf_magma',
 'tfidf_sprog',
 'tfidf_mapletty',
 'tfidf_tput',
 'not_in_train_starts_with_dotfile_cmds',
 'not_in_train_has_ssh_cmds',
 'has_ssh_cmds',
 'has_uppercase_cmds',
 'tfidf_detex',
 'tfidf_xmaplev5',
 'tfidf_tracy',
 'tfidf_pine',
 'not_in_train_has_navigation_cmds',
 'tfidf_archie',
 'tfidf_spell',
 'tfidf_mp',
 'tfidf_m4',
 'has_dot_in_middle_cmds',
 'tfidf_lks',
 'tfidf_ds_ar',
 'tfidf_xbiff',
 'tfidf_resize',
 'tfidf_sizup',
 'tfidf_whois',
 'tfidf_reducyr',
 'tfidf_dviselec',
 'tfidf_arch',
 'tfidf_exe',
 'tfidf_less',
 'has

In [24]:
# import plotly.express as px
# fig = px.imshow(corr)
# fig.show()

In [34]:
''' first outlier detection algo '''
from sklearn.decomposition import PCA
from eval.metrics import detection_metrics
from detection.pipeline import DetectionPipeline
from sklearn.preprocessing import MinMaxScaler


user_scores_high_dim = {}
user_scores_low_dim = {}
session_classification = {}

for user_id in tqdm.tqdm(user_list):

    anomaly_ground_truth = user_data[user_id].user_anomaly_gt.values.astype(int) # test set only
    anomaly_ground_truth = np.array([1 if x == 0 else -1 for x in anomaly_ground_truth])

    ''' high dim data '''

    detector = DetectionPipeline()

    X_train = user_data[user_id].normalized_features_train[top_k_features]
    X_test = user_data[user_id].normalized_features_test[top_k_features]

    ''' feature selection '''


    detector.fit(X_train)
    predicted_anomalies = detector.predict(X_test)


    user_scores_high_dim[user_id] = detection_metrics(predicted_anomalies, anomaly_ground_truth)

    session_classification[user_id] = [1 if x == -1 else 0 for x in predicted_anomalies]



''' aggregate metrics '''


overall_scores_high_dim = {}
for score_type in ["f1_score", "detection_score" ,"precision_score", "recall_score"]:
    overall_scores_high_dim[score_type] = np.mean([user_scores_high_dim[user_id][score_type] for user_id in user_list])


 20%|██        | 2/10 [07:59<31:56, 239.62s/it]


ValueError: Input X contains NaN.
LocalOutlierFactor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [26]:
# {'f1_score': 0.8529,
#  'detection_score': 0.66,
#  'precision_score': 0.7899,
#  'recall_score': 0.9563}

overall_scores_high_dim

{'f1_score': 0.8112999999999999,
 'detection_score': 0.85,
 'precision_score': 0.7112,
 'recall_score': 0.9782}

In [12]:
''' submission data '''

submission_user_data = {}
session_classification = {}

for user_id, user_data_path in tqdm.tqdm(zip(submission_user_list, submission_data_path), total=len(submission_user_list)):
    #   load user data
    submission_user_data[user_id] = User(user_id, user_data_path, gt_df[user_id])

    # create segment features
    submission_user_data[user_id].build_segment_features()

    anomaly_ground_truth = submission_user_data[user_id].user_anomaly_gt.values.astype(int) # test set only
    anomaly_ground_truth = np.array([1 if x == 0 else -1 for x in anomaly_ground_truth])

    ''' high dim data '''

    detector = DetectionPipeline()

    detector.fit(submission_user_data[user_id].segment_df_train)
    predicted_anomalies = detector.predict(submission_user_data[user_id].segment_df_test)
    user_scores_high_dim[user_id] = detection_metrics(predicted_anomalies, anomaly_ground_truth)

    session_classification[user_id] = [1 if x == -1 else 0 for x in predicted_anomalies]

  0%|          | 0/30 [00:01<?, ?it/s]


ValueError: Input contains NaN.

In [None]:
''' submission file '''
submission_df = pd.read_csv("challengeToFill.csv", index_col=0).T
for user_id in submission_user_list:
    submission_df[user_id].iloc[50:] = session_classification[user_id]

submission_df = submission_df.T.astype(int)
submission_df.to_csv("submissions/203763339_1.csv")

In [None]:

# import plotly.express as px
#
# viz_df_train = pd.DataFrame({
#     "x": S_train_low_dim[:, 0],
#     "y": S_train_low_dim[:, 1],
#     "user": TRAIN_SEGMENT_COUNT * [user_id],
#     "set": TRAIN_SEGMENT_COUNT * ["train"]
# })
#
# viz_df_test = pd.DataFrame({
#     "x": S_dev_low_dim[:, 0],
#     "y": S_dev_low_dim[:, 1],
#     "user": TEST_SEGMENT_COUNT * [user_id],
#     "set": TEST_SEGMENT_COUNT * ["test"]
# })
#
#
# viz_df = pd.concat([viz_df_train, viz_df_test], axis=0)
# viz_df["anomaly"] = gt_df.T[user_id].values
# viz_df["anomaly"] = viz_df["anomaly"].astype(int)
