In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import warnings

warnings.filterwarnings(action="ignore")

import os
import tqdm
import numpy as np
import pandas as pd

from typing import Tuple, List, Dict, Any
from constants import *
from features.user import User


# 90 of the test segments are genuine (i.e., benign) and 10 segments are entered by a masquerader (randomly sorted).




In [3]:
# Load the answer sheet used for classification. The CSV is transposed so that
# each column is addressed by user id (see `gt_df[user_id]` further down), and
# only the rows from TRAIN_SEGMENT_COUNT onwards are kept, re-indexed from 0.
challenge_df = pd.read_csv("challengeToFill.csv", index_col=0)
gt_df = challenge_df.T.iloc[TRAIN_SEGMENT_COUNT:].reset_index(drop=True)




# dev_set_df = gt_df.T.iloc[TRAIN_SEGMENT_COUNT:, :DEV_USERS_COUNT]

In [4]:
# Users 0-9 are evaluated against `gt_df` in this notebook; users 10-39 are the
# ones written to the submission file.
user_list = [f"User{i}" for i in range(0, 10)]
submission_user_list = [f"User{i}" for i in range(10, 40)]

# One data location per user, in the same order as the id lists above.
data_path = [f"data/{uid}" for uid in user_list]
submission_data_path = [f"data/{uid}" for uid in submission_user_list]

# user id -> User object with its segment features built
user_data = {}

labelled_users = zip(user_list, data_path)
for user_id, user_data_path in tqdm.tqdm(labelled_users, total=len(user_list)):
    # Load the user's data together with that user's column of the answer sheet ...
    user = User(user_id, user_data_path, gt_df[user_id])
    user_data[user_id] = user

    # ... then derive the per-segment features.
    user.build_segment_features()


100%|██████████| 10/10 [00:16<00:00,  1.63s/it]


In [5]:
import matplotlib.pyplot as plt

# Stack the numeric test-segment features of every user in `user_list` into a
# single frame and attach the matching anomaly labels, so that each feature's
# correlation with the label can be inspected.
segment_frames = [user_data[user_id].segment_df_test.select_dtypes("number") for user_id in user_list]
all_test_segments_gt = [user_data[user_id].user_anomaly_gt for user_id in user_list]

# Both sides are re-indexed 0..n-1 so the label column aligns row-for-row with
# the concatenated features.
all_test_segments = pd.concat(segment_frames).reset_index(drop=True)
all_test_segments["anomaly"] = pd.concat(all_test_segments_gt).reset_index(drop=True)

# Random baseline column: a feature that correlates with the label no better
# than this carries no more signal than noise. A seeded generator keeps the
# correlation table identical across "Restart & Run All".
all_test_segments["noise"] = np.random.default_rng(0).random(len(all_test_segments))

# Pairwise correlation between every feature, the label and the noise baseline.
corr = all_test_segments.corr()

In [6]:
# Rank the features by their correlation with the anomaly label and keep the
# TOP_K best ones.

TOP_K = 1000

# The label itself and the random "noise" baseline are dropped by name rather
# than by position: slicing with [1:TOP_K] silently assumed "anomaly" sorts
# first (a tie at 1.0 breaks that) and returned only TOP_K - 1 entries.
ranked_anomaly_corr = (
    corr["anomaly"]
    .sort_values(ascending=False)
    .drop(labels=["anomaly", "noise"], errors="ignore")
)

top_k_features = ranked_anomaly_corr.head(TOP_K).index.tolist()

In [7]:
# Show every column's correlation with the anomaly label, strongest first.
anomaly_corr = corr["anomaly"]
anomaly_corr.sort_values(ascending=False)

anomaly                                     1.000000
cmds_not_in_train                           0.568771
not_in_train_has_all_lowercase_cmds         0.534410
not_in_train_has_mail_cmds                  0.455947
not_in_train_has_uppercase_cmds             0.424999
not_in_train_four_chars_cmds                0.399250
not_in_train_three_chars_cmds               0.399250
not_in_train_has_numerics_cmds              0.363472
not_in_train_has_dot_in_middle_cmds         0.298680
not_in_train_two_chars_cmds                 0.273712
not_in_train_has_coding_cmds                0.234834
not_in_train_single_chars_cmds              0.226589
not_in_train_starts_with_dotfile_cmds       0.190117
has_ssh_cmds                                0.190117
not_in_train_has_ssh_cmds                   0.190117
has_uppercase_cmds                          0.182880
not_in_train_has_navigation_cmds            0.158878
has_dot_in_middle_cmds                      0.144612
has_mail_cmds                               0.

In [8]:
# import plotly.express as px
# fig = px.imshow(corr)
# fig.show()

In [9]:
''' first outlier detection algo '''
from sklearn.decomposition import PCA
from eval.metrics import detection_metrics, ScoreCounter
from detection.pipeline import DetectionPipeline
from sklearn.preprocessing import MinMaxScaler


# Per-user results, keyed by user id. `user_scores_low_dim` is initialised here
# but not filled by this cell.
user_scores_high_dim = {}
user_scores_low_dim = {}
# user id -> one 0/1 flag per test segment (1 where the detector returned -1)
session_classification = {}

score_counter = ScoreCounter(len(user_list))

for user_id in tqdm.tqdm(user_list):
    current_user = user_data[user_id]

    # Test-set labels only, remapped so that 0 becomes 1 and any other value
    # becomes -1 (presumably the sklearn inlier/outlier convention - confirm
    # against DetectionPipeline.predict).
    raw_labels = current_user.user_anomaly_gt.values.astype(int)
    anomaly_ground_truth = np.where(raw_labels == 0, 1, -1)

    # --- high-dimensional data ---
    detector = DetectionPipeline()

    # Alternative inputs tried earlier (raw segment features, optionally
    # restricted to top_k_features):
    # X_train = user_data[user_id].segment_df_train  # [top_k_features]
    # X_test = user_data[user_id].segment_df_test    #[top_k_features]

    X_train = current_user.normalized_features_train
    X_test = current_user.normalized_features_test

    # Dimension-reduction experiment, currently disabled:
    # reducer = PCA(n_components=50)
    #
    # X_train = reducer.fit_transform(X_train.iloc[:, 3:])
    # X_test = reducer.transform(X_test.iloc[:, 3:])

    # Fit on the user's training segments, then label the test segments.
    detector.fit(X_train)
    predicted_anomalies = detector.predict(X_test)

    # --- update metrics ---
    score_counter.update(predicted_anomalies, anomaly_ground_truth)
    user_scores_high_dim[user_id] = detection_metrics(predicted_anomalies, anomaly_ground_truth)

    # Convert the detector's labels to 0/1 flags: -1 -> 1, anything else -> 0.
    session_classification[user_id] = [int(label == -1) for label in predicted_anomalies]


# --- aggregate metrics: mean of each score over the users in user_list ---
overall_scores_high_dim = {
    score_type: np.mean([user_scores_high_dim[user_id][score_type] for user_id in user_list])
    for score_type in ["f1_score", "detection_score", "precision_score", "recall_score"]
}


100%|██████████| 10/10 [00:01<00:00,  9.99it/s]


In [10]:
# Scores noted down for comparison (presumably from an earlier run or
# configuration - they differ from the output displayed below):
# {'f1_score': 0.8529,
#  'detection_score': 0.66,
#  'precision_score': 0.7899,
#  'recall_score': 0.9563}

# Challenge-style score accumulated over the users in user_list.
raw_score, normalized = score_counter.calc()
print(raw_score, normalized)

# NOTE(review): the factor 3 presumably scales the 10 labelled users up to the
# 30 submission users - confirm.
print(f"predicted score: {3 * raw_score}")

# Mean per-user metrics, shown as the cell's rich output.
overall_scores_high_dim

1393 0.7738888888888888
predicted score: 4179


{'f1_score': 0.817,
 'detection_score': 0.82,
 'precision_score': 0.7276999999999999,
 'recall_score': 0.975}

In [11]:
# --- submission data ---
# Run the same per-user detection on the submission users. Note that this
# resets `session_classification`, discarding the entries stored for the
# users in `user_list`.

submission_user_data = {}
session_classification = {}

submission_users = zip(submission_user_list, submission_data_path)
for user_id, user_data_path in tqdm.tqdm(submission_users, total=len(submission_user_list)):
    # Load the user's data and build its per-segment features.
    submission_user = User(user_id, user_data_path, gt_df[user_id])
    submission_user_data[user_id] = submission_user
    submission_user.build_segment_features()

    # Test-set labels only, remapped so that 0 becomes 1 and anything else -1.
    # NOTE(review): for submission users these labels may be placeholders from
    # the answer sheet - confirm before relying on the metrics stored below.
    raw_labels = submission_user.user_anomaly_gt.values.astype(int)
    anomaly_ground_truth = np.where(raw_labels == 0, 1, -1)

    # --- high-dimensional data ---
    X_train = submission_user.normalized_features_train  # [top_k_features]
    X_test = submission_user.normalized_features_test  # [top_k_features]

    # Fit on the user's training segments, then label the test segments.
    detector = DetectionPipeline()
    detector.fit(X_train)
    predicted_anomalies = detector.predict(X_test)

    user_scores_high_dim[user_id] = detection_metrics(predicted_anomalies, anomaly_ground_truth)

    # Convert the detector's labels to 0/1 flags: -1 -> 1, anything else -> 0.
    session_classification[user_id] = [int(label == -1) for label in predicted_anomalies]

100%|██████████| 30/30 [00:52<00:00,  1.75s/it]


In [12]:
# --- submission file ---
# Start from the challenge template (transposed so each column is a user) and
# write every submission user's 0/1 predictions into the rows from position 50
# onwards.
submission_df = pd.read_csv("challengeToFill.csv", index_col=0).T

# NOTE(review): 50 presumably equals TRAIN_SEGMENT_COUNT - confirm before
# replacing the literal.
first_test_row = 50

for user_id in submission_user_list:
    # Assign through a single .iloc call on the frame itself. The previous
    # chained form `submission_df[user_id].iloc[50:] = ...` may write to a
    # temporary and is a silent no-op under pandas Copy-on-Write.
    user_col = submission_df.columns.get_loc(user_id)
    submission_df.iloc[first_test_row:, user_col] = session_classification[user_id]

# Back to the template's original orientation, with integer labels.
submission_df = submission_df.T.astype(int)

# Make sure the output directory exists so a fresh checkout does not fail here.
os.makedirs("submissions", exist_ok=True)
submission_df.to_csv("submissions/203763339_3.csv")