In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and edits to the project's .py files
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [94]:
import warnings

warnings.filterwarnings(action="ignore")

import os
import tqdm
import numpy as np
import pandas as pd

from typing import Tuple, List, Dict, Any
from constants import *
from features.user import User


# 90 of the test segments are genuine (i.e., benign) and 10 segments are entered by a masquerader (randomly sorted).

In [2]:
# Load the ground-truth answers used to evaluate the classification.
# The CSV is transposed so that each user becomes a column; the first
# TRAIN_SEGMENT_COUNT rows are then discarded and the remaining rows are
# renumbered from 0.
answers_by_user = pd.read_csv("challengeToFill.csv", index_col=0).T
gt_df = answers_by_user[TRAIN_SEGMENT_COUNT:].reset_index(drop=True)




# dev_set_df = gt_df.T.iloc[TRAIN_SEGMENT_COUNT:, :DEV_USERS_COUNT]

In [None]:
def evaluate_user_results(pred_series: pd.Series, true_series: pd.Series):
    """Score the classification result of a single user.

    Both series must have the same length and the same ``name``; this is
    enforced with assertions.

    Returns:
        The fraction of positions at which the prediction equals the truth.
    """
    lengths_match = len(pred_series) == len(true_series)
    names_match = pred_series.name == true_series.name
    assert lengths_match
    assert names_match

    # Element-wise comparison yields a boolean series; its mean is the hit rate.
    return (pred_series == true_series).mean()

In [95]:
# Ids of the users to load and the directory holding each user's data.
user_list = [f"User{i}" for i in range(10)]
data_path = [f"data/{uid}" for uid in user_list]

# user id -> User object with its segment features built
user_data = {}

user_and_path = zip(user_list, data_path)
for user_id, user_data_path in tqdm.tqdm(user_and_path, total=len(user_list)):
    # Load the user's data together with that user's ground-truth column.
    loaded_user = User(user_id, user_data_path, gt_df[user_id])
    user_data[user_id] = loaded_user

    # Build the per-segment features for this user.
    loaded_user.build_segment_features()


100%|██████████| 10/10 [00:27<00:00,  2.73s/it]


In [96]:
import matplotlib.pyplot as plt

# Per-user numeric test-segment features and the matching anomaly labels.
numeric_test_frames = [
    user_data[user_id].segment_df_test.select_dtypes("number")
    for user_id in user_list
]
all_test_segments_gt = [user_data[user_id].user_anomaly_gt for user_id in user_list]

# Pool every user's test segments into one frame with a fresh 0..n-1 index and
# attach the labels, pooled and renumbered the same way, as the "anomaly" column.
all_test_segments = pd.concat(numeric_test_frames, ignore_index=True)
all_test_segments["anomaly"] = pd.concat(all_test_segments_gt, ignore_index=True)

# Pairwise correlation between all pooled columns, including "anomaly".
corr = all_test_segments.corr()

In [None]:
# Export only the pooled test segments labelled as anomalies (anomaly == 1).
is_anomaly = all_test_segments["anomaly"] == 1
all_test_segments[is_anomaly].to_csv("anomalies.csv")

In [None]:
# Display the correlation matrix of the pooled test-segment columns
# (features plus the "anomaly" label).
corr

In [None]:
# List the column labels that made it into the correlation matrix.
corr.columns

In [None]:
# import plotly.express as px
# fig = px.imshow(corr)
# fig.show()

In [97]:
# First outlier-detection experiment.
#
# For every user a novelty detector is fitted on that user's training segments
# and used to classify the user's test segments, twice:
#   * "high dim": on the full one-hot encoded feature space
#   * "low dim":  on a PCA projection of that space
# Per-user metrics come from `detection_metrics`; they are then averaged over
# all users.
import itertools
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.metrics import precision_score, recall_score
from sklearn.decomposition import PCA
from eval.metrics import detection_metrics

# Metric keys read from each per-user result when aggregating.
SCORE_TYPES = ("detection_score", "precision_score", "recall_score")


def _mean_user_scores(user_scores, users, score_types=SCORE_TYPES):
    """Average each metric in `score_types` over `users`.

    Args:
        user_scores: mapping user id -> mapping metric name -> value.
        users: user ids to average over.
        score_types: metric names to aggregate.

    Returns:
        dict mapping each metric name to its mean over `users`.
    """
    return {
        score_type: np.mean([user_scores[uid][score_type] for uid in users])
        for score_type in score_types
    }


user_scores_high_dim = {}
user_scores_low_dim = {}

for user_id in tqdm.tqdm(user_list):

    # Ground truth covers the test set only. Re-encode it the way the
    # detectors' `predict` reports results: label 0 -> 1, anything else -> -1.
    anomaly_ground_truth = user_data[user_id].user_anomaly_gt.values.astype(int)
    anomaly_ground_truth = np.where(anomaly_ground_truth == 0, 1, -1)

    # One-hot encode train and test. The two frames are encoded independently,
    # so a category present in only one of them would give them different
    # columns. Align test to the training layout: columns missing from test are
    # added as all-zero, columns unseen in training are dropped, and the order
    # follows the training frame.
    dummy_user_train_data = pd.get_dummies(user_data[user_id].segment_df_train)
    dummy_user_test_data = pd.get_dummies(user_data[user_id].segment_df_test).reindex(
        columns=dummy_user_train_data.columns, fill_value=0
    )

    # --- high dim data ---
    # Each detector is trained per user, on that user's training segments only.
    detector = LocalOutlierFactor(
        n_neighbors=5,
        novelty=True,
        leaf_size=5,
        algorithm="kd_tree",
        p=1,
    )
    detector.fit(dummy_user_train_data)
    predicted_anomalies = detector.predict(dummy_user_test_data)
    user_scores_high_dim[user_id] = detection_metrics(predicted_anomalies, anomaly_ground_truth)

    # --- low dim data ---
    detector = LocalOutlierFactor(novelty=True, n_neighbors=5)

    # The projection is learned on the training segments only and then applied
    # unchanged to the test segments.
    reducer = PCA(n_components=20)
    S_train_low_dim = reducer.fit_transform(dummy_user_train_data)
    S_dev_low_dim = reducer.transform(dummy_user_test_data)

    detector.fit(S_train_low_dim)
    predicted_anomalies = detector.predict(S_dev_low_dim)

    user_scores_low_dim[user_id] = detection_metrics(predicted_anomalies, anomaly_ground_truth)


# --- aggregate metrics: mean of every score type over all users ---
overall_scores_high_dim = _mean_user_scores(user_scores_high_dim, user_list)
overall_scores_low_dim = _mean_user_scores(user_scores_low_dim, user_list)


100%|██████████| 10/10 [00:01<00:00,  8.03it/s]


In [98]:
overall_scores_high_dim

{'detection_score': 0.66,
 'precision_score': 0.7909999999999999,
 'recall_score': 0.9564}

In [99]:
overall_scores_low_dim

{'detection_score': 0.25, 'precision_score': 0.8111, 'recall_score': 0.9048}

In [100]:
[user_scores_low_dim[user_id]["precision_score"] for user_id in user_list]

[0.844, 0.822, 0.578, 0.867, 0.889, 0.944, 0.978, 0.9, 0.378, 0.911]

In [None]:

import plotly.express as px

# Build a plotting frame from the first two low-dimensional components.
# NOTE: `user_id`, `S_train_low_dim` and `S_dev_low_dim` are the values left
# behind by the outlier-detection loop, i.e. those of the last user processed.
n_train = len(S_train_low_dim)
n_test = len(S_dev_low_dim)

viz_df_train = pd.DataFrame({
    "x": S_train_low_dim[:, 0],
    "y": S_train_low_dim[:, 1],
    "user": n_train * [user_id],
    "set": n_train * ["train"],
})

viz_df_test = pd.DataFrame({
    "x": S_dev_low_dim[:, 0],
    "y": S_dev_low_dim[:, 1],
    "user": n_test * [user_id],
    "set": n_test * ["test"],
})

# Users are the columns of `gt_df`, so the labels are selected with
# gt_df[user_id]; `gt_df.T[user_id]` would look the user up among the segment
# positions and fail.
test_anomaly = gt_df[user_id].to_numpy().astype(int)
assert len(test_anomaly) == n_test, (
    f"expected {n_test} test labels for {user_id}, got {len(test_anomaly)}"
)

viz_df = pd.concat([viz_df_train, viz_df_test], axis=0, ignore_index=True)

# `gt_df` holds no rows for the training segments (they were sliced off when it
# was loaded), so the training points are plotted with label 0.
# NOTE(review): this assumes the training segments are all genuine - confirm.
viz_df["anomaly"] = np.concatenate([np.zeros(n_train, dtype=int), test_anomaly])
viz_df["anomaly"] = viz_df["anomaly"].astype(int)


In [None]:
# import plotly.io as pio
# pio.renderers.default='notebook'
#
# Scatter the points of `viz_df` on their "x"/"y" coordinates, coloured by the
# "anomaly" column; the figure is the cell's displayed result.
anomaly_scatter = px.scatter(data_frame=viz_df, x="x", y="y", color="anomaly")
anomaly_scatter