In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.neighbors import KernelDensity
from scipy.spatial import distance
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

In [None]:
class AnomalyDetector:
    """Compare feature distributions with Gaussian KDEs and Jensen-Shannon distance.

    Every comparison follows the same recipe: fit one Gaussian
    ``KernelDensity`` per data set, draw ``n_samples`` points from each model,
    evaluate both densities on the union of the drawn points, and feed the two
    density vectors to ``scipy.spatial.distance.jensenshannon``.

    Parameters
    ----------
    bandwidth : float
        Bandwidth passed to every ``KernelDensity`` model.
    n_samples : int
        Number of points drawn from each fitted KDE per comparison.
    n_splits : int
        Number of ``KFold`` folds used by ``compute_internal_divergence`` and
        ``compute_divergence_from_legit``.
    n_features : int
        Number of leading columns kept from the data frames that are split
        into folds (default 5).
    random_state : int or None
        Seed for KDE sampling. ``None`` (default) lets scikit-learn use
        NumPy's global random state; an int makes each method call
        reproducible.
    """

    def __init__(self, bandwidth=0.5, n_samples=2000, n_splits=20,
                 n_features=5, random_state=None):
        self.bandwidth = bandwidth
        self.n_samples = n_samples
        self.n_splits = n_splits
        self.n_features = n_features
        self.random_state = random_state

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _make_rng(self):
        """Return one RandomState per method call, or None when unseeded."""
        if self.random_state is None:
            return None
        # A single generator is shared by all draws of a call so successive
        # samples are not identical copies produced by the same integer seed.
        return np.random.RandomState(self.random_state)

    def _fit_kde(self, data):
        """Fit a Gaussian KDE with the configured bandwidth on ``data``."""
        # Fit on a plain array: the model is later scored on NumPy samples,
        # and fitting on a DataFrame would trigger feature-name warnings.
        model = KernelDensity(kernel='gaussian', bandwidth=self.bandwidth)
        return model.fit(np.asarray(data))

    def _leading_features(self, df):
        """Return the first ``n_features`` columns of ``df`` as a 2-D array."""
        return df.iloc[:, :self.n_features].to_numpy()

    @staticmethod
    def _js_distance(log_p, log_q):
        """Jensen-Shannon distance between two vectors of log-densities.

        ``KernelDensity.score_samples`` yields log-densities, whereas
        ``jensenshannon`` expects non-negative weights, so the values are
        exponentiated first. Each vector is shifted by its own maximum before
        ``exp``: ``jensenshannon`` normalises its inputs, so the shift does
        not change the result but prevents underflow to all-zeros.
        """
        log_p = np.asarray(log_p, dtype=float)
        log_q = np.asarray(log_q, dtype=float)
        p = np.exp(log_p - np.max(log_p))
        q = np.exp(log_q - np.max(log_q))
        return distance.jensenshannon(p, q)

    def _kde_distance(self, kde_p, sample_p, kde_q, sample_q):
        """Score both KDEs on the pooled samples and return their JS distance."""
        # No sorting: the distance does not depend on point order, and
        # np.sort on a 2-D array would shuffle feature values inside each row.
        points = np.concatenate([sample_p, sample_q])
        return self._js_distance(kde_p.score_samples(points),
                                 kde_q.score_samples(points))

    def _divergence_against_reference(self, reference_df, unknown_df, n_splits):
        """Distance of every held-out fold of ``unknown_df`` to the reference KDE."""
        rng = self._make_rng()
        reference_kde = self._fit_kde(reference_df)
        # The reference sample is drawn once and reused for every fold.
        reference_sample = reference_kde.sample(self.n_samples, random_state=rng)

        features = self._leading_features(unknown_df)
        distances = []
        for _, fold_idx in KFold(n_splits=n_splits).split(features):
            fold_kde = self._fit_kde(features[fold_idx])
            fold_sample = fold_kde.sample(self.n_samples, random_state=rng)

            jsd = self._kde_distance(reference_kde, reference_sample,
                                     fold_kde, fold_sample)
            distances.append(jsd)
            print(jsd)  # per-fold progress output, as shown in the notebook

        return distances

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def compute_internal_divergence(self, df):
        """Measure how much each fold of ``df`` differs from the rest of ``df``.

        The first ``n_features`` columns of ``df`` are split with
        ``KFold(n_splits=self.n_splits)``. For every split one KDE is fitted on
        the training rows and one on the held-out rows, and the
        Jensen-Shannon distance between the two is recorded and printed.

        Parameters
        ----------
        df : pandas.DataFrame
            Data whose leading columns are the features to compare.

        Returns
        -------
        list of float
            One distance per fold, in fold order.
        """
        rng = self._make_rng()
        features = self._leading_features(df)
        distances = []

        for rest_idx, fold_idx in KFold(n_splits=self.n_splits).split(features):
            rest_kde = self._fit_kde(features[rest_idx])
            fold_kde = self._fit_kde(features[fold_idx])

            rest_sample = rest_kde.sample(self.n_samples, random_state=rng)
            fold_sample = fold_kde.sample(self.n_samples, random_state=rng)

            jsd = self._kde_distance(rest_kde, rest_sample, fold_kde, fold_sample)
            distances.append(jsd)
            print(jsd)  # per-fold progress output, as shown in the notebook

        return distances

    def compute_divergence_from_legit(self, legit_df, unknown_df):
        """Compare each fold of ``unknown_df`` with a KDE fitted on ``legit_df``.

        Parameters
        ----------
        legit_df : array-like of shape (n_rows, n_columns)
            Reference data; used in full (no column selection), so it must
            have as many columns as are kept from ``unknown_df``.
        unknown_df : pandas.DataFrame
            Data to test; its first ``n_features`` columns are split into
            ``self.n_splits`` folds and only the held-out part of each split
            is used.

        Returns
        -------
        list of float
            One Jensen-Shannon distance per fold, in fold order.
        """
        return self._divergence_against_reference(legit_df, unknown_df,
                                                  self.n_splits)

    def evaluate_on_test(self, legit_train_df, unknown_test_df, n_splits=40):
        """Same comparison as ``compute_divergence_from_legit`` with its own fold count.

        Parameters
        ----------
        legit_train_df : array-like of shape (n_rows, n_columns)
            Reference data used in full to fit the reference KDE.
        unknown_test_df : pandas.DataFrame
            Data to test; its first ``n_features`` columns are split into
            folds.
        n_splits : int
            Number of ``KFold`` folds for ``unknown_test_df`` (default 40);
            ``self.n_splits`` is not used here.

        Returns
        -------
        list of float
            One Jensen-Shannon distance per fold, in fold order.
        """
        return self._divergence_against_reference(legit_train_df,
                                                  unknown_test_df, n_splits)
