In [2]:
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

In [17]:
class anomaly_detection:
    """Gaussian anomaly detector demonstrated on synthetic 2-D data.

    Each feature is modelled as an independent Gaussian (diagonal covariance).
    The density threshold ``epsilon`` is chosen by maximising the F1 score on
    a held-out validation split; points whose density is below ``epsilon``
    are flagged as anomalies.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, random_state=None):
        """Create an unfitted detector.

        Parameters
        ----------
        random_state : int or None, optional
            Seed forwarded to ``generate_synthetic_data`` by
            ``cross_vlidation``. ``None`` (the default) keeps drawing from
            NumPy's global random state.
        """
        # Per-feature mean, set by cross_vlidation().
        self.mu = None
        # Per-feature variance, set by cross_vlidation().
        self.var = None

        # Anomaly detection threshold (epsilon) chosen by select_threshold().
        self.best_epsilon = None

        # Best F1 score found while searching for the threshold.
        self.best_F1 = 0

        # Seed used when generating the synthetic data set.
        self.random_state = random_state

    def generate_synthetic_data(self, num_normal=1000, num_anomalies=50,
                                random_state=None):
        """Generate a labelled 2-D data set of normal points and anomalies.

        Normal points are drawn from a Gaussian with mean ``[7, 5]`` and
        identity covariance; anomalies are drawn uniformly from
        ``[10, 15) x [10, 15)``.

        Parameters
        ----------
        num_normal : int
            Number of normal samples.
        num_anomalies : int
            Number of anomalous samples.
        random_state : int or None, optional
            Seed for a private ``np.random.RandomState``. When ``None`` the
            global ``np.random`` state is used.

        Returns
        -------
        X : ndarray of shape (num_normal + num_anomalies, 2)
            Normal samples followed by anomalies.
        y : ndarray of shape (num_normal + num_anomalies,)
            Labels: 0 for normal, 1 for anomalies.
        """
        # np.random (module) and RandomState expose the same sampling API, so
        # the unseeded path keeps using the global generator.
        rng = np.random if random_state is None else np.random.RandomState(random_state)

        mu_normal = [7, 5]  # Mean of the normal data
        cov_normal = [[1, 0], [0, 1]]  # Covariance of the normal data
        X_normal = rng.multivariate_normal(mu_normal, cov_normal, num_normal)

        # Anomalies: random points far from the mean of the normal data.
        X_anomalies = rng.uniform(low=10, high=15, size=(num_anomalies, 2))

        # Combine the normal data and anomalies.
        X = np.vstack((X_normal, X_anomalies))

        # Labels: 0 for normal, 1 for anomalies.
        y = np.hstack((np.zeros(num_normal), np.ones(num_anomalies)))

        return X, y

    def cross_vlidation(self):
        """Generate data, fit the Gaussian on a training split, score the rest.

        Uses an 80/20 ``train_test_split`` with ``random_state=42``. Sets
        ``self.mu`` and ``self.var`` from the training part.

        Returns
        -------
        p_val : ndarray
            Density of each validation sample under the fitted Gaussian.
        X_val : ndarray
            Validation samples.
        y_val : ndarray
            Validation labels (0 normal, 1 anomaly).
        """
        X, y = self.generate_synthetic_data(random_state=self.random_state)
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        self.mu, self.var = self.estimate_gaussian(X_train)

        p_val = self.multivariate_gaussian(X_val)
        return p_val, X_val, y_val

    # Correctly spelled alias; the original (misspelled) name is kept so
    # existing callers continue to work.
    cross_validation = cross_vlidation

    def estimate_gaussian(self, X):
        """Estimate the per-feature mean and variance of ``X``.

        Parameters
        ----------
        X : array-like of shape (m, n)
            ``m`` samples with ``n`` features.

        Returns
        -------
        mu : ndarray of shape (n,)
            Column means.
        var : ndarray of shape (n,)
            Column variances, normalised by ``m`` (population variance).
        """
        X = np.asarray(X)
        mu = X.mean(axis=0)
        var = X.var(axis=0)  # ddof=0, i.e. divide by m
        return mu, var

    def multivariate_gaussian(self, X):
        """Evaluate the fitted diagonal-covariance Gaussian density at ``X``.

        Reads ``self.mu`` and ``self.var``; both must have been set.
        Variances are expected to be strictly positive, otherwise the
        result is not a meaningful density.

        Parameters
        ----------
        X : ndarray of shape (m, k)
            Points to evaluate, ``k == len(self.mu)``.

        Returns
        -------
        p : ndarray of shape (m,)
            Density value for each row of ``X``.
        """
        k = len(self.mu)
        var = np.asarray(self.var, dtype=float)
        centered = X - self.mu

        # With a diagonal covariance the determinant is the product of the
        # variances and the inverse is the element-wise reciprocal, so no
        # dense linear algebra is needed.
        normaliser = np.power(2 * np.pi, k / 2) * np.sqrt(np.prod(var))
        exponent = -0.5 * np.sum(centered ** 2 / var, axis=1)
        return np.exp(exponent) / normaliser

    @staticmethod
    def _best_threshold(p_val, y_val, num_steps=1000):
        """Scan candidate thresholds and return the one with the best F1.

        Candidates are ``num_steps`` evenly spaced values from ``min(p_val)``
        up to (but excluding) ``max(p_val)``. A sample is predicted anomalous
        when its density is strictly below the candidate.

        Returns
        -------
        (best_epsilon, best_F1)
            ``(0, 0)`` when no candidate achieves a positive F1, including
            the degenerate cases of empty or constant ``p_val``.
        """
        p_val = np.asarray(p_val)
        y_val = np.asarray(y_val)
        best_epsilon, best_F1 = 0, 0

        if p_val.size == 0:
            return best_epsilon, best_F1

        p_min, p_max = p_val.min(), p_val.max()
        step_size = (p_max - p_min) / num_steps
        # A zero (or NaN) step means there is nothing to scan, and np.arange
        # would fail on it.
        if not step_size > 0:
            return best_epsilon, best_F1

        # Loop-invariant label masks.
        is_anomaly = y_val == 1
        is_normal = y_val == 0

        for epsilon in np.arange(p_min, p_max, step_size):
            flagged = p_val < epsilon

            tp = np.sum(flagged & is_anomaly)
            fp = np.sum(flagged & is_normal)
            fn = np.sum(~flagged & is_anomaly)

            precision = tp / (tp + fp) if tp + fp > 0 else 0
            recall = tp / (tp + fn) if tp + fn > 0 else 0

            if precision + recall > 0:
                F1 = (2 * precision * recall) / (precision + recall)
            else:
                F1 = 0

            # Strict comparison keeps the smallest epsilon among ties.
            if F1 > best_F1:
                best_F1 = F1
                best_epsilon = epsilon

        return best_epsilon, best_F1

    def select_threshold(self):
        """Pick the density threshold that maximises F1 on validation data.

        Runs ``cross_vlidation`` to obtain validation densities and labels,
        then stores and returns the best threshold and its F1 score.

        Returns
        -------
        (best_epsilon, best_F1)
        """
        p_val, _, y_val = self.cross_vlidation()
        self.best_epsilon, self.best_F1 = self._best_threshold(p_val, y_val)
        return self.best_epsilon, self.best_F1

    def fit(self):
        """Run the full pipeline and print the selected threshold and F1."""
        # select_threshold() already stores the results on the instance.
        self.select_threshold()
        print(f"best_epsilon: {self.best_epsilon} , best_F1: {self.best_F1}")


In [18]:
# Build the detector and run the whole pipeline; fit() prints the selected
# threshold (epsilon) together with its F1 score.
obj1 = anomaly_detection()
obj1.fit()

best_epsilon: 0.0002668635789197943 , best_F1: 1.0
