#### Workflow for Anomaly Detection: ####

1. Train the model on the training data to estimate its parameters (mean and covariance for the multivariate model; mean and standard deviation for the univariate model).
2. Tune the threshold (epsilon) on the training data, e.g., by choosing a value that flags a reasonable number of anomalies or by using cross-validation.
3. Apply the fitted model and the chosen epsilon to the test data to detect anomalies.


#### How to Tune epsilon: ####

- **Visual inspection:** Plot the probabilities and choose epsilon so that it captures the tail of the distribution.
- **Quantile-based:** Set epsilon to a low quantile (e.g., the 1st or 5th percentile) of the training probabilities.
- **Domain knowledge:** Use prior knowledge to set a reasonable threshold.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal, norm

from sklearn.preprocessing import StandardScaler

In [None]:

# Read both splits as plain NumPy arrays; the CSV files have no header row.
ano_train = pd.read_csv('ano_train.csv', header=None).values
ano_test = pd.read_csv('ano_test.csv', header=None).values

# Estimate the scaling statistics from the training split only, then apply
# that same fitted transform to the training and the test split.
scaler = StandardScaler()
scaler.fit(ano_train)
ano_train_scaled = scaler.transform(ano_train)
ano_test_scaled = scaler.transform(ano_test)


In [None]:


# Anomaly detection functions
def univariate_gaussian_anomaly_detection(data, epsilon=0.01):
    """Flag anomalies with an independent, per-feature Gaussian model.

    A mean and a (population) standard deviation are estimated for every
    column of ``data``; the density of a row is the product of its
    per-feature normal densities, and rows whose density is below
    ``epsilon`` are flagged.

    Args:
        data: Array-like of shape ``(n_samples, n_features)``. A 1-D input is
            treated as ``n_samples`` observations of a single feature.
        epsilon: Density threshold; rows with ``p < epsilon`` are anomalies.

    Returns:
        A tuple ``(anomalies, mu, sigma, p)`` where ``anomalies`` is a boolean
        mask over rows, ``mu`` and ``sigma`` are the per-feature mean and
        standard deviation, and ``p`` is the per-row density.
    """
    data = np.asarray(data, dtype=float)
    if data.ndim == 1:
        # A flat vector is one feature; make it a column so axis=1 exists.
        data = data.reshape(-1, 1)

    mu = np.mean(data, axis=0)
    sigma = np.std(data, axis=0)

    # norm.pdf returns NaN for scale == 0, which would turn every row's
    # density into NaN and make `p < epsilon` False everywhere. A feature
    # with zero spread is identical for all rows and carries no information,
    # so it contributes a neutral factor of 1 instead.
    constant = sigma == 0
    safe_sigma = np.where(constant, 1.0, sigma)
    densities = norm.pdf(data, loc=mu, scale=safe_sigma)
    densities[:, constant] = 1.0

    p = np.prod(densities, axis=1)
    anomalies = p < epsilon
    return anomalies, mu, sigma, p

def multivariate_gaussian_anomaly_detection_singular(data, epsilon=0.01):
    """Flag anomalies with a full-covariance Gaussian fitted to ``data``.

    The mean vector and the covariance matrix (``np.cov`` of the transposed
    data) are estimated from ``data``, and the resulting multivariate normal
    density is evaluated on the same rows. ``allow_singular=True`` lets SciPy
    accept a singular covariance matrix.

    Args:
        data: Array of shape ``(n_samples, n_features)``.
        epsilon: Density threshold; rows with ``p < epsilon`` are anomalies.

    Returns:
        A tuple ``(anomalies, mu, cov, p)``: the boolean anomaly mask, the
        estimated mean, the estimated covariance, and the per-row density.
    """
    center = np.mean(data, axis=0)
    covariance = np.cov(data.T)
    density = multivariate_normal(
        mean=center,
        cov=covariance,
        allow_singular=True,
    ).pdf(data)
    return density < epsilon, center, covariance, density


# Plotting function
def plot_contours_with_data(data, anomalies, mu, cov=None, title="Anomaly Detection"):
    """Scatter the first two features of ``data`` over Gaussian density contours.

    Points are coloured by ``anomalies`` using the ``coolwarm`` colormap, and
    the density is evaluated on a 100 x 100 grid spanning the data range.

    Args:
        data: Array whose first two columns are plotted as Feature 1 / 2.
        anomalies: Per-row anomaly flags used as the scatter colour values.
        mu: Mean passed to the density (multivariate mean, or per-feature
            means in the univariate case).
        cov: Covariance matrix. When given, a multivariate normal density is
            drawn; when ``None``, the product of per-feature normal densities
            is drawn, with the standard deviation re-estimated from ``data``.
        title: Figure title.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    plt.figure(figsize=(6, 5))  # Smaller figure size

    # Pin the colour scale to [0, 1]: with automatic scaling a mask that is
    # all True (or all False) has vmin == vmax and every point is mapped to
    # the low ("normal") end of the colormap.
    points = plt.scatter(
        data[:, 0], data[:, 1],
        c=anomalies, cmap='coolwarm', vmin=0, vmax=1,
        alpha=0.7, s=20, edgecolors='none',
    )
    plt.title(title, fontsize=10)
    plt.xlabel("Feature 1", fontsize=9)
    plt.ylabel("Feature 2", fontsize=9)

    # Set axis limits to zoom in on the data
    plt.xlim(data[:, 0].min() - 0.5, data[:, 0].max() + 0.5)
    plt.ylim(data[:, 1].min() - 5, data[:, 1].max() + 5)

    x, y = np.meshgrid(
        np.linspace(data[:, 0].min(), data[:, 0].max(), 100),
        np.linspace(data[:, 1].min(), data[:, 1].max(), 100)
    )
    grid = np.stack([x.ravel(), y.ravel()], axis=1)

    if cov is not None:  # Multivariate
        mv_normal = multivariate_normal(mean=mu, cov=cov, allow_singular=True)
        z = mv_normal.pdf(grid).reshape(x.shape)
    else:  # Univariate
        z = np.prod(norm.pdf(grid, loc=mu, scale=np.std(data, axis=0)), axis=1).reshape(x.shape)

    # Plot contour with lower alpha for better visibility of data points
    plt.contour(x, y, z, levels=10, colors='k', alpha=0.3, linewidths=0.5)

    # plt.contour makes the contour set the "current" mappable, so a bare
    # plt.colorbar() would describe the black contour lines. Pass the scatter
    # explicitly so the bar matches its red/blue label.
    plt.colorbar(points, label="Anomaly (Red) / Normal (Blue)", shrink=0.8)
    plt.tight_layout()  # Remove unnecessary whitespace
    plt.show()




In [None]:
# Manually chosen density threshold for the univariate model: rows whose
# estimated probability falls below this value are flagged as anomalies.
epsilon_u = 0.0002  # Change this value as needed

In [None]:
# Univariate model: fit on, and score, the scaled training data.
(
    univariate_anomalies_train,
    mu_u_train,
    sigma_u_train,
    p_u_train,
) = univariate_gaussian_anomaly_detection(ano_train_scaled, epsilon=epsilon_u)

# Select the rows marked by the boolean mask and report how many there are.
anomalous_points_univariate = ano_train_scaled[univariate_anomalies_train]
print("Univariate anomalies:\n", anomalous_points_univariate.shape)

# Training points coloured by anomaly flag, over the fitted density contours.
plot_contours_with_data(
    ano_train_scaled,
    univariate_anomalies_train,
    mu_u_train,
    title="Univariate Gaussian Anomaly Detection (Train Data)",
)

In [None]:
# Manually chosen density threshold for the multivariate model: rows whose
# estimated probability falls below this value are flagged as anomalies.
epsilon_m = 0.005  # Change this value as needed

In [None]:
# Multivariate train

# Fit the model first: the anomaly mask it returns is required by the
# indexing below. Using the mask before this call raises a NameError on a
# fresh kernel, or silently reuses a stale mask from an earlier run.
multivariate_anomalies_train, mu_m_train, cov_m_train, p_m_train = multivariate_gaussian_anomaly_detection_singular(ano_train_scaled, epsilon_m)

# Extract anomalous points
anomalous_points_multivariate = ano_train_scaled[multivariate_anomalies_train]
print("Multivariate anomalies:\n", anomalous_points_multivariate.shape)

plot_contours_with_data(ano_train_scaled, multivariate_anomalies_train, mu_m_train, cov_m_train, title="Multivariate Gaussian Anomaly Detection (Train Data)")

