# Imports

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.covariance import EllipticEnvelope
from sklearn.metrics import f1_score, recall_score
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from pyod.models.deep_svdd import DeepSVDD

import warnings

# Silence every warning category for the rest of the session
warnings.filterwarnings('ignore')

# Lift pandas' display truncation for both columns and rows
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Scaling and removing EventDay for one-class classification

In [None]:
def preprocess_for_models(df, test_size=0.2, random_state=None):
    """Build scaled, event-free train/test frames for the one-class models.

    Rows whose ``EventDay`` is non-zero are dropped, the ``EventDay`` column
    is removed, every remaining column is standardized with
    ``StandardScaler`` and the result is split into a train and a test part.

    Args:
        df: DataFrame with an ``EventDay`` column; all other columns are fed
            to ``StandardScaler`` (so they are presumably numeric).
        test_size: Fraction of the remaining rows held out for the test part.
        random_state: Seed forwarded to ``train_test_split``. Because the
            number of exacerbations is really low, callers should repeat the
            split with several explicit seeds; ``None`` keeps the previous
            unseeded behaviour.

    Returns:
        Tuple ``(X_train, X_test)`` of DataFrames. Both keep the row labels of
        ``df`` so that per-row results can be written back to the source
        frame.
    """
    # Keep only the days without an asthma event, then drop the now-constant flag
    normal_days = df[df['EventDay'] == 0]
    features = normal_days.drop(columns=['EventDay'])

    # NOTE(review): the scaler is fitted on all rows before the split, so the
    # test part influences the scaling statistics - confirm this is intended.
    scaler = StandardScaler()
    scaled_df = pd.DataFrame(
        scaler.fit_transform(features),
        columns=features.columns,
        index=features.index,  # preserve labels instead of resetting to 0..n-1
    )

    X_train, X_test = train_test_split(
        scaled_df, test_size=test_size, random_state=random_state
    )

    return X_train, X_test

# Performance metrics

In [None]:
def display_confusion_matrix(cm, labels=None):
    """Plot a confusion matrix as an annotated heatmap.

    Args:
        cm: 2-D array-like of integer counts, rows being the actual classes
            and columns the predicted classes.
        labels: Optional sequence of class names used for both axes. When
            omitted the positional indices 0..n-1 are shown.
    """
    cm_df = pd.DataFrame(cm)
    if labels is not None:
        cm_df.index = list(labels)
        cm_df.columns = list(labels)

    counts = cm_df.to_numpy()
    n_rows, n_cols = counts.shape

    fig, ax = plt.subplots(figsize=(8, 6))
    image = ax.imshow(counts, cmap='Blues')
    fig.colorbar(image, ax=ax)

    ax.set_xticks(range(n_cols))
    ax.set_xticklabels(cm_df.columns)
    ax.set_yticks(range(n_rows))
    ax.set_yticklabels(cm_df.index)

    # Write each count into its cell; switch to white text on dark cells
    midpoint = counts.max() / 2 if counts.size else 0
    for row_pos in range(n_rows):
        for col_pos in range(n_cols):
            value = counts[row_pos, col_pos]
            ax.text(
                col_pos,
                row_pos,
                format(value, 'd'),
                ha='center',
                va='center',
                color='white' if value > midpoint else 'black',
            )

    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title('Confusion Matrix')
    plt.show()

def calculate_metrics(cm):
    """Print and return accuracy, precision, recall and F1 for a 2x2 matrix.

    The matrix is read with rows as actual classes and columns as predicted
    classes, and the class at index 0 is treated as the positive class.

    Args:
        cm: 2x2 array-like of counts, indexable as ``cm[row][col]``.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and
        ``f1_score``. A ratio whose denominator is zero is reported as 0.0.
    """
    def _ratio(numerator, denominator):
        # Degenerate matrices (e.g. no positive predictions) must not raise
        return numerator / denominator if denominator else 0.0

    # Rows are actual classes, columns are predicted classes
    TP = cm[0][0]
    FN = cm[0][1]  # actual positive, predicted negative
    FP = cm[1][0]  # actual negative, predicted positive
    TN = cm[1][1]

    accuracy = _ratio(TP + TN, TP + TN + FP + FN)
    precision = _ratio(TP, TP + FP)
    recall = _ratio(TP, TP + FN)
    # Named ``f1`` so the sklearn ``f1_score`` import is not shadowed
    f1 = _ratio(2 * (precision * recall), precision + recall)

    # Print the results
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
    }

# k-Means Cluster

In [None]:
def _elbow_position(wcss_values):
    """Return the position in ``wcss_values`` where the curve bends most."""
    # Positive numbers: how much WCSS shrinks when one more cluster is added
    drops = [prev - cur for prev, cur in zip(wcss_values, wcss_values[1:])]
    if not drops:
        return 0  # a single candidate is the only possible suggestion
    if len(drops) == 1:
        return 1  # two candidates: nothing to compare the drop against

    # The elbow is the point after which the improvement falls off the most,
    # i.e. the largest difference between two consecutive drops.
    bends = [before - after for before, after in zip(drops, drops[1:])]
    return bends.index(max(bends)) + 1


def suggested_no_of_clusters(min_clusters, max_clusters, data=None):
    """Suggest a k for k-means using the elbow of the WCSS curve.

    A ``KMeans`` model (``random_state=0``) is fitted for every k in
    ``[min_clusters, max_clusters]`` and its inertia (within-cluster sum of
    squares) is recorded. The suggested k is the one at which the reduction
    in WCSS drops off most sharply.

    Args:
        min_clusters: Smallest number of clusters to try.
        max_clusters: Largest number of clusters to try (inclusive).
        data: Samples to cluster. When omitted, the module-level ``df`` is
            used, which is what earlier versions of this function relied on.

    Returns:
        The suggested number of clusters (also printed).

    Raises:
        ValueError: If ``max_clusters`` is smaller than ``min_clusters``.
    """
    if data is None:
        # Legacy fallback: look the data up in the enclosing module
        data = df

    num_clusters = list(range(min_clusters, max_clusters + 1))
    if not num_clusters:
        raise ValueError("max_clusters must be >= min_clusters")

    # Within-cluster sum of squares for every candidate k
    wcss_values = [
        KMeans(n_clusters=k, random_state=0).fit(data).inertia_
        for k in num_clusters
    ]

    suggested_clusters = num_clusters[_elbow_position(wcss_values)]

    print("Suggested number of clusters:", suggested_clusters)
    return suggested_clusters

    
# TODO can refine clusters for different data sets
# suggested_no_of_clusters(data_train, 1, 10)

In [None]:
def k_means(df, data_train, data_pred, num_clusters=5, threshold=10):
    """Flag prediction rows that lie far from their k-means cluster mean.

    A ``KMeans`` model is fitted on the standardized training data. Each
    prediction row is assigned to a cluster and flagged as an anomaly when any
    of its features differs from that cluster's training mean by more than
    ``threshold`` (measured in the units of the input frames, not in the
    internally standardized space).

    Results are written to ``df['Anomalies_KMeans']``: -1 for anomalies, 1 for
    every other row. ``data_train`` and ``data_pred`` are left unmodified.

    Args:
        df: DataFrame that receives the ``Anomalies_KMeans`` column; its index
            must contain the labels of ``data_pred``.
        data_train: Training features.
        data_pred: Features to score; must contain the columns of
            ``data_train``.
        num_clusters: Number of k-means clusters.
        threshold: Maximum allowed absolute per-feature deviation from the
            cluster mean.
    """
    columns_to_cluster = list(data_train.columns)
    train_features = data_train[columns_to_cluster]
    pred_features = data_pred[columns_to_cluster]

    # Fit the scaler and the clustering on the training data only
    scaler = StandardScaler()
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(scaler.fit_transform(train_features))

    # Per-cluster feature means of the training rows. Grouping by the label
    # array avoids adding a 'ClusterLabel' column to the caller's frame, which
    # would otherwise end up as an extra feature in models run afterwards.
    training_cluster_means = train_features.groupby(kmeans.labels_).mean()

    # Assign prediction rows to clusters using the training-time scaler
    prediction_cluster_labels = kmeans.predict(scaler.transform(pred_features))

    # One row of cluster means per prediction row, in the same order
    assigned_means = training_cluster_means.reindex(prediction_cluster_labels).to_numpy()
    deviations = np.abs(pred_features.to_numpy() - assigned_means)
    is_anomaly = (deviations > threshold).any(axis=1)

    df['Anomalies_KMeans'] = 1  # every row starts as normal (1)
    df.loc[data_pred.index[is_anomaly], 'Anomalies_KMeans'] = -1  # anomalies become -1

# Isolation Forest

In [None]:
def iforest(df, data_train, data_pred, outliers_fraction=0.5):
    """Score ``data_pred`` with an Isolation Forest fitted on ``data_train``.

    Results are written to ``df['Anomalies_IF']`` using the model's output
    convention (-1 outlier, 1 inlier). Rows of ``df`` that are not part of
    ``data_pred`` (e.g. training samples) are set to 1.

    Args:
        df: DataFrame that receives the ``Anomalies_IF`` column.
        data_train: Training features.
        data_pred: Features to score; its index labels select the rows of
            ``df`` that receive the predictions.
        outliers_fraction: Passed to ``IsolationForest`` as ``contamination``.
    """
    # Train
    ifo = IsolationForest(contamination=outliers_fraction)
    ifo.fit(data_train)

    # Predict, keeping the row labels so each prediction lands on its own row
    predictions = pd.Series(ifo.predict(data_pred), index=data_pred.index)

    # Rows that were not scored are treated as normal (1)
    df['Anomalies_IF'] = predictions.reindex(df.index).fillna(1).astype(int)

# OSVM

In [None]:
def ocsvm(df, data_train, data_pred, outliers_fraction=0.9):
    """Score ``data_pred`` with a One-Class SVM fitted on ``data_train``.

    Results are written to ``df['Anomalies_OSVM']`` using the model's output
    convention (-1 outlier, 1 inlier). Rows of ``df`` that are not part of
    ``data_pred`` (e.g. training samples) are set to 1.

    Args:
        df: DataFrame that receives the ``Anomalies_OSVM`` column.
        data_train: Training features.
        data_pred: Features to score; its index labels select the rows of
            ``df`` that receive the predictions.
        outliers_fraction: Passed to ``OneClassSVM`` as ``nu``.
            NOTE(review): the default of 0.9 looks very high for an outlier
            fraction - confirm it is intended.
    """
    # Train
    osvm = OneClassSVM(nu=outliers_fraction, kernel='sigmoid')
    osvm.fit(data_train)

    # Predict, keeping the row labels so each prediction lands on its own row
    predictions = pd.Series(osvm.predict(data_pred), index=data_pred.index)

    # Rows that were not scored are treated as normal (1)
    df['Anomalies_OSVM'] = predictions.reindex(df.index).fillna(1).astype(int)

# Local Outlier Factor

In [None]:
def local_of(df, data_train, data_pred, outliers_fraction=0.3):
    """Score ``data_pred`` with a Local Outlier Factor fitted on ``data_train``.

    The estimator is created with ``novelty=True`` so that the model fitted on
    the training data can be applied to unseen rows via ``predict``. Calling
    ``fit_predict`` on the prediction data would refit the model and ignore
    the training set entirely.

    Results are written to ``df['Anomalies_LOF']`` (-1 outlier, 1 inlier).
    Rows of ``df`` that are not part of ``data_pred`` are set to 1.

    Args:
        df: DataFrame that receives the ``Anomalies_LOF`` column.
        data_train: Training features.
        data_pred: Features to score; its index labels select the rows of
            ``df`` that receive the predictions.
        outliers_fraction: Passed to ``LocalOutlierFactor`` as
            ``contamination``.
    """
    # Train
    lof = LocalOutlierFactor(contamination=outliers_fraction, novelty=True)
    lof.fit(data_train)

    # Predict on the unseen rows with the model fitted on the training rows
    predictions = pd.Series(lof.predict(data_pred), index=data_pred.index)

    # Rows that were not scored are treated as normal (1)
    df['Anomalies_LOF'] = predictions.reindex(df.index).fillna(1).astype(int)

# Autoencoders

In [None]:
def autoencoders(df, train, test, n_sigma=3):
    """Flag test rows that a dense autoencoder reconstructs poorly.

    A single-hidden-layer autoencoder (32 ReLU units, linear output) is
    trained on ``train``. A test row is an outlier when its reconstruction MSE
    exceeds ``mean + n_sigma * std`` of the MSEs over ``test``.

    Results are written to ``df['Anomalies_AE']``: -1 for outliers, 1 for every
    other row.

    Args:
        df: DataFrame that receives the ``Anomalies_AE`` column; its index
            must contain the labels of ``test``.
        train: Training features (DataFrame).
        test: Features to score (DataFrame); also used as validation data
            while fitting.
        n_sigma: Number of standard deviations above the mean MSE at which a
            row is flagged.
    """
    # Define the shape of the input data
    input_dim = train.shape[1]

    # Define the architecture of the autoencoder
    input_layer = Input(shape=(input_dim,))
    encoder = Dense(32, activation='relu')(input_layer)  # Encoding layer
    decoder = Dense(input_dim, activation='linear')(encoder)  # Decoding layer

    # Create, compile and train the autoencoder
    autoencoder = Model(inputs=input_layer, outputs=decoder)
    autoencoder.compile(optimizer='adam', loss='mse')
    autoencoder.fit(train, train, epochs=50, batch_size=32, validation_data=(test, test))

    # Reconstruct the test rows and measure the per-row reconstruction error
    reconstructed_data = autoencoder.predict(test)
    mse = np.mean(np.square(np.asarray(test) - reconstructed_data), axis=1)

    # Threshold that separates outliers from inliers
    threshold = np.mean(mse) + n_sigma * np.std(mse)

    # Map outliers as -1 and inliers as 1
    outliers = np.where(mse > threshold, -1, 1)

    # Write by label and column name: positional access to "the last column"
    # hits the wrong column whenever 'Anomalies_AE' already exists in df
    # (e.g. when the notebook cell is run a second time).
    df['Anomalies_AE'] = 1  # Initialize all values as 1 (normal)
    df.loc[test.index, 'Anomalies_AE'] = outliers

# Deep Support Vector Data Description

In [None]:
def dsvdd(df, train, test, percentile=97):
    """Flag test rows with a Deep SVDD model trained on autoencoder codes.

    A dense autoencoder is trained on ``train``; its 32-unit encoder output is
    used as the representation on which pyod's ``DeepSVDD`` is fitted. A test
    row is an outlier when its anomaly score exceeds the given percentile of
    the test scores.

    Results are written to ``df['Anomalies_DSVDD']``: -1 for outliers, 1 for
    every other row.

    Args:
        df: DataFrame that receives the ``Anomalies_DSVDD`` column; its index
            must contain the labels of ``test``.
        train: Training features (DataFrame).
        test: Features to score (DataFrame); also used as validation data
            while fitting the autoencoder.
        percentile: Percentile of the test anomaly scores used as threshold.
    """
    # Train the autoencoder
    input_dim = train.shape[1]
    input_layer = Input(shape=(input_dim,))

    encoder = Dense(32, activation='relu')(input_layer)  # Adjust the number of nodes in the encoder layer
    # NOTE(review): a sigmoid output can only produce values in (0, 1); if the
    # inputs are standardized (as in autoencoders(), which uses a linear
    # output) they cannot be reconstructed exactly - confirm this is intended.
    decoder = Dense(input_dim, activation='sigmoid')(encoder)
    autoencoder = Model(inputs=input_layer, outputs=decoder)
    autoencoder.compile(optimizer='adam', loss='mean_squared_error')
    autoencoder.fit(train, train, epochs=50, batch_size=32, shuffle=True, validation_data=(test, test))

    # Obtain the encoded representations from the trained autoencoder
    encoder_model = Model(inputs=input_layer, outputs=encoder)
    encoded_train_data = encoder_model.predict(train)
    encoded_test_data = encoder_model.predict(test)

    # Train the Deep SVDD model using the encoded representations
    # NOTE(review): some pyod releases require DeepSVDD(n_features=...);
    # verify against the installed version.
    deep_svdd = DeepSVDD()
    deep_svdd.fit(encoded_train_data)

    # Predict the anomaly scores for the test data
    anomaly_scores = deep_svdd.decision_function(encoded_test_data)

    # Scores above the chosen percentile are treated as anomalous
    threshold = np.percentile(anomaly_scores, percentile)

    # Classify data points as normal (1) or anomalous (-1)
    predictions = np.where(anomaly_scores > threshold, -1, 1)

    # Write by label and column name: positional access to "the last column"
    # hits the wrong column whenever 'Anomalies_DSVDD' already exists in df.
    df['Anomalies_DSVDD'] = 1  # Initialize all values as 1 (inliers)
    df.loc[test.index, 'Anomalies_DSVDD'] = predictions

# MICE
Load files

In [None]:
# Move up 2 directories
data_directory = '../..'

# Build the folder path from separate components: a literal 'Data\Preprocessed'
# contains the invalid escape sequence '\P' and only resolves on Windows.
preprocessed_directory = os.path.join(data_directory, 'Data', 'Preprocessed')

# Load the CSV files
asthma_df = pd.read_csv(os.path.join(preprocessed_directory, 'preprocessed_MICE_asthma.csv'))
healthy_df = pd.read_csv(os.path.join(preprocessed_directory, 'preprocessed_MICE_healthy.csv'))

# Stack both cohorts into one frame with a fresh 0..n-1 index
merged_df = pd.concat([asthma_df, healthy_df], ignore_index=True)

In [None]:
# Preview the first five rows of the merged frame
merged_df.head(n=5)

In [None]:
# Build the train / prediction splits that every model below consumes
train, pred = preprocess_for_models(df=merged_df)

In [None]:
# Run all models and store anomaly results in merged_df.
# Every model receives its own copies of the splits, so that any in-place
# change a model makes to its inputs (e.g. adding a helper column) cannot
# leak into the feature set of the models that run after it.
for run_model in (k_means, iforest, local_of, ocsvm, autoencoders, dsvdd):
    run_model(merged_df, train.copy(), pred.copy())

In [None]:
# Result columns written by the individual models
columns = ['Anomalies_KMeans', 'Anomalies_IF', 'Anomalies_OSVM', 'Anomalies_LOF', 'Anomalies_AE', 'Anomalies_DSVDD']

# Show the raw per-row flags of every model
print(merged_df[columns])

# Count the -1 flags of all columns in one pass, then report them one by one
neg_one_counts = merged_df[columns].eq(-1).sum()
for col, count_neg_ones in neg_one_counts.items():
    print(f"Column '{col}': Count of -1s = {count_neg_ones}")

In [None]:
# Report the (rows, columns) size of the prediction split
pred_rows, pred_cols = pred.shape
print((pred_rows, pred_cols))

In [None]:
# Calculate metrics
# Ground truth in the models' label convention: -1 = event day, 1 = normal day
y_true = np.where(merged_df['EventDay'] == 0, 1, -1)

# NOTE(review): preprocess_for_models drops the rows with EventDay != 0 before
# splitting, so as far as this file shows those rows are never scored and keep
# the default label 1 - recall will stay 0 until event days are also predicted.

# Display name -> result column of each evaluated model
evaluated_models = {
    "K-Means": 'Anomalies_KMeans',
    "iforest": 'Anomalies_IF',
    "ocsvm": 'Anomalies_OSVM',
}

for model_name, result_column in evaluated_models.items():
    print(model_name)
    # labels=[-1, 1] always yields a 2x2 matrix with the anomaly class at
    # index 0, which calculate_metrics reads as the positive class.
    cm = confusion_matrix(y_true, merged_df[result_column].astype(int), labels=[-1, 1])
    display_confusion_matrix(cm)
    calculate_metrics(cm)

# TODO try feature selection