# Lab 5: Anomaly detection with PCA, kernel PCA and autoencoders

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from scipy.io import loadmat
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from pyod.models.pca import PCA as PyOD_PCA
from pyod.models.kpca import KPCA
from pyod.utils.utility import standardizer

np.random.seed(42)

## Ex 1

### 1.1

In [None]:
# Draw 500 points from a correlated 3-D Gaussian (driven by the global NumPy seed).
mean = [5, 10, 2]
cov = [[3, 2, 2], [2, 10, 1], [2, 1, 2]]
X = np.random.multivariate_normal(mean, cov, 500)

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(*X.T, c='blue', alpha=0.6)
ax.set(xlabel='X', ylabel='Y', zlabel='Z', title='3D Dataset')
plt.show()

# Manual PCA: centre the data, build the 1/n covariance matrix and
# eigendecompose it (eigh, because the matrix is symmetric).
X_centered = X - X.mean(axis=0)
cov_matrix = X_centered.T.dot(X_centered) / X.shape[0]
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

# Reorder so the component with the largest variance comes first.
idx = np.flip(np.argsort(eigenvalues))
eigenvalues, eigenvectors = eigenvalues[idx], eigenvectors[:, idx]

### 1.2

In [None]:
# Share of the total variance carried by each component, and its running total.
total_variance = eigenvalues.sum()
explained_ratio = eigenvalues / total_variance
cumulative_ratio = explained_ratio.cumsum()

component_ids = [1, 2, 3]  # x positions of the three principal components

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(component_ids, explained_ratio, alpha=0.7, label='Individual Variance', color='steelblue')
ax.step(component_ids, cumulative_ratio, where='mid', label='Cumulative Variance', color='red', linewidth=2)
ax.set(
    xlabel='Principal Component',
    ylabel='Explained Variance Ratio',
    title='Cumulative and Individual Explained Variance',
)
ax.set_xticks(component_ids)
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

print("Individual variance ratios:", explained_ratio)
print("Cumulative variance ratios:", cumulative_ratio)

### 1.3

In [None]:
# Project the centred data onto the principal axes (columns sorted by variance).
X_transformed = np.dot(X_centered, eigenvectors)

contamination = 0.1

# Score each point by how far its 3rd-component coordinate lies from the mean
# of that coordinate, then flag the top `contamination` fraction.
pc3_values = X_transformed[:, 2]
pc3_mean = pc3_values.mean()
pc3_deviation = np.abs(pc3_values - pc3_mean)

threshold_pc3 = np.quantile(pc3_deviation, 1 - contamination)
pc3_outlier_mask = pc3_deviation > threshold_pc3
y_pred_pc3 = pc3_outlier_mask.astype(int)

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(*X[~pc3_outlier_mask].T, c='blue', alpha=0.6, label='Normal')
ax.scatter(*X[pc3_outlier_mask].T, c='red', alpha=0.8, label='Anomaly', s=50)
ax.set(xlabel='X', ylabel='Y', zlabel='Z', title='Outliers based on 3rd Principal Component')
ax.legend()
plt.show()

print(f"anomalies detected: {y_pred_pc3.sum()}")

In [None]:
# Same procedure as for the 3rd component, now using the 2nd component's coordinate.
pc2_values = X_transformed[:, 1]
pc2_mean = pc2_values.mean()
pc2_deviation = np.abs(pc2_values - pc2_mean)

threshold_pc2 = np.quantile(pc2_deviation, 1 - contamination)
pc2_outlier_mask = pc2_deviation > threshold_pc2
y_pred_pc2 = pc2_outlier_mask.astype(int)

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(*X[~pc2_outlier_mask].T, c='blue', alpha=0.6, label='Normal')
ax.scatter(*X[pc2_outlier_mask].T, c='red', alpha=0.8, label='Anomaly', s=50)
ax.set(xlabel='X', ylabel='Y', zlabel='Z', title='Outliers based on 2nd Principal Component')
ax.legend()
plt.show()

print(f"anomalies detected: {y_pred_pc2.sum()}")

### 1.4

In [None]:
# Rescale every principal coordinate by the standard deviation of its component
# (sqrt of the eigenvalue) so that all components contribute on the same scale.
X_normalized = X_transformed / np.sqrt(eigenvalues)

# Anomaly score = squared Euclidean norm in the rescaled PCA space.
anomaly_scores = np.square(X_normalized).sum(axis=1)

threshold_all = np.quantile(anomaly_scores, 1 - contamination)
all_outlier_mask = anomaly_scores > threshold_all
y_pred_all = all_outlier_mask.astype(int)

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(*X[~all_outlier_mask].T, c='blue', alpha=0.6, label='Normal')
ax.scatter(*X[all_outlier_mask].T, c='red', alpha=0.8, label='Anomaly', s=50)
ax.set(
    xlabel='X',
    ylabel='Y',
    zlabel='Z',
    title='Outliers based on Normalized Distance (All Principal Components)',
)
ax.legend()
plt.show()

print(f"anomalies detected: {y_pred_all.sum()}")

## Ex 2

### 2.1

In [None]:
# Load the shuttle dataset: feature matrix under 'X', labels under 'y'.
# Assumes labels are 0/1 with 1 marking outliers (the counts below rely on it).
mat_data = loadmat('../lab3/shuttle.mat')
X_shuttle = mat_data['X']
y_shuttle = mat_data['y'].ravel()

# NumPy reductions instead of the builtin sum(): they are vectorised, and they
# accumulate in a wide dtype even if the .mat file stores the labels in a small
# integer type, where element-by-element Python summation could wrap around.
n_outliers = np.sum(y_shuttle)
print(f"Number of outliers: {n_outliers} ({np.mean(y_shuttle)*100:.2f}%)")

# 60/40 stratified split keeps the outlier proportion equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X_shuttle, y_shuttle, train_size=0.6, random_state=42, stratify=y_shuttle
)

# Fraction of outliers in the training labels; passed to the detectors as a plain float.
contamination_rate = float(np.mean(y_train))
print(f"Training contamination rate: {contamination_rate:.4f}")

# Standardise both splits together via PyOD's helper.
X_train_std, X_test_std = standardizer(X_train, X_test)

pca_model = PyOD_PCA(contamination=contamination_rate)
pca_model.fit(X_train_std)

# Per-component and cumulative share of the variance captured by the fitted model.
explained_var = pca_model.explained_variance_
explained_ratio = explained_var / np.sum(explained_var)
cumulative_ratio = np.cumsum(explained_ratio)

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(range(1, len(explained_ratio)+1), explained_ratio, alpha=0.7, label='Individual Variance', color='steelblue')
ax.step(range(1, len(cumulative_ratio)+1), cumulative_ratio, where='mid', label='Cumulative Variance', color='red', linewidth=2)
ax.set_xlabel('Principal Component')
ax.set_ylabel('Explained Variance Ratio')
ax.set_title('PCA - Cumulative and Individual Explained Variance (Shuttle Dataset)')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

### 2.2

In [None]:
# --- Linear PCA detector (already fitted on X_train_std) ---
# labels_ holds the training-set predictions stored by fit(); the test split
# is scored with predict().
y_pred_train_pca, y_pred_test_pca = pca_model.labels_, pca_model.predict(X_test_std)

ba_train_pca, ba_test_pca = (
    balanced_accuracy_score(y_train, y_pred_train_pca),
    balanced_accuracy_score(y_test, y_pred_test_pca),
)

print(
    "PCA Results:",
    f"Train Balanced Accuracy: {ba_train_pca:.4f}",
    f"Test Balanced Accuracy: {ba_test_pca:.4f}",
    sep="\n",
)

# --- Kernel PCA detector, same contamination rate ---
# NOTE(review): kernel PCA operates on a kernel matrix over the training
# samples; if this cell is slow or runs out of memory on the full training
# split, look at KPCA's subsampling options - confirm against the PyOD docs.
kpca_model = KPCA(contamination=contamination_rate)
kpca_model.fit(X_train_std)

y_pred_train_kpca, y_pred_test_kpca = kpca_model.labels_, kpca_model.predict(X_test_std)

ba_train_kpca, ba_test_kpca = (
    balanced_accuracy_score(y_train, y_pred_train_kpca),
    balanced_accuracy_score(y_test, y_pred_test_kpca),
)

print(
    "\nKPCA Results:",
    f"Train Balanced Accuracy: {ba_train_kpca:.4f}",
    f"Test Balanced Accuracy: {ba_test_kpca:.4f}",
    sep="\n",
)

## Ex 3

### 3.1

In [None]:
import tensorflow as tf
from tensorflow import keras

# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialisation, batch shuffling and tf.random draws are repeatable across
# runs; the np.random.seed call at the top of the notebook does not reach
# TensorFlow.
keras.utils.set_random_seed(42)

# Reload the shuttle data so this exercise does not depend on Ex 2 variables.
mat_data = loadmat('../lab3/shuttle.mat')
X_shuttle = mat_data['X']
y_shuttle = mat_data['y'].ravel()

# 50/50 stratified split for the autoencoder experiment.
X_train_ae, X_test_ae, y_train_ae, y_test_ae = train_test_split(
    X_shuttle, y_shuttle, test_size=0.5, random_state=42, stratify=y_shuttle
)

# Min-max scaling with statistics taken from the training split only; the
# small epsilon guards against division by zero for constant features.
X_min = X_train_ae.min(axis=0)
X_max = X_train_ae.max(axis=0)
X_train_norm = (X_train_ae - X_min) / (X_max - X_min + 1e-8)
X_test_norm = (X_test_ae - X_min) / (X_max - X_min + 1e-8)

print(f"Training set: {X_train_norm.shape}")
print(f"Test set: {X_test_norm.shape}")
# np.mean instead of builtin sum()/len(): vectorised and safe for any label dtype.
# Assumes labels are 0/1 with 1 marking outliers.
print(f"Contamination rate: {np.mean(y_train_ae):.4f}")

### 3.2 Design Autoencoder class

In [None]:
class Autoencoder(keras.Model):
    """Fully connected autoencoder with a 3-unit bottleneck.

    The encoder is a stack of ReLU Dense layers of width 8 -> 5 -> 3; the
    decoder mirrors it with widths 5 -> 8 and ends in a sigmoid Dense layer
    that produces the reconstruction. The sigmoid bounds each reconstructed
    value to (0, 1).

    Args:
        n_features: Number of units in the reconstruction layer; it has to
            equal the number of input features so that input and output can
            be compared element-wise. Defaults to 9, the width this model
            previously hard-coded.
    """

    def __init__(self, n_features=9):
        super().__init__()

        self.encoder = keras.Sequential([
            keras.layers.Dense(8, activation='relu'),
            keras.layers.Dense(5, activation='relu'),
            keras.layers.Dense(3, activation='relu')
        ])

        self.decoder = keras.Sequential([
            keras.layers.Dense(5, activation='relu'),
            keras.layers.Dense(8, activation='relu'),
            keras.layers.Dense(n_features, activation='sigmoid')
        ])

    def call(self, x):
        """Encode `x` to the bottleneck and return its reconstruction."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

### 3.3

In [None]:
# Train the autoencoder to reproduce its own (normalised) input.
autoencoder = Autoencoder()
autoencoder.compile(optimizer='adam', loss='mse')

fit_options = dict(
    epochs=100,
    batch_size=1024,
    # The test split is used here only to monitor the reconstruction loss.
    validation_data=(X_test_norm, X_test_norm),
    verbose=1,
)
history = autoencoder.fit(X_train_norm, X_train_norm, **fit_options)

# Learning curves: training vs. validation reconstruction loss per epoch.
fig, ax = plt.subplots(figsize=(10, 6))
for history_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    ax.plot(history.history[history_key], label=curve_label)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss (MSE)')
ax.set_title('Autoencoder Training and Validation Loss')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

### 3.4

In [None]:
# Reconstruct both splits and score every sample by its mean squared
# reconstruction error across features.
train_reconstructed = autoencoder.predict(X_train_norm)
test_reconstructed = autoencoder.predict(X_test_norm)

train_errors = np.mean((X_train_norm - train_reconstructed) ** 2, axis=1)
test_errors = np.mean((X_test_norm - test_reconstructed) ** 2, axis=1)

# Outlier fraction of the training labels (assumes 0/1 labels, 1 = outlier).
# np.mean replaces builtin sum()/len(): vectorised, and it accumulates in
# float64 regardless of the dtype the labels were stored in.
contamination_ae = float(np.mean(y_train_ae))

# Flag the top `contamination_ae` fraction of training errors; the same
# threshold is then applied unchanged to the test split.
threshold = np.quantile(train_errors, 1 - contamination_ae)

y_pred_train_ae = (train_errors > threshold).astype(int)
y_pred_test_ae = (test_errors > threshold).astype(int)

ba_train_ae = balanced_accuracy_score(y_train_ae, y_pred_train_ae)
ba_test_ae = balanced_accuracy_score(y_test_ae, y_pred_test_ae)

print(f"Threshold: {threshold:.6f}")
print(f"Training Balanced Accuracy: {ba_train_ae:.4f}")
print(f"Test Balanced Accuracy: {ba_test_ae:.4f}")

## Ex 4

### 4.1

In [None]:
# MNIST images only; the digit labels are not needed.
(x_train_mnist, _), (x_test_mnist, _) = keras.datasets.mnist.load_data()

# Scale pixel values to [0, 1] and append a trailing channel axis for Conv2D.
x_train_mnist = x_train_mnist.astype('float32') / 255.0
x_test_mnist = x_test_mnist.astype('float32') / 255.0

x_train_mnist = x_train_mnist[..., np.newaxis]
x_test_mnist = x_test_mnist[..., np.newaxis]

# Additive Gaussian noise. Each draw gets its own op-level seed so the noisy
# images are the same after every kernel restart (the NumPy seed set at the
# top of the notebook does not affect tf.random), while train and test noise
# stay distinct from each other.
noise_factor = 0.35
x_train_noisy = x_train_mnist + noise_factor * tf.random.normal(shape=x_train_mnist.shape, seed=42)
x_test_noisy = x_test_mnist + noise_factor * tf.random.normal(shape=x_test_mnist.shape, seed=43)

# Clip back into the valid pixel range after adding noise.
x_train_noisy = tf.clip_by_value(x_train_noisy, 0.0, 1.0)
x_test_noisy = tf.clip_by_value(x_test_noisy, 0.0, 1.0)

print(f"Training data shape: {x_train_mnist.shape}")
print(f"Test data shape: {x_test_mnist.shape}")

### 4.2

In [None]:
class ConvAutoencoder(keras.Model):
    """Convolutional autoencoder built from two strided stages per side.

    The encoder applies two 3x3 ReLU convolutions with stride 2 (8 then 4
    filters). The decoder applies two 3x3 ReLU transposed convolutions with
    stride 2 (4 then 8 filters) and a final 3x3 sigmoid convolution with a
    single output channel. All layers use 'same' padding.
    """

    def __init__(self):
        super().__init__()

        # Settings shared by every strided layer on both sides of the bottleneck.
        strided = dict(activation='relu', strides=2, padding='same')

        downsampling = [
            keras.layers.Conv2D(8, (3, 3), **strided),
            keras.layers.Conv2D(4, (3, 3), **strided),
        ]
        self.encoder = keras.Sequential(downsampling)

        upsampling = [
            keras.layers.Conv2DTranspose(4, (3, 3), **strided),
            keras.layers.Conv2DTranspose(8, (3, 3), **strided),
            keras.layers.Conv2D(1, (3, 3), activation='sigmoid', padding='same'),
        ]
        self.decoder = keras.Sequential(upsampling)

    def call(self, x):
        """Return the decoder's reconstruction of the encoded input."""
        return self.decoder(self.encoder(x))

### 4.3

In [None]:
# Train on clean images only: input and target are the same clean image.
conv_autoencoder = ConvAutoencoder()
conv_autoencoder.compile(optimizer='adam', loss='mse')

conv_fit_options = dict(
    epochs=10,
    batch_size=64,
    validation_data=(x_test_mnist, x_test_mnist),
    verbose=1,
)
history_conv = conv_autoencoder.fit(x_train_mnist, x_train_mnist, **conv_fit_options)

# Per-image reconstruction error = squared error averaged over height, width and channel.
image_axes = (1, 2, 3)

train_reconstructed_conv = conv_autoencoder.predict(x_train_mnist)
train_errors_conv = np.mean(np.square(x_train_mnist - train_reconstructed_conv), axis=image_axes)

# Decision threshold: mean plus one standard deviation of the training errors.
threshold_conv = train_errors_conv.mean() + train_errors_conv.std()
print(f"Threshold: {threshold_conv:.6f}")

test_reconstructed_orig = conv_autoencoder.predict(x_test_mnist)
test_reconstructed_noisy = conv_autoencoder.predict(x_test_noisy)

test_errors_orig = np.mean(np.square(x_test_mnist - test_reconstructed_orig), axis=image_axes)
test_errors_noisy = np.mean(np.square(x_test_noisy.numpy() - test_reconstructed_noisy), axis=image_axes)

# Ground truth: every clean test image is normal (0), every noisy one anomalous (1).
y_true_orig = np.zeros(len(x_test_mnist))
y_true_noisy = np.ones(len(x_test_noisy))

y_pred_orig = (test_errors_orig > threshold_conv).astype(int)
y_pred_noisy = (test_errors_noisy > threshold_conv).astype(int)

# Accuracy = fraction of images whose predicted label equals the ground truth.
acc_orig = (y_pred_orig == y_true_orig).mean()
acc_noisy = (y_pred_noisy == y_true_noisy).mean()

print(f"Accuracy on original test images: {acc_orig:.4f}")
print(f"Accuracy on noisy test images: {acc_noisy:.4f}")

### 4.4

In [None]:
n_images = 5
fig, axes = plt.subplots(4, n_images, figsize=(15, 12))

# (row title, image stack) pairs in top-to-bottom display order.
panel_rows = [
    ('Original', x_test_mnist),
    ('Noisy', x_test_noisy),
    ('Reconstructed (Original)', test_reconstructed_orig),
    ('Reconstructed (Noisy)', test_reconstructed_noisy),
]

for row, (row_title, images) in enumerate(panel_rows):
    for col in range(n_images):
        panel = axes[row, col]
        # np.asarray also converts the TensorFlow tensor items of the noisy stack.
        panel.imshow(np.asarray(images[col]).squeeze(), cmap='gray')
        panel.axis('off')
    # Only the first panel of each row carries the row title.
    axes[row, 0].set_title(row_title, fontsize=12)

plt.suptitle('Standard Convolutional Autoencoder', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

### 4.5

In [None]:
# Denoising variant: same architecture, but trained to map noisy inputs to
# their clean originals.
denoising_autoencoder = ConvAutoencoder()
denoising_autoencoder.compile(optimizer='adam', loss='mse')

# Fresh training noise with an explicit op-level seed so the corrupted
# training set is identical after every kernel restart (the NumPy seed set at
# the top of the notebook does not affect tf.random).
x_train_noisy_dae = x_train_mnist + noise_factor * tf.random.normal(shape=x_train_mnist.shape, seed=44)
x_train_noisy_dae = tf.clip_by_value(x_train_noisy_dae, 0.0, 1.0)

history_dae = denoising_autoencoder.fit(
    x_train_noisy_dae, x_train_mnist,
    epochs=10,
    batch_size=64,
    # Validation mirrors training: noisy test input, clean test target.
    validation_data=(x_test_noisy, x_test_mnist),
    verbose=1
)

test_reconstructed_orig_dae = denoising_autoencoder.predict(x_test_mnist)
test_reconstructed_noisy_dae = denoising_autoencoder.predict(x_test_noisy)

In [None]:
n_images = 5
fig, axes = plt.subplots(4, n_images, figsize=(15, 12))

# (row title, image stack) pairs in top-to-bottom display order.
dae_panel_rows = [
    ('Original', x_test_mnist),
    ('Noisy', x_test_noisy),
    ('Reconstructed (Original)', test_reconstructed_orig_dae),
    ('Reconstructed (Noisy)', test_reconstructed_noisy_dae),
]

for row, (row_title, images) in enumerate(dae_panel_rows):
    for col in range(n_images):
        panel = axes[row, col]
        # np.asarray also converts the TensorFlow tensor items of the noisy stack.
        panel.imshow(np.asarray(images[col]).squeeze(), cmap='gray')
        panel.axis('off')
    # Only the first panel of each row carries the row title.
    axes[row, 0].set_title(row_title, fontsize=12)

plt.suptitle('Denoising Convolutional Autoencoder', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()