# In [None]:
# ============================================================
# Exercise 10 - Part 2: DBSCAN Equipment Anomaly Detection
# ============================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN

# ============================================================
# 1. Load Dataset
# ============================================================

# The CSV path is relative, so it is resolved against the current working
# directory of the process running this script.
print("Loading equipment anomaly dataset...")
df = pd.read_csv('equipment_anomaly_dataset.csv')

print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:")
print(df.head())

print(f"\nColumn names: {df.columns.tolist()}")
print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()

# ============================================================
# 2. Select and Prepare Features
# ============================================================

rule = "=" * 60
print("", rule, "Selecting features for anomaly detection...", rule, sep="\n")

# Only these sensor-measurement columns go into the feature matrix; every
# other column of df is left out of it.
features_to_use = ['temperature', 'pressure', 'vibration', 'humidity']
X = df.loc[:, features_to_use].to_numpy()

print(f"Selected features: {features_to_use}")
print(f"Feature matrix shape: {X.shape}")

# The 'faulty' column is set aside as ground truth for later comparison;
# its sum is reported as the number of faulty rows.
y_true = df['faulty'].to_numpy()
print(f"\nActual faulty equipment count: {y_true.sum()}")

# ============================================================
# 3. Standardize the Data
# ============================================================

print("\nStandardizing features...")
# Fit the scaler on X, then transform that same matrix.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Per-column mean and standard deviation of the scaled matrix, printed as a
# quick sanity check of the scaling step.
column_means = np.mean(X_scaled, axis=0)
column_stds = np.std(X_scaled, axis=0)
print("Mean after scaling:", column_means)
print("Std after scaling:", column_stds)

# ============================================================
# 4. Apply PCA for 2D Visualization
# ============================================================

rule = "=" * 60
print("", rule, "Applying PCA to reduce to 2D...", rule, sep="\n")

# Project the standardized features onto two principal components.
pca = PCA(n_components=2)
X_2d = pca.fit_transform(X_scaled)

# Share of the variance captured by each of the two retained components.
variance_ratio = pca.explained_variance_ratio_
print(f"PCA shape: {X_2d.shape}")
print(f"Explained variance: {variance_ratio}")
print(f"Total variance explained: {variance_ratio.sum():.2%}")

# ============================================================
# 5. Apply DBSCAN
# ============================================================

rule = "=" * 60
print("", rule, "Applying DBSCAN clustering...", rule, sep="\n")

# DBSCAN parameters
eps, min_samples = 0.5, 5

# Cluster the 2D PCA projection (not the full scaled feature matrix).
dbscan = DBSCAN(eps=eps, min_samples=min_samples).fit(X_2d)
labels = dbscan.labels_

# Points labelled -1 are counted as noise / anomalies; every other distinct
# label counts as one cluster.
noise_mask = labels == -1
n_noise = int(noise_mask.sum())
n_clusters = len(set(labels[~noise_mask]))

print(f"eps: {eps}")
print(f"min_samples: {min_samples}")
print(f"\nEstimated number of clusters: {n_clusters}")
print(f"Estimated number of noise points (anomalies): {n_noise}")
print(f"Percentage of anomalies: {n_noise / len(labels) * 100:.1f}%")

# ============================================================
# 6. Plot Original Data vs DBSCAN Results
# ============================================================

print("\n" + "="*60)
print("Creating visualizations...")
print("="*60)

# Create side-by-side plots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 7))

# --- Plot 1: Original Data (Ground Truth) ---
# The colour scale is pinned to [0, 1] so the mapping from label value to
# colour does not depend on which label values happen to be present.
# NOTE(review): assumes y_true holds 0/1 flags, as the legend labels imply.
truth_cmap = plt.get_cmap('coolwarm')
scatter1 = ax1.scatter(X_2d[:, 0], X_2d[:, 1],
                       c=y_true, cmap=truth_cmap, vmin=0, vmax=1,
                       s=50, alpha=0.7, edgecolor='k')
ax1.set_title('Original Data (Ground Truth)', fontsize=14, fontweight='bold')
ax1.set_xlabel('PC1', fontsize=12)
ax1.set_ylabel('PC2', fontsize=12)
ax1.grid(True, linestyle='--', alpha=0.5)

# Add legend for ground truth. The swatches are sampled from the same
# colormap as the points (the ends of 'coolwarm' are not pure blue / red),
# so the legend shows the colours that are actually drawn.
from matplotlib.lines import Line2D
legend_elements = [
    Line2D([0], [0], marker='o', color='w',
           markerfacecolor=truth_cmap(0.0), markersize=10,
           label='Normal (0)'),
    Line2D([0], [0], marker='o', color='w',
           markerfacecolor=truth_cmap(1.0), markersize=10,
           label='Faulty (1)'),
]
ax1.legend(handles=legend_elements, title="True Labels", loc='best')

# --- Plot 2: DBSCAN Results ---
# The first five entries reproduce the colours used so far (cyan was the
# fallback); the extra entries keep further clusters distinguishable. The
# palette is cycled if there are more clusters than colours.
cluster_palette = ['blue', 'green', 'orange', 'purple', 'cyan',
                   'magenta', 'brown', 'olive', 'teal', 'pink']

# Cluster ids in ascending order, noise (-1) last: the legend order is
# deterministic and the noise markers are drawn on top of the clusters.
plot_order = sorted(set(labels.tolist()), key=lambda k: (k == -1, k))

for k in plot_order:
    class_mask = (labels == k)
    if k == -1:
        # 'x' is an unfilled marker: passing edgecolor for it makes
        # Matplotlib emit a UserWarning and ignore the value anyway.
        ax2.scatter(X_2d[class_mask, 0], X_2d[class_mask, 1],
                    marker='x', c='black', s=80,
                    label='Noise / Anomaly', alpha=0.8)
    else:
        ax2.scatter(X_2d[class_mask, 0], X_2d[class_mask, 1],
                    marker='o', c=cluster_palette[k % len(cluster_palette)],
                    s=50, label=f'Cluster {k}', alpha=0.8, edgecolor='k')

ax2.set_title('DBSCAN Clustering Results', fontsize=14, fontweight='bold')
ax2.set_xlabel('PC1', fontsize=12)
ax2.set_ylabel('PC2', fontsize=12)
ax2.legend(title="DBSCAN Found", loc='best')
ax2.grid(True, linestyle='--', alpha=0.5)

plt.suptitle('DBSCAN Anomaly Detection on Equipment Data',
             fontsize=16, fontweight='bold')
# Leave a strip at the top of the figure for the suptitle.
plt.tight_layout(rect=[0, 0.03, 1, 0.96])
plt.show()

# ============================================================
# 7. Analyze Results
# ============================================================

rule = "=" * 60
print("", rule, "Analysis of Detected Anomalies", rule, sep="\n")

# Boolean masks over the rows: points DBSCAN labelled -1, and all the rest.
anomalies = (labels == -1)
normal_mask = ~anomalies

print(f"\nDetected {anomalies.sum()} anomalies out of {X.shape[0]} samples")
print(f"Actual faulty equipment: {y_true.sum()}")

# Print up to ten of the flagged rows, restricted to the sensor readings
# plus the equipment / location / faulty columns.
print("\nSample anomalous data points:")
anomaly_df = df.loc[anomalies].head(10)
print(anomaly_df.loc[:, ['temperature', 'pressure', 'vibration', 'humidity',
                         'equipment', 'location', 'faulty']])

# Per-feature mean of the non-flagged rows versus the flagged rows.
print("", rule, "Feature Statistics: Normal vs Anomalies", rule, sep="\n")

for feature in features_to_use:
    column = df[feature]
    normal_mean = column[normal_mask].mean()
    anomaly_mean = column[anomalies].mean()
    print(f"\n{feature.upper()}:",
          f"  Normal mean:  {normal_mean:.2f}",
          f"  Anomaly mean: {anomaly_mean:.2f}",
          sep="\n")

# ============================================================
# 8. Feature Distribution Plots
# ============================================================

# Two plots per row; the number of rows follows the length of the feature
# list (a 2x2 grid, figsize 14x10, when there are four features).
n_cols = 2
n_rows = max(1, (len(features_to_use) + n_cols - 1) // n_cols)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 5 * n_rows),
                         squeeze=False)
flat_axes = axes.ravel()

for ax, feature in zip(flat_axes, features_to_use):
    values = df[feature]

    # Both groups are binned on the same 30 edges, computed from the whole
    # column, so the overlaid bars have identical widths and positions and
    # can be compared directly. Missing values are dropped before the edges
    # are computed because the bin range must be finite.
    bin_edges = np.histogram_bin_edges(values.dropna().to_numpy(), bins=30)

    # Plot histograms
    ax.hist(values[normal_mask], bins=bin_edges, alpha=0.6,
            label='Normal', color='blue', edgecolor='black')
    ax.hist(values[anomalies], bins=bin_edges, alpha=0.6,
            label='Anomaly', color='red', edgecolor='black')

    ax.set_xlabel(feature.capitalize(), fontsize=11)
    ax.set_ylabel('Frequency', fontsize=11)
    ax.set_title(f'{feature.capitalize()} Distribution',
                 fontsize=12, fontweight='bold')
    ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3)

# Hide the spare axes left over when the feature count does not fill the grid.
for spare_ax in flat_axes[len(features_to_use):]:
    spare_ax.set_visible(False)

plt.suptitle('Feature Distributions: Normal vs Detected Anomalies',
             fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

print("\n" + "="*60)
print("DBSCAN Anomaly Detection Complete!")
print("="*60)