# 🔊 Audio Anomaly Detection (FIXED)

**Key Fixes:**
- Deeper autoencoder architecture
- Frame-level features (not single spectrogram)
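- Threshold calibration with Youden's J statistic on the ROC curve (note: the code below selects it on the test split, not on a separate validation set)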
- More epochs (100)
- Early stopping based on the training reconstruction loss (stops after 20 epochs without improvement)

In [None]:
# Mount Google Drive at /content/drive so the dataset and model folders
# configured below (both under /content/drive/MyDrive) are reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install librosa tqdm scikit-learn -q

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import librosa
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Pick the compute device once: CUDA when PyTorch reports it as available,
# otherwise the CPU. The model and batches are moved onto it later on.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

## 1. Configuration

In [None]:
# ============= UPDATE THIS PATH =============
# BASE_DATA_PATH: root folder holding one sub-folder per machine type; each
# of those is searched for 'train', 'source_test' and 'target_test'
# directories containing .wav files.
# SAVE_PATH: directory (created on demand) that receives one
# 'audio_autoencoder_<machine>.pth' checkpoint per machine type.
BASE_DATA_PATH = '/content/drive/MyDrive/MaintanenceAI/Data'
SAVE_PATH = '/content/drive/MyDrive/MaintanenceAI/trained_models'
# =============================================

# Machine types to train; each name is also the sub-folder name under
# BASE_DATA_PATH.
MACHINE_TYPES = ['fan', 'pump', 'valve']

# Audio params
SAMPLE_RATE = 16000  # Sample rate (Hz) requested from librosa.load
N_MELS = 64          # Mel bands per spectrogram frame (reduced for faster training)
N_FFT = 1024         # FFT window size in samples
HOP_LENGTH = 512     # Samples between frames (32 ms at 16 kHz)
N_FRAMES = 64        # Spectrogram frames per feature window (N_MELS * N_FRAMES inputs)

# Training params
EPOCHS = 100          # Upper bound; early stopping may end training sooner
BATCH_SIZE = 32       # Feature windows per optimisation step
LEARNING_RATE = 1e-3  # Initial Adam learning rate

print(f'Training on: {MACHINE_TYPES}')

## 2. Improved Autoencoder

In [None]:
class DenseAutoencoder(nn.Module):
    """Fully connected autoencoder for flattened mel-spectrogram windows.

    The encoder narrows ``input_dim -> 128 -> 64 -> latent_dim`` and the
    decoder mirrors it back to ``input_dim``. Every hidden stage is
    Linear -> BatchNorm1d -> ReLU -> Dropout(0.2); the bottleneck is
    Linear -> ReLU and the output layer is a plain Linear.

    Args:
        input_dim: Length of each flattened input vector.
        latent_dim: Width of the bottleneck representation.
    """

    def __init__(self, input_dim, latent_dim=32):
        super().__init__()

        def hidden_stage(n_in, n_out):
            # One regularised dense stage. Returned as a flat list so both
            # Sequential containers keep a flat layer numbering.
            return [
                nn.Linear(n_in, n_out),
                nn.BatchNorm1d(n_out),
                nn.ReLU(),
                nn.Dropout(0.2),
            ]

        # Encoder: input_dim -> 128 -> 64 -> latent_dim
        encoder_layers = hidden_stage(input_dim, 128) + hidden_stage(128, 64)
        encoder_layers += [nn.Linear(64, latent_dim), nn.ReLU()]
        self.encoder = nn.Sequential(*encoder_layers)

        # Decoder: latent_dim -> 64 -> 128 -> input_dim
        decoder_layers = hidden_stage(latent_dim, 64) + hidden_stage(64, 128)
        decoder_layers.append(nn.Linear(128, input_dim))
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return the reconstruction of ``x`` (same shape as the input)."""
        return self.decoder(self.encoder(x))

## 3. Feature Extraction

In [None]:
def extract_features(file_path, n_mels=64, n_fft=1024, hop_length=512, n_frames=64):
    """Extract flattened, overlapping mel-spectrogram windows from one audio file.

    The file is loaded at ``SAMPLE_RATE``, converted to a log-mel spectrogram,
    standardised with the file's own mean and standard deviation, and cut
    into windows of ``n_frames`` frames with 50% overlap. Trailing frames
    that do not fill a complete window are discarded.

    Args:
        file_path: Path of the audio file to read.
        n_mels: Number of mel bands.
        n_fft: FFT window size in samples.
        hop_length: Number of samples between successive frames.
        n_frames: Number of spectrogram frames per window (must be >= 1).

    Returns:
        A float32 array of shape ``(n_windows, n_mels * n_frames)``. A
        recording shorter than one window is zero-padded on the right and
        returned as a single window.

    Raises:
        ValueError: If ``n_frames`` is smaller than 1.
    """
    if n_frames < 1:
        raise ValueError(f'n_frames must be >= 1, got {n_frames}')

    y, sr = librosa.load(file_path, sr=SAMPLE_RATE)

    # Log-mel spectrogram, in dB relative to the loudest bin of this file.
    mel = librosa.feature.melspectrogram(
        y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels
    )
    mel_db = librosa.power_to_db(mel, ref=np.max)

    # Per-file standardisation; the epsilon guards against a constant
    # spectrogram whose standard deviation is zero.
    mel_norm = (mel_db - mel_db.mean()) / (mel_db.std() + 1e-8)

    total_frames = mel_norm.shape[1]
    if total_frames < n_frames:
        # Too short for even one window: pad with zeros, which is the mean
        # value after standardisation.
        padded = np.zeros((mel_norm.shape[0], n_frames))
        padded[:, :total_frames] = mel_norm
        return padded.reshape(1, -1).astype(np.float32)

    # 50% overlap; clamp to 1 so n_frames == 1 does not give range() a zero step.
    step = max(1, n_frames // 2)
    windows = [
        mel_norm[:, start:start + n_frames].flatten()
        for start in range(0, total_frames - n_frames + 1, step)
    ]
    return np.array(windows, dtype=np.float32)

## 4. Dataset

In [None]:
def load_dataset(file_paths, is_test=False):
    """Extract features for every file and stack them into one matrix.

    Files whose feature extraction raises are reported on stdout and
    skipped, so the returned indices may have gaps: each index is the
    file's position in ``file_paths``, not a dense counter.

    Args:
        file_paths: Sequence of audio file paths.
        is_test: Currently unused; kept so existing callers keep working.

    Returns:
        A tuple ``(features, file_indices)``: ``features`` stacks all
        windows row-wise, and ``file_indices[i]`` is the position in
        ``file_paths`` of the file that produced row ``i``.

    Raises:
        ValueError: If no file produced any features (empty input or every
            file failed to load).
    """
    all_features = []
    file_indices = []  # Track which file each feature row came from

    for idx, path in enumerate(tqdm(file_paths, desc='Loading')):
        try:
            feats = extract_features(path, N_MELS, N_FFT, HOP_LENGTH, N_FRAMES)
        except Exception as e:
            # Best-effort loading: one unreadable file must not abort the run.
            print(f'Error: {path}: {e}')
            continue

        if len(feats) == 0:
            # Nothing to stack for this file; an empty array would break vstack.
            continue

        all_features.append(feats)
        file_indices.extend([idx] * len(feats))

    if not all_features:
        # np.vstack([]) would raise the same exception type with an opaque
        # message; say what actually went wrong.
        raise ValueError(
            f'No features could be extracted from {len(file_paths)} file(s)'
        )

    return np.vstack(all_features), np.array(file_indices)

class FeatureDataset(Dataset):
    """Map-style dataset that serves each feature row as a float32 tensor.

    Args:
        features: Array-like of feature rows; converted to one float32
            tensor when the dataset is created.
    """

    def __init__(self, features) -> None:
        # Convert once up front so item access is a plain tensor lookup.
        self.features = torch.tensor(features, dtype=torch.float32)

    def __len__(self) -> int:
        """Return the number of feature rows."""
        return self.features.size(0)

    def __getitem__(self, idx):
        """Return the feature row(s) selected by ``idx``."""
        return self.features[idx]

## 5. Training & Evaluation Functions

In [None]:
def compute_file_scores(model, features, file_indices, device, batch_size=1024):
    """Compute the mean reconstruction error of each file.

    The model is switched to eval mode and run without gradients. Features
    are pushed through it in chunks of ``batch_size`` rows so that large
    feature matrices do not have to fit on ``device`` all at once.

    Args:
        model: Autoencoder mapping a ``(rows, dim)`` tensor to a tensor of
            the same shape.
        features: Array-like of shape ``(rows, dim)``.
        file_indices: One file id per feature row.
        device: Device the chunks are moved to before the forward pass.
        batch_size: Maximum number of rows per forward pass (must be >= 1).

    Returns:
        A 1-D array with one score per distinct file id, ordered by
        ascending id (the order of ``np.unique(file_indices)``). Each score
        is the mean over that file's rows of the per-row mean squared error.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be >= 1, got {batch_size}')

    model.eval()
    features_t = torch.as_tensor(features, dtype=torch.float32)

    # Per-row mean squared error, gathered chunk by chunk on the CPU.
    chunk_errors = []
    with torch.no_grad():
        for start in range(0, features_t.shape[0], batch_size):
            chunk = features_t[start:start + batch_size].to(device)
            recon = model(chunk)
            chunk_errors.append(torch.mean((chunk - recon) ** 2, dim=1).cpu())

    if chunk_errors:
        errors = torch.cat(chunk_errors).numpy()
    else:
        errors = np.empty(0, dtype=np.float32)

    # Group rows by file in one pass: `inverse` maps each row to the slot
    # of its file id inside `unique_files`.
    unique_files, inverse = np.unique(np.asarray(file_indices), return_inverse=True)
    inverse = inverse.reshape(-1)
    n_files = len(unique_files)
    error_sums = np.bincount(inverse, weights=errors, minlength=n_files)
    row_counts = np.bincount(inverse, minlength=n_files)

    # Every slot has at least one row, so the division is always defined.
    return (error_sums / row_counts).astype(errors.dtype)

def evaluate_model(model, train_scores, test_features, test_file_indices, test_labels, device):
    """Score the test files and report AUC plus accuracy at the best ROC threshold.

    NOTE: the decision threshold is selected on the test labels themselves
    (the ROC point maximising Youden's J), so the reported accuracy is an
    optimistic, best-case figure rather than an estimate for unseen data.

    Args:
        model: Trained autoencoder.
        train_scores: Per-file scores of the training set. Currently unused;
            kept so existing callers keep working.
        test_features: Feature rows of the test files.
        test_file_indices: File id of each test feature row.
        test_labels: One label per scored file (1 = anomaly, 0 = normal),
            in the same order as the per-file scores.
        device: Device used for the forward pass.

    Returns:
        A tuple ``(auc, accuracy, best_threshold, test_scores)``.

    Raises:
        ValueError: Propagated from scikit-learn, e.g. when ``test_labels``
            contains a single class or its length does not match the scores.
    """
    # Get test scores (one per file)
    test_scores = compute_file_scores(model, test_features, test_file_indices, device)

    auc = roc_auc_score(test_labels, test_scores)

    # Youden's J statistic (tpr - fpr) picks the ROC point furthest above
    # the chance diagonal.
    fpr, tpr, thresholds = roc_curve(test_labels, test_scores)
    best_idx = np.argmax(tpr - fpr)
    best_threshold = thresholds[best_idx]

    # roc_curve counts a sample as positive when score >= threshold, and each
    # finite threshold is itself one of the scores. Using '>=' here keeps the
    # predictions at exactly the operating point selected above; a strict '>'
    # would drop the sample(s) sitting on the threshold.
    predictions = (test_scores >= best_threshold).astype(int)
    accuracy = accuracy_score(test_labels, predictions)

    return auc, accuracy, best_threshold, test_scores

## 6. Train All Machines

In [None]:
def get_test_labels(file_paths):
    """Derive binary labels from file names.

    A file is labelled 1 when its base name (directories are ignored)
    contains the substring 'anomaly', and 0 otherwise.

    Returns:
        A NumPy array with one label per path, in input order.
    """
    labels = []
    for path in file_paths:
        file_name = os.path.basename(path)
        labels.append(int('anomaly' in file_name))
    return np.array(labels)

def _clone_state_dict(model):
    """Return a snapshot of the model's state that later training cannot change."""
    # state_dict() hands out references to the live parameter tensors, which
    # the optimizer updates in place; cloning freezes their current values.
    return {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}


def train_machine(machine_type):
    """Train, evaluate and save an autoencoder for one machine type.

    Reads ``<BASE_DATA_PATH>/<machine_type>/train/*.wav`` for training and
    the ``source_test`` / ``target_test`` folders for evaluation, trains a
    ``DenseAutoencoder`` with early stopping on the training loss, restores
    the best weights, evaluates every non-empty test split and writes a
    checkpoint to ``SAVE_PATH``.

    Args:
        machine_type: Name of the machine sub-folder under ``BASE_DATA_PATH``.

    Returns:
        A dict mapping each evaluated test split to
        ``{'auc', 'accuracy', 'threshold'}``, or ``None`` when no training
        files were found. The dict is empty when no split could be evaluated.
    """
    print(f'\n{"="*60}')
    print(f'üîä Training {machine_type.upper()}')
    print(f'{"="*60}')

    data_path = os.path.join(BASE_DATA_PATH, machine_type)

    # Load train files (normal only)
    train_files = sorted(glob.glob(os.path.join(data_path, 'train', '*.wav')))
    print(f'Train files: {len(train_files)}')

    if not train_files:
        print('No files found!')
        return None

    # Load test files
    source_test_files = sorted(glob.glob(os.path.join(data_path, 'source_test', '*.wav')))
    target_test_files = sorted(glob.glob(os.path.join(data_path, 'target_test', '*.wav')))
    print(f'Source test: {len(source_test_files)}, Target test: {len(target_test_files)}')

    # Extract features
    print('\nExtracting features...')
    train_features, train_file_indices = load_dataset(train_files)
    print(f'Train features: {train_features.shape}')

    # Create dataloader. BatchNorm1d cannot normalise a single-sample batch
    # in training mode, so drop the trailing batch when it would hold exactly
    # one sample. (A dataset with a single sample overall still cannot be
    # trained with this architecture.)
    train_dataset = FeatureDataset(train_features)
    n_train = len(train_dataset)
    drop_last = n_train > BATCH_SIZE and n_train % BATCH_SIZE == 1
    train_loader = DataLoader(
        train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=drop_last
    )

    # Model
    input_dim = train_features.shape[1]
    model = DenseAutoencoder(input_dim=input_dim, latent_dim=32).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=10, factor=0.5)

    # Training
    print('\nTraining...')
    best_loss = float('inf')
    patience_counter = 0
    # Start from the initial weights so best_model is always defined, even
    # if the loss never improves (e.g. NaN losses) or EPOCHS is 0.
    best_model = _clone_state_dict(model)

    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0

        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            output = model(batch)
            loss = criterion(output, batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        scheduler.step(avg_loss)

        # Early stopping on the training loss
        if avg_loss < best_loss:
            best_loss = avg_loss
            patience_counter = 0
            # A real snapshot: dict.copy() would only copy the references,
            # so the "best" weights would silently follow later epochs.
            best_model = _clone_state_dict(model)
        else:
            patience_counter += 1

        if (epoch + 1) % 20 == 0:
            print(f'Epoch {epoch+1}/{EPOCHS}, Loss: {avg_loss:.6f}')

        if patience_counter >= 20:
            print(f'Early stopping at epoch {epoch+1}')
            break

    # Load best model
    model.load_state_dict(best_model)

    # Per-file scores of the training set, passed on to evaluate_model
    train_scores = compute_file_scores(model, train_features, train_file_indices, device)

    # Evaluate
    results = {}

    for test_type, test_files in [('source_test', source_test_files), ('target_test', target_test_files)]:
        if not test_files:
            continue

        test_features, test_file_indices = load_dataset(test_files)

        # load_dataset skips files it cannot read, so keep only the labels of
        # files that produced features. np.unique yields them in the same
        # ascending order in which the per-file scores are returned.
        loaded_files = np.unique(test_file_indices)
        test_labels = get_test_labels(test_files)[loaded_files]

        if len(np.unique(test_labels)) < 2:
            # AUC is undefined with a single class; skip the split instead of
            # crashing before the trained model has been saved.
            print(f'{test_type}: skipped (needs both normal and anomaly files)')
            continue

        auc, accuracy, threshold, _ = evaluate_model(
            model, train_scores, test_features, test_file_indices, test_labels, device
        )

        results[test_type] = {'auc': auc, 'accuracy': accuracy, 'threshold': threshold}
        print(f'{test_type}: AUC={auc:.4f}, Accuracy={accuracy:.2%}')

    # Save
    os.makedirs(SAVE_PATH, exist_ok=True)
    save_file = os.path.join(SAVE_PATH, f'audio_autoencoder_{machine_type}.pth')
    torch.save({
        'model_state_dict': best_model,
        'input_dim': input_dim,
        'results': results
    }, save_file)
    print(f'‚úÖ Saved: {save_file}')

    return results

# Train one autoencoder per configured machine type. Machines for which
# train_machine() returns nothing usable (None or an empty dict) are left
# out of the summary table.
all_results = {}
for machine in MACHINE_TYPES:
    results = train_machine(machine)
    if not results:
        continue
    all_results[machine] = results

print(f'\n{"="*60}')
print('üéâ All training complete!')
print(f'{"="*60}')

## 7. Results Summary

In [None]:
print('\nüìã Final Results Summary:\n')
print(f'{"Machine":<10} {"Test Type":<15} {"AUC":<10} {"Accuracy":<10}')
print('-' * 50)

for machine, results in all_results.items():
    for test_type, metrics in results.items():
        status = '‚úÖ' if metrics['auc'] > 0.7 else '‚ö†Ô∏è' if metrics['auc'] > 0.6 else '‚ùå'
        print(f'{machine:<10} {test_type:<15} {metrics["auc"]:<10.4f} {metrics["accuracy"]*100:<10.1f}% {status}')