# Anomaly Detection Model Training
This notebook trains multiple anomaly detection models on the cleaned sensor datasets.

In [None]:
# Install required packages
!pip install pandas numpy scikit-learn boto3 matplotlib seaborn joblib -q
!pip install imbalanced-learn xgboost lightgbm -q

In [None]:
import pandas as pd
import numpy as np
import boto3
import json
import joblib
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.svm import OneClassSVM
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# Configuration
# NOTE(review): both bucket names are spelled "processedd" (double "d") — confirm this
# matches the real bucket name before running. Input and output share the same bucket;
# the cleaned-data/ and trained-models/ prefixes keep them apart.
INPUT_BUCKET = "processedd-synthetic-data"
OUTPUT_BUCKET = "processedd-synthetic-data"
REGION = "us-east-1"

# Initialize clients
s3_client = boto3.client('s3', region_name=REGION)
# NOTE(review): despite its name, this stores the default boto3 session's region name,
# not a SageMaker session object. Nothing else in the visible notebook reads it.
sagemaker_session = boto3.Session().region_name

# Echo the S3 locations the rest of the notebook reads from and writes to.
print(f"Training models from: s3://{INPUT_BUCKET}/cleaned-data/")
print(f"Saving models to: s3://{OUTPUT_BUCKET}/trained-models/")

In [None]:
# Cleaned CSV files to train on, one per sensor group; every file follows the
# "<group>_data_cleaned.csv" naming pattern.
_SENSOR_GROUPS = ("arc", "asd", "basement", "laundry", "voc")
datasets = [f"{group}_data_cleaned.csv" for group in _SENSOR_GROUPS]

# List the files so the run log shows exactly what will be trained.
print("Datasets for training:")
for dataset in datasets:
    print("  - " + dataset)

In [None]:
def _per_sensor(df, func):
    """Apply ``func`` to the ``value`` series, one sensor at a time when possible.

    When the frame has a ``sensor_id`` column the series is grouped by it so
    that windows, lags and differences never span two different sensors. The
    result keeps the original row order and index. Without ``sensor_id`` the
    function is applied to the whole series.
    """
    values = df['value']
    if 'sensor_id' not in df.columns:
        return func(values)
    # dropna=False keeps rows whose sensor_id is missing in a group of their own
    # instead of silently producing NaN features for them.
    return values.groupby(df['sensor_id'], sort=False, dropna=False).transform(func)


def load_and_prepare_data(dataset_name):
    """Load one cleaned dataset from S3 and build its feature matrix.

    Features are derived only from the columns that are present:

    * ``timestamp`` -> ``hour``, ``day_of_week``, ``minute``
    * ``value`` -> the raw value, rolling mean/std over 5 and 10 rows,
      lags of 1 and 2 rows, differences over 1 and 2 rows
    * ``sensor_id`` -> ``sensor_id_encoded`` (label-encoded)

    Rolling, lag and difference features are computed per ``sensor_id`` when
    that column exists, so one sensor's readings do not leak into another
    sensor's windows. Rows are processed in file order.
    NOTE(review): this assumes each sensor's rows are already in chronological
    order in the cleaned CSV — confirm against the cleaning step.

    Args:
        dataset_name: File name under ``s3://{INPUT_BUCKET}/cleaned-data/``.

    Returns:
        Tuple ``(X, y, features)``: the feature DataFrame (NaNs filled with 0),
        the ``is_anomaly`` column or ``None`` when it is absent, and the list
        of feature column names in the order they appear in ``X``.

    Raises:
        ValueError: If none of ``timestamp``, ``value`` or ``sensor_id`` is
            present, i.e. no features can be derived.
    """
    print(f"\n=== Loading {dataset_name} ===")

    # Load data from S3
    df = pd.read_csv(f"s3://{INPUT_BUCKET}/cleaned-data/{dataset_name}")
    print(f"Dataset shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")

    # Feature engineering
    features = []

    # Time-based features
    if 'timestamp' in df.columns:
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        df['hour'] = df['timestamp'].dt.hour
        df['day_of_week'] = df['timestamp'].dt.dayofweek
        df['minute'] = df['timestamp'].dt.minute
        features.extend(['hour', 'day_of_week', 'minute'])

    # Sensor value features
    if 'value' in df.columns:
        # Rolling statistics. The std of a single-row window is NaN, so it is
        # replaced by 0.
        for window in (5, 10):
            df[f'value_rolling_mean_{window}'] = _per_sensor(
                df, lambda s, w=window: s.rolling(window=w, min_periods=1).mean()
            )
            df[f'value_rolling_std_{window}'] = _per_sensor(
                df, lambda s, w=window: s.rolling(window=w, min_periods=1).std()
            ).fillna(0)

        # Lag features: rows without a predecessor fall back to the overall mean.
        value_mean = df['value'].mean()
        df['value_lag_1'] = _per_sensor(df, lambda s: s.shift(1)).fillna(value_mean)
        df['value_lag_2'] = _per_sensor(df, lambda s: s.shift(2)).fillna(value_mean)

        # Difference features: rows without a predecessor get 0.
        df['value_diff_1'] = _per_sensor(df, lambda s: s.diff()).fillna(0)
        df['value_diff_2'] = _per_sensor(df, lambda s: s.diff(2)).fillna(0)

        features.extend([
            'value', 'value_rolling_mean_5', 'value_rolling_std_5',
            'value_rolling_mean_10', 'value_rolling_std_10',
            'value_lag_1', 'value_lag_2', 'value_diff_1', 'value_diff_2'
        ])

    # Sensor ID encoding
    if 'sensor_id' in df.columns:
        le = LabelEncoder()
        df['sensor_id_encoded'] = le.fit_transform(df['sensor_id'])
        features.append('sensor_id_encoded')

    if not features:
        # An empty feature matrix would only fail later with a less helpful error.
        raise ValueError(
            f"{dataset_name}: none of the expected columns "
            f"('timestamp', 'value', 'sensor_id') are present; found {list(df.columns)}"
        )

    # Prepare features and target
    X = df[features].fillna(0)
    y = df['is_anomaly'] if 'is_anomaly' in df.columns else None

    print(f"Features: {features}")
    print(f"Feature matrix shape: {X.shape}")

    if y is not None:
        print(f"Anomaly distribution:")
        print(y.value_counts())
        print(f"Anomaly rate: {y.mean():.4f}")

    return X, y, features

# Test with one dataset
# NOTE: despite the *_test names, these hold the full feature matrix, labels and feature
# list for datasets[0]; no train/test split has been made at this point.
X_test, y_test, features_test = load_and_prepare_data(datasets[0])

In [None]:
def _summarize_predictions(y_true, predictions, probabilities=None):
    """Build the per-model entry stored in ``results``.

    ``probabilities`` and ``auc_score`` are included only for models that
    produce class probabilities.
    """
    summary = {'predictions': predictions}
    if probabilities is not None:
        summary['probabilities'] = probabilities
    summary['accuracy'] = (predictions == y_true).mean()
    if probabilities is not None:
        summary['auc_score'] = roc_auc_score(y_true, probabilities)
    summary['classification_report'] = classification_report(
        y_true, predictions, output_dict=True
    )
    return summary


def train_anomaly_models(X, y, dataset_name):
    """Train two unsupervised and three supervised anomaly detectors.

    The data is split 80/20 (stratified on ``y``), standardised with a scaler
    fitted on the training part, and the training part is oversampled with
    SMOTE for the supervised models. Labels are expected to be 0 (normal) and
    1 (anomaly).

    Args:
        X: Feature DataFrame.
        y: Binary anomaly labels aligned with ``X``.
        dataset_name: Name used in log output and error messages.

    Returns:
        Tuple ``(models, results, X_test, y_test)``. ``models`` maps a model
        name to ``{'model': estimator, 'scaler': scaler}``; ``results`` maps
        the same names to a dict with ``predictions``, ``accuracy``,
        ``classification_report`` and, for supervised models,
        ``probabilities`` and ``auc_score``. ``X_test`` is the unscaled
        held-out feature frame and ``y_test`` its labels.

    Raises:
        ValueError: If ``y`` is ``None`` or contains fewer than two classes.
    """
    print(f"\n=== Training Models for {dataset_name} ===")

    # Fail early with a clear message instead of an obscure error from
    # SMOTE or roc_auc_score further down.
    if y is None:
        raise ValueError(
            f"{dataset_name}: no anomaly labels available; cannot train or evaluate models"
        )
    if pd.Series(y).nunique() < 2:
        raise ValueError(
            f"{dataset_name}: labels contain a single class; "
            "both normal and anomalous samples are required"
        )

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Scale features (fit on the training split only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Handle class imbalance with SMOTE. SMOTE needs more minority samples than
    # k_neighbors, so the default of 5 is capped for very rare anomalies and
    # resampling is skipped when only one minority sample is available.
    minority_count = int(pd.Series(y_train).value_counts().min())
    if minority_count >= 2:
        smote = SMOTE(random_state=42, k_neighbors=min(5, minority_count - 1))
        X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)
    else:
        print("Too few minority samples for SMOTE; training on the original class distribution")
        X_train_balanced, y_train_balanced = X_train_scaled, y_train

    print(f"Original training set: {X_train.shape}")
    print(f"Balanced training set: {X_train_balanced.shape}")

    models = {}
    results = {}

    anomaly_rate = float(np.mean(y_train))

    # 1. Isolation Forest (Unsupervised)
    print("\nTraining Isolation Forest...")
    # IsolationForest only accepts a contamination in (0, 0.5].
    iso_forest = IsolationForest(contamination=min(anomaly_rate, 0.5), random_state=42)
    iso_forest.fit(X_train_scaled)
    # predict() returns -1 for outliers; convert to 1 = anomaly, 0 = normal.
    iso_pred = np.where(iso_forest.predict(X_test_scaled) == -1, 1, 0)

    models['isolation_forest'] = {'model': iso_forest, 'scaler': scaler}
    results['isolation_forest'] = _summarize_predictions(y_test, iso_pred)

    # 2. One-Class SVM (Unsupervised)
    print("Training One-Class SVM...")
    oc_svm = OneClassSVM(nu=anomaly_rate)
    # Train only on normal data; use a plain boolean ndarray as the mask so the
    # selection does not depend on the label Series' index.
    normal_mask = np.asarray(y_train == 0)
    oc_svm.fit(X_train_scaled[normal_mask])
    svm_pred = np.where(oc_svm.predict(X_test_scaled) == -1, 1, 0)

    models['one_class_svm'] = {'model': oc_svm, 'scaler': scaler}
    results['one_class_svm'] = _summarize_predictions(y_test, svm_pred)

    # 3-5. Supervised models, all trained on the (re)balanced training set.
    balanced_labels = np.asarray(y_train_balanced)
    negatives = int((balanced_labels == 0).sum())
    positives = int((balanced_labels == 1).sum())

    supervised = [
        ('random_forest', 'Random Forest', RandomForestClassifier(
            n_estimators=100, random_state=42, class_weight='balanced'
        )),
        ('xgboost', 'XGBoost', xgb.XGBClassifier(
            n_estimators=100,
            random_state=42,
            # Ratio of negatives to positives in the data the model is fitted on
            # (1.0 when SMOTE has balanced the classes).
            scale_pos_weight=negatives / positives
        )),
        ('lightgbm', 'LightGBM', lgb.LGBMClassifier(
            n_estimators=100,
            random_state=42,
            class_weight='balanced',
            verbose=-1
        )),
    ]

    for key, label, estimator in supervised:
        print(f"Training {label}...")
        estimator.fit(X_train_balanced, y_train_balanced)
        predictions = estimator.predict(X_test_scaled)
        probabilities = estimator.predict_proba(X_test_scaled)[:, 1]

        models[key] = {'model': estimator, 'scaler': scaler}
        results[key] = _summarize_predictions(y_test, predictions, probabilities)

    return models, results, X_test, y_test

# Test with one dataset
# Smoke test on datasets[0]: X_test / y_test passed in here are the full, unsplit data
# from the loading cell; the held-out split comes back as X_test_data / y_test_data.
models_test, results_test, X_test_data, y_test_data = train_anomaly_models(X_test, y_test, datasets[0])

In [None]:
def evaluate_and_visualize_models(models, results, X_test, y_test, dataset_name):
    """Summarise model performance and draw a 2x2 diagnostic figure.

    Panels: accuracy per model, ROC curves for the supervised models, the
    confusion matrix of the best model, and that model's top feature
    importances (labelled with the column names of ``X_test`` when available).

    The "best" model is the one with the highest test accuracy.
    NOTE(review): with rare anomalies a model that predicts "normal" for
    everything can score a high accuracy — consider whether AUC or recall is
    the better selection metric.

    Args:
        models: Mapping of model name to ``{'model': ..., 'scaler': ...}``.
        results: Mapping of model name to its metrics dict; each entry needs
            ``accuracy`` and ``predictions``, supervised entries also carry
            ``probabilities`` and ``auc_score``.
        X_test: Held-out features; only its column names are used, to label
            the feature-importance panel.
        y_test: Held-out labels.
        dataset_name: Name used in titles and log output.

    Returns:
        Tuple ``(best_model, performance_df)``: the name of the most accurate
        model and a DataFrame with columns ``Model``, ``Accuracy`` and
        ``AUC Score`` (``'N/A'`` for models without probabilities).
    """
    print(f"\n=== Model Evaluation for {dataset_name} ===")

    # Performance summary
    model_names = list(results)
    performance_df = pd.DataFrame({
        'Model': model_names,
        'Accuracy': [results[name]['accuracy'] for name in model_names],
        'AUC Score': [results[name].get('auc_score', 'N/A') for name in model_names]
    })

    print("\nModel Performance Summary:")
    print(performance_df.to_string(index=False))

    # Visualizations
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle(f'Model Performance Analysis - {dataset_name}', fontsize=16)

    # 1. Accuracy comparison
    axes[0, 0].bar(performance_df['Model'], performance_df['Accuracy'])
    axes[0, 0].set_title('Model Accuracy Comparison')
    axes[0, 0].set_ylabel('Accuracy')
    axes[0, 0].tick_params(axis='x', rotation=45)

    # 2. ROC Curves for supervised models
    supervised_models = ['random_forest', 'xgboost', 'lightgbm']
    for model_name in supervised_models:
        if model_name in results and 'probabilities' in results[model_name]:
            fpr, tpr, _ = roc_curve(y_test, results[model_name]['probabilities'])
            auc_score = results[model_name]['auc_score']
            axes[0, 1].plot(fpr, tpr, label=f'{model_name} (AUC: {auc_score:.3f})')

    axes[0, 1].plot([0, 1], [0, 1], 'k--', label='Random')
    axes[0, 1].set_xlabel('False Positive Rate')
    axes[0, 1].set_ylabel('True Positive Rate')
    axes[0, 1].set_title('ROC Curves')
    axes[0, 1].legend()

    # 3. Confusion matrix for the best (most accurate) model
    best_model = max(results.keys(), key=lambda x: results[x]['accuracy'])
    cm = confusion_matrix(y_test, results[best_model]['predictions'])
    sns.heatmap(cm, annot=True, fmt='d', ax=axes[1, 0], cmap='Blues')
    axes[1, 0].set_title(f'Confusion Matrix - {best_model}')
    axes[1, 0].set_xlabel('Predicted')
    axes[1, 0].set_ylabel('Actual')

    # 4. Feature importance (for tree-based models)
    importance_ax = axes[1, 1]
    importances = None
    if best_model in supervised_models:
        importances = getattr(models[best_model]['model'], 'feature_importances_', None)

    if importances is not None:
        importances = np.asarray(importances)
        # Prefer the real column names; fall back to positional labels when
        # X_test has no columns or they do not line up with the importances.
        feature_names = [str(name) for name in getattr(X_test, 'columns', [])]
        if len(feature_names) != len(importances):
            feature_names = [f'Feature_{i}' for i in range(len(importances))]

        # Top 10 features
        top_indices = np.argsort(importances)[-10:]
        importance_ax.barh(range(len(top_indices)), importances[top_indices])
        importance_ax.set_yticks(range(len(top_indices)))
        importance_ax.set_yticklabels([feature_names[i] for i in top_indices])
        importance_ax.set_title(f'Top 10 Feature Importances - {best_model}')
        importance_ax.set_xlabel('Importance')
    else:
        # Avoid leaving an empty set of axes in the figure.
        importance_ax.axis('off')
        importance_ax.text(
            0.5, 0.5, f'No feature importances available for {best_model}',
            ha='center', va='center'
        )

    plt.tight_layout()
    plt.show()

    return best_model, performance_df

# Test evaluation
# Smoke test: evaluates the models trained on datasets[0] against the held-out split
# returned by train_anomaly_models (X_test_data / y_test_data).
best_model_test, perf_df_test = evaluate_and_visualize_models(
    models_test, results_test, X_test_data, y_test_data, datasets[0]
)

In [None]:
def save_models_to_s3(models, dataset_name, performance_df):
    """Upload every trained model and the performance table to S3.

    Each model bundle is serialised with joblib to a temporary directory and
    uploaded to ``trained-models/{dataset_name}/{model_name}_model.joblib`` in
    ``OUTPUT_BUCKET``; the performance table goes to
    ``trained-models/{dataset_name}/performance_metrics.csv``. The temporary
    directory is removed afterwards, even when an upload fails, so repeated
    runs do not leave model files behind on local disk.

    Args:
        models: Mapping of model name to the object to serialise (here a dict
            holding the estimator and its scaler).
        dataset_name: Short dataset name used in file names and S3 keys.
        performance_df: DataFrame of metrics, written as CSV without index.
    """
    # Local imports keep this cell self-contained.
    import os
    import tempfile

    print(f"\nSaving models for {dataset_name} to S3...")

    with tempfile.TemporaryDirectory() as staging_dir:
        for model_name, model_data in models.items():
            # Save model locally first
            model_filename = os.path.join(
                staging_dir, f"{dataset_name}_{model_name}_model.joblib"
            )
            joblib.dump(model_data, model_filename)

            # Upload to S3 (upload_file returns only after the transfer is done,
            # so the staged file can be deleted once the block exits)
            s3_key = f"trained-models/{dataset_name}/{model_name}_model.joblib"
            s3_client.upload_file(model_filename, OUTPUT_BUCKET, s3_key)
            print(f"  ✅ Uploaded {model_name} to s3://{OUTPUT_BUCKET}/{s3_key}")

        # Save performance metrics
        perf_filename = os.path.join(staging_dir, f"{dataset_name}_performance.csv")
        performance_df.to_csv(perf_filename, index=False)

        s3_key = f"trained-models/{dataset_name}/performance_metrics.csv"
        s3_client.upload_file(perf_filename, OUTPUT_BUCKET, s3_key)
        print(f"  📊 Uploaded performance metrics to s3://{OUTPUT_BUCKET}/{s3_key}")

# Test saving
# Strips the '_cleaned.csv' suffix so datasets[0] is stored under its short name
# ('arc_data'), the same naming the full training loop uses.
save_models_to_s3(models_test, datasets[0].replace('_cleaned.csv', ''), perf_df_test)

In [None]:
# Train models for all datasets
import traceback

# Per-dataset outputs, keyed by the short dataset name (file name without
# the '_cleaned.csv' suffix). A failed dataset appears only in all_results,
# as {'error': message}.
all_results = {}
all_models = {}
all_performance = {}

for dataset in datasets:
    dataset_name = dataset.replace('_cleaned.csv', '')

    try:
        print(f"\n{'='*80}")
        print(f"PROCESSING DATASET: {dataset_name}")
        print(f"{'='*80}")

        # Load and prepare data
        X, y, features = load_and_prepare_data(dataset)

        # Train models
        models, results, X_test, y_test = train_anomaly_models(X, y, dataset_name)

        # Evaluate and visualize
        best_model, performance_df = evaluate_and_visualize_models(
            models, results, X_test, y_test, dataset_name
        )

        # Save models to S3
        save_models_to_s3(models, dataset_name, performance_df)

        # Store results
        all_results[dataset_name] = results
        all_models[dataset_name] = models
        all_performance[dataset_name] = {
            'best_model': best_model,
            'performance_df': performance_df,
            'features': features
        }

        print(f"\n✅ Successfully trained models for {dataset_name}")
        print(f"Best model: {best_model}")

    except Exception as e:
        # Deliberately broad: one bad dataset must not stop the remaining ones.
        # The traceback is printed because str(e) alone drops the exception
        # type and the failing line.
        print(f"\n❌ Error processing {dataset_name}: {str(e)}")
        traceback.print_exc()
        all_results[dataset_name] = {'error': str(e)}

In [None]:
# Create overall training summary
# all_results has one entry per processed dataset: the per-model results on
# success, or {'error': message} on failure.
training_summary = {
    'training_timestamp': datetime.now().isoformat(),
    'total_datasets': len(datasets),
    'successful_trainings': len([r for r in all_results.values() if 'error' not in r]),
    'failed_trainings': len([r for r in all_results.values() if 'error' in r]),
    'models_trained': ['isolation_forest', 'one_class_svm', 'random_forest', 'xgboost', 'lightgbm'],
    'dataset_results': {}
}

print("\n" + "="*80)
print("TRAINING SUMMARY")
print("="*80)
print(f"Total datasets: {training_summary['total_datasets']}")
print(f"Successful trainings: {training_summary['successful_trainings']}")
print(f"Failed trainings: {training_summary['failed_trainings']}")
print(f"Models per dataset: {len(training_summary['models_trained'])}")

print("\nDataset Results:")
# Iterate over all_results rather than all_performance: failed datasets are
# recorded only in all_results, so looping over all_performance would leave
# them out of the report and of the saved summary.
for dataset_name, outcome in all_results.items():
    perf_data = all_performance.get(dataset_name)
    if perf_data is not None and 'best_model' in perf_data and 'error' not in outcome:
        best_model = perf_data['best_model']
        scores = perf_data['performance_df']
        best_accuracy = scores.loc[scores['Model'] == best_model, 'Accuracy'].iloc[0]
        print(f"  ✅ {dataset_name}: Best model = {best_model} (Accuracy: {best_accuracy:.4f})")

        training_summary['dataset_results'][dataset_name] = {
            'best_model': best_model,
            'best_accuracy': float(best_accuracy),
            'features_count': len(perf_data['features'])
        }
    else:
        print(f"  ❌ {dataset_name}: Training failed")
        training_summary['dataset_results'][dataset_name] = {
            'status': 'failed',
            'error': outcome.get('error')
        }

# Save training summary
summary_json = json.dumps(training_summary, indent=2)
with open('/tmp/training_summary.json', 'w') as f:
    f.write(summary_json)

s3_client.upload_file('/tmp/training_summary.json', OUTPUT_BUCKET, 'trained-models/training_summary.json')
print(f"\n📊 Training summary saved to s3://{OUTPUT_BUCKET}/trained-models/training_summary.json")

In [None]:
# Create model comparison visualization
# Left: accuracy of the winning model for each dataset. Right: how often each
# model type came out on top.
if not all_performance:
    print("\n⚠️ No models were successfully trained. Please check the errors above.")
else:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

    # Collect the winning model and its accuracy for every dataset that has one
    datasets_names, best_models, best_accuracies = [], [], []
    for dataset_name, perf_data in all_performance.items():
        if 'best_model' not in perf_data:
            continue
        winner = perf_data['best_model']
        scores = perf_data['performance_df']
        best_accuracy = scores.loc[scores['Model'] == winner, 'Accuracy'].iloc[0]
        datasets_names.append(dataset_name)
        best_models.append(winner)
        best_accuracies.append(best_accuracy)

    # Bar chart of best accuracies
    bars = ax1.bar(datasets_names, best_accuracies)
    ax1.set_title('Best Model Accuracy by Dataset')
    ax1.set_ylabel('Accuracy')
    ax1.set_xlabel('Dataset')
    ax1.tick_params(axis='x', rotation=45)

    # Write each accuracy just above its bar
    for bar, acc in zip(bars, best_accuracies):
        label_x = bar.get_x() + bar.get_width()/2
        label_y = bar.get_height() + 0.01
        ax1.text(label_x, label_y, f'{acc:.3f}', ha='center', va='bottom')

    # Pie chart of how many datasets each model won
    model_counts = pd.Series(best_models).value_counts()
    ax2.pie(
        model_counts.values,
        labels=model_counts.index,
        autopct='%1.1f%%',
        startangle=90,
    )
    ax2.set_title('Best Model Distribution Across Datasets')

    plt.tight_layout()
    plt.show()

    print("\n🎉 Model training completed successfully!")
    print(f"📁 All models saved to s3://{OUTPUT_BUCKET}/trained-models/")
    print("\n🚀 Your models are ready for deployment and inference!")