# In [4]:
# Water Quality Analysis System: Complete Implementation
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from pathlib import Path

# Statistical analysis
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Deep Learning
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# Utilities
import warnings
# NOTE: this silences every warning category for the whole process, which
# also hides deprecation notices raised by the libraries imported above.
# Narrow or remove the filter when debugging.
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
# Seeds NumPy's legacy global RNG and TensorFlow's global seed.
np.random.seed(42)
tf.random.set_seed(42)

def assess_data_quality(df):
    """
    Print a data quality report for one sensor frame and return its metrics.

    The report covers per-column null counts, the min/max of the 'value'
    column, the spacing between consecutive 'datetime' entries, and the
    number of repeated timestamps.

    Args:
        df: DataFrame with a 'datetime' column and a 'value' column. Gaps are
            taken between consecutive rows in the order given.

    Returns:
        dict with keys 'missing_values' (null count per column),
        'time_gaps' (row-to-row difference of 'datetime') and 'duplicates'
        (count of rows whose 'datetime' repeats an earlier row).
    """
    print("\nData Quality Assessment:")

    # Null counts per column; only columns that actually have nulls are shown
    null_counts = df.isnull().sum()
    print("\nMissing Values:")
    has_nulls = null_counts > 0
    print(null_counts[has_nulls])

    # Observed span of the readings
    print("\nValue Ranges:")
    readings = df['value']
    print(f"Range: {readings.min():.2f} to {readings.max():.2f}")

    # Spacing between consecutive timestamps (first entry is always NaT)
    gaps = df['datetime'].diff()
    print("\nTime Gaps:")
    print(f"Minimum gap: {gaps.min()}")
    print(f"Maximum gap: {gaps.max()}")
    print(f"Mean gap: {gaps.mean()}")

    # Rows whose timestamp already appeared earlier in the frame
    repeated = df.duplicated(subset='datetime')
    n_repeated = repeated.sum()
    print(f"\nDuplicate timestamps: {n_repeated}")

    report = {}
    report['missing_values'] = null_counts
    report['time_gaps'] = gaps
    report['duplicates'] = n_repeated
    return report

def create_visualizations(df, sensor_name):
    """
    Build a 3x2 matplotlib overview figure for one sensor's readings.

    Panels, in order: value over time, histogram of values, box plots of
    value grouped by month, by hour of day and by year, and a histogram of
    the gaps (in minutes) between consecutive timestamps.

    Args:
        df: DataFrame with a 'datetime' column and a 'value' column. It is
            copied first, so the caller's frame is left untouched. Gaps are
            computed between consecutive rows, so rows are presumably
            already in chronological order -- verify against the caller.
        sensor_name: Label used in the panel titles.

    Returns:
        The matplotlib Figure holding the six panels. It is neither shown
        nor closed here; the caller owns it.
    """
    # Work on a copy so the derived columns never leak back to the caller
    df = df.copy()

    # Drop timezone information (if any) so matplotlib gets naive timestamps
    df['datetime'] = pd.to_datetime(df['datetime']).dt.tz_localize(None)

    # Calendar features used to group the box plots
    df['hour'] = df['datetime'].dt.hour
    df['month'] = df['datetime'].dt.month
    df['year'] = df['datetime'].dt.year

    fig = plt.figure(figsize=(20, 15))

    # 1. Time series plot
    plt.subplot(3, 2, 1)
    plt.plot(df['datetime'], df['value'], 'b-', alpha=0.5)
    plt.title(f'{sensor_name} Over Time')
    plt.xlabel('Date')
    plt.ylabel('Value')
    plt.xticks(rotation=45)

    # 2. Distribution plot
    plt.subplot(3, 2, 2)
    sns.histplot(data=df, x='value', bins=50)
    plt.title(f'Distribution of {sensor_name} Values')
    plt.xlabel('Value')

    # 3-5. Box plots grouped by month, hour of day and year
    grouped_panels = (
        (3, 'month', f'{sensor_name} Values by Month', 'Month'),
        (4, 'hour', f'{sensor_name} Values by Hour', 'Hour of Day'),
        (5, 'year', f'{sensor_name} Values by Year', 'Year'),
    )
    for position, column, title, x_label in grouped_panels:
        plt.subplot(3, 2, position)
        sns.boxplot(data=df, x=column, y='value')
        plt.title(title)
        plt.xlabel(x_label)
        plt.ylabel('Value')

    # 6. Time gaps analysis
    plt.subplot(3, 2, 6)
    time_gaps = df['datetime'].diff().dt.total_seconds() / 60  # minutes
    # Keep gaps up to and including the 95th percentile. The comparison must
    # be inclusive: for a regularly sampled series every gap equals the 95th
    # percentile, and a strict "<" would discard all of them.
    typical_gaps = time_gaps[time_gaps <= time_gaps.quantile(0.95)]
    if not typical_gaps.empty:
        sns.histplot(typical_gaps)
    plt.title('Distribution of Time Gaps')
    plt.xlabel('Gap (minutes)')
    plt.ylabel('Count')

    plt.tight_layout()
    return fig

class SensorAnalyzer:
    """Hold one sensor's readings and run LSTM-based anomaly detection on them."""

    def __init__(self, sensor_name, data):
        """Store the sensor label and its readings.

        Args:
            sensor_name: Label for the sensor.
            data: DataFrame with a 'value' column (``detect_anomalies``) and,
                for ``preprocess_data``, a 'datetime' column.
        """
        self.sensor_name = sensor_name
        self.data = data
        # Populated by detect_anomalies()
        self.results = None
        self.model = None
        self.history = None

    def preprocess_data(self, resample_freq='1h'):
        """Return the readings on a regular time grid with gaps filled.

        The 'value' column is averaged per ``resample_freq`` bin and empty
        bins are filled by linear interpolation. ``self.data`` is not
        modified.

        Args:
            resample_freq: pandas frequency string for the grid. The default
                uses the lowercase hour alias because the uppercase 'H'
                alias is deprecated in recent pandas releases.

        Returns:
            DataFrame with a single 'value' column indexed by the resampled
            timestamps.
        """
        df_copy = self.data.copy()
        df_copy['datetime'] = pd.to_datetime(df_copy['datetime'])
        df_copy.set_index('datetime', inplace=True)
        df_resampled = df_copy['value'].resample(resample_freq).mean()
        df_resampled = df_resampled.interpolate(method='linear')
        return pd.DataFrame(df_resampled.values, index=df_resampled.index, columns=['value'])

    def detect_anomalies(self, seq_length=24):
        """Train an LSTM next-step predictor and flag poorly predicted points.

        Each window of ``seq_length`` consecutive standardised values is used
        to predict the value that follows it. The first 80% of the windows
        train the model, the rest validate it. A point is flagged when its
        absolute prediction error (in standardised units) exceeds the mean
        error plus three standard deviations, computed over all windows.

        NaNs in 'value' are not handled here; presumably callers pass a
        gap-free series such as the output of ``preprocess_data`` -- verify.

        Args:
            seq_length: Number of past observations in each input window.

        Returns:
            DataFrame indexed like ``self.data.index[seq_length:]`` with
            columns 'original_value', 'predicted_value',
            'reconstruction_error' and 'is_anomaly'. The same frame is stored
            on ``self.results``; the fitted model and training history are
            stored on ``self.model`` and ``self.history``.

        Raises:
            ValueError: If ``seq_length`` is not positive, or if there are
                fewer than ``seq_length + 2`` observations (at least one
                training and one validation window are required).
        """
        if seq_length < 1:
            raise ValueError("seq_length must be a positive integer")

        values = self.data['value'].values
        n_sequences = len(values) - seq_length
        # Fail early with a clear message instead of letting Keras choke on
        # an empty training or validation set.
        if n_sequences < 2:
            raise ValueError(
                f"Need at least {seq_length + 2} observations for "
                f"seq_length={seq_length}, got {len(values)}"
            )

        # Scale the data
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(values.reshape(-1, 1))

        # Sliding windows as inputs, the value right after each window as target
        X = np.array([scaled_data[i:i + seq_length] for i in range(n_sequences)])
        y = scaled_data[seq_length:]

        # Chronological split: no shuffling, validation is the most recent 20%
        train_size = int(len(X) * 0.8)
        X_train, X_val = X[:train_size], X[train_size:]
        y_train, y_val = y[:train_size], y[train_size:]

        # Two stacked LSTM layers with dropout, then a small dense head
        model = Sequential([
            LSTM(32, activation='relu', input_shape=(seq_length, 1), return_sequences=True),
            Dropout(0.3),
            LSTM(16, activation='relu'),
            Dropout(0.3),
            Dense(8, activation='relu'),
            Dense(1)
        ])

        # Fixed initial learning rate; ReduceLROnPlateau below lowers it when
        # the validation loss stalls.
        optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
        model.compile(optimizer=optimizer, loss='mse')

        # Stop when validation loss stops improving and keep the best weights
        early_stopping = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=10,
            min_delta=0.0001,
            restore_best_weights=True
        )

        # Halve the learning rate after 5 epochs without improvement
        reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=5,
            min_delta=0.0001
        )

        # Train model
        history = model.fit(
            X_train, y_train,
            epochs=50,
            batch_size=32,
            validation_data=(X_val, y_val),
            callbacks=[early_stopping, reduce_lr],
            verbose=1
        )

        # Predict every window (training and validation alike)
        predictions = model.predict(X)

        # Absolute prediction error per window, in standardised units
        reconstruction_error = np.mean(np.abs(predictions - y), axis=1)
        error_threshold = np.mean(reconstruction_error) + 3 * np.std(reconstruction_error)

        # The first seq_length observations have no prediction, so the result
        # starts at position seq_length.
        results = pd.DataFrame(
            {
                'original_value': values[seq_length:],
                'predicted_value': scaler.inverse_transform(predictions).flatten(),
                'reconstruction_error': reconstruction_error,
                'is_anomaly': reconstruction_error > error_threshold,
            },
            index=self.data.index[seq_length:],
        )

        # Store results
        self.results = results
        self.model = model
        self.history = history

        return results

def load_sensor_data(file_path):
    """Read a sensor CSV and return its rows ordered by timestamp.

    The 'datetime' column is parsed as timezone-aware UTC. The original row
    labels are kept, so the returned index follows the sorted order rather
    than counting up from zero.
    """
    raw = pd.read_csv(file_path)
    utc_times = pd.to_datetime(raw['datetime'], utc=True)
    return raw.assign(datetime=utc_times).sort_values(by='datetime')

# Multi-sensor pipeline: load each file, assess quality, plot, and run anomaly detection
def analyze_all_sensors(sensor_files):
    """Run the full pipeline on every sensor file and plot the results.

    For each file the data is loaded, a quality report is printed, the
    overview figure is built (and closed again), and LSTM anomaly detection
    is run on the hourly-resampled series. One plotly subplot row per sensor
    shows the observed values, the model's predictions and any anomalies.

    Args:
        sensor_files: Iterable of CSV paths (``str`` or ``Path``). The file
            stem is used as the sensor name.

    Returns:
        Tuple ``(results, fig)``. ``results`` maps each sensor name to a dict
        with keys 'analyzer', 'results' and 'quality_metrics'; ``fig`` is the
        combined plotly figure. With no input files, ``results`` is empty and
        ``fig`` is an empty figure.
    """
    sensor_files = [Path(f) for f in sensor_files]
    results = {}

    n_sensors = len(sensor_files)
    if n_sensors == 0:
        # make_subplots rejects rows=0, so hand back an empty figure instead
        return results, go.Figure()

    # One subplot row per sensor
    fig = make_subplots(
        rows=n_sensors,
        cols=1,
        subplot_titles=[f.stem for f in sensor_files],
        vertical_spacing=0.05
    )

    # Figure height is a layout property, not a make_subplots argument
    fig.update_layout(
        height=300 * n_sensors,
        title_text="Multi-Sensor Anomaly Detection Results",
        showlegend=True,
        template="plotly_white"
    )

    for idx, file_path in enumerate(sensor_files, 1):
        sensor_name = file_path.stem
        print(f"\nProcessing {sensor_name}...")

        # Load data
        data = load_sensor_data(file_path)

        # Perform data quality assessment
        quality_metrics = assess_data_quality(data)

        # Build the overview figure, then close it to free memory
        vis_fig = create_visualizations(data, sensor_name)
        plt.close(vis_fig)

        # Run detection on the regular, gap-filled hourly series rather than
        # the raw rows: the default 24-step window then spans 24 hours and
        # the results are indexed by timestamp, so the plot's x-axis is time.
        analyzer = SensorAnalyzer(sensor_name, data)
        analyzer.data = analyzer.preprocess_data()
        results_df = analyzer.detect_anomalies()

        results[sensor_name] = {
            'analyzer': analyzer,
            'results': results_df,
            'quality_metrics': quality_metrics
        }

        # Observed values
        fig.add_trace(
            go.Scatter(
                x=results_df.index,
                y=results_df['original_value'],
                mode='lines',
                name=f'{sensor_name} Original',
                line=dict(color='blue', width=1)
            ),
            row=idx, col=1
        )

        # Model predictions
        fig.add_trace(
            go.Scatter(
                x=results_df.index,
                y=results_df['predicted_value'],
                mode='lines',
                name=f'{sensor_name} Predicted',
                line=dict(color='green', width=1, dash='dash')
            ),
            row=idx, col=1
        )

        # Flagged points, only when there are any
        anomalies = results_df[results_df['is_anomaly']]
        if not anomalies.empty:
            fig.add_trace(
                go.Scatter(
                    x=anomalies.index,
                    y=anomalies['original_value'],
                    mode='markers',
                    name=f'{sensor_name} Anomalies',
                    marker=dict(color='red', size=8, symbol='x')
                ),
                row=idx, col=1
            )

        # Print summary statistics
        training_log = analyzer.history.history
        print(f"\nSummary for {sensor_name}:")
        print(f"Total observations: {len(results_df)}")
        print(f"Anomalies detected: {results_df['is_anomaly'].sum()}")
        print(f"Final training loss: {training_log['loss'][-1]:.4f}")
        print(f"Final validation loss: {training_log['val_loss'][-1]:.4f}")

    return results, fig

if __name__ == "__main__":
    # Collect the sensor files from the working directory. Sorting makes the
    # processing and subplot order independent of filesystem listing order.
    sensor_files = sorted(Path('.').glob('*.csv'))

    print("Starting Water Quality Analysis System...")
    if not sensor_files:
        # Nothing to plot or train on; say so instead of failing downstream
        print("No sensor CSV files found in the current directory; nothing to analyze.")
    else:
        # Run analysis
        results, fig = analyze_all_sensors(sensor_files)

        # Show results
        fig.show()

        # Print overall summary
        print("\nOverall Analysis Complete!")
        print(f"Total sensors analyzed: {len(results)}")
        for sensor_name, data in results.items():
            anomaly_count = data['results']['is_anomaly'].sum()
            total_obs = len(data['results'])
            # Guard the percentage against an empty result frame
            anomaly_pct = (anomaly_count / total_obs) * 100 if total_obs else 0.0
            training_log = data['analyzer'].history.history
            print(f"\n{sensor_name}:")
            print(f"  - Anomalies: {anomaly_count} ({anomaly_pct:.2f}%)")
            print(f"  - Training Loss: {training_log['loss'][-1]:.4f}")
            print(f"  - Validation Loss: {training_log['val_loss'][-1]:.4f}")

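# --- Captured notebook output ---
# NOTE: this traceback is stale. The code above no longer passes `height` to
# make_subplots(); it is set through fig.update_layout() instead.
#
# Starting Water Quality Analysis System...
#
# TypeError: make_subplots() got unexpected keyword argument(s): ['height']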