# Day 2: Data Quality & Cleaning Pipeline
# Smart City IoT Analytics Pipeline

"""
🎯 LEARNING OBJECTIVES:
- Implement comprehensive data quality assessment
- Design cleaning procedures for IoT sensor data
- Handle missing values and outliers appropriately
- Create reusable data quality functions

📅 SCHEDULE:
Morning (4 hours):
1. Data Quality Assessment (2 hours)
2. Missing Data Strategy (2 hours)

Afternoon (4 hours):
3. Outlier Detection & Treatment (2 hours)
4. Data Standardization (2 hours)

✅ DELIVERABLES:
- Data quality assessment report
- Comprehensive cleaning pipeline
- Outlier detection and treatment functions
- Standardized datasets ready for analysis
"""

# =============================================================================
# IMPORTS AND SETUP
# =============================================================================

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# PySpark imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import pyspark.sql.functions as F

# Machine learning imports (for outlier detection)
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml.stat import Correlation

In [16]:
# Initialize Spark Session (should already exist from Day 1)
try:
    spark.sparkContext.setLogLevel("WARN")
    print("✅ Using existing Spark session")
except:
    spark = (SparkSession.builder
             .appName("SmartCityIoTPipeline-Day2")
             .master("local[*]")
             .config("spark.driver.memory", "4g")
             .config("spark.sql.adaptive.enabled", "true")
             .getOrCreate())
    print("✅ Created new Spark session")

print("🔧 Day 2: Data Quality & Cleaning Pipeline")
print("=" * 60)

✅ Using existing Spark session
🔧 Day 2: Data Quality & Cleaning Pipeline


# =============================================================================
# SECTION 1: COMPREHENSIVE DATA PROFILING (Morning - 2 hours)
# =============================================================================

In [17]:
print("\n📊 SECTION 1: COMPREHENSIVE DATA PROFILING")
print("=" * 60)


📊 SECTION 1: COMPREHENSIVE DATA PROFILING


In [18]:
# Load cleaned data from Day 1 (or reload if needed)
data_dir = "../data/raw"

# TODO 1.1: Load all datasets with error handling
def load_all_datasets():
    """Load all sensor datasets with consistent error handling"""
    datasets = {}
    
    try:
        # Load each dataset
        datasets['zones'] = spark.read.option("header", "true").option("inferSchema", "true").csv(f"{data_dir}/city_zones.csv")
        datasets['traffic'] = spark.read.option("header", "true").option("inferSchema", "true").csv(f"{data_dir}/traffic_sensors.csv")
        datasets['air_quality'] = spark.read.option("multiline", "true").json(f"{data_dir}/air_quality.json")
        datasets['weather'] = spark.read.parquet(f"{data_dir}/weather_data.parquet")
        datasets['energy'] = spark.read.option("header", "true").option("inferSchema", "true").csv(f"{data_dir}/energy_meters.csv")
        
        # Convert timestamp columns to proper format
        for name, df in datasets.items():
            if name != 'zones' and 'timestamp' in df.columns:
                datasets[name] = df.withColumn("timestamp", F.to_timestamp(F.col("timestamp")))
        
        print("✅ All datasets loaded successfully")
        return datasets
        
    except Exception as e:
        print(f"❌ Error loading datasets: {str(e)}")
        return {}

datasets = load_all_datasets()

✅ All datasets loaded successfully


# =============================================================================
# TODO 1.2: Advanced Data Quality Metrics (45 minutes)
# =============================================================================

In [19]:
"""
🎯 TASK: Create comprehensive data quality profiling functions
💡 HINT: Look beyond basic missing values - consider temporal patterns, distributions
📚 CONCEPTS: Data profiling, quality metrics, statistical validation
"""

def comprehensive_data_profile(df, dataset_name, time_col="timestamp"):
    """
    Generate comprehensive data quality profile
    
    Args:
        df: Spark DataFrame to profile
        dataset_name: Name for reporting
        time_col: Timestamp column name (can be None for non-time series data)
    
    Returns:
        Dictionary with quality metrics
    """
    print(f"\n🔍 Comprehensive Profile: {dataset_name}")
    print("-" * 50)
    
    # Basic statistics
    total_rows = df.count()
    total_cols = len(df.columns)
    
    profile = {
        'dataset_name': dataset_name,
        'total_rows': total_rows,
        'total_columns': total_cols,
        'memory_usage_mb': 0,  # Estimate
        'quality_issues': []
    }
    
    # TODO: Calculate missing value patterns
    print("📋 Missing Value Analysis:")
    missing_analysis = {}
    for col in df.columns:
        missing_count = df.filter(F.col(col).isNull()).count()
        missing_pct = (missing_count / total_rows) * 100 if total_rows > 0 else 0
        missing_analysis[col] = {'count': missing_count, 'percentage': missing_pct}
        
        if missing_pct > 5:  # Flag columns with >5% missing
            profile['quality_issues'].append(f"High missing values in {col}: {missing_pct:.2f}%")
        
        if missing_count > 0:
            print(f"   {col}: {missing_count:,} ({missing_pct:.2f}%)")
    
    profile['missing_analysis'] = missing_analysis
    
    # TODO: Temporal data gaps (if timestamp column exists and is not None)
    if time_col and time_col in df.columns:
        print("⏰ Temporal Analysis:")
        
        # Get time range
        time_stats = df.agg(
            F.min(time_col).alias('min_time'),
            F.max(time_col).alias('max_time'),
            F.count(time_col).alias('time_count')
        ).collect()[0]
        
        print(f"   Time Range: {time_stats['min_time']} to {time_stats['max_time']}")
        print(f"   Records with timestamps: {time_stats['time_count']:,}")
        
        # TODO: Check for temporal gaps
        # Calculate expected vs actual record counts
        if dataset_name == 'traffic':
            expected_interval_minutes = 5
        elif dataset_name == 'air_quality':
            expected_interval_minutes = 15
        elif dataset_name == 'weather':
            expected_interval_minutes = 30
        elif dataset_name == 'energy':
            expected_interval_minutes = 10
        else:
            expected_interval_minutes = 15
        
        print(f"   Expected interval: {expected_interval_minutes} minutes")
        
        # TODO: Data freshness (for time series data)
        latest_record = df.agg(F.max(time_col).alias('latest')).collect()[0]['latest']
        if latest_record:
            from datetime import datetime
            hours_old = (datetime.now() - latest_record).total_seconds() / 3600
            print(f"📅 Data Freshness: Latest record is {hours_old:.1f} hours old")
    
    # TODO: Numeric column distributions
    numeric_cols = [field.name for field in df.schema.fields 
                   if field.dataType in [IntegerType(), DoubleType(), FloatType(), LongType()]]
    
    if numeric_cols:
        print("📈 Numeric Column Analysis:")
        # Get basic statistics for numeric columns
        stats_df = df.select(numeric_cols).describe()
        stats_df.show()
        
        # TODO: Check for suspicious patterns in numeric data
        for col in numeric_cols:
            if col not in ['location_lat', 'location_lon']:
                # Check for columns with very low variance (potentially stuck sensors)
                variance_check = df.agg(F.variance(col).alias('variance')).collect()[0]['variance']
                if variance_check is not None and variance_check < 0.001:
                    profile['quality_issues'].append(f"Very low variance in {col}: {variance_check}")
    
    # TODO: Categorical column analysis
    categorical_cols = [field.name for field in df.schema.fields 
                       if field.dataType == StringType() and field.name not in [time_col] if time_col]
    
    if categorical_cols:
        print("📂 Categorical Column Analysis:")
        for col in categorical_cols:
            distinct_count = df.select(col).distinct().count()
            print(f"   {col}: {distinct_count} distinct values")
            
            # Show top values
            if distinct_count < 20:
                top_values = df.groupBy(col).count().orderBy(F.desc("count")).limit(5)
                print(f"      Top values:")
                top_values.show(5, truncate=False)
    
    # TODO: Check for duplicate records
    duplicate_count = total_rows - df.dropDuplicates().count()
    if duplicate_count > 0:
        profile['quality_issues'].append(f"Duplicate records found: {duplicate_count}")
        print(f"🔄 Duplicate Records: {duplicate_count:,}")
    
    return profile

      

# TODO: Profile all datasets
print("🔍 Starting comprehensive data profiling...")

profiles = {}
for name, df in datasets.items():
    if df is not None:
        try:
            # Check if the dataset has a timestamp column
            if name != 'zones' and 'timestamp' in df.columns:
                profiles[name] = comprehensive_data_profile(df, name, time_col="timestamp")
            else:
                # For datasets without timestamp column (like zones), don't pass time_col
                profiles[name] = comprehensive_data_profile(df, name, time_col=None)
        except Exception as e:
            print(f"❌ Error profiling {name}: {str(e)}")


🔍 Starting comprehensive data profiling...

🔍 Comprehensive Profile: zones
--------------------------------------------------
📋 Missing Value Analysis:
📈 Numeric Column Analysis:
+-------+--------------------+--------------------+-------------------+-------------------+------------------+
|summary|             lat_min|             lat_max|            lon_min|            lon_max|        population|
+-------+--------------------+--------------------+-------------------+-------------------+------------------+
|  count|                   8|                   8|                  8|                  8|                 8|
|   mean|  40.730000000000004|             40.7525| -73.99125000000001|          -73.97125|           21250.0|
| stddev|0.023904572186687328|0.028157719063465373|0.02474873734153055|0.02474873734153458|14260.334598358582|
|    min|                40.7|               40.72|             -74.02|              -74.0|              5000|
|    max|               40.76|              

                                                                                

# =============================================================================
# TODO 1.3: Sensor Health Analysis (45 minutes)
# =============================================================================

In [20]:
"""
🎯 TASK: Identify sensors with potential operational issues
💡 HINT: Look for patterns that indicate sensor malfunctions
📚 CONCEPTS: Sensor diagnostics, operational monitoring, health scoring
"""

def analyze_sensor_health(df, sensor_id_col, value_cols, time_col="timestamp"):
    """
    Analyze individual sensor health and identify problematic sensors
    
    Args:
        df: DataFrame with sensor data
        sensor_id_col: Column name for sensor ID
        value_cols: List of measurement columns to analyze
        time_col: Timestamp column
    
    Returns:
        DataFrame with sensor health metrics
    """
    print(f"\n🏥 Sensor Health Analysis")
    print("-" * 30)
    
    # TODO: Calculate health metrics per sensor
    window_spec = Window.partitionBy(sensor_id_col)
    
    health_metrics = df.groupBy(sensor_id_col).agg(
        F.count("*").alias("total_readings"),
        F.min(time_col).alias("first_reading"),
        F.max(time_col).alias("last_reading")
    )
    
    # TODO: Add missing data percentage per sensor
    for col in value_cols:
        missing_col_name = f"{col}_missing_pct"
        
        # Calculate missing percentage per sensor using separate aggregation
        missing_stats = df.groupBy(sensor_id_col).agg(
            F.count("*").alias("total_count"),
            F.sum(F.when(F.col(col).isNull(), 1).otherwise(0)).alias("missing_count")
        ).withColumn(
            missing_col_name,
            (F.col("missing_count") / F.col("total_count") * 100).cast("double")
        ).select(sensor_id_col, missing_col_name)
        
        # Join the missing percentage back to health_metrics
        health_metrics = health_metrics.join(missing_stats, sensor_id_col, "left")

    # TODO: Add variance analysis (detect stuck sensors)
    for col in value_cols:
        if col in df.columns:
            variance_col_name = f"{col}_variance"
            sensor_variance = df.groupBy(sensor_id_col).agg(
                F.variance(col).alias(variance_col_name)
            )
            health_metrics = health_metrics.join(sensor_variance, sensor_id_col, "left")
    
    # TODO: Calculate data gaps (irregular reporting)
    # This is more complex - calculate time differences between consecutive readings
    if time_col in df.columns:
        time_diff_col = "time_diff_minutes"
        df_with_lag = df.withColumn(
            "prev_time",
            F.lag(time_col).over(Window.partitionBy(sensor_id_col).orderBy(time_col))
        ).withColumn(
            time_diff_col,
            (F.unix_timestamp(time_col) - F.unix_timestamp("prev_time")) / 60
        )
        
        avg_time_diff = df_with_lag.groupBy(sensor_id_col).agg(
            F.avg(time_diff_col).alias("avg_time_diff_minutes"),
            F.max(time_diff_col).alias("max_time_diff_minutes")
        )
        
        health_metrics = health_metrics.join(avg_time_diff, sensor_id_col, "left")
    
    # TODO: Create overall health score
    # Combine multiple factors into a single health score (0-100)
    first_value_col = value_cols[0] if value_cols else "vehicle_count"
    missing_pct_col = f"{first_value_col}_missing_pct"
    variance_col = f"{first_value_col}_variance"
    
    health_metrics = health_metrics.withColumn(
        "health_score",
        # TODO: Create a formula that combines:
        # - Missing data percentage (lower is better)
        # - Data variance (too low = stuck sensor, too high = noisy sensor)
        # - Reporting regularity
        # - Recent data availability
        F.greatest(
            F.lit(0.0),
            F.least(
                F.lit(100.0),
                # Start with 100 and subtract penalties
                F.lit(100.0) -
                # Penalty for missing data (0-40 points lost)
                F.coalesce(F.col(missing_pct_col), F.lit(0.0)) * 0.4 -
                # Penalty for irregular reporting (0-30 points lost)
                F.when(F.col("avg_time_diff_minutes").isNull(), F.lit(0.0))
                 .when(F.col("avg_time_diff_minutes") > 60, F.lit(30.0))  # >1 hour gaps
                 .when(F.col("avg_time_diff_minutes") > 30, F.lit(15.0))  # >30 min gaps
                 .when(F.col("avg_time_diff_minutes") > 15, F.lit(5.0))   # >15 min gaps
                 .otherwise(F.lit(0.0)) -
                # Penalty for very low variance (stuck sensors) (0-20 points lost)
                F.when(F.coalesce(F.col(variance_col), F.lit(1.0)) < 0.001, F.lit(20.0))
                 .otherwise(F.lit(0.0)) -
                # Penalty for very high variance (noisy sensors) (0-10 points lost)
                F.when(F.coalesce(F.col(variance_col), F.lit(0.0)) > 1000, F.lit(10.0))
                 .otherwise(F.lit(0.0))
            )
        ).cast("double")
        
    )
    
    # TODO: Flag problematic sensors
    health_metrics = health_metrics.withColumn(
        "status",
        F.when(F.col("health_score") > 80, "healthy")
         .when(F.col("health_score") > 60, "warning")
         .otherwise("critical")
    )
    
    return health_metrics

# TODO: Analyze health for each sensor type
print("🏥 Analyzing sensor health across all datasets...")

sensor_health_results = {}

# Traffic sensors
if 'traffic' in datasets:
    traffic_health = analyze_sensor_health(
        datasets['traffic'], 
        'sensor_id', 
        ['vehicle_count', 'avg_speed']
    )
    sensor_health_results['traffic'] = traffic_health
    
    print("🚗 Traffic Sensor Health Summary:")
    traffic_health.groupBy("status").count().show()

# TODO: Analyze other sensor types
# Air quality sensors
if 'air_quality' in datasets:
    # TODO: Implement air quality sensor health analysis
    air_quality_health=analyze_sensor_health(
        datasets['air_quality'], 
        'sensor_id', 
        ['pm25', 'pm10', 'no2', 'co']
    )
    sensor_health_results['air_quality'] = air_quality_health
    print("🌫️ Air Quality Sensor Health Summary:")
    air_quality_health.groupBy("status").count().show()

# Weather sensors  
if 'weather' in datasets:
    # TODO: Implement weather sensor health analysis
    weather_health=analyze_sensor_health(
        datasets['weather'], 
        'station_id', 
        ['temperature', 'humidity', 'wind_speed', 'precipitation']
    )
    sensor_health_results['weather'] = weather_health
    print("🌦️ Weather Sensor Health Summary:")
    weather_health.groupBy("status").count().show()

# Energy sensors
if 'energy' in datasets:
    # TODO: Implement energy sensor health analysis
    energy_health=analyze_sensor_health(
        datasets['energy'], 
        'meter_id', 
        ['power_consumption', 'voltage', 'current', 'power_factor']
    )
    sensor_health_results['energy'] = energy_health
    print("⚡ Energy Sensor Health Summary:")
    energy_health.groupBy("status").count().show()


🏥 Analyzing sensor health across all datasets...

🏥 Sensor Health Analysis
------------------------------
🚗 Traffic Sensor Health Summary:
+-------+-----+
| status|count|
+-------+-----+
|healthy|   50|
+-------+-----+


🏥 Sensor Health Analysis
------------------------------
🌫️ Air Quality Sensor Health Summary:
+-------+-----+
| status|count|
+-------+-----+
|healthy|   20|
+-------+-----+


🏥 Sensor Health Analysis
------------------------------
🌦️ Weather Sensor Health Summary:
+-------+-----+
| status|count|
+-------+-----+
|healthy|   10|
+-------+-----+


🏥 Sensor Health Analysis
------------------------------
⚡ Energy Sensor Health Summary:


[Stage 1267:>                                                       (0 + 8) / 8]

+-------+-----+
| status|count|
+-------+-----+
|healthy|  200|
+-------+-----+



                                                                                

# =============================================================================
# SECTION 2: MISSING DATA STRATEGY (Morning - 2 hours)
# =============================================================================


In [21]:
print("\n" + "=" * 60)
print("🕳️ SECTION 2: MISSING DATA HANDLING STRATEGY")
print("=" * 60)


🕳️ SECTION 2: MISSING DATA HANDLING STRATEGY


# =============================================================================
# TODO 2.1: Missing Data Pattern Analysis (60 minutes)
# =============================================================================


In [22]:
"""
🎯 TASK: Understand patterns in missing data across time and sensors
💡 HINT: Missing data in IoT often has temporal or spatial patterns
📚 CONCEPTS: Missing data mechanisms, pattern analysis, imputation strategies
"""

def analyze_missing_patterns(df, time_col="timestamp", sensor_col=None):
    """
    Analyze patterns in missing data
    
    Args:
        df: DataFrame to analyze
        time_col: Timestamp column
        sensor_col: Sensor ID column (if applicable)
    
    Returns:
        Dictionary with missing data insights
    """
    print("\n🔍 Missing Data Pattern Analysis")
    print("-" * 40)
    
    patterns = {}
    
    # TODO: Temporal patterns in missing data
    if time_col in df.columns:
        print("⏰ Temporal Missing Data Patterns:")
        
        # Create time-based aggregations
        df_with_time_features = df.withColumn("hour", F.hour(time_col)) \
                                 .withColumn("day_of_week", F.dayofweek(time_col)) \
                                 .withColumn("date", F.to_date(time_col))
        
        # TODO: Check missing data by hour of day
        numeric_cols = [field.name for field in df.schema.fields 
                       if field.dataType in [IntegerType(), DoubleType(), FloatType()]]
        
        for col in numeric_cols[:3]:  # Analyze first 3 numeric columns
            missing_by_hour = df_with_time_features.groupBy("hour").agg(
                F.count("*").alias("total_records"),
                F.sum(F.when(F.col(col).isNull(), 1).otherwise(0)).alias("missing_count")
            ).withColumn(
                "missing_pct", 
                (F.col("missing_count") / F.col("total_records")) * 100
            ).orderBy("hour")
            
            print(f"\n   Missing data by hour for {col}:")
            missing_by_hour.show(24)
            
            # TODO: Identify problematic hours
            high_missing_hours = missing_by_hour.filter(F.col("missing_pct") > 10)
            if high_missing_hours.count() > 0:
                print(f"   ⚠️ High missing data hours for {col}:")
                high_missing_hours.show()
    
    # TODO: Sensor-specific missing patterns (if sensor column provided)
    if sensor_col and sensor_col in df.columns:
        print(f"\n📡 Sensor-specific Missing Patterns:")
        
        # Calculate missing percentage per sensor
        sensor_missing = df.groupBy(sensor_col).agg(
            F.count("*").alias("total_readings")
        )
        
        numeric_cols = [field.name for field in df.schema.fields 
                       if field.dataType in [IntegerType(), DoubleType(), FloatType()]]
        
        for col in numeric_cols[:2]:  # Analyze first 2 numeric columns
            col_missing = df.groupBy(sensor_col).agg(
                F.sum(F.when(F.col(col).isNull(), 1).otherwise(0)).alias(f"{col}_missing")
            )
            sensor_missing = sensor_missing.join(col_missing, sensor_col, "left")
        
        # Calculate percentages and identify problematic sensors
        for col in numeric_cols[:2]:
            sensor_missing = sensor_missing.withColumn(
                f"{col}_missing_pct",
                (F.col(f"{col}_missing") / F.col("total_readings")) * 100
            )
        
        print("   Sensors with highest missing data:")
        sensor_missing.orderBy(F.desc(f"{numeric_cols[0]}_missing_pct")).show(10)
    
    return patterns

# TODO: Analyze missing patterns for each dataset
for name, df in datasets.items():
    if df is not None and name != 'zones':
        try:
            sensor_col = None
            if 'sensor_id' in df.columns:
                sensor_col = 'sensor_id'
            elif 'station_id' in df.columns:
                sensor_col = 'station_id'
            elif 'meter_id' in df.columns:
                sensor_col = 'meter_id'
            
            patterns = analyze_missing_patterns(df, sensor_col=sensor_col)
        except Exception as e:
            print(f"❌ Error analyzing missing patterns for {name}: {str(e)}")



🔍 Missing Data Pattern Analysis
----------------------------------------
⏰ Temporal Missing Data Patterns:

   Missing data by hour for location_lat:
+----+-------------+-------------+-----------+
|hour|total_records|missing_count|missing_pct|
+----+-------------+-------------+-----------+
|   0|         4200|            0|        0.0|
|   1|         4200|            0|        0.0|
|   2|         4200|            0|        0.0|
|   3|         4200|            0|        0.0|
|   4|         4200|            0|        0.0|
|   5|         4200|            0|        0.0|
|   6|         4200|            0|        0.0|
|   7|         4200|            0|        0.0|
|   8|         4200|            0|        0.0|
|   9|         4250|            0|        0.0|
|  10|         4200|            0|        0.0|
|  11|         4200|            0|        0.0|
|  12|         4200|            0|        0.0|
|  13|         4200|            0|        0.0|
|  14|         4200|            0|        0.0|
|  

# =============================================================================
# TODO 2.2: Time Series Interpolation (60 minutes)
# =============================================================================

In [23]:
"""
🎯 TASK: Implement interpolation strategies for time series gaps
💡 HINT: Different gap sizes need different strategies
📚 CONCEPTS: Linear interpolation, forward fill, seasonal patterns
"""

def interpolate_time_series_gaps(df, value_columns, time_col="timestamp", sensor_col=None, max_gap_hours=6):
    """
    Implement time series interpolation for missing values
    
    Args:
        df: DataFrame with time series data
        value_columns: List of columns to interpolate
        time_col: Timestamp column
        sensor_col: Sensor ID column for per-sensor interpolation
        max_gap_hours: Maximum gap size to interpolate (larger gaps left as missing)
    
    Returns:
        DataFrame with interpolated values
    """
    print(f"\n🔧 Time Series Interpolation")
    print("-" * 30)
    
    result_df = df
    
    # TODO: For each sensor (if sensor_col provided), interpolate separately
    if sensor_col:
        # This is complex in Spark - we'll implement a simplified version
        # In practice, you might use pandas UDFs or process sensor by sensor
        
        # TODO: Create window specification for each sensor
        window_spec = Window.partitionBy(sensor_col).orderBy(time_col)
        
        for col in value_columns:
            if col in df.columns:
                print(f"   Interpolating {col}...")
                
                # TODO: Simple forward fill for small gaps
                # Use lag and lead functions to fill gaps
                result_df = result_df.withColumn(
                    f"{col}_filled",
                    F.when(
                        F.col(col).isNull(),
                        # TODO: Implement interpolation logic
                        # For now, use last observation carried forward
                        F.last(col, True).over(window_spec.rowsBetween(Window.unboundedPreceding, -1))
                    ).otherwise(F.col(col))
                )
                
                # TODO: Add metadata column to track what was interpolated
                result_df = result_df.withColumn(
                    f"{col}_interpolated",
                    F.when(F.col(col).isNull() & F.col(f"{col}_filled").isNotNull(), True).otherwise(False)
                )
        
    else:
        # Global interpolation (simpler case)
        window_spec = Window.orderBy(time_col)
        
        for col in value_columns:
            if col in df.columns:
                print(f"   Interpolating {col} (global)...")
                
                # Simple forward fill
                result_df = result_df.withColumn(
                    f"{col}_filled",
                    F.when(
                        F.col(col).isNull(),
                        F.last(col, True).over(window_spec.rowsBetween(Window.unboundedPreceding, -1))
                    ).otherwise(F.col(col))
                )
    
    # TODO: Report interpolation statistics
    print("📊 Interpolation Summary:")
    for col in value_columns:
        if f"{col}_filled" in result_df.columns:
            interpolated_count = result_df.filter(F.col(f"{col}_interpolated") == True).count()
            total_missing = df.filter(F.col(col).isNull()).count()
            print(f"   {col}: {interpolated_count}/{total_missing} missing values interpolated")
    
    return result_df

In [24]:
# TODO: Test interpolation on traffic data
if 'traffic' in datasets:
    print("🚗 Testing interpolation on traffic data...")
    traffic_interpolated = interpolate_time_series_gaps(
        datasets['traffic'],
        ['vehicle_count', 'avg_speed'],
        sensor_col='sensor_id'
    )
    
    # Show before/after comparison
    print("\n📊 Before/After Interpolation Comparison:")
    comparison = datasets['traffic'].agg(
        F.sum(F.when(F.col("vehicle_count").isNull(), 1).otherwise(0)).alias("vehicle_count_missing_before")
    ).join(
        traffic_interpolated.agg(
            F.sum(F.when(F.col("vehicle_count_filled").isNull(), 1).otherwise(0)).alias("vehicle_count_missing_after")
        )
    )
    comparison.show()

🚗 Testing interpolation on traffic data...

🔧 Time Series Interpolation
------------------------------
   Interpolating vehicle_count...
   Interpolating avg_speed...
📊 Interpolation Summary:
   vehicle_count: 0/0 missing values interpolated
   avg_speed: 0/0 missing values interpolated

📊 Before/After Interpolation Comparison:
+----------------------------+---------------------------+
|vehicle_count_missing_before|vehicle_count_missing_after|
+----------------------------+---------------------------+
|                           0|                          0|
+----------------------------+---------------------------+



# =============================================================================
# SECTION 3: OUTLIER DETECTION & TREATMENT (Afternoon - 2 hours)
# =============================================================================

In [27]:
print("\n" + "=" * 60)
print("🎯 SECTION 3: OUTLIER DETECTION & TREATMENT")
print("=" * 60)


🎯 SECTION 3: OUTLIER DETECTION & TREATMENT


# =============================================================================
# DAY 2 DELIVERABLES & VALIDATION
# =============================================================================

In [25]:
print("\n" + "=" * 60)
print("📋 DAY 2 COMPLETION CHECKLIST")
print("=" * 60)


📋 DAY 2 COMPLETION CHECKLIST


In [26]:
def validate_day2_completion():
    """Validate that Day 2 objectives have been met"""
    
    checklist = {
        "data_profiling_completed": False,
        "sensor_health_analyzed": False,
        "missing_data_patterns_identified": False,
        "interpolation_implemented": False,
        "outlier_detection_working": False,
        "outlier_treatment_applied": False,
        "units_standardized": False,
        "data_lineage_tracked": False,
        "quality_scores_calculated": False
    }
    
    try:
        # Check if profiling was completed
        if len(profiles) > 0:
            checklist["data_profiling_completed"] = True
            
        # Check if sensor health analysis was done
        if len(sensor_health_results) > 0:
            checklist["sensor_health_analyzed"] = True
            
        # Check if outlier detection was implemented
        if len(outlier_results) > 0:
            checklist["outlier_detection_working"] = True
            
        # Check if standardization was applied
        if len(standardized_datasets) > 0:
            checklist["units_standardized"] = True
            
        # Check if final datasets have lineage
        if len(final_datasets) > 0:
            checklist["data_lineage_tracked"] = True
            
        # TODO: Add more specific validation checks
        # Check if missing data patterns were analyzed
        if 'missing_patterns_results' in globals():
            checklist["missing_data_patterns_identified"] = True
         
        
    except Exception as e:
        print(f"❌ Validation error: {str(e)}")
    
    # Display results
    print("✅ COMPLETION STATUS:")
    for item, status in checklist.items():
        status_icon = "✅" if status else "❌"
        print(f"   {status_icon} {item.replace('_', ' ').title()}")
    
    import builtins
    completion_rate = builtins.sum(checklist.values()) / len(checklist) * 100
    print(f"\n📊 Overall Completion: {completion_rate:.1f}%")
    
    if completion_rate >= 70:
        print("🎉 Great progress! You're ready for Day 3!")
        print("\n📈 KEY INSIGHTS FROM DAY 2:")
        print("- Data quality patterns identified across sensors")
        print("- Missing data handling strategies implemented") 
        print("- Outlier detection and treatment procedures established")
        print("- Standardized data formats for consistent analysis")
    else:
        print("📝 Please review incomplete items before proceeding to Day 3.")
    
    return checklist

# Run the validation
completion_status = validate_day2_completion()


❌ Validation error: name 'outlier_results' is not defined
✅ COMPLETION STATUS:
   ✅ Data Profiling Completed
   ✅ Sensor Health Analyzed
   ❌ Missing Data Patterns Identified
   ❌ Interpolation Implemented
   ❌ Outlier Detection Working
   ❌ Outlier Treatment Applied
   ❌ Units Standardized
   ❌ Data Lineage Tracked
   ❌ Quality Scores Calculated

📊 Overall Completion: 22.2%
📝 Please review incomplete items before proceeding to Day 3.
