# Day 3: Time Series Analysis & Feature Engineering
# Smart City IoT Analytics Pipeline

---

## 🎯 LEARNING OBJECTIVES:
- Perform time series analysis on sensor data
- Calculate correlations between different sensor types
- Engineer features for predictive modeling
- Implement window functions for trend analysis

## 📅 SCHEDULE:
**Morning (4 hours):**
1. Temporal Pattern Analysis (2 hours)
2. Cross-Sensor Correlation Analysis (2 hours)

**Afternoon (4 hours):**
3. Feature Engineering (3 hours)
4. Trend Analysis (1 hour)

## ✅ DELIVERABLES:
- Time series analysis dashboard
- Correlation study findings
- Feature engineering pipeline
- Trend analysis reports

---

In [1]:
# =============================================================================
# IMPORTS AND SETUP
# =============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# PySpark imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import pyspark.sql.functions as F

# Machine learning imports
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.stat import Correlation
from pyspark.ml.linalg import Vectors

# Initialize Spark Session
# Reuse a session already bound to the name `spark` if there is one;
# otherwise build a local session for this notebook.
try:
    spark.sparkContext.setLogLevel("WARN")
    print("✅ Using existing Spark session")
except Exception:
    # NameError when `spark` is not defined yet; any other failure while
    # touching the existing session also falls back to building one.
    # `Exception` (not a bare `except`) lets KeyboardInterrupt and
    # SystemExit propagate instead of being swallowed here.
    spark = (SparkSession.builder
             .appName("SmartCityIoTPipeline-Day3")
             .master("local[*]")
             .config("spark.driver.memory", "4g")
             .config("spark.sql.adaptive.enabled", "true")
             .getOrCreate())
    print("✅ Created new Spark session")

print("📈 Day 3: Time Series Analysis & Feature Engineering")
print("=" * 60)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/09/05 17:06:52 WARN Utils: Your hostname, Zipcoders-MacBook-Pro-29.local, resolves to a loopback address: 127.0.0.1; using 192.168.87.88 instead (on interface en0)
25/09/05 17:06:52 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/09/05 17:06:52 WARN Utils: Your hostname, Zipcoders-MacBook-Pro-29.local, resolves to a loopback address: 127.0.0.1; using 192.168.87.88 instead (on interface en0)
25/09/05 17:06:52 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/05 17:06:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java

✅ Created new Spark session
📈 Day 3: Time Series Analysis & Feature Engineering


In [7]:
# =============================================================================
# LOAD DATA (generates synthetic sample data; loading Day 1/2 output is not implemented in this cell)
# =============================================================================

# Progress message printed before the sample datasets are built
print("\n📂 Loading data for Day 3 analysis...")

def load_or_generate_datasets(days=7, seed=None):
    """Generate synthetic hourly sample datasets for the Day 3 analysis.

    Nothing is read from disk: each call builds fresh random data covering
    the last ``days`` days up to the current local time, at hourly intervals
    with both endpoints included (169 points for the default 7 days).

    Args:
        days: Length of the generated history in days. Defaults to 7.
        seed: Optional seed for the random generator. Pass a value to make
            the random sensor readings repeatable (timestamps still depend
            on the current time). ``None`` keeps the readings random.

    Returns:
        dict mapping 'zones', 'traffic', 'air_quality', 'weather' and
        'energy' to Spark DataFrames. An empty dict is returned if
        generation fails; the error and traceback are printed.

    Raises:
        ValueError: If ``days`` is negative.
    """
    if days < 0:
        raise ValueError("days must be non-negative")

    datasets = {}

    try:
        # The file's `from pyspark.sql.functions import *` rebinds `max`,
        # so the Python builtin is reached explicitly through `builtins`.
        import builtins
        import random
        from datetime import datetime, timedelta

        # Private generator: seeding it does not disturb the global `random` state.
        rng = random.Random(seed)

        # Time range ending now
        end_time = datetime.now()
        start_time = end_time - timedelta(days=days)

        # Hourly timestamps, both endpoints included
        timestamps = []
        current_time = start_time
        while current_time <= end_time:
            timestamps.append(current_time)
            current_time += timedelta(hours=1)

        print(f"🕐 Generating data for {len(timestamps)} time points...")

        # Sample zones: (zone_id, zone_name, zone_type, latitude, longitude)
        zones_data = [
            (1, "Downtown", "Commercial", 40.7589, -73.9851),
            (2, "Residential_North", "Residential", 40.7805, -73.9565),
            (3, "Industrial_South", "Industrial", 40.7282, -74.0776),
            (4, "Tech_District", "Commercial", 40.7484, -73.9857),
            (5, "Green_Park", "Recreational", 40.7812, -73.9665),
        ]

        datasets['zones'] = spark.createDataFrame(
            zones_data,
            ['zone_id', 'zone_name', 'zone_type', 'latitude', 'longitude'])

        # Every per-zone dataset below uses exactly the zones defined above.
        zone_ids = [zone[0] for zone in zones_data]

        # Traffic: baseline depends on the hour of day only
        traffic_data = []
        for i, ts in enumerate(timestamps):
            hour = ts.hour
            if 7 <= hour <= 9 or 17 <= hour <= 19:  # Rush hours
                base_traffic = 150
            elif 10 <= hour <= 16:  # Business hours
                base_traffic = 100
            else:
                base_traffic = 50

            for zone_id in zone_ids:
                vehicle_count = builtins.max(0, base_traffic + rng.randint(-30, 50))
                # More vehicles -> lower speed, floored at 10
                avg_speed = builtins.max(
                    10, int(60 - (vehicle_count / 5) + rng.randint(-10, 10)))

                traffic_data.append((
                    f"TRAFFIC_{zone_id}_{i}",
                    zone_id,
                    ts,
                    vehicle_count,
                    avg_speed,
                    40.7589 + (zone_id * 0.01),  # location_lat
                    -73.9851 - (zone_id * 0.01)  # location_lon
                ))

        datasets['traffic'] = spark.createDataFrame(
            traffic_data,
            ['sensor_id', 'zone_id', 'timestamp', 'vehicle_count', 'avg_speed',
             'location_lat', 'location_lon'])

        # Air quality
        # NOTE: the PM2.5 baseline varies by zone only; these values are NOT
        # derived from the traffic readings generated above.
        air_data = []
        for i, ts in enumerate(timestamps):
            for zone_id in zone_ids:
                base_pm25 = 15 + (zone_id * 5)  # Different baseline by zone
                pm25 = builtins.max(5, base_pm25 + rng.randint(-5, 15))
                no2 = builtins.max(10, 25 + rng.randint(-8, 12))

                air_data.append((
                    f"AIR_{zone_id}_{i}",
                    zone_id,
                    ts,
                    pm25,
                    no2,
                    rng.randint(40, 80)  # humidity
                ))

        datasets['air_quality'] = spark.createDataFrame(
            air_data,
            ['sensor_id', 'zone_id', 'timestamp', 'pm25', 'no2', 'humidity'])

        # Weather: one city-wide reading per timestamp (no zone dimension)
        weather_data = []
        for ts in timestamps:
            temp = 20 + rng.randint(-5, 15)  # Temperature around 20°C
            humidity = rng.randint(40, 80)
            wind_speed = rng.randint(5, 25)
            precipitation = rng.choice([0.0, 0.0, 0.0, 0.1, 0.5, 1.0])  # Mostly dry

            weather_data.append((ts, temp, humidity, wind_speed, precipitation))

        datasets['weather'] = spark.createDataFrame(
            weather_data,
            ['timestamp', 'temperature', 'humidity', 'wind_speed', 'precipitation'])

        # Energy: baseline depends on the hour of day only
        energy_data = []
        for i, ts in enumerate(timestamps):
            hour = ts.hour
            if 18 <= hour <= 22:  # Evening peak
                base_consumption = 200
            elif 6 <= hour <= 8:  # Morning peak
                base_consumption = 150
            else:
                base_consumption = 100

            for zone_id in zone_ids:
                consumption = builtins.max(50, base_consumption + rng.randint(-30, 50))

                energy_data.append((
                    f"ENERGY_{zone_id}_{i}",
                    zone_id,
                    ts,
                    consumption
                ))

        datasets['energy'] = spark.createDataFrame(
            energy_data,
            ['meter_id', 'zone_id', 'timestamp', 'energy_consumption_kwh'])

        print("✅ Basic sample data generated successfully")
        return datasets

    except Exception as e:
        # Best effort: report the failure and hand back an empty mapping so
        # the notebook cell itself does not abort.
        print(f"❌ Error generating datasets: {str(e)}")
        import traceback
        traceback.print_exc()
        return {}

# Load or generate the datasets
datasets = load_or_generate_datasets()

# Quick data overview: one line per dataset with its row count and columns
print("\n📊 Dataset Overview:")
for name, df in datasets.items():
    if df is None:
        print(f"   ❌ {name}: Failed to load")
        continue

    # Stored as `record_count` rather than `count`: a notebook-level `count`
    # would overwrite the `count` function brought in by
    # `from pyspark.sql.functions import *`.
    record_count = df.count()
    print(f"   📋 {name}: {record_count:,} records")
    if record_count > 0:
        print(f"      Columns: {', '.join(df.columns)}")

if datasets:
    print("\n🎯 Ready for Day 3 Time Series Analysis!")
else:
    # load_or_generate_datasets() returns an empty dict when generation fails;
    # do not claim readiness in that case.
    print("\n❌ No datasets available - fix the error above before continuing")


📂 Loading data for Day 3 analysis...
🕐 Generating data for 169 time points...
✅ Basic sample data generated successfully

📊 Dataset Overview:


                                                                                

   📋 zones: 5 records
      Columns: zone_id, zone_name, zone_type, latitude, longitude
   📋 traffic: 845 records
      Columns: sensor_id, zone_id, timestamp, vehicle_count, avg_speed, location_lat, location_lon
   📋 traffic: 845 records
      Columns: sensor_id, zone_id, timestamp, vehicle_count, avg_speed, location_lat, location_lon
   📋 air_quality: 845 records
      Columns: sensor_id, zone_id, timestamp, pm25, no2, humidity
   📋 air_quality: 845 records
      Columns: sensor_id, zone_id, timestamp, pm25, no2, humidity
   📋 weather: 169 records
      Columns: timestamp, temperature, humidity, wind_speed, precipitation
   📋 energy: 845 records
      Columns: meter_id, zone_id, timestamp, energy_consumption_kwh

🎯 Ready for Day 3 Time Series Analysis!
   📋 weather: 169 records
      Columns: timestamp, temperature, humidity, wind_speed, precipitation
   📋 energy: 845 records
      Columns: meter_id, zone_id, timestamp, energy_consumption_kwh

🎯 Ready for Day 3 Time Series Analysis!


---

# SECTION 1: TEMPORAL PATTERN ANALYSIS (Morning - 2 hours)

---

In [None]:
# Section banner: blank line, rule, title, rule
print(
    "\n" + "=" * 60,
    "⏰ SECTION 1: TEMPORAL PATTERN ANALYSIS",
    "=" * 60,
    sep="\n",
)

## TODO 1.1: Seasonal Decomposition Analysis (60 minutes)

🎯 **TASK:** Decompose time series into trend, seasonal, and residual components  
💡 **HINT:** Look for daily, weekly, and monthly patterns  
📚 **CONCEPTS:** Seasonality, trends, cyclical patterns, decomposition

In [9]:
def analyze_temporal_patterns(df, value_col, time_col="timestamp", sensor_col=None):
    """
    Print temporal summaries of a sensor value and return the data enriched
    with calendar columns.

    The hourly, day-of-week, weekend/weekday and monthly aggregates are only
    displayed with show(); they are not returned.

    Args:
        df: DataFrame with time series data
        value_col: Name of the column whose values are summarised
        time_col: Name of the timestamp column
        sensor_col: Sensor ID column (accepted, but not used by this function)

    Returns:
        df with the added columns year, month, day, hour, day_of_week,
        week_of_year and is_weekend
    """
    print(f"\n📈 Temporal Pattern Analysis: {value_col}")
    print("-" * 40)

    # Calendar features derived from the timestamp. The dict's insertion
    # order fixes the order in which the columns are appended.
    calendar_features = {
        "year": F.year(time_col),
        "month": F.month(time_col),
        "day": F.dayofmonth(time_col),
        "hour": F.hour(time_col),
        "day_of_week": F.dayofweek(time_col),
        "week_of_year": F.weekofyear(time_col),
        # Day-of-week values 1 and 7 are the ones labelled Sunday/Saturday below
        "is_weekend": F.when(F.dayofweek(time_col).isin([1, 7]), True).otherwise(False),
    }
    df_with_time = df
    for feature_name, feature_expr in calendar_features.items():
        df_with_time = df_with_time.withColumn(feature_name, feature_expr)

    # Aggregations shared by several of the summaries below
    mean_value = F.avg(value_col).alias("avg_value")
    reading_count = F.count(value_col).alias("count_readings")

    # --- Hourly profile ---
    print("🕐 Hourly Patterns:")
    by_hour = df_with_time.groupBy("hour")
    hourly_patterns = by_hour.agg(
        mean_value,
        F.stddev(value_col).alias("stddev_value"),
        F.min(value_col).alias("min_value"),
        F.max(value_col).alias("max_value"),
        reading_count,
    ).orderBy("hour")

    hourly_patterns.show(24)

    # The three hours with the highest average value
    peak_hours = hourly_patterns.orderBy(F.col("avg_value").desc()).limit(3)
    print("   🔝 Peak hours:")
    peak_hours.show()

    # --- Day-of-week profile ---
    print("\n📅 Day of Week Patterns:")
    by_weekday = df_with_time.groupBy("day_of_week")
    daily_patterns = by_weekday.agg(mean_value, reading_count).orderBy("day_of_week")

    # Translate day numbers 1..7 into names; any other value yields null
    weekday_names = ["Sunday", "Monday", "Tuesday", "Wednesday",
                     "Thursday", "Friday", "Saturday"]
    day_name_expr = None
    for day_number, label in enumerate(weekday_names, start=1):
        matches_day = F.col("day_of_week") == day_number
        if day_name_expr is None:
            day_name_expr = F.when(matches_day, label)
        else:
            day_name_expr = day_name_expr.when(matches_day, label)
    daily_patterns = daily_patterns.withColumn("day_name", day_name_expr)

    daily_patterns.select("day_name", "avg_value", "count_readings").show()

    # --- Weekend vs weekday ---
    weekend_vs_weekday = df_with_time.groupBy("is_weekend").agg(mean_value, reading_count)

    print("\n🏖️ Weekend vs Weekday:")
    weekend_vs_weekday.show()

    # --- Monthly profile ---
    print("\n📊 Monthly Patterns:")
    by_month = df_with_time.groupBy("month")
    monthly_patterns = by_month.agg(mean_value, reading_count).orderBy("month")

    monthly_patterns.show(12)

    return df_with_time

In [10]:
# Run the temporal pattern analysis on the traffic dataset
print("🚗 Testing temporal pattern analysis with traffic data...")

traffic_df = datasets.get('traffic')
if traffic_df is None:
    # Key missing, or present but holding None
    print("❌ Traffic data not available for analysis")
else:
    # Summarise vehicle counts and keep the frame with the added time columns
    traffic_with_patterns = analyze_temporal_patterns(traffic_df, 'vehicle_count')

    print("\n✅ Temporal pattern analysis completed!")
    print("📊 Sample data with temporal features:")
    preview_columns = ['timestamp', 'vehicle_count', 'hour', 'day_of_week', 'is_weekend']
    traffic_with_patterns.select(*preview_columns).show(10)

🚗 Testing temporal pattern analysis with traffic data...

📈 Temporal Pattern Analysis: vehicle_count
----------------------------------------
🕐 Hourly Patterns:
+----+------------------+------------------+---------+---------+--------------+
|hour|         avg_value|      stddev_value|min_value|max_value|count_readings|
+----+------------------+------------------+---------+---------+--------------+
|   0| 57.91428571428571|23.417403569054503|       21|       97|            35|
|   1| 60.77142857142857|26.761567827857956|       20|       99|            35|
|   2| 58.17142857142857|19.297189241053786|       22|       92|            35|
|   3| 54.65714285714286|25.330686873414855|       20|       99|            35|
|   4| 61.57142857142857| 22.77290148436968|       21|       97|            35|
|   5| 66.02857142857142|21.605399262937134|       21|       99|            35|
|   6| 60.08571428571429|21.556180263743972|       22|       99|            35|
|   7| 165.9142857142857|21.82331649537