# Energy Consumption Forecasting Project
## Initial Data Exploration & Pipeline Demonstration

### 1. Project Setup

In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from IPython.display import Markdown, display
import gc  # Garbage collector

plt.style.use('seaborn-v0_8-whitegrid') 
pd.set_option('display.max_columns', 50)

### 2. Raw Data Inspection

In [64]:
dtypes = {
    'Date': str,
    'Time': str,
    'Global_active_power': float,
    'Global_reactive_power': float,
    'Voltage': float,
    'Global_intensity': float,
    'Sub_metering_1': float,
    'Sub_metering_2': float,
    'Sub_metering_3': float
}

raw_path = r'C:\Users\AliRashaideh\OneDrive - Seagulls\Desktop\energy_forecasting_project\data\raw\household_power_consumption.csv'

df=pd.read_csv(raw_path, sep=';', dtype=dtypes, na_values=['?', 'nan'])
  
    
df['datetime'] = pd.to_datetime(
    df['Date'] + ' ' + df['Time'],
    format='%d/%m/%Y %H:%M:%S',
    errors='coerce'
)

df = df[df['datetime'].between('2006-12-16', '2010-11-26')]


display(Markdown(f"#### Final Dataset: {len(df)} rows"))

#### Final Dataset: 2073997 rows

In [65]:
min_date = df['datetime'].min()
max_date = df['datetime'].max()
display(Markdown(f"**Date Range:** {min_date} to {max_date}"))

invalid_dates = df[df['datetime'].isna()]
display(Markdown(f"**Invalid Dates:** {len(invalid_dates)} rows"))
if len(invalid_dates) > 0:
    display(invalid_dates.head(3))

df = df.dropna(subset=['datetime'])
df.set_index('datetime', inplace=True)

**Date Range:** 2006-12-16 17:24:00 to 2010-11-26 00:00:00

**Invalid Dates:** 0 rows

In [70]:
#showing null values
display(Markdown("### Null Values in Each Column:"))
null_counts = df.isnull().sum()
null_counts = null_counts[null_counts > 0]
if not null_counts.empty:
    display(null_counts)

### Null Values in Each Column:

Global_active_power      25979
Global_reactive_power    25979
Voltage                  25979
Global_intensity         25979
Sub_metering_1           25979
Sub_metering_2           25979
Sub_metering_3           25979
dtype: int64

### 3. Fixing dtypes and Handle Missing Values (Temporary for eda) 

In [68]:
def plot_safe_aggregation(series, title, ax, agg_freq='D'):
    """Safe plotting using aggregation"""
    try:
        # Aggregate to daily frequency
        agg_data = series.resample(agg_freq).mean()
        
        # Plot with proper date formatting
        agg_data.plot(ax=ax, title=title)
        ax.xaxis.set_major_locator(mdates.YearLocator())
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
        return True
    except Exception as e:
        print(f"Error plotting {title}: {str(e)}")
        return False
    
def plot_hourly_patterns(df, title_suffix=""):
    """Plot aggregated hourly patterns without timeline"""
    fig, axes = plt.subplots(3, 1, figsize=(15, 15))
    
    # 1. By hour of day
    hourly = df.groupby(df.index.hour).mean()
    hourly['Global_active_power'].plot(
        ax=axes[0], 
        kind='bar',
        title=f'Hourly Pattern {title_suffix}',
        color='skyblue'
    )
    axes[0].set_xlabel('Hour of Day')
    
    # 2. By day of week
    weekday = df.groupby(df.index.dayofweek).mean()
    weekday['Global_active_power'].plot(
        ax=axes[1], 
        kind='bar',
        title=f'Weekly Pattern {title_suffix}',
        color='lightgreen'
    )
    axes[1].set_xlabel('Day of Week (0=Monday, 6=Sunday)')
    
    # 3. By month of year
    monthly = df.groupby(df.index.month).mean()
    monthly['Global_active_power'].plot(
        ax=axes[2], 
        kind='bar',
        title=f'Monthly Pattern {title_suffix}',
        color='salmon'
    )
    axes[2].set_xlabel('Month of Year')
    
    plt.tight_layout()
    return fig