In [1]:
import os
import pickle
import warnings

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# pandas / scikit-learn in later cells — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Confirmation banner: reaching this line means all imports above succeeded.
print("âœ… Libraries loaded")

âœ… Libraries loaded


## 1. Load Data

In [2]:
print("Loading data...")

# Read the raw semicolon-delimited export. '?' entries are treated as NaN so
# they can be dropped in the missing-value step.
df = pd.read_csv('../datasets/household_power_consumption.txt',
                 sep=';',
                 low_memory=False,
                 na_values=['?'])

# Merge the separate Date and Time text columns into a single 'datetime'
# column placed first in the frame (same layout the old parse_dates dict produced).
# An explicit day-first format replaces the deprecated `infer_datetime_format`
# and nested `parse_dates` options, and removes any day/month ambiguity.
df.insert(0, 'datetime',
          pd.to_datetime(df.pop('Date') + ' ' + df.pop('Time'),
                         format='%d/%m/%Y %H:%M:%S'))

print(f"âœ… Loaded {len(df):,} rows")

Loading data...
âœ… Loaded 2,075,259 rows
âœ… Loaded 2,075,259 rows


## 2. Handle Missing Values

In [3]:
# Measurement columns that must be numeric before modeling
numeric_cols = [
    'Global_active_power', 'Global_reactive_power', 'Voltage',
    'Global_intensity', 'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3',
]

# Coerce every measurement column in one pass; unparseable entries become NaN
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

print(f"Missing before: {df.isna().sum().sum():,}")

# Discard any row containing at least one NaN (only ~1.25% of the data)
df = df.dropna(axis=0, how='any')

print(f"Missing after: {df.isna().sum().sum()}")
print(f"Rows remaining: {len(df):,}")

Missing before: 181,853
Missing after: 0
Rows remaining: 2,049,280


## 3. Create Time-Based Features

In [4]:
# Calendar components extracted from the timestamp via the vectorized .dt accessor
df['hour'] = df['datetime'].dt.hour
df['day'] = df['datetime'].dt.day
df['month'] = df['datetime'].dt.month
df['dayofweek'] = df['datetime'].dt.dayofweek  # 0=Monday, 6=Sunday
df['quarter'] = df['datetime'].dt.quarter
df['year'] = df['datetime'].dt.year

# Weekend flag: 1 for Saturday/Sunday (dayofweek 5 or 6), else 0
df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)

# Season (1=Winter, 2=Spring, 3=Summer, 4=Fall), computed arithmetically
# instead of a per-row Python lambda:
#   month % 12 maps December to 0, so Dec/Jan/Feb -> 0,1,2 share bucket 0;
#   // 3 groups the months in threes; + 1 shifts the buckets to 1..4.
df['season'] = (df['month'] % 12 // 3 + 1).astype('int64')

print("âœ… Time features created:")
print(df[['datetime', 'hour', 'dayofweek', 'is_weekend', 'month', 'season']].head())

âœ… Time features created:
             datetime  hour  dayofweek  is_weekend  month  season
0 2006-12-16 17:24:00    17          5           1     12       1
1 2006-12-16 17:25:00    17          5           1     12       1
2 2006-12-16 17:26:00    17          5           1     12       1
3 2006-12-16 17:27:00    17          5           1     12       1
4 2006-12-16 17:28:00    17          5           1     12       1


## 4. Sample Data (For Faster Training)

The full dataset (~2M rows) is too large for fast experimentation, so we draw a random sample of 100k rows for training.

In [5]:
# Simple random sample (not stratified), drawn with a fixed seed so the
# subset is reproducible across runs.
SAMPLE_SIZE = 100000

# Cap the request at the number of available rows: DataFrame.sample raises
# ValueError when n exceeds the population and replace=False.
df_sampled = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)

# Restore chronological order and a clean 0..n-1 index after the shuffle
df_sampled = df_sampled.sort_values('datetime').reset_index(drop=True)

print(f"âœ… Sampled {len(df_sampled):,} rows")
print(f"Date range: {df_sampled['datetime'].min()} to {df_sampled['datetime'].max()}")

âœ… Sampled 100,000 rows
Date range: 2006-12-16 17:48:00 to 2010-11-26 20:19:00


## 5. Prepare Features and Target

In [6]:
# Predictor columns: electrical readings followed by the derived calendar features.
# NOTE(review): Global_intensity is presumably very closely tied to the target
# (active power) — confirm that including it is intended and not leakage.
feature_cols = [
    'Voltage', 'Global_intensity',
    'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3',
    'hour', 'dayofweek', 'is_weekend', 'month', 'season',
]

# Independent copies so later cells cannot mutate df_sampled through X or y
X = df_sampled.loc[:, feature_cols].copy()
y = df_sampled.loc[:, 'Global_active_power'].copy()

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeatures used: {feature_cols}")

Features shape: (100000, 10)
Target shape: (100000,)

Features used: ['Voltage', 'Global_intensity', 'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3', 'hour', 'dayofweek', 'is_weekend', 'month', 'season']


## 6. Train-Test Split

In [7]:
# Shuffled 80/20 hold-out split; the fixed seed makes the partition reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

print(f"Training set: {len(X_train):,} samples")
print(f"Testing set: {len(X_test):,} samples")

Training set: 80,000 samples
Testing set: 20,000 samples


## 7. Feature Scaling

In [8]:
# Learn per-feature mean/std from the training split only, then apply the
# same transformation to both splits so no test statistics leak into the scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("âœ… Features scaled (StandardScaler)")
# Aggregate statistics over the whole scaled training matrix (all features pooled)
print(f"\nMean after scaling: {np.mean(X_train_scaled):.6f}")
print(f"Std after scaling: {np.std(X_train_scaled):.6f}")

âœ… Features scaled (StandardScaler)

Mean after scaling: 0.000000
Std after scaling: 1.000000


## 8. Save Preprocessed Data

In [9]:
# Bundle everything the downstream notebooks need: raw and scaled splits,
# the targets, the feature names and the fitted scaler.
preprocessed_data = {
    'X_train': X_train,
    'X_test': X_test,
    'X_train_scaled': X_train_scaled,
    'X_test_scaled': X_test_scaled,
    'y_train': y_train,
    'y_test': y_test,
    'feature_names': feature_cols,
    'scaler': scaler
}

output_path = '../datasets/processed/household_preprocessed.pkl'

# Create the target directory on first run; open(..., 'wb') does not create
# missing parent directories and would raise FileNotFoundError otherwise.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# NOTE: pickle files should only ever be loaded from trusted sources.
with open(output_path, 'wb') as f:
    pickle.dump(preprocessed_data, f)

print(f"âœ… Preprocessed data saved to: {output_path}")
print("\nðŸ“Š Summary:")
print(f"   â€¢ Train samples: {len(X_train):,}")
print(f"   â€¢ Test samples: {len(X_test):,}")
print(f"   â€¢ Features: {len(feature_cols)}")
print("   â€¢ Target: Global_active_power (kW)")
print("\nâœ… Ready for modeling!")

âœ… Preprocessed data saved to: ../datasets/processed/household_preprocessed.pkl

ðŸ“Š Summary:
   â€¢ Train samples: 80,000
   â€¢ Test samples: 20,000
   â€¢ Features: 10
   â€¢ Target: Global_active_power (kW)

âœ… Ready for modeling!
