In [11]:
# ============================================================================
# CELL 1: Imports
# ============================================================================
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, FunctionTransformer, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn import set_config
from pathlib import Path
import joblib
import warnings
warnings.filterwarnings('ignore')

set_config(transform_output="pandas")  # To preserve column names in transform


In [12]:
# ============================================================================
# FEATURE ENGINEERING FOR MODIFIED DATAFRAME
# ============================================================================

print("="*80)
print("FEATURE ENGINEERING - MODIFIED DATAFRAME")
print("="*80)

FEATURE ENGINEERING - MODIFIED DATAFRAME


In [13]:
data_path_o = Path('../data/raw/steel_energy_original.csv')
df_o = pd.read_csv(data_path_o)

data_path_m = Path('../data/raw/steel_energy_modified.csv')
df_m = pd.read_csv(data_path_m)

In [14]:
# ============================================================================
# CELL 2: Load Data
# ============================================================================
# Option 1: If loading from CSV
# df = pd.read_csv('../data/raw/energy_data.csv')

# Option 2: Use existing df_m from previous notebook
df = df_m.copy()

print(f"Initial shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

Initial shape: (35740, 12)
Columns: ['date', 'Usage_kWh', 'Lagging_Current_Reactive.Power_kVarh', 'Leading_Current_Reactive_Power_kVarh', 'CO2(tCO2)', 'Lagging_Current_Power_Factor', 'Leading_Current_Power_Factor', 'NSM', 'WeekStatus', 'Day_of_week', 'Load_Type', 'mixed_type_col']


In [15]:
# ============================================================================
# CELL 3: Convert Data Types
# ============================================================================
print("=" * 80)
print("STEP 1: DATA TYPE CONVERSION")
print("=" * 80)

# Convert date to datetime
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Convert numeric columns
numeric_cols = [
    'Usage_kWh',
    'Lagging_Current_Reactive.Power_kVarh',
    'Leading_Current_Reactive_Power_kVarh',
    'CO2(tCO2)',
    'Lagging_Current_Power_Factor',
    'Leading_Current_Power_Factor',
    'NSM'
]

for col in numeric_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

print(f"✓ Data types converted")
print(f"  Shape: {df.shape}")


STEP 1: DATA TYPE CONVERSION
✓ Data types converted
  Shape: (35740, 12)


In [16]:
# ============================================================================
# CELL 4: Create Engineered Features
# ============================================================================
print("\n" + "=" * 80)
print("STEP 2: FEATURE ENGINEERING")
print("=" * 80)

def engineer_features(df):
    """Create engineered features for energy consumption"""
    df_eng = df.copy()
    
    # 1. Temporal Features
    df_eng['year'] = df_eng['date'].dt.year
    df_eng['month'] = df_eng['date'].dt.month
    df_eng['day'] = df_eng['date'].dt.day
    df_eng['hour'] = df_eng['date'].dt.hour
    df_eng['day_of_week_num'] = df_eng['date'].dt.dayofweek  # 0=Monday, 6=Sunday
    df_eng['is_weekend'] = (df_eng['day_of_week_num'] >= 5).astype(int)
    df_eng['quarter'] = df_eng['date'].dt.quarter
    
    # 2. Cyclical Encoding for Temporal Features
    df_eng['hour_sin'] = np.sin(2 * np.pi * df_eng['hour'] / 24)
    df_eng['hour_cos'] = np.cos(2 * np.pi * df_eng['hour'] / 24)
    df_eng['month_sin'] = np.sin(2 * np.pi * df_eng['month'] / 12)
    df_eng['month_cos'] = np.cos(2 * np.pi * df_eng['month'] / 12)
    df_eng['dow_sin'] = np.sin(2 * np.pi * df_eng['day_of_week_num'] / 7)
    df_eng['dow_cos'] = np.cos(2 * np.pi * df_eng['day_of_week_num'] / 7)
    
    # 3. Power Factor Features
    if 'Lagging_Current_Power_Factor' in df_eng.columns and 'Leading_Current_Power_Factor' in df_eng.columns:
        df_eng['power_factor_ratio'] = df_eng['Lagging_Current_Power_Factor'] / (df_eng['Leading_Current_Power_Factor'] + 1e-6)
        df_eng['power_factor_diff'] = df_eng['Lagging_Current_Power_Factor'] - df_eng['Leading_Current_Power_Factor']
        df_eng['avg_power_factor'] = (df_eng['Lagging_Current_Power_Factor'] + df_eng['Leading_Current_Power_Factor']) / 2
    
    # 4. Reactive Power Features
    if 'Lagging_Current_Reactive.Power_kVarh' in df_eng.columns and 'Leading_Current_Reactive_Power_kVarh' in df_eng.columns:
        df_eng['reactive_power_total'] = df_eng['Lagging_Current_Reactive.Power_kVarh'] + df_eng['Leading_Current_Reactive_Power_kVarh']
        df_eng['reactive_power_diff'] = df_eng['Lagging_Current_Reactive.Power_kVarh'] - df_eng['Leading_Current_Reactive_Power_kVarh']
        df_eng['reactive_power_ratio'] = df_eng['Lagging_Current_Reactive.Power_kVarh'] / (df_eng['Leading_Current_Reactive_Power_kVarh'] + 1e-6)
    
    # 5. Energy Efficiency Indicators
    if 'CO2(tCO2)' in df_eng.columns and 'Usage_kWh' in df_eng.columns:
        df_eng['co2_per_kwh'] = df_eng['CO2(tCO2)'] / (df_eng['Usage_kWh'] + 1e-6)
    
    # 6. High Consumption Flag
    if 'Usage_kWh' in df_eng.columns:
        df_eng['is_high_consumption'] = (df_eng['Usage_kWh'] > df_eng['Usage_kWh'].median()).astype(int)
    
    # 7. NSM ratio (normalized)
    if 'NSM' in df_eng.columns and 'Usage_kWh' in df_eng.columns:
        df_eng['nsm_per_kwh'] = df_eng['NSM'] / (df_eng['Usage_kWh'] + 1e-6)
    
    return df_eng

df_engineered = engineer_features(df)
print(f"✓ Features engineered")
print(f"  New shape: {df_engineered.shape}")
print(f"  New columns added: {df_engineered.shape[1] - df.shape[1]}")


STEP 2: FEATURE ENGINEERING
✓ Features engineered
  New shape: (35740, 34)
  New columns added: 22


In [17]:
# ============================================================================
# CELL 5: Split Features and Target
# ============================================================================
print("\n" + "=" * 80)
print("STEP 3: SPLIT FEATURES AND TARGET")
print("=" * 80)

# Define columns to exclude from features
exclude_cols = ['date', 'Usage_kWh', 'Day_of_week']

# Get feature columns
feature_cols = [col for col in df_engineered.columns if col not in exclude_cols]

# Create X and y
X = df_engineered[feature_cols].copy()
y = df_engineered['Usage_kWh'].copy()

# Handle any remaining NaN values in target
y = y.fillna(y.median())

print(f"✓ Data split completed")
print(f"  Features shape: {X.shape}")
print(f"  Target shape: {y.shape}")


STEP 3: SPLIT FEATURES AND TARGET
✓ Data split completed
  Features shape: (35740, 31)
  Target shape: (35740,)


In [18]:
# ============================================================================
# CELL 6: Define Column Groups for Preprocessing
# ============================================================================
print("\n" + "=" * 80)
print("STEP 4: DEFINE PREPROCESSING COLUMN GROUPS")
print("=" * 80)

# Skewed numeric columns (will apply log1p transformation)
num_skew = [
    'Lagging_Current_Reactive.Power_kVarh',
    'Leading_Current_Reactive_Power_kVarh',
    'CO2(tCO2)',
    'reactive_power_total',
    'NSM',
    'co2_per_kwh',
    'nsm_per_kwh'
]
# Filter only existing columns
num_skew = [col for col in num_skew if col in X.columns]

# Linear numeric columns (no transformation needed)
num_lin = [
    'Lagging_Current_Power_Factor',
    'Leading_Current_Power_Factor',
    'hour', 'hour_sin', 'hour_cos',
    'dow_sin', 'dow_cos',
    'month_sin', 'month_cos',
    'power_factor_ratio', 'power_factor_diff', 'avg_power_factor',
    'reactive_power_diff', 'reactive_power_ratio',
    'year', 'month', 'day', 'quarter',
    'day_of_week_num', 'is_weekend', 'is_high_consumption'
]
# Filter only existing columns
num_lin = [col for col in num_lin if col in X.columns]

# Nominal categorical (binary/unordered)
cat_nom = ['WeekStatus']
cat_nom = [col for col in cat_nom if col in X.columns]

# Ordinal categorical (ordered categories)
cat_ord = ['Load_Type']
cat_ord = [col for col in cat_ord if col in X.columns]

print(f"Column groups defined:")
print(f"  Skewed numeric ({len(num_skew)}): {num_skew}")
print(f"  Linear numeric ({len(num_lin)}): {num_lin}")
print(f"  Nominal categorical ({len(cat_nom)}): {cat_nom}")
print(f"  Ordinal categorical ({len(cat_ord)}): {cat_ord}")


STEP 4: DEFINE PREPROCESSING COLUMN GROUPS
Column groups defined:
  Skewed numeric (7): ['Lagging_Current_Reactive.Power_kVarh', 'Leading_Current_Reactive_Power_kVarh', 'CO2(tCO2)', 'reactive_power_total', 'NSM', 'co2_per_kwh', 'nsm_per_kwh']
  Linear numeric (21): ['Lagging_Current_Power_Factor', 'Leading_Current_Power_Factor', 'hour', 'hour_sin', 'hour_cos', 'dow_sin', 'dow_cos', 'month_sin', 'month_cos', 'power_factor_ratio', 'power_factor_diff', 'avg_power_factor', 'reactive_power_diff', 'reactive_power_ratio', 'year', 'month', 'day', 'quarter', 'day_of_week_num', 'is_weekend', 'is_high_consumption']
  Nominal categorical (1): ['WeekStatus']
  Ordinal categorical (1): ['Load_Type']


In [19]:
# ============================================================================
# CELL 7: Create Preprocessing Pipelines
# ============================================================================
print("\n" + "=" * 80)
print("STEP 5: CREATE PREPROCESSING PIPELINES")
print("=" * 80)

# Pipeline for skewed numeric features (log1p + MinMaxScaler)
num_skew_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('log1p', FunctionTransformer(np.log1p, feature_names_out='one-to-one')),
    ('scaler', MinMaxScaler())
])

# Pipeline for linear numeric features (MinMaxScaler only)
num_lin_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())
])

# Pipeline for nominal categorical features (OneHotEncoder)
nom_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first'))
])

# Pipeline for ordinal categorical features (OrdinalEncoder)
ord_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ord', OrdinalEncoder(categories=[['Light_Load', 'Medium_Load', 'Maximum_Load']], 
                          handle_unknown='use_encoded_value', unknown_value=-1))
])

# Combine all pipelines using ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
    ('num_skew', num_skew_pipe, num_skew),
    ('num_lin',  num_lin_pipe,  num_lin),
    ('cat_nom',  nom_pipe,      cat_nom),
    ('cat_ord',  ord_pipe,      cat_ord),
], remainder='drop')

print("✓ Preprocessing pipelines created")
print(f"  - Skewed numeric: Imputer → Log1p → MinMaxScaler")
print(f"  - Linear numeric: Imputer → MinMaxScaler")
print(f"  - Nominal categorical: Imputer → OneHotEncoder")
print(f"  - Ordinal categorical: Imputer → OrdinalEncoder")



STEP 5: CREATE PREPROCESSING PIPELINES
✓ Preprocessing pipelines created
  - Skewed numeric: Imputer → Log1p → MinMaxScaler
  - Linear numeric: Imputer → MinMaxScaler
  - Nominal categorical: Imputer → OneHotEncoder
  - Ordinal categorical: Imputer → OrdinalEncoder


In [20]:
# ============================================================================
# CELL 8: Fit and Transform Data
# ============================================================================
print("\n" + "=" * 80)
print("STEP 6: FIT AND TRANSFORM DATA")
print("=" * 80)

# Fit the preprocessor and transform the data
X_processed = preprocessor.fit_transform(X)

print(f"✓ Preprocessing completed")
print(f"  Original shape: {X.shape}")
print(f"  Processed shape: {X_processed.shape}")
print(f"\nProcessed feature names:")
print(f"  {list(X_processed.columns)}")



STEP 6: FIT AND TRANSFORM DATA
✓ Preprocessing completed
  Original shape: (35740, 31)
  Processed shape: (35740, 33)

Processed feature names:
  ['num_skew__Lagging_Current_Reactive.Power_kVarh', 'num_skew__Leading_Current_Reactive_Power_kVarh', 'num_skew__CO2(tCO2)', 'num_skew__reactive_power_total', 'num_skew__NSM', 'num_skew__co2_per_kwh', 'num_skew__nsm_per_kwh', 'num_lin__Lagging_Current_Power_Factor', 'num_lin__Leading_Current_Power_Factor', 'num_lin__hour', 'num_lin__hour_sin', 'num_lin__hour_cos', 'num_lin__dow_sin', 'num_lin__dow_cos', 'num_lin__month_sin', 'num_lin__month_cos', 'num_lin__power_factor_ratio', 'num_lin__power_factor_diff', 'num_lin__avg_power_factor', 'num_lin__reactive_power_diff', 'num_lin__reactive_power_ratio', 'num_lin__year', 'num_lin__month', 'num_lin__day', 'num_lin__quarter', 'num_lin__day_of_week_num', 'num_lin__is_weekend', 'num_lin__is_high_consumption', 'cat_nom__WeekStatus_ wEEKDAY ', 'cat_nom__WeekStatus_ wEEKEND ', 'cat_nom__WeekStatus_Weekday

In [21]:
# ============================================================================
# CELL 9: Save Processed Data and Pipeline
# ============================================================================
output_dir = Path('../data/processed')
output_dir.mkdir(parents=True, exist_ok=True)

# Save processed features and target
X_processed.to_csv(output_dir / 'X_features.csv', index=False)
y.to_csv(output_dir / 'y_target.csv', index=False)

# Save preprocessor pipeline
joblib.dump(preprocessor, output_dir / 'preprocessor.pkl')

# Save feature information
feature_info = {
    'original_features': list(X.columns),
    'processed_features': list(X_processed.columns),
    'num_skew': num_skew,
    'num_lin': num_lin,
    'cat_nom': cat_nom,
    'cat_ord': cat_ord
}
joblib.dump(feature_info, output_dir / 'feature_info.pkl')

print("✅ Processed data saved!")
print(f"\nSaved files:")
print(f"  • X_features.csv         ({X_processed.shape[0]} rows × {X_processed.shape[1]} columns)")
print(f"  • y_target.csv           ({y.shape[0]} rows)")
print(f"  • preprocessor.pkl       (Complete preprocessing pipeline)")
print(f"  • feature_info.pkl       (Feature metadata)")
print("\n" + "=" * 80)
print("PIPELINE READY FOR MODEL TRAINING!")
print("=" * 80)
print("\nExample usage for modeling:")
print("""
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Load preprocessor
preprocessor = joblib.load('../data/processed/preprocessor.pkl')

# Create model pipeline
pipe_lr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit model
pipe_lr.fit(X_train, y_train)

# Predict
y_pred = pipe_lr.predict(X_test)
""")

✅ Processed data saved!

Saved files:
  • X_features.csv         (35740 rows × 33 columns)
  • y_target.csv           (35740 rows)
  • preprocessor.pkl       (Complete preprocessing pipeline)
  • feature_info.pkl       (Feature metadata)

PIPELINE READY FOR MODEL TRAINING!

Example usage for modeling:

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Load preprocessor
preprocessor = joblib.load('../data/processed/preprocessor.pkl')

# Create model pipeline
pipe_lr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit model
pipe_lr.fit(X_train, y_train)

# Predict
y_pred = pipe_lr.predict(X_test)

