In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as mp
import seaborn as sns

In [15]:
#Load the dataset
data = pd.read_csv(
    "household_power_consumption.txt",
    sep=';',
    na_values='?',            # Convert '?' to NaN
    low_memory=False
)

In [17]:
# Clean column names first
data.columns = data.columns.str.strip()

# Combine Date and Time
data['DateTime'] = pd.to_datetime(data['Date'] + ' ' + data['Time'])

# Set DateTime as index
data.set_index('DateTime', inplace=True)

# Drop rows with missing values
data.dropna(inplace=True)

# Convert numeric columns to float, ignore object/text
data = data.astype(float, errors='ignore')

# See missing values
print(data.isnull().sum())
data


  data['DateTime'] = pd.to_datetime(data['Date'] + ' ' + data['Time'])


Date                     0
Time                     0
Global_active_power      0
Global_reactive_power    0
Voltage                  0
Global_intensity         0
Sub_metering_1           0
Sub_metering_2           0
Sub_metering_3           0
dtype: int64


Unnamed: 0_level_0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2006-12-16 17:24:00,16/12/2006,17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
2006-12-16 17:25:00,16/12/2006,17:25:00,5.360,0.436,233.63,23.0,0.0,1.0,16.0
2006-12-16 17:26:00,16/12/2006,17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
2006-12-16 17:27:00,16/12/2006,17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
2006-12-16 17:28:00,16/12/2006,17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0
...,...,...,...,...,...,...,...,...,...
2010-11-26 20:58:00,26/11/2010,20:58:00,0.946,0.000,240.43,4.0,0.0,0.0,0.0
2010-11-26 20:59:00,26/11/2010,20:59:00,0.944,0.000,240.00,4.0,0.0,0.0,0.0
2010-11-26 21:00:00,26/11/2010,21:00:00,0.938,0.000,239.82,3.8,0.0,0.0,0.0
2010-11-26 21:01:00,26/11/2010,21:01:00,0.934,0.000,239.70,3.8,0.0,0.0,0.0


In [8]:
hourly_data = data.resample('H').mean()

#Add time-related features

hourly_data['hour'] = hourly_data.index.hour 
hourly_data['dayofweek'] = hourly_data.index.dayofweek 
hourly_data['is_weekend'] = (hourly_data['dayofweek'] >= 5).astype(int)


  hourly_data = data.resample('H').mean()


In [9]:
from sklearn.metrics import mean_absolute_error, mean_squared_error 
from xgboost import XGBRegressor 
from statsmodels.tsa.arima.model import ARIMA 
from prophet import Prophet

ModuleNotFoundError: No module named 'xgboost'

In [None]:
Split the data into training and testing sets

train = hourly_data[:-168]  # use all but last week for training 
test = hourly_data[-168:]  # use last week for testing

In [None]:
ARIMA Model

arima_model = ARIMA(train['Global_active_power'], order=(3,1,2)) 
arima_result = arima_model.fit() 
arima_forecast = arima_result.forecast(steps=len(test))

In [None]:
Prophet Model

prophet_df = train[['Global_active_power']].reset_index() 
prophet_df.columns = ['ds', 'y'] 
prophet_model = Prophet(daily_seasonality=True) 
prophet_model.fit(prophet_df) 
future_dates = prophet_model.make_future_dataframe(periods=len(test), freq='H') 
forecast_prophet = prophet_model.predict(future_dates) 
prophet_forecast = forecast_prophet.set_index('ds').loc[test.index]['yhat']


In [None]:
XGBoost Model

train['hour'] = train.index.hour 
train['dayofweek'] = train.index.dayofweek 
train['is_weekend'] = (train.index.dayofweek >= 5).astype(int)

test['hour'] = test.index.hour 

test['dayofweek'] = test.index.dayofweek 
test['is_weekend'] = (test.index.dayofweek >= 5).astype(int)

In [None]:
X_train = train.drop(['Global_active_power'], axis=1) 
y_train = train['Global_active_power'] 
X_test = test.drop(['Global_active_power'], axis=1)

xgb_model = XGBRegressor(n_estimators=100) 
xgb_model.fit(X_train, y_train) 
xgb_forecast = xgb_model.predict(X_test)

In [None]:
Evaluation of each model

mae_arima = mean_absolute_error(test['Global_active_power'], arima_forecast) 
rmse_arima = np.sqrt(mean_squared_error(test['Global_active_power'], arima_forecast)) 
print("ARIMA Model - MAE:", mae_arima) print("ARIMA Model - RMSE:", rmse_arima)

mae_prophet = mean_absolute_error(test['Global_active_power'], prophet_forecast) 
rmse_prophet = np.sqrt(mean_squared_error(test['Global_active_power'], prophet_forecast)) 
print("Prophet Model - MAE:", mae_prophet) print("Prophet Model - RMSE:", rmse_prophet)

mae_xgb = mean_absolute_error(test['Global_active_power'], xgb_forecast) 
rmse_xgb = np.sqrt(mean_squared_error(test['Global_active_power'], xgb_forecast)) 
print("XGBoost Model - MAE:", mae_xgb) print("XGBoost Model - RMSE:", rmse_xgb)

Plotting the results

mp.figure(figsize=(15, 6)) 
mp.plot(test.index, test['Global_active_power'], label='Actual', color='black') 
mp.plot(test.index, arima_forecast, label='ARIMA', linestyle='--') 
mp.plot(test.index, prophet_forecast, label='Prophet', linestyle='--') 
mp.plot(test.index, xgb_forecast, label='XGBoost', linestyle='--') 
mp.title("Actual vs Forecasted Global Active Power") 
mp.xlabel("Datetime") 
mp.ylabel("Power (kilowatts)")
mp.legend() 
mp.grid(True) 
mp.tight_layout() 
mp.show()