In [None]:
# Install the third-party packages this notebook needs into the running kernel's
# environment. `%pip` (rather than a bare `pip`) does not rely on IPython automagic.
%pip install meteostat seaborn

# Data Preprocessing and Feature Engineering

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV


## Loading the Consumption DataFrame


In [None]:
# Load the household power-consumption CSV into `df_cons`, build a single
# `Datetime` column from the separate `Date`/`Time` strings, and drop rows
# that contain missing values.
file_path = "household_power_consumption.csv"
dtypes = {
  'Date': str,
  'Time': str,
  'Global_active_power': float,
  'Global_reactive_power': float,
  'Voltage': float,
  'Global_intensity': float,
  'Sub_metering_1': float,
  'Sub_metering_2': float,
  'Sub_metering_3': float
}

# The file is ';'-separated; the literal strings 'nan' and '?' are read as missing.
# A missing file must stop the cell here: continuing with `df_cons = None` would
# only fail one line later with an unrelated TypeError.
try:
  df_cons = pd.read_csv(file_path, sep=';', dtype=dtypes, na_values=['nan','?'])
except FileNotFoundError as err:
  raise FileNotFoundError(
    f"Error: File not found at {file_path}. Please check the file path."
  ) from err

# Dates are day/month/year, e.g. "16/12/2006 17:24:00".
df_cons['Datetime'] = pd.to_datetime(df_cons['Date'] + ' ' + df_cons['Time'], format='%d/%m/%Y %H:%M:%S')
df_cons.drop(columns=['Date', 'Time'], inplace=True)

# Print the schema and non-null counts before the incomplete rows are removed.
df_cons.info()

df_cons.dropna(inplace=True)

# Cell output: remaining missing values per column (expected to be all zero).
df_cons.isna().sum()

In [None]:
from datetime import datetime
from meteostat import Hourly, Point

# Location (latitude, longitude) and time window for the hourly weather query.
sceaux = Point(48.7943, 2.2715)
start = datetime(2006, 12, 16, 17, 27, 0)
end = datetime(2010, 12, 31, 23, 59)

# Fetch the hourly observations as a DataFrame.
df_weather = Hourly(sceaux, start, end).fetch()

# Expose the frame's index as a regular datetime column named `time`.
df_weather['time'] = pd.to_datetime(df_weather.index)
df_weather.info()

In [None]:
# Derive calendar features from the weather timestamps, then drop the helper
# `time` column they were computed from.
weather_stamp = df_weather['time'].dt
df_weather = df_weather.assign(
    Year=weather_stamp.year,
    Month=weather_stamp.month,
    Day=weather_stamp.day,
    Hour=weather_stamp.hour,
    Day_of_week=weather_stamp.dayofweek,  # 0 = Monday ... 6 = Sunday
).drop(columns=['time'])

# NOTE(review): this is a consumption-side feature living in the weather cell.
df_cons['Minute'] = df_cons['Datetime'].dt.minute

df_weather.isnull().sum()
df_weather.head()

In [None]:
# Outer-join consumption and weather on their timestamps so that no row from
# either side is lost at this stage.
# NOTE(review): `time` is no longer a column of df_weather at this point, so
# the key presumably resolves to an index level named `time` - confirm.
df_both = df_cons.merge(
    df_weather,
    how='outer',
    left_on='Datetime',
    right_on='time',
)
df_both.info()
df_both.head()

In [None]:
# Keep only rows that have a value for the target column.
df_both = df_both.dropna(subset=['Global_active_power'])
df_both.isnull().sum()

In [None]:
# Fill each remaining gap with the last preceding non-null value in row order.
# `DataFrame.ffill` replaces the deprecated `fillna(method='ffill')` spelling.
df_both.ffill(inplace=True)

# Cell output: missing values left per column (leading rows with no earlier
# value to copy from stay NaN).
df_both.isnull().sum()

In [None]:
# Discard four weather columns, then remove any row that still has a gap.
dropped_weather_cols = ['tsun', 'coco', 'wpgt', 'snow']
df_both = df_both.drop(columns=dropped_weather_cols).dropna()
df_both.isna().sum()

In [None]:
# Remove the other electrical measurements; `Global_active_power` is kept.
dropped_power_cols = [
    'Global_reactive_power',
    'Global_intensity',
    'Voltage',
    'Sub_metering_1',
    'Sub_metering_2',
    'Sub_metering_3',
]
df_both = df_both.drop(columns=dropped_power_cols)
df_both.info()

In [None]:
from scipy.stats import zscore

# A row is kept only if every numeric column lies strictly within this many
# standard deviations of that column's mean.
threshold = 3

# Absolute z-score of every numeric column, computed column-wise.
numerical_cols = df_both.select_dtypes(include=np.number).columns
z_scores = np.abs(zscore(df_both[numerical_cols]))

# (row, column) positions of the values beyond the threshold; informational only.
outliers = np.where(z_scores > threshold)

# Remove rows containing at least one outlier. The filter uses `threshold`
# rather than a second hard-coded 3, so changing the constant above actually
# changes which rows are dropped.
df_both = df_both[(z_scores < threshold).all(axis=1)]

df_both.describe()

In [None]:
# Print column dtypes and non-null counts of the current `df_both`.
df_both.info()

In [None]:
# Index the frame by timestamp so it can be resampled.
minute_frame = df_both.set_index('Datetime')
minute_frame.index = pd.to_datetime(minute_frame.index)  # ensure a DatetimeIndex

# Average every column over 60-minute bins, then drop the two columns that are
# not wanted after aggregation.
df_resampled = (
    minute_frame
    .resample('60min')
    .mean()
    .drop(columns=['Minute', 'Year'])
)

# Rows containing NaN are intentionally left in place here (no dropna).
# An alternative aggregation (per-minute energy in Wh summed over 15-minute
# bins) was sketched in commented-out code here and is not used.

df_both = df_resampled

In [None]:
# Count missing values per column in the current `df_both`.
df_both.isna().sum()

In [None]:
# Lag features: the target value from 1, 2 and 3 rows earlier.
# The first rows have no predecessor, so they hold NaN; they are not dropped here.
for lag in (1, 2, 3):
    df_both[f'previous{lag}hr'] = df_both['Global_active_power'].shift(lag)

df_both.info()

In [None]:
# Show the first rows of the current `df_both`.
df_both.head()

In [None]:
# Columns rescaled to the [0, 1] range; the target and the calendar columns
# are left untouched.
needs_scaling = [
    'temp', 'dwpt', 'rhum', 'prcp', 'wdir', 'wspd', 'pres',
    'previous1hr', 'previous2hr', 'previous3hr',
]

# NOTE(review): the scaler is fitted on the whole frame. If a train/test split
# happens downstream, the test rows influence the fitted min/max - confirm
# this is acceptable.
scaler = MinMaxScaler().fit(df_both[needs_scaling])
df_both[needs_scaling] = scaler.transform(df_both[needs_scaling])

df_both.head()

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap.
corr_mat = df_both.corr()

fig, ax = plt.subplots(figsize=(15, 7))
sns.heatmap(corr_mat, annot=True, fmt=".3f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
###########################################
# Saving the scaler and processed dataset #
###########################################

import os
import joblib

# Target directory for the artifacts; an empty string means the current
# working directory.
drive_save_dir = ''

# os.makedirs('') raises FileNotFoundError even with exist_ok=True, so the
# directory is only created when one is actually configured.
if drive_save_dir:
  os.makedirs(drive_save_dir, exist_ok=True)

# Save the fitted scaler so the same scaling can be re-applied later.
scaler_path = os.path.join(drive_save_dir, 'short_term_scaler.pkl')
joblib.dump(scaler, scaler_path)

# Save the processed DataFrame (the index is written as the first CSV column).
df_both_path = os.path.join(drive_save_dir, 'short_term_consumption.csv')
df_both.to_csv(df_both_path)

print(f"Scaler saved to: {scaler_path}")
print(f"DataFrame saved to: {df_both_path}")

