In [None]:
pip install meteostat

# Data Preprocessing and Feature Engineering

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV


## Getting the consumption DataFrame


In [None]:
# Load the raw household power readings and aggregate them into one energy total per day.
file_path = "household_power_consumption.csv"
dtypes = {
  'Date': str,
  'Time': str,
  'Global_active_power': float,
  'Global_reactive_power': float,
  'Voltage': float,
  'Global_intensity': float,
  'Sub_metering_1': float,
  'Sub_metering_2': float,
  'Sub_metering_3': float
}

# Fail fast when the file is missing: continuing with df_cons = None would only
# crash on the next statement with an unrelated "NoneType is not subscriptable" error.
try:
  df_cons = pd.read_csv(file_path, sep=';', dtype=dtypes, na_values=['nan','?'])
except FileNotFoundError as exc:
  raise FileNotFoundError(
    f"Error: File not found at {file_path}. Please check the file path."
  ) from exc

# Only the calendar date is parsed; the 'Time' column is not needed for daily totals.
df_cons['Datetime'] = pd.to_datetime(df_cons['Date'], format='%d/%m/%Y')

df_cons.drop(columns=['Date', 'Time'], inplace=True)

# Rows with any missing measurement ('?' / 'nan' in the file) are discarded.
df_cons.dropna(inplace=True)

# NOTE(review): assumes each row is a one-minute reading of power in kW, so
# power * (1/60) h gives energy in kWh — confirm against the dataset description.
df_cons['Energy_kWh'] = df_cons['Global_active_power'] * (1 / 60)

daily_energy = df_cons.groupby(df_cons['Datetime'].dt.date)['Energy_kWh'].sum()

df_daily_energy = pd.DataFrame({'Date': daily_energy.index, 'Daily_Energy_kWh': daily_energy.values})

# The group keys are datetime.date objects, not 'dd/mm/YYYY' strings, so no
# format string is passed here (the previous format did not describe the data).
df_daily_energy['Date'] = pd.to_datetime(df_daily_energy['Date'])

df_daily_energy

In [None]:
from datetime import datetime
from meteostat import Daily, Hourly, Point, Stations

# Observation window requested from meteostat
start = datetime(2006, 12, 16, 17, 27, 0)
end = datetime(2010, 12, 31, 23, 59)
# Location the weather is requested for
sceaux = Point(48.7943, 2.2715)

# Fetch daily weather records for that point and window
df_weather = Daily(sceaux, start, end).fetch()
# Expose the frame's index as a regular datetime column named 'time'
df_weather['time'] = pd.to_datetime(df_weather.index)
df_weather.info()
df_weather

In [None]:
# Derive calendar features from the 'time' column, then discard that column.
# Day_of_week: 0 is Monday, 6 is Sunday.
calendar_parts = {
  'Year': 'year',
  'Month': 'month',
  'Day': 'day',
  'Day_of_week': 'dayofweek',
}
for feature_name, dt_attribute in calendar_parts.items():
  df_weather[feature_name] = getattr(df_weather['time'].dt, dt_attribute)
df_weather.drop(columns=['time'], inplace=True)
df_weather.isna().sum()
df_weather

In [None]:
import pandas as pd
# Join daily energy with daily weather on the calendar day.
# NOTE(review): the 'time' column of df_weather was dropped earlier, so this key
# presumably resolves to an index level named 'time' — confirm the index name.
df_merged = df_daily_energy.merge(df_weather, left_on='Date', right_on='time')
df_merged


In [None]:
# Remove rows that have no value for the target, then count remaining gaps per column.
target_column = 'Daily_Energy_kWh'
df_merged.dropna(subset=[target_column], inplace=True)
df_merged.isna().sum()

In [None]:
# Remove three weather columns outright, then drop any row that still contains a NaN.
discarded_weather_columns = ['snow', 'wpgt', 'tsun']
df_merged.drop(columns=discarded_weather_columns, inplace=True)
df_merged.dropna(inplace=True)
# Per-column count of remaining missing values
df_merged.isnull().sum()

In [None]:
from scipy.stats import zscore

# Absolute z-score of every numeric column (computed column-wise).
numerical_cols = df_merged.select_dtypes(include=np.number).columns
z_scores = np.abs(zscore(df_merged[numerical_cols]))

# Single source of truth for the outlier cut-off; used by both statements below.
threshold = 3

# Positions (row, column) of values flagged as outliers
outliers = np.where(z_scores > threshold)

# Keep only rows whose every numeric value is below the threshold.
# .copy() makes the result an independent frame so later column assignments
# do not operate on (and warn about) a slice of the previous frame.
df_merged = df_merged[(z_scores < threshold).all(axis=1)].copy()

df_merged.describe()

In [None]:
# Print the column dtypes and non-null counts of the merged frame.
df_merged.info()

In [None]:
# Per-column count of missing values in the merged frame
df_merged.isnull().sum()

In [None]:
# The interactive table widget only exists on Google Colab; fall back to the
# regular DataFrame display everywhere else instead of failing the cell.
try:
  from google.colab import data_table
except ImportError:
  data_table = None

# Lagged consumption features: energy used exactly 1..7 calendar days earlier.
# Rows were removed upstream (missing days, NaN weather, outliers), so a positional
# shift(k) could pull a value from a non-adjacent date. Looking the value up by
# date keeps every lag aligned to the calendar; a lag is NaN when that earlier
# day is absent from the frame. Assumes 'Date' values are unique.
energy_by_date = df_merged.set_index('Date')['Daily_Energy_kWh']
for lag in range(1, 8):
  lag_column = 'previousDay' if lag == 1 else f'previous{lag}Day'
  df_merged[lag_column] = (df_merged['Date'] - pd.Timedelta(days=lag)).map(energy_by_date)
df_merged.info()

data_table.DataTable(df_merged) if data_table is not None else df_merged

In [None]:
# Preview the first five rows of the merged frame
df_merged.head(5)

In [None]:
# Columns rescaled to [0, 1]: the weather measurements plus the seven lag features.
weather_features = ['tavg', 'tmin', 'tmax', 'prcp', 'wdir', 'wspd', 'pres']
lag_features = ['previousDay'] + [f'previous{k}Day' for k in range(2, 8)]
needs_scaling = weather_features + lag_features

# Fit the scaler on these columns and overwrite them with the scaled values.
# NOTE(review): the scaler is fitted on the whole frame and the columns are
# overwritten in place, so re-running this cell refits on already-scaled data.
scaler = MinMaxScaler()
scaler.fit(df_merged[needs_scaling])
df_merged[needs_scaling] = scaler.transform(df_merged[needs_scaling])
df_merged.head()

In [None]:
# Pairwise correlation of the merged frame's columns, drawn as an annotated heatmap.
corr_mat = df_merged.corr()
plt.figure(figsize=(15, 7))
heatmap_axes = sns.heatmap(data=corr_mat, annot=True, fmt=".3f", cmap='coolwarm')
heatmap_axes.set_title('Correlation Heatmap')
plt.show()

In [None]:
###########################################
# Saving the scaler and processed dataset #
###########################################

import os
import joblib

# Directory the artifacts are written to; an empty string means the current
# working directory (os.path.join('', name) is just name).
drive_save_dir = ''

# os.makedirs('') raises FileNotFoundError, so only create the directory when
# one has actually been configured.
if drive_save_dir:
  os.makedirs(drive_save_dir, exist_ok=True)

# Save the fitted scaler
scaler_path = os.path.join(drive_save_dir, 'long_term_scaler.pkl')
joblib.dump(scaler, scaler_path)

# Save the processed DataFrame (the index is written as the first CSV column)
df_both_path = os.path.join(drive_save_dir, 'long_term_consumption.csv')
df_merged.to_csv(df_both_path)

print(f"Scaler saved to: {scaler_path}")
print(f"DataFrame saved to: {df_both_path}")

