In [None]:
import sys
import os
import pandas as pd

# Preprocessing Notebook
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(project_root)

from src.preprocess import encode_categorical, cap_outliers_iqr, extract_datetime_features, add_face_of_day, variance_threshold_selector, f_value_selector
from src.utils import load_appliance_energy_data

## Step 1: Load and inspect data

In [2]:
df = load_appliance_energy_data()
df.head()

Unnamed: 0,date,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2,Appliances
0,2016-01-11 17:00:00,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,...,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433,60
1,2016-01-11 17:10:00,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,...,45.56,6.48,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195,60
2,2016-01-11 17:20:00,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,...,45.5,6.37,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668,50
3,2016-01-11 17:30:00,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,...,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.41039,45.41039,50
4,2016-01-11 17:40:00,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,...,45.4,6.13,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097,60


## Step 2:  Separate Type of Features 

In [3]:
# Extract datetime features from the 'date' column
df = extract_datetime_features(df,column='date')

In [4]:
df.head()

Unnamed: 0,date,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,Tdewpoint,rv1,rv2,Appliances,hour,day,day_of_week,day_name,month,month_name
0,2016-01-11 17:00:00,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,...,5.3,13.275433,13.275433,60,17,11,0,Monday,1,January
1,2016-01-11 17:10:00,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,...,5.2,18.606195,18.606195,60,17,11,0,Monday,1,January
2,2016-01-11 17:20:00,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,...,5.1,28.642668,28.642668,50,17,11,0,Monday,1,January
3,2016-01-11 17:30:00,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,...,5.0,45.41039,45.41039,50,17,11,0,Monday,1,January
4,2016-01-11 17:40:00,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,...,4.9,10.084097,10.084097,60,17,11,0,Monday,1,January


In [5]:
# Apply it to create new feature
df = add_face_of_day(df)

In [6]:
df.head()

Unnamed: 0,date,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,rv1,rv2,Appliances,hour,day,day_of_week,day_name,month,month_name,face_of_day
0,2016-01-11 17:00:00,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,...,13.275433,13.275433,60,17,11,0,Monday,1,January,Afternoon
1,2016-01-11 17:10:00,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,...,18.606195,18.606195,60,17,11,0,Monday,1,January,Afternoon
2,2016-01-11 17:20:00,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,...,28.642668,28.642668,50,17,11,0,Monday,1,January,Afternoon
3,2016-01-11 17:30:00,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,...,45.41039,45.41039,50,17,11,0,Monday,1,January,Afternoon
4,2016-01-11 17:40:00,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,...,10.084097,10.084097,60,17,11,0,Monday,1,January,Afternoon


In [7]:
# encode categorical variables
df = encode_categorical(df)

In [8]:
df.head()

Unnamed: 0,date,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,day_name_Thursday,day_name_Tuesday,day_name_Wednesday,month_name_February,month_name_January,month_name_March,month_name_May,face_of_day_Evening,face_of_day_Morning,face_of_day_Night
0,1452531600000000000,30,19,47,19,44,19,44,19,45,...,0,0,0,0,1,0,0,0,0,0
1,1452532200000000000,30,19,46,19,44,19,44,19,45,...,0,0,0,0,1,0,0,0,0,0
2,1452532800000000000,30,19,46,19,44,19,44,18,45,...,0,0,0,0,1,0,0,0,0,0
3,1452533400000000000,40,19,46,19,44,19,45,18,45,...,0,0,0,0,1,0,0,0,0,0
4,1452534000000000000,40,19,46,19,44,19,45,18,45,...,0,0,0,0,1,0,0,0,0,0


In [9]:
# Cap outliers using IQR method for numerical columns
for col in df.select_dtypes(include='number').columns:
    if col != "Appliances":  # Don't modify your target
        df = cap_outliers_iqr(df, col)

In [10]:
df.head()

Unnamed: 0,date,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,day_name_Thursday,day_name_Tuesday,day_name_Wednesday,month_name_February,month_name_January,month_name_March,month_name_May,face_of_day_Evening,face_of_day_Morning,face_of_day_Night
0,1.452532e+18,0.0,19.0,47.0,19.0,44.0,19.0,44.0,19.0,45.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.452532e+18,0.0,19.0,46.0,19.0,44.0,19.0,44.0,19.0,45.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.452533e+18,0.0,19.0,46.0,19.0,44.0,19.0,44.0,18.0,45.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.452533e+18,0.0,19.0,46.0,19.0,44.0,19.0,45.0,18.0,45.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.452534e+18,0.0,19.0,46.0,19.0,44.0,19.0,45.0,18.0,45.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Separate features and target
X = df.drop(columns=["Appliances", "date"])
y = df["Appliances"]

In [12]:
# Save to CSV
X.to_csv("data//X_processed.csv", index=False)
y.to_csv("data/y_processed.csv", index=False)