In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import fastparquet
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
# Load the intermediate dataset from parquet into the working frame `df`
df = pd.read_parquet("intermediate_set.parquet")

In [3]:
# Report the frame's dimensions, then render the first rows with rich display
print(f"Shape: {df.shape}")
print()
print("Preview:")
display(df.head())

Shape: (918136, 24)

Preview:


Unnamed: 0,date,building_id,daily_consumption,site_id,sqft,sqm,timezone,sub_primaryspaceusage,primaryspaceusage,lat,...,seaLvlPressure,windDirection,windSpeed,is_outlier,month,year,day_of_week,is_weekend,log_daily_consumption,log_sqm
0,2016-01-01,Robin_public_Carolina,2183.75,Robin,118231,10984.0,Europe/London,Library,Public services,51.51879,...,1016.941667,116.666667,4.470833,False,1,2016,4,False,7.689257,9.304286
1,2016-01-02,Robin_public_Carolina,2205.563,Robin,118231,10984.0,Europe/London,Library,Public services,51.51879,...,998.0875,166.666667,7.241667,False,1,2016,5,True,7.699191,9.304286
2,2016-01-03,Robin_public_Carolina,2108.75,Robin,118231,10984.0,Europe/London,Library,Public services,51.51879,...,991.8375,187.916667,5.633333,False,1,2016,6,True,7.654325,9.304286
3,2016-01-04,Robin_public_Carolina,3536.562,Robin,118231,10984.0,Europe/London,Library,Public services,51.51879,...,982.766667,177.083333,4.145833,False,1,2016,0,False,8.171193,9.304286
4,2016-01-05,Robin_public_Carolina,3771.376,Robin,118231,10984.0,Europe/London,Library,Public services,51.51879,...,984.7125,175.416667,2.75,False,1,2016,1,False,8.23546,9.304286


In [4]:
# Column dtypes of the loaded frame, printed under a header line
print("\n--- Data Types ---", df.dtypes, sep="\n")


--- Data Types ---
date                     datetime64[ns]
building_id                      object
daily_consumption               float64
site_id                          object
sqft                              int64
sqm                             float64
timezone                         object
sub_primaryspaceusage            object
primaryspaceusage                object
lat                             float64
lng                             float64
has_location                      int64
airTemperature                  float64
dewTemperature                  float64
seaLvlPressure                  float64
windDirection                   float64
windSpeed                       float64
is_outlier                         bool
month                             int32
year                              int32
day_of_week                       int32
is_weekend                         bool
log_daily_consumption           float64
log_sqm                         float64
dtype: object


In [7]:
# Cast the listed label columns to the category dtype; names that are not
# present in the frame are skipped.
categorical_cols = ["building_id", "primaryspaceusage", "timezone"]
df = df.astype(
    {name: "category" for name in categorical_cols if name in df.columns}
)

In [8]:
# Remove redundant columns; errors="ignore" already skips names that are absent
drop_cols = ["sub_primaryspaceusage"]
df = df.drop(columns=drop_cols, errors="ignore")

In [9]:
df.head()

Unnamed: 0,date,building_id,daily_consumption,site_id,sqft,sqm,timezone,primaryspaceusage,lat,lng,...,seaLvlPressure,windDirection,windSpeed,is_outlier,month,year,day_of_week,is_weekend,log_daily_consumption,log_sqm
0,2016-01-01,Robin_public_Carolina,2183.75,Robin,118231,10984.0,Europe/London,Public services,51.51879,-0.134556,...,1016.941667,116.666667,4.470833,False,1,2016,4,False,7.689257,9.304286
1,2016-01-02,Robin_public_Carolina,2205.563,Robin,118231,10984.0,Europe/London,Public services,51.51879,-0.134556,...,998.0875,166.666667,7.241667,False,1,2016,5,True,7.699191,9.304286
2,2016-01-03,Robin_public_Carolina,2108.75,Robin,118231,10984.0,Europe/London,Public services,51.51879,-0.134556,...,991.8375,187.916667,5.633333,False,1,2016,6,True,7.654325,9.304286
3,2016-01-04,Robin_public_Carolina,3536.562,Robin,118231,10984.0,Europe/London,Public services,51.51879,-0.134556,...,982.766667,177.083333,4.145833,False,1,2016,0,False,8.171193,9.304286
4,2016-01-05,Robin_public_Carolina,3771.376,Robin,118231,10984.0,Europe/London,Public services,51.51879,-0.134556,...,984.7125,175.416667,2.75,False,1,2016,1,False,8.23546,9.304286


In [10]:
# Count rows that are exact duplicates of an earlier row
is_duplicate_row = df.duplicated()
duplicate_count = is_duplicate_row.sum()
print("\nDuplicate rows found: " + str(duplicate_count))


Duplicate rows found: 0


In [11]:
# Share of missing values per column (in percent); only columns that actually
# have gaps are printed, largest share first.
missing = df.isna().mean().mul(100)
print("\n--- Missing Values (% per column) ---")
print(missing.loc[missing.gt(0)].sort_values(ascending=False))


--- Missing Values (% per column) ---
lat               12.977707
lng               12.977707
airTemperature     0.014159
dewTemperature     0.014159
seaLvlPressure     0.014159
windDirection      0.014159
windSpeed          0.014159
dtype: float64


In [12]:
# Keep only rows where the date, the building id and the target are all present
required_cols = ["date", "building_id", "daily_consumption"]
df = df.dropna(subset=required_cols, how="any")

In [15]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918136 entries, 0 to 918135
Data columns (total 23 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   date                   918136 non-null  datetime64[ns]
 1   building_id            918136 non-null  category      
 2   daily_consumption      918136 non-null  float64       
 3   site_id                918136 non-null  object        
 4   sqft                   918136 non-null  int64         
 5   sqm                    918136 non-null  float64       
 6   timezone               918136 non-null  category      
 7   primaryspaceusage      918136 non-null  category      
 8   lat                    798983 non-null  float64       
 9   lng                    798983 non-null  float64       
 10  has_location           918136 non-null  int64         
 11  airTemperature         918006 non-null  float64       
 12  dewTemperature         918006 non-null  floa

In [16]:
# Persist the cleaned base dataset (row index is not written)
cleaned_base_path = "building_energy_cleaned_base.parquet"
df.to_parquet(cleaned_base_path, index=False)
print(f"\n Saved cleaned base dataset to '{cleaned_base_path}'")


 Saved cleaned base dataset to 'building_energy_cleaned_base.parquet'


In [18]:
df.shape

(918136, 23)

In [20]:
df.head()

Unnamed: 0,date,building_id,daily_consumption,site_id,sqft,sqm,timezone,primaryspaceusage,lat,lng,has_location,airTemperature,dewTemperature,seaLvlPressure,windDirection,windSpeed,is_outlier,log_daily_consumption,log_sqm
0,2016-01-01,Robin_public_Carolina,2183.75,Robin,118231,10984.0,Europe/London,Public services,51.51879,-0.134556,1,5.3875,3.879167,1016.941667,116.666667,4.470833,False,7.689257,9.304286
1,2016-01-02,Robin_public_Carolina,2205.563,Robin,118231,10984.0,Europe/London,Public services,51.51879,-0.134556,1,9.783333,9.183333,998.0875,166.666667,7.241667,False,7.699191,9.304286
2,2016-01-03,Robin_public_Carolina,2108.75,Robin,118231,10984.0,Europe/London,Public services,51.51879,-0.134556,1,7.954167,6.6875,991.8375,187.916667,5.633333,False,7.654325,9.304286
3,2016-01-04,Robin_public_Carolina,3536.562,Robin,118231,10984.0,Europe/London,Public services,51.51879,-0.134556,1,7.841667,6.508333,982.766667,177.083333,4.145833,False,8.171193,9.304286
4,2016-01-05,Robin_public_Carolina,3771.376,Robin,118231,10984.0,Europe/London,Public services,51.51879,-0.134556,1,7.904167,7.058333,984.7125,175.416667,2.75,False,8.23546,9.304286


In [22]:
# Derive calendar features from the date column
date_parts = df["date"].dt
df["year"] = date_parts.year
df["month"] = date_parts.month
df["dayofweek"] = date_parts.dayofweek  # 0 = Monday ... 6 = Sunday
df["dayofyear"] = date_parts.dayofyear
# Weekend indicator stored as 0/1: Saturday (5) or Sunday (6)
df["is_weekend"] = df["dayofweek"].isin([5, 6]).astype(int)

In [23]:
df.head()

Unnamed: 0,date,building_id,daily_consumption,site_id,sqft,sqm,timezone,primaryspaceusage,lat,lng,...,windDirection,windSpeed,is_outlier,log_daily_consumption,log_sqm,year,month,dayofweek,dayofyear,is_weekend
0,2016-01-01,Robin_public_Carolina,2183.75,Robin,118231,10984.0,Europe/London,Public services,51.51879,-0.134556,...,116.666667,4.470833,False,7.689257,9.304286,2016,1,4,1,0
1,2016-01-02,Robin_public_Carolina,2205.563,Robin,118231,10984.0,Europe/London,Public services,51.51879,-0.134556,...,166.666667,7.241667,False,7.699191,9.304286,2016,1,5,2,1
2,2016-01-03,Robin_public_Carolina,2108.75,Robin,118231,10984.0,Europe/London,Public services,51.51879,-0.134556,...,187.916667,5.633333,False,7.654325,9.304286,2016,1,6,3,1
3,2016-01-04,Robin_public_Carolina,3536.562,Robin,118231,10984.0,Europe/London,Public services,51.51879,-0.134556,...,177.083333,4.145833,False,8.171193,9.304286,2016,1,0,4,0
4,2016-01-05,Robin_public_Carolina,3771.376,Robin,118231,10984.0,Europe/London,Public services,51.51879,-0.134556,...,175.416667,2.75,False,8.23546,9.304286,2016,1,1,5,0


In [24]:
# Cyclical (sin/cos) encodings for month and day of week, so that values at the
# two ends of a cycle (e.g. month 12 and month 1) end up close to each other.
month_angle = 2 * np.pi * df["month"] / 12
dow_angle = 2 * np.pi * df["dayofweek"] / 7

df["month_sin"] = np.sin(month_angle)
df["month_cos"] = np.cos(month_angle)

df["dow_sin"] = np.sin(dow_angle)
df["dow_cos"] = np.cos(dow_angle)

In [25]:
# Spot-check the first rows of the newly added calendar columns
preview_cols = ["date", "year", "month", "dayofweek", "is_weekend", "month_sin", "month_cos"]
print(df.head()[preview_cols])

        date  year  month  dayofweek  is_weekend  month_sin  month_cos
0 2016-01-01  2016      1          4           0        0.5   0.866025
1 2016-01-02  2016      1          5           1        0.5   0.866025
2 2016-01-03  2016      1          6           1        0.5   0.866025
3 2016-01-04  2016      1          0           0        0.5   0.866025
4 2016-01-05  2016      1          1           0        0.5   0.866025


In [26]:
# Sanity check: list the distinct values of the new calendar features
print("\nFeature Ranges:")
for label, column in (
    ("Month:", "month"),
    ("Day of Week:", "dayofweek"),
    ("Weekend flag:", "is_weekend"),
):
    print(label, df[column].unique())


Feature Ranges:
Month: [ 1  2  3  4  5  6  7  8  9 10 11 12]
Day of Week: [4 5 6 0 1 2 3]
Weekend flag: [0 1]


In [27]:
# Persist the frame including the calendar and cyclical features
timefeatures_path = "building_energy_with_timefeatures.parquet"
df.to_parquet(timefeatures_path, index=False)
print(f"Saved dataset with time-based features to '{timefeatures_path}'")

Saved dataset with time-based features to 'building_energy_with_timefeatures.parquet'


In [28]:
df.shape

(918136, 28)

In [2]:
# Reload the dataset that was saved with the time-based features
df = pd.read_parquet("building_energy_with_timefeatures.parquet")

In [6]:
df.head()

Unnamed: 0,date,building_id,daily_consumption,site_id,sqft,sqm,timezone,primaryspaceusage,lat,lng,has_location,airTemperature,dewTemperature,seaLvlPressure,windDirection,windSpeed,is_outlier,log_daily_consumption,log_sqm,year,month,dayofweek,dayofyear,is_weekend,month_sin,month_cos,dow_sin,dow_cos
0,2016-01-01,Robin_public_Carolina,2183.75,Robin,118231,10984.0,Europe/London,Public services,51.51879,-0.134556,1,5.3875,3.879167,1016.941667,116.666667,4.470833,False,7.689257,9.304286,2016,1,4,1,0,0.5,0.866025,-0.433884,-0.900969
1,2016-01-02,Robin_public_Carolina,2205.563,Robin,118231,10984.0,Europe/London,Public services,51.51879,-0.134556,1,9.783333,9.183333,998.0875,166.666667,7.241667,False,7.699191,9.304286,2016,1,5,2,1,0.5,0.866025,-0.974928,-0.222521
2,2016-01-03,Robin_public_Carolina,2108.75,Robin,118231,10984.0,Europe/London,Public services,51.51879,-0.134556,1,7.954167,6.6875,991.8375,187.916667,5.633333,False,7.654325,9.304286,2016,1,6,3,1,0.5,0.866025,-0.781831,0.62349
3,2016-01-04,Robin_public_Carolina,3536.562,Robin,118231,10984.0,Europe/London,Public services,51.51879,-0.134556,1,7.841667,6.508333,982.766667,177.083333,4.145833,False,8.171193,9.304286,2016,1,0,4,0,0.5,0.866025,0.0,1.0
4,2016-01-05,Robin_public_Carolina,3771.376,Robin,118231,10984.0,Europe/London,Public services,51.51879,-0.134556,1,7.904167,7.058333,984.7125,175.416667,2.75,False,8.23546,9.304286,2016,1,1,5,0,0.5,0.866025,0.781831,0.62349


In [5]:
# Show every column when a DataFrame is rendered (no "..." truncation)
pd.set_option('display.max_columns', None)

In [7]:
# Discard the precomputed log_sqm and the row-level is_outlier flag
# (log_sqm is recomputed from sqm in a later cell)
df = df.drop(columns=["log_sqm", "is_outlier"])

In [8]:
df.head()

Unnamed: 0,date,building_id,daily_consumption,site_id,sqft,sqm,timezone,primaryspaceusage,lat,lng,has_location,airTemperature,dewTemperature,seaLvlPressure,windDirection,windSpeed,log_daily_consumption,year,month,dayofweek,dayofyear,is_weekend,month_sin,month_cos,dow_sin,dow_cos
0,2016-01-01,Robin_public_Carolina,2183.75,Robin,118231,10984.0,Europe/London,Public services,51.51879,-0.134556,1,5.3875,3.879167,1016.941667,116.666667,4.470833,7.689257,2016,1,4,1,0,0.5,0.866025,-0.433884,-0.900969
1,2016-01-02,Robin_public_Carolina,2205.563,Robin,118231,10984.0,Europe/London,Public services,51.51879,-0.134556,1,9.783333,9.183333,998.0875,166.666667,7.241667,7.699191,2016,1,5,2,1,0.5,0.866025,-0.974928,-0.222521
2,2016-01-03,Robin_public_Carolina,2108.75,Robin,118231,10984.0,Europe/London,Public services,51.51879,-0.134556,1,7.954167,6.6875,991.8375,187.916667,5.633333,7.654325,2016,1,6,3,1,0.5,0.866025,-0.781831,0.62349
3,2016-01-04,Robin_public_Carolina,3536.562,Robin,118231,10984.0,Europe/London,Public services,51.51879,-0.134556,1,7.841667,6.508333,982.766667,177.083333,4.145833,8.171193,2016,1,0,4,0,0.5,0.866025,0.0,1.0
4,2016-01-05,Robin_public_Carolina,3771.376,Robin,118231,10984.0,Europe/London,Public services,51.51879,-0.134556,1,7.904167,7.058333,984.7125,175.416667,2.75,8.23546,2016,1,1,5,0,0.5,0.866025,0.781831,0.62349


In [9]:
# Size feature: log(1 + sqm), which stays finite when sqm is 0
df["log_sqm"] = np.log1p(df["sqm"])
print("\nAdded log-transformed building size.")

# Energy-intensity features: consumption normalised by floor area.
# A zero area is treated as missing so the division yields NaN; those NaNs are
# replaced by 0 before the log transform.
# NOTE(review): both columns are computed from daily_consumption — if that is
# the prediction target they must not be used as model inputs (confirm downstream).
floor_area = df["sqm"].replace(0, np.nan)
df["consumption_per_sqm"] = df["daily_consumption"].div(floor_area)
intensity_filled = df["consumption_per_sqm"].fillna(0)
df["log_consumption_per_sqm"] = np.log1p(intensity_filled)


Added log-transformed building size.


In [10]:
# One-hot encode the categorical columns; the first level of each is dropped
categorical_cols = ["primaryspaceusage", "timezone"]

df = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)
print("\nEncoded categorical variables:", categorical_cols)


Encoded categorical variables: ['primaryspaceusage', 'timezone']


In [11]:
df.head()

Unnamed: 0,date,building_id,daily_consumption,site_id,sqft,sqm,lat,lng,has_location,airTemperature,dewTemperature,seaLvlPressure,windDirection,windSpeed,log_daily_consumption,year,month,dayofweek,dayofyear,is_weekend,month_sin,month_cos,dow_sin,dow_cos,log_sqm,consumption_per_sqm,log_consumption_per_sqm,primaryspaceusage_Entertainment/public assembly,primaryspaceusage_Food sales and service,primaryspaceusage_Healthcare,primaryspaceusage_Lodging/residential,primaryspaceusage_Manufacturing/industrial,primaryspaceusage_Office,primaryspaceusage_Other,primaryspaceusage_Parking,primaryspaceusage_Public services,primaryspaceusage_Religious worship,primaryspaceusage_Retail,primaryspaceusage_Services,primaryspaceusage_Technology/science,primaryspaceusage_Utility,primaryspaceusage_Warehouse/storage,timezone_Europe/London,timezone_US/Central,timezone_US/Eastern,timezone_US/Mountain,timezone_US/Pacific
0,2016-01-01,Robin_public_Carolina,2183.75,Robin,118231,10984.0,51.51879,-0.134556,1,5.3875,3.879167,1016.941667,116.666667,4.470833,7.689257,2016,1,4,1,0,0.5,0.866025,-0.433884,-0.900969,9.304286,0.198812,0.181331,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False
1,2016-01-02,Robin_public_Carolina,2205.563,Robin,118231,10984.0,51.51879,-0.134556,1,9.783333,9.183333,998.0875,166.666667,7.241667,7.699191,2016,1,5,2,1,0.5,0.866025,-0.974928,-0.222521,9.304286,0.200798,0.182986,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False
2,2016-01-03,Robin_public_Carolina,2108.75,Robin,118231,10984.0,51.51879,-0.134556,1,7.954167,6.6875,991.8375,187.916667,5.633333,7.654325,2016,1,6,3,1,0.5,0.866025,-0.781831,0.62349,9.304286,0.191984,0.175619,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False
3,2016-01-04,Robin_public_Carolina,3536.562,Robin,118231,10984.0,51.51879,-0.134556,1,7.841667,6.508333,982.766667,177.083333,4.145833,8.171193,2016,1,0,4,0,0.5,0.866025,0.0,1.0,9.304286,0.321974,0.279126,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False
4,2016-01-05,Robin_public_Carolina,3771.376,Robin,118231,10984.0,51.51879,-0.134556,1,7.904167,7.058333,984.7125,175.416667,2.75,8.23546,2016,1,1,5,0,0.5,0.866025,0.781831,0.62349,9.304286,0.343352,0.295168,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False


In [12]:
# Building ids treated as known high-consumption outliers (one id per line)
outlier_buildings = [
    "Hog_education_Janell",
    "Fox_education_Willis",
    "Rat_office_Colby",
    "Eagle_education_Peter",
    "Bear_education_Wilton",
    "Gator_assembly_Kimberly",
    "Hog_office_Nia",
    "Rat_health_Guy",
    "Bull_education_Dottie",
    "Bear_education_Bulah",
    "Hog_office_Cornell",
    "Moose_education_Abbie",
    "Moose_education_Gladys",
]

In [13]:
# 1 for rows whose building_id is in the outlier list, 0 otherwise
outlier_mask = df["building_id"].isin(outlier_buildings)
df["is_outlier_building"] = outlier_mask.astype(int)
print("Flagged outlier buildings (binary indicator).")

Flagged outlier buildings (binary indicator).


In [14]:
df.head()

Unnamed: 0,date,building_id,daily_consumption,site_id,sqft,sqm,lat,lng,has_location,airTemperature,dewTemperature,seaLvlPressure,windDirection,windSpeed,log_daily_consumption,year,month,dayofweek,dayofyear,is_weekend,month_sin,month_cos,dow_sin,dow_cos,log_sqm,consumption_per_sqm,log_consumption_per_sqm,primaryspaceusage_Entertainment/public assembly,primaryspaceusage_Food sales and service,primaryspaceusage_Healthcare,primaryspaceusage_Lodging/residential,primaryspaceusage_Manufacturing/industrial,primaryspaceusage_Office,primaryspaceusage_Other,primaryspaceusage_Parking,primaryspaceusage_Public services,primaryspaceusage_Religious worship,primaryspaceusage_Retail,primaryspaceusage_Services,primaryspaceusage_Technology/science,primaryspaceusage_Utility,primaryspaceusage_Warehouse/storage,timezone_Europe/London,timezone_US/Central,timezone_US/Eastern,timezone_US/Mountain,timezone_US/Pacific,is_outlier_building
0,2016-01-01,Robin_public_Carolina,2183.75,Robin,118231,10984.0,51.51879,-0.134556,1,5.3875,3.879167,1016.941667,116.666667,4.470833,7.689257,2016,1,4,1,0,0.5,0.866025,-0.433884,-0.900969,9.304286,0.198812,0.181331,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,0
1,2016-01-02,Robin_public_Carolina,2205.563,Robin,118231,10984.0,51.51879,-0.134556,1,9.783333,9.183333,998.0875,166.666667,7.241667,7.699191,2016,1,5,2,1,0.5,0.866025,-0.974928,-0.222521,9.304286,0.200798,0.182986,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,0
2,2016-01-03,Robin_public_Carolina,2108.75,Robin,118231,10984.0,51.51879,-0.134556,1,7.954167,6.6875,991.8375,187.916667,5.633333,7.654325,2016,1,6,3,1,0.5,0.866025,-0.781831,0.62349,9.304286,0.191984,0.175619,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,0
3,2016-01-04,Robin_public_Carolina,3536.562,Robin,118231,10984.0,51.51879,-0.134556,1,7.841667,6.508333,982.766667,177.083333,4.145833,8.171193,2016,1,0,4,0,0.5,0.866025,0.0,1.0,9.304286,0.321974,0.279126,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,0
4,2016-01-05,Robin_public_Carolina,3771.376,Robin,118231,10984.0,51.51879,-0.134556,1,7.904167,7.058333,984.7125,175.416667,2.75,8.23546,2016,1,1,5,0,0.5,0.866025,0.781831,0.62349,9.304286,0.343352,0.295168,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,0


In [15]:
# Spot-check the building-level features on the first rows
print(" Building feature engineering complete!")
verify_cols = ["building_id", "sqm", "log_sqm", "consumption_per_sqm", "is_outlier_building"]
print(df.head()[verify_cols])

 Building feature engineering complete!
             building_id      sqm   log_sqm  consumption_per_sqm  \
0  Robin_public_Carolina  10984.0  9.304286             0.198812   
1  Robin_public_Carolina  10984.0  9.304286             0.200798   
2  Robin_public_Carolina  10984.0  9.304286             0.191984   
3  Robin_public_Carolina  10984.0  9.304286             0.321974   
4  Robin_public_Carolina  10984.0  9.304286             0.343352   

   is_outlier_building  
0                    0  
1                    0  
2                    0  
3                    0  
4                    0  


In [16]:
df.shape

(918136, 48)

In [19]:
df.columns[-21:]

Index(['primaryspaceusage_Entertainment/public assembly',
       'primaryspaceusage_Food sales and service',
       'primaryspaceusage_Healthcare', 'primaryspaceusage_Lodging/residential',
       'primaryspaceusage_Manufacturing/industrial',
       'primaryspaceusage_Office', 'primaryspaceusage_Other',
       'primaryspaceusage_Parking', 'primaryspaceusage_Public services',
       'primaryspaceusage_Religious worship', 'primaryspaceusage_Retail',
       'primaryspaceusage_Services', 'primaryspaceusage_Technology/science',
       'primaryspaceusage_Utility', 'primaryspaceusage_Warehouse/storage',
       'timezone_Europe/London', 'timezone_US/Central', 'timezone_US/Eastern',
       'timezone_US/Mountain', 'timezone_US/Pacific', 'is_outlier_building'],
      dtype='object')

In [20]:
# Cast every bool column to int8 so that True/False become 1/0
bool_cols = df.select_dtypes(include=['bool']).columns
df = df.astype({name: 'int8' for name in bool_cols})

In [21]:
df.head()

Unnamed: 0,date,building_id,daily_consumption,site_id,sqft,sqm,lat,lng,has_location,airTemperature,dewTemperature,seaLvlPressure,windDirection,windSpeed,log_daily_consumption,year,month,dayofweek,dayofyear,is_weekend,month_sin,month_cos,dow_sin,dow_cos,log_sqm,consumption_per_sqm,log_consumption_per_sqm,primaryspaceusage_Entertainment/public assembly,primaryspaceusage_Food sales and service,primaryspaceusage_Healthcare,primaryspaceusage_Lodging/residential,primaryspaceusage_Manufacturing/industrial,primaryspaceusage_Office,primaryspaceusage_Other,primaryspaceusage_Parking,primaryspaceusage_Public services,primaryspaceusage_Religious worship,primaryspaceusage_Retail,primaryspaceusage_Services,primaryspaceusage_Technology/science,primaryspaceusage_Utility,primaryspaceusage_Warehouse/storage,timezone_Europe/London,timezone_US/Central,timezone_US/Eastern,timezone_US/Mountain,timezone_US/Pacific,is_outlier_building
0,2016-01-01,Robin_public_Carolina,2183.75,Robin,118231,10984.0,51.51879,-0.134556,1,5.3875,3.879167,1016.941667,116.666667,4.470833,7.689257,2016,1,4,1,0,0.5,0.866025,-0.433884,-0.900969,9.304286,0.198812,0.181331,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
1,2016-01-02,Robin_public_Carolina,2205.563,Robin,118231,10984.0,51.51879,-0.134556,1,9.783333,9.183333,998.0875,166.666667,7.241667,7.699191,2016,1,5,2,1,0.5,0.866025,-0.974928,-0.222521,9.304286,0.200798,0.182986,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
2,2016-01-03,Robin_public_Carolina,2108.75,Robin,118231,10984.0,51.51879,-0.134556,1,7.954167,6.6875,991.8375,187.916667,5.633333,7.654325,2016,1,6,3,1,0.5,0.866025,-0.781831,0.62349,9.304286,0.191984,0.175619,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
3,2016-01-04,Robin_public_Carolina,3536.562,Robin,118231,10984.0,51.51879,-0.134556,1,7.841667,6.508333,982.766667,177.083333,4.145833,8.171193,2016,1,0,4,0,0.5,0.866025,0.0,1.0,9.304286,0.321974,0.279126,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
4,2016-01-05,Robin_public_Carolina,3771.376,Robin,118231,10984.0,51.51879,-0.134556,1,7.904167,7.058333,984.7125,175.416667,2.75,8.23546,2016,1,1,5,0,0.5,0.866025,0.781831,0.62349,9.304286,0.343352,0.295168,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0


In [25]:
# Remove sqft if it is present (the frame also carries the area as sqm)
df = df.drop(columns="sqft", errors="ignore")

In [26]:
df.head()

Unnamed: 0,date,building_id,daily_consumption,site_id,sqm,lat,lng,has_location,airTemperature,dewTemperature,seaLvlPressure,windDirection,windSpeed,log_daily_consumption,year,month,dayofweek,dayofyear,is_weekend,month_sin,month_cos,dow_sin,dow_cos,log_sqm,consumption_per_sqm,log_consumption_per_sqm,primaryspaceusage_Entertainment/public assembly,primaryspaceusage_Food sales and service,primaryspaceusage_Healthcare,primaryspaceusage_Lodging/residential,primaryspaceusage_Manufacturing/industrial,primaryspaceusage_Office,primaryspaceusage_Other,primaryspaceusage_Parking,primaryspaceusage_Public services,primaryspaceusage_Religious worship,primaryspaceusage_Retail,primaryspaceusage_Services,primaryspaceusage_Technology/science,primaryspaceusage_Utility,primaryspaceusage_Warehouse/storage,timezone_Europe/London,timezone_US/Central,timezone_US/Eastern,timezone_US/Mountain,timezone_US/Pacific,is_outlier_building
0,2016-01-01,Robin_public_Carolina,2183.75,Robin,10984.0,51.51879,-0.134556,1,5.3875,3.879167,1016.941667,116.666667,4.470833,7.689257,2016,1,4,1,0,0.5,0.866025,-0.433884,-0.900969,9.304286,0.198812,0.181331,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
1,2016-01-02,Robin_public_Carolina,2205.563,Robin,10984.0,51.51879,-0.134556,1,9.783333,9.183333,998.0875,166.666667,7.241667,7.699191,2016,1,5,2,1,0.5,0.866025,-0.974928,-0.222521,9.304286,0.200798,0.182986,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
2,2016-01-03,Robin_public_Carolina,2108.75,Robin,10984.0,51.51879,-0.134556,1,7.954167,6.6875,991.8375,187.916667,5.633333,7.654325,2016,1,6,3,1,0.5,0.866025,-0.781831,0.62349,9.304286,0.191984,0.175619,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
3,2016-01-04,Robin_public_Carolina,3536.562,Robin,10984.0,51.51879,-0.134556,1,7.841667,6.508333,982.766667,177.083333,4.145833,8.171193,2016,1,0,4,0,0.5,0.866025,0.0,1.0,9.304286,0.321974,0.279126,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
4,2016-01-05,Robin_public_Carolina,3771.376,Robin,10984.0,51.51879,-0.134556,1,7.904167,7.058333,984.7125,175.416667,2.75,8.23546,2016,1,1,5,0,0.5,0.866025,0.781831,0.62349,9.304286,0.343352,0.295168,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0


In [23]:
# Persist the building-feature dataset while site_id is still the raw label
# (this cell runs before the site_id encoding step).
df.to_parquet("building_energy_with_buildingfeatures_org_siteid.parquet", index=False)
# Message typo fixed: "buildinf-based" -> "building-based"
print("Saved dataset with building-based features to 'building_energy_with_buildingfeatures_org_siteid.parquet' site id not encoded")

Saved dataset with buildinf-based features to 'building_energy_with_buildingfeatures_org_siteid.parquet' site id not encoded


In [29]:
# Encode site_id: one-hot when there are at most 20 distinct sites, otherwise
# replace it with the per-site mean of log_daily_consumption.
n_sites = df["site_id"].nunique()
if n_sites > 20:
    # NOTE(review): this branch encodes each site with a statistic of
    # log_daily_consumption computed over all rows — if that column is (a
    # transform of) the target, this leaks it into the features; confirm.
    site_encoding = df.groupby("site_id")["log_daily_consumption"].transform("mean")
    df["site_id_encoded"] = site_encoding
    df = df.drop(columns=["site_id"])
else:
    df = pd.get_dummies(df, columns=["site_id"], drop_first=True)

In [30]:
# Cast any bool columns (e.g. freshly created dummies) to integer 0/1
bool_cols = df.select_dtypes(include="bool").columns
df = df.astype({name: int for name in bool_cols})

In [31]:
df.head()

Unnamed: 0,date,building_id,daily_consumption,sqm,lat,lng,has_location,airTemperature,dewTemperature,seaLvlPressure,windDirection,windSpeed,log_daily_consumption,year,month,dayofweek,dayofyear,is_weekend,month_sin,month_cos,dow_sin,dow_cos,log_sqm,consumption_per_sqm,log_consumption_per_sqm,primaryspaceusage_Entertainment/public assembly,primaryspaceusage_Food sales and service,primaryspaceusage_Healthcare,primaryspaceusage_Lodging/residential,primaryspaceusage_Manufacturing/industrial,primaryspaceusage_Office,primaryspaceusage_Other,primaryspaceusage_Parking,primaryspaceusage_Public services,primaryspaceusage_Religious worship,primaryspaceusage_Retail,primaryspaceusage_Services,primaryspaceusage_Technology/science,primaryspaceusage_Utility,primaryspaceusage_Warehouse/storage,timezone_Europe/London,timezone_US/Central,timezone_US/Eastern,timezone_US/Mountain,timezone_US/Pacific,is_outlier_building,site_id_Bobcat,site_id_Bull,site_id_Cockatoo,site_id_Crow,site_id_Eagle,site_id_Fox,site_id_Gator,site_id_Hog,site_id_Lamb,site_id_Moose,site_id_Mouse,site_id_Peacock,site_id_Rat,site_id_Robin,site_id_Wolf
0,2016-01-01,Robin_public_Carolina,2183.75,10984.0,51.51879,-0.134556,1,5.3875,3.879167,1016.941667,116.666667,4.470833,7.689257,2016,1,4,1,0,0.5,0.866025,-0.433884,-0.900969,9.304286,0.198812,0.181331,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,2016-01-02,Robin_public_Carolina,2205.563,10984.0,51.51879,-0.134556,1,9.783333,9.183333,998.0875,166.666667,7.241667,7.699191,2016,1,5,2,1,0.5,0.866025,-0.974928,-0.222521,9.304286,0.200798,0.182986,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,2016-01-03,Robin_public_Carolina,2108.75,10984.0,51.51879,-0.134556,1,7.954167,6.6875,991.8375,187.916667,5.633333,7.654325,2016,1,6,3,1,0.5,0.866025,-0.781831,0.62349,9.304286,0.191984,0.175619,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,2016-01-04,Robin_public_Carolina,3536.562,10984.0,51.51879,-0.134556,1,7.841667,6.508333,982.766667,177.083333,4.145833,8.171193,2016,1,0,4,0,0.5,0.866025,0.0,1.0,9.304286,0.321974,0.279126,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,2016-01-05,Robin_public_Carolina,3771.376,10984.0,51.51879,-0.134556,1,7.904167,7.058333,984.7125,175.416667,2.75,8.23546,2016,1,1,5,0,0.5,0.866025,0.781831,0.62349,9.304286,0.343352,0.295168,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [32]:
df.shape

(918136, 61)

In [33]:
# Persist the building-feature dataset after site_id has been encoded.
df.to_parquet("building_energy_with_buildingfeatures.parquet", index=False)
# The previous message claimed "site id not encoded", which describes the
# *_org_siteid file saved earlier, not this one.
print("Saved dataset with building-based features to 'building_energy_with_buildingfeatures.parquet' site id encoded")

Saved dataset with building-based features to 'building_energy_with_buildingfeatures.parquet' site id not encoded


In [2]:
# Load the building-feature dataset (the variant saved after site_id encoding).
# NOTE(review): `df` is reassigned from a different parquet file two cells
# below, so anything computed on this frame in between is discarded — confirm
# whether this cell is still needed.
df = pd.read_parquet("building_energy_with_buildingfeatures.parquet")

In [3]:
# Spread between air temperature and dew point, used as a proxy for humidity
df["temp_diff"] = df["airTemperature"].sub(df["dewTemperature"])

In [3]:
# Load the building-feature dataset that was saved before site_id was encoded,
# so the raw site_id column is still available for per-site grouping.
df = pd.read_parquet("building_energy_with_buildingfeatures_org_siteid.parquet")

In [4]:
df.head()

Unnamed: 0,date,building_id,daily_consumption,site_id,sqm,lat,lng,has_location,airTemperature,dewTemperature,...,primaryspaceusage_Services,primaryspaceusage_Technology/science,primaryspaceusage_Utility,primaryspaceusage_Warehouse/storage,timezone_Europe/London,timezone_US/Central,timezone_US/Eastern,timezone_US/Mountain,timezone_US/Pacific,is_outlier_building
0,2016-01-01,Robin_public_Carolina,2183.75,Robin,10984.0,51.51879,-0.134556,1,5.3875,3.879167,...,0,0,0,0,1,0,0,0,0,0
1,2016-01-02,Robin_public_Carolina,2205.563,Robin,10984.0,51.51879,-0.134556,1,9.783333,9.183333,...,0,0,0,0,1,0,0,0,0,0
2,2016-01-03,Robin_public_Carolina,2108.75,Robin,10984.0,51.51879,-0.134556,1,7.954167,6.6875,...,0,0,0,0,1,0,0,0,0,0
3,2016-01-04,Robin_public_Carolina,3536.562,Robin,10984.0,51.51879,-0.134556,1,7.841667,6.508333,...,0,0,0,0,1,0,0,0,0,0
4,2016-01-05,Robin_public_Carolina,3771.376,Robin,10984.0,51.51879,-0.134556,1,7.904167,7.058333,...,0,0,0,0,1,0,0,0,0,0


In [5]:
# Keep a copy of the raw site label: site_id itself is consumed by the encoding
# step further down, while the per-site temperature features group on this copy.
df["site_id_orig"] = df["site_id"]

In [6]:
df.head()

Unnamed: 0,date,building_id,daily_consumption,site_id,sqm,lat,lng,has_location,airTemperature,dewTemperature,...,primaryspaceusage_Technology/science,primaryspaceusage_Utility,primaryspaceusage_Warehouse/storage,timezone_Europe/London,timezone_US/Central,timezone_US/Eastern,timezone_US/Mountain,timezone_US/Pacific,is_outlier_building,site_id_orig
0,2016-01-01,Robin_public_Carolina,2183.75,Robin,10984.0,51.51879,-0.134556,1,5.3875,3.879167,...,0,0,0,1,0,0,0,0,0,Robin
1,2016-01-02,Robin_public_Carolina,2205.563,Robin,10984.0,51.51879,-0.134556,1,9.783333,9.183333,...,0,0,0,1,0,0,0,0,0,Robin
2,2016-01-03,Robin_public_Carolina,2108.75,Robin,10984.0,51.51879,-0.134556,1,7.954167,6.6875,...,0,0,0,1,0,0,0,0,0,Robin
3,2016-01-04,Robin_public_Carolina,3536.562,Robin,10984.0,51.51879,-0.134556,1,7.841667,6.508333,...,0,0,0,1,0,0,0,0,0,Robin
4,2016-01-05,Robin_public_Carolina,3771.376,Robin,10984.0,51.51879,-0.134556,1,7.904167,7.058333,...,0,0,0,1,0,0,0,0,0,Robin


In [7]:
# Gap between air temperature and dew point, used as a proxy for humidity
# (the smaller the gap, the closer the air is to saturation).
df["temp_diff"] = df["airTemperature"].sub(df["dewTemperature"])

In [8]:
# Rolling mean temperature (3-day window) per site.
# The frame has one row per building per day, so every site repeats each date
# once per building. A row-based window over those rows would mostly average a
# day with itself, so the window is applied to a one-row-per-(site, date)
# series and the result is mapped back onto every building row.
# Assumes each site has a row for every consecutive day; a missing date would
# stretch the 3-row window over more than three days -- TODO confirm.
df = df.sort_values(["site_id_orig", "date"])

# One temperature per site and day (groupby sorts by site, then date).
site_daily_temp = df.groupby(["site_id_orig", "date"])["airTemperature"].mean()

# Window over consecutive days within each site; min_periods=1 keeps the first
# days of a site (fewer than three observations) instead of producing NaN.
site_rolling_mean = site_daily_temp.groupby(level="site_id_orig").transform(
    lambda x: x.rolling(window=3, min_periods=1).mean()
)

# Look up the (site, date) value for every row; to_numpy() assigns by position
# so the frame keeps its existing index.
row_keys = pd.MultiIndex.from_frame(df[["site_id_orig", "date"]])
df["rolling_temp_mean_3d"] = site_rolling_mean.reindex(row_keys).to_numpy()

In [9]:
# Mean air temperature of each (site, calendar month) pair, broadcast back to
# every row so it lines up with the frame's index. This is the baseline the
# monthly temperature anomaly is measured against.
monthly_means = (
    df["airTemperature"]
    .groupby([df["site_id_orig"], df["month"]])
    .transform("mean")
)

In [10]:
# Anomaly = the day's air temperature minus the row-aligned baseline held in monthly_means.
df["monthly_temp_anomaly"] = df["airTemperature"].sub(monthly_means)

In [11]:
# Rolling temperature volatility (3-day std) per site.
# The frame has one row per building per day, so a row-based window inside a
# site would span several buildings on the same date and yield a spread of
# (almost always) zero. The window is therefore applied to a one-row-per-
# (site, date) series and the result is mapped back onto every building row.
# Assumes each site has a row for every consecutive day -- TODO confirm.

# One temperature per site and day (groupby sorts by site, then date).
site_daily_temp = df.groupby(["site_id_orig", "date"])["airTemperature"].mean()

# The first day of each site stays NaN: a standard deviation needs at least
# two observations, regardless of min_periods=1.
site_rolling_std = site_daily_temp.groupby(level="site_id_orig").transform(
    lambda x: x.rolling(window=3, min_periods=1).std()
)

# Look up the (site, date) value for every row; to_numpy() assigns by position
# so the frame keeps its existing index and row order.
row_keys = pd.MultiIndex.from_frame(df[["site_id_orig", "date"]])
df["rolling_temp_std_3d"] = site_rolling_std.reindex(row_keys).to_numpy()

In [12]:
# Discard the 3-day volatility column; it is not carried into the saved dataset.
df = df.drop(columns=["rolling_temp_std_3d"])

In [13]:
# Encode site_id.
# More than 20 distinct sites: replace the label with the site's mean
# log_daily_consumption. Otherwise: one-hot encode, dropping the first level
# so it acts as the baseline category.
if df["site_id"].nunique() > 20:
    # NOTE(review): the mean is taken over every row of df. If
    # log_daily_consumption is the modelling target, this feeds target
    # information from rows that may later be held out -- confirm.
    site_encoding = df.groupby("site_id")["log_daily_consumption"].transform("mean")
    df["site_id_encoded"] = site_encoding
    df = df.drop(columns=["site_id"])
else:
    df = pd.get_dummies(df, columns=["site_id"], drop_first=True)

In [14]:
# Cast every boolean-typed column to integers so True/False become 1/0.
bool_cols = df.select_dtypes(include="bool").columns
df = df.astype({col: int for col in bool_cols})

In [15]:
# Preview the first five rows after the site encoding and the bool-to-int cast.
df.head(n=5)

Unnamed: 0,date,building_id,daily_consumption,sqm,lat,lng,has_location,airTemperature,dewTemperature,seaLvlPressure,...,site_id_Fox,site_id_Gator,site_id_Hog,site_id_Lamb,site_id_Moose,site_id_Mouse,site_id_Peacock,site_id_Rat,site_id_Robin,site_id_Wolf
323102,2016-01-01,Bear_public_Orville,300.5108,2700.0,37.871903,-122.260729,1,6.175,-5.229167,1020.891667,...,0,0,0,0,0,0,0,0,0,0
323833,2016-01-01,Bear_education_Lidia,189.5,2824.3,37.871903,-122.260729,1,6.175,-5.229167,1020.891667,...,0,0,0,0,0,0,0,0,0,0
324564,2016-01-01,Bear_education_Nanette,2625.9552,6150.5,37.871903,-122.260729,1,6.175,-5.229167,1020.891667,...,0,0,0,0,0,0,0,0,0,0
325295,2016-01-01,Bear_education_Lewis,371.6389,4588.3,37.871903,-122.260729,1,6.175,-5.229167,1020.891667,...,0,0,0,0,0,0,0,0,0,0
326026,2016-01-01,Bear_science_Alison,1801.95,4945.6,37.871903,-122.260729,1,6.175,-5.229167,1020.891667,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Lift pandas' column display limit so wide frames render every column.
pd.options.display.max_columns = None

In [17]:
# Preview the first five rows, this time with the full column list visible.
df.head(n=5)

Unnamed: 0,date,building_id,daily_consumption,sqm,lat,lng,has_location,airTemperature,dewTemperature,seaLvlPressure,windDirection,windSpeed,log_daily_consumption,year,month,dayofweek,dayofyear,is_weekend,month_sin,month_cos,dow_sin,dow_cos,log_sqm,consumption_per_sqm,log_consumption_per_sqm,primaryspaceusage_Entertainment/public assembly,primaryspaceusage_Food sales and service,primaryspaceusage_Healthcare,primaryspaceusage_Lodging/residential,primaryspaceusage_Manufacturing/industrial,primaryspaceusage_Office,primaryspaceusage_Other,primaryspaceusage_Parking,primaryspaceusage_Public services,primaryspaceusage_Religious worship,primaryspaceusage_Retail,primaryspaceusage_Services,primaryspaceusage_Technology/science,primaryspaceusage_Utility,primaryspaceusage_Warehouse/storage,timezone_Europe/London,timezone_US/Central,timezone_US/Eastern,timezone_US/Mountain,timezone_US/Pacific,is_outlier_building,site_id_orig,temp_diff,rolling_temp_mean_3d,monthly_temp_anomaly,site_id_Bobcat,site_id_Bull,site_id_Cockatoo,site_id_Crow,site_id_Eagle,site_id_Fox,site_id_Gator,site_id_Hog,site_id_Lamb,site_id_Moose,site_id_Mouse,site_id_Peacock,site_id_Rat,site_id_Robin,site_id_Wolf
323102,2016-01-01,Bear_public_Orville,300.5108,2700.0,37.871903,-122.260729,1,6.175,-5.229167,1020.891667,68.75,3.070833,5.708806,2016,1,4,1,0,0.5,0.866025,-0.433884,-0.900969,7.901377,0.1113,0.105531,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,Bear,11.404167,6.175,-4.737231,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
323833,2016-01-01,Bear_education_Lidia,189.5,2824.3,37.871903,-122.260729,1,6.175,-5.229167,1020.891667,68.75,3.070833,5.249652,2016,1,4,1,0,0.5,0.866025,-0.433884,-0.900969,7.94637,0.067096,0.064941,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,Bear,11.404167,6.175,-4.737231,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
324564,2016-01-01,Bear_education_Nanette,2625.9552,6150.5,37.871903,-122.260729,1,6.175,-5.229167,1020.891667,68.75,3.070833,7.873581,2016,1,4,1,0,0.5,0.866025,-0.433884,-0.900969,8.724451,0.42695,0.355539,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,Bear,11.404167,6.175,-4.737231,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
325295,2016-01-01,Bear_education_Lewis,371.6389,4588.3,37.871903,-122.260729,1,6.175,-5.229167,1020.891667,68.75,3.070833,5.92061,2016,1,4,1,0,0.5,0.866025,-0.433884,-0.900969,8.431483,0.080997,0.077884,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,Bear,11.404167,6.175,-4.737231,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
326026,2016-01-01,Bear_science_Alison,1801.95,4945.6,37.871903,-122.260729,1,6.175,-5.229167,1020.891667,68.75,3.070833,7.497179,2016,1,4,1,0,0.5,0.866025,-0.433884,-0.900969,8.506456,0.364354,0.310681,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,Bear,11.404167,6.175,-4.737231,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [18]:
# Write the frame to parquet without its index, then report the destination.
df.to_parquet(path="building_energy_with_buildingfeatures.parquet", index=False)
print(
    "Saved dataset with building-based features to 'building_energy_with_buildingfeatures.parquet' With initial temp features"
)

Saved dataset with building-based features to 'building_energy_with_buildingfeatures.parquet' With initial temp features


In [3]:
# Read the building-feature table from parquet for the humidity and wind features.
buildingfeatures_path = "building_energy_with_buildingfeatures.parquet"
df = pd.read_parquet(buildingfeatures_path)

In [4]:
# Preview the first five rows of the reloaded frame.
df.head(n=5)

Unnamed: 0,date,building_id,daily_consumption,sqm,lat,lng,has_location,airTemperature,dewTemperature,seaLvlPressure,...,site_id_Fox,site_id_Gator,site_id_Hog,site_id_Lamb,site_id_Moose,site_id_Mouse,site_id_Peacock,site_id_Rat,site_id_Robin,site_id_Wolf
0,2016-01-01,Bear_public_Orville,300.5108,2700.0,37.871903,-122.260729,1,6.175,-5.229167,1020.891667,...,0,0,0,0,0,0,0,0,0,0
1,2016-01-01,Bear_education_Lidia,189.5,2824.3,37.871903,-122.260729,1,6.175,-5.229167,1020.891667,...,0,0,0,0,0,0,0,0,0,0
2,2016-01-01,Bear_education_Nanette,2625.9552,6150.5,37.871903,-122.260729,1,6.175,-5.229167,1020.891667,...,0,0,0,0,0,0,0,0,0,0
3,2016-01-01,Bear_education_Lewis,371.6389,4588.3,37.871903,-122.260729,1,6.175,-5.229167,1020.891667,...,0,0,0,0,0,0,0,0,0,0
4,2016-01-01,Bear_science_Alison,1801.95,4945.6,37.871903,-122.260729,1,6.175,-5.229167,1020.891667,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Relative humidity (%) from air and dew-point temperature via the Magnus
# approximation: RH = 100 * e_s(T_dew) / e_s(T_air), with e_s(T) proportional
# to exp(17.625 * T / (243.04 + T)).
# Assumes both temperature columns are in degrees Celsius -- TODO confirm.
dew_term = np.exp((17.625 * df["dewTemperature"]) / (243.04 + df["dewTemperature"]))
air_term = np.exp((17.625 * df["airTemperature"]) / (243.04 + df["airTemperature"]))

# Clip to the valid range [0, 100] (a dew point above the air temperature
# would otherwise give a value over 100).
df["relative_humidity"] = (100 * (dew_term / air_term)).clip(0, 100)


In [6]:
# Squared wind speed.
# NOTE(review): despite the column name this is speed ** 2, not speed ** 3 --
# confirm the intended exponent.
df["wind_power"] = df["windSpeed"].pow(2)

In [7]:
# Split wind speed and direction (degrees) into Cartesian components so the
# direction no longer wraps around at 360.
# NOTE(review): u uses cos and v uses sin of the raw angle; meteorological
# u/v conventions differ from this -- confirm which convention is intended.
wind_dir_rad = np.deg2rad(df["windDirection"])
df["wind_u"] = df["windSpeed"] * np.cos(wind_dir_rad)
df["wind_v"] = df["windSpeed"] * np.sin(wind_dir_rad)

In [8]:
# Lift pandas' column display limit so the new feature columns are visible.
pd.options.display.max_columns = None

In [9]:
# Preview the first five rows, including relative_humidity, wind_power, wind_u and wind_v.
df.head(n=5)

Unnamed: 0,date,building_id,daily_consumption,sqm,lat,lng,has_location,airTemperature,dewTemperature,seaLvlPressure,windDirection,windSpeed,log_daily_consumption,year,month,dayofweek,dayofyear,is_weekend,month_sin,month_cos,dow_sin,dow_cos,log_sqm,consumption_per_sqm,log_consumption_per_sqm,primaryspaceusage_Entertainment/public assembly,primaryspaceusage_Food sales and service,primaryspaceusage_Healthcare,primaryspaceusage_Lodging/residential,primaryspaceusage_Manufacturing/industrial,primaryspaceusage_Office,primaryspaceusage_Other,primaryspaceusage_Parking,primaryspaceusage_Public services,primaryspaceusage_Religious worship,primaryspaceusage_Retail,primaryspaceusage_Services,primaryspaceusage_Technology/science,primaryspaceusage_Utility,primaryspaceusage_Warehouse/storage,timezone_Europe/London,timezone_US/Central,timezone_US/Eastern,timezone_US/Mountain,timezone_US/Pacific,is_outlier_building,site_id_orig,temp_diff,rolling_temp_mean_3d,monthly_temp_anomaly,site_id_Bobcat,site_id_Bull,site_id_Cockatoo,site_id_Crow,site_id_Eagle,site_id_Fox,site_id_Gator,site_id_Hog,site_id_Lamb,site_id_Moose,site_id_Mouse,site_id_Peacock,site_id_Rat,site_id_Robin,site_id_Wolf,relative_humidity,wind_power,wind_u,wind_v
0,2016-01-01,Bear_public_Orville,300.5108,2700.0,37.871903,-122.260729,1,6.175,-5.229167,1020.891667,68.75,3.070833,5.708806,2016,1,4,1,0,0.5,0.866025,-0.433884,-0.900969,7.901377,0.1113,0.105531,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,Bear,11.404167,6.175,-4.737231,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,43.855907,9.430017,1.112987,2.862041
1,2016-01-01,Bear_education_Lidia,189.5,2824.3,37.871903,-122.260729,1,6.175,-5.229167,1020.891667,68.75,3.070833,5.249652,2016,1,4,1,0,0.5,0.866025,-0.433884,-0.900969,7.94637,0.067096,0.064941,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,Bear,11.404167,6.175,-4.737231,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,43.855907,9.430017,1.112987,2.862041
2,2016-01-01,Bear_education_Nanette,2625.9552,6150.5,37.871903,-122.260729,1,6.175,-5.229167,1020.891667,68.75,3.070833,7.873581,2016,1,4,1,0,0.5,0.866025,-0.433884,-0.900969,8.724451,0.42695,0.355539,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,Bear,11.404167,6.175,-4.737231,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,43.855907,9.430017,1.112987,2.862041
3,2016-01-01,Bear_education_Lewis,371.6389,4588.3,37.871903,-122.260729,1,6.175,-5.229167,1020.891667,68.75,3.070833,5.92061,2016,1,4,1,0,0.5,0.866025,-0.433884,-0.900969,8.431483,0.080997,0.077884,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,Bear,11.404167,6.175,-4.737231,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,43.855907,9.430017,1.112987,2.862041
4,2016-01-01,Bear_science_Alison,1801.95,4945.6,37.871903,-122.260729,1,6.175,-5.229167,1020.891667,68.75,3.070833,7.497179,2016,1,4,1,0,0.5,0.866025,-0.433884,-0.900969,8.506456,0.364354,0.310681,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,Bear,11.404167,6.175,-4.737231,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,43.855907,9.430017,1.112987,2.862041


In [10]:
# (row count, column count) of the fully engineered frame.
len(df.index), len(df.columns)

(918136, 69)

In [11]:
# Write the fully engineered frame to parquet without its index, then report the destination.
df.to_parquet(path="building_energy_with_complete_feature_eng.parquet", index=False)
print(
    "Saved dataset with complete feature engineering to building_energy_with_complete_feature_eng.parquet"
)

Saved dataset with complete feature engineering to building_energy_with_complete_feature_eng.parquet
