## 1. Introduction <a id='1'></a>

The main task of this notebook is to perform feature engineering on the cleaned and merged dataset, and to produce the dataset used for training and testing.

## 2. Feature Engineering <a id='2'></a>

### 2.1 Imports <a id='2.1'></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

import matplotlib as mpl
import matplotlib.pylab as pylab
import matplotlib.patches as mpatches
from matplotlib.gridspec import GridSpec

# Metrics & Models
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import tree, metrics
%matplotlib inline

import pandas as pd
import numpy as np
import os
import pickle
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
import datetime



In [2]:
# Location of the merged input data
path = 'data/'
file_name = 'mlmodeldata.csv'
full_path = f'{path}{file_name}'

# The first CSV column is used as the row index
df = pd.read_csv(full_path, index_col=[0])

In [3]:
# Preview the first rows of the freshly loaded data
df.head()

Unnamed: 0,building_id,loc_id,subUsage,sqm,Unnamed: 0_x,index,timestamp,meter_reading,Unnamed: 0_y,site_id,airTemperature,dewTemperature,seaLvlPressure,windDirection,windSpeed
0,789,7.0,College Classroom,6000.0,0,0,2016-01-01 00:00:00,516.43,17544,Ottawa,-2.2,-3.6,1014.5,270.0,2.1
1,789,7.0,College Classroom,6000.0,1,1,2016-01-01 01:00:00,519.4,17545,Ottawa,-2.3,-3.8,1014.5,260.0,2.1
2,789,7.0,College Classroom,6000.0,2,2,2016-01-01 02:00:00,501.68,17546,Ottawa,-2.8,-4.0,1014.0,260.0,1.5
3,789,7.0,College Classroom,6000.0,3,3,2016-01-01 03:00:00,507.87,17547,Ottawa,-3.0,-3.8,1014.0,260.0,1.5
4,789,7.0,College Classroom,6000.0,4,4,2016-01-01 04:00:00,509.45,17548,Ottawa,-3.3,-3.8,1013.5,230.0,1.0


In [4]:
# Columns to discard before feature engineering
columns = [
    'loc_id',
    'Unnamed: 0_x',
    'index',
    'Unnamed: 0_y',
]

df = df.drop(labels=columns, axis='columns')

In [5]:
# Preview again to confirm the dropped columns are gone
df.head()

Unnamed: 0,building_id,subUsage,sqm,timestamp,meter_reading,site_id,airTemperature,dewTemperature,seaLvlPressure,windDirection,windSpeed
0,789,College Classroom,6000.0,2016-01-01 00:00:00,516.43,Ottawa,-2.2,-3.6,1014.5,270.0,2.1
1,789,College Classroom,6000.0,2016-01-01 01:00:00,519.4,Ottawa,-2.3,-3.8,1014.5,260.0,2.1
2,789,College Classroom,6000.0,2016-01-01 02:00:00,501.68,Ottawa,-2.8,-4.0,1014.0,260.0,1.5
3,789,College Classroom,6000.0,2016-01-01 03:00:00,507.87,Ottawa,-3.0,-3.8,1014.0,260.0,1.5
4,789,College Classroom,6000.0,2016-01-01 04:00:00,509.45,Ottawa,-3.3,-3.8,1013.5,230.0,1.0


In [6]:
# Inspect dtypes and non-null counts to locate columns with missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 349756 entries, 0 to 349755
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   building_id     349756 non-null  int64  
 1   subUsage        349756 non-null  object 
 2   sqm             349756 non-null  float64
 3   timestamp       349756 non-null  object 
 4   meter_reading   349756 non-null  float64
 5   site_id         349756 non-null  object 
 6   airTemperature  336171 non-null  float64
 7   dewTemperature  335252 non-null  float64
 8   seaLvlPressure  335492 non-null  float64
 9   windDirection   336171 non-null  float64
 10  windSpeed       336171 non-null  float64
dtypes: float64(7), int64(1), object(3)
memory usage: 32.0+ MB


In [7]:
# Fill gaps in the weather readings with linear interpolation.
#
# The columns are interpolated together and the result is assigned back to
# `df`. Calling `df.<column>.interpolate(..., inplace=True)` is a chained
# in-place update: newer pandas versions warn about it, and with
# Copy-on-Write enabled it no longer modifies `df` at all.
#
# NOTE(review): interpolation runs down the frame in row order, so a gap that
# sits at the boundary between two buildings/sites is filled using the
# neighbouring rows of the other one -- confirm this is acceptable.
# NOTE(review): windDirection looks like an angle in degrees; linear
# interpolation does not wrap around 360 -> 0 -- confirm this is acceptable.
weather_columns = [
    'airTemperature',
    'dewTemperature',
    'seaLvlPressure',
    'windDirection',
    'windSpeed',
]
df[weather_columns] = df[weather_columns].interpolate(method='linear')

In [8]:
# Re-check the non-null counts after interpolating the weather columns
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 349756 entries, 0 to 349755
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   building_id     349756 non-null  int64  
 1   subUsage        349756 non-null  object 
 2   sqm             349756 non-null  float64
 3   timestamp       349756 non-null  object 
 4   meter_reading   349756 non-null  float64
 5   site_id         349756 non-null  object 
 6   airTemperature  349756 non-null  float64
 7   dewTemperature  349756 non-null  float64
 8   seaLvlPressure  349756 non-null  float64
 9   windDirection   349756 non-null  float64
 10  windSpeed       349756 non-null  float64
dtypes: float64(7), int64(1), object(3)
memory usage: 32.0+ MB


In [9]:
# Drop rows whose meter reading is exactly zero.
# `.copy()` makes the filtered frame independent of the original, so the
# column assignments performed on `df` afterwards do not raise pandas'
# SettingWithCopyWarning. The original row labels are kept (no index reset).
df = df[df['meter_reading'] != 0].copy()

In [10]:
# Re-check the row count after removing the zero meter readings
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 341735 entries, 0 to 348505
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   building_id     341735 non-null  int64  
 1   subUsage        341735 non-null  object 
 2   sqm             341735 non-null  float64
 3   timestamp       341735 non-null  object 
 4   meter_reading   341735 non-null  float64
 5   site_id         341735 non-null  object 
 6   airTemperature  341735 non-null  float64
 7   dewTemperature  341735 non-null  float64
 8   seaLvlPressure  341735 non-null  float64
 9   windDirection   341735 non-null  float64
 10  windSpeed       341735 non-null  float64
dtypes: float64(7), int64(1), object(3)
memory usage: 31.3+ MB


In [11]:
# Parse the timestamp strings, then expose calendar parts as separate columns
df['timestamp'] = pd.to_datetime(df['timestamp'])

stamp = df['timestamp'].dt
for feature in ('month', 'weekday', 'hour', 'year'):
    # Each feature name matches the attribute of the datetime accessor
    df[feature] = getattr(stamp, feature)

In [12]:
# Preview the frame with the new month/weekday/hour/year columns
df.head()

Unnamed: 0,building_id,subUsage,sqm,timestamp,meter_reading,site_id,airTemperature,dewTemperature,seaLvlPressure,windDirection,windSpeed,month,weekday,hour,year
0,789,College Classroom,6000.0,2016-01-01 00:00:00,516.43,Ottawa,-2.2,-3.6,1014.5,270.0,2.1,1,4,0,2016
1,789,College Classroom,6000.0,2016-01-01 01:00:00,519.4,Ottawa,-2.3,-3.8,1014.5,260.0,2.1,1,4,1,2016
2,789,College Classroom,6000.0,2016-01-01 02:00:00,501.68,Ottawa,-2.8,-4.0,1014.0,260.0,1.5,1,4,2,2016
3,789,College Classroom,6000.0,2016-01-01 03:00:00,507.87,Ottawa,-3.0,-3.8,1014.0,260.0,1.5,1,4,3,2016
4,789,College Classroom,6000.0,2016-01-01 04:00:00,509.45,Ottawa,-3.3,-3.8,1013.5,230.0,1.0,1,4,4,2016


### 2.2 Data Saving

In [13]:
# Write the engineered frame (row index included) to the data directory
path = 'data/'
file = "mlmodeldata_final.csv"
df.to_csv(f'{path}{file}')