## Downloading the data

In [None]:
# Download the zipped dataset from GitHub. wget keeps the "?raw=true" query
# string in the saved file name, which is why the unzip step uses that name.
!wget https://github.com/rezpe/kaggleh/blob/main/household_power_consumption.txt.zip?raw=true

--2021-09-23 14:21:50--  https://github.com/rezpe/kaggleh/blob/main/household_power_consumption.txt.zip?raw=true
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/rezpe/kaggleh/raw/main/household_power_consumption.txt.zip [following]
--2021-09-23 14:21:50--  https://github.com/rezpe/kaggleh/raw/main/household_power_consumption.txt.zip
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/rezpe/kaggleh/main/household_power_consumption.txt.zip [following]
--2021-09-23 14:21:50--  https://raw.githubusercontent.com/rezpe/kaggleh/main/household_power_consumption.txt.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|18

In [None]:
# Extract household_power_consumption.txt from the downloaded archive
# (the archive was saved with the "?raw=true" suffix in its name).
!unzip household_power_consumption.txt.zip?raw=true

Archive:  household_power_consumption.txt.zip?raw=true
  inflating: household_power_consumption.txt  


## Loading the data

In [None]:
%pylab inline
import pandas as pd

Populating the interactive namespace from numpy and matplotlib


In [None]:
# Loading data
# low_memory=False makes pandas infer each column's type from the whole file
# instead of chunk by chunk, so a column never ends up holding a mix of parsed
# numbers and raw strings (the situation pandas reports with a DtypeWarning).
df = pd.read_csv("household_power_consumption.txt",sep=";",low_memory=False)
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,16/12/2006,17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
1,16/12/2006,17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2,16/12/2006,17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
3,16/12/2006,17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
4,16/12/2006,17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0


## Feature Engineering

In [None]:
# Merge the Date and Time text columns, then parse the result so that "Date"
# becomes a single datetime column (day/month/year hour:minute:second).
timestamp_text = df["Date"] + " " + df["Time"]
df["Date"] = pd.to_datetime(timestamp_text, format="%d/%m/%Y %H:%M:%S")

In [None]:
# Only the timestamp and the global active power are used from here on
kept_columns = ["Date", "Global_active_power"]
df = df[kept_columns]

In [None]:
# Check that there are no empty (null) values: the positions of null entries
# are listed, so an empty array means none were found
missing_mask = df["Global_active_power"].isna()
np.where(missing_mask)

(array([], dtype=int64),)

In [None]:
# The column is not stored in a proper numeric format.
# Converting it directly to float, i.e.
#     df["Global_active_power"].astype(float)
# sends an error: this is due to the character ?, which is in fact a nan.
df["Global_active_power"].dtype

dtype('O')

In [None]:
# Parser used to clean the power column
def correct(s):
  """Convert one raw reading to a float.

  The placeholder "?" stands for a missing reading and becomes NaN; any
  other value is passed to float().
  """
  return np.nan if s=="?" else float(s)
# Run the parser on every entry: the column then holds floats, with NaN wherever "?" was read
df["Global_active_power"] = df["Global_active_power"].apply(correct)

In [None]:
# Missing readings are filled by linear interpolation between their neighbours
df["Global_active_power"] = df["Global_active_power"].interpolate(method="linear")

In [None]:
# Check that every minute between the first and the last timestamp is present:
# build the complete minute-by-minute range and list the minutes absent from the data
# (an empty index means nothing is missing).
expected_minutes = pd.date_range(start=df["Date"].min(),
                                 end=df["Date"].max(),
                                 freq="min")
expected_minutes.difference(df["Date"])

DatetimeIndex([], dtype='datetime64[ns]', freq=None)

In [None]:
# Let's convert to hourly signal
# The hour key is a real timestamp (floored to the hour), not a "dd/mm/YYYY HH"
# string: groupby sorts its keys, and sorting such strings orders the rows by
# day-of-month first instead of chronologically. The rows of dfh must be in
# time order because the lagged features (shift) and the "last 3 months" split
# both rely on row position.
df["hour"] = df["Date"].dt.floor("h")
# Only the power column is averaged; the key comes back as the "hour" column.
dfh = df.groupby("hour")[["Global_active_power"]].mean().reset_index()
dfh.head()

Unnamed: 0,hour,Global_active_power
0,01/01/2007 00,2.550633
1,01/01/2007 01,2.5234
2,01/01/2007 02,2.582333
3,01/01/2007 03,2.541667
4,01/01/2007 04,2.475733


In [None]:
# Lagged values: the power observed 24, 48 and 168 rows earlier
hourly_power = dfh["Global_active_power"]
for lag in (24, 48, 168):
  dfh[f"gap-{lag}"] = hourly_power.shift(lag)

# Seasonal features: We will add daily, weekly and yearly season
# sin(2*pi*x/period) and cos(2*pi*x/period), with x the row position
phase = 2*np.pi*np.arange(len(dfh))
for period in (24, 24*7, 24*365):
  dfh[f"s-{period}"] = np.sin(phase/period)
  dfh[f"c-{period}"] = np.cos(phase/period)

# The first rows have no lagged value available and are dropped
dfh = dfh.dropna()

## Machine Learning

In [None]:
# List the columns of the hourly frame before picking the model inputs
dfh.columns

Index(['hour', 'Global_active_power', 'gap-24', 'gap-48', 'gap-168', 's-24',
       'c-24', 's-168', 'c-168', 's-8760', 'c-8760'],
      dtype='object')

In [None]:
# Model inputs: the three lagged values plus the sine / cosine seasonal terms
feature_columns = ['gap-24', 'gap-48', 'gap-168',
                   's-24', 's-168', 's-8760',
                   'c-24', 'c-168', 'c-8760']
X = dfh[feature_columns]
# Target: the hourly global active power
y = dfh['Global_active_power']

In [None]:
# Minimum Date
# Compare as timestamps: when "hour" holds "dd/mm/YYYY HH" text, a plain min()
# is alphabetical (day-of-month first) rather than chronological.
pd.to_datetime(dfh["hour"], format="%d/%m/%Y %H").min()

'01/01/2007 00'

In [None]:
# Maximum Date

In [None]:
# Compare as timestamps: when "hour" holds "dd/mm/YYYY HH" text, a plain max()
# is alphabetical (day-of-month first) rather than chronological.
pd.to_datetime(dfh["hour"], format="%d/%m/%Y %H").max()

'31/12/2009 23'

In [None]:
# Train / Test possibility 1
# The last 3 months (3*30*24 hourly rows) form the test set, everything before is training
# !!!!!!! Gridsearch (and cross_val) in sklearn uses POSITIONAL index, not dataframe index
n_test_rows = 3*30*24
row_positions = np.arange(len(X))
train_index = row_positions[:-n_test_rows]
test_index = row_positions[-n_test_rows:]

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [None]:
# Grid search over the tree depth, evaluated on the single train/test split
# given by train_index / test_index.
# random_state pins the forest's random sampling so that the selected depth and
# its score are the same every time the notebook is re-run.
reg = GridSearchCV(RandomForestRegressor(random_state=0),
                   param_grid={"max_depth":np.arange(3,20)},
                   scoring="neg_mean_squared_error",
                   cv=[  (train_index,test_index)  ])
reg.fit(X,y)

GridSearchCV(cv=[(array([    0,     1,     2, ..., 32258, 32259, 32260]),
                  array([32261, 32262, 32263, ..., 34418, 34419, 34420]))],
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oo

In [None]:
# max_depth value selected by the grid search
reg.best_params_

{'max_depth': 4}

In [None]:
# Score of the selected candidate. The search is scored with
# neg_mean_squared_error, so this is the negated MSE (closer to 0 is better).
reg.best_score_

-0.7445459282495062

In [None]:
# Testing way 2
# We'll perform a cross-validation, but removing from training the points that are linked to the test set
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

# Row offsets used by the lagged features (gap-24, gap-48, gap-168)
lags = [24, 48, 168]

tt_indexes = []
# A fixed seed makes the shuffled folds identical from one run to the next
kf = KFold(5, shuffle=True, random_state=0)
# We loop through all the cv train/test sets.
# The loop uses its own names so the hold-out train_index / test_index defined
# for "possibility 1" are not overwritten.
for fold_train, fold_test in kf.split(X):
  # Positions located `lag` rows before a test row, for every lag
  linked = np.concatenate([fold_test - lag for lag in lags])
  # Remove them from the training positions in one step (result is sorted)
  purged_train = np.setdiff1d(fold_train, linked)
  # Exactly one (train, test) pair per fold, stored only once every lag has
  # been purged; appending inside a per-lag loop would register each fold
  # several times, including partially purged versions.
  tt_indexes.append((purged_train, fold_test))

# random_state pins the forest's random sampling so the result is reproducible
reg = GridSearchCV(RandomForestRegressor(random_state=0),
                   param_grid={"max_depth":np.arange(3,20)},
                   scoring="neg_mean_squared_error",
                   cv=tt_indexes)
reg.fit(X,y)

GridSearchCV(cv=[(array([    0,     4,     6, ..., 34418, 34419, 34420]),
                  array([    1,     2,     5, ..., 34405, 34411, 34414])),
                 (array([    0,     4, 32773, ..., 32761, 32762, 32766]),
                  array([    1,     2,     5, ..., 34405, 34411, 34414])),
                 (array([    0, 32773, 32774, ..., 32761, 32762, 32766]),
                  array([    1,     2,     5, ..., 34405, 34411, 34414])),
                 (array([    0,     1,     2, ..., 34418, 34419, 34420]),
                  array([    4,     6,    14, ..., 34393, 34397, 34404])),
                 (array([32768,     1,...
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_sta

In [None]:
# max_depth value selected by the cross-validated grid search
reg.best_params_

{'max_depth': 19}

In [None]:
# Score of the selected candidate. The search is scored with
# neg_mean_squared_error, so this is the negated MSE (closer to 0 is better).
reg.best_score_

-0.6066447118664455