## Downloading the data

In [1]:
!wget https://github.com/rezpe/kaggleh/blob/main/household_power_consumption.txt.zip?raw=true

zsh:1: no matches found: https://github.com/rezpe/kaggleh/blob/main/household_power_consumption.txt.zip?raw=true


In [2]:
!unzip household_power_consumption.txt.zip?raw=true

zsh:1: no matches found: household_power_consumption.txt.zip?raw=true


## Loading the data

In [3]:
%pylab inline
import pandas as pd

Populating the interactive namespace from numpy and matplotlib


In [4]:
# Loading data
df = pd.read_csv("household_power_consumption.txt.zip",sep=";")
df.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,16/12/2006,17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
1,16/12/2006,17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2,16/12/2006,17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
3,16/12/2006,17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
4,16/12/2006,17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0


## Feature Engineering

In [5]:
# Date Column as datetime type
df["Date"]=df["Date"]+" "+df["Time"]
df["Date"]=pd.to_datetime(df["Date"],format="%d/%m/%Y %H:%M:%S")

In [6]:
# We keep the Global active power only
df = df[["Date","Global_active_power"]]

In [7]:
# We check there are non empty values
np.where(df["Global_active_power"].isnull())

(array([], dtype=int64),)

In [8]:
# The column is not in a proper format
df["Global_active_power"].dtypes
 
# If we try to convert to float an error is sent
# This is due to character ?, which is in fact a nan
# df["Global_active_power"].astype(float)

dtype('O')

In [9]:
# We correct with the following function
def correct(s):
  if s=="?":
    return np.nan
  else: 
    return float(s)
df["Global_active_power"] = df["Global_active_power"].apply(correct)

In [10]:
# We fill the empty values
df["Global_active_power"]=df["Global_active_power"].interpolate()

In [11]:
# We can also check if all the minute values are counted.
pd.date_range(start = df["Date"].min(), end = df["Date"].max() , freq="min").difference(df["Date"])

DatetimeIndex([], dtype='datetime64[ns]', freq=None)

In [12]:
# Let's convert to hourly signal
df["hour"]=df["Date"].dt.strftime("%d/%m/%Y %H")
dfh = df.groupby("hour").mean().reset_index()
dfh.head()

Unnamed: 0,hour,Global_active_power
0,01/01/2007 00,2.550633
1,01/01/2007 01,2.5234
2,01/01/2007 02,2.582333
3,01/01/2007 03,2.541667
4,01/01/2007 04,2.475733


In [13]:
# Lagged values
for h in [24,48,168]:
    dfh[f"gap-{h}"]=dfh["Global_active_power"].shift(h)
 
# Seasonal features: We will add daily, weekly and yearly season
# sin(2*pi*x/period)
for period in [24,24*7,24*365]:
    dfh[f"s-{period}"]=np.sin(2*np.pi*np.arange(len(dfh))/period)
    dfh[f"c-{period}"]=np.cos(2*np.pi*np.arange(len(dfh))/period)
 
dfh=dfh.dropna()

## Machine Learning

In [14]:
dfh.columns

Index(['hour', 'Global_active_power', 'gap-24', 'gap-48', 'gap-168', 's-24',
       'c-24', 's-168', 'c-168', 's-8760', 'c-8760'],
      dtype='object')

In [15]:
X = dfh[['gap-24', 'gap-48', 'gap-168', 's-24',
       's-168', 's-8760', 'c-24', 'c-168', 'c-8760']]
y = dfh['Global_active_power']

In [16]:
# Minimum Date
dfh["hour"].min()

'01/02/2010 00'

In [17]:
# Maximum Date

In [18]:
dfh["hour"].max()

'31/12/2009 23'

In [19]:
# Train / Test possibility 1
# We take last 3 months (3*30*24)
# !!!!!!! Gridsearch (and cross_val) in sklearn uses POSITIONAL index, not dataframe index
train_index = np.arange(len(X.index))[:-3*30*24]
test_index = np.arange(len(X.index))[-3*30*24:]

In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [21]:
reg = GridSearchCV(RandomForestRegressor(),
                   param_grid={"max_depth":np.arange(3,20)},
                   scoring="neg_mean_squared_error",
                   cv=[  (train_index,test_index)  ])
reg.fit(X,y)

GridSearchCV(cv=[(array([    0,     1,     2, ..., 32258, 32259, 32260]),
                  array([32261, 32262, 32263, ..., 34418, 34419, 34420]))],
             estimator=RandomForestRegressor(),
             param_grid={'max_depth': array([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19])},
             scoring='neg_mean_squared_error')

In [22]:
reg.best_params_

{'max_depth': 4}

In [23]:
reg.best_score_

-0.745039679521721

In [None]:
# Testing way 2
# We'll perform a cross-validation, but removing from training the points that are linked to the test set
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

tt_indexes = []
kf = KFold(5,shuffle=True)
# We loop through all the cv train/test sets
for train_index, test_index in kf.split(X):
    for i in [24,48,168]:
        train_index=np.array(list(set(train_index)-set(test_index-i)))
        tt_indexes.append((train_index,test_index))

reg = GridSearchCV(RandomForestRegressor(),
                   param_grid={"max_depth":np.arange(3,20)},
                   scoring="neg_mean_squared_error",
                   cv=tt_indexes)
reg.fit(X,y) 

In [None]:
reg.best_params_

In [None]:
reg.best_score_