## Downloading the data

In [1]:
!wget https://github.com/rezpe/kaggleh/blob/main/household_power_consumption.txt.zip?raw=true

zsh:1: no matches found: https://github.com/rezpe/kaggleh/blob/main/household_power_consumption.txt.zip?raw=true


In [2]:
!unzip household_power_consumption.txt.zip?raw=true

zsh:1: no matches found: household_power_consumption.txt.zip?raw=true


## Loading the data

In [3]:
%pylab inline
import pandas as pd

Populating the interactive namespace from numpy and matplotlib


In [4]:
# Read the semicolon-separated text file into a DataFrame and preview the first rows
raw_path = "household_power_consumption.txt"
df = pd.read_csv(raw_path, sep=";")
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'household_power_consumption.txt'

## Feature Engineering

In [None]:
# Join the Date and Time text columns and parse the result as a single datetime column
date_time_text = df["Date"] + " " + df["Time"]
df["Date"] = pd.to_datetime(date_time_text, format="%d/%m/%Y %H:%M:%S")

In [None]:
# We keep the timestamp and the Global active power only.
# .copy() makes this an independent frame: the following cells assign columns on df,
# and doing that on a slice of the original frame triggers pandas' SettingWithCopyWarning.
df = df[["Date","Global_active_power"]].copy()

In [None]:
# We look for empty (null) values: the output lists the positions where the column is null
missing_mask = df["Global_active_power"].isna()
np.where(missing_mask)

In [None]:
# The column is not stored in a proper numeric format.
# A direct conversion to float raises an error because of
# the "?" character, which in fact stands for a nan:
# df["Global_active_power"].astype(float)
df["Global_active_power"].dtype

In [None]:
# We correct the column with the following converter
def correct(s):
  """Convert one raw reading to a float.

  The placeholder "?" is mapped to NaN; any other value is passed to float().
  """
  return np.nan if s == "?" else float(s)
# Run the converter over every reading so the column holds floats / NaN
power_raw = df["Global_active_power"]
df["Global_active_power"] = power_raw.apply(correct)

In [None]:
# Fill the empty values by linear interpolation between the neighbouring readings
df["Global_active_power"] = df["Global_active_power"].interpolate(method="linear")

In [None]:
# We can also check that every minute between the first and the last timestamp is present:
# the output lists the minutes of the full range that do not appear in the data.
first_minute = df["Date"].min()
last_minute = df["Date"].max()
expected_minutes = pd.date_range(start=first_minute, end=last_minute, freq="min")
expected_minutes.difference(df["Date"])

In [None]:
# Let's convert to hourly signal
# The grouping key is the timestamp truncated to the hour, kept as a datetime.
# groupby sorts its keys, and the lag / seasonal features and the train/test split
# below all work on row position, so the rows must come out in chronological order.
# A "dd/mm/YYYY HH" text key would instead be sorted by day of month first.
df["hour"]=df["Date"].dt.floor("h")
dfh = df.groupby("hour").mean().reset_index()
dfh.head()

In [None]:
# Lagged values: consumption 24, 48 and 168 rows (hours) earlier
for lag in (24, 48, 168):
  dfh[f"gap-{lag}"] = dfh["Global_active_power"].shift(lag)

# Seasonal features: daily, weekly and yearly seasons, encoded as
# sin(2*pi*x/period) and cos(2*pi*x/period) with x the row position
row_position = np.arange(len(dfh))
for period in (24, 24*7, 24*365):
  angle = 2*np.pi*row_position/period
  dfh[f"s-{period}"] = np.sin(angle)
  dfh[f"c-{period}"] = np.cos(angle)

# The first rows have no lagged value available, so they are dropped
dfh = dfh.dropna()

## Machine Learning

In [None]:
# Display the column names of the hourly frame, including the engineered features
dfh.columns

In [None]:
# Inputs: the three lagged values plus the sine / cosine seasonal terms
feature_columns = ['gap-24', 'gap-48', 'gap-168',
                   's-24', 's-168', 's-8760',
                   'c-24', 'c-168', 'c-8760']
X = dfh[feature_columns]
# Target: the hourly Global_active_power
y = dfh['Global_active_power']

In [None]:
# Minimum Date
# The hour column is parsed before taking the minimum so the comparison is chronological;
# on "dd/mm/YYYY HH" text, min() would compare character by character (day of month first).
pd.to_datetime(dfh["hour"], format="%d/%m/%Y %H").min()

In [None]:
# Maximum Date

In [None]:
# The hour column is parsed before taking the maximum so the comparison is chronological;
# on "dd/mm/YYYY HH" text, max() would compare character by character (day of month first).
pd.to_datetime(dfh["hour"], format="%d/%m/%Y %H").max()

In [None]:
# Train / Test possibility 1
# The last 3 months (3*30*24 hourly rows) form the test set, everything before is training.
# Note: GridSearchCV (and cross_val_*) in sklearn expect POSITIONAL indices,
# not the dataframe index, hence the use of np.arange.
n_test_rows = 3*30*24
row_positions = np.arange(len(X))
train_index = row_positions[:-n_test_rows]
test_index = row_positions[-n_test_rows:]

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [None]:
# Grid search over the tree depth, each candidate being scored on the single
# train/test split defined above (negative MSE, so higher is better).
# random_state fixes the forest's random sampling so that re-running the notebook
# selects the same max_depth and reports the same score.
reg = GridSearchCV(RandomForestRegressor(random_state=0),
                   param_grid={"max_depth":np.arange(3,20)},
                   scoring="neg_mean_squared_error",
                   cv=[  (train_index,test_index)  ])
reg.fit(X,y)

In [None]:
# Parameter setting (max_depth) retained by the grid search
reg.best_params_

In [None]:
# Score of the retained setting. The scoring is "neg_mean_squared_error",
# so this is minus the MSE: the closer to 0, the better.
reg.best_score_

In [None]:
# Testing way 2
# We'll perform a cross-validation, but removing from training the points that are linked to the test set
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

# Offsets (in rows) used by the lagged features gap-24, gap-48 and gap-168
lags = [24,48,168]

tt_indexes = []
# random_state makes the shuffled folds identical from one run to the next
kf = KFold(5,shuffle=True,random_state=0)
# We loop through all the cv train/test sets.
# The loop variables have their own names so the hold-out split stored in
# train_index / test_index by the first approach is not overwritten.
for fold_train, fold_test in kf.split(X):
  # Positions linked to this fold's test rows through one of the lags
  linked = np.concatenate([fold_test-lag for lag in lags])
  purged_train = np.setdiff1d(fold_train, linked)
  # Exactly one (train, test) pair per fold, registered once every lag has been purged
  tt_indexes.append((purged_train,fold_test))

reg = GridSearchCV(RandomForestRegressor(random_state=0),
                   param_grid={"max_depth":np.arange(3,20)},
                   scoring="neg_mean_squared_error",
                   cv=tt_indexes)
reg.fit(X,y)

In [None]:
# Parameter setting (max_depth) retained by the cross-validated grid search
reg.best_params_

In [None]:
# Mean score of the retained setting over the cv splits. The scoring is
# "neg_mean_squared_error", so this is minus the MSE: the closer to 0, the better.
reg.best_score_