Purpose of this notebook is to see if a model can use the half hourly cyclical features to predict the half hour of the day

In [1]:
import sys
sys.path.append('..')

import pandas as pd

from utils.pipelines import HalfHourlyCyclicalFeatures

#  Feature generation

In [2]:
target = pd.DataFrame(
    index=pd.DatetimeIndex(start='01/01/2018', end='31/12/2018', freq='30min')
)

In [3]:
target.loc[:, 'half_hour'] = target.index.hour * 2 + (target.index.minute)/30
target.head()

Unnamed: 0,half_hour
2018-01-01 00:00:00,0.0
2018-01-01 00:30:00,1.0
2018-01-01 01:00:00,2.0
2018-01-01 01:30:00,3.0
2018-01-01 02:00:00,4.0


In [4]:
target.head(8)

Unnamed: 0,half_hour
2018-01-01 00:00:00,0.0
2018-01-01 00:30:00,1.0
2018-01-01 01:00:00,2.0
2018-01-01 01:30:00,3.0
2018-01-01 02:00:00,4.0
2018-01-01 02:30:00,5.0
2018-01-01 03:00:00,6.0
2018-01-01 03:30:00,7.0


In [5]:
features = HalfHourlyCyclicalFeatures().transform(target)
import numpy as np

noise = np.random.normal(size=target.shape)

features.loc[:, 'noise'] = noise

features.head()

Unnamed: 0,sin_hh,cos_hh,noise
2018-01-01 00:00:00,0.0,1.0,1.375968
2018-01-01 00:30:00,0.130526,0.991445,-0.075366
2018-01-01 01:00:00,0.258819,0.965926,0.411108
2018-01-01 01:30:00,0.382683,0.92388,1.028611
2018-01-01 02:00:00,0.5,0.866025,1.22132


# Prediction

In [6]:
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score

In [7]:
from collections import namedtuple

Result = namedtuple('results', ['model', 'pred', 'score'])


def fit_classifier(model_, model_params, features, target, score_function=accuracy_score):
    """
    Fits a sklearn classifier model
    
    args
        model (sklearn estimator) uninistated
        model_params (dict)
        features (np.array)
        target (np.array)
        score_function (sklearn metrics)
        
    return
        model (sklearn estimator)
        pred (np.array)
        score (float)
        
    """
    model = model_(**model_params)
    
    pred = model.fit(features, target.flatten()).predict(features)
    
    score = score_function(target, pred)
    
    print(repr(model), )
    
    return Result(model, pred, score)
    

In [8]:
models = [
    (GaussianNB, {}),
    (RandomForestClassifier, {'n_estimators': 10})
]

results = []
for model, model_params in models:
    results.append(fit_classifier(model,
                     model_params,
                     features.values,
                     target.values))
    

GaussianNB(priors=None)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [9]:
results[1].model.feature_importances_

array([0.4906305 , 0.48379054, 0.02557896])

In [10]:
features.columns

Index(['sin_hh', 'cos_hh', 'noise'], dtype='object')

In [11]:
results[1].score

1.0