In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('electricity-normalized.csv')

In [3]:
df.head()

Unnamed: 0,date,day,period,nswprice,nswdemand,vicprice,vicdemand,transfer,class
0,0.0,2,0.0,0.056443,0.439155,0.003467,0.422915,0.414912,UP
1,0.0,2,0.021277,0.051699,0.415055,0.003467,0.422915,0.414912,UP
2,0.0,2,0.042553,0.051489,0.385004,0.003467,0.422915,0.414912,UP
3,0.0,2,0.06383,0.045485,0.314639,0.003467,0.422915,0.414912,UP
4,0.0,2,0.085106,0.042482,0.251116,0.003467,0.422915,0.414912,DOWN


In [4]:
df.dtypes

date         float64
day            int64
period       float64
nswprice     float64
nswdemand    float64
vicprice     float64
vicdemand    float64
transfer     float64
class         object
dtype: object

# Data cleaning

In [5]:
import dabl
clean_df = dabl.clean(df,verbose=2)

Detected feature types:
7 float, 1 int, 1 object, 0 date, 0 other
Interpreted as:
continuous      7
dirty_float     0
low_card_int    1
categorical     1
date            0
free_string     0
useless         0
dtype: int64


# Preprocessing

In [6]:
ep = dabl.EasyPreprocessor().fit(clean_df.drop(['class'],axis=1),clean_df['class'])

In [7]:
ep.ct_

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.1,
                  transformer_weights=None,
                  transformers=[('continuous',
                                 Pipeline(memory=None,
                                          steps=[('simpleimputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='median',
                                                                verbose=0)),
                                                 ('standardscaler',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
      

# Simple Prototypes

In [8]:
from dabl import SimpleClassifier
SimpleClassifier().fit(clean_df,target_col='class')

Running DummyClassifier(strategy='prior')
accuracy: 0.575 average_precision: 0.425 roc_auc: 0.500 recall_macro: 0.500 f1_macro: 0.365
=== new best DummyClassifier(strategy='prior') (using recall_macro):
accuracy: 0.575 average_precision: 0.425 roc_auc: 0.500 recall_macro: 0.500 f1_macro: 0.365

Running GaussianNB()
accuracy: 0.733 average_precision: 0.778 roc_auc: 0.810 recall_macro: 0.702 f1_macro: 0.692
=== new best GaussianNB() (using recall_macro):
accuracy: 0.733 average_precision: 0.778 roc_auc: 0.810 recall_macro: 0.702 f1_macro: 0.692

Running MultinomialNB()
accuracy: 0.595 average_precision: 0.554 roc_auc: 0.624 recall_macro: 0.530 f1_macro: 0.448
Running DecisionTreeClassifier(class_weight='balanced', max_depth=1)
accuracy: 0.757 average_precision: 0.641 roc_auc: 0.733 recall_macro: 0.733 f1_macro: 0.710
=== new best DecisionTreeClassifier(class_weight='balanced', max_depth=1) (using recall_macro):
accuracy: 0.757 average_precision: 0.641 roc_auc: 0.733 recall_macro: 0.733 f

SimpleClassifier(random_state=None, refit=True, verbose=1)

# Automatic Model Search

In [9]:
from dabl.models import AnyClassifier
from sklearn.model_selection import train_test_split
df_train,df_test = train_test_split(clean_df)
ac = AnyClassifier().fit(df_train,target_col='class')

best classifier:  HistGradientBoostingClassifier(l2_regularization=1e-05, learning_rate=0.1,
                               loss='auto', max_bins=255, max_depth=16,
                               max_iter=400, max_leaf_nodes=64,
                               min_samples_leaf=10, n_iter_no_change=None,
                               random_state=58027, scoring=None, tol=1e-07,
                               validation_fraction=0.1, verbose=0)
best score: 0.931


# Model Explanation

In [10]:
dabl.explain(ac,X_val=df_test,target_col='class')

              precision    recall  f1-score   support

        DOWN       0.94      0.95      0.95      6511
          UP       0.93      0.92      0.93      4817

    accuracy                           0.94     11328
   macro avg       0.94      0.94      0.94     11328
weighted avg       0.94      0.94      0.94     11328

[[6190  321]
 [ 367 4450]]


  warn("Can't plot roc curve, install sklearn 0.22-dev")


ValueError: The underlying estimator HistGradientBoostingClassifier has no `coef_` or `feature_importances_` attribute. Either pass a fitted estimator to SelectFromModel or call fit before calling transform.

# Model Prediction

In [11]:
ac.predict(clean_df.drop(['class'],axis=1).sample(10))

array(['DOWN', 'UP', 'DOWN', 'DOWN', 'DOWN', 'DOWN', 'DOWN', 'UP', 'UP',
       'UP'], dtype=object)

In [15]:
import joblib

In [16]:
joblib.dump(ac,'ac.pkl')

['ac.pkl']

In [18]:
model = joblib.load('ac.pkl')

In [20]:
model.predict(clean_df.drop(['class'],axis=1).sample(10))

array(['UP', 'UP', 'UP', 'DOWN', 'DOWN', 'UP', 'DOWN', 'DOWN', 'UP',
       'DOWN'], dtype=object)