In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from skmultiflow.data import FileStream
from skmultiflow.trees import HoeffdingTreeClassifier
from skmultiflow.evaluation import EvaluatePrequential, EvaluateHoldout

In [13]:
data = pd.read_csv('data/electricity-normalized.csv').values
np.random.seed(0)
np.random.shuffle(data)
data_train, data_test = data[:35000, :], data[35000:, :]

In [14]:
# data_stream = FileStream("data/electricity-normalized.csv")

In [15]:
ht = HoeffdingTreeClassifier()

In [16]:
# evaluator = EvaluateHoldout(test_size=10000)
# evaluator.evaluate(stream=data_stream, model=ht)

In [17]:
ht.fit(data_train[:, :-1], data_train[:, -1])

HoeffdingTreeClassifier(binary_split=False, grace_period=200,
                        leaf_prediction='nba', max_byte_size=33554432,
                        memory_estimate_period=1000000, nb_threshold=0,
                        no_preprune=False, nominal_attributes=None,
                        remove_poor_atts=False, split_confidence=1e-07,
                        split_criterion='info_gain', stop_mem_management=False,
                        tie_threshold=0.05)

In [18]:
predictions = ht.predict(data_test[:, :-1])

In [19]:
(predictions == data_test[:, -1]).sum()/len(predictions)

0.7259503491078355

In [24]:
ht.get_rules_description()

'Att (3) <= 1.730 and Att (3) <= 0.450 and Att (6) <= 7.270 and Att (2) <= 3.450 | class: 1\nAtt (3) <= 1.730 and Att (3) <= 0.450 and Att (6) <= 7.270 and Att (2) > 3.450 and Att (1) <= 3.270 and Att (6) <= 0.180 | class: 1\nAtt (3) <= 1.730 and Att (3) <= 0.450 and Att (6) <= 7.270 and Att (2) > 3.450 and Att (1) <= 3.270 and Att (6) > 0.180 | class: 0\nAtt (3) <= 1.730 and Att (3) <= 0.450 and Att (6) <= 7.270 and Att (2) > 3.450 and Att (1) > 3.270 | class: 0\nAtt (3) <= 1.730 and Att (3) <= 0.450 and Att (6) > 7.270 and Att (0) <= 2.360 | class: 0\nAtt (3) <= 1.730 and Att (3) <= 0.450 and Att (6) > 7.270 and Att (0) > 2.360 | class: 0\nAtt (3) <= 1.730 and Att (3) > 0.450 and Att (0) <= 1.090 and Att (7) <= 5.000 | class: 1\nAtt (3) <= 1.730 and Att (3) > 0.450 and Att (0) <= 1.090 and Att (7) > 5.000 | class: 1\nAtt (3) <= 1.730 and Att (3) > 0.450 and Att (0) > 1.090 and Att (0) <= 5.270 and Att (2) <= 5.180 | class: 1\nAtt (3) <= 1.730 and Att (3) > 0.450 and Att (0) > 1.090 a

In [12]:
data = pd.read_csv("data/electricity-normalized.csv")

In [13]:
data.columns

Index(['date', 'day', 'period', 'nswprice', 'nswdemand', 'vicprice',
       'vicdemand', 'transfer', 'class'],
      dtype='object')

In [14]:
data['class']

0          UP
1          UP
2          UP
3          UP
4        DOWN
         ... 
45307    DOWN
45308    DOWN
45309    DOWN
45310      UP
45311    DOWN
Name: class, Length: 45312, dtype: object

In [15]:
for col in data.columns[:-1]:
    data[col] = pd.factorize(pd.cut(data[col], bins=20))[0]


In [16]:
data['class'] = pd.factorize(data['class'])[0]

In [17]:
data.to_csv('data/electricity-normalized.csv', index=False)

In [11]:
data.shape

(45312, 9)