In [None]:
## importing important libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import arff

import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [34]:
## loading the data
## loadarff returns a (records, metadata) pair; the raw records get their own
## name so that `df` only ever refers to the DataFrame
raw_records, meta = arff.loadarff('electricity-normalized.arff')
df = pd.DataFrame(raw_records)

## converting bytes to strings
## bytes values can only live in object-dtype columns, so decode those column
## by column instead of running the deprecated DataFrame.applymap over every
## numeric cell as well; non-bytes values are passed through unchanged
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].map(lambda x: x.decode() if isinstance(x, bytes) else x)

In [35]:
## previewing the loaded frame; `class` still holds the string labels UP / DOWN
df

Unnamed: 0,date,day,period,nswprice,nswdemand,vicprice,vicdemand,transfer,class
0,0.0000,2,0.000000,0.056443,0.439155,0.003467,0.422915,0.414912,UP
1,0.0000,2,0.021277,0.051699,0.415055,0.003467,0.422915,0.414912,UP
2,0.0000,2,0.042553,0.051489,0.385004,0.003467,0.422915,0.414912,UP
3,0.0000,2,0.063830,0.045485,0.314639,0.003467,0.422915,0.414912,UP
4,0.0000,2,0.085106,0.042482,0.251116,0.003467,0.422915,0.414912,DOWN
...,...,...,...,...,...,...,...,...,...
45307,0.9158,7,0.914894,0.044224,0.340672,0.003033,0.255049,0.405263,DOWN
45308,0.9158,7,0.936170,0.044884,0.355549,0.003072,0.241326,0.420614,DOWN
45309,0.9158,7,0.957447,0.043593,0.340970,0.002983,0.247799,0.362281,DOWN
45310,0.9158,7,0.978723,0.066651,0.329366,0.004630,0.345417,0.206579,UP


## Attribute information
- Date: date between 7 May 1996 and 5 December 1998, here normalized between 0 and 1
- Day: day of the week (1-7)
- Period: time of the measurement (1-48) in half-hour intervals over 24 hours, here normalized between 0 and 1
- NSWprice: New South Wales electricity price, normalized between 0 and 1
- NSWdemand: New South Wales electricity demand, normalized between 0 and 1
- VICprice: Victoria electricity price, normalized between 0 and 1
- VICdemand: Victoria electricity demand, normalized between 0 and 1
- transfer: scheduled electricity transfer between both states, normalized between 0 and 1

In [10]:
## label encoding
## learn the distinct class labels first, then overwrite the column with their
## integer codes (in the displayed frames DOWN becomes 0 and UP becomes 1)
label_encoder = LabelEncoder()
label_encoder.fit(df['class'])
df['class'] = label_encoder.transform(df['class'])

In [11]:
## previewing the frame again to check that `class` is now integer-coded
df

Unnamed: 0,date,day,period,nswprice,nswdemand,vicprice,vicdemand,transfer,class
0,0.0000,2,0.000000,0.056443,0.439155,0.003467,0.422915,0.414912,1
1,0.0000,2,0.021277,0.051699,0.415055,0.003467,0.422915,0.414912,1
2,0.0000,2,0.042553,0.051489,0.385004,0.003467,0.422915,0.414912,1
3,0.0000,2,0.063830,0.045485,0.314639,0.003467,0.422915,0.414912,1
4,0.0000,2,0.085106,0.042482,0.251116,0.003467,0.422915,0.414912,0
...,...,...,...,...,...,...,...,...,...
45307,0.9158,7,0.914894,0.044224,0.340672,0.003033,0.255049,0.405263,0
45308,0.9158,7,0.936170,0.044884,0.355549,0.003072,0.241326,0.420614,0
45309,0.9158,7,0.957447,0.043593,0.340970,0.002983,0.247799,0.362281,0
45310,0.9158,7,0.978723,0.066651,0.329366,0.004630,0.345417,0.206579,1


## Feature Importance

In [None]:
## Defining features and target
## the target is the `class` column; every remaining column is a feature
y = df['class']
X = df.drop(columns='class')

## Splitting data
## 80% train / 20% test, with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [37]:
## Training the Random Forest model
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

## feature importance
importances = rf_model.feature_importances_

## Summarizing
## pair every score with its column name instead of indexing X.columns by position
for idx, (col_name, score) in enumerate(zip(X.columns, importances)):
    print('Feature: %0d, %s, Score: %.5f' % (idx, col_name, score))

Feature: 0, date, Score: 0.17859
Feature: 1, day, Score: 0.05606
Feature: 2, period, Score: 0.10527
Feature: 3, nswprice, Score: 0.31125
Feature: 4, nswdemand, Score: 0.12519
Feature: 5, vicprice, Score: 0.10794
Feature: 6, vicdemand, Score: 0.06117
Feature: 7, transfer, Score: 0.05454


In [15]:
## listing the column names (the features plus the `class` target)
df.columns

Index(['date', 'day', 'period', 'nswprice', 'nswdemand', 'vicprice',
       'vicdemand', 'transfer', 'class'],
      dtype='object')

# Modelling
- The first model is XGBoost without any hyperparameter tuning
- The second model is XGBoost tuned with RandomizedSearchCV
- The third model is a bagging classifier without hyperparameter tuning

In [39]:
## Training model
## the target has two classes (UP / DOWN encoded as 1 / 0), so the binary
## 'logloss' metric is configured; 'mlogloss' is the multiclass variant and
## does not match this target. No eval_set is passed to fit, so the metric
## choice is not expected to change the fitted model.
## NOTE(review): use_label_encoder is deprecated in recent XGBoost releases and
## is kept here only for older versions -- confirm against the installed version
basic_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
basic_model.fit(X_train, y_train)

## Predicting and evaluating
## accuracy on the held-out test split, printed as a percentage
y_pred = basic_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy * 100)



91.41564603332229


In [17]:
## Defining the hyperparameter search space
## (candidate values are sampled at random, not searched exhaustively)
param_dist = dict(
    n_estimators=np.arange(50, 400, 50),  ## 50, 100, ..., 350
    learning_rate=np.linspace(0.01, 0.2, 10),  ## 10 evenly spaced values
    max_depth=np.arange(3, 10, 1),  ## 3 through 9
    colsample_bytree=np.linspace(0.5, 1, 5),
    subsample=np.linspace(0.6, 1, 5),
)

## Initializing and fitting RandomizedSearchCV
## 10 sampled candidates, each scored by 5-fold cross-validated accuracy
random_search = RandomizedSearchCV(
    basic_model,
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

## best_score_ is reported as a fraction (not a percentage)
print("Best parameters found: ", random_search.best_params_)
print("Best accuracy found: ", random_search.best_score_)



Best parameters found:  {'subsample': 1.0, 'n_estimators': 300, 'max_depth': 8, 'learning_rate': 0.1788888888888889, 'colsample_bytree': 1.0}
Best accuracy found:  0.930811753345289


In [20]:
## Initializing the base model
base_estimator = DecisionTreeClassifier(random_state=42)

## Bagging classifier
## the wrapped estimator is passed positionally: its keyword was renamed from
## `base_estimator` to `estimator` in newer scikit-learn releases, and the old
## keyword was later removed, so the positional form works on both
bagging_model = BaggingClassifier(base_estimator,
                                  n_estimators=100,
                                  random_state=42)

## model training
bagging_model.fit(X_train, y_train)

## Prediction and evaluation
## accuracy on the held-out test split, printed as a percentage
y_pred = bagging_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of bagging model: {:.2f}%".format(accuracy * 100))



Accuracy of bagging model: 92.39%
