In [None]:
!pip install xgboost

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [4]:
# Read the normalized electricity CSV into a DataFrame and preview its first five rows.
csv_path = "./dataset/csv_result-electricity-normalized.csv"
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,id,date,day,period,nswprice,nswdemand,vicprice,vicdemand,transfer,class
0,1,0.0,2,0.0,0.056443,0.439155,0.003467,0.422915,0.414912,UP
1,2,0.0,2,0.021277,0.051699,0.415055,0.003467,0.422915,0.414912,UP
2,3,0.0,2,0.042553,0.051489,0.385004,0.003467,0.422915,0.414912,UP
3,4,0.0,2,0.06383,0.045485,0.314639,0.003467,0.422915,0.414912,UP
4,5,0.0,2,0.085106,0.042482,0.251116,0.003467,0.422915,0.414912,DOWN


In [6]:
# Work on a deep copy of the first 2500 rows so the loaded frame `data` stays untouched.
new_data = data.iloc[:2500].copy(deep=True)

# Encode the target labels as integers: 'UP' -> 1, 'DOWN' -> 0.
# `Series.map` is used instead of `Series.replace` because replacing strings
# with numbers relies on pandas silently downcasting the object column, which
# is deprecated (FutureWarning since pandas 2.2). The explicit cast keeps the
# int64 dtype and fails loudly if a label other than 'UP'/'DOWN' appears
# (unmapped labels become NaN, which cannot be cast to an integer).
label_to_int = {'UP': 1, 'DOWN': 0}
new_data['class'] = new_data['class'].map(label_to_int).astype('int64')
new_data.head()

Unnamed: 0,id,date,day,period,nswprice,nswdemand,vicprice,vicdemand,transfer,class
0,1,0.0,2,0.0,0.056443,0.439155,0.003467,0.422915,0.414912,1
1,2,0.0,2,0.021277,0.051699,0.415055,0.003467,0.422915,0.414912,1
2,3,0.0,2,0.042553,0.051489,0.385004,0.003467,0.422915,0.414912,1
3,4,0.0,2,0.06383,0.045485,0.314639,0.003467,0.422915,0.414912,1
4,5,0.0,2,0.085106,0.042482,0.251116,0.003467,0.422915,0.414912,0


In [10]:
# Separate the label column from the predictors (every other column).
target_column = 'class'
y = new_data[target_column]
X = new_data.drop(columns=[target_column])
# NOTE(review): X still contains the `id` column, which looks like a running
# row identifier in the preview above; confirm it is meant to be a model feature.
display('Independent Variables:', X.head())
display('Dependent Variable:', y.head())

'Independent Variables:'

Unnamed: 0,id,date,day,period,nswprice,nswdemand,vicprice,vicdemand,transfer
0,1,0.0,2,0.0,0.056443,0.439155,0.003467,0.422915,0.414912
1,2,0.0,2,0.021277,0.051699,0.415055,0.003467,0.422915,0.414912
2,3,0.0,2,0.042553,0.051489,0.385004,0.003467,0.422915,0.414912
3,4,0.0,2,0.06383,0.045485,0.314639,0.003467,0.422915,0.414912
4,5,0.0,2,0.085106,0.042482,0.251116,0.003467,0.422915,0.414912


'Dependent Variable:'

0    1
1    1
2    1
3    1
4    0
Name: class, dtype: int64

In [11]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed random_state makes the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:

# Expand the inputs with pairwise interaction terms ("composite columns"):
# degree=2 together with interaction_only=True adds products of two distinct
# columns, and include_bias=False leaves out the constant column.
poly = PolynomialFeatures(
    degree=2,
    interaction_only=True,
    include_bias=False,
)
# The expansion is fitted on the training split, then applied unchanged to the test split.
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

In [13]:
# Baseline model: an XGBoost classifier left at its default hyperparameters.
model = XGBClassifier()

# Fit it on the interaction-expanded training matrix.
model.fit(X=X_train_poly, y=y_train)

In [24]:
# Importance scores reported by the fitted model, one per expanded column.
feature_importances = model.feature_importances_

# Pair every expanded feature name with its score and order the table from
# the highest score to the lowest.
importance_by_feature = {
    'Feature': poly.get_feature_names_out(X.columns),
    'Importance': feature_importances,
}
feature_importance_df = pd.DataFrame(importance_by_feature).sort_values(
    by='Importance', ascending=False
)
display('Feature Composition', feature_importance_df.head(30))

'Feature Composition'

Unnamed: 0,Feature,Importance
4,nswprice,0.299251
0,id,0.105232
17,date day,0.095884
1,date,0.061281
9,id date,0.052538
10,id day,0.051888
25,day nswprice,0.044185
19,date nswprice,0.042363
12,id nswprice,0.038854
3,period,0.038326


In [25]:
# Hand-picked subset of the expanded features used for the reduced model.
# An automatic alternative (left disabled) would take the ten highest-scoring rows:
#     feature_importance_df.nlargest(10, 'Importance')['Feature'].values
# The trailing numbers are the rank annotations carried over from the original
# notes; the list itself stores 'id' ahead of 'nswprice'.
top_features = [
    'id',                  # 2
    'nswprice',            # 1
    'date day',            # 3
    'day nswprice',        # 4
    'period',              # 5
    'nswprice nswdemand',  # 6
    'period vicprice',     # 7
    'nswdemand transfer',  # 8
]
top_features

['id',
 'nswprice',
 'date day',
 'day nswprice',
 'period',
 'nswprice nswdemand',
 'period vicprice',
 'nswdemand transfer']

In [28]:
# Keep only the hand-picked columns of the polynomial feature matrices.
# The expanded feature names are computed once and reused for both splits.
poly_feature_names = poly.get_feature_names_out(X.columns)

# np.isin silently ignores names it cannot find, so a typo in `top_features`
# would quietly shrink the feature set; fail loudly instead.
unknown_features = sorted(set(top_features) - set(poly_feature_names))
if unknown_features:
    raise ValueError(f"top_features not produced by `poly`: {unknown_features}")

# Boolean column mask. The selected columns keep the order of the polynomial
# expansion, not the order in which names appear in `top_features`.
top_feature_mask = np.isin(poly_feature_names, top_features)
X_train_top = X_train_poly[:, top_feature_mask]
X_test_top = X_test_poly[:, top_feature_mask]

In [29]:
# Refit a fresh, default-configured XGBoost classifier on the reduced feature matrix.
model_top = XGBClassifier()
model_top.fit(X=X_train_top, y=y_train)

In [30]:
# Score the reduced model on the held-out test split and report its accuracy.
y_pred_top = model_top.predict(X_test_top)
accuracy_top = accuracy_score(y_true=y_test, y_pred=y_pred_top)
print("Accuracy with top features:", accuracy_top)


Accuracy with top features: 0.982
