In [None]:
!pip install xgboost

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
from xgboost import XGBClassifier

# Load dataset
data = pd.read_csv("../dataset/csv_result-electricity-normalized.csv")
data.head(5)

Unnamed: 0,id,date,day,period,nswprice,nswdemand,vicprice,vicdemand,transfer,class
0,1,0.0,2,0.0,0.056443,0.439155,0.003467,0.422915,0.414912,UP
1,2,0.0,2,0.021277,0.051699,0.415055,0.003467,0.422915,0.414912,UP
2,3,0.0,2,0.042553,0.051489,0.385004,0.003467,0.422915,0.414912,UP
3,4,0.0,2,0.06383,0.045485,0.314639,0.003467,0.422915,0.414912,UP
4,5,0.0,2,0.085106,0.042482,0.251116,0.003467,0.422915,0.414912,DOWN


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45312 entries, 0 to 45311
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         45312 non-null  int64  
 1   date       45312 non-null  float64
 2   day        45312 non-null  int64  
 3   period     45312 non-null  float64
 4   nswprice   45312 non-null  float64
 5   nswdemand  45312 non-null  float64
 6   vicprice   45312 non-null  float64
 7   vicdemand  45312 non-null  float64
 8   transfer   45312 non-null  float64
 9   class      45312 non-null  object 
dtypes: float64(7), int64(2), object(1)
memory usage: 3.5+ MB


In [10]:
data['class'] = data['class'].replace({'UP': 1, 'DOWN': 0})

In [11]:
X = data.drop(columns=['class'])
y = data['class']

In [15]:
# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize XGBoost tree model
model = XGBClassifier()

In [16]:
from sklearn.model_selection import GridSearchCV

# hyperparameters grid
hyper_param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 300]
}

# grid search
grid_search = GridSearchCV(model, hyper_param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)


Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300}


In [17]:
# Train model with best hyperparameters
best_model = XGBClassifier(**best_params)
best_model.fit(X_train, y_train)

#prediction
y_pred = best_model.predict(X_test)
y_pred

array([1, 1, 0, ..., 0, 0, 0])

In [18]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9245283018867925

In [19]:
# Analyze feature importance
feature_importances = best_model.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importances
})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
feature_importance_df

Unnamed: 0,Feature,Importance
4,nswprice,0.341892
1,date,0.138076
0,id,0.138074
2,day,0.094719
5,nswdemand,0.084635
3,period,0.058769
7,vicdemand,0.051192
6,vicprice,0.050101
8,transfer,0.042542
