### Load the Dataset and Examine

In [35]:
import pandas as pd 
import numpy as np 
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV

In [36]:
path = '../Datasets/train.csv'
df = pd.read_csv(path)
df['Tarih'] = pd.to_datetime(df['Tarih'])
df['date'] = df['Tarih'].dt.date;df.head()

Unnamed: 0,Tarih,Dağıtılan Enerji (MWh),date
0,2018-01-01 00:00:00,1593.944216,2018-01-01
1,2018-01-01 01:00:00,1513.933887,2018-01-01
2,2018-01-01 02:00:00,1402.612637,2018-01-01
3,2018-01-01 03:00:00,1278.527266,2018-01-01
4,2018-01-01 04:00:00,1220.697701,2018-01-01


In [37]:
kesintiler = pd.read_csv('../Datasets/med.csv')
kesintiler['kesinti'] = 1
kesintiler['Tarih'] = pd.to_datetime(kesintiler['Tarih']);kesintiler.head()

Unnamed: 0,Tarih,kesinti
0,2019-12-23,1
1,2019-06-12,1
2,2019-01-25,1
3,2019-09-25,1
4,2019-03-29,1


In [38]:
kesintiler[~kesintiler['Tarih'].isin(df['date'])]

Unnamed: 0,Tarih,kesinti


kesintilere ilişkin veriler sadece train veri setine ait verileri içeriyor. Submission yaparken kesinti feature'ını ekleyemeyeceğimiz için kesintileri bu kısımda dahil etmek doğru olmayacaktır.

In [39]:
df.drop(['date'], axis=1, inplace=True);df.head(3)

Unnamed: 0,Tarih,Dağıtılan Enerji (MWh)
0,2018-01-01 00:00:00,1593.944216
1,2018-01-01 01:00:00,1513.933887
2,2018-01-01 02:00:00,1402.612637


In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40152 entries, 0 to 40151
Data columns (total 2 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Tarih                   40152 non-null  datetime64[ns]
 1   Dağıtılan Enerji (MWh)  40152 non-null  float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 627.5 KB


In [41]:
weather = pd.read_csv('../Datasets/weather.csv')
weather['date'] = pd.to_datetime(weather['date']);weather.head()

Unnamed: 0,date,Min_temperature,Max_temperature
0,2018-01-01,4,12
1,2018-01-02,7,14
2,2018-01-03,9,14
3,2018-01-04,8,13
4,2018-01-05,7,12


<hr>

#### Feature Extraction Function

In [42]:
def extractor(df, date_col, weather, mapping=False):
    # dont show warnings
    import warnings
    warnings.filterwarnings('ignore')
    
    df[date_col] = pd.to_datetime(df[date_col])
    
    # Extracting date features
    df['Saat'] = df[date_col].dt.hour
    df['Aylık Gün'] = df[date_col].dt.day
    df['Yıllık Gün'] = df[date_col].dt.dayofyear
    df['Haftanın günü'] = df[date_col].dt.day_name()
    df['Hafta'] = df[date_col].dt.week
    df['Ay'] = df[date_col].dt.month
    df['Çeyreklik'] = df[date_col].dt.quarter
    df['Yıl'] = df[date_col].dt.year
    
    # Extracting holiday features
    import holidays
    tr_holidays = holidays.Turkey()
    df['Özel Gün'] = df[date_col].apply(lambda x: x in tr_holidays)
    
    # Extracting seasonality features
    def get_season(month):
        if month >= 3 and month <= 5:
            return 'Spring'
        elif month >= 6 and month <= 8:
            return 'Summer'
        elif month >= 9 and month <= 11:
            return 'Autumn'
        else:
            return 'Winter'
    
    df['Mevsim'] = df[date_col].dt.month.apply(get_season)
    
    # Adding weather features
    weather['date'] = pd.to_datetime(weather['date']).dt.date
    df['date_no_time'] = df[date_col].dt.date
    df = pd.merge(df, weather, left_on='date_no_time', right_on='date', how='left')
    df.drop(['date_no_time','date'], axis=1, inplace=True)
    
    # Extracting weekend features
    import datetime
    def is_weekend(date_str):
        date_obj = datetime.datetime.strptime(date_str, '%Y-%m-%d').date()
        return 1 if date_obj.weekday() >= 5 else 0
    
    df['Hafta Sonu'] = df[date_col].dt.strftime('%Y-%m-%d').apply(is_weekend)
    
    if mapping == True:
        ozel_gun = {
            True:1,
            False:0
        }

        mevsim = {
            "Winter":1,
            "Spring":2,
            "Summer":3,
            "Autumn":4
        }

        hafta_gunleri = {
            "Monday":1,
            "Tuesday":2,
            "Wednesday":3,
            "Thursday":4,
            "Friday":5,
            "Saturday":6,
            "Sunday":7
        }
        
        df['Haftanın günü'] = df['Haftanın günü'].map(hafta_gunleri)
        df['Özel Gün'] = df['Özel Gün'].map(ozel_gun)
        df['Mevsim'] = df['Mevsim'].map(mevsim)

        return df
    
    else:
        return df

<hr>

set the dataset settings

In [43]:
train = df.iloc[:int(len(df)*.75),:]
test = df.iloc[int(len(df)*.75):,:]

train = extractor(train, 'Tarih', weather, mapping=True)
test = extractor(test, 'Tarih', weather, mapping=True)

In [44]:
X_train = train.drop(['Tarih','Dağıtılan Enerji (MWh)'], axis=1)
y_train = train['Dağıtılan Enerji (MWh)']

X_test = test.drop(['Tarih','Dağıtılan Enerji (MWh)'], axis=1)
y_test = test['Dağıtılan Enerji (MWh)']

### XGBoost Model

In [45]:
# XGBoost
import xgboost as xgb
xgb_model = xgb.XGBRegressor()

In [46]:
# Hiperparametrelerin belirlenmesi
param_grid = {'n_estimators': [100, 500, 1000, 2000],
              'learning_rate': [0.01, 0.05, 0.1, 1.0],
              'max_depth': [3, 5, 7],
              'early_stopping_rounds': [20, 30, 50, 100, 200]}

grid_search = GridSearchCV(xgb_model, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        verbose=2000)

print("En iyi parametreler: ", grid_search.best_params_)
print("En iyi skor: ", grid_search.best_score_)


In [47]:
xgb_model = xgb.XGBRegressor(early_stopping_rounds=30, learning_rate = 0.05, max_depth=5, n_estimators=500)
xgb_model.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        verbose=100)

[0]	validation_0-rmse:1757.03606	validation_1-rmse:1896.54325
[100]	validation_0-rmse:88.76055	validation_1-rmse:139.61341
[200]	validation_0-rmse:70.51104	validation_1-rmse:126.36895
[263]	validation_0-rmse:64.91381	validation_1-rmse:125.93168


In [48]:
xgb_preds = xgb_model.predict(X_test)

In [49]:
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
mape = mean_absolute_percentage_error(y_test, xgb_preds)
rmse = np.sqrt(mean_squared_error(y_test,xgb_preds))

In [50]:
f"MAPE: {mape:.2f} RMSE: {rmse:.2f}"

'MAPE: 0.05 RMSE: 125.57'

In [51]:
# predictions and real values
fig = px.line(x=y_test.index, y=y_test.values, title='Gerçek Değerler')
fig.add_scatter(x=y_test.index, y=xgb_preds, mode='lines', name='Tahminler')

# set name to axis
fig.update_xaxes(title_text='Tarih')
fig.update_yaxes(title_text='Dağıtılan Enerji (MWh)')
fig.show()

<hr>

### Random Forest

In [52]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()

In [53]:
# random forest hyperparameter tuning
param_grid = {'n_estimators': [100, 500, 1000, 2000],
                'max_depth': [3, 5, 7],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]}

grid_search = GridSearchCV(rf, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("En iyi parametreler: ", grid_search.best_params_)
print("En iyi skor: ", grid_search.best_score_)

In [54]:
rf = RandomForestRegressor(max_depth=7, min_samples_leaf=1, min_samples_split=2, n_estimators=1000)
rf.fit(X_train, y_train)

In [55]:
rf_preds = rf.predict(X_test)

In [56]:
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
mape = mean_absolute_percentage_error(y_test, rf_preds)
rmse = np.sqrt(mean_squared_error(y_test,rf_preds))
f"MAPE: {mape:.2f} RMSE: {rmse:.2f}"

'MAPE: 0.07 RMSE: 188.10'

In [57]:
# predictions and real values
fig = px.line(x=y_test.index, y=y_test.values, title='Gerçek Değerler')
fig.add_scatter(x=y_test.index, y=rf_preds, mode='lines', name='Tahminler')

# set name to axis
fig.update_xaxes(title_text='Tarih')
fig.update_yaxes(title_text='Dağıtılan Enerji (MWh)')
fig.show()

<hr>

### Robust Regression

In [58]:
#robust regression
from sklearn.linear_model import HuberRegressor, RANSACRegressor, TheilSenRegressor

In [59]:
hr = HuberRegressor()
hr.fit(X_train, y_train)
hr_preds = hr.predict(X_test)

In [60]:
rr = RANSACRegressor()
rr.fit(X_train, y_train)
rr_preds = rr.predict(X_test)

In [61]:
tr = TheilSenRegressor()
tr.fit(X_train, y_train)
tr_preds = tr.predict(X_test)

In [62]:
# compare models
fig = px.line(x=y_test.index, y=y_test.values, title='Gerçek Değerler')
fig.add_scatter(x=y_test.index, y=hr_preds, mode='lines', name='Huber Regressor')
fig.add_scatter(x=y_test.index, y=rr_preds, mode='lines', name='RANSAC Regressor')
fig.add_scatter(x=y_test.index, y=tr_preds, mode='lines', name='TheilSen Regressor')
fig.update_xaxes(title_text='Tarih')
fig.update_yaxes(title_text='Dağıtılan Enerji (MWh)')
fig.show()

In [63]:
fig = px.line(x=y_test.index, y=y_test.values, title='Gerçek Değerler')
fig.add_scatter(x=y_test.index, y=hr_preds, mode='lines', name='Huber Regressor')
fig.show()

In [64]:
# mape, rmse 
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
mape = mean_absolute_percentage_error(y_test, hr_preds)
rmse = np.sqrt(mean_squared_error(y_test,hr_preds))
f"MAPE: {mape:.2f} RMSE: {rmse:.2f}"

'MAPE: 0.15 RMSE: 402.94'

### AdaBoost

In [65]:
# adaboost
from sklearn.ensemble import AdaBoostRegressor
ada = AdaBoostRegressor()

In [66]:
param_grid = {'n_estimators': [100, 500, 1000, 2000],
            'learning_rate': [0.01, 0.05, 0.1, 1.0],
            'loss': ['linear', 'square', 'exponential']}

grid_search = GridSearchCV(ada, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("En iyi parametreler: ", grid_search.best_params_)
print("En iyi skor: ", grid_search.best_score_)

In [67]:
ada = AdaBoostRegressor(learning_rate=1.0, loss='square', n_estimators=1000)
ada.fit(X_train, y_train)
ada_preds = ada.predict(X_test)

In [68]:
fig = px.line(x=y_test.index, y=y_test.values, title='Gerçek Değerler')
fig.add_scatter(x=y_test.index, y=ada_preds, mode='lines', name='AdaBoost Regressor')
fig.update_xaxes(title_text='Tarih')
fig.update_yaxes(title_text='Dağıtılan Enerji (MWh)')
fig.show()

In [69]:
mape = mean_absolute_percentage_error(y_test, ada_preds)
rmse = np.sqrt(mean_squared_error(y_test,ada_preds))
f"MAPE: {mape:.2f} RMSE: {rmse:.2f}"

'MAPE: 0.07 RMSE: 185.93'

#### Model Evaluation

|Model|MAPE|RMSE|
|:---|---:|--:|
|XGBoost|0.05|125.57|
|Random Forest|0.07|188.38|
|AdaBoost|0.07|187.43|
|Huber Regressor|0.15|402.94|