In [18]:
import pandas as pd

import matplotlib.pyplot as plt

from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from datetime import datetime, timedelta

In [49]:
path = 'D:\\Google Drive\\Datasets\\3- Time Series Datasets\\Residential_Energy_Consumption.csv'

data = pd.read_csv(path)

data.head()

Unnamed: 0,Month,Total Energy Consumed by the Residential Sector
0,1973-01-01,1957.641
1,1973-02-01,1712.143
2,1973-03-01,1510.079
3,1973-04-01,1183.421
4,1973-05-01,1006.326


In [35]:
data.shape

(588, 2)

In [36]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 588 entries, 0 to 587
Data columns (total 2 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   Month                                            588 non-null    object 
 1   Total Energy Consumed by the Residential Sector  588 non-null    float64
dtypes: float64(1), object(1)
memory usage: 9.3+ KB


In [37]:
data.tail()

Unnamed: 0,Month,Total Energy Consumed by the Residential Sector
583,2021-08-01,1755.212
584,2021-09-01,1456.901
585,2021-10-01,1315.799
586,2021-11-01,1644.306
587,2021-12-01,2032.352


In [51]:
data = data.rename(columns={'Total Energy Consumed by the Residential Sector': 'energy'})

In [52]:
split_ratio = 0.8

train_size = int(len(data)*split_ratio)

train = data.iloc[:train_size]
test = data.iloc[train_size:]

In [53]:
X = train.drop('energy', axis=1)
y = train['energy']

In [54]:
def data_prep(dt_):
    X = dt_.copy()
    X['Month'] = pd.to_datetime(X['Month'], utc=True)
    y = X['energy']

    X['month'] = X['Month'].dt.month
    X['quarter'] = X['Month'].dt.quarter
    X['year'] = X['Month'].dt.year

    feature_names = ['month', 'quarter', 'year']
    X = X[feature_names]
    
    return X, y

In [55]:
X_train, y_train = data_prep(train)
X_test, y_test = data_prep(test)
X_train.head()

Unnamed: 0,month,quarter,year
0,1,1,1973
1,2,1,1973
2,3,1,1973
3,4,2,1973
4,5,2,1973


In [56]:
params = {
    'depth': [3,4,5,6],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [20, 25, 30, 35, 40]
}

model = GridSearchCV(CatBoostRegressor(silent=True), params)

model.fit(X_train, y_train)

In [57]:
model.best_params_

{'depth': 6, 'learning_rate': 0.1, 'n_estimators': 40}

In [58]:
test_pred = model.predict(X_test)
train_pred = model.predict(X_train)

In [59]:
print(f'train_MAE = {round(mean_absolute_error(y_train, train_pred), 3)}')
print(f' test_MAE = {round(mean_absolute_error(y_test, test_pred), 3)}')

train_MAE = 68.979
 test_MAE = 100.958


In [61]:
feature_importance = model.best_estimator_.get_feature_importance().round(5)
dict(zip(X_train.columns, feature_importance))

{'month': 60.19977, 'quarter': 22.00267, 'year': 17.79756}

In [82]:
def future_data(data): # return X
    data['date'] = pd.to_datetime(data['date'])
    data['month'] = data['date'].dt.month
    data['year'] = data['date'].dt.year
    data['quarter'] = data['date'].dt.quarter
    X = data[['month', 'year', 'quarter']]
    return X

In [65]:
data.tail()

Unnamed: 0,Month,energy
583,2021-08-01,1755.212
584,2021-09-01,1456.901
585,2021-10-01,1315.799
586,2021-11-01,1644.306
587,2021-12-01,2032.352


In [83]:
# Define the start and end dates
start_date = datetime.date(2022, 1, 1)
end_date = datetime.date(2022, 12, 1)


In [84]:
import datetime
import calendar

def generate_monthly_dates(start_date, end_date):
    """Generates a list of dates representing the first day of each month within the given range."""
    month_dates = []
    current_month = start_date.month
    current_year = start_date.year

    while current_year <= end_date.year and current_month <= end_date.month:
        month_dates.append(datetime.date(current_year, current_month, 1))
        current_month += 1
        if current_month > 12:
            current_month = 1
            current_year += 1

    return month_dates

# Set the start and end dates
start_date = datetime.date(2022, 1, 1)
end_date = datetime.date(2022, 12, 1)

# Generate the list of dates
monthly_dates = generate_monthly_dates(start_date, end_date)


In [85]:
fdata = pd.DataFrame({'date': monthly_dates})
future_data(fdata)

Unnamed: 0,month,year,quarter
0,1,2022,1
1,2,2022,1
2,3,2022,1
3,4,2022,2
4,5,2022,2
5,6,2022,2
6,7,2022,3
7,8,2022,3
8,9,2022,3
9,10,2022,4


In [88]:
future_pred = model.predict(fdata)

In [89]:
future_pred

array([2413.41721549, 2124.92431358, 1929.7722005 , 1478.8327388 ,
       1449.04430475, 1494.37601011, 1700.86891512, 1681.44113126,
       1512.81888604, 1437.8582503 , 1598.73404987, 2170.44410909])

- extend hyperparameter optimization
- more data
- additional features