In [1]:
import pandas as pd
import holidays
import numpy as np

import warnings
warnings.filterwarnings('ignore')

# Read the Data

In [2]:
data = pd.read_csv("train.csv")
data['Tarih'] = pd.to_datetime(data['Tarih'])


med = pd.read_csv('med.csv' , parse_dates=['Tarih'])


future_df = pd.read_csv("future.csv" , parse_dates=['Tarih']).drop('Unnamed: 0' , axis = 1)
future_df.head()

Unnamed: 0,Tarih,Dağıtılan Enerji (MWh)
0,2022-08-01 00:00:00,
1,2022-08-01 01:00:00,
2,2022-08-01 02:00:00,
3,2022-08-01 03:00:00,
4,2022-08-01 04:00:00,


# Train | Test Split

In [3]:
test_df = data.loc[data.Tarih > '2022-07-23 23:00:00'].copy()
train_df = data.loc[data.Tarih <= '2022-07-23 23:00:00'].copy()

val_df = train_df[-96:]
train_df = train_df[:-96]


all_df = [train_df , val_df , test_df , future_df]

In [4]:
# from sklearn.preprocessing import MinMaxScaler

# scaler = MinMaxScaler(feature_range=(0,1))
# data_scaled = scaler.fit_transform(data)
# train_scaled = scaler.transform(X_train_temp)
# test_scaled = scaler.transform(X_test_temp)

# val_scaled = scaler.transform(X_val_temp)

# Packager

In [5]:
# def packager(train_scaled, val_scaled, test_scaled, window_size=24):
#     X_train, y_train, X_val, y_val, X_test, y_test = [], [], [], [], [], []

#     for i in range(window_size, len(train_scaled)):
#         X_train.append(train_scaled[i-window_size:i, 0])
#         y_train.append(train_scaled[i, 0])

#     for i in range(window_size, len(val_scaled)):
#         X_val.append(val_scaled[i-window_size:i, 0])
#         y_val.append(val_scaled[i, 0])

#     for i in range(window_size, len(test_scaled)):
#         X_test.append(test_scaled[i-window_size:i, 0])
#         y_test.append(test_scaled[i, 0])

#     X_train, y_train = np.array(X_train), np.array(y_train)
#     X_val, y_val = np.array(X_val), np.array(y_val)
#     X_test, y_test = np.array(X_test), np.array(y_test)

#     X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
#     X_val = np.reshape(X_val, (X_val.shape[0], X_val.shape[1], 1))
#     X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

#     X = np.concatenate((X_train, X_val, X_test), axis=0)
#     y = np.concatenate((y_train, y_val, y_test), axis=0)

#     return X_train, y_train, X_val, y_val, X_test, y_test, X, y

In [6]:
# X_train, y_train, X_val, y_val, X_test, y_test, X, y = packager(train_scaled, val_scaled, test_scaled)

# Eval Metrics

In [7]:
def eval_metrics(y_true , y_pred):
    from sklearn.metrics import r2_score , mean_absolute_error , mean_squared_error , mean_absolute_percentage_error
    
    # MAPE hesaplama
    mape = mean_absolute_percentage_error(y_true, y_pred)
    
    # r2 hesaplama
    r2 = r2_score(y_true , y_pred)
    
    # mae hesaplama
    mae = mean_absolute_error(y_true , y_pred)

    # rmse hesaplama
    mse = mean_squared_error(y_true,y_pred)**0.5
    
    print(f"""
          Mape Score : {mape}
          R2 Score : {r2}
          MAE Score : {mae}
          MSE Score : {mse}
          """)
    
def eval_plot(y_true , y_pred):
    tests = pd.DataFrame(data = y_true , columns=['Real Values'] , index = X_test[:-24].index)
    preds = pd.DataFrame(data = y_pred , columns=['Predicts'] , index = future_data[:-24].index)
    compare = pd.concat([tests[:-24], preds] , axis= 1)
    print(compare.plot())
    
def eval_df (y_true , y_pred):
    compare = pd.DataFrame({'Real Values': y_true, 'Predicts': y_pred}, index=future_data[:-24].index)
    print(compare)
    
def create_submission(future_preds, num):
    submission_df = pd.DataFrame({'Tarih': future_data.index, 'Dağıtılan Enerji (MWh)': future_preds})
    filename = 'submission{}.csv'.format(num)
    submission_df.to_csv(filename, index=False)
    globals()['submission{}'.format(num)] = submission_df
    
def preds_plot(data , future_data):
    import matplotlib.pyplot as plt
    # Gerçek değerleri mavi renkte çizdir
    plt.plot(data, color='blue', label='Gerçek Değerler')

    # Tahmin edilen değerleri yeşil renkte çizdir
    plt.plot(future_data, color='green', label='Tahminler')

    # Eksenleri ve grafik başlığını belirle
    plt.title('Gerçek Değerler ve Tahminler')
    plt.xlabel('Saat')
    plt.ylabel('Değer')
    plt.legend()

    # Grafikleri göster
    plt.show()

# Feature Creators

In [8]:
def create_datetimes(df, label=None):
    df['date'] = df.Tarih
    df['hour'] = df['date'].dt.hour
    df['dayofweek'] = df['date'].dt.dayofweek
    df['quarter'] = df['date'].dt.quarter
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['dayofyear'] = df['date'].dt.dayofyear
    df['dayofmonth'] = df['date'].dt.day
    df['weekofyear'] = df['date'].dt.weekofyear
    
    X = df[['hour','dayofweek','quarter','month','year',
           'dayofyear','dayofmonth','weekofyear']]
    # if label:
    #     y = df[label]
    #     return X, y
    # return X

def create_holiday_weekend(df):
    import holidays
    turkey_holidays = holidays.Turkey()

    def is_holiday(date): 
        return date in turkey_holidays

    df['holiday'] = df['Tarih'].dt.date.apply(is_holiday).astype(int)

    df['weekend'] = df['dayofweek'].apply(lambda x: 1 if x in (5, 6) else 0)
    
def create_electricOutage_timeofDay(df):
 
    outage_dates = set(med['Tarih'].dt.date)
    df['date'] = df['Tarih'].dt.date
    df['electrical_outage'] = df['date'].apply(lambda x: 1 if x in outage_dates else 0)
    
    
    conditions = [
    (6 <= df['hour']) & (df['hour'] < 12),
    (12 <= df['hour']) & (df['hour'] < 18),
    (18 <= df['hour']) & (df['hour'] < 24)
    ]
    choices = [1, 2, 3]

    df['timeofday'] = np.select(conditions, choices, default=3)
    df['timeofday'] = df['timeofday'].astype('int')
    
def create_businessDay_cumulativeholidays(df):
    df['business_day'] = df['dayofweek'].apply(lambda x: 1 if x in range(0, 5) else 0)
    df['cumulative_holidays'] = df['holiday'].cumsum()
    

def create_outage_rolling_percentages(df):
    df['outage_percentage'] = (df['electrical_outage'].cumsum() / (df.index + 1)) * 100

    window_size = 24
    df['rolling_outages_24h'] = df['electrical_outage'].rolling(window=window_size).sum()
    
    
    alpha = 0.1
    df['exp_avg_outages_24h'] = df['electrical_outage'].ewm(alpha=alpha).mean()
    df
    

def create_hourly_sin_cos(df):
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    
    
def create_seasons(df):
    def is_spring(ds):
        date = pd.to_datetime(ds)
        if (date.month >= 3) & (date.month <= 5):
            return 1
        else :
            return 0

    def is_summer(ds):
        date = pd.to_datetime(ds)
        if (date.month >= 6) & (date.month <= 8):
            return 1
        else :
            return 0

    def is_autumn(ds):
        date = pd.to_datetime(ds)
        if (date.month >= 9) & (date.month <= 11):
            return 1
        else :
            return 0

    def is_winter(ds):
        date = pd.to_datetime(ds)
        if (date.month >= 12) & (date.month <= 2):
            return 1
        else :
            return 0

    def is_weekend(ds):
        date = pd.to_datetime(ds)
        if date.day_name in ('Saturday', 'Sunday'):
            return 1
        else :
            return 0

    # adding to train set
    df['is_spring'] = df['Tarih'].apply(is_spring)
    df['is_summer'] = df['Tarih'].apply(is_summer)
    df['is_autumn'] = df['Tarih'].apply(is_autumn)
    df['is_winter'] = df['Tarih'].apply(is_winter)
    df['is_weekend'] = df['Tarih'].apply(is_weekend)
    df['is_weekday'] = ~df['Tarih'].apply(is_weekend)

    # adding to test set
    df['is_spring'] = df['Tarih'].apply(is_spring)
    df['is_summer'] = df['Tarih'].apply(is_summer)
    df['is_autumn'] = df['Tarih'].apply(is_autumn)
    df['is_winter'] = df['Tarih'].apply(is_winter)
    df['is_weekend'] = df['Tarih'].apply(is_weekend)
    df['is_weekday'] = ~df['Tarih'].apply(is_weekend)
    


# def create_isramadan(df):
#     hols = pd.read_csv('Calendar.csv', parse_dates=['CALENDAR_DATE'])
#     hols = hols[['CALENDAR_DATE','RAMADAN_FLAG','PUBLIC_HOLIDAY_FLAG']]
#     df['isRamadan'] = np.where((hol['RAMADAN_FLAG'] == 'Y') | (hol['PUBLIC_HOLIDAY_FLAG'] == 'Y'), 'TR-Holidays', 0)


In [9]:
# X, y = create_datetimes(data.set_index('Tarih'), label=['Dağıtılan Enerji (MWh)'])

# features_and_target = pd.concat([X, y], axis=1)

In [10]:
for df in all_df:
    create_datetimes(df)
    create_holiday_weekend(df)
    create_electricOutage_timeofDay(df)
    create_businessDay_cumulativeholidays(df)
    create_outage_rolling_percentages(df)
    create_hourly_sin_cos(df)
    create_seasons(df)


In [11]:
print(train_df.shape)
train_df.head(2)

(39864, 28)


Unnamed: 0,Tarih,Dağıtılan Enerji (MWh),date,hour,dayofweek,quarter,month,year,dayofyear,dayofmonth,...,rolling_outages_24h,exp_avg_outages_24h,hour_sin,hour_cos,is_spring,is_summer,is_autumn,is_winter,is_weekend,is_weekday
0,2018-01-01 00:00:00,1593.944216,2018-01-01,0,0,1,1,2018,1,1,...,,0.0,0.0,1.0,0,0,0,0,0,-1
1,2018-01-01 01:00:00,1513.933887,2018-01-01,1,0,1,1,2018,1,1,...,,0.0,0.258819,0.965926,0,0,0,0,0,-1


In [12]:
print(val_df.shape)
val_df.head(2)

(96, 28)


Unnamed: 0,Tarih,Dağıtılan Enerji (MWh),date,hour,dayofweek,quarter,month,year,dayofyear,dayofmonth,...,rolling_outages_24h,exp_avg_outages_24h,hour_sin,hour_cos,is_spring,is_summer,is_autumn,is_winter,is_weekend,is_weekday
39864,2022-07-20 00:00:00,1972.864041,2022-07-20,0,2,3,7,2022,201,20,...,,0.0,0.0,1.0,0,1,0,0,0,-1
39865,2022-07-20 01:00:00,1833.400302,2022-07-20,1,2,3,7,2022,201,20,...,,0.0,0.258819,0.965926,0,1,0,0,0,-1


In [13]:
print(test_df.shape)
test_df.head(2)

(192, 28)


Unnamed: 0,Tarih,Dağıtılan Enerji (MWh),date,hour,dayofweek,quarter,month,year,dayofyear,dayofmonth,...,rolling_outages_24h,exp_avg_outages_24h,hour_sin,hour_cos,is_spring,is_summer,is_autumn,is_winter,is_weekend,is_weekday
39960,2022-07-24 00:00:00,2055.091859,2022-07-24,0,6,3,7,2022,205,24,...,,0.0,0.0,1.0,0,1,0,0,0,-1
39961,2022-07-24 01:00:00,1930.490383,2022-07-24,1,6,3,7,2022,205,24,...,,0.0,0.258819,0.965926,0,1,0,0,0,-1


In [14]:
print(future_df.shape)
future_df.head(2)

(744, 28)


Unnamed: 0,Tarih,Dağıtılan Enerji (MWh),date,hour,dayofweek,quarter,month,year,dayofyear,dayofmonth,...,rolling_outages_24h,exp_avg_outages_24h,hour_sin,hour_cos,is_spring,is_summer,is_autumn,is_winter,is_weekend,is_weekday
0,2022-08-01 00:00:00,,2022-08-01,0,0,3,8,2022,213,1,...,,0.0,0.0,1.0,0,1,0,0,0,-1
1,2022-08-01 01:00:00,,2022-08-01,1,0,3,8,2022,213,1,...,,0.0,0.258819,0.965926,0,1,0,0,0,-1


In [15]:
train_df.isna().sum()

Tarih                      0
Dağıtılan Enerji (MWh)     0
date                       0
hour                       0
dayofweek                  0
quarter                    0
month                      0
year                       0
dayofyear                  0
dayofmonth                 0
weekofyear                 0
holiday                    0
weekend                    0
electrical_outage          0
timeofday                  0
business_day               0
cumulative_holidays        0
outage_percentage          0
rolling_outages_24h       23
exp_avg_outages_24h        0
hour_sin                   0
hour_cos                   0
is_spring                  0
is_summer                  0
is_autumn                  0
is_winter                  0
is_weekend                 0
is_weekday                 0
dtype: int64

In [None]:
# def daylight_hours(df):

#     from astral.sun import sun
#     from astral import LocationInfo
#     import pytz

#     location_izmir = LocationInfo("Izmir", "Turkey")
#     location_izmir.latitude = 38.419200
#     location_izmir.longitude = 27.128700
#     location_izmir.timezone = pytz.timezone("Turkey")

#     location_manisa = LocationInfo("Manisa", "Turkey")
#     location_manisa.latitude = 38.612000
#     location_manisa.longitude = 27.426000
#     location_manisa.timezone = pytz.timezone("Turkey")

#     def daylight_hours_izmir(date):
#         s = sun(location_izmir.observer, date=date)
#         daylight_duration = (s['sunset'] - s['sunrise']).seconds / 3600
#         return daylight_duration

#     def daylight_hours_manisa(date):
#         s = sun(location_manisa.observer, date=date)
#         daylight_duration = (s['sunset'] - s['sunrise']).seconds / 3600
#         return daylight_duration

#     for dataset in all_data :
#         dataset['Daylight_hours_Izmir'] = dataset['Tarih'].apply(daylight_hours_izmir)
#         dataset['Daylight_hours_Manisa'] = dataset['Tarih'].apply(daylight_hours_manisa)
#         dataset

In [None]:
    # conditions = [
    #     (6 <= df['hour']) & (df['hour'] < 12),
    #     (12 <= df['hour']) & (df['hour'] < 18),
    #     (18 <= df['hour']) & (df['hour'] < 24)
    # ]
    # choices = [1, 2, 3]

    # df['Time_of_day'] = np.select(conditions, choices, default=3)
    # df['Time_of_day'] = df['Time_of_day'].astype('int')



    # # df['Energy_Izmir'] = df['Dağıtılan Enerji (MWh)'] * 0.85
    # # df['Energy_Manisa'] = df['Dağıtılan Enerji (MWh)'] * 0.15


    # # df['Business_day'] = df['dayofweek'].apply(lambda x: 1 if x in range(0, 5) else 0)
    # # df['Cumulative_holidays'] = df['Holiday'].cumsum()


    # med_df = pd.read_csv('med.csv')
    # med_df['Tarih'] = pd.to_datetime(med_df['Tarih'])

    # outage_dates = set(med_df['Tarih'].dt.date)

    # df['Date'] = df['Tarih'].dt.date

    # df['Electrical_outage'] = df['Date'].apply(lambda x: 1 if x in outage_dates else 0)