## Setup

In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Folders used by the notebook: raw CSV files are read from the first,
# the merged dataset is written to the second.
input_dir, output_dir = "./InputData/", "./OutputData/"

## Data prep, consumption data

In [3]:
# Read the semicolon-separated consumption file and give the
# source-file column headers short English names.
consumption_renames = {
    "Tarih": "date",
    "Saat": "hour",
    "Tüketim Miktarı(MWh)": "consumption_MWh",
}
df_consumption = (
    pd.read_csv(input_dir + "Real_Time_Consumption.csv", sep = ";")
    .rename(columns = consumption_renames)
)

In [4]:
# Combine date & hour into one "time" key of the form dd:mm:yyyy:HH:MM.
# regex = False states explicitly that "." is a literal dot; the default of the
# regex argument differs between pandas versions, and as a pattern "." would
# match every character.
df_consumption["time"] = (
    df_consumption["date"].str.replace(".", ":", regex = False)
    + ":"
    + df_consumption["hour"]
)
df_consumption = df_consumption.drop(["date", "hour"], axis = 1)

In [5]:
# Convert the consumption text (e.g. "29.489,46") to float:
#   1. remove the dots that mark thousands,
#   2. turn the decimal comma into a dot,
#   3. cast to float.
# regex = False keeps "." a literal dot regardless of the pandas version's
# default for the regex argument.
df_consumption["consumption_MWh"] = (
    df_consumption["consumption_MWh"]
    .str.replace(".", "", regex = False)
    .str.replace(",", ".", regex = False)
    .astype(float)
)

In [6]:
# Preview the cleaned consumption frame (long frames are displayed truncated).
df_consumption

Unnamed: 0,consumption_MWh,time
0,29489.46,01:01:2021:00:00
1,28067.11,01:01:2021:01:00
2,26527.08,01:01:2021:02:00
3,25327.19,01:01:2021:03:00
4,24719.72,01:01:2021:04:00
...,...,...
26275,35090.93,31:12:2023:19:00
26276,33310.94,31:12:2023:20:00
26277,32083.96,31:12:2023:21:00
26278,30469.49,31:12:2023:22:00


In [7]:
# Count missing entries per column
df_consumption.isna().sum()

consumption_MWh    0
time               0
dtype: int64

In [8]:
# Check unusual values: summary statistics (count, mean, std, min, quartiles, max)
# of the numeric columns make implausible extremes easy to spot.
df_consumption.describe()

Unnamed: 0,consumption_MWh
count,26280.0
mean,37097.680535
std,5723.335252
min,19209.2
25%,32357.3825
50%,37370.55
75%,40944.01
max,55575.02


## Data prep, generation data

In [9]:
# Load the yearly generation files (2021, 2022, 2023) and stack them with a
# single concat instead of growing the frame inside a loop. The original
# row labels of each file are kept here; the index is rebuilt in a later cell.
generation_years = [21, 22, 23]
df_generation = pd.concat(
    [
        pd.read_csv(input_dir + f"Real_Time_Generation{year}.csv", sep = ";")
        for year in generation_years
    ]
)

In [10]:
# English column names for the generation data, in the order they are
# assigned to the columns of the loaded files.
newnames = [
    # timestamp parts
    "date",
    "hour",
    # reported total
    "total_generation_MWh",
    # individual columns
    "natural_gas",
    "hydro_dam",
    "lignite",
    "hydro_river",
    "coal_imported",
    "wind",
    "solar",
    "fuel_oil",
    "geothermal",
    "asphaltite_coal",
    "hard_coal",
    "biomass",
    "naphtha",
    "LNG",
    "international",
    "waste_heat",
]

In [11]:
# Reindex, rename columns
df_generation = df_generation.reset_index(drop = True)

# The new names are assigned by position. zip() would silently stop at the
# shorter sequence, leaving columns unrenamed, so check the counts first.
if len(df_generation.columns) != len(newnames):
    raise ValueError(
        f"Expected {len(newnames)} generation columns, "
        f"found {len(df_generation.columns)}")

df_generation = df_generation.rename(
    dict(zip(df_generation.columns, newnames)), axis = 1)

In [12]:
# Combine date & hour into one "time" key of the form dd:mm:yyyy:HH:MM.
# regex = False states explicitly that "." is a literal dot; the default of the
# regex argument differs between pandas versions, and as a pattern "." would
# match every character.
df_generation["time"] = (
    df_generation["date"].str.replace(".", ":", regex = False)
    + ":"
    + df_generation["hour"]
)
df_generation = df_generation.drop(["date", "hour"], axis = 1)

In [13]:
# Convert every measurement column (all but the trailing "time" column) to float.
# The work is done one column at a time (vectorised) rather than one row at a
# time, and the column list is taken from the frame instead of a hard-coded count.
#   - astype(str) also covers columns that read_csv already parsed as numbers,
#   - the decimal comma is turned into a dot (literal replace, no regex),
#   - the result is cast to float.
# The "time" column already holds text and is left untouched, so any missing
# timestamp stays visible to the missing-value check that follows.
value_cols = df_generation.columns[:-1]
df_generation[value_cols] = df_generation[value_cols].apply(
    lambda col: col.astype(str).str.replace(",", ".", regex = False).astype(float))

In [14]:
# Preview the cleaned generation frame (long frames are displayed truncated).
df_generation

Unnamed: 0,total_generation_MWh,natural_gas,hydro_dam,lignite,hydro_river,coal_imported,wind,solar,fuel_oil,geothermal,asphaltite_coal,hard_coal,biomass,naphtha,LNG,international,waste_heat,time
0,29488.11,7083.97,1551.39,5256.89,686.07,7673.92,4836.19,0.0,33.20,1238.01,218.59,603.87,577.37,0.0,0.0,-364.20,92.84,01:01:2021:00:00
1,28065.76,6072.81,1387.63,5121.95,703.90,7530.56,4826.45,0.0,34.00,1244.05,223.00,599.09,575.69,0.0,0.0,-346.00,92.63,01:01:2021:01:00
2,26527.08,4878.19,1507.96,5110.55,667.26,6979.91,4750.11,0.0,30.30,1246.75,223.00,603.49,581.60,0.0,0.0,-142.00,89.96,01:01:2021:02:00
3,25327.19,4394.61,1652.86,5029.12,651.91,6536.85,4432.18,0.0,30.80,1248.66,220.79,603.81,579.64,0.0,0.0,-145.00,90.96,01:01:2021:03:00
4,24719.72,4191.38,1615.38,5012.93,670.94,6266.92,4321.98,0.0,31.40,1246.97,220.80,602.66,583.37,0.0,0.0,-137.00,91.99,01:01:2021:04:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26275,34753.40,5567.82,8719.86,5311.18,2287.36,9100.92,359.70,0.0,93.36,1219.49,130.27,613.05,997.70,0.0,0.0,270.20,82.49,31:12:2023:19:00
26276,31441.31,4792.96,7881.64,4908.48,2083.33,7613.93,459.87,0.0,93.46,1254.10,132.48,612.95,983.26,0.0,0.0,539.84,85.01,31:12:2023:20:00
26277,30368.98,4000.94,7513.96,4911.94,1966.32,7656.56,552.62,0.0,88.76,1246.25,132.48,607.59,977.02,0.0,0.0,629.91,84.63,31:12:2023:21:00
26278,30386.41,2903.52,8119.63,4615.93,1837.75,8390.47,720.11,0.0,83.06,1278.77,130.27,608.31,987.47,0.0,0.0,627.77,83.35,31:12:2023:22:00


In [15]:
# Count missing entries per column
df_generation.isna().sum()

total_generation_MWh    0
natural_gas             0
hydro_dam               0
lignite                 0
hydro_river             0
coal_imported           0
wind                    0
solar                   0
fuel_oil                0
geothermal              0
asphaltite_coal         0
hard_coal               0
biomass                 0
naphtha                 0
LNG                     0
international           0
waste_heat              0
time                    0
dtype: int64

In [16]:
# Check unusual values: summary statistics (count, mean, std, min, quartiles, max)
# of the numeric columns make implausible extremes easy to spot.
df_generation.describe()

Unnamed: 0,total_generation_MWh,natural_gas,hydro_dam,lignite,hydro_river,coal_imported,wind,solar,fuel_oil,geothermal,asphaltite_coal,hard_coal,biomass,naphtha,LNG,international,waste_heat
count,26280.0,26280.0,26280.0,26280.0,26280.0,26280.0,26280.0,26280.0,26280.0,26280.0,26280.0,26280.0,26280.0,26280.0,26280.0,26280.0,26280.0
mean,35600.65043,9352.388915,4999.710506,4886.494534,2069.004806,7223.775079,3776.070075,310.68325,65.907522,1159.420432,210.401713,461.234166,818.720919,0.0,0.0,181.344213,85.487258
std,5229.624571,4231.276353,2537.886447,540.210669,1304.657072,2021.322339,2077.161811,435.431178,24.37102,119.470188,82.229953,76.131472,120.001982,0.0,0.0,392.650681,13.818308
min,16184.09,663.71,296.69,2366.31,389.39,956.35,158.89,-0.3,0.0,775.85,0.0,138.2,492.47,0.0,0.0,-1417.69,30.78
25%,31627.865,6079.6825,3055.4475,4563.9425,1088.125,5625.6725,2023.9025,0.0,40.7,1076.895,178.84,409.29,719.5975,0.0,0.0,-137.6,76.2375
50%,35626.295,9502.185,4726.52,4908.335,1533.37,7701.16,3482.625,19.35,71.9,1183.94,205.34,462.965,852.79,0.0,0.0,153.4,85.71
75%,39280.475,12470.3725,6705.9925,5254.4275,2866.2575,8741.9075,5341.7525,549.755,89.0025,1258.24,245.08,525.18,916.695,0.0,0.0,466.45,95.38
max,51285.69,19453.65,15784.77,6680.4,6103.59,11352.14,9737.59,1547.41,105.8,1458.28,368.72,656.49,1039.9,0.0,0.0,1648.54,127.76


In [17]:
# Sanity check: difference between the reported total and the row-wise sum of
# all columns located between the total (first) and "time" (last) columns.
source_sum = df_generation.iloc[:, 1:-1].sum(axis = 1)
df_generation["total_generation_MWh"].sub(source_sum).describe()

count    2.628000e+04
mean     7.041096e-03
std      3.086210e-01
min     -2.182787e-11
25%     -3.637979e-12
50%      0.000000e+00
75%      3.637979e-12
max      2.317000e+01
dtype: float64

## Data prep, market clearing price data

In [18]:
# Read the semicolon-separated price file and give the source-file
# column headers short names.
price_renames = {
    "Tarih": "date",
    "Saat": "hour",
    "PTF (TL/MWh)": "TRY/MWh",
    "PTF (USD/MWh)": "USD/MWh",
    "PTF (EUR/MWh)": "EUR/MWh",
}
df_price = (
    pd.read_csv(input_dir + "Market_Clearing_Price.csv", sep = ";")
    .rename(columns = price_renames)
)

In [19]:
# Combine date & hour into one "time" key of the form dd:mm:yyyy:HH:MM.
# regex = False states explicitly that "." is a literal dot; the default of the
# regex argument differs between pandas versions, and as a pattern "." would
# match every character.
df_price["time"] = (
    df_price["date"].str.replace(".", ":", regex = False)
    + ":"
    + df_price["hour"]
)
df_price = df_price.drop(["date", "hour"], axis = 1)

In [20]:
# Convert every price column (all but the trailing "time" column) to float.
# The work is done one column at a time (vectorised) rather than one row at a
# time, and the column list is taken from the frame instead of a hard-coded count.
#   1. remove the dots that mark thousands,
#   2. turn the decimal comma into a dot,
#   3. cast to float.
# regex = False keeps "." a literal dot regardless of the pandas version's
# default for the regex argument.
# NOTE(review): assumes all price columns were read as text; a column parsed
# as numeric by read_csv has no .str accessor and will raise here.
price_cols = df_price.columns[:-1]
df_price[price_cols] = df_price[price_cols].apply(
    lambda col: col.str.replace(".", "", regex = False)
                   .str.replace(",", ".", regex = False)
                   .astype(float))

In [21]:
# Preview the cleaned price frame (long frames are displayed truncated).
df_price

Unnamed: 0,TRY/MWh,USD/MWh,EUR/MWh,time
0,263.99,35.96,29.31,01:01:2021:00:00
1,236.99,32.29,26.31,01:01:2021:01:00
2,216.70,29.52,24.06,01:01:2021:02:00
3,218.07,29.71,24.21,01:01:2021:03:00
4,208.99,28.47,23.20,01:01:2021:04:00
...,...,...,...,...
26275,2499.67,84.91,76.74,31:12:2023:19:00
26276,2472.34,83.98,75.90,31:12:2023:20:00
26277,2472.33,83.98,75.90,31:12:2023:21:00
26278,1800.00,61.15,55.26,31:12:2023:22:00


In [22]:
# Count missing entries per column
df_price.isna().sum()

TRY/MWh    0
USD/MWh    0
EUR/MWh    0
time       0
dtype: int64

In [23]:
# Check unusual values: summary statistics (count, mean, std, min, quartiles, max)
# of the numeric columns make implausible extremes easy to spot.
df_price.describe()

Unnamed: 0,TRY/MWh,USD/MWh,EUR/MWh
count,26280.0,26280.0,26280.0
mean,1736.042084,99.88686,92.764924
std,1198.788668,57.198574,57.510338
min,0.0,0.0,0.0
25%,617.0,55.3075,48.79
50%,1699.005,84.89,78.03
75%,2499.99,131.3575,121.95
max,4800.0,264.17,271.63


## Merge & export

In [24]:
# Merge the three frames on the shared "time" key.
# validate = "one_to_one" makes pandas raise if a timestamp occurs more than
# once in any frame, instead of silently multiplying rows in the result.
df = (
    df_consumption
    .merge(df_generation, on = "time", how = "inner", validate = "one_to_one")
    .merge(df_price, on = "time", how = "inner", validate = "one_to_one")
)

# Move "time" to the front. The column is selected by name rather than by
# position, so the reorder does not depend on where the merge placed it.
cols = ["time"] + [name for name in df.columns if name != "time"]
df = df[cols]

In [25]:
# Preview the merged frame (long and wide frames are displayed truncated).
df

Unnamed: 0,time,consumption_MWh,total_generation_MWh,natural_gas,hydro_dam,lignite,hydro_river,coal_imported,wind,solar,...,asphaltite_coal,hard_coal,biomass,naphtha,LNG,international,waste_heat,TRY/MWh,USD/MWh,EUR/MWh
0,01:01:2021:00:00,29489.46,29488.11,7083.97,1551.39,5256.89,686.07,7673.92,4836.19,0.0,...,218.59,603.87,577.37,0.0,0.0,-364.20,92.84,263.99,35.96,29.31
1,01:01:2021:01:00,28067.11,28065.76,6072.81,1387.63,5121.95,703.90,7530.56,4826.45,0.0,...,223.00,599.09,575.69,0.0,0.0,-346.00,92.63,236.99,32.29,26.31
2,01:01:2021:02:00,26527.08,26527.08,4878.19,1507.96,5110.55,667.26,6979.91,4750.11,0.0,...,223.00,603.49,581.60,0.0,0.0,-142.00,89.96,216.70,29.52,24.06
3,01:01:2021:03:00,25327.19,25327.19,4394.61,1652.86,5029.12,651.91,6536.85,4432.18,0.0,...,220.79,603.81,579.64,0.0,0.0,-145.00,90.96,218.07,29.71,24.21
4,01:01:2021:04:00,24719.72,24719.72,4191.38,1615.38,5012.93,670.94,6266.92,4321.98,0.0,...,220.80,602.66,583.37,0.0,0.0,-137.00,91.99,208.99,28.47,23.20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26275,31:12:2023:19:00,35090.93,34753.40,5567.82,8719.86,5311.18,2287.36,9100.92,359.70,0.0,...,130.27,613.05,997.70,0.0,0.0,270.20,82.49,2499.67,84.91,76.74
26276,31:12:2023:20:00,33310.94,31441.31,4792.96,7881.64,4908.48,2083.33,7613.93,459.87,0.0,...,132.48,612.95,983.26,0.0,0.0,539.84,85.01,2472.34,83.98,75.90
26277,31:12:2023:21:00,32083.96,30368.98,4000.94,7513.96,4911.94,1966.32,7656.56,552.62,0.0,...,132.48,607.59,977.02,0.0,0.0,629.91,84.63,2472.33,83.98,75.90
26278,31:12:2023:22:00,30469.49,30386.41,2903.52,8119.63,4615.93,1837.75,8390.47,720.11,0.0,...,130.27,608.31,987.47,0.0,0.0,627.77,83.35,1800.00,61.15,55.26


In [28]:
# Export full data
# to_csv does not create missing folders, so make sure the output folder
# exists first (no effect when it is already there).
os.makedirs(output_dir, exist_ok = True)
df.to_csv(output_dir + "full_data.csv", index = False)