In [1]:
import pandas as pd


## KNMI weather data

|Data     |Granularity |Year range|Source |
|---------|------------|----------|-------|
|Weather  |hourly/daily|2006-2024 |KNMI   |

In [2]:
def read_data(path: str) -> pd.DataFrame:
    """
    Read the data.
    Strip spaces from the columns.
    Rename all the columns (abbreviations -> full names).
    """

    weather = pd.read_csv(path)

    weather.columns = weather.columns.str.strip()
    weather = weather.applymap(lambda x: x.strip() if isinstance(x, str) else x)
    weather.replace('', None, inplace=True)

    column_mapping = {
        'YYYYMMDD': 'Date',
        'RH': 'DailyPrecipitation',
        'RHX': 'MaxHourlyPrecipitation',
        'RHXH': 'HDMaxPrecipitation',
        'TG': 'DailyMeanTemperature',
        'TN': 'HourlyMinTemperature',
        'TNH': 'HDMinTemperature',
        'TX': 'HourlyMaxTemperature',
        'TXH': 'HDMaxTemperature',
        'FG': 'DailyMeanWindspeed',
        'FHX': 'MaxHourlyMeanWindspeed',
        'FHXH': 'HDMaxMeanWindspeed',
        'FHN': 'MinHourlyMeanWindspeed',
        'FHNH': 'HDMinMeanWindspeed'
    }

    weather = weather[column_mapping.keys()]
    weather.rename(columns=column_mapping, inplace=True)

    return weather


def filter_on_year(weather: pd.DataFrame) -> pd.DataFrame:
    """
    Filter on only 2006 and later and reset index after filtering.
    """
    weather = weather[weather['Date'] >= 20060101]
    weather.reset_index(drop=True, inplace=True)

    return weather


def cast_datekey(weather: pd.DataFrame) -> pd.DataFrame:
    """
    Cast datekey 20060101 to format 2006/01/01
    """
    weather['Date'] = pd.to_datetime(weather['Date'], format='%Y%m%d').dt.strftime('%Y/%m/%d')

    return weather


def cast_hour_columns(weather: pd.DataFrame) -> pd.DataFrame:
    """
    Cast hour columns to integer.
    """
    hour_cols = [
        'HDMaxPrecipitation',
        'HDMinTemperature',
        'HDMaxTemperature',
        'HDMaxMeanWindspeed',
        'HDMinMeanWindspeed'
    ]

    for i in hour_cols:
        weather[i] = weather[i].astype(int)

    return weather


def convert_columns(weather: pd.DataFrame) -> pd.DataFrame:
    """
    Divide the following columns by 10 to get actual number, since they are 0.1 of the actual number.
    """
    cols_to_convert = [
        'DailyPrecipitation',
        'MaxHourlyPrecipitation',
        'DailyMeanTemperature',
        'HourlyMaxTemperature',
        'DailyMeanWindspeed',
        'MinHourlyMeanWindspeed',
        'MaxHourlyMeanWindspeed'
    ]
    for i in cols_to_convert:
        weather[i] = weather[i] / 10

    return weather


def create_date_columns(weather: pd.DataFrame) -> pd.DataFrame:
    """
    Cast date column (2021-01-01 instead of 2021/01/01) and add month and year columns
    Create a week key combining year and week number
    """
    weather['Date'] = pd.to_datetime(weather['Date'], format='%Y/%m/%d')
    weather['Month'] = weather['Date'].dt.month
    weather['Year'] = weather['Date'].dt.year
    weather['Week'] = weather['Date'].dt.isocalendar().week

    weather['WeekKey'] = weather['Year'].astype(str) + weather['Week'].apply(lambda x: '{:02d}'.format(x))
    weather['day_of_week'] = weather['Date'].dt.day_name()

    return weather


def preprocess_weather(path: str) -> pd.DataFrame:
    """
    Preprocess the weather data.
    """
    weather = read_data(path)
    weather = filter_on_year(weather)
    weather = cast_datekey(weather)
    weather = cast_hour_columns(weather)
    weather = convert_columns(weather)
    weather = create_date_columns(weather)

    return weather

In [3]:
weather_path = 'deBilt.csv'

weather = preprocess_weather(weather_path)

weather

  weather = pd.read_csv(path)
  weather = weather.applymap(lambda x: x.strip() if isinstance(x, str) else x)


Unnamed: 0,Date,DailyPrecipitation,MaxHourlyPrecipitation,HDMaxPrecipitation,DailyMeanTemperature,HourlyMinTemperature,HDMinTemperature,HourlyMaxTemperature,HDMaxTemperature,DailyMeanWindspeed,MaxHourlyMeanWindspeed,HDMaxMeanWindspeed,MinHourlyMeanWindspeed,HDMinMeanWindspeed,Month,Year,Week,WeekKey,day_of_week
0,2006-01-01,0.0,0.0,1,4.0,23,7,5.3,13,2.4,4.0,1,1.0,15,1,2006,52,200652,Sunday
1,2006-01-02,-0.1,-0.1,10,1.5,-35,24,5.6,14,1.3,3.0,14,0.0,4,1,2006,1,200601,Monday
2,2006-01-03,-0.1,-0.1,20,-0.2,-45,8,4.5,12,1.6,3.0,19,1.0,1,1,2006,1,200601,Tuesday
3,2006-01-04,-0.1,-0.1,1,1.6,-5,24,3.8,13,2.9,5.0,21,1.0,3,1,2006,1,200601,Wednesday
4,2006-01-05,0.0,0.0,1,0.7,-13,7,1.9,20,4.6,6.0,3,3.0,8,1,2006,1,200601,Thursday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6950,2025-01-11,-0.1,-0.1,1,0.5,-45,8,5.8,14,1.5,3.0,14,0.0,7,1,2025,2,202502,Saturday
6951,2025-01-12,0.1,0.1,21,1.7,-6,22,6.0,14,1.2,2.0,5,1.0,1,1,2025,2,202502,Sunday
6952,2025-01-13,0.0,0.0,1,0.5,-33,5,5.2,14,2.2,4.0,13,1.0,1,1,2025,3,202503,Monday
6953,2025-01-14,0.1,0.1,20,2.8,-16,1,5.7,24,3.1,5.0,13,2.0,6,1,2025,3,202503,Tuesday


In [4]:
weather.to_csv('processed_data/weather.csv', index = False)

## NAO

In [5]:
def preprocess_nao(path: str) -> pd.DataFrame:
    """
    Reset index, index is now the year and we want this as a column.
    Melt the dataset into a year, month, and NAO column.
    Rename index to Year.
    Map the months (now strings) to corresponding month numbers.
    """
    nao = pd.read_csv(path, delim_whitespace=True, index_col=0)
    nao.reset_index(inplace=True)  # Reset index to get 'Year'
    nao_melted = pd.melt(nao, id_vars=['index'], var_name='Month', value_name='NAO')
    nao_melted.rename(columns={'index': 'Year'}, inplace=True)

    # Map months to their numerical equivalent
    month_mapping = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
                     'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}
    nao_melted['Month'] = nao_melted['Month'].map(month_mapping)
    nao_melted['Year'] = pd.to_numeric(nao_melted['Year'], errors='coerce')  # Ensure Year is numeric

    # Sort by Year and Month
    nao_melted.sort_values(by=['Year', 'Month'], inplace=True)

    return nao_melted


In [6]:
NAO_file_path = 'NAO.txt'
nao = preprocess_nao(NAO_file_path)


In [7]:
nao.head()

Unnamed: 0,Year,Month,NAO
0,1950,1,0.92
75,1950,2,0.4
150,1950,3,-0.36
225,1950,4,0.73
300,1950,5,-0.59


In [8]:
nao = nao[nao['Year'] >= 2006]

In [9]:
nao

Unnamed: 0,Year,Month,NAO
56,2006,1,1.27
131,2006,2,-0.51
206,2006,3,-1.28
281,2006,4,1.24
356,2006,5,-1.14
...,...,...,...
599,2024,8,0.63
674,2024,9,-1.43
749,2024,10,-0.38
824,2024,11,-0.23


In [10]:
nao.to_csv('processed_data/nao.csv', index = False)

## Energy Price

In [11]:
energyPrice_path = 'energyConsPricesNl2009.csv'
energyPriceNl = pd.read_csv(energyPrice_path, delimiter=';')

#energyPriceNl.columns.tolist()

In [12]:
# Energy Price Processing: Adjust Perioden to include numeric Year and Period columns
def preprocess_energy_price(df):
    # Extract the year as a separate column
    df['Year'] = df['Perioden'].str.extract(r'(\d{4})').astype(int)


    return df

In [13]:
# Apply preprocessing to the energyPriceNL DataFrame
energyPriceNl = preprocess_energy_price(energyPriceNl)


In [14]:
taxes_metadata = energyPriceNl['Belastingen'].unique()[0] 

# Drop the Taxes column
energyPriceNl = energyPriceNl.drop(columns=['Belastingen'])

# Display the metadata for reference
print(f"Taxes information: {taxes_metadata}")


Taxes information: Inclusief btw en belastingen


In [15]:
# Calculate the average electricity prices for households
energyPriceNl['AveragePrice_Elec_Household'] = energyPriceNl[
    energyPriceNl.filter(like='Elektriciteitsprijs /Verbruiksklassen huishoudens').columns
].mean(axis=1)

# Calculate the average electricity prices for non-households
energyPriceNl['AveragePrice_Elec_NonHousehold'] = energyPriceNl[
    energyPriceNl.filter(like='Elektriciteitsprijs /Verbruiksklassen niet-huishoudens').columns
].mean(axis=1)

# Calculate the average natural gas prices for households
energyPriceNl['AveragePrice_NG_Household'] = energyPriceNl[
    energyPriceNl.filter(like='Aardgasprijs/Verbruiksklassen huishoudens').columns
].mean(axis=1)

# Calculate the average natural gas prices for non-households
energyPriceNl['AveragePrice_NG_NonHousehold'] = energyPriceNl[
    energyPriceNl.filter(like='Aardgasprijs/Verbruiksklassen niet-huishoudens').columns
].mean(axis=1)

energyPriceNl.head()


Unnamed: 0,Prijscomponenten,Perioden,Aardgasprijs/Verbruiksklassen huishoudens/Minder dan 569 m3 (euro per m3),Aardgasprijs/Verbruiksklassen huishoudens/569 tot 5 687 m3 (euro per m3),Aardgasprijs/Verbruiksklassen huishoudens/5 687 m3 en meer (euro per m3),Aardgasprijs/Verbruiksklassen niet-huishoudens/Minder dan 28 433 m3 (euro per m3),Aardgasprijs/Verbruiksklassen niet-huishoudens/28 433 tot 284 333 m3 (euro per m3),Aardgasprijs/Verbruiksklassen niet-huishoudens/284 333 tot 2 843 332 m3 (euro per m3),Aardgasprijs/Verbruiksklassen niet-huishoudens/2 843 332 tot 28 433 324 m3 (euro per m3),Aardgasprijs/Verbruiksklassen niet-huishoudens/28 433 324 m3 en meer (euro per m3),...,Elektriciteitsprijs /Verbruiksklassen niet-huishoudens/500 tot 2 000 MWh (euro per kWh),Elektriciteitsprijs /Verbruiksklassen niet-huishoudens/2 000 tot 20 000 MWh (euro per kWh),Elektriciteitsprijs /Verbruiksklassen niet-huishoudens/20 000 tot 70 000 MWh (euro per kWh),Elektriciteitsprijs /Verbruiksklassen niet-huishoudens/70 000 tot 150 000 MWh (euro per kWh),Elektriciteitsprijs /Verbruiksklassen niet-huishoudens/150 000 MWh en meer (euro per kWh),Year,AveragePrice_Elec_Household,AveragePrice_Elec_NonHousehold,AveragePrice_NG_Household,AveragePrice_NG_NonHousehold
0,Totaalprijs,2009 1e halfjaar,1.057,0.791,0.722,0.752,0.662,0.459,0.374,0.276,...,0.132,0.118,0.108,0.104,0.08,2009,0.1424,0.134429,0.856667,0.5046
1,Totaalprijs,2009 2e halfjaar,1.031,0.658,0.572,0.622,0.612,0.407,0.33,0.24,...,0.127,0.116,0.099,0.094,0.072,2009,0.1246,0.123714,0.753667,0.4422
2,Totaalprijs,2009,1.045,0.735,0.656,0.706,0.644,0.435,0.352,0.255,...,0.129,0.117,0.104,0.098,0.076,2009,0.134,0.128857,0.812,0.4784
3,Totaalprijs,2010 1e halfjaar,0.943,0.664,0.593,0.621,0.545,0.39,0.296,0.226,...,0.12,0.106,0.091,0.087,0.074,2010,0.1176,0.123429,0.733333,0.4156
4,Totaalprijs,2010 2e halfjaar,1.06,0.703,0.608,0.621,0.59,0.36,0.302,0.256,...,0.115,0.106,0.085,0.086,0.076,2010,0.1118,0.114286,0.790333,0.4258


In [16]:
print(energyPriceNl.columns)

Index(['Prijscomponenten', 'Perioden',
       'Aardgasprijs/Verbruiksklassen huishoudens/Minder dan 569 m3 (euro per m3)',
       'Aardgasprijs/Verbruiksklassen huishoudens/569 tot 5 687 m3 (euro per m3)',
       'Aardgasprijs/Verbruiksklassen huishoudens/5 687 m3 en meer (euro per m3)',
       'Aardgasprijs/Verbruiksklassen niet-huishoudens/Minder dan 28 433 m3 (euro per m3)',
       'Aardgasprijs/Verbruiksklassen niet-huishoudens/28 433 tot 284 333 m3 (euro per m3)',
       'Aardgasprijs/Verbruiksklassen niet-huishoudens/284 333 tot 2 843 332 m3 (euro per m3)',
       'Aardgasprijs/Verbruiksklassen niet-huishoudens/2 843 332 tot 28 433 324 m3 (euro per m3)',
       'Aardgasprijs/Verbruiksklassen niet-huishoudens/28 433 324 m3 en meer (euro per m3)',
       'Elektriciteitsprijs /Verbruiksklassen huishoudens/Minder dan 1 MWh (euro per kWh)',
       'Elektriciteitsprijs /Verbruiksklassen huishoudens/1 tot 2,5 MWh (euro per kWh)',
       'Elektriciteitsprijs /Verbruiksklassen huishoudens

In [17]:
translations = {
    'Prijscomponenten': 'PriceComponents',
    'Belastingen': 'Taxes',
    'Perioden': 'Periods',
    'Aardgasprijs/Verbruiksklassen huishoudens/Minder dan 569 m3 (euro per m3)': 'NG_Household_LessThan_569m3_EuroPerM3',
    'Aardgasprijs/Verbruiksklassen huishoudens/569 tot 5 687 m3 (euro per m3)': 'NG_Household_569to5687m3_EuroPerM3',
    'Aardgasprijs/Verbruiksklassen huishoudens/5 687 m3 en meer (euro per m3)': 'NG_Household_MoreThan_5687m3_EuroPerM3',
    'Aardgasprijs/Verbruiksklassen niet-huishoudens/Minder dan 28 433 m3 (euro per m3)': 'NG_NonHousehold_LessThan_28433m3_EuroPerM3',
    'Aardgasprijs/Verbruiksklassen niet-huishoudens/28 433 tot 284 333 m3 (euro per m3)': 'NG_NonHousehold_28433to284333m3_EuroPerM3',
    'Aardgasprijs/Verbruiksklassen niet-huishoudens/284 333 tot 2 843 332 m3 (euro per m3)': 'NG_NonHousehold_284333to2843332m3_EuroPerM3',
    'Aardgasprijs/Verbruiksklassen niet-huishoudens/2 843 332 tot 28 433 324 m3 (euro per m3)': 'NG_NonHousehold_2843332to28433324m3_EuroPerM3',
    'Aardgasprijs/Verbruiksklassen niet-huishoudens/28 433 324 m3 en meer (euro per m3)': 'NG_NonHousehold_MoreThan_28433324m3_EuroPerM3',
    'Elektriciteitsprijs /Verbruiksklassen huishoudens/Minder dan 1 MWh (euro per kWh)': 'Electricity_Household_LessThan_1MWh_EuroPerKWh',
    'Elektriciteitsprijs /Verbruiksklassen huishoudens/1 tot 2,5 MWh (euro per kWh)': 'Electricity_Household_1to2.5MWh_EuroPerKWh',
    'Elektriciteitsprijs /Verbruiksklassen huishoudens/2,5 tot 5 MWh (euro per kWh)': 'Electricity_Household_2.5to5MWh_EuroPerKWh',
    'Elektriciteitsprijs /Verbruiksklassen huishoudens/5 tot 15 MWh (euro per kWh)': 'Electricity_Household_5to15MWh_EuroPerKWh',
    'Elektriciteitsprijs /Verbruiksklassen huishoudens/15 MWh en meer (euro per kWh)': 'Electricity_Household_MoreThan_15MWh_EuroPerKWh',
    'Elektriciteitsprijs /Verbruiksklassen niet-huishoudens/Minder dan 20 MWh (euro per kWh)': 'Electricity_NonHousehold_LessThan_20MWh_EuroPerKWh',
    'Elektriciteitsprijs /Verbruiksklassen niet-huishoudens/20 tot 500 MWh (euro per kWh)': 'Electricity_NonHousehold_20to500MWh_EuroPerKWh',
    'Elektriciteitsprijs /Verbruiksklassen niet-huishoudens/500 tot 2 000 MWh (euro per kWh)': 'Electricity_NonHousehold_500to2000MWh_EuroPerKWh',
    'Elektriciteitsprijs /Verbruiksklassen niet-huishoudens/2 000 tot 20 000 MWh (euro per kWh)': 'Electricity_NonHousehold_2000to20000MWh_EuroPerKWh',
    'Elektriciteitsprijs /Verbruiksklassen niet-huishoudens/20 000 tot 70 000 MWh (euro per kWh)': 'Electricity_NonHousehold_20000to70000MWh_EuroPerKWh',
    'Elektriciteitsprijs /Verbruiksklassen niet-huishoudens/70 000 tot 150 000 MWh (euro per kWh)': 'Electricity_NonHousehold_70000to150000MWh_EuroPerKWh',
    'Elektriciteitsprijs /Verbruiksklassen niet-huishoudens/150 000 MWh en meer (euro per kWh)': 'Electricity_NonHousehold_MoreThan_150000MWh_EuroPerKWh',
    'Year': 'Year',
    'AveragePrice_Elec_Household': 'AveragePrice_Electricity_Household',
    'AveragePrice_Elec_NonHousehold': 'AveragePrice_Electricity_NonHousehold',
    'AveragePrice_NG_Household': 'AveragePrice_NaturalGas_Household',
    'AveragePrice_NG_NonHousehold': 'AveragePrice_NaturalGas_NonHousehold',
}

# Rename columns in the dataset
energyPriceNl.rename(columns=translations, inplace=True)

#energyPriceNl.columns.tolist()


In [18]:
translations = {
    'Totaalprijs': 'TotalPrice',
  'Leveringsprijs': 'DeliveryPrice',
    'Netwerkprijs': 'NetworkPrice'
}

# Apply the translations to the column
energyPriceNl['PriceComponents'] = energyPriceNl['PriceComponents'].map(translations)

# Verify the updated unique values
updated_price_components_unique = energyPriceNl['PriceComponents'].unique()
updated_price_components_unique


array(['TotalPrice', 'DeliveryPrice', 'NetworkPrice'], dtype=object)

In [25]:
energyPriceNl['Year'] = energyPriceNl['Periods'].str.extract(r'(\d{4})')

# Convert 'Year' column to integer if needed
energyPriceNl['Year'] = energyPriceNl['Year'].astype(int)

# Move "Year" column to the first position
cols = ['Year'] + [col for col in energyPriceNl.columns if col != 'Year']
energyPriceNl = energyPriceNl[cols]

In [27]:
energyPriceNl.head(4)

Unnamed: 0,Year,PriceComponents,Periods,NG_Household_LessThan_569m3_EuroPerM3,NG_Household_569to5687m3_EuroPerM3,NG_Household_MoreThan_5687m3_EuroPerM3,NG_NonHousehold_LessThan_28433m3_EuroPerM3,NG_NonHousehold_28433to284333m3_EuroPerM3,NG_NonHousehold_284333to2843332m3_EuroPerM3,NG_NonHousehold_2843332to28433324m3_EuroPerM3,...,Electricity_NonHousehold_20to500MWh_EuroPerKWh,Electricity_NonHousehold_500to2000MWh_EuroPerKWh,Electricity_NonHousehold_2000to20000MWh_EuroPerKWh,Electricity_NonHousehold_20000to70000MWh_EuroPerKWh,Electricity_NonHousehold_70000to150000MWh_EuroPerKWh,Electricity_NonHousehold_MoreThan_150000MWh_EuroPerKWh,AveragePrice_Electricity_Household,AveragePrice_Electricity_NonHousehold,AveragePrice_NaturalGas_Household,AveragePrice_NaturalGas_NonHousehold
0,2009,TotalPrice,2009 1e halfjaar,1.057,0.791,0.722,0.752,0.662,0.459,0.374,...,0.187,0.132,0.118,0.108,0.104,0.08,0.1424,0.134429,0.856667,0.5046
1,2009,TotalPrice,2009 2e halfjaar,1.031,0.658,0.572,0.622,0.612,0.407,0.33,...,0.147,0.127,0.116,0.099,0.094,0.072,0.1246,0.123714,0.753667,0.4422
2,2009,TotalPrice,2009,1.045,0.735,0.656,0.706,0.644,0.435,0.352,...,0.167,0.129,0.117,0.104,0.098,0.076,0.134,0.128857,0.812,0.4784
3,2010,TotalPrice,2010 1e halfjaar,0.943,0.664,0.593,0.621,0.545,0.39,0.296,...,0.175,0.12,0.106,0.091,0.087,0.074,0.1176,0.123429,0.733333,0.4156


In [28]:
energyPriceNl.to_csv('processed_data/energyPrice.csv', index = False)

## Population

In [29]:
yearly_pop_path = 'Population (x million).csv' 
yearly_pop = pd.read_csv(yearly_pop_path, delimiter = ';')
yearly_pop.head()

Unnamed: 0,Year,Population
0,1900,510
1,1901,516
2,1902,523
3,1903,531
4,1904,538


In [30]:
yearly_pop = yearly_pop[yearly_pop['Year']>= 2006]

In [31]:
yearly_pop.to_csv('processed_data/populationNL.csv', index = False)

## GDP

In [32]:
def preprocess_gdp(df):
    df.columns = ['Period', 'GDP']  

    # Extract the year as a separate column
    df['Year'] = df['Period'].str.extract(r'(\d{4})').astype(int)

    # Return the cleaned DataFrame
    return df

In [35]:
gdp_path = 'eurostatGDP.csv'  
gdp_data = pd.read_csv(gdp_path)

gdp_data_processed = preprocess_gdp(gdp_data)

gdp_data_processed = gdp_data_processed[gdp_data_processed['Year']>= 2006]

gdp_data_processed.head()


Unnamed: 0,Period,GDP,Year
44,2006-01-01,143560.7,2006
45,2006-04-01,146021.7,2006
46,2006-07-01,147711.3,2006
47,2006-10-01,150378.1,2006
48,2007-01-01,153095.2,2007


In [36]:
gdp_data_processed.to_csv('processed_data/GDP.csv', index = False)

## Renewable Energy

In [37]:
renewableEnergy_path = 'HernieuwbareEnergieVerbeuik.csv'
renewableEnergy = pd.read_csv(renewableEnergy_path, delimiter = ';')
renewableEnergy

Unnamed: 0,Bron/techniek,Energietoepassingen,Perioden,Bruto eindverbruik (TJ),Bruto eindverbruik relatief (% van totaal eindverbruik energie)
0,Totaal hern. energie exc. stat. overdr.,Totaal energietoepassingen,2000,34689.0,1.62
1,Totaal hern. energie exc. stat. overdr.,Totaal energietoepassingen,2001,36562.0,1.68
2,Totaal hern. energie exc. stat. overdr.,Totaal energietoepassingen,2002,40076.0,1.85
3,Totaal hern. energie exc. stat. overdr.,Totaal energietoepassingen,2003,40600.0,1.84
4,Totaal hern. energie exc. stat. overdr.,Totaal energietoepassingen,2004,47422.0,2.04
...,...,...,...,...,...
355,Totaal biomassa,Totaal energietoepassingen,2019,108445.0,5.22
356,Totaal biomassa,Totaal energietoepassingen,2020,120444.0,6.20
357,Totaal biomassa,Totaal energietoepassingen,2021,126740.0,6.32
358,Totaal biomassa,Totaal energietoepassingen,2022,110244.0,5.96


In [38]:
renewableEnergy['Perioden'] = renewableEnergy['Perioden'].str.replace('**', '', regex=False).str.strip()

In [39]:
# Pivot the data
pivoted_data = renewableEnergy.pivot_table(
    index='Perioden', 
    columns='Bron/techniek', 
    values='Bruto eindverbruik (TJ)',
    aggfunc='sum'  # Handle aggregation if needed
)

# Reset index to flatten the DataFrame
pivoted_data.reset_index(inplace=True)

# Rename columns to include meaningful names (optional)
pivoted_data.columns.name = None

In [40]:
pivoted_data

Unnamed: 0,Perioden,Bodemwarmte,Buitenluchtwarmte,Statistische overdracht,Totaal aardwarmte en bodemenergie,Totaal biomassa,Totaal hern. energie exc. stat. overdr.,Totaal hern. energie inc. stat. overdr.,Totaal windenergie,Totaal zonne-energie,"Totaal, inclusief niet-hernieuwbaar",Waterkracht,Windenergie op land,Windenergie op zee,Zonnestroom,Zonnewarmte
0,2000,156.0,23.0,0.0,156.0,30989.0,34689.0,34689.0,2678.0,482.0,2139719.0,362.0,2678.0,0.0,28.0,454.0
1,2001,187.0,28.0,0.0,187.0,32469.0,36562.0,36562.0,2958.0,553.0,2177170.0,367.0,2958.0,0.0,42.0,511.0
2,2002,244.0,44.0,0.0,244.0,35113.0,40076.0,40076.0,3672.0,634.0,2164264.0,369.0,3672.0,0.0,59.0,575.0
3,2003,311.0,59.0,0.0,311.0,34262.0,40600.0,40600.0,4892.0,715.0,2203715.0,361.0,4892.0,0.0,91.0,624.0
4,2004,452.0,72.0,0.0,452.0,39396.0,47422.0,47422.0,6347.0,795.0,2328387.0,360.0,6347.0,0.0,121.0,674.0
5,2005,628.0,81.0,0.0,628.0,47907.0,57143.0,57143.0,7321.0,847.0,2300710.0,361.0,7321.0,0.0,128.0,719.0
6,2006,840.0,90.0,0.0,840.0,52156.0,63478.0,63478.0,9145.0,886.0,2284781.0,361.0,8917.0,228.0,132.0,755.0
7,2007,1141.0,106.0,0.0,1141.0,60188.0,74123.0,74123.0,11396.0,934.0,2247857.0,358.0,10304.0,1092.0,135.0,798.0
8,2008,1488.0,195.0,0.0,1584.0,64823.0,82081.0,82081.0,14131.0,988.0,2282493.0,360.0,12154.0,1977.0,143.0,846.0
9,2009,1841.0,351.0,0.0,1983.0,73747.0,93660.0,93660.0,16131.0,1088.0,2195534.0,360.0,13543.0,2588.0,162.0,926.0


In [41]:
pivoted_data.columns.tolist()

['Perioden',
 'Bodemwarmte',
 'Buitenluchtwarmte',
 'Statistische overdracht',
 'Totaal aardwarmte en bodemenergie',
 'Totaal biomassa',
 'Totaal hern. energie exc. stat. overdr.',
 'Totaal hern. energie inc. stat. overdr.',
 'Totaal windenergie',
 'Totaal zonne-energie',
 'Totaal, inclusief niet-hernieuwbaar',
 'Waterkracht',
 'Windenergie op land',
 'Windenergie op zee',
 'Zonnestroom',
 'Zonnewarmte']

In [42]:
# Mapping of current column names to English names
column_rename_map = {
    'Year': 'Year',
    'Bodemwarmte': 'GeothermalHeat',
    'Buitenluchtwarmte': 'AmbientAirHeat',
    'Totaal aardwarmte en bodemenergie': 'TotalGeothermal&GroundEnergy',
    'Totaal biomassa': 'TotalBiomass',
    'Totaal hern. energie exc. stat. overdr.': 'TotalRes(excl.Stat.Transfer)',
    'Totaal hern. energie inc. stat. overdr.': 'TotalRes(incl.Stat.Transfer)',
    'Totaal zonne-energie': 'TotalSolarEnergy',
    'Totaal, inclusief niet-hernieuwbaar': 'Total Including Non-Renewable',
    'Waterkracht': 'Hydropower',
    'Windenergie op land': 'OnshoreWindEnergy',
    'Windenergie op zee': 'OffshoreWindEnergy',
    'Zonnestroom': 'SolarPower',
    'Zonnewarmte': 'SolarThermalEnergy'
}

# Rename the columns in the DataFrame
pivoted_data.rename(columns=column_rename_map, inplace=True)


In [43]:
pivoted_data.drop(columns=['Statistische overdracht'], inplace = True)

In [44]:
pivoted_data.rename(columns={'Perioden': 'Year'}, inplace=True)


In [45]:
pivoted_data['Year'] = pivoted_data['Year'].astype(int)

# Verify the conversion
print(pivoted_data.dtypes)


Year                               int32
GeothermalHeat                   float64
AmbientAirHeat                   float64
TotalGeothermal&GroundEnergy     float64
TotalBiomass                     float64
TotalRes(excl.Stat.Transfer)     float64
TotalRes(incl.Stat.Transfer)     float64
Totaal windenergie               float64
TotalSolarEnergy                 float64
Total Including Non-Renewable    float64
Hydropower                       float64
OnshoreWindEnergy                float64
OffshoreWindEnergy               float64
SolarPower                       float64
SolarThermalEnergy               float64
dtype: object


In [46]:
pivoted_data = pivoted_data[pivoted_data['Year']>= 2006]

In [47]:
pivoted_data.to_csv('processed_data/renwableEnergy.csv',index = False)

## Energy Balance Sheet

In [48]:
eneryBalance_path = 'WorldEnergyBalancesHighlights2024.xlsx'

sheet_names = pd.ExcelFile(eneryBalance_path).sheet_names
print(sheet_names)

['Contents', 'Definitions', 'IEA Energy balance explained', 'TimeSeries_1971-2023', 'PivotData', 'Interactive_PivotChart', 'CustomLists']


In [49]:
energyBalance = pd.read_excel(eneryBalance_path, sheet_name='TimeSeries_1971-2023', skiprows=1)  

energyBalance.head()

Unnamed: 0,Country,Product,Flow,NoCountry,NoProduct,NoFlow,1971,1972,1973,1974,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023 Provisional
0,Australia,"Coal, peat and oil shale",Production (PJ),01. Australia,"01. Coal, peat and oil shale",01. Production (PJ),1368.251838,1648.414248,1685.223676,1640.362098,...,11950.78794,12501.00154,12223.55369,12256.93808,12323.3258,12596.41582,12334.88697,11467.56954,11416.06592,10961.73829
1,Australia,"Coal, peat and oil shale",Imports (PJ),01. Australia,"01. Coal, peat and oil shale",02. Imports (PJ),0.0,0.0,0.0,0.0,...,5.240999,6.978797,6.829994,7.783701,12.364454,17.03485,19.2703,12.416851,15.0273,16.560904
2,Australia,"Coal, peat and oil shale",Exports (PJ),01. Australia,"01. Coal, peat and oil shale",03. Exports (PJ),-541.424001,-614.718001,-738.939398,-745.382946,...,-10166.8869,-10634.4478,-10552.5051,-10248.0298,-10333.4232,-10629.0366,-10504.852,-9866.188301,-9687.771298,-9149.148195
3,Australia,"Coal, peat and oil shale",Total energy supply (PJ),01. Australia,"01. Coal, peat and oil shale",04. Total energy supply (PJ),884.29747,923.250159,945.54188,928.32307,...,1736.5671,1797.035545,1863.051458,1996.773936,1930.457749,1956.712871,1782.618877,1683.993595,1567.046515,1624.067792
4,Australia,"Coal, peat and oil shale","Electricity, CHP and heat plants (PJ)",01. Australia,"01. Coal, peat and oil shale","05. Electricity, CHP and heat plants (PJ)",-516.538554,-569.92192,-577.900491,-617.948468,...,-1639.902404,-1704.823218,-1753.358738,-1723.631613,-1647.423349,-1608.995635,-1515.11305,-1467.357844,-1394.946887,..


In [50]:
energyBalanceNL = energyBalance[energyBalance['Country'] == 'Netherlands']  


In [51]:
energyBalanceNL.head()

Unnamed: 0,Country,Product,Flow,NoCountry,NoProduct,NoFlow,1971,1972,1973,1974,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023 Provisional
2700,Netherlands,"Coal, peat and oil shale",Production (PJ),26. Netherlands,"01. Coal, peat and oil shale",01. Production (PJ),98.70036,76.515537,47.568634,20.832407,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2701,Netherlands,"Coal, peat and oil shale",Imports (PJ),26. Netherlands,"01. Coal, peat and oil shale",02. Imports (PJ),118.914938,106.863561,122.962812,132.257265,...,405.150687,451.839205,430.540882,392.461333,346.803019,277.179847,165.992042,234.836666,240.834743,176.012507
2702,Netherlands,"Coal, peat and oil shale",Exports (PJ),26. Netherlands,"01. Coal, peat and oil shale",03. Exports (PJ),-65.999329,-68.289178,-58.394321,-42.472566,...,-18.306201,-3.545198,-2.228035,-3.68399,-1.516518,-2.460666,-7.880634,-1.477183,-4.55912,-13.808058
2703,Netherlands,"Coal, peat and oil shale",Total energy supply (PJ),26. Netherlands,"01. Coal, peat and oil shale",04. Total energy supply (PJ),141.59771,118.672782,119.997548,121.014325,...,377.096355,464.264865,430.238269,385.720078,345.919123,268.94421,172.031258,234.322887,231.346587,158.430442
2704,Netherlands,"Coal, peat and oil shale","Electricity, CHP and heat plants (PJ)",26. Netherlands,"01. Coal, peat and oil shale","05. Electricity, CHP and heat plants (PJ)",-44.24825,-37.302353,-33.012575,-26.357824,...,-281.555965,-363.041518,-329.00996,-282.240511,-246.068577,-170.240128,-86.491798,-142.664029,-143.80327,..


In [52]:
year_columns = [col for col in energyBalanceNL.columns if str(col).isdigit() and int(col) >= 2006]

# Create a new DataFrame with the filtered year columns and key columns
energyBalanceNL = energyBalanceNL[["Product", "Flow"] + year_columns]

energyBalanceNL

Unnamed: 0,Product,Flow,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
2700,"Coal, peat and oil shale",Production (PJ),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2701,"Coal, peat and oil shale",Imports (PJ),348.058067,372.143404,341.310846,310.644349,324.731189,316.048909,340.965163,341.266855,405.150687,451.839205,430.540882,392.461333,346.803019,277.179847,165.992042,234.836666,240.834743
2702,"Coal, peat and oil shale",Exports (PJ),-22.331901,-17.766799,-10.205999,-4.824596,-4.636998,-1.7417,-3.321698,-9.272498,-18.306201,-3.545198,-2.228035,-3.68399,-1.516518,-2.460666,-7.880634,-1.477183,-4.55912
2703,"Coal, peat and oil shale",Total energy supply (PJ),330.066855,352.10105,333.849583,312.413582,315.735661,311.470986,341.287207,340.142917,377.096355,464.264865,430.238269,385.720078,345.919123,268.94421,172.031258,234.322887,231.346587
2704,"Coal, peat and oil shale","Electricity, CHP and heat plants (PJ)",-226.623119,-242.900418,-229.648768,-226.724109,-221.246849,-209.08499,-238.501648,-246.3177,-281.555965,-363.041518,-329.00996,-282.240511,-246.068577,-170.240128,-86.491798,-142.664029,-143.80327
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2807,Total,Other final consumption (PJ),721.691858,714.926425,669.501274,696.471496,781.120357,722.860331,741.163501,702.118506,681.800367,675.486865,701.151108,739.868394,712.735598,680.710451,712.677331,707.705757,564.761011
6688,Fossil fuels,Electricity output (GWh),85745,91695,92238,96966,102386,95619,84937,84812,85848,89364,93329,93023.915,89339.829,92191.963,83977.437,75398.421,66744.139
6689,Nuclear,Electricity output (GWh),3469,4200,4169,4248,3969,4141,3915,2891,4091,4078,3960,3402.478,3514.77,3909.748,4087.363,3827.956,4156.313
6690,Renewable sources,Electricity output (GWh),8051,7583,9531,10837,11197,12322,12479,12106,11649,13688,14780,17438.536,18881.842,22803.19,32730.782,40355.69,48315.233


In [53]:
energyBalanceNL = energyBalanceNL[energyBalanceNL['Product'] != "Coal, peat and oil shale"]

In [54]:
unique_products = energyBalanceNL['Product'].unique()
unique_products

array(['Crude, NGL and feedstocks', 'Oil products', 'Natural gas',
       'Nuclear', 'Renewables and waste', 'Electricity', 'Heat', 'Total',
       'Fossil fuels', 'Renewable sources'], dtype=object)

In [55]:
def process_and_display_product_data(data, product_name):
    """
    Processes and displays the data for a specific product.
    
    Parameters:
        data (DataFrame): The original dataset.
        product_name (str): The product to process.
    """
    # Filter data for the given product
    product_data = data[data['Product'] == product_name]
    
    # Reshape the data
    reshaped_data = product_data.melt(
        id_vars=['Product', 'Flow'],  # Columns to keep
        var_name='Year',                        # Column for years
        value_name='Value'                      # Column for flow values
    )
    
    # Pivot the data to make Flow entries separate columns
    final_data = reshaped_data.pivot_table(
        index='Year',         # Years as rows
        columns='Flow',       # Each flow as a separate column
        values='Value',       # Values to populate the columns
        aggfunc='sum'         # In case of duplicate values
    ).reset_index()
    
    # Print the product name
    print(f"Data Frame for Product: {product_name}")
    display(final_data)
    return final_data


In [56]:
# Example usage for a single product
process_and_display_product_data(energyBalanceNL, 'Electricity')

Data Frame for Product: Electricity


Flow,Year,Commercial and public services (PJ),"Electricity, CHP and heat plants (PJ)",Exports (PJ),Imports (PJ),Industry (PJ),"Oil refineries, transformation (PJ)",Other final consumption (PJ),Production (PJ),Residential (PJ),Total energy supply (PJ),Total final consumption (PJ),Transport (PJ)
0,2006,124.657201,355.798798,-21.193201,98.445602,149.43277,0,20.682001,0,79.491598,77.252401,380.073969,5.810399
1,2007,127.501198,378.597603,-19.728001,83.120398,151.840775,0,22.672799,0,80.164802,63.392402,387.903574,5.724
2,2008,129.200398,387.187204,-32.8176,89.881201,150.773656,0,23.500801,0,81.003598,57.063601,390.324855,5.846401
3,2009,131.569202,409.284004,-38.019598,55.627198,131.312882,0,25.498801,0,82.3392,17.6076,376.760879,6.040799
4,2010,132.357601,429.368402,-46.108801,56.098799,140.767371,0,25.2864,0,82.778399,9.990002,387.511372,6.321599
5,2011,132.879599,410.266801,-41.511599,74.232002,140.438448,0,25.415999,0,82.954802,32.720399,387.956449,6.267602
6,2012,130.066509,371.642403,-54.165105,115.760603,125.853634,0,28.801403,0,84.107985,61.595498,375.176334,6.346799
7,2013,130.617094,365.867998,-54.054,119.707202,125.246112,0,30.124051,0,84.099452,65.653198,376.386709,6.299999
8,2014,128.123939,372.092405,-65.260799,118.278,119.425041,0,29.76917,0,82.185498,53.017201,365.681246,6.177598
9,2015,130.76408,396.7668,-79.24276,110.736033,123.786698,0,32.327999,0,79.855,31.493273,373.236289,6.502511


Flow,Year,Commercial and public services (PJ),"Electricity, CHP and heat plants (PJ)",Exports (PJ),Imports (PJ),Industry (PJ),"Oil refineries, transformation (PJ)",Other final consumption (PJ),Production (PJ),Residential (PJ),Total energy supply (PJ),Total final consumption (PJ),Transport (PJ)
0,2006,124.657201,355.798798,-21.193201,98.445602,149.43277,0,20.682001,0,79.491598,77.252401,380.073969,5.810399
1,2007,127.501198,378.597603,-19.728001,83.120398,151.840775,0,22.672799,0,80.164802,63.392402,387.903574,5.724
2,2008,129.200398,387.187204,-32.8176,89.881201,150.773656,0,23.500801,0,81.003598,57.063601,390.324855,5.846401
3,2009,131.569202,409.284004,-38.019598,55.627198,131.312882,0,25.498801,0,82.3392,17.6076,376.760879,6.040799
4,2010,132.357601,429.368402,-46.108801,56.098799,140.767371,0,25.2864,0,82.778399,9.990002,387.511372,6.321599
5,2011,132.879599,410.266801,-41.511599,74.232002,140.438448,0,25.415999,0,82.954802,32.720399,387.956449,6.267602
6,2012,130.066509,371.642403,-54.165105,115.760603,125.853634,0,28.801403,0,84.107985,61.595498,375.176334,6.346799
7,2013,130.617094,365.867998,-54.054,119.707202,125.246112,0,30.124051,0,84.099452,65.653198,376.386709,6.299999
8,2014,128.123939,372.092405,-65.260799,118.278,119.425041,0,29.76917,0,82.185498,53.017201,365.681246,6.177598
9,2015,130.76408,396.7668,-79.24276,110.736033,123.786698,0,32.327999,0,79.855,31.493273,373.236289,6.502511


In [57]:
elecBalance = process_and_display_product_data(energyBalanceNL, 'Electricity')

Data Frame for Product: Electricity


Flow,Year,Commercial and public services (PJ),"Electricity, CHP and heat plants (PJ)",Exports (PJ),Imports (PJ),Industry (PJ),"Oil refineries, transformation (PJ)",Other final consumption (PJ),Production (PJ),Residential (PJ),Total energy supply (PJ),Total final consumption (PJ),Transport (PJ)
0,2006,124.657201,355.798798,-21.193201,98.445602,149.43277,0,20.682001,0,79.491598,77.252401,380.073969,5.810399
1,2007,127.501198,378.597603,-19.728001,83.120398,151.840775,0,22.672799,0,80.164802,63.392402,387.903574,5.724
2,2008,129.200398,387.187204,-32.8176,89.881201,150.773656,0,23.500801,0,81.003598,57.063601,390.324855,5.846401
3,2009,131.569202,409.284004,-38.019598,55.627198,131.312882,0,25.498801,0,82.3392,17.6076,376.760879,6.040799
4,2010,132.357601,429.368402,-46.108801,56.098799,140.767371,0,25.2864,0,82.778399,9.990002,387.511372,6.321599
5,2011,132.879599,410.266801,-41.511599,74.232002,140.438448,0,25.415999,0,82.954802,32.720399,387.956449,6.267602
6,2012,130.066509,371.642403,-54.165105,115.760603,125.853634,0,28.801403,0,84.107985,61.595498,375.176334,6.346799
7,2013,130.617094,365.867998,-54.054,119.707202,125.246112,0,30.124051,0,84.099452,65.653198,376.386709,6.299999
8,2014,128.123939,372.092405,-65.260799,118.278,119.425041,0,29.76917,0,82.185498,53.017201,365.681246,6.177598
9,2015,130.76408,396.7668,-79.24276,110.736033,123.786698,0,32.327999,0,79.855,31.493273,373.236289,6.502511


In [58]:
elecBalance.to_csv('processed_data/elecBalance.csv', index = False)

In [59]:
unique_flow = energyBalanceNL['Flow'].unique()
unique_flow

array(['Production (PJ)', 'Imports (PJ)', 'Exports (PJ)',
       'Total energy supply (PJ)',
       'Electricity, CHP and heat plants (PJ)',
       'Oil refineries, transformation (PJ)',
       'Total final consumption (PJ)', 'Industry (PJ)', 'Transport (PJ)',
       'Residential (PJ)', 'Commercial and public services (PJ)',
       'Other final consumption (PJ)', 'Electricity output (GWh)'],
      dtype=object)

In [60]:
def reshape_total_final_consumption(data):
    """
    Reshapes the data to keep 'Total final consumption (PJ)' for all products 
    with Year as a column and products as separate columns.
    
    Parameters:
        data (DataFrame): The original dataset.
        
    Returns:
        DataFrame: Reshaped DataFrame with Year as rows and products as columns.
    """
    # Filter the data for 'Total final consumption (PJ)' flow
    filtered_data = data[data['Flow'] == 'Total final consumption (PJ)']
    
    # Melt the dataset to have years as rows
    reshaped_data = filtered_data.melt(
        id_vars=['Product'],           # Keep Product as identifier
        var_name='Year',               # Convert years to a single column
        value_name='Value'             # Values for the final consumption
    )
    
    # Pivot the data to have products as columns and years as rows
    final_data = reshaped_data.pivot_table(
        index='Year', 
        columns='Product', 
        values='Value', 
        aggfunc='sum'  # Handle duplicates if necessary
    ).reset_index()
    
    return final_data


In [61]:
final_consumption_df = reshape_total_final_consumption(energyBalanceNL)
final_consumption_df



Product,Year,"Crude, NGL and feedstocks",Electricity,Heat,Natural gas,Nuclear,Oil products,Renewables and waste,Total
0,2006,152.811999,380.073969,134.426999,898.165922,0,1008.011163,31.096825,2628.186508
1,2007,125.81602,387.903574,134.095999,846.718658,0,1014.433777,44.011474,2578.530834
2,2008,122.583998,390.324855,133.068001,885.718461,0,965.519794,43.982652,2562.009478
3,2009,118.55602,376.760879,119.710999,887.152712,0,952.343462,48.035018,2524.642987
4,2010,116.968017,387.511372,125.607002,1032.772229,0,1008.90288,42.37544,2732.806011
5,2011,120.683991,387.956449,119.317001,882.159823,0,952.904677,45.710628,2530.077378
6,2012,107.883997,375.176334,112.558999,922.146435,0,969.054823,46.413362,2557.834447
7,2013,111.648018,376.386709,112.764001,923.405782,0,915.891896,46.922037,2508.137131
8,2014,113.68799,365.681246,109.095,784.687381,0,865.024239,51.763699,2309.636765
9,2015,92.229192,373.236289,94.841186,809.116333,0,892.364206,50.699226,2333.886363


In [62]:
final_consumption_df = final_consumption_df[final_consumption_df['Year'] != "Flow"]

In [63]:
final_consumption_df.drop(columns= ['Nuclear'], inplace= True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_consumption_df.drop(columns= ['Nuclear'], inplace= True)


In [64]:
final_consumption_df.to_csv('processed_data/yearlyfinalConsPerSource.csv', index = False)

## Regions Energy Consumption

In [65]:
regions_data_path = 'resedentialRegionsEnergyData.csv'
regions_energy_data = pd.read_csv(regions_data_path, delimiter=';') 

In [66]:
regions_energy_data.head()

Unnamed: 0,Housing characteristics,Periods,Regions,Average consumption of natural gas (m3),Average supply of electricity (kWh)
0,Total dwellings,2010,The Netherlands,1850,3300
1,Total dwellings,2011,The Netherlands,1450,3250
2,Total dwellings,2012,The Netherlands,1500,3200
3,Total dwellings,2013,The Netherlands,1600,3150
4,Total dwellings,2014,The Netherlands,1200,3050


In [67]:
unique_regions = regions_energy_data["Regions"].unique()

num_regions = len(unique_regions)

unique_regions, num_regions

(array(['The Netherlands', 'Groningen (PV)', 'Fryslân (PV)',
        'Drenthe (PV)', 'Overijssel (PV)', 'Flevoland (PV)',
        'Gelderland (PV)', 'Utrecht (PV)', 'Noord-Holland (PV)',
        'Zuid-Holland (PV)', 'Zeeland (PV)', 'Noord-Brabant (PV)',
        'Limburg (PV)'], dtype=object),
 13)

In [68]:
regions_energy_data = regions_energy_data.rename(columns={
    "Periods": "Year",
    "Average consumption of natural gas (m3)": "AvgGasConsumptionResedetial",
    "Average supply of electricity (kWh)": "AvgElectricitySupplyResedential"
})

# Clean and convert numerical columns
regions_energy_data["AvgGasConsumptionResedetial"] = (
    regions_energy_data["AvgGasConsumptionResedetial"].astype(str).str.strip().replace({".": None, "": None}).astype(float)
)
regions_energy_data["AvgElectricitySupplyResedential"] = (
    regions_energy_data["AvgElectricitySupplyResedential"].astype(str).str.strip().replace({".": None, "": None}).astype(float)
)

# Extract numeric years
regions_energy_data["Year"] = regions_energy_data["Year"].astype(str).str.extract(r"(\d+)")[0].astype(int)


In [69]:
regions_energy_data.to_csv('processed_data/resedentialEnergyData(Provincies).csv', index = False)

## Electrciy Suppply Consumption Balance

In [70]:
ElecSupConsMonthly_path = 'ElectricitySupplyConsumptionMonthly.csv'
ElecSupConsMonthly = pd.read_csv(ElecSupConsMonthly_path, delimiter=';') 

In [71]:
ElecSupConsMonthly.head()

Unnamed: 0,Periods,Gross production (mln kWh),"Net production/Net production, total (mln kWh)",Net production/Nuclear energy (mln kWh),"Net production/Fuels/Fuels, total (mln kWh)",Net production/Fuels/Coal (mln kWh),Net production/Fuels/Petroleum products (mln kWh),Net production/Fuels/Natural gas (mln kWh),Net production/Fuels/Biomass (mln kWh),Net production/Fuels/Other fuels (non-renewable) (mln kWh),Net production/Hydro power (mln kWh),"Net production/Wind energy/Wind energy, total (mln kWh)",Net production/Wind energy/Wind energy on shore (mln kWh),Net production/Wind energy/Wind energy off shore (mln kWh),Net production/Solar photovoltaic (mln kWh),Net production/Other sources (mln kWh),"Imports/Imports, total (mln kWh)","Exports/Exports, total (mln kWh)",Net consumption (calculated) (mln kWh)
0,2015 January,10700,10301,364.0,8918,3771,96,4605,349,98,9,870,773,98,28,111,2432,1979,10263
1,2015 February,9695,9328,329.0,8310,3348,79,4464,328,91,12,504,440,65,58,116,2371,2110,9150
2,2015 March,9642,9302,363.0,8038,3249,49,4257,366,118,13,684,605,79,86,118,2729,2151,9427
3,2015 April,8602,8216,345.0,7211,3604,53,3099,342,113,12,420,376,44,133,94,2414,1805,8410
4,2015 May,7585,7248,132.0,6295,2982,97,2730,361,125,10,564,499,65,152,94,2782,1094,8520


In [72]:
ElecSupConsMonthly[['Year', 'Month']] = ElecSupConsMonthly['Periods'].str.split(' ', expand=True)

# Convert Year to integer for proper analysis
ElecSupConsMonthly['Year'] = ElecSupConsMonthly['Year'].astype(int)

# Drop the original "Periods" column
ElecSupConsMonthly = ElecSupConsMonthly.drop(columns=['Periods'])

# Reorder columns to have "Year" and "Month" at the front
column_order = ['Year', 'Month'] + [col for col in ElecSupConsMonthly.columns if col not in ['Year', 'Month']]
ElecSupConsMonthly = ElecSupConsMonthly[column_order]


In [73]:
ElecSupConsMonthly.head()

Unnamed: 0,Year,Month,Gross production (mln kWh),"Net production/Net production, total (mln kWh)",Net production/Nuclear energy (mln kWh),"Net production/Fuels/Fuels, total (mln kWh)",Net production/Fuels/Coal (mln kWh),Net production/Fuels/Petroleum products (mln kWh),Net production/Fuels/Natural gas (mln kWh),Net production/Fuels/Biomass (mln kWh),Net production/Fuels/Other fuels (non-renewable) (mln kWh),Net production/Hydro power (mln kWh),"Net production/Wind energy/Wind energy, total (mln kWh)",Net production/Wind energy/Wind energy on shore (mln kWh),Net production/Wind energy/Wind energy off shore (mln kWh),Net production/Solar photovoltaic (mln kWh),Net production/Other sources (mln kWh),"Imports/Imports, total (mln kWh)","Exports/Exports, total (mln kWh)",Net consumption (calculated) (mln kWh)
0,2015,January,10700,10301,364.0,8918,3771,96,4605,349,98,9,870,773,98,28,111,2432,1979,10263
1,2015,February,9695,9328,329.0,8310,3348,79,4464,328,91,12,504,440,65,58,116,2371,2110,9150
2,2015,March,9642,9302,363.0,8038,3249,49,4257,366,118,13,684,605,79,86,118,2729,2151,9427
3,2015,April,8602,8216,345.0,7211,3604,53,3099,342,113,12,420,376,44,133,94,2414,1805,8410
4,2015,May,7585,7248,132.0,6295,2982,97,2730,361,125,10,564,499,65,152,94,2782,1094,8520


In [74]:
missing_values = ElecSupConsMonthly.isnull().sum()

missing_values

Year                                                          0
Month                                                         0
Gross production (mln kWh)                                    0
Net production/Net production, total (mln kWh)                0
Net production/Nuclear energy (mln kWh)                       3
Net production/Fuels/Fuels, total (mln kWh)                   0
Net production/Fuels/Coal (mln kWh)                           0
Net production/Fuels/Petroleum products (mln kWh)             0
Net production/Fuels/Natural gas (mln kWh)                    0
Net production/Fuels/Biomass (mln kWh)                        0
Net production/Fuels/Other fuels (non-renewable) (mln kWh)    0
Net production/Hydro power (mln kWh)                          0
Net production/Wind energy/Wind energy, total (mln kWh)       0
Net production/Wind energy/Wind energy on shore (mln kWh)     0
Net production/Wind energy/Wind energy off shore (mln kWh)    0
Net production/Solar photovoltaic (mln k

In [75]:
ElecSupConsMonthly.to_csv('processed_data/ElecSupConsMonthly.csv', index = False)