# Data Preparation and Merging Pipeline

## Step 1: Load Libraries and Data

In [None]:

import pandas as pd
import numpy as np

# Read every source CSV into its own DataFrame. The names in the first
# parenthesised group are bound, in order, to the files in the second.
# NOTE(review): `nao` is not referenced by the later cells shown in this
# notebook, and "renwableEnergy.csv" is kept exactly as spelled here --
# confirm both against the data directory before changing anything.
(
    weather,
    yearlyFinalConsPerSource,
    elecBalance,
    energyPrice,
    GDP,
    populationNL,
    renewableEnergy,
    nao,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/mnt/data/weather.csv",
        "/mnt/data/yearlyfinalConsPerSource.csv",
        "/mnt/data/elecBalance.csv",
        "/mnt/data/energyPrice.csv",
        "/mnt/data/GDP.csv",
        "/mnt/data/populationNL.csv",
        "/mnt/data/renwableEnergy.csv",
        "/mnt/data/nao.csv",
    )
)


## Step 2: Preprocess Data

### 2.1 Transform Hourly Data to Daily Data

In [None]:

# Collapse the hourly weather readings into one row per calendar day by
# summing every numeric column.
weather['Date'] = pd.to_datetime(weather['Date'])

# Group on the timestamp truncated to midnight rather than on `.dt.date`:
# the key stays datetime64, so a later merge on 'Date' against another
# datetime64 'Date' column matches dtypes instead of failing on an
# object-vs-datetime key.
# NOTE(review): assumes the timestamps are timezone-naive -- confirm.
day_key = weather['Date'].dt.normalize()

# numeric_only=True keeps the original timestamp column (and any other
# non-numeric column) out of the sum: datetimes cannot be summed, and a
# surviving 'Date' column would clash with the key restored by reset_index().
# reset_index() names the key column 'Date' after the grouping Series, so no
# rename is needed afterwards.
daily_weather = weather.groupby(day_key).sum(numeric_only=True).reset_index()


### 2.2 Standardize Yearly Data

In [None]:

# Cast the 'Year' column of every yearly table to int. The assignment writes
# back into the loaded DataFrames themselves, so the change is visible through
# their original names as well as through this list.
datasets_yearly = [
    yearlyFinalConsPerSource,
    elecBalance,
    energyPrice,
    GDP,
    populationNL,
    renewableEnergy,
]
for dataset in datasets_yearly:
    dataset['Year'] = dataset['Year'].astype(int)


## Step 3: Merge Datasets

### 3.1 Merge Daily Datasets

In [None]:

# TODO: merge daily_weather with the daily load-consumption data.
# Placeholder only: `load_consumption` is not loaded anywhere in this notebook yet.
# daily_data = pd.merge(daily_weather, load_consumption, on='Date', how='inner')


### 3.2 Merge Yearly Datasets

In [None]:

# Build one wide yearly table by outer-joining the yearly frames on 'Year',
# starting from GDP. The outer join keeps a year even when only some of the
# sources have a row for it.
# NOTE(review): elecBalance gets its 'Year' column cast together with the
# other yearly tables but is not part of this join -- confirm that is intended.
yearly_data = (
    GDP
    .merge(renewableEnergy, on='Year', how='outer')
    .merge(populationNL, on='Year', how='outer')
    .merge(yearlyFinalConsPerSource, on='Year', how='outer')
    .merge(energyPrice, on='Year', how='outer')
)


### 3.3 Upsample Yearly Data to Daily Data

In [None]:

# Expand the yearly table to one row per calendar day: each year's values are
# stamped on 1 January and carried forward until the next year's row.
yearly_data['Date'] = pd.to_datetime(yearly_data['Year'], format='%Y')  # 1 January of each year

# reindex(method='ffill') requires a sorted index.
yearly_by_date = yearly_data.set_index('Date').sort_index()

# Run the daily range through 31 December of the last year. Stopping at the
# last row's own timestamp (1 January) would leave the final year with a
# single daily row, and an inner join on 'Date' with daily data would then
# drop the rest of that year.
last_day = pd.Timestamp(year=yearly_by_date.index.max().year, month=12, day=31)
all_days = pd.date_range(yearly_by_date.index.min(), last_day, freq='D', name='Date')
daily_from_yearly = yearly_by_date.reindex(all_days, method='ffill').reset_index()


### 3.4 Combine Daily and Upsampled Yearly Data

In [None]:

# Inner-join the daily weather table with the daily-expanded yearly table on
# 'Date'; only dates present in both tables are kept.
final_data = daily_weather.merge(daily_from_yearly, how='inner', on='Date')


## Step 4: Validation and Export

In [None]:

# Inspect the final table, report missing and duplicate entries, then save it.
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
final_data.info()
print(final_data.head())

# Counts backing the "no missing or duplicate values" check.
print("Missing values per column:")
print(final_data.isna().sum())
print("Duplicate rows:", final_data.duplicated().sum())

# Save the final dataset
final_data.to_csv("/mnt/data/final_dataset.csv", index=False)
