# Understanding the Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Show every column when a frame is displayed (no column truncation).
pd.options.display.max_columns = None

# Read both CSV files, asking pandas to parse the 'time' / 'dt_iso' columns as dates.
energy_df = pd.read_csv(
    '../Data/energy_dataset.csv',
    parse_dates=['time'],
)
weather_df = pd.read_csv(
    '../Data/weather_features.csv',
    parse_dates=['dt_iso'],
)

# Report the (rows, columns) shape of each frame, one per line.
print(f"Energy: {energy_df.shape}", f"Weather: {weather_df.shape}", sep="\n")

Energy: (35064, 29)
Weather: (178396, 17)


In [5]:
# List the column names of both frames, separated by a dashed rule.
print(
    "ENERGY COLUMNS:",
    energy_df.columns.tolist(),
    "\n-------------------\n",
    "WEATHER COLUMNS:",
    weather_df.columns.tolist(),
    sep="\n",
)

ENERGY COLUMNS:
['time', 'generation biomass', 'generation fossil brown coal/lignite', 'generation fossil coal-derived gas', 'generation fossil gas', 'generation fossil hard coal', 'generation fossil oil', 'generation fossil oil shale', 'generation fossil peat', 'generation geothermal', 'generation hydro pumped storage aggregated', 'generation hydro pumped storage consumption', 'generation hydro run-of-river and poundage', 'generation hydro water reservoir', 'generation marine', 'generation nuclear', 'generation other', 'generation other renewable', 'generation solar', 'generation waste', 'generation wind offshore', 'generation wind onshore', 'forecast solar day ahead', 'forecast wind offshore eday ahead', 'forecast wind onshore day ahead', 'total load forecast', 'total load actual', 'price day ahead', 'price actual']

-------------------

WEATHER COLUMNS:
['dt_iso', 'city_name', 'temp', 'temp_min', 'temp_max', 'pressure', 'humidity', 'wind_speed', 'wind_deg', 'rain_1h', 'rain_3h', 'sn

In [8]:
# Null count per energy column, highest first; show the top ten.
(
    energy_df
    .isna()
    .sum()
    .sort_values(ascending=False)
    .iloc[:10]
)


generation hydro pumped storage aggregated     35064
forecast wind offshore eday ahead              35064
total load actual                                 36
generation hydro run-of-river and poundage        19
generation hydro pumped storage consumption       19
generation waste                                  19
generation marine                                 19
generation fossil oil                             19
generation biomass                                19
generation fossil peat                            18
dtype: int64

In [9]:

# Null count per weather column, highest first; show the top ten.
(
    weather_df
    .isna()
    .sum()
    .sort_values(ascending=False)
    .iloc[:10]
)

dt_iso        0
city_name     0
temp          0
temp_min      0
temp_max      0
pressure      0
humidity      0
wind_speed    0
wind_deg      0
rain_1h       0
dtype: int64

In [10]:
# Data cleansing

# Drop the two columns that are null in every row (35064 of 35064 in the
# missing-value counts above), so they carry no information.
# errors='ignore' keeps this cell re-runnable: without it, a second run raises
# KeyError because energy_df no longer contains these columns.
energy_df = energy_df.drop(
    columns=[
        'generation hydro pumped storage aggregated',
        'forecast wind offshore eday ahead',
    ],
    errors='ignore',
)

# Drop rows where 'total load actual' is missing (36 rows in this dataset).
energy_df = energy_df.dropna(subset=['total load actual'])

print(f"Energy after cleaning: {energy_df.shape}")

Energy after cleaning: (35028, 27)


In [12]:
# Distinct city labels present in the weather data.
# NOTE: the recorded output shows ' Barcelona' with a leading space; strip
# whitespace from city_name before grouping or joining on it.
weather_df.loc[:, 'city_name'].unique()

<StringArray>
['Valencia', 'Madrid', 'Bilbao', ' Barcelona', 'Seville']
Length: 5, dtype: str

In [13]:
# Summary statistics (count, mean, std, quartiles, min/max) for actual load and actual price.
energy_df.loc[:, ['total load actual', 'price actual']].describe()

Unnamed: 0,total load actual,price actual
count,35028.0,35028.0
mean,28696.939905,57.888661
std,4574.98795,14.192252
min,18041.0,9.33
25%,24807.75,49.3575
50%,28901.0,58.02
75%,32192.0,68.0
max,41015.0,116.8
