# Elhub API data - Gridloss - Summerproject 2025

## Missing values

Bjørn Eirik Rognskog Nordbak

### Importing data from Elhub API
https://api.elhub.no/energy-data-api#/grid-areas

In [None]:
import requests
import pandas as pd
from datetime import datetime, timedelta
from zoneinfo import ZoneInfo

# Europe/Oslo time zone, attached to the datetimes that bound the download span.
oslo = ZoneInfo("Europe/Oslo")

def fetch_window(start_dt, end_dt, timeout=60):
    """Download the LOSS_PER_MGA_HOUR dataset for one time window.

    Args:
        start_dt: Window start; sent as ``startDate`` via ``isoformat()``.
        end_dt: Window end; sent as ``endDate`` via ``isoformat()``.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error.

    Returns:
        A DataFrame with one row per entry of each grid area's
        ``attributes.lossPerMgaHour`` list, plus the grid area's ``eic``,
        ``name`` and ``status`` columns. An empty DataFrame is returned when
        the response carries no ``"data"`` value.
    """
    params = {
        "dataset": "LOSS_PER_MGA_HOUR",
        "startDate": start_dt.isoformat(),
        "endDate": end_dt.isoformat(),
    }
    url = "https://api.elhub.no/energy-data/v0/grid-areas"
    # Without a timeout, requests waits indefinitely on a stalled connection
    # and the caller would block forever.
    resp = requests.get(url, params=params, timeout=timeout)
    obj = resp.json()

    # Safeguard: a response without "data" is reported and skipped.
    raw = obj.get("data")
    if raw is None:
        print(f"  → no 'data' for {start_dt.date()} → {end_dt.date()}, skipping")
        return pd.DataFrame()

    # Flatten the nested hourly records into rows and carry the grid-area
    # identifiers along as columns. errors="ignore" tolerates records where
    # one of the meta keys is absent.
    df = pd.json_normalize(
        raw,
        record_path=["attributes", "lossPerMgaHour"],
        meta=[
            ["attributes", "eic"],
            ["attributes", "name"],
            ["attributes", "status"],
        ],
        errors="ignore",
    ).rename(columns={
        "attributes.eic": "eic",
        "attributes.name": "name",
        "attributes.status": "status",
    })
    return df

# Walk the full span in consecutive windows of at most 7 days and collect
# one DataFrame per window.
span_start = datetime(2023, 1, 1, 0, 0, tzinfo=oslo)
span_end = datetime(2025, 6, 1, 0, 0, tzinfo=oslo)
window = timedelta(days=7)

all_chunks = []
cur = span_start
while cur < span_end:
    # Clip the final window so it never runs past the end of the span.
    nxt = cur + window
    if nxt > span_end:
        nxt = span_end
    print(f"Fetching {cur.date()} → {nxt.date()}")
    all_chunks.append(fetch_window(cur, nxt))
    cur = nxt

# Stack all windows into a single frame with a fresh 0..n-1 index.
big_df = pd.concat(all_chunks, ignore_index=True)


### Reading data from premade CSV file

In [None]:
import pandas as pd

# Load the premade dataset from 'big_df.csv' (path relative to the working
# directory) instead of downloading it from the API again.
big_df = pd.read_csv('big_df.csv')

# Preview the first rows.
big_df.head()

In [None]:
# 1. Summary with .info()
#    Prints each column's dtype and non-null count, so columns with
#    missing values stand out at a glance.
big_df.info()

In [None]:
# 2. Number of missing (NaN/None) entries in each column
missing_counts = big_df.isna().sum()
print(missing_counts)

### Check for temporal gaps

In [None]:
import pandas as pd

# 1) Parse the timestamps as UTC and sort chronologically within each eic
df = big_df.copy()
df['startTime'] = pd.to_datetime(df['startTime'], utc=True)
df = df.sort_values(['eic', 'startTime'])

# 2) Time elapsed since the previous record of the same eic
#    (NaT for the first record of each eic)
df['delta'] = df.groupby('eic')['startTime'].diff()

# 3) Keep the rows whose step from the previous record is not exactly 1 hour
gaps = df[(df['delta'].notna()) & (df['delta'] != pd.Timedelta(hours=1))].copy()

# 4) Recover the timestamp of the record right before each gap from the delta.
#    Shifting inside `gaps` would instead return the previous *gap row*,
#    because the regular rows have already been filtered out.
gaps['prevTime'] = gaps['startTime'] - gaps['delta']
gaps['gapHours'] = gaps['delta'].dt.total_seconds() / 3600

# 5) Show the gaps
gaps[['eic', 'prevTime', 'startTime', 'gapHours']].reset_index(drop=True)


In [None]:
import pandas as pd

# 1) Parse the timestamps as UTC and sort chronologically within each name
df = big_df.copy()
df['startTime'] = pd.to_datetime(df['startTime'], utc=True)
df = df.sort_values(['name', 'startTime'])

# 2) Time elapsed since the previous record of the same name
#    (NaT for the first record of each name)
df['delta_name'] = df.groupby('name')['startTime'].diff()

# 3) Keep the rows whose step from the previous record is not exactly 1 hour
gaps_name = df[(df['delta_name'].notna()) & (df['delta_name'] != pd.Timedelta(hours=1))].copy()

# 4) Recover the timestamp of the record right before each gap from the delta.
#    Shifting inside `gaps_name` would instead return the previous *gap row*,
#    because the regular rows have already been filtered out.
gaps_name['prevTime'] = gaps_name['startTime'] - gaps_name['delta_name']
gaps_name['gapHours'] = gaps_name['delta_name'].dt.total_seconds() / 3600

# 5) Show the gaps
gaps_name[['name', 'prevTime', 'startTime', 'gapHours']].reset_index(drop=True)


In [None]:
import pandas as pd

# Let pandas render up to 140 rows before it truncates the output.
pd.set_option('display.max_rows', 140)

# Relies on `gaps_name` from the per-name gap detection.
# A gap runs from the last record before it to the first record after it.
# The start is derived from the row's own delta, so it is always the record
# immediately preceding `startTime`.
gaps_name['gap_start'] = gaps_name['startTime'] - gaps_name['delta_name']
gaps_name['gap_end'] = gaps_name['startTime']

# One summary row per name: number of gaps, their summed length, and the
# earliest gap start / latest gap end. Names with the most gaps come first.
gap_overview = (
    gaps_name
    .groupby('name')
    .agg(
        n_gaps=('gapHours', 'count'),
        total_gap_hours=('gapHours', 'sum'),
        first_gap_start=('gap_start', 'min'),
        last_gap_end=('gap_end', 'max'),
    )
    .reset_index()
    .sort_values(by='n_gaps', ascending=False)
)

# Round the summed gap length to one decimal for readability.
gap_overview['total_gap_hours'] = gap_overview['total_gap_hours'].round(1)

# Display
print("\n--- Missing Windows Overview ---")
gap_overview


In [None]:
import pandas as pd

# Let pandas render up to 280 rows before it truncates the output.
pd.set_option('display.max_rows', 280)

# Relies on `gaps_name` from the per-name gap detection.
# The gap start is derived from the row's own delta, so it is always the
# record immediately preceding `startTime`.
gaps_name['gap_start'] = gaps_name['startTime'] - gaps_name['delta_name']
gaps_name['gap_end'] = gaps_name['startTime']

# One row per gap, ordered by name and then chronologically.
missing_windows = (
    gaps_name[['name', 'gap_start', 'gap_end', 'gapHours']]
    .sort_values(by=['name', 'gap_start'])
    .reset_index(drop=True)
    .rename(columns={'gapHours': 'gap_duration_hours'})
)

# Show result
print("\n--- All Missing Time Windows ---")
display(missing_windows)


### Statistics on the data

In [None]:
# 1. Basic descriptive stats for all numeric columns
#    (count, mean, std, min, quartiles and max, as reported by describe()).
stats = big_df.describe()
print(stats)

In [None]:
import matplotlib.pyplot as plt

# Draw one box per numeric column so outliers are easy to spot.
numeric_cols = big_df.select_dtypes(include='number')
numeric_cols.plot(kind='box', figsize=(8, 6))
plt.title('Boxplots of Numeric Features')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Restrict to numeric columns before correlating. Selecting the dtypes
# explicitly works on every pandas version, unlike corr(numeric_only=True),
# which needs pandas >= 1.5.
num_df = big_df.select_dtypes(include=[np.number])
corr = num_df.corr()

# Annotated heatmap of the pairwise correlations.
plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt=".2f", square=True)
plt.title('Correlation Matrix (numeric only)')
plt.show()
