In [11]:
import os
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings
import matplotlib.pyplot as plt
# import the seaborn stylesheet
import seaborn as sns
sns.set()

from camelsp import Bundesland, util


In [12]:
# Names of all entries in the Hessen base path that start with 'DE'
nuts = list(
    filter(
        lambda entry: entry.startswith('DE'),
        os.listdir(Bundesland('Hessen').base_path),
    )
)

In [13]:
# Print every key of the NUTS level 2 name mapping, one per line
for NUTS in util._NUTS_LVL2_NAMES:
    print(NUTS)

DE1
DE2
DE3
DE4
DE5
DE6
DE7
DE8
DE9
DEA
DEB
DEC
DED
DEE
DEF
DEG


In [14]:
def percentage_missing(time_series):
    """
    Calculate the percentage of missing values in a time series between the first and last non-NaN value.

    Leading and trailing NaN values are ignored, so only gaps inside the
    observed period count as missing.

    Args:
        time_series (np.ndarray): Time series with NaN values. Any 1D
            array-like that can be converted to float is accepted.
    Returns:
        percentage_missing (float): Percentage of missing values (0-100).
            NaN is returned if the series is empty or has no valid value,
            because no observed period exists to measure gaps in.
    """
    values = np.asarray(time_series, dtype=float)

    # Positions of all valid (non-NaN) entries, computed once
    valid_positions = np.where(~np.isnan(values))[0]
    if valid_positions.size == 0:
        # Empty or all-NaN input: indexing [0] / [-1] below would raise IndexError
        return np.nan

    # Drop the NaN values before the first and after the last valid value
    observed_period = values[valid_positions[0]:valid_positions[-1] + 1]

    # Share of NaN values inside the observed period, in percent
    n_missing = np.sum(np.isnan(observed_period))
    return 100 * n_missing / len(observed_period)

In [33]:
# For every NUTS level 2 key: compute the percentage of gaps in the 'q' and 'p'
# columns of each station and pass the result to the Bundesland metadata update.
for NUTS in util._NUTS_LVL2_NAMES.keys():
    print(NUTS)
    # process this federal state
    with Bundesland(NUTS) as bl:
        # get meta
        meta = bl.metadata

        # one entry per camels_id, in the same order as meta.camels_id
        q_gaps = []
        p_gaps = []
        with warnings.catch_warnings(record=True) as warn:
            # Record every warning. With the default filter action, repeated
            # identical messages from the same line are recorded only once,
            # which would make the count printed below too small.
            warnings.simplefilter("always")

            for camels_id in tqdm(meta.camels_id.values):
                # NaN marks stations without usable data (empty data,
                # duplicated index, or any error while loading/processing)
                q_missing = np.nan
                p_missing = np.nan
                try:
                    data = bl.get_data(camels_id)
                    # only keep data which is not empty and whose index has no duplicates
                    if not data.empty and not data.index.duplicated().any():
                        q_missing = percentage_missing(data['q'].values)
                        p_missing = percentage_missing(data['p'].values)
                except Exception as e:
                    # Some other problem, likely file not found
                    warnings.warn(str(e))
                    # discard a partial result so both columns stay consistent
                    q_missing = np.nan
                    p_missing = np.nan

                q_gaps.append(q_missing)
                p_gaps.append(p_missing)

            # all collected, build the table now
            gaps = pd.DataFrame({'camels_id': meta.camels_id.values, 'q_gaps': q_gaps, 'p_gaps': p_gaps})
            # update
            bl.update_metadata(gaps)

            if len(warn) > 0:
                print(f"There were {len(warn)} warnings (missing data files).")


DE1


100%|██████████| 259/259 [00:05<00:00, 49.70it/s]


DE2


100%|██████████| 540/540 [00:13<00:00, 40.89it/s]


DE3


0it [00:00, ?it/s]


DE4


100%|██████████| 382/382 [00:05<00:00, 65.94it/s] 


DE5


0it [00:00, ?it/s]


DE6


0it [00:00, ?it/s]


DE7


100%|██████████| 97/97 [00:02<00:00, 44.20it/s]


DE8


100%|██████████| 235/235 [00:03<00:00, 67.69it/s]


DE9


100%|██████████| 282/282 [00:04<00:00, 60.68it/s]


DEA


100%|██████████| 437/437 [00:06<00:00, 69.34it/s] 


DEB


0it [00:00, ?it/s]


DEC


100%|██████████| 56/56 [00:00<00:00, 60.02it/s]


DED


100%|██████████| 282/282 [00:04<00:00, 57.53it/s]


DEE


100%|██████████| 252/252 [00:04<00:00, 59.88it/s]


DEF


100%|██████████| 775/775 [00:07<00:00, 106.67it/s]


DEG


100%|██████████| 63/63 [00:01<00:00, 34.28it/s]




