In [1]:
import os
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings
import matplotlib.pyplot as plt
# import the seaborn stylesheet
import seaborn as sns
sns.set()

from camelsp import Bundesland, util


In [12]:
#nuts = [nt for nt in os.listdir(Bundesland('Hessen').base_path) if nt.startswith('DE')]

In [2]:
# Show every NUTS level-2 code that camelsp knows about, one per line.
# Iterating the mapping directly yields its keys.
for NUTS in util._NUTS_LVL2_NAMES:
    print(NUTS)

DE1
DE2
DE3
DE4
DE5
DE6
DE7
DE8
DE9
DEA
DEB
DEC
DED
DEE
DEF
DEG


In [3]:
def percentage_missing(time_series):
    """
    Calculate the percentage of missing values in a time series between the first and last non-NaN value.

    Leading and trailing NaN values are trimmed before counting, so only NaN
    values enclosed by valid observations are treated as gaps.

    Args:
        time_series (np.ndarray): Time series with NaN values. Any sequence
            that can be converted to a float array is accepted.
    Returns:
        percentage_missing (float): Percentage of missing values (0 to 100)
            within the span from the first to the last valid value.
    Raises:
        ValueError: If the time series is empty or contains only NaN values,
            because no first/last valid value exists in that case.
    """
    values = np.asarray(time_series, dtype=float)

    # Positions of all valid (non-NaN) observations; computed once and reused
    # for both ends of the trimmed window.
    valid_positions = np.where(~np.isnan(values))[0]
    if valid_positions.size == 0:
        # Without this guard the indexing below fails with an opaque IndexError.
        raise ValueError("Time series is empty or contains only NaN values.")

    # Drop everything before the first and after the last valid value
    trimmed = values[valid_positions[0]:valid_positions[-1] + 1]

    # Count the missing values that remain inside the trimmed window
    missing_values = np.sum(np.isnan(trimmed))

    # Express the gap count as a percentage of the trimmed length
    return float(100 * missing_values / len(trimmed))

In [23]:
# Gap percentage recorded for a station whose series could not be evaluated
# (load error, empty file, duplicated index, or a failing gap computation).
FAILED_GAP = 100

for NUTS in util._NUTS_LVL2_NAMES.keys():
    print(NUTS)
    # process this federal state
    with Bundesland(NUTS) as bl:
        # get meta
        meta = bl.metadata

        # One entry per camels_id, appended in the order of meta.camels_id so
        # the lists line up with the metadata rows when the frame is built.
        q_gaps = []
        w_gaps = []
        with warnings.catch_warnings(record=True) as warn:
            for camels_id in tqdm(meta.camels_id.values):
                # load data
                try:
                    data = bl.get_data(camels_id)
                except Exception as e:
                    # best effort: record the problem and mark the station as failed
                    warnings.warn(f"{camels_id};{type(e)};{str(e)}")
                    q_gaps.append(FAILED_GAP)
                    w_gaps.append(FAILED_GAP)
                    continue

                # is file empty
                if data.empty:
                    warnings.warn(f"{camels_id};EmptyFile;The data file is empty.")
                    q_gaps.append(FAILED_GAP)
                    w_gaps.append(FAILED_GAP)
                    continue

                # duplicates ?
                if data.index.duplicated().any():
                    warnings.warn(f"{camels_id};DuplicatedIndex;The data file has index duplicates.")
                    q_gaps.append(FAILED_GAP)
                    w_gaps.append(FAILED_GAP)
                    continue

                # The q and w branches are intentionally kept as two separate
                # warn() call sites: recorded warnings are de-duplicated per
                # (message, line), so a shared call site could drop the second
                # warning when both variables fail with the same message.

                # go for q
                try:
                    q_missing = percentage_missing(data.q.values)
                    q_gaps.append(q_missing)
                except Exception as e:
                    warnings.warn(f"{camels_id};{type(e)};{str(e)}")
                    q_gaps.append(FAILED_GAP)

                # go for w
                try:
                    w_missing = percentage_missing(data.w.values)
                    w_gaps.append(w_missing)
                except Exception as e:
                    warnings.warn(f"{camels_id};{type(e)};{str(e)}")
                    w_gaps.append(FAILED_GAP)

            # all collected, build the per-station gap table
            # NOTE(review): the w gaps are stored under the key 'p_gaps'; this
            # looks like it was meant to be 'w_gaps'. Left unchanged because the
            # key becomes a metadata column - confirm before renaming.
            gaps = pd.DataFrame({'camels_id': meta.camels_id.values, 'q_gaps': q_gaps, 'p_gaps': w_gaps})

            # update
            bl.update_metadata(gaps)

            if len(warn) > 0:
                bl.save_warnings(warns=warn, posfix='_gaps')
                print(f"There were {len(warn)} warnings (missing data files).")


# reload the metadata and display it as the cell output
metadata = util.get_metadata()
metadata

DE1


100%|██████████| 259/259 [00:04<00:00, 56.62it/s]


DE2


100%|██████████| 540/540 [00:11<00:00, 48.60it/s]


DE3


0it [00:00, ?it/s]


DE4


100%|██████████| 382/382 [00:04<00:00, 78.07it/s] 


DE5


0it [00:00, ?it/s]


DE6


0it [00:00, ?it/s]


DE7


100%|██████████| 97/97 [00:01<00:00, 51.73it/s]


DE8


100%|██████████| 235/235 [00:03<00:00, 78.27it/s]


DE9


100%|██████████| 282/282 [00:04<00:00, 68.20it/s]


DEA


100%|██████████| 437/437 [00:05<00:00, 80.19it/s] 


DEB


0it [00:00, ?it/s]


DEC


100%|██████████| 56/56 [00:00<00:00, 68.37it/s]


DED


100%|██████████| 282/282 [00:04<00:00, 69.55it/s]


DEE


100%|██████████| 252/252 [00:03<00:00, 71.94it/s] 


DEF


100%|██████████| 775/775 [00:05<00:00, 132.61it/s]


DEG


100%|██████████| 63/63 [00:01<00:00, 44.56it/s]






Unnamed: 0,camels_id,provider_id,camels_path,nuts_lvl2,federal_state,area,x,y,q_count,w_count,q_gaps,p_gaps
0,DEG10000,573000,./DEG/DEG10000/DEG10000_data.csv,DEG,Thüringen,182.7,4.352221e+06,3124617.000,29646.0,29646.0,6.159347,0.000000
1,DEG10010,447000,./DEG/DEG10010/DEG10010_data.csv,DEG,Thüringen,275.0,4.318941e+06,3140875.000,0.0,0.0,1.000000,1.000000
2,DEG10020,574200,./DEG/DEG10020/DEG10020_data.csv,DEG,Thüringen,174.7,4.386764e+06,3077926.000,35490.0,35490.0,4.116653,0.000000
3,DEG10030,576500,./DEG/DEG10030/DEG10030_data.csv,DEG,Thüringen,1383.0,4.473276e+06,3073272.000,12845.0,12845.0,9.618763,8.758272
4,DEG10040,570210,./DEG/DEG10040/DEG10040_data.csv,DEG,Thüringen,1013.0,4.442190e+06,3033884.000,21246.0,21246.0,0.000000,3.522350
...,...,...,...,...,...,...,...,...,...,...,...,...
3655,DE215350,56113404,./DE2/DE215350/DE215350_data.csv,DE2,Bayern,8.2,4.456659e+06,3009715.476,16497.0,16497.0,0.000000,0.000000
3656,DE215360,56114000,./DE2/DE215360/DE215360_data.csv,DE2,Bayern,14.1,4.457008e+06,3011664.641,20880.0,20880.0,0.000000,0.000000
3657,DE215370,56122008,./DE2/DE215370/DE215370_data.csv,DE2,Bayern,84.3,4.465052e+06,3016792.655,23072.0,23072.0,0.000000,0.000000
3658,DE215380,56143008,./DE2/DE215380/DE215380_data.csv,DE2,Bayern,92.4,4.462157e+06,3021202.262,23440.0,23440.0,0.000000,0.000000


In [22]:
# Debug check: q_gaps is re-created for every federal state inside the gap
# loop, so this only reflects whichever state was processed last.
# NOTE(review): this cell has an earlier execution count than the loop cell,
# so the saved output presumably stems from a previous run - confirm.
len(q_gaps)

233