# Libraries

## I/O

In [4]:
import os
from pprint import pprint
from util_IO import (
    load_pickle_from_main_project_dir,
    EDA_dirs_structure,
    load_attributes_df,
    load_timeseries_df
)

## Analysis

In [5]:
import pandas as pd

# Settings

## Packages

In [6]:
# Raise pandas' display limits: show up to 300 columns and up to 1000 rows
display_limits = {
    'display.max_columns': 300,
    'display.max_rows': 1000,
}
for option_name, option_value in display_limits.items():
    pd.set_option(option_name, option_value)

## Parameters

### Load metadata from previous step (*1-DataAggregation.ipynb*)

In [7]:
# Load the parameters pickled by the previous notebook; the helper returns a
# pair that is unpacked into the parameter dictionary and a second value
# (presumably the use-case directory, going by its name -- TODO confirm).
pickle_payload = load_pickle_from_main_project_dir('aggr_parameters_dict.pkl')
aggr_parameters_dict, camels_gb_use_case_dir = pickle_payload

# Show the loaded parameters
pprint(aggr_parameters_dict)

{'attributes': {'aggregations': {'fundamental': {'chalk_streams_df': ['chalk_stream_flag'],
                                                 'climatic_attributes_df': [],
                                                 'humaninfluence_attributes_df': ['surfacewater_abs',
                                                                                  'groundwater_abs',
                                                                                  'discharges',
                                                                                  'num_reservoir',
                                                                                  'reservoir_cap'],
                                                 'hydrogeology_attributes_df': [],
                                                 'hydrologic_attributes_df': ['baseflow_index'],
                                                 'hydrometry_attributes_df': ['bankfull_flow'],
                                                 'landc

### Retrieve variables in use

In [8]:
# Directories holding the aggregated attributes and timeseries files
camels_gb_data_attributes_aggr_dir = aggr_parameters_dict["camels_gb_data_attributes_aggr_dir"]
camels_gb_data_timeseries_aggr_dir = aggr_parameters_dict["camels_gb_data_timeseries_aggr_dir"]

# Attributes settings: index field and the "fundamental" aggregation,
# a mapping of source table name -> list of column names
attributes_parameters = aggr_parameters_dict["attributes"]
attributes_index = attributes_parameters["attributes_index"]
attributes_fundamental_fields = attributes_parameters["aggregations"]["fundamental"]

# Timeseries settings: name of the date field
date_field = aggr_parameters_dict["timeseries"]["date_field"]

# Retrieve aggregated files

In [9]:
# Attributes: read the "fundamental" aggregation and preview the first rows
attributes_df = load_attributes_df(camels_gb_data_attributes_aggr_dir, "fundamental.csv", attributes_index)
display(attributes_df.head(n=3))

# Timeseries: read the aggregated timeseries and preview the first rows
timeseries_df = load_timeseries_df(camels_gb_data_timeseries_aggr_dir, "timeseries.csv", date_field)
display(timeseries_df.head(n=3))

Unnamed: 0_level_0,dwood_perc,ewood_perc,grass_perc,shrub_perc,crop_perc,urban_perc,inwater_perc,bares_perc,baseflow_index,gauge_name,gauge_lat,gauge_lon,gauge_elev,area,dpsbar,elev_mean,elev_min,elev_10,elev_50,elev_90,elev_max,bankfull_flow,sand_perc,silt_perc,clay_perc,organic_perc,surfacewater_abs,groundwater_abs,discharges,num_reservoir,reservoir_cap,chalk_stream_flag,@id,label,query_for_station_specifics,station_code
gauge_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
101002,6.2,0.3,42.78,0.38,46.34,3.28,0.49,0.56,0.68,Medina at Shide,50.69,-1.29,10.4,30.61,80.0,60.0,9.4,26.8,56.8,98.0,193.2,17.8,70.99,15.36,13.66,0.43,0.006,0.023,0.018,0,0,True,http://environment.data.gov.uk/hydrology/id/st...,Shide,https://environment.data.gov.uk/hydrology/id/m...,69f4b3e5-a487-4769-aded-0d72062428e7
101005,4.6,0.12,59.24,0.25,31.08,4.78,0.05,0.0,0.62,Eastern Yar at Budbridge,50.65,-1.25,17.2,24.31,87.0,92.0,17.2,32.9,84.0,157.2,234.6,,71.01,15.56,13.45,0.45,0.001,0.015,0.053,0,0,True,http://environment.data.gov.uk/hydrology/id/st...,Budbridge,https://environment.data.gov.uk/hydrology/id/m...,c10d61a3-fc5d-4d96-bf9f-57a97b6256c0
22001,5.87,9.6,59.92,11.13,12.49,0.95,0.13,0.0,0.51,Coquet at Morwick,55.33,-1.63,5.2,578.25,110.0,225.0,5.2,79.3,192.8,420.0,774.5,175.0,45.16,31.88,22.96,4.13,0.0,0.006,0.004,0,0,False,http://environment.data.gov.uk/hydrology/id/st...,Morwick,https://environment.data.gov.uk/hydrology/id/m...,3df7a9c3-d40c-4781-a885-5f6b7abdb86a


Unnamed: 0,catchmentID,date,precipitation,pet,temperature,discharge_spec,discharge_vol_files,peti,humidity,shortwave_rad,longwave_rad,windspeed,discharge_vol,date_diff,date_consecutive_day,date_group
0,101002,1997-03-01,0.26,0.7,8.31,0.9,0.32,0.93,6.11,73.68,320.09,6.23,0.319,0.0,False,0
1,101002,1997-03-02,0.1,1.42,9.55,0.89,0.31,1.68,5.64,89.11,315.49,6.51,0.314,1.0,False,0
2,101002,1997-03-03,24.15,0.53,5.84,1.12,0.4,0.65,4.98,51.62,320.09,2.75,0.397,1.0,False,0


# Columns removal

## Columns removal for ***attributes***

As shown in ***02a-EDA-Attributes***, the only dataset with `NaN` values is the one related to ***attributes***. Columns to remove are:
 - **bankfull_flow**

In [10]:
# Attribute columns to drop (they contain NaN values)
columns_to_remove = ['bankfull_flow']

In [11]:
# Flatten the per-source field lists into one list of the columns used in the model
fundamental_columns = []
for source_fields in attributes_fundamental_fields.values():
    fundamental_columns.extend(source_fields)

# Keep only those columns
attributes_df = attributes_df[fundamental_columns]

# Drop the columns flagged for removal because of NaN values
attributes_postEDA_df = attributes_df.drop(columns=columns_to_remove)

## Columns removal for ***timeseries***

In [12]:
# Timeseries columns that are not carried over to the post-EDA dataset
columns_to_remove = [
    'pet', 'peti',
    'discharge_spec', 'discharge_vol_files',
    'date_diff', 'date_consecutive_day',
]

# Drop them, producing a new frame and leaving `timeseries_df` untouched
timeseries_postEDA_df = timeseries_df.drop(columns_to_remove, axis="columns")

# Rows removal

## ***attributes*** with `NaN`

In [13]:
# Flag the rows (catchments) holding at least one NaN value ...
rows_with_nan_mask = attributes_postEDA_df.isna().any(axis="columns")

# ... and collect their index labels
catchmentsID_with_nan_list = attributes_postEDA_df[rows_with_nan_mask].index.to_list()

print(catchmentsID_with_nan_list)

['26006']


In [14]:
# Drop every row (catchment) that still has at least one NaN value.
# The result is reassigned instead of using `inplace=True`: the in-place form
# has no performance benefit and mutates an object that earlier cells created.
attributes_postEDA_df = attributes_postEDA_df.dropna()

## Removal of catchments not common to both datasets

In [15]:
# Catchment IDs available in each dataset, as sets for membership comparisons
attributes_set = {*attributes_postEDA_df.index}
timeseries_set = {*timeseries_postEDA_df['catchmentID']}

### Catchments in ***attributes***, but NOT in ***timeseries***

In [16]:
# Catchment IDs that have attributes but no timeseries
attributes_not_in_timeseries = attributes_set.difference(timeseries_set)

print(attributes_not_in_timeseries)

{'39004', '27002', '39095', '71014', '43021', '26005', '39016', '42016', '27062'}


### Catchments in ***timeseries***, but NOT in ***attributes***

In [17]:
# Catchment IDs that have timeseries but no attributes
timeseries_not_in_attributes = timeseries_set.difference(attributes_set)

print(timeseries_not_in_attributes)

{'26006'}


### Keep only the catchments common to both datasets

In [18]:
# Catchments present in both datasets, listed in the row order of the
# attributes table. Building the list from a bare set intersection would make
# its order depend on set iteration order, which for string IDs is not stable
# across Python processes; the row order of the saved attributes CSV would
# then change from one run to the next.
common_elements = [
    catchment_id
    for catchment_id in attributes_postEDA_df.index.unique()
    if catchment_id in timeseries_set
]

# Keep only the common catchments in attributes; a boolean mask preserves the
# original row order instead of reordering rows to match a label list
attributes_postEDA_df = attributes_postEDA_df.loc[
    attributes_postEDA_df.index.isin(common_elements)
]

# Keep only the common catchments in timeseries
timeseries_postEDA_df = timeseries_postEDA_df[
    timeseries_postEDA_df['catchmentID'].isin(common_elements)
]

## Check on `catchmentID` coherence

In [20]:
# After filtering, both datasets must cover exactly the same catchments
timeseries_catchment_ids = set(timeseries_postEDA_df['catchmentID'].unique())
attributes_catchment_ids = set(attributes_postEDA_df.index)

assert timeseries_catchment_ids == attributes_catchment_ids, "Catchment IDs for the two datasets are NOT corresponding"

# Save

In [21]:
# --- attributes_postEDA_df ---
# Written next to the aggregated attributes file; the index is kept in the
# output (no `index=False`)
path = os.path.join(camels_gb_data_attributes_aggr_dir, "fundamental_postEDA.csv")
attributes_postEDA_df.to_csv(path)

# --- timeseries_postEDA_df ---
# Written next to the aggregated timeseries file, without the row index
path = os.path.join(camels_gb_data_timeseries_aggr_dir, "timeseries_postEDA.csv")
timeseries_postEDA_df.to_csv(path, index=False)