In [23]:
import xarray as xr
from datatree import DataTree
import matplotlib.pyplot as plt
import os
import valenspy as vp
from valenspy._utilities import load_yml
from valenspy.processing.select import select_point
from valenspy.diagnostic.functions import mean_bias, mean_absolute_error, root_mean_square_error, spearman_correlation, perkins_skill_score, time_series_spatial_mean, calc_metrics_dt, calc_metrics_da,_add_ranks_metrics, root_mean_square_error
from valenspy.diagnostic.visualizations import plot_time_series, plot_map, plot_metric_ranking
from pathlib import Path
import pandas as pd
import numpy as np
import xoak
from sklearn.neighbors import BallTree as skBallTree
from scipy.stats import pearsonr

from functions import point_data_2_climate_grid

# define machine name - used for paths of (observational) datasets
machine = 'hortense'

manager = vp.InputManager(machine=machine)

In [24]:
start_date = "2019-07-25 00:00:00"
end_date = "2019-07-25 23:59:59" 
start_date = pd.to_datetime(start_date)
end_date = pd.to_datetime(end_date)
ucl_dir = "/dodrio/scratch/projects/2022_200/project_output/rcs/CORDEXBE2/fiens/UCL/"

In [25]:
file_wrf = "/dodrio/scratch/projects/2022_200/project_output/rcs/CORDEXBE2/fiens/UCL/geo_em.d03_new.nc"
ds = xr.open_mfdataset(file_wrf, combine='by_coords')
ds = ds.assign_coords({"lat":ds.CLAT})
ds = ds.assign_coords({"lon":ds.CLONG})

In [26]:
netatmo_file = ucl_dir + "netatmo_qc.csv"
orig_data = pd.read_csv(netatmo_file)
orig_data = orig_data.loc[np.where(orig_data.m4)]
obs_data = orig_data.rename(columns = {"ta": "T2"})
obs_data["T2"] = obs_data["T2"] + 273.15
[obs_data_agg, obs_indices] = point_data_2_climate_grid(obs_data, ds, 'T2')
obs_data_agg.date = pd.to_datetime(obs_data_agg.time).dt.date
obs_data = obs_data_agg.rename(columns = {"grid_index": "code"})
obs_data['time'] = pd.to_datetime(obs_data["time"])
obs_data["lat"] = np.round(obs_data.lat, 6)
obs_data["lon"] = np.round(obs_data.lon, 6)

# Checking whether the stations have data for each hour
full_range = pd.date_range(start=start_date, 
                           end=end_date, 
                           freq="h")  # Hourly intervals
drop_stations = []
for c in np.unique(obs_data.code):
    code_temp = obs_data.loc[np.where(obs_data.code == c)].reset_index(drop = True)
    missing_hours = full_range.difference(code_temp["time"])
    if len(missing_hours) > 0:
        drop_stations.append(c)
#print("In " + obs_data_str + str(len(drop_stations)) " are removed.")

drop_indices = [i for i in np.arange(0, len(obs_data)) if obs_data.loc[i, "code"] in drop_stations]
obs_data = obs_data.drop(drop_indices).reset_index(drop = True)

  obs_data_agg.date = pd.to_datetime(obs_data_agg.time).dt.date


In [28]:
obs_data = obs_data.loc[np.where(np.logical_and(obs_data.time >= start_date, obs_data.time <= end_date))].reset_index(drop = True)

### Adding lcz info

In [29]:
file_wrf = ucl_dir + "geo_em.d03_LCZ_params.nc"
ds = xr.open_mfdataset(file_wrf, combine='by_coords')
ds = ds.assign_coords({"lat":ds.CLAT})
ds = ds.assign_coords({"lon":ds.CLONG})
da = ds["LU_INDEX"]
da

Unnamed: 0,Array,Chunk
Bytes,318.94 kiB,318.94 kiB
Shape,"(1, 252, 324)","(1, 252, 324)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 318.94 kiB 318.94 kiB Shape (1, 252, 324) (1, 252, 324) Dask graph 1 chunks in 2 graph layers Data type float32 numpy.ndarray",324  252  1,

Unnamed: 0,Array,Chunk
Bytes,318.94 kiB,318.94 kiB
Shape,"(1, 252, 324)","(1, 252, 324)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,318.94 kiB,318.94 kiB
Shape,"(1, 252, 324)","(1, 252, 324)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 318.94 kiB 318.94 kiB Shape (1, 252, 324) (1, 252, 324) Dask graph 1 chunks in 2 graph layers Data type float32 numpy.ndarray",324  252  1,

Unnamed: 0,Array,Chunk
Bytes,318.94 kiB,318.94 kiB
Shape,"(1, 252, 324)","(1, 252, 324)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,318.94 kiB,318.94 kiB
Shape,"(1, 252, 324)","(1, 252, 324)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 318.94 kiB 318.94 kiB Shape (1, 252, 324) (1, 252, 324) Dask graph 1 chunks in 2 graph layers Data type float32 numpy.ndarray",324  252  1,

Unnamed: 0,Array,Chunk
Bytes,318.94 kiB,318.94 kiB
Shape,"(1, 252, 324)","(1, 252, 324)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [30]:
da.xoak.set_index(['lat', 'lon'], 'sklearn_geo_balltree')

In [31]:
d_coord_points = obs_data[["code", "lat", "lon"]]
d_coord_points = d_coord_points.drop_duplicates().reset_index(drop = True)
for i in np.arange(0, len(d_coord_points)):
    point_coord = d_coord_points.loc[i]    
    lat_array = xr.DataArray(np.array([point_coord.lat]), dims='point')
    lon_array = xr.DataArray(np.array([point_coord.lon]), dims='point')
    d_coord_points.loc[i, "LU_INDEX"] = da.xoak.sel(lat = lat_array, lon = lon_array).values

In [32]:
d_coord_points

Unnamed: 0,code,lat,lon,LU_INDEX
0,8666,49.554878,5.562805,14.0
1,9318,49.570152,5.620544,14.0
2,10948,49.608212,5.765136,14.0
3,11581,49.636749,5.558441,5.0
4,11923,49.633121,5.810211,12.0
...,...,...,...,...
510,72409,51.386841,4.550690,5.0
511,73074,51.395069,4.798676,56.0
512,73089,51.385960,5.015808,56.0
513,74689,51.443188,4.730988,56.0


In [33]:
indx_urb = np.where(d_coord_points["LU_INDEX"] >= 50)[0]
indx_rur = np.where(d_coord_points["LU_INDEX"] < 50)[0]
urb_loc = d_coord_points.loc[indx_urb].reset_index(drop = True)
rur_loc = d_coord_points.loc[indx_rur].reset_index(drop = True)

urb_loc.to_csv(ucl_dir + 'Location_' + "urban_" + "netatmo" + ".csv", index = False)
rur_loc.to_csv(ucl_dir + 'Location_' +  "rural_" + "netatmo" +".csv", index = False)

In [34]:
urb_loc

Unnamed: 0,code,lat,lon,LU_INDEX
0,33182,50.305840,4.118652,56.0
1,34173,50.322929,4.389954,56.0
2,34516,50.321316,4.659668,56.0
3,35794,50.367550,4.408386,56.0
4,35827,50.348591,4.875580,56.0
...,...,...,...,...
316,71467,51.342037,4.981903,56.0
317,71756,51.371567,4.476532,59.0
318,73074,51.395069,4.798676,56.0
319,73089,51.385960,5.015808,56.0
