In [1]:
import xarray as xr
from datatree import DataTree
import matplotlib.pyplot as plt
import os
import valenspy as vp
from valenspy._utilities import load_yml
from valenspy.processing.select import select_point
from valenspy.diagnostic.functions import mean_bias, mean_absolute_error, root_mean_square_error, spearman_correlation, perkins_skill_score, time_series_spatial_mean, calc_metrics_dt, calc_metrics_da,_add_ranks_metrics, root_mean_square_error
from valenspy.diagnostic.visualizations import plot_time_series, plot_map, plot_metric_ranking
from pathlib import Path
import pandas as pd
import numpy as np
import xoak
from sklearn.neighbors import BallTree as skBallTree
from scipy.stats import pearsonr

from functions import point_data_2_climate_grid, plot_points_map

# define machine name - used for paths of (observational) datasets
machine = 'hortense'

manager = vp.InputManager(machine=machine)

## On the importance of ...

In [2]:
obs_data_str = 'wow'
metrics = ["rmse", "mean_bias", "pearson_correlation2", "mean_absolute_error", "spearman_correlation"]
start_date = "2019-07-25 00:00:00"
end_date = "2019-07-25 23:59:59" 
start_date = pd.to_datetime(start_date)
end_date = pd.to_datetime(end_date)
ucl_dir = "/dodrio/scratch/projects/2022_200/project_output/rcs/CORDEXBE2/fiens/UCL/"

In [3]:
file_wrf = ucl_dir + "geo_em.d03_new.nc"
ds = xr.open_mfdataset(file_wrf, combine='by_coords')
ds = ds.assign_coords({"lat":ds.CLAT})
ds = ds.assign_coords({"lon":ds.CLONG})

In [4]:
wow_file = ucl_dir + "wow_BE.csv"
orig_data = pd.read_csv(wow_file)
obs_data = orig_data[["Id", "Longitude", "Latitude", "Report Date / Time", "Air Temperature"]]
obs_data = obs_data.rename(columns = {"Air Temperature" : "T2", "Report Date / Time": "time", "Longitude":"lon", "Latitude": "lat"})
obs_data["code"] = obs_data["lat"].astype(str) + '-' + obs_data["lon"].astype(str)
single_day_index = [i for i in np.arange(0, len(obs_data.time)) if len(obs_data.loc[i, "time"]) == 10]
obs_data = obs_data.drop(single_day_index).reset_index(drop = True)

# Averaging stations that fall within same gridcell
[obs_data_agg, obs_indices] = point_data_2_climate_grid(obs_data, ds, 'T2')
obs_data_agg.date = pd.to_datetime(obs_data_agg.time).dt.date
obs_data = obs_data_agg.rename(columns = {"grid_index": "code"})
obs_data['time'] = pd.to_datetime(obs_data["time"])
obs_data["T2"] = obs_data["T2"]+273.15
obs_data['date'] = pd.to_datetime(obs_data['time']).dt.date
obs_data["lat"] = np.round(obs_data.lat, 6)
obs_data["lon"] = np.round(obs_data.lon, 6)
obs_data["hour"] = pd.to_datetime(obs_data['time']).dt.hour
# Calculating average value for each hour
obs_data = obs_data[["date", "hour", "T2", "code", "lat", "lon"]].groupby(["date", "hour", "code", "lat", "lon"]).mean().reset_index()
obs_data["time"] = pd.to_datetime(obs_data["date"]) + pd.to_timedelta(obs_data["hour"], unit="h")


obs_data = obs_data.loc[np.where(np.logical_and(obs_data.time >= start_date, obs_data.time <= end_date))].reset_index(drop = True)

# Checking whether the stations have data for each hour
full_range = pd.date_range(start=start_date, 
                           end=end_date, 
                           freq="h")  # Hourly intervals
drop_stations = []
for c in np.unique(obs_data.code):
    code_temp = obs_data.loc[np.where(obs_data.code == c)].reset_index(drop = True)
    missing_hours = full_range.difference(code_temp["time"])
    if len(missing_hours) > 0:
        drop_stations.append(c)

drop_indices = [i for i in np.arange(0, len(obs_data)) if obs_data.loc[i, "code"] in drop_stations]
obs_data = obs_data.drop(drop_indices).reset_index(drop = True)



  orig_data = pd.read_csv(wow_file)
  obs_data_agg.date = pd.to_datetime(obs_data_agg.time).dt.date


In [6]:
df_duplicates = pd.DataFrame([], columns = ["lat", "lon", "grid_index", "count"])

for ind in np.unique(obs_indices.grid_index):
    locs = np.where(obs_indices.grid_index == ind)[0]
    if len(locs) > 1:
        for i in locs:
            df_duplicates.loc[i, ["lat", "lon", "grid_index"]] = obs_indices.loc[i]
            df_duplicates.loc[i, "count"] = len(locs)

In [9]:
df_leuven = df_duplicates

In [21]:
indx_max = np.where(df_leuven["count"] == df_leuven["count"].max())[0]
df_leuven_max = df_leuven.iloc[indx_max].reset_index(drop = True)

In [22]:
df_leuven_max

Unnamed: 0,lat,lon,grid_index,count
0,50.871,4.694,54279.0,7
1,50.8743,4.7058,54279.0,7
2,50.873,4.6949,54279.0,7
3,50.8733,4.6941,54279.0,7
4,50.8745,4.6951,54279.0,7
5,50.8692,4.7048,54279.0,7
6,50.8708,4.7014,54279.0,7


In [25]:
for i in np.arange(1, df_leuven["count"].max()):
    print(random(7))
    temp_station = df_leuven_max

NameError: name 'random' is not defined