In Google Colab, the runtime must be restarted before each run.

# Environment Setup and User Specification

Precision matters. The input files need to have many decimal places.

In [None]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta, date
import matplotlib.pyplot as plt
import os
import sys

In [None]:
################################################################################
# User specification
################################################################################

# Per-variable plot switches; a value of 1 enables the plot for that variable.
iplot = dict(cloud_cover = 1)

# Date range for plotting (start, end)
date_bgn_plot = date(2000, 1, 1)
date_end_plot = date(2016, 1, 1)

# Set to 1 to write the bias-corrected timeseries to csv
isavecsv = 1

# Number of percentile bins used by the bias-correction
num_percentile = 50

# Number of histogram bins used for the cdf plots
num_bins = 100

# Directories for observation inputs and for generated files
dir_input = "./inputs/"
dir_output = "./processing/"

# MetSim timeseries to be bias-corrected
path_metsim = "./processing/Timeseries_step1_drybulb,raw_cloud,raw_relhum.csv"

# Observation file per variable (looked up inside dir_input); extracted from
# a DSS file.
dict_obs_fname = dict(cloud_cover = "Stockton_Obs_CLOUD_COVER.csv")

# Variables to be compared with observations
list_var = ["cloud_cover"]

# Read Observation

In [None]:
###############################################################################
# Read observed data
###############################################################################
def _obs_endpoint(date_text, time_text):
  """Combine one Date/Time pair from the observation file into a datetime.

  A time of " 24:00" (with the leading blank, as compared below) is rewritten
  to " 00:00" and one day is added; any other time is parsed as given.
  """
  rollover = timedelta(days = 0)
  if time_text == " 24:00":
    time_text = " 00:00"
    rollover = timedelta(days = 1)
  return datetime.strptime(date_text + time_text, "%d %b %y %H:%M") + rollover


# dict_obs[var]["Hourly"] holds the observed series for each variable.
dict_obs = {}

for var in list_var:
  # Columns are named here instead of being taken from the file
  # (header = 1 tells pandas which line of the file is the header row).
  obs_path = os.path.join(dir_input, dict_obs_fname[var])
  raw_obs = pd.read_csv(obs_path, names = ["Date", "Time", var], header = 1)

  # Only the first and last rows are parsed.  The index is generated as a
  # regular hourly range between them, so set_index needs the file to hold
  # exactly one row per hour of that range.
  stamp_first = _obs_endpoint(raw_obs["Date"].iloc[0], raw_obs["Time"].iloc[0])
  stamp_last = _obs_endpoint(raw_obs["Date"].iloc[-1], raw_obs["Time"].iloc[-1])
  hourly_index = pd.date_range(stamp_first, stamp_last,
                               freq = timedelta(hours = 1))

  # The text columns are no longer needed once the index exists.
  obs_values = raw_obs.drop(columns = ["Time", "Date"]).set_index(hourly_index)

  # Missing values are represented as a blank space.
  obs_values = obs_values.replace(" ", np.nan)

  # Store the single remaining column as a numeric Series; anything that
  # cannot be converted becomes NaN.
  dict_obs[var] = {"Hourly": pd.to_numeric(obs_values.iloc[:, 0],
                                           errors = "coerce")}

# Read MetSim Data

In [None]:
# Load the MetSim timeseries CSV named by path_metsim into a DataFrame.
df_metsim = pd.read_csv(path_metsim)

In [None]:
# Rebuild the time axis: parse only the first and last "Date" strings, span
# them with an hourly range, and use that range as the index in place of the
# text column.  set_index needs the row count to match the generated range.
metsim_fmt = "%Y-%m-%d %H:%M"
stamp_first, stamp_last = (
    datetime.strptime(df_metsim["Date"].iloc[pos], metsim_fmt)
    for pos in (0, -1)
)
hourly_index = pd.date_range(start = stamp_first, end = stamp_last,
                             freq = timedelta(hours = 1))
df_metsim = df_metsim.set_index(hourly_index).drop(columns = "Date")

In [None]:
# Bare expression: in a notebook this renders df_metsim as the cell output
# for visual inspection; it has no other effect.
df_metsim

In [None]:
# Give the MetSim column the same key that list_var / dict_obs use, so the
# column can be looked up by variable name.
metsim_column_names = {"Cloud_Cover [fraction]": "cloud_cover"}
df_metsim = df_metsim.rename(columns = metsim_column_names)

In [None]:
# dict_metsim[var]["Raw"]["Hourly"] is the uncorrected MetSim column for var;
# dict_metsim[var]["BC"] starts empty and is filled by the bias-correction.
dict_metsim = {}
for var in list_var:
  dict_metsim[var] = {"Raw": {"Hourly": df_metsim[var]},
                      "BC": {}}

In [None]:
# Bare expression: renders the raw hourly MetSim cloud-cover series as the
# notebook cell output for inspection.
dict_metsim["cloud_cover"]["Raw"]["Hourly"]

In [None]:
# Bare expression: renders the hourly observed cloud-cover series as the
# notebook cell output for inspection.  The key is written out (as in the
# MetSim display cell) instead of reusing the loop variable `var`, which only
# exists here as a leftover from a previously executed cell.
dict_obs["cloud_cover"]["Hourly"]

# Bias-correction

In [None]:
def _bias_correct_by_percentile(obs, model, n_bins):
  """Rescale ``model`` bin by bin so its percentiles line up with ``obs``.

  The 0-100 percentile range is cut into ``n_bins`` equal-width bins.  Model
  values lying between the model percentiles at a bin's lower and upper edge
  (both edges inclusive) are multiplied by
  ``obs percentile / model percentile``, both taken at the bin's upper edge.

  Parameters
  ----------
  obs : pandas.Series
      Observed values; the caller passes them with NaN already dropped.
  model : pandas.Series
      Raw model values.  Its index labels are carried over to the result.
  n_bins : int
      Number of percentile bins, at least 1.

  Returns
  -------
  pandas.Series
      Scaled model values sorted by index, one value per index label.

  Raises
  ------
  ValueError
      If ``n_bins`` is smaller than 1, or if no bin has a finite ratio.
  """
  n_bins = int(n_bins)
  if n_bins < 1:
    raise ValueError("num_percentile must be at least 1, got %r" % (n_bins,))

  arr_obs = pd.to_numeric(obs, errors = "coerce").to_numpy()
  arr_model = model.to_numpy()

  # Evenly spaced percentile edges 0 .. 100.  Unlike stepping by
  # int(100 / n_bins), this always ends exactly at 100 and always yields
  # n_bins bins, also when n_bins does not divide 100.
  edges = np.linspace(0, 100, n_bins + 1)
  obs_edges = np.percentile(arr_obs, edges)
  model_edges = np.percentile(arr_model, edges)

  scaled_bins = []
  for i in range(1, len(edges)):
    ratio = obs_edges[i] / model_edges[i]
    if not np.isfinite(ratio):
      # Ratio is NaN or infinite (e.g. the model percentile is zero): the bin
      # is skipped.
      # NOTE(review): timestamps that fall only in skipped bins are absent
      # from the returned series - confirm that this is intended.
      continue
    in_bin = model.between(model_edges[i - 1], model_edges[i],
                           inclusive="both")
    scaled_bins.append(model[in_bin] * ratio)

  if not scaled_bins:
    raise ValueError("Bias-correction found no percentile bin with a finite "
                     "observation/model ratio")

  # Bins include both edges, so a value equal to an edge appears in two bins.
  # A stable sort keeps the concatenation (low bin first) order among equal
  # index labels, which makes "keep first" below well defined.
  corrected = pd.concat(scaled_bins, axis = 0).sort_index(kind = "mergesort")

  # check for duplicate
  dup_index = corrected[corrected.index.duplicated(keep=False)]
  if (len(dup_index) > 0):
    print("Duplicate index found:", dup_index)
    print("Dropping all but first")
    corrected = corrected[~corrected.index.duplicated(keep='first')]

  return corrected


# Bias-correct every variable against its observations and store the result
# next to the raw series.
for var in list_var:
  dict_metsim[var]["BC"]["Hourly"] = _bias_correct_by_percentile(
      dict_obs[var]["Hourly"].dropna(),
      dict_metsim[var]["Raw"]["Hourly"],
      num_percentile)

# Result

## Cumulative

### Cloud Cover

In [None]:
if (iplot["cloud_cover"] == 1):
  # Empirical CDFs of observed, raw MetSim and bias-corrected MetSim cloud
  # cover, drawn on one set of axes.

  # (series, matplotlib format string, extra line properties) in legend order
  cdf_curves = [
    (dict_obs["cloud_cover"]["Hourly"], "k", {"linewidth": 1}),
    (dict_metsim["cloud_cover"]["Raw"]["Hourly"], "g", {"linewidth": 1}),
    (dict_metsim["cloud_cover"]["BC"]["Hourly"], "r",
     {"linewidth": 0.7, "markersize": 0.5}),
  ]

  plt.figure(figsize = (6, 6), dpi = 200)

  for series, fmt, line_props in cdf_curves:
    data = series.dropna()
    values, base = np.histogram(data, bins=num_bins)
    cumulative = np.cumsum(values)/len(data)
    # cumulative[i] is the fraction of samples up to the RIGHT edge of bin i,
    # so it is plotted at base[1:]; using the left edges would shift the
    # curve one bin width to the left.
    plt.plot(base[1:], cumulative, fmt, **line_props)

  plt.title("Cloud Cover")
  plt.xlabel("Cloud Cover")
  plt.legend(["Observation", "MetSim (Raw)", "MetSim (Bias-Corrected)"])
  plt.ylim([0, 1])
  plt.xlim([0, 1])
  plt.grid()

## Timeseries

### Cloud Cover

# Print output

In [None]:
if isavecsv == 1:
  # Write the bias-corrected hourly series to csv with a "Date" index column.
  # File name and column header are fixed strings, so with more than one entry
  # in list_var every iteration would write to the same file.
  for var in list_var:
    out_path = os.path.join(dir_output, "Timeseries_step5_corrected_cloud.csv")
    frame = pd.DataFrame(dict_metsim[var]["BC"]["Hourly"])
    frame.index.name = "Date"
    frame.to_csv(out_path, header = ["Cloud Cover [fraction]"],
                 float_format = "%.3f")