# Application of Downscaling Methods to Climate Data: An Example for Uganda International Education

## 1. Import required Python Libraries

In [None]:
import sys
import random

import xarray
import pandas as pd
import numpy as np
import sklearn
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.stattools import acf

from CCdownscaling import correction_downscale_methods, distribution_tests, error_metrics, som_downscale, utilities

import warnings
warnings.filterwarnings("ignore")

# for reproducibility
# Seed both Python's `random` module and NumPy's global generator.
# scikit-learn estimators created without an explicit random_state draw from
# NumPy's global generator, so seeding `random` alone does not make them repeatable.
seed = 1
random.seed(seed)
np.random.seed(seed)

## 2. Read Observation Weather data

* I could not find good-quality observed weather data for Uganda, so this notebook uses the station observations provided with the original study that developed this downscaling method: https://egusphere.copernicus.org/preprints/2022/egusphere-2022-282/

In [None]:
# set downscaling variable and station id
# Name of the station-table column to downscale; it is used later to pull
# the observed series out of the station data.
downscaling_target='precip'
# Station identifier; it is also the file stem of the station CSV that is read next.
station_id='725300-94846'

In [None]:
# Load the station observations; the CSV file is named after the station id.
# The values 99.99 and 9999.9 are replaced with NaN (presumably they are the
# file's missing-value placeholders - confirm against the data description).
station_csv_path = f"./uganda/data/stations/{station_id}.csv"
station_data = pd.read_csv(station_csv_path).replace(
    to_replace=[99.99, 9999.9],
    value=np.nan,
)
station_data

In [None]:
# Latitude and longitude used to find the GCM grid cell nearest to the station.
station_lat, station_lon = 0.3, 32.6
station_lat, station_lon

## 3. Read GCM historical data

In [None]:
# Open the NetCDF file holding the GCM historical run (a single file; the
# file name indicates daily `pr` for 1850-01-01 to 2014-12-31).
gcm_historical_path = './uganda/data/models/pr_day_EC-Earth3-Veg-LR_historical_r1i1p1f1_gr_18500101-20141231_v20200217.nc'
reanalysis_data = xarray.open_dataset(gcm_historical_path)
reanalysis_data

### Remove Feb-30 from every year and Feb-29 from non-leap years

In [None]:
import calendar

# Year lists used to strip calendar days that should not exist in the GCM time axis.
start_year = 1850
end_year = 2014
# Despite its name, this list holds EVERY year in the range (leap or not);
# it is the set of years scanned when removing Feb-30.
no_leap_years = list(range(start_year, end_year + 1, 1))
# Gregorian leap years in the range. calendar.isleap applies the century rule
# (1900 is not a leap year, 2000 is), so the result stays correct when
# start_year / end_year change - no hard-coded list index is required.
leap_years = [year for year in no_leap_years if calendar.isleap(year)]
# Not referenced anywhere in the visible notebook; kept so existing state is unchanged.
remove_set = {3, 5}

# Years that have no Feb-29; these are the years whose Feb-29 gets removed.
li = [year for year in no_leap_years if not calendar.isleap(year)]
li

In [None]:
# remove Feb-30
# A single vectorised mask replaces the per-year loop, which rebuilt the mask
# and re-indexed the whole dataset once for every year in the list.
time_parts = reanalysis_data.time.dt
is_feb_30 = (
    time_parts.year.isin(no_leap_years)
    & (time_parts.month == 2)
    & (time_parts.day == 30)
)
reanalysis_data = reanalysis_data.sel(time=~is_feb_30)

In [None]:
# remove Feb-29
# `li` holds the years that have no Feb-29. One vectorised mask drops that day
# for all of them at once instead of re-indexing the dataset once per year.
time_parts = reanalysis_data.time.dt
is_invalid_feb_29 = (
    time_parts.year.isin(li)
    & (time_parts.month == 2)
    & (time_parts.day == 29)
)
reanalysis_data = reanalysis_data.sel(time=~is_invalid_feb_29)

## 4. Change precipitation unit and extract data using lat and lon

In [None]:
# Scale the GCM precipitation by the number of seconds in a day (86400)
# (presumably converting a per-second rate to a daily total - confirm against
# the variable's `units` attribute), then take the series at the grid cell
# nearest to the station with singleton dimensions removed.
reanalysis_data['pr'] = reanalysis_data['pr'] * 86400
nearest_cell_pr = reanalysis_data['pr'].sel(lat=station_lat, lon=station_lon, method='nearest')
rean_precip = np.squeeze(nearest_cell_pr.values)

## 5. Select the station data to match the time from the GCM data

In [None]:
# Keep only the station rows that fall inside the GCM time range (both ends inclusive).
station_data['time'] = pd.to_datetime(station_data['date'], format='%Y-%m-%d')
gcm_time_axis = reanalysis_data['time']
start = gcm_time_axis[0].values
end = gcm_time_axis[-1].values
date_mask = station_data['time'].between(start, end)
station_data = station_data[date_mask]
station_data

In [None]:
# Observed series for the chosen target variable, as a NumPy array.
hist_data = station_data[downscaling_target].values
# Convert units, in/day to mm/day for precip.
# One inch is exactly 25.4 mm. The previous factor of 2.54 gave cm/day, which
# matched neither the comment above nor the GCM precipitation, which this
# notebook scales to a daily total with `* 86400`.
# NOTE(review): assumes the station file records precipitation in inches, as
# the original comment states - confirm against the station data description.
hist_data = hist_data * 25.4

In [None]:
# For just a single grid point:
# reanalysis_data = reanalysis_data.sel(lat = station_lat, lon = station_lon, method='nearest')
# To use multiple grid points in a window around the location:
window = 2
# Positions of the grid coordinates closest to the station.
lat_index = np.argmin(np.abs(reanalysis_data['lat'].values - station_lat))
lon_index = np.argmin(np.abs(reanalysis_data['lon'].values - station_lon))
# Clamp the lower bounds at 0: a negative slice start counts from the END of
# the axis, which would silently select the wrong (usually empty) range when
# the station sits within `window` cells of the grid edge.
lat_window = slice(max(lat_index - window, 0), lat_index + window + 1)
lon_window = slice(max(lon_index - window, 0), lon_index + window + 1)
reanalysis_data = reanalysis_data.isel({'lat': lat_window, 'lon': lon_window})
reanalysis_data

In [None]:
# Turn the windowed precipitation cube into a 2-D predictor matrix: the first
# axis is kept as rows, the second and third axes are merged into columns.
var_data = reanalysis_data['pr'].values
n_rows = var_data.shape[0]
n_columns = var_data.shape[1] * var_data.shape[2]
input_data = np.array(var_data.reshape(n_rows, n_columns))
input_data

In [None]:
# Drop days with NaN values for the observation:
# Both calls pass the observed series first and a GCM-derived array second.
# The first call filters the single-grid-point series while hist_data is still
# unmodified; the second call then overwrites hist_data and input_data.
# (Presumably remove_missing masks both arrays by NaNs in its first argument -
# confirm in CCdownscaling.utilities.)
hist, rean_precip = utilities.remove_missing(hist_data, rean_precip)
hist_data, input_data = utilities.remove_missing(hist_data, input_data)

# Normalise the predictor matrix; the helper also returns the means and
# standard deviations it used (presumably one per column - confirm in utilities).
input_data, input_means, input_stdevs = utilities.normalize_climate_data(input_data)
input_data, input_means, input_stdevs

## 6. Split train and test sets

In [None]:
# Split every aligned array at the same row index: rows before `train_split`
# form the training set, the remaining rows form the test set.
# Alternative 80/20 split: train_split = int(round(input_data.shape[0]*0.8))
# NOTE(review): an earlier comment described this split as "first 24 years for
# training, last 6 years for test", but 17150 rows is roughly 47 years if each
# row is one day - confirm the intended split point.
train_split = 17150
training_data, test_data = input_data[:train_split, :], input_data[train_split:, :]
train_hist, test_hist = hist_data[:train_split], hist_data[train_split:]
rean_precip_train, rean_precip_test = rean_precip[:train_split], rean_precip[train_split:]
print(training_data.shape, test_data.shape)

In [None]:
# initialize the different methods
# `import sklearn` alone does not guarantee that the `ensemble` and
# `linear_model` submodules are loaded, so import them explicitly here.
import sklearn.ensemble
import sklearn.linear_model

# NOTE(review): the other names imported from CCdownscaling are used as modules
# (e.g. correction_downscale_methods.quantile_mapping()). If som_downscale is
# also a module, this call must be som_downscale.som_downscale(...) - confirm.
som = som_downscale(som_x=7, som_y=5, batch=512, alpha=0.1, epochs=50)
rf_two_part = correction_downscale_methods.two_step_random_forest()
# Pass the notebook-wide seed so the forest is reproducible between runs.
random_forest = sklearn.ensemble.RandomForestRegressor(random_state=seed)
qmap = correction_downscale_methods.quantile_mapping()
linear = sklearn.linear_model.LinearRegression()

## 7. Train Climate Data

In [None]:
# Fit every downscaling model on the training period (same order as before).
# The SOM is given an explicit seed.
som.fit(training_data, train_hist, seed=1)
for estimator in (random_forest, rf_two_part, linear):
    estimator.fit(training_data, train_hist)
# Quantile mapping is fitted on the single-grid-point precipitation series
# rather than on the multi-grid-point predictor matrix.
qmap.fit(rean_precip_train, train_hist)

## 8. Test and Visualize Climate data 

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [None]:
# Predict over the held-out test period with every fitted model.
som_output = som.predict(test_data)
random_forest_output, rf_two_part_output, linear_output = (
    estimator.predict(test_data)
    for estimator in (random_forest, rf_two_part, linear)
)
# Quantile mapping takes the single-grid-point test series, matching how it was fitted.
qmap_output = qmap.predict(rean_precip_test)

In [None]:
# SOM skill on the test period: mean squared error, mean absolute error, R^2.
mse, mae, r2 = (
    score(test_hist, som_output)
    for score in (mean_squared_error, mean_absolute_error, r2_score)
)
mse, mae, r2

In [None]:
# Random-forest skill on the test period: mean squared error, mean absolute error, R^2.
mse, mae, r2 = (
    score(test_hist, random_forest_output)
    for score in (mean_squared_error, mean_absolute_error, r2_score)
)
mse, mae, r2

In [None]:
# Two-step random-forest skill on the test period: mean squared error, mean absolute error, R^2.
mse, mae, r2 = (
    score(test_hist, rf_two_part_output)
    for score in (mean_squared_error, mean_absolute_error, r2_score)
)
mse, mae, r2

In [None]:
# Linear-regression skill on the test period: mean squared error, mean absolute error, R^2.
mse, mae, r2 = (
    score(test_hist, linear_output)
    for score in (mean_squared_error, mean_absolute_error, r2_score)
)
mse, mae, r2

In [None]:
# Quantile-mapping skill on the test period: mean squared error, mean absolute error, R^2.
mse, mae, r2 = (
    score(test_hist, qmap_output)
    for score in (mean_squared_error, mean_absolute_error, r2_score)
)
mse, mae, r2

## 9. Create Future Precipitations

In [None]:
# Open the NetCDF file holding the future scenario run (a single file; the
# file name indicates daily `pr` under ssp370 for 2022-01-01 to 2100-12-31).
scenario_path = './uganda/data/models/pr_day_EC-Earth3-Veg-LR_ssp370_r1i1p1f1_gr_20220101-21001231_v20201123.nc'
scenario_data = xarray.open_dataset(scenario_path)
scenario_data

In [None]:
# For just a single grid point:
# scenario_data = scenario_data.sel(lat = station_lat, lon = station_lon, method='nearest')
# To use multiple grid points in a window around the location:
window = 2
# Positions of the grid coordinates closest to the station.
lat_index = np.argmin(np.abs(scenario_data['lat'].values - station_lat))
lon_index = np.argmin(np.abs(scenario_data['lon'].values - station_lon))
# Clamp the lower bounds at 0: a negative slice start counts from the END of
# the axis, which would silently select the wrong (usually empty) range when
# the station sits within `window` cells of the grid edge.
lat_window = slice(max(lat_index - window, 0), lat_index + window + 1)
lon_window = slice(max(lon_index - window, 0), lon_index + window + 1)
scenario_data = scenario_data.isel({'lat': lat_window, 'lon': lon_window})

In [None]:
# Display the scenario dataset to inspect its contents.
scenario_data

In [None]:
# Scale the scenario precipitation by 86400 (the same factor applied to the
# historical run) and flatten it into a 2-D predictor matrix: the first axis
# is kept as rows, the second and third axes are merged into columns.
var_data = scenario_data['pr'].values * 86400
n_rows = var_data.shape[0]
n_columns = var_data.shape[1] * var_data.shape[2]
input_data = np.array(var_data.reshape(n_rows, n_columns))
input_data

In [None]:
# Show the (rows, columns) shape of the flattened scenario predictors.
input_data.shape

In [None]:
# Drop days with NaN values for the observation:
# NOTE(review): at this point input_data holds the future-scenario predictors,
# while hist_data and rean_precip still come from the historical period. The
# scenario rows have no matching observations, so these two calls look like a
# leftover from the historical workflow; the second one also overwrites
# hist_data. Confirm whether they should be removed.
hist, rean_precip = utilities.remove_missing(hist_data, rean_precip)
hist_data, input_data = utilities.remove_missing(hist_data, input_data)

# NOTE(review): this normalises the scenario predictors with statistics
# returned for the scenario data itself and overwrites input_means /
# input_stdevs from the training step. Models trained on the historical
# normalisation may expect those statistics to be reused - confirm.
input_data, input_means, input_stdevs = utilities.normalize_climate_data(input_data)

In [None]:
# Display the normalised scenario predictors.
input_data

In [None]:
# Keep the first 365 rows of the scenario predictors, with every column.
input_data1 = input_data[:365]

In [None]:
# generate outputs for the future scenario
som_output = som.predict(input_data1)
random_forest_output = random_forest.predict(input_data1)
rf_two_part_output = rf_two_part.predict(input_data1)
linear_output = linear.predict(input_data1)
# Quantile mapping was fitted on the un-normalised precipitation series of the
# grid cell nearest to the station, so it needs the same kind of input here
# rather than the normalised multi-grid-point matrix used by the other models.
# The raw scenario values are scaled by 86400, as for the historical run.
scenario_precip = scenario_data['pr'].sel(lat=station_lat, lon=station_lon, method='nearest').values
scenario_precip = np.squeeze(scenario_precip) * 86400
# Use the same number of leading rows as the other models' input.
qmap_output = qmap.predict(scenario_precip[:len(input_data1)])