In [1]:
# This script prepares the quantile data for regression

In [2]:
# import packages
import os
import numpy as np
import pandas as pd
from datetime import datetime

In [3]:
# load data
## radiative forcings: the first CSV column is used as the index
df_rf = pd.read_csv('../../data/climate/raw/radiative-forcings.csv', index_col=0)
## daily temperatures: space-separated text file read without a header row,
## so columns are addressed by integer position
df_temp = pd.read_table('../../data/climate/raw/Complete_TAVG_daily.txt', sep=' ', header=None)
## keep rows whose column 2 does not exceed the largest index value of df_rf,
## retain only columns 2, 5 and 6, and give them readable names
df_temp = (
    df_temp
    .loc[df_temp[2] <= max(df_rf.index), [2, 5, 6]]
    .rename(columns={2: 'YEAR', 5: 'DAY', 6: 'TEMP'})
)

In [4]:
# data cleaning
## create a temperature dictionary: year - daily temperatures
## Rows are scanned in their existing order and scanning stops at the first row
## whose YEAR exceeds LAST_YEAR; rows after that point are never read.
LAST_YEAR = 2011
temp_dict = {}
## walk the underlying arrays once instead of doing two .iloc lookups per row
for year_now, temp_now in zip(df_temp['YEAR'].values, df_temp['TEMP'].values):
    if year_now > LAST_YEAR:
        break
    # setdefault creates the per-year list on first sight of a year
    temp_dict.setdefault(year_now, []).append(temp_now)
## filter out year by threshold: keep years having at least THRESH daily records
## (every collected entry counts towards the length, whatever its value)
THRESH = 355
year_select = [year_now for year_now, temps in temp_dict.items() if len(temps) >= THRESH]

In [5]:
# create data for regression
## number of quantile levels
n_levs = 300
## quantile levels k / (n_levs + 1) for k = 1, ..., n_levs
q_vec = np.arange(1, n_levs + 1) / (n_levs + 1)
## predictors: the CO2, Solar and Volcano columns of df_rf, one row per selected year
X = np.array(df_rf.loc[year_select, ['CO2', 'Solar', 'Volcano']])
## responses: row j holds the q_vec quantiles of the daily temperatures of year_select[j]
Y = np.array([np.quantile(temp_dict[yr], q_vec) for yr in year_select])

In [6]:
# K-fold cross validation
## assign every row of Y a fold label in {0, ..., n_fold - 1}:
## draw 0..len(Y)-1 in random order without replacement, then reduce modulo n_fold
## (with 5 folds, holding one fold out gives a 4:1 train/test split)
np.random.seed(2020)
n_fold = 5
loc_cv = np.mod(np.random.choice(len(Y), size=len(Y), replace=False), n_fold)

In [7]:
# save the processed regression data (predictors X, quantile responses Y,
# cross-validation fold labels) as CSV files without the row index
out_dir = '../../data/climate/processed'
## to_csv does not create missing folders, so make sure the target exists
os.makedirs(out_dir, exist_ok=True)
pd.DataFrame(X).to_csv(out_dir + '/dat_X.csv', index=False)
pd.DataFrame(Y).to_csv(out_dir + '/dat_Y.csv', index=False)
pd.DataFrame(loc_cv).to_csv(out_dir + '/dat_CV.csv', index=False)