In [8]:
# This script prepares the quantile data for regression

In [9]:
# import packages
import os
import numpy as np
import pandas as pd
from datetime import datetime

In [10]:
# read the raw inputs
## radiative forcings: the first CSV column is used as the row index;
## keep only the year and the four forcing series
df_rf = pd.read_csv('../data/raw/fullDat.csv', index_col=0).loc[:, ['Year', 'GHG', 'Volcanic', 'Solar', 'ENSO']]
## daily temperatures: single-space separated text with no header row, so the
## parsed columns carry integer labels; positions 2, 5 and 6 are kept and named
## YEAR, DAY and TEMP (NOTE(review): the raw file layout is not visible here -
## confirm these positions against the file)
df_temp = (
    pd.read_table('../data/raw/Complete_TAVG_daily.txt', sep=' ', header=None)
    .loc[:, [2, 5, 6]]
    .rename(columns={2: 'YEAR', 5: 'DAY', 6: 'TEMP'})
)

In [11]:
# data cleaning
## create a temperature dictionary: year -> list of daily temperatures (in file order)
year_start = 1880
year_end = 2012
## keep only the rows whose year lies in [year_start, year_end] (both ends inclusive)
in_range = (df_temp['YEAR'] >= year_start) & (df_temp['YEAR'] <= year_end)
## group once instead of walking the frame row by row with .iloc;
## sort=False keeps the years in order of first appearance in the file
temp_dict = {
    year_now: temps.tolist()
    for year_now, temps in df_temp.loc[in_range].groupby('YEAR', sort=False)['TEMP']
}
## filter out years by threshold: keep a year only if it has at least THRESH daily records
THRESH = 355
year_select = [year_now for year_now, temps in temp_dict.items() if len(temps) >= THRESH]

In [12]:
# create data for regression
## quantile levels: n_levs equally spaced probabilities k/(n_levs+1), k = 1..n_levs,
## so the end points 0 and 1 are excluded
n_levs = 300
q_vec = np.arange(1, (n_levs+1)) / (n_levs+1)
## response: one row per selected year, holding the quantiles of that year's daily temperatures
Y = np.array([np.quantile(temp_dict[year_now], q_vec) for year_now in year_select])
## predictors: forcings for exactly the years in year_select, in the same order as the rows of Y.
## A fixed 1860-2012 window does not line up with Y, whose years start at 1880 and
## skip any year with fewer than THRESH records. A selected year that is missing
## from df_rf raises a KeyError here instead of silently shifting rows.
X = np.array(df_rf.set_index('Year').loc[year_select, ['GHG', 'Volcanic', 'Solar', 'ENSO']])
## guard the row alignment the regression relies on (also catches duplicated years in df_rf)
assert X.shape[0] == Y.shape[0], 'X and Y must have one row per selected year'

In [13]:
# K-fold cross validation
## give every row of Y a fold label; holding out one of the 5 folds is a 4:1 train/test split
n_fold = 5
np.random.seed(2022)  # fixed seed so the fold assignment is reproducible
n_obs = len(Y)
## a random ordering of 0..n_obs-1 taken modulo n_fold yields labels 0..n_fold-1
## with fold sizes that differ by at most one
shuffled_idx = np.random.choice(n_obs, size=n_obs, replace=False)
loc_cv = np.mod(shuffled_idx, n_fold)

In [14]:
# save the prepared regression inputs in ../data/processed/
## create the output folder first: to_csv does not create missing directories
os.makedirs('../data/processed', exist_ok=True)
## predictors (X), quantile responses (Y) and cross-validation fold labels (loc_cv);
## index=False leaves the row index out of the files
pd.DataFrame(X).to_csv('../data/processed/dat_X.csv', index=False)
pd.DataFrame(Y).to_csv('../data/processed/dat_Y.csv', index=False)
pd.DataFrame(loc_cv).to_csv('../data/processed/dat_CV.csv', index=False)