In [1]:
## This notebook fits linear (Lasso) and decision-tree regressions to the income indices (Gini index, median wage, poverty rate)

In [2]:
# import modules
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from datetime import datetime
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from IPython import display
#import seaborn as sns; sns.set()
%matplotlib inline

In [3]:
# function definition
def Gini_idx(X):
    """
    Gini-type inequality index of an income array.

    X: income array, expected to be sorted in ascending order (the
       cumulative shares below are accumulated in the order given)

    Returns 1 minus the ratio between the mean cumulative income share
    and the mean cumulative population share.
    """
    n_obs = len(X)
    # cumulative share of total income held by the first k entries
    lorenz_share = np.cumsum(X) / np.sum(X)
    # cumulative share under perfect equality: k / n for k = 1..n
    equal_share = np.arange(1, n_obs + 1) / n_obs
    return 1 - np.mean(lorenz_share) / np.mean(equal_share)

def median_wage(X):
    """
    Median of an income array.

    X: income array (np.median does not need it to be sorted)

    Returns the median value of X.
    """
    middle_income = np.median(X)
    return middle_income

def poverty_rate(X, threshold=12760):
    """
    Share of incomes strictly below a poverty threshold.

    X: income array or sequence (ordering does not matter)
    threshold: poverty line, in the same units as X. The default 12760 is
        the value that used to be hard-coded here -- presumably the 2020
        U.S. poverty guideline for a one-person household; TODO confirm.

    Returns the fraction of entries of X that are < threshold.
    """
    # coerce so that plain Python lists support the element-wise comparison
    incomes = np.asarray(X)
    pvt_rt = np.sum(incomes < threshold) / len(incomes)
    return pvt_rt

In [4]:
# load data
## covariates, outcome samples and cross-validation fold labels, as numpy arrays
X = pd.read_csv('../../../data/income/processed/dat_X.csv').to_numpy()
Y = pd.read_csv('../../../data/income/processed/dat_Y.csv').to_numpy()
loc_cv = pd.read_csv('../../../data/income/processed/dat_CV.csv').to_numpy().flatten()

## quantile grid: n_levs probability levels k / (n_levs + 1), k = 1..n_levs
n_levs = 100
q_vec = np.arange(1, n_levs + 1) / (n_levs + 1)

## one row of quantiles per row of Y (one distribution per row of X)
n_dist = len(X)
Q_mat = np.array([np.quantile(Y[row_id], q_vec) for row_id in range(n_dist)])

## summary indices per distribution, columns: Gini index, median wage, poverty rate
index_funcs = (Gini_idx, median_wage, poverty_rate)
index_rows = []
for i in range(n_dist):
    ## exponentiate the quantiles first (presumably Y is on the log-income
    ## scale -- TODO confirm)
    incomes_ = np.exp(Q_mat[i])
    index_rows.append([index_fn(incomes_) for index_fn in index_funcs])
indices_ = np.array(index_rows)

In [5]:
# fit the models with K-fold cross validation over the folds in loc_cv;
# inside each fold the hyper-parameter is tuned on a single hold-out
# validation split of the training rows (not a full inner cross validation)
SEED = 2022  ## shared by the validation split and the tree fits
## fold labels are taken to be 0 .. n_fold-1; int() keeps range() working
## even if the labels were loaded as floats
n_fold = int(np.max(loc_cv)) + 1
time_start = datetime.now()
## NOTE(review): train_loss / val_loss are allocated but never filled in this cell
train_loss = np.zeros(n_fold)
val_loss = np.zeros(n_fold)
## create numpy arrays to store the predictions
## training predictions: (observation, index, fold); rows held out in a fold are NaN
Y_train_linear = np.zeros((indices_.shape[0], indices_.shape[1], n_fold))
Y_train_tree = np.zeros((indices_.shape[0], indices_.shape[1], n_fold))
## test predictions: each row is filled by the fold in which it was held out
Y_test_linear = np.zeros(indices_.shape)
Y_test_tree = np.zeros(indices_.shape)
## candidate hyper-parameters: tree depth and Lasso penalty
depth_list = [2, 3, 5, 7, 10]
alpha_list = [0.001, 0.01, 0.1, 1, 10]
for id_fold in range(n_fold):
    print('This is fold ', id_fold+1)
    ## build the fold mask once and reuse it for every split below
    is_test = loc_cv == id_fold
    loc_test = np.where(is_test)[0]
    loc_train = np.where(~is_test)[0]
    X_train = X[~is_test]
    Y_train = indices_[~is_test]
    X_test = X[is_test]
    X_t_in, X_v_in, Y_t_in, Y_v_in = train_test_split(X_train, Y_train, test_size=0.25, random_state=SEED)

    ## inner parameter selection for linear regression
    ## NOTE(review): the loss pools the squared errors of every column of
    ## indices_, so the column with the largest scale dominates the choice;
    ## consider standardising the targets or tuning per target
    loss_ = []
    for alpha in alpha_list:
        reg_linear_init = linear_model.Lasso(alpha=alpha)
        reg_linear_init.fit(X_t_in, Y_t_in)
        Y_v_pred = reg_linear_init.predict(X_v_in)
        loss_.append(np.sum((Y_v_pred - Y_v_in)**2))
    alpha_best = alpha_list[np.argmin(np.array(loss_))]
    ## refit on the full training part of the fold with the selected penalty
    reg_linear = linear_model.Lasso(alpha=alpha_best)
    reg_linear.fit(X_train, Y_train)

    ## inner parameter selection for tree regression
    ## random_state is fixed because the tree permutes the features at each
    ## split; without it repeated runs can select different trees
    loss_ = []
    for depth in depth_list:
        reg_tree_init = DecisionTreeRegressor(max_depth=depth, random_state=SEED).fit(X_t_in, Y_t_in)
        Y_v_pred = reg_tree_init.predict(X_v_in)
        loss_.append(np.sum((Y_v_pred - Y_v_in)**2))
    depth_best = depth_list[np.argmin(np.array(loss_))]
    reg_tree = DecisionTreeRegressor(max_depth=depth_best, random_state=SEED).fit(X_train, Y_train)

    ## make predictions on training set
    Y_train_linear[loc_train, :, id_fold] = reg_linear.predict(X_train)
    Y_train_tree[loc_train, :, id_fold] = reg_tree.predict(X_train)
    ## make predictions on test set
    Y_test_linear[loc_test] = reg_linear.predict(X_test)
    Y_test_tree[loc_test] = reg_tree.predict(X_test)
    ## held-out rows have no training prediction in this fold
    Y_train_linear[loc_test, :, id_fold] = np.nan
    Y_train_tree[loc_test, :, id_fold] = np.nan

print('Time:', datetime.now() - time_start)
    

This is fold  1
This is fold  2
This is fold  3
This is fold  4
This is fold  5
Time: 0:00:00.055118


In [6]:
# save the results
## training-set predictions: one 3-D array per model, stored as .npy
for npy_path, train_pred in (
    ('../predictions/train_linear.npy', Y_train_linear),
    ('../predictions/train_tree.npy', Y_train_tree),
):
    np.save(npy_path, train_pred)
## test-set predictions: one table per model, written as CSV (row index included)
for csv_path, test_pred in (
    ('../predictions/pred_linear.csv', Y_test_linear),
    ('../predictions/pred_tree.csv', Y_test_tree),
):
    pd.DataFrame(test_pred).to_csv(csv_path)