In [1]:
# This script evaluates the prediction performance using Gini index, poverty rate, and median income

In [2]:
# import modules
import numpy as np
import pandas as pd
import seaborn as sns
import pickle, time, scipy, operator
import matplotlib as mpl
import matplotlib.colors as mcolors
import matplotlib.cm as cm
from scipy import interp
from scipy.ndimage import filters
from scipy.stats import norm
from sklearn import tree
from sklearn.linear_model import LinearRegression
from datetime import datetime
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.animation as manimation
from matplotlib.patches import Ellipse
from IPython import display
#import seaborn as sns; sns.set()
%matplotlib inline

In [3]:
# function definitions
def Gini_idx(X):
    """
    Gini-type inequality index of an income sample.

    X: income array (any order; it is sorted here before use)

    Returns 1 - mean(L) / mean(P), where L holds the cumulative income
    shares of the ascending sample and P the matching cumulative population
    shares 1/n, 2/n, ..., 1. Identical incomes give 0. For n observations
    this equals n / (n + 1) times the classical sample Gini coefficient.
    """
    ascending = np.sort(X)
    n_obs = len(ascending)
    # Lorenz-curve ordinates: share of total income held by the poorest k units
    lorenz_share = np.cumsum(ascending) / np.sum(ascending)
    # line of perfect equality evaluated at the same k
    equal_share = np.arange(1, n_obs + 1) / n_obs
    return 1 - np.mean(lorenz_share) / np.mean(equal_share)

def median_wage(X):
    """
    Median of an income sample.

    X: income array (np.median does not require it to be sorted)
    """
    middle_income = np.median(X)
    return middle_income

def poverty_rate(X, threshold=12760):
    """
    Fraction of incomes that lie strictly below a poverty line.

    X: income array or any array-like (list, tuple, ...); order is irrelevant
    threshold: poverty line in the same units as X. The default 12760 is the
        value previously hard-coded here.
        NOTE(review): 12760 looks like the 2020 U.S. HHS poverty guideline for
        a one-person household -- confirm it is the intended reference.

    Returns the share of entries with X < threshold (an income exactly equal
    to the threshold is not counted as poor).
    """
    # Coerce first so plain Python sequences support the element-wise comparison
    incomes = np.asarray(X)
    below_line = incomes < threshold
    pvt_rt = np.sum(below_line) / len(incomes)
    return pvt_rt

In [4]:
# load data
def _read_matrix(csv_path, **read_csv_kwargs):
    """Read a CSV file with pandas and return its values as a NumPy array."""
    table = pd.read_csv(csv_path, **read_csv_kwargs)
    return table.to_numpy()

## ground truth, one row per distribution
## (rows are exponentiated before the indices are computed, so presumably log-incomes -- confirm)
Y = _read_matrix('../../../data/income/processed/dat_Y.csv')
## quantile predictions of the three distributional models (first CSV column is the row index)
Y_WDL = _read_matrix('../predictions/qt_test_WDL.csv', index_col=0)
Y_Frechet = _read_matrix('../predictions/qt_test_Frechet.csv', index_col=0)
Y_CLR = _read_matrix('../predictions/qt_test_CLR.csv', index_col=0)
## scalar predictions of the indices themselves
## (assumed column order: Gini, median, poverty rate -- confirm against the files)
indices_linear = _read_matrix('../predictions/pred_linear.csv', index_col=0)
indices_tree = _read_matrix('../predictions/pred_tree.csv', index_col=0)
## quantile grid: levels k / (n_levs + 1) for k = 1, ..., n_levs, i.e. 0.01, 0.02, ..., 0.99
n_dist = Y.shape[0]
n_levs = 99
q_vec = np.arange(1, n_levs + 1) / (n_levs + 1)
## summarise every ground-truth row by its empirical quantiles on that grid
Q_mat = np.array([np.quantile(Y[row], q_vec) for row in range(n_dist)])

In [5]:
# create indices
def _index_triplet(log_quantiles):
    """Return [Gini index, median, poverty rate] of the incomes exp(log_quantiles)."""
    incomes_ = np.exp(log_quantiles)  ## take exponential
    return [Gini_idx(incomes_), median_wage(incomes_), poverty_rate(incomes_)]

## one [Gini, median, poverty] row per distribution: ground-truth quantiles first,
## then the predicted quantiles of each model (all indexed by the rows of Y)
_dist_ids = range(Y.shape[0])
indices_ = np.array([_index_triplet(Q_mat[d]) for d in _dist_ids])
indices_WDL = np.array([_index_triplet(Y_WDL[d]) for d in _dist_ids])
indices_Frechet = np.array([_index_triplet(Y_Frechet[d]) for d in _dist_ids])
indices_CLR = np.array([_index_triplet(Y_CLR[d]) for d in _dist_ids])
## find the outlier
#loc = np.where(indices_CLR[:, 2] < 0.2)[0][0]

In [8]:
# calculate RMSE and R-squared
def _mse_vs_truth(estimate, col):
    """Mean squared error between column `col` of `indices_` and of `estimate`."""
    return np.mean((indices_[:, col] - estimate[:, col])**2)

_cols = range(3)  ## columns: 0 = Gini, 1 = median, 2 = poverty rate
## RMSE
RMSE_WDL = [np.sqrt(_mse_vs_truth(indices_WDL, c)) for c in _cols]
RMSE_Frechet = [np.sqrt(_mse_vs_truth(indices_Frechet, c)) for c in _cols]
RMSE_CLR = [np.sqrt(_mse_vs_truth(indices_CLR, c)) for c in _cols]
RMSE_tree = [np.sqrt(_mse_vs_truth(indices_tree, c)) for c in _cols]
RMSE_linear = [np.sqrt(_mse_vs_truth(indices_linear, c)) for c in _cols]
print('Gini', 'Median', 'Poverty')
for _label, _values in (
    ('RMSE WDL:', RMSE_WDL),
    ('RMSE Frechet:', RMSE_Frechet),
    ('RMSE CLR:', RMSE_CLR),
    ('RMSE linear:', RMSE_linear),
    ('RMSE tree:', RMSE_tree),
):
    print(_label, _values)
## R-squared
## TSS holds the (population) variance of each true index, so 1 - MSE / variance
## is the usual 1 - SSE / SST; it is negative when a model does worse than the mean
TSS = [np.var(indices_[:, c]) for c in _cols]
RS_WDL = [1 - _mse_vs_truth(indices_WDL, c) / TSS[c] for c in _cols]
RS_Frechet = [1 - _mse_vs_truth(indices_Frechet, c) / TSS[c] for c in _cols]
RS_CLR = [1 - _mse_vs_truth(indices_CLR, c) / TSS[c] for c in _cols]
RS_tree = [1 - _mse_vs_truth(indices_tree, c) / TSS[c] for c in _cols]
RS_linear = [1 - _mse_vs_truth(indices_linear, c) / TSS[c] for c in _cols]
for _label, _values in (
    ('R-squared WDL:', RS_WDL),
    ('R-squared Frechet:', RS_Frechet),
    ('R-squared CLR:', RS_CLR),
    ('R-squared linear:', RS_linear),
    ('R-squared tree:', RS_tree),
):
    print(_label, _values)

Gini Median Poverty
RMSE WDL: [0.029026398885681783, 4017.4491312657087, 0.03713405269551012]
RMSE Frechet: [0.05237650595720725, 5113.148897475095, 0.05433941609611828]
RMSE CLR: [0.03016346945447764, 11065.044879520286, 0.03950181186817326]
RMSE linear: [0.02974135768807214, 4433.294218915797, 0.04075083070987116]
RMSE tree: [0.03169516824938421, 4535.967170096582, 0.03925984477814617]
R-squared WDL: [0.2124098267529616, 0.3690459357530289, 0.2811937417996522]
R-squared Frechet: [-1.5644073739313664, -0.02205401547357555, -0.5392071979983524]
R-squared CLR: [0.14949561449703974, -3.7863376093015626, 0.1866055899673511]
R-squared linear: [0.17313320737127935, 0.23166595311125848, 0.13435448457388088]
R-squared tree: [0.060925400768686355, 0.19566535672526975, 0.19653991338895405]
