In [1]:
import pltkit
import numpy as np
import pandas as pd
import plotly.express as px
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from  process_covariates import generate_panel_data, ssp_sdi
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF

# Root directory handed to the data loaders and used to build the paths of the
# coefficient CSV files written/read further down.
# NOTE(review): hard-coded absolute local path - adjust per machine or move it
# to a configurable setting.
wdir = 'X:\\user\\liprandicn\\Health Impacts Model'

### Open and explore data

In [None]:
# Build the panel (gdp, education, fertility, GINI and mortality, 1980-2019)
# and keep only the causes listed in pltkit.diseases_level2
variables = generate_panel_data(wdir).loc[
    lambda panel: panel['cause'].isin(pltkit.diseases_level2)
]

In [None]:
# Scatter of log-mortality against GINI for a single cause / age group / sex
# slice of the panel, coloured by year
country_df = variables.loc[
    lambda d: (d['cause'] == 'Cardiovascular diseases')
    & (d['age_group'] == '5-14')
    & (d['sex'] == 'Female')
]
fig = px.scatter(
    country_df,
    x='GINI',
    y='logmortality',
    color='year',
    hover_data='country',
    height=500,
    width=700,
)
fig.update_layout(showlegend=False)
fig.show()

### Regression analysis: Finding the best covariates

In [37]:
def ols_errors(mortality, df_disease, predictors_set, iterations=35, random_state=None, by_sex=False):
    """Estimate hold-out OLS errors for several covariate sets, per sex and age.

    For every (sex, age) pair, the matching rows of ``df_disease`` are split
    80/20 ``iterations`` times. An OLS model with an intercept is fit on the
    training part and scored on the held-out part with ``pltkit.rmse``,
    ``pltkit.mape`` and ``pltkit.rss``. Each score is rounded to 3 decimals and
    the scores are averaged over the repetitions.

    Parameters
    ----------
    mortality : str
        Name of the dependent-variable column in ``df_disease``.
    df_disease : pandas.DataFrame
        Data containing an ``'age'`` column, the ``mortality`` column and every
        covariate column named in ``predictors_set`` (plus ``'sex'`` when
        ``by_sex`` is true).
    predictors_set : sequence of list of str
        Candidate covariate sets; each inner list is one model specification.
    iterations : int, default 35
        Number of random train/validation splits per covariate set.
    random_state : int or None, default None
        Seed for the splits. ``None`` keeps the unseeded behaviour.
    by_sex : bool, default False
        When false (the historical behaviour) rows are selected by age only, so
        the 'Female' and 'Male' rows of the result are both estimated on the
        pooled data and differ only through the random splits. When true, rows
        are additionally restricted to the sex being processed.

    Returns
    -------
    pandas.DataFrame
        Float frame indexed by (sex, age) with columns
        (metric, position of the covariate set in ``predictors_set``), where
        metric is one of 'RMSE', 'MAPE', 'RSS'. Pairs without data stay NaN.
    """
    sex_groups = ['Female', 'Male']

    # NOTE(review): the first label is '0-5' while other age lists in this
    # notebook start at '0-4'. If the data uses '0-4', that group is silently
    # skipped (row left as NaN) - confirm against the 'age' column.
    ages = ['0-5', '5-9','10-14','15-19','20-24','25-29','30-34','35-39','40-44',
                '45-49','50-54','55-59','60-64','65-69','70-74','75-79','80-84', '85+']

    metric_fns = {'RMSE': pltkit.rmse, 'MAPE': pltkit.mape, 'RSS': pltkit.rss}
    result_columns = pd.MultiIndex.from_product([list(metric_fns), range(len(predictors_set))])

    # Allocate as float so the result supports numeric operations directly
    errors_age = pd.DataFrame(index=pd.MultiIndex.from_product([sex_groups, ages]),
                              columns=result_columns, dtype=float)

    # One generator shared by all splits: reproducible, yet each split differs
    rng = None if random_state is None else np.random.RandomState(random_state)

    for sex_group in sex_groups:

        for age in ages:

            mask = df_disease['age'] == age
            if by_sex:
                mask &= df_disease['sex'] == sex_group
            df = df_disease[mask]

            if df.empty:
                continue

            # Per-iteration scores for this (sex, age) pair
            errors_df = pd.DataFrame(index=range(iterations), columns=result_columns, dtype=float)

            # Iterate over all covariate combinations
            for i, predictor_set in enumerate(predictors_set):
                # Repeat the split several times to dampen error outliers
                for j in range(iterations):

                    # Divide dataset into training and validation using 80/20 split
                    train_df, val_df = train_test_split(df, test_size=0.2, random_state=rng)

                    # Fit OLS model on training data. has_constant='add' forces
                    # the intercept column even if a covariate happens to be
                    # constant in this split, so train and validation design
                    # matrices always have the same columns.
                    X_train = sm.add_constant(train_df[predictor_set], has_constant='add')
                    model = sm.OLS(train_df[mortality], X_train).fit()

                    # Score the model on the held-out data
                    X_valid = sm.add_constant(val_df[predictor_set], has_constant='add')
                    y_valid = val_df[mortality]
                    y_pred = model.predict(X_valid)

                    # Store errors
                    for metric, metric_fn in metric_fns.items():
                        errors_df.loc[j, (metric, i)] = np.round(metric_fn(y_valid, y_pred), 3)

            errors_age.loc[(sex_group, age), :] = errors_df.mean(axis=0)

    return errors_age

In [None]:
# Causes available after the filter applied when loading the panel
variables['cause'].unique()

In [96]:
disease = 'All causes'

# Candidate covariate sets compared by out-of-sample error
predictors = [#['gdppc', 'education', 'fertility', 'year'],
             # ['gdppc', 'gdppc_2', 'education', 'fertility', 'year'],
              ['loggdppc', 'education', 'TFU25', 'year'],
              ['loggdppc', 'loggdppc_2', 'education', 'TFU25', 'year'],
              ['loggdppc', 'loggdppc_2', 'education', 'TFU25', 'year', 'GINI'],
              ['loggdplag', 'loggdplag_2', 'education', 'TFU25', 'year', 'GINI'],
              ['loggdplag', 'education', 'TFU25', 'year'],
              ['loggdplag', 'education', 'TFU25', 'year', 'GINI'],
              ['sdi', 'year'],
              ]

# Keep only rows that are complete for every column any candidate model uses
# (plus the columns that were already required), so that no fit receives NaNs
# and all models are compared on the same sample.
required_columns = sorted({'loggdplag', 'education', 'fertility', 'GINI', 'sdi', 'logmortality',
                           *(col for predictor_set in predictors for col in predictor_set)})
df = variables[variables['cause']==disease].dropna(subset=required_columns)

errors_df = ols_errors('logmortality', df, predictors)

In [None]:
# One legend label per covariate set
labels = ['Intercept' if len(p) == 0 else ' + '.join(p) for p in predictors]

# Rows: sex (Female, Male); columns: error metric
fig, axes = plt.subplots(2,3, figsize=(15,10))

for i,sex in enumerate(['Female', 'Male']):
    for j,error in enumerate(['MAPE', 'RMSE', 'RSS']):
        ax = axes[i,j]
        for column,label in enumerate(labels):
            # Take x from the selected series itself: MultiIndex.levels is
            # sorted lexicographically and does not follow the row order, so
            # using it as x would pair age labels with the wrong values.
            series = errors_df.loc[sex, (error,column)].astype(float)
            ax.plot(series.index, series.values, label=label)
        ax.set_title(error)
        ax.set_ylabel(error)
        ax.set_xlabel('Age group')
        ax.tick_params('x', rotation=45)

# Single legend, anchored below the grid relative to the last panel
axes[-1,-1].legend(loc='upper center', bbox_to_anchor=(-0.7, -0.15), ncol=1)
fig.suptitle(disease)

### Parameter computation

In [8]:
def get_regression_coef(variables, causes, covariates, name, n_iterations=20, random_state=None):
    """Fit OLS models of ``relative_mortality`` per cause, sex and age group and save them.

    For each (cause, age group, sex) subset of ``variables``, an OLS model with
    an intercept is fit on a random 80% of the rows, ``n_iterations`` times.
    The coefficient, standard error and p-value of every covariate, and the
    R-squared, are averaged over the repetitions. The resulting table is
    written to
    ``{wdir}\\analysis\\regression_coefficients_{name}_WHO-groups.csv``, where
    ``wdir`` is the notebook-level working directory.

    Parameters
    ----------
    variables : pandas.DataFrame
        Panel with ``'cause'``, ``'age_group'``, ``'sex'``,
        ``'relative_mortality'`` and all ``covariates`` columns.
    causes : list of str
        Causes to process.
    covariates : list of str
        Explanatory columns; rows with a missing value in any of them are dropped.
    name : str
        Tag inserted into the output file name.
    n_iterations : int, default 20
        Number of random 80/20 splits averaged per subset.
    random_state : int or None, default None
        Seed for the splits. ``None`` keeps the unseeded behaviour.

    Returns
    -------
    None
        The result is only written to disk.
    """
    # age_groups = ['0-4', '5-9','10-14','15-19','20-24','25-29','30-34','35-39','40-44',
    #             '45-49','50-54','55-59','60-64','65-69','70-74','75-79','80-84', '85+']
    age_groups = ['0-4', '15-24', '25-34', '35-44', '45-54', '5-14', '55-64', '65-74', '75-84', '85+']
    sex_groups = ['Female', 'Male']

    # Level names must follow the order of the level values (cause, sex, age
    # group); they end up as the index header of the CSV.
    params = pd.DataFrame(
        index=pd.MultiIndex.from_product([causes,  sex_groups, age_groups], names=['cause', 'sex', 'age_group']),
        columns=pd.MultiIndex.from_product([covariates, ['coeff', 'bse', 'pvalue']], names=['covariate', 'metric'])
        ).sort_index().sort_index(axis=1)

    variables_cleaned = variables.dropna(subset=covariates)

    # One generator shared by all splits: reproducible, yet each split differs
    rng = None if random_state is None else np.random.RandomState(random_state)

    for cause in causes:

        print('Processing cause: ', cause)

        for age_group in age_groups:
            for sex in sex_groups:

                df = variables_cleaned[(variables_cleaned['cause']==cause) &
                                    (variables_cleaned['age_group']==age_group) &
                                    (variables_cleaned['sex']==sex)]

                if df.empty:
                    print(cause, age_group, sex, " --> Empty dataframe. Skip.")
                    continue

                df = df.sort_index()

                # Fit on the 80% training part of repeated splits and average
                coeff = []; bse = []; pval=[]; r2=[]

                for _ in range(n_iterations):

                    train_df,_unused = train_test_split(df, test_size=0.2, random_state=rng)
                    X_train = sm.add_constant(train_df[covariates])
                    y_train = train_df['relative_mortality']
                    model = sm.OLS(y_train, X_train).fit()

                    # Select by covariate name rather than by position, so the
                    # values stay aligned with `covariates` even if
                    # add_constant did not prepend an intercept column.
                    coeff.append(model.params[covariates].values)
                    bse.append(model.bse[covariates].values)
                    pval.append(model.pvalues[covariates].values)
                    r2.append(model.rsquared)

                coeff_array = np.array(coeff).mean(axis=0)
                bse_array = np.array(bse).mean(axis=0)
                pval_array = np.array(pval).mean(axis=0)
                r2_mean = np.mean(r2)

                for metric, array in zip(['coeff', 'bse', 'pvalue'], [coeff_array, bse_array, pval_array]):
                    for cov, val in zip(covariates, array):
                        params.loc[(cause, sex, age_group), (cov, metric)] = val

                params.loc[(cause, sex, age_group), 'r2'] = r2_mean

    params.to_csv(f'{wdir}\\analysis\\regression_coefficients_{name}_WHO-groups.csv')

In [None]:
# Specification to estimate; the alternative covariate set is kept for reference
# covariates = ['loggdplag', 'education', 'TFU25', 'year', 'GINI']
covariates = ['sdi', 'year']
causes = [
    'Cardiovascular diseases',
    'Chronic respiratory diseases',
    'Diabetes and kidney diseases',
    'Respiratory infections and tuberculosis',
]
name = 'sdi'

get_regression_coef(variables=variables, causes=causes, covariates=covariates, name=name)

In [None]:
# Full-sample OLS for one cause / age / sex slice, with summary, RSS and VIFs
predictors = ['loggdppc', 'loggdppc_2', 'education', 'TFR', 'year', 'GINI']

# Require complete rows for everything the model uses ('gdppc' was already required)
required_columns = ['gdppc', 'relative_mortality', *predictors]
df = variables[(variables['cause']=='Cardiovascular diseases') & (variables['age']=='0-4') & (variables['sex']=='Female')].dropna(subset=required_columns)

# Select the target instead of popping it, so `df` is left intact
y = df['relative_mortality']
X = sm.add_constant(df[predictors])
model = sm.OLS(y, X)
results = model.fit()
print(results.summary())
# Same (observed, predicted) argument order as the other pltkit.rss calls
print('RSS:', pltkit.rss(y, results.fittedvalues))

# Variance inflation factor of every design-matrix column (intercept included)
vif_data = pd.DataFrame({"variable": X.columns,
                         "VIF": [VIF(X.values, i) for i in range(X.shape[1])]})
print(vif_data)

### Projections

In [34]:
# Read the saved coefficient tables: two header rows (covariate, metric) and
# three index columns
params_linear = pd.read_csv(
    f'{wdir}\\analysis\\regression_coefficients_linear_WHO-groups.csv',
    header=[0,1],
    index_col=[0,1,2],
)
params_sdi = pd.read_csv(
    f'{wdir}\\analysis\\regression_coefficients_sdi_WHO-groups.csv',
    header=[0,1],
    index_col=[0,1,2],
)

In [None]:
# Display the coefficient table read from the 'sdi' CSV
params_sdi

In [None]:
# Keep only the 'coeff' columns for one cause, flatten the two column levels
# into '<covariate>_coeff' names and turn the remaining index levels into columns
params = (
    params_sdi
    .xs('coeff', axis=1, level=1, drop_level=False)
    .loc['Cardiovascular diseases']
)
params.columns = ['_'.join((covariate, metric)) for covariate, metric in params.columns]
params = params.reset_index()
params


In [None]:
# Load the Sociodemographic index (SDI) table; later cells read its
# 'Scenario', 'Sex', 'ISO3', 'year' and 'sdi' columns
sdi = ssp_sdi(wdir)

In [45]:
# Pair every SDI row with every coefficient row (Cartesian product)
df_cross = pd.merge(sdi, params, how='cross')

In [None]:
# Linear combination of the covariates with their coefficients.
# NOTE(review): both terms are negated and no intercept enters the sum -
# confirm this matches how the regression target was defined.
df_cross['Mortality'] = (
    -df_cross['year'].mul(df_cross['year_coeff'])
    - df_cross['sdi'].mul(df_cross['sdi_coeff'])
)

In [None]:
# SDI trajectory of one country under SSP2 (both sexes combined).
# The ISO3 filter makes the data agree with the title, which names `country`.
country = 'MEX'
df = sdi[(sdi['Scenario']=='SSP2') & (sdi['Sex']=='Both') & (sdi['ISO3']==country)]
fig = px.line(df, x='year', y='sdi', color='ISO3', title='Sociodemographic index projections for '+country, height=500, width=700)
fig.show()

In [None]:
# Predicted values for one country, females only ('Sex' comes from the SDI
# table, 'sex' from the coefficient table).
# The subset gets its own name so that `df_cross` is not overwritten and the
# cell can be re-run with a different country.
# NOTE(review): rows of every 'Scenario' are kept here - confirm that is intended.
country = 'NLD'
df_country = df_cross[(df_cross['ISO3']==country) & (df_cross['Sex']=='Female') & (df_cross['sex']=='Female')]
fig = px.scatter(df_country, x='year', y='Mortality', color='age_group', title='Predicted mortality from Cardiovascular Diseases for '+country, height=500, width=700)
fig.show()

In [None]:
# Display df_cross for inspection
df_cross