# 04 Regression model 

> k-fold cross-validation to choose which covariates to include in the regression model

In [1]:
#|default_exp core.04_regression_model

In [2]:
#|export
import dementia_inequalities as proj
from dementia_inequalities import const, log, utils, tools
import adu_proj.utils as adutils

In [3]:
#|export
import numpy as np 
import pandas as pd 
from matplotlib import pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import KFold, cross_val_score
import itertools

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


---

In [4]:
# Load df_dem_plus.csv from the project's output path (const.output_path)
df_dem_plus = pd.read_csv(f"{const.output_path}/df_dem_plus.csv")

# Preview the first rows
df_dem_plus.head()

Unnamed: 0,area_code,area_name,pop,DEM_afflicted,HYP_afflicted,DM_afflicted,STIA_afflicted,ALevel_plus,Female_all,Male_all,...,Male_over_65,over_65,white_pc,DEM_afflicted_pc,ALevel_plus_pc,over_65_pc,female_pc,HYP_afflicted_pc,DM_afflicted_pc,STIA_afflicted_pc
0,E07000091,New Forest,151481,1858.546949,28892.13902,9338.69541,4466.648271,72749.0,91513.3596,84729.6073,...,23644.9796,51920.6826,0.932,0.012269,0.480252,0.342754,0.604124,0.190731,0.061649,0.029487
1,E09000027,Richmond upon Thames,156795,1016.262785,16263.714161,5723.91313,2019.443594,112919.0,101226.673,93918.3682,...,14170.0187,31637.7606,0.63,0.006481,0.72017,0.201778,0.645599,0.103726,0.036506,0.01288
2,E07000116,Tunbridge Wells,95022,876.25526,12908.434533,4606.674451,1889.024705,52395.0,59540.1412,56126.1458,...,10171.4566,22570.009,0.842,0.009222,0.551399,0.237524,0.626593,0.135847,0.04848,0.01988
3,E07000121,Lancaster,123214,1228.376774,16806.42122,6784.714317,2647.583108,65724.0,72615.648,69496.5141,...,13527.6316,29465.7648,0.878,0.009969,0.533413,0.239143,0.589346,0.1364,0.055064,0.021488
4,E07000042,Mid Devon,67987,547.793487,10854.465333,4281.878984,1530.207441,33190.0,42541.3752,40639.6312,...,9396.6343,20046.6806,0.938,0.008057,0.488182,0.29486,0.625728,0.159655,0.062981,0.022507


In [5]:
# # potential covariates 
# covar_opt = ['over_65_pc', 'female_pc', 'ALevel_plus_pc', 'white_pc', 'HYP_afflicted_pc', 'DM_afflicted_pc', 'STIA_afflicted_pc']

In [7]:
# X_data = df_dem_plus[['ALevel_plus_pc', 'white_pc', 'over_65_pc', 'female_pc']].values.reshape(-1,4)
# Y_data = df_dem_plus['DEM_afflicted_pc'].values.reshape(-1,1)

In [8]:
# # Parameters for cross validation 
# n_folds = 5
# cv_kfold = KFold(n_folds, random_state=13, shuffle=True)

# # Model 
# model = linear_model.LinearRegression()

# # evaluate the model using 5-fold CV 
# scores = cross_val_score(model, X=X_data, y=Y_data, cv=cv_kfold)

# print('Accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

Accuracy: 0.745 (0.034)


In [9]:
# Potential covariates
covar_opt = ['over_65_pc', 'female_pc', 'ALevel_plus_pc', 'white_pc', 'HYP_afflicted_pc', 'DM_afflicted_pc', 'STIA_afflicted_pc']

# Outcome data, as a single-column 2-D array
Y_data = df_dem_plus['DEM_afflicted_pc'].values.reshape(-1,1)

# Parameters for cross validation (shuffled folds, fixed seed so reruns give the same splits)
n_folds = 5
cv_kfold = KFold(n_folds, random_state=13, shuffle=True)

# Model
model = linear_model.LinearRegression()

# Both dicts share the same integer key:
#   covar_comb[key]       -> tuple of covariate names in that subset
#   perform_by_covar[key] -> mean cross-validated score of the model fitted on that subset
# NOTE: cross_val_score is called without `scoring`, so it uses the estimator's own
# default scorer; for LinearRegression that is R^2 (not a classification accuracy),
# which is why scores can be negative.
covar_comb = {}
perform_by_covar = {}

key_num = 0
# Try every non-empty subset of covar_opt, from single covariates up to the full set.
# The upper bound follows len(covar_opt) so the search stays exhaustive if the list is edited.
for subset_size in range(1, len(covar_opt) + 1):
    for subset in itertools.combinations(covar_opt, subset_size):
        covar_comb[key_num] = subset
        # Selecting with a list of column names always yields a 2-D (n_rows, subset_size) array
        X_data = df_dem_plus[list(subset)].to_numpy()
        # Evaluate the model using k-fold CV and keep the mean score across folds
        scores = cross_val_score(model, X=X_data, y=Y_data, cv=cv_kfold)
        perform_by_covar[key_num] = np.mean(scores)
        key_num += 1

In [10]:
# Covariate subset whose mean cross-validation score is the highest
# (on ties, the first subset encountered wins)
covar_comb[max(perform_by_covar.items(), key=lambda key_score: key_score[1])[0]]

('over_65_pc', 'HYP_afflicted_pc', 'DM_afflicted_pc', 'STIA_afflicted_pc')

In [11]:
# Keys of the five covariate subsets with the highest mean CV score, best first.
# The label printed below says "Accuracy", but the value is whatever perform_by_covar
# stores: the mean of cross_val_score's default scorer (R^2 for a regressor).
ranked_pairs = sorted(perform_by_covar.items(), key=lambda key_score: key_score[1], reverse=True)
max_5 = [key for key, _score in ranked_pairs[:5]]
[f'Covariates:{covar_comb[key]}, Accuracy:{perform_by_covar[key]}' for key in max_5]

["Covariates:('over_65_pc', 'HYP_afflicted_pc', 'DM_afflicted_pc', 'STIA_afflicted_pc'), Accuracy:0.7674882777954846",
 "Covariates:('over_65_pc', 'ALevel_plus_pc', 'HYP_afflicted_pc', 'DM_afflicted_pc', 'STIA_afflicted_pc'), Accuracy:0.7667498581680416",
 "Covariates:('over_65_pc', 'white_pc', 'HYP_afflicted_pc', 'DM_afflicted_pc', 'STIA_afflicted_pc'), Accuracy:0.7667010442466264",
 "Covariates:('over_65_pc', 'HYP_afflicted_pc', 'STIA_afflicted_pc'), Accuracy:0.7664719739876983",
 "Covariates:('over_65_pc', 'white_pc', 'HYP_afflicted_pc', 'STIA_afflicted_pc'), Accuracy:0.7664142681554167"]

In [28]:
# Create dataframe of combinations of covariates: one row per subset key, one column
# per position in the subset (shorter subsets are padded with missing values)
df_covar_comb = pd.DataFrame.from_dict(covar_comb, orient='index')
# Label the columns 'covariate 1' ... 'covariate N' from the actual width, so this
# keeps working when the number of candidate covariates changes
df_covar_comb.columns = [f'covariate {position}' for position in range(1, df_covar_comb.shape[1] + 1)]

# Create dataframe of the mean cross-validation score per subset key
# (from_dict names the single value column 0; give it a descriptive label)
df_accuracy = (
    pd.DataFrame.from_dict(perform_by_covar, orient='index')
    .rename(columns={0: 'accuracy (mean)'})
)

# Merge the score and the covariate names on the shared subset key (the index)
df_model_acc = pd.merge(df_accuracy, df_covar_comb, left_index=True, right_index=True)

In [30]:
# Show the merged table: mean CV score alongside the covariates of each subset
df_model_acc

Unnamed: 0,accuracy (mean),covariate 1,covariate 2,covariate 3,covariate 4,covariate 5,covariate 6,covariate 7
0,0.739684,over_65_pc,,,,,,
1,-0.003884,female_pc,,,,,,
2,0.110213,ALevel_plus_pc,,,,,,
3,0.552557,white_pc,,,,,,
4,0.641614,HYP_afflicted_pc,,,,,,
...,...,...,...,...,...,...,...,...
122,0.763680,over_65_pc,female_pc,ALevel_plus_pc,HYP_afflicted_pc,DM_afflicted_pc,STIA_afflicted_pc,
123,0.763539,over_65_pc,female_pc,white_pc,HYP_afflicted_pc,DM_afflicted_pc,STIA_afflicted_pc,
124,0.765706,over_65_pc,ALevel_plus_pc,white_pc,HYP_afflicted_pc,DM_afflicted_pc,STIA_afflicted_pc,
125,0.744675,female_pc,ALevel_plus_pc,white_pc,HYP_afflicted_pc,DM_afflicted_pc,STIA_afflicted_pc,
