# Predictive analysis of Mais Médicos retention

Importing libraries

In [1]:
import pandas as pd
import pycaret
from pycaret.classification import *
import shap

Reading dataframe

In [2]:
df_original = pd.read_excel("database_mm_0312.xlsx")
df_original.head()

Unnamed: 0,ano,competen,codufmun,ibge_aloc,municipio_destino,uf_destino,cnes,tp_unid,fantasia,nomeprof,...,orcamento_saude,esgotamento_sanitario,taxa_homicidio,mortes_armasdefogo,feminicidio,ideb_anosiniciais,ideb_anosfinais,investimento_infraestrutura,equipamentos_esportivos,equpamentos_culturais
0,2019,2019-06-01,240580,240580,JOAO CAMARA,RN,2474689,2,PSF 2 SAO FRANCISCO,MARLIO EVERTON MACEDO DE PAIVA,...,617.07,35.43,51.495,0.0,28.412,4.2,3.0,1076.54,2.878,5.756
1,2019,2019-06-01,220020,220020,AGUA BRANCA,PI,2368455,2,POSTO DE SAUDE OTACIANA MARIA DO NASCIMENTO,ANA MARIA REGO COSTA,...,997.2,35.77,11.487,0.0,21.671,6.0,4.9,3447.022,46.112,34.584
2,2019,2019-10-01,520310,520310,BALIZA,GO,2500132,2,ESF MARLENE NERY RIBEIRO,RUBENS NOVAIS XAVIER,...,785.85,0.0,38.926,0.0,0.0,6.2,4.9,2781.115,20.032,20.032
3,2019,2019-09-01,290195,290195,APUAREMA,BA,7050062,2,USF OTACIANO,HERALDO ROCHA DE JESUS,...,578.33,0.0,40.928,0.0,26.688,4.2,3.4,4027.146,13.587,27.174
4,2019,2019-09-01,130340,130340,PARINTINS,AM,9640347,2,UNIDADE BASICA DE SAUDE DO BAIRRO DA UNIAO,KEDMA TAVARES BUAS,...,469.07,0.0,21.002,0.0,7.18,5.0,4.4,3341.453,1.767,1.767


Selecting only features which will be useful 

In [3]:
df_original.columns

Index(['ano', 'competen', 'codufmun', 'ibge_aloc', 'municipio_destino',
       'uf_destino', 'cnes', 'tp_unid', 'fantasia', 'nomeprof', 'cns_prof',
       'cpf_prof', 'data_nascimento', 'data_formacao', 'participou_pmmb',
       'sexo', 'ch_total', 'atuacao_previa_no_municipio',
       'anos_atuacao_mesmo_municipio', 'media_vinculos',
       'meses_no_local_alocado', 'churn', 'idade_em_anos', 'Prorrogado',
       'anos_formacao', 'leitos_hospitalar_municipal_estadual',
       'leitos_hospitalares_cirurgico_clinico_complementar',
       'leitos_instalacao_fisica_urgencia',
       'leitos_instalacao_fisica_ambulatorial',
       'leitos_instalacao_fisical_cirurgico',
       'leitos_instalacao_fisica_obstetrico', 'agente_saude', 'dentista',
       'enfermeiro', 'tec_aux_enf', 'populacao', 'porte', 'pib_percapita',
       'tx_pop_ocupada', 'cobertura_saude', 'vagas_medicina_milhab',
       'orcamento_saude', 'esgotamento_sanitario', 'taxa_homicidio',
       'mortes_armasdefogo', 'feminicidi

In [4]:
df_churn = df_original.loc[:,('uf_destino', 
                        'participou_pmmb',
                        'sexo',
                         'atuacao_previa_no_municipio',
                         'anos_atuacao_mesmo_municipio', 
                         'media_vinculos', 
                         'idade_em_anos',
                         'anos_formacao',
                         'leitos_hospitalares_cirurgico_clinico_complementar',
                         'leitos_instalacao_fisica_urgencia',
                        'leitos_instalacao_fisica_obstetrico', 
                        'agente_saude', 
                        'dentista',
                        'enfermeiro', 
                        'tec_aux_enf', 
                        'porte', 
                        'pib_percapita',
                        'tx_pop_ocupada', 
                        'cobertura_saude', 
                        'vagas_medicina_milhab',
                        'orcamento_saude', 
                        'esgotamento_sanitario', 
                        'taxa_homicidio',
                        'ideb_anosiniciais',
                        'ideb_anosfinais', 
                        'investimento_infraestrutura',
                        'equipamentos_esportivos', 
                        'equpamentos_culturais',
                        'churn')]

df_churn.head()

Unnamed: 0,uf_destino,participou_pmmb,sexo,atuacao_previa_no_municipio,anos_atuacao_mesmo_municipio,media_vinculos,idade_em_anos,anos_formacao,leitos_hospitalares_cirurgico_clinico_complementar,leitos_instalacao_fisica_urgencia,...,vagas_medicina_milhab,orcamento_saude,esgotamento_sanitario,taxa_homicidio,ideb_anosiniciais,ideb_anosfinais,investimento_infraestrutura,equipamentos_esportivos,equpamentos_culturais,churn
0,RN,NÃO,Male,Não,0.0,1.0,54.094456,0.084873,0.0,7.0,...,0.0,617.07,35.43,51.495,4.2,3.0,1076.54,2.878,5.756,migrou
1,PI,NÃO,Female,Não,0.0,1.0,44.654346,0.084873,0.0,4.0,...,0.0,997.2,35.77,11.487,6.0,4.9,3447.022,46.112,34.584,permanece
2,GO,,Male,Não,0.0,1.278689,48.147844,1.333333,2.0,8.0,...,0.0,785.85,0.0,38.926,6.2,4.9,2781.115,20.032,20.032,migrou
3,BA,,Male,Não,0.0,1.0,48.533881,0.91718,0.0,5.0,...,0.0,578.33,0.0,40.928,4.2,3.4,4027.146,13.587,27.174,migrou
4,AM,,Female,Não,0.0,1.209677,44.060233,0.999316,0.0,10.0,...,0.0,469.07,0.0,21.002,5.0,4.4,3341.453,1.767,1.767,migrou


In [5]:

clf1 = setup(data = df_churn, 
             target = 'churn',
             train_size=0.7,
             normalize=True,
             normalize_method='minmax',
             fix_imbalance=True,
             remove_multicollinearity=True,
             remove_outliers=True,
             fold=10)

clf1

Unnamed: 0,Description,Value
0,Session id,7909
1,Target,churn
2,Target type,Binary
3,Target mapping,"migrou: 0, permanece: 1"
4,Original data shape,"(1647, 29)"
5,Transformed data shape,"(1625, 33)"
6,Transformed train set shape,"(1130, 33)"
7,Transformed test set shape,"(495, 33)"
8,Numeric features,23
9,Categorical features,5


<pycaret.classification.oop.ClassificationExperiment at 0x20a7d654250>

In [6]:
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.619,0.6625,0.619,0.6201,0.6182,0.2375,0.2387,0.269
lda,Linear Discriminant Analysis,0.6138,0.6495,0.6138,0.6159,0.611,0.2254,0.2284,0.111
gbc,Gradient Boosting Classifier,0.6137,0.6521,0.6137,0.6144,0.612,0.226,0.2272,0.311
lr,Logistic Regression,0.6112,0.6484,0.6112,0.6129,0.6083,0.2201,0.2228,0.91
et,Extra Trees Classifier,0.6112,0.6373,0.6112,0.6121,0.61,0.2215,0.2227,0.218
ridge,Ridge Classifier,0.6103,0.651,0.6103,0.6123,0.607,0.2181,0.2211,0.139
ada,Ada Boost Classifier,0.6024,0.622,0.6024,0.6035,0.6013,0.2043,0.2055,0.202
qda,Quadratic Discriminant Analysis,0.6007,0.6152,0.6007,0.6059,0.5941,0.1985,0.2048,0.117
lightgbm,Light Gradient Boosting Machine,0.5981,0.6454,0.5981,0.5984,0.5969,0.1953,0.1959,0.358
dt,Decision Tree Classifier,0.5964,0.5962,0.5964,0.5971,0.5953,0.1924,0.1931,0.162


In [7]:
tuned_model = tune_model(best_model)
tuned_model

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6293,0.6464,0.6293,0.6293,0.6289,0.2577,0.2581
1,0.5948,0.6331,0.5948,0.5947,0.5947,0.1892,0.1892
2,0.5913,0.6111,0.5913,0.594,0.5904,0.1846,0.186
3,0.6,0.6877,0.6,0.6013,0.596,0.1958,0.1989
4,0.6174,0.7052,0.6174,0.6179,0.6174,0.235,0.2351
5,0.6087,0.6371,0.6087,0.6108,0.6041,0.2129,0.217
6,0.7391,0.7739,0.7391,0.7392,0.7389,0.4774,0.4777
7,0.687,0.7149,0.687,0.6875,0.687,0.3741,0.3743
8,0.6957,0.7381,0.6957,0.6958,0.6957,0.3914,0.3914
9,0.5826,0.6824,0.5826,0.5828,0.582,0.1646,0.165


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [8]:
evaluate_model(tuned_model)


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [9]:
avaliacao = predict_model(tuned_model)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.6747,0.7152,0.6747,0.6758,0.6735,0.3477,0.3495


In [10]:
testar = avaliacao.loc[:,('churn',
             'prediction_label',
             'prediction_score')]

Unnamed: 0,churn,prediction_label,prediction_score
657,permanece,permanece,0.6108
638,migrou,migrou,0.5263
1393,permanece,migrou,0.5555
1041,migrou,permanece,0.6221
5,migrou,migrou,0.7211
...,...,...,...
1051,permanece,permanece,0.6479
1440,permanece,permanece,0.5534
1476,permanece,permanece,0.5829
366,permanece,permanece,0.5240


In [None]:
final_model = finalize_model(tuned_model)
save_model(final_model, 'final_model_classification_pmm')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('label_encoding',
                  TransformerWrapperWithInverse(exclude=None, include=None,
                                                transformer=LabelEncoder())),
                 ('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['anos_atuacao_municipio',
                                              'media_vinculos', 'idade_em_anos',
                                              'anos_formacao',
                                              'leitos_hospitalares_cirurgico_clinico_complementar',
                                              'leitos_insta...
                  TransformerWrapper(exclude=None, include=None,
                                     transformer=MinMaxScaler(clip=False,
                                                              copy=True,
                                                              feature_range=(0,
         

# Fazendo teste de usando regressão

In [102]:
df_original = pd.read_excel("database_mm_0312.xlsx")
df_original.head()

Unnamed: 0,ano,competen,codufmun,ibge_aloc,municipio_destino,uf_destino,cnes,tp_unid,fantasia,nomeprof,...,orcamento_saude,esgotamento_sanitario,taxa_homicidio,mortes_armasdefogo,feminicidio,ideb_anosiniciais,ideb_anosfinais,investimento_infraestrutura,equipamentos_esportivos,equpamentos_culturais
0,2019,2019-06-01,240580,240580,JOAO CAMARA,RN,2474689,2,PSF 2 SAO FRANCISCO,MARLIO EVERTON MACEDO DE PAIVA,...,617.07,35.43,51.495,0.0,28.412,4.2,3.0,1076.54,2.878,5.756
1,2019,2019-06-01,220020,220020,AGUA BRANCA,PI,2368455,2,POSTO DE SAUDE OTACIANA MARIA DO NASCIMENTO,ANA MARIA REGO COSTA,...,997.2,35.77,11.487,0.0,21.671,6.0,4.9,3447.022,46.112,34.584
2,2019,2019-10-01,520310,520310,BALIZA,GO,2500132,2,ESF MARLENE NERY RIBEIRO,RUBENS NOVAIS XAVIER,...,785.85,0.0,38.926,0.0,0.0,6.2,4.9,2781.115,20.032,20.032
3,2019,2019-09-01,290195,290195,APUAREMA,BA,7050062,2,USF OTACIANO,HERALDO ROCHA DE JESUS,...,578.33,0.0,40.928,0.0,26.688,4.2,3.4,4027.146,13.587,27.174
4,2019,2019-09-01,130340,130340,PARINTINS,AM,9640347,2,UNIDADE BASICA DE SAUDE DO BAIRRO DA UNIAO,KEDMA TAVARES BUAS,...,469.07,0.0,21.002,0.0,7.18,5.0,4.4,3341.453,1.767,1.767


In [103]:
df_media = df_original.loc[:,('uf_destino', 
                        'participou_pmmb',
                        'sexo',
                         'atuacao_previa_no_municipio',
                         'anos_atuacao_mesmo_municipio', 
                         'media_vinculos', 
                         'idade_em_anos',
                         'Prorrogado', 
                         'anos_formacao',
                         'leitos_hospitalares_cirurgico_clinico_complementar',
                         'leitos_instalacao_fisica_urgencia',
                        'leitos_instalacao_fisica_obstetrico', 
                        'agente_saude', 
                        'dentista',
                        'enfermeiro', 
                        'tec_aux_enf', 
                        'populacao', 
                        'porte', 
                        'pib_percapita',
                        'tx_pop_ocupada', 
                        'cobertura_saude', 
                        'vagas_medicina_milhab',
                        'orcamento_saude', 
                        'esgotamento_sanitario', 
                        'taxa_homicidio',
                        'ideb_anosiniciais',
                        'ideb_anosfinais', 
                        'investimento_infraestrutura',
                        'equipamentos_esportivos', 
                        'equpamentos_culturais',
                        'meses_no_local_alocado')]

In [104]:
from pycaret.regression import *

In [106]:

reg_mm = setup(data = df_media, 
                target = 'meses_no_local_alocado',
                train_size=0.7,
                normalize=True,
                normalize_method='minmax',
                remove_multicollinearity=True,
                remove_outliers=True,
                fold=10)

reg_mm

Unnamed: 0,Description,Value
0,Session id,8664
1,Target,meses_no_local_alocado
2,Target type,Regression
3,Original data shape,"(1647, 31)"
4,Transformed data shape,"(1589, 35)"
5,Transformed train set shape,"(1094, 35)"
6,Transformed test set shape,"(495, 35)"
7,Numeric features,24
8,Categorical features,6
9,Rows with missing values,33.2%


<pycaret.regression.oop.RegressionExperiment at 0x2ad9bc479d0>

In [107]:
best_model_reg = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,11.3476,212.7985,14.5522,0.5014,0.6086,0.7652,0.246
rf,Random Forest Regressor,11.4977,217.6128,14.7153,0.4904,0.61,0.7724,0.361
lightgbm,Light Gradient Boosting Machine,11.7192,227.8801,15.065,0.4654,0.6222,0.7602,0.256
br,Bayesian Ridge,11.8589,229.3569,15.1155,0.4639,0.6292,0.84,0.081
ridge,Ridge Regression,11.9133,230.9964,15.1679,0.4597,0.6323,0.8328,0.08
omp,Orthogonal Matching Pursuit,11.851,232.2269,15.193,0.4585,0.6309,0.8436,0.082
et,Extra Trees Regressor,11.7852,233.2436,15.2265,0.4542,0.6206,0.7834,0.297
ada,AdaBoost Regressor,13.0004,242.7223,15.5314,0.4348,0.6548,0.9364,0.148
llar,Lasso Least Angle Regression,12.5919,245.7887,15.6289,0.4275,0.6492,0.8986,0.069
lasso,Lasso Regression,12.5919,245.7887,15.6289,0.4275,0.6492,0.8986,0.089


In [109]:
tuned_model_reg = tune_model(best_model_reg)


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [113]:
evaluate_model(tuned_model)


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…