# Predictive analysis of Mais Médicos retention

Importing libraries

In [2]:
import pandas as pd
from pycaret.classification import *
import shap

Reading dataframe

In [3]:
df_original = pd.read_excel("base_tratadav2.xlsx")
df_original.head()

Unnamed: 0,ano,competen,codufmun,ibge_aloc,municipio_destino,uf_destino,cnes,tp_unid,fantasia,nomeprof,...,vagas_medicina_milhab,orcamento_saude,esgotamento_sanitario,taxa_homicidio,ideb_anosiniciais,ideb_anosfinais,investimento_infraestrutura,equipamentos_esportivos,equpamentos_culturais,distancia_capital
0,2019,2019-06-01,240580,240580,JOAO CAMARA,RN,2474689,2,PSF 2 SAO FRANCISCO,MARLIO EVERTON MACEDO DE PAIVA,...,0.0,617.07,35.43,51.495,4.2,3.0,1076.54,2.878,5.756,81.0
1,2019,2019-06-01,220020,220020,AGUA BRANCA,PI,2368455,2,POSTO DE SAUDE OTACIANA MARIA DO NASCIMENTO,ANA MARIA REGO COSTA,...,0.0,997.2,35.77,11.487,6.0,4.9,3447.022,46.112,34.584,101.0
2,2019,2019-10-01,520310,520310,BALIZA,GO,2500132,2,ESF MARLENE NERY RIBEIRO,RUBENS NOVAIS XAVIER,...,0.0,785.85,0.0,38.926,6.2,4.9,2781.115,20.032,20.032,420.0
3,2019,2019-09-01,290195,290195,APUAREMA,BA,7050062,2,USF OTACIANO,HERALDO ROCHA DE JESUS,...,0.0,578.33,0.0,40.928,4.2,3.4,4027.146,13.587,27.174,343.0
4,2019,2019-09-01,130340,130340,PARINTINS,AM,9640347,2,UNIDADE BASICA DE SAUDE DO BAIRRO DA UNIAO,KEDMA TAVARES BUAS,...,0.0,469.07,0.0,21.002,5.0,4.4,3341.453,1.767,1.767,369.0


Selecting only features which will be useful 

In [4]:
df_original.columns

Index(['ano', 'competen', 'codufmun', 'ibge_aloc', 'municipio_destino',
       'uf_destino', 'cnes', 'tp_unid', 'fantasia', 'nomeprof', 'cns_prof',
       'cpf_prof', 'data_nascimento', 'data_formacao', 'participou_pmmb',
       'sexo', 'ch_total', 'alocacao', 'atuacao_previa_no_municipio',
       'anos_atuacao_mesmo_municipio', 'media_vinculos',
       'meses_no_local_alocado', 'churn', 'uf_final', 'idade_em_anos',
       'Prorrogado', 'anos_formacao', 'leitos_hospitalar_municipal_estadual',
       'leitos_hospitalares_cirurgico_clinico_complementar',
       'leitos_instalacao_fisica_urgencia',
       'leitos_instalacao_fisica_ambulatorial',
       'leitos_instalacao_fisical_cirurgico',
       'leitos_instalacao_fisica_obstetrico', 'enfermeiro', 'tec_aux_enf',
       'agente_saude', 'dentista', 'populacao', 'porte', 'pib_percapita',
       'tx_pop_ocupada', 'cobertura_saude', 'vagas_medicina_milhab',
       'orcamento_saude', 'esgotamento_sanitario', 'taxa_homicidio',
       'ideb_ano

In [5]:
df_original.isna().sum()

ano                                                     0
competen                                                0
codufmun                                                0
ibge_aloc                                               0
municipio_destino                                       0
uf_destino                                              0
cnes                                                    0
tp_unid                                                 0
fantasia                                                0
nomeprof                                                0
cns_prof                                                0
cpf_prof                                                0
data_nascimento                                         0
data_formacao                                           0
participou_pmmb                                       349
sexo                                                    0
ch_total                                                0
alocacao      

In [6]:
df_churn = df_original.loc[:,('uf_destino',
                        'sexo',
                         'atuacao_previa_no_municipio',
                         'anos_atuacao_mesmo_municipio', 
                         'media_vinculos', 
                         'idade_em_anos',
                         'anos_formacao',
                         'leitos_instalacao_fisica_urgencia',
                        'agente_saude', 
                        'dentista',
                        'enfermeiro', 
                        'tec_aux_enf', 
                        'porte', 
                        'pib_percapita',
                        'tx_pop_ocupada', 
                        'cobertura_saude', 
                        'vagas_medicina_milhab',
                        'orcamento_saude', 
                        'esgotamento_sanitario', 
                        'taxa_homicidio',
                        'ideb_anosiniciais',
                        'ideb_anosfinais',
                        'investimento_infraestrutura',
                        'equipamentos_esportivos', 
                        'equpamentos_culturais',
                        'churn',
                        'distancia_capital')]

df_churn.head()

Unnamed: 0,uf_destino,sexo,atuacao_previa_no_municipio,anos_atuacao_mesmo_municipio,media_vinculos,idade_em_anos,anos_formacao,leitos_instalacao_fisica_urgencia,agente_saude,dentista,...,orcamento_saude,esgotamento_sanitario,taxa_homicidio,ideb_anosiniciais,ideb_anosfinais,investimento_infraestrutura,equipamentos_esportivos,equpamentos_culturais,churn,distancia_capital
0,RN,Male,Não,0.0,1.0,54.094456,0.084873,7.0,5.166667,1.0,...,617.07,35.43,51.495,4.2,3.0,1076.54,2.878,5.756,migrou,81.0
1,PI,Female,Não,0.0,1.0,44.654346,0.084873,4.0,6.166667,1.0,...,997.2,35.77,11.487,6.0,4.9,3447.022,46.112,34.584,permanece,101.0
2,GO,Male,Não,0.0,1.290323,48.147844,1.333333,8.0,10.333333,0.833333,...,785.85,0.0,38.926,6.2,4.9,2781.115,20.032,20.032,migrou,420.0
3,BA,Male,Não,0.0,1.0,48.533881,0.91718,5.0,8.333333,1.0,...,578.33,0.0,40.928,4.2,3.4,4027.146,13.587,27.174,migrou,343.0
4,AM,Female,Não,0.0,1.222222,44.060233,0.999316,10.0,15.166667,2.0,...,469.07,0.0,21.002,5.0,4.4,3341.453,1.767,1.767,migrou,369.0


In [7]:
#Codificando a variavel churn
df_churn['churn'] = df_churn['churn'].map({'migrou': 1, 'permanece': 0})

df_churn.head()

Unnamed: 0,uf_destino,sexo,atuacao_previa_no_municipio,anos_atuacao_mesmo_municipio,media_vinculos,idade_em_anos,anos_formacao,leitos_instalacao_fisica_urgencia,agente_saude,dentista,...,orcamento_saude,esgotamento_sanitario,taxa_homicidio,ideb_anosiniciais,ideb_anosfinais,investimento_infraestrutura,equipamentos_esportivos,equpamentos_culturais,churn,distancia_capital
0,RN,Male,Não,0.0,1.0,54.094456,0.084873,7.0,5.166667,1.0,...,617.07,35.43,51.495,4.2,3.0,1076.54,2.878,5.756,1,81.0
1,PI,Female,Não,0.0,1.0,44.654346,0.084873,4.0,6.166667,1.0,...,997.2,35.77,11.487,6.0,4.9,3447.022,46.112,34.584,0,101.0
2,GO,Male,Não,0.0,1.290323,48.147844,1.333333,8.0,10.333333,0.833333,...,785.85,0.0,38.926,6.2,4.9,2781.115,20.032,20.032,1,420.0
3,BA,Male,Não,0.0,1.0,48.533881,0.91718,5.0,8.333333,1.0,...,578.33,0.0,40.928,4.2,3.4,4027.146,13.587,27.174,1,343.0
4,AM,Female,Não,0.0,1.222222,44.060233,0.999316,10.0,15.166667,2.0,...,469.07,0.0,21.002,5.0,4.4,3341.453,1.767,1.767,1,369.0


In [14]:

clf1 = setup(data = df_churn, 
             target = 'churn',
             train_size=0.7,
             normalize=True,
             normalize_method='minmax',
             fix_imbalance=True,
             remove_multicollinearity=True,
             remove_outliers = True,
             ordinal_features = {"porte":["PP I", "PP II", "MP", "GP", "Metropole"]},
             fold=10,
            session_id = 5049)

# Acessando os dados de treino e teste completos (todas as variáveis)
transformed_train_df = get_config('X_train')  # Variáveis independentes transformadas
transformed_train_df['churn'] = get_config('y_train')  # Adicionando a variável alvo

# Caso queira visualizar ou salvar os dados completos
transformed_train_df.to_csv('transformed_train_df.csv', index=False)

clf1

Unnamed: 0,Description,Value
0,Session id,5049
1,Target,churn
2,Target type,Binary
3,Original data shape,"(1647, 27)"
4,Transformed data shape,"(1633, 52)"
5,Transformed train set shape,"(1138, 52)"
6,Transformed test set shape,"(495, 52)"
7,Ordinal features,1
8,Numeric features,22
9,Categorical features,4


<pycaret.classification.oop.ClassificationExperiment at 0x2e8adbd5e80>

In [17]:
clf1.train_transformed



Unnamed: 0,uf_destino_MA,uf_destino_TO,uf_destino_ES,uf_destino_CE,uf_destino_BA,uf_destino_MG,uf_destino_PE,uf_destino_PA,uf_destino_SP,uf_destino_RJ,...,orcamento_saude,esgotamento_sanitario,taxa_homicidio,ideb_anosiniciais,ideb_anosfinais,investimento_infraestrutura,equipamentos_esportivos,equpamentos_culturais,distancia_capital,churn
300,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.054657,0.000000,0.311668,0.180328,0.259259,0.100057,0.045502,0.037278,0.274120,1
1267,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.085695,0.000000,0.231003,0.114754,0.379244,0.075531,0.111625,0.137174,0.241312,0
767,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.103658,0.471800,0.256498,0.508197,0.379244,0.184189,0.041013,0.100801,0.084302,0
495,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.081301,0.000000,0.150905,0.524590,0.555556,0.084661,0.024180,0.029715,0.240726,1
266,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.000000,0.000000,0.0,...,0.098017,0.077000,0.164645,0.245902,0.203704,0.153377,0.012633,0.054332,0.196201,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1686,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.369792,0.000000,0.0,...,0.065041,0.043921,0.075958,0.216018,0.403355,0.088280,0.036060,0.091683,0.415758,1
1687,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.000000,0.0,...,0.051778,0.301800,0.232859,0.360656,0.500000,0.107228,0.000965,0.001186,0.011655,1
1688,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.093925,0.276832,0.211781,0.336167,0.361042,0.064671,0.016152,0.019081,0.063371,1
1689,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.343867,0.0,...,0.091206,0.336199,0.242539,0.150903,0.239765,0.097749,0.025184,0.086151,0.093730,1


In [9]:
best_model = compare_models(n_select = 5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.6337,0.6752,0.6311,0.6246,0.627,0.2672,0.2678,0.11
lda,Linear Discriminant Analysis,0.6337,0.679,0.5781,0.6412,0.6069,0.2658,0.268,0.056
ridge,Ridge Classifier,0.6302,0.6804,0.5746,0.6389,0.6038,0.259,0.2614,0.056
lr,Logistic Regression,0.625,0.6806,0.5799,0.6294,0.6023,0.2488,0.2507,0.731
lightgbm,Light Gradient Boosting Machine,0.625,0.6683,0.6134,0.6178,0.6151,0.2495,0.2498,0.178
ada,Ada Boost Classifier,0.6241,0.6594,0.6223,0.6148,0.6176,0.2481,0.2488,0.082
gbc,Gradient Boosting Classifier,0.6163,0.6691,0.6135,0.6072,0.6098,0.2325,0.2328,0.135
et,Extra Trees Classifier,0.6128,0.6506,0.5797,0.6098,0.5941,0.2245,0.2248,0.101
dt,Decision Tree Classifier,0.6102,0.6103,0.6135,0.5993,0.6048,0.2204,0.2219,0.057
svm,SVM - Linear Kernel,0.599,0.6538,0.546,0.6227,0.5589,0.1959,0.2103,0.057


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [10]:

tuned_model_rf = tune_model(best_model[0])
tuned_model_rf

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6207,0.6827,0.5965,0.6182,0.6071,0.2407,0.2408
1,0.6724,0.7138,0.6667,0.6667,0.6667,0.3446,0.3446
2,0.5478,0.6021,0.5789,0.541,0.5593,0.0961,0.0964
3,0.687,0.7319,0.6316,0.7059,0.6667,0.3733,0.3754
4,0.7391,0.7482,0.7679,0.7167,0.7414,0.4789,0.48
5,0.6,0.6221,0.6429,0.5806,0.6102,0.2016,0.2027
6,0.6522,0.676,0.6964,0.629,0.661,0.3058,0.3074
7,0.7043,0.7365,0.7321,0.6833,0.7069,0.4094,0.4104
8,0.6696,0.7424,0.6786,0.6552,0.6667,0.3393,0.3395
9,0.6174,0.7048,0.6607,0.5968,0.6271,0.2363,0.2376


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [114]:
evaluate_model(tuned_model_rf)


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [11]:
avaliacao = predict_model(tuned_model_rf)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.6586,0.6848,0.624,0.6594,0.6412,0.316,0.3165


In [12]:
testar = avaliacao.loc[:,('churn',
             'prediction_label',
             'prediction_score')]

testar

Unnamed: 0,churn,prediction_label,prediction_score
189,0,1,0.6440
2,1,1,0.7078
8,1,1,0.6520
1484,0,0,0.6767
38,1,1,0.6858
...,...,...,...
757,0,0,0.5866
414,0,0,0.5317
608,0,0,0.5836
812,1,0,0.6399


In [12]:
final_model = finalize_model(tuned_model_rf)

IndexError: Length of values (1564) does not match length of index (1647). This usually happens when transformations that drop rows aren't applied on all the columns.

# Fazendo teste de usando regressão

In [102]:
df_original = pd.read_excel("database_mm_0312.xlsx")
df_original.head()

Unnamed: 0,ano,competen,codufmun,ibge_aloc,municipio_destino,uf_destino,cnes,tp_unid,fantasia,nomeprof,...,orcamento_saude,esgotamento_sanitario,taxa_homicidio,mortes_armasdefogo,feminicidio,ideb_anosiniciais,ideb_anosfinais,investimento_infraestrutura,equipamentos_esportivos,equpamentos_culturais
0,2019,2019-06-01,240580,240580,JOAO CAMARA,RN,2474689,2,PSF 2 SAO FRANCISCO,MARLIO EVERTON MACEDO DE PAIVA,...,617.07,35.43,51.495,0.0,28.412,4.2,3.0,1076.54,2.878,5.756
1,2019,2019-06-01,220020,220020,AGUA BRANCA,PI,2368455,2,POSTO DE SAUDE OTACIANA MARIA DO NASCIMENTO,ANA MARIA REGO COSTA,...,997.2,35.77,11.487,0.0,21.671,6.0,4.9,3447.022,46.112,34.584
2,2019,2019-10-01,520310,520310,BALIZA,GO,2500132,2,ESF MARLENE NERY RIBEIRO,RUBENS NOVAIS XAVIER,...,785.85,0.0,38.926,0.0,0.0,6.2,4.9,2781.115,20.032,20.032
3,2019,2019-09-01,290195,290195,APUAREMA,BA,7050062,2,USF OTACIANO,HERALDO ROCHA DE JESUS,...,578.33,0.0,40.928,0.0,26.688,4.2,3.4,4027.146,13.587,27.174
4,2019,2019-09-01,130340,130340,PARINTINS,AM,9640347,2,UNIDADE BASICA DE SAUDE DO BAIRRO DA UNIAO,KEDMA TAVARES BUAS,...,469.07,0.0,21.002,0.0,7.18,5.0,4.4,3341.453,1.767,1.767


In [103]:
df_media = df_original.loc[:,('uf_destino', 
                        'participou_pmmb',
                        'sexo',
                         'atuacao_previa_no_municipio',
                         'anos_atuacao_mesmo_municipio', 
                         'media_vinculos', 
                         'idade_em_anos',
                         'Prorrogado', 
                         'anos_formacao',
                         'leitos_hospitalares_cirurgico_clinico_complementar',
                         'leitos_instalacao_fisica_urgencia',
                        'leitos_instalacao_fisica_obstetrico', 
                        'agente_saude', 
                        'dentista',
                        'enfermeiro', 
                        'tec_aux_enf', 
                        'populacao', 
                        'porte', 
                        'pib_percapita',
                        'tx_pop_ocupada', 
                        'cobertura_saude', 
                        'vagas_medicina_milhab',
                        'orcamento_saude', 
                        'esgotamento_sanitario', 
                        'taxa_homicidio',
                        'ideb_anosiniciais',
                        'ideb_anosfinais', 
                        'investimento_infraestrutura',
                        'equipamentos_esportivos', 
                        'equpamentos_culturais',
                        'meses_no_local_alocado')]

In [104]:
from pycaret.regression import *

In [106]:

reg_mm = setup(data = df_media, 
                target = 'meses_no_local_alocado',
                train_size=0.7,
                normalize=True,
                normalize_method='minmax',
                remove_multicollinearity=True,
                remove_outliers=True,
                fold=10)

reg_mm

Unnamed: 0,Description,Value
0,Session id,8664
1,Target,meses_no_local_alocado
2,Target type,Regression
3,Original data shape,"(1647, 31)"
4,Transformed data shape,"(1589, 35)"
5,Transformed train set shape,"(1094, 35)"
6,Transformed test set shape,"(495, 35)"
7,Numeric features,24
8,Categorical features,6
9,Rows with missing values,33.2%


<pycaret.regression.oop.RegressionExperiment at 0x2ad9bc479d0>

In [107]:
best_model_reg = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,11.3476,212.7985,14.5522,0.5014,0.6086,0.7652,0.246
rf,Random Forest Regressor,11.4977,217.6128,14.7153,0.4904,0.61,0.7724,0.361
lightgbm,Light Gradient Boosting Machine,11.7192,227.8801,15.065,0.4654,0.6222,0.7602,0.256
br,Bayesian Ridge,11.8589,229.3569,15.1155,0.4639,0.6292,0.84,0.081
ridge,Ridge Regression,11.9133,230.9964,15.1679,0.4597,0.6323,0.8328,0.08
omp,Orthogonal Matching Pursuit,11.851,232.2269,15.193,0.4585,0.6309,0.8436,0.082
et,Extra Trees Regressor,11.7852,233.2436,15.2265,0.4542,0.6206,0.7834,0.297
ada,AdaBoost Regressor,13.0004,242.7223,15.5314,0.4348,0.6548,0.9364,0.148
llar,Lasso Least Angle Regression,12.5919,245.7887,15.6289,0.4275,0.6492,0.8986,0.069
lasso,Lasso Regression,12.5919,245.7887,15.6289,0.4275,0.6492,0.8986,0.089


In [109]:
tuned_model_reg = tune_model(best_model_reg)


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [113]:
evaluate_model(tuned_model)


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…