In [68]:
import pandas as pd
import numpy as np
from sklearn.model_selection import (train_test_split as tts,
                                     cross_validate as cv,
                                     RandomizedSearchCV as rscv)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomForestClassifier,
                              AdaBoostClassifier,
                              GradientBoostingClassifier)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import (OneHotEncoder,
                                   OrdinalEncoder,
                                   StandardScaler)

from sklearn import metrics

import pickle 
import warnings
warnings.filterwarnings("ignore")




In [None]:
def calculate_metrics(model, X, y):
        y_pred = model.predict(X)

        model_metrics = {
        'accuracy': metrics.accuracy_score(y, y_pred),
        'balanced_accuracy': metrics.balanced_accuracy_score(y, y_pred),
        'precision': metrics.precision_score(y, y_pred),
        'recall': metrics.recall_score(y, y_pred),
        'f1': metrics.f1_score(y, y_pred),
    }

        try:
            y_pred_proba = model.predict_proba(X)[:,1]
            model_metrics['roc-auc'] = metrics.roc_auc_score(y, y_pred_proba)
            return model_metrics
        except:
            return model_metrics

: 

In [None]:
df = pd.read_csv(r"dataset/sql_trabalho.csv",usecols=["paciente_id","medico_id","sexo_paciente","especialidade"])
df.head()

Unnamed: 0,paciente_id,medico_id,sexo_paciente,especialidade
0,2,100045/SP,Female,Psicologo
1,666,100038/RJ,Female,Oncologia
2,907,100085/SP,Female,Oftalmologia
3,607,100024/SP,Female,Cardiologista
4,275,100092/SP,Male,Ginecologista


: 

In [None]:
#criando dados sinteticos
rand_num=np.random.choice(2,100_000,p=[0.4,0.6])
df_like=pd.DataFrame(rand_num)
df_like.columns=["recomenda"]

df=pd.concat([df,df_like],axis=1)
df.head()

Unnamed: 0,paciente_id,medico_id,sexo_paciente,especialidade,recomenda
0,2,100045/SP,Female,Psicologo,1
1,666,100038/RJ,Female,Oncologia,1
2,907,100085/SP,Female,Oftalmologia,1
3,607,100024/SP,Female,Cardiologista,1
4,275,100092/SP,Male,Ginecologista,0


: 

In [None]:
age=np.random.randint(18,99,100_000)
df_age=pd.DataFrame(age)
df_age.columns=["idade"]

df=pd.concat([df,df_age],axis=1)
df.head()

Unnamed: 0,paciente_id,medico_id,sexo_paciente,especialidade,recomenda,idade
0,2,100045/SP,Female,Psicologo,1,77
1,666,100038/RJ,Female,Oncologia,1,28
2,907,100085/SP,Female,Oftalmologia,1,74
3,607,100024/SP,Female,Cardiologista,1,98
4,275,100092/SP,Male,Ginecologista,0,67


: 

In [None]:
df.sexo_paciente.unique()

array(['Female', 'Male'], dtype=object)

: 

In [None]:
# valor 1 para recomenda e 0 para nao recomenda
df.recomenda.value_counts()

recomenda
1    59924
0    40076
Name: count, dtype: int64

: 

In [None]:
num_col=df.select_dtypes("number").columns.to_list()
cat_col=[col for col in df if col not in num_col]
cat_col=[
        #'medico_id',
         'sexo_paciente',
        'especialidade']

: 

In [None]:
num_col=[#'paciente_id',
         #'recomenda',
         'idade']
x=cat_col+num_col
y='recomenda'

: 

In [None]:
X_train, X_test,y_train, y_test=tts(df[x], df[y], stratify=df[y], random_state=101,test_size=0.2 )

: 

In [None]:

preprocessor=ColumnTransformer([
    ("scaler",StandardScaler(), num_col),
    ("encoder", OneHotEncoder(handle_unknown="ignore"), cat_col)
])
pipe=Pipeline([
    ("preprocess", preprocessor),
    ("model",RandomForestClassifier(class_weight="balanced", random_state=101))
])

: 

In [None]:
pipe.fit(X_train, y_train)

: 

In [None]:
calculate_metrics(pipe, X_test,y_test)

{'accuracy': 0.5041,
 'balanced_accuracy': 0.500729344754973,
 'precision': 0.6000773544768904,
 'recall': 0.5176843510176844,
 'f1': 0.5558441558441558,
 'roc-auc': 0.4998898726068388}

: 

In [None]:
model={
    "dummy":DummyClassifier(),
    "rf":RandomForestClassifier(class_weight="balanced", random_state=101),
    "ada":AdaBoostClassifier(random_state=101),
    "lr":LogisticRegression(class_weight="balanced", random_state=101),
    "gbc":GradientBoostingClassifier(random_state=101),
    "neural":MLPClassifier(alpha=1e-05, hidden_layer_sizes=(5, 2), random_state=101,
              solver='lbfgs'),
    "xgb":XGBClassifier(random_state=101)

}
results=[]
for model_name , modelo in model.items():
  print(model_name, modelo)
  pipe.set_params(model=modelo).fit(X_train,y_train)
  result=calculate_metrics(pipe, X_test,y_test)
  results.append(result)
  print(results)

dummy DummyClassifier()
[{'accuracy': 0.5994, 'balanced_accuracy': 0.5, 'precision': 0.5994, 'recall': 1.0, 'f1': 0.7495310741528073, 'roc-auc': 0.5}]
rf RandomForestClassifier(class_weight='balanced', random_state=101)
[{'accuracy': 0.5994, 'balanced_accuracy': 0.5, 'precision': 0.5994, 'recall': 1.0, 'f1': 0.7495310741528073, 'roc-auc': 0.5}, {'accuracy': 0.5041, 'balanced_accuracy': 0.500729344754973, 'precision': 0.6000773544768904, 'recall': 0.5176843510176844, 'f1': 0.5558441558441558, 'roc-auc': 0.4998898726068388}]
ada AdaBoostClassifier(random_state=101)
[{'accuracy': 0.5994, 'balanced_accuracy': 0.5, 'precision': 0.5994, 'recall': 1.0, 'f1': 0.7495310741528073, 'roc-auc': 0.5}, {'accuracy': 0.5041, 'balanced_accuracy': 0.500729344754973, 'precision': 0.6000773544768904, 'recall': 0.5176843510176844, 'f1': 0.5558441558441558, 'roc-auc': 0.4998898726068388}, {'accuracy': 0.5994, 'balanced_accuracy': 0.5, 'precision': 0.5994, 'recall': 1.0, 'f1': 0.7495310741528073, 'roc-auc': 0

: 

In [None]:
df_results=pd.DataFrame(results).T
df_results.columns=[
    'dummy',
    'rf',
    'ada',
    'lr',
    "gbc",
    "neural",
    "xgboost"
]

df_results


Unnamed: 0,dummy,rf,ada,lr,gbc,neural,xgboost
accuracy,0.5994,0.5041,0.5994,0.4869,0.5994,0.5994,0.59355
balanced_accuracy,0.5,0.500729,0.5,0.499214,0.5,0.5,0.500502
precision,0.5994,0.600077,0.5994,0.598538,0.5994,0.5994,0.599649
recall,1.0,0.517684,1.0,0.437271,1.0,1.0,0.968552
f1,0.749531,0.555844,0.749531,0.50535,0.749531,0.749531,0.74071
roc-auc,0.5,0.49989,0.495768,0.497539,0.500139,0.500209,0.49817


: 

In [None]:
model={
    "dummy":DummyClassifier(),
    "rf":RandomForestClassifier(class_weight="balanced", random_state=101),
    "ada":AdaBoostClassifier(random_state=101),
    "lr":LogisticRegression(class_weight="balanced", random_state=101),
    "gbc":GradientBoostingClassifier(random_state=101),
    "neural":MLPClassifier(alpha=1e-05, hidden_layer_sizes=(5, 2), random_state=101,
              solver='lbfgs'),
    "xgb":XGBClassifier(random_state=101)

}

cross_results=[]
for model_name, modelo in model.items():
  print(model_name, modelo)
  pipe.set_params(model=modelo)
  cross=cv(pipe, X_train, y_train, cv=5, scoring=["precision", "accuracy", "f1","recall", "roc_auc" ])
  cross_result=pd.DataFrame(cross).mean().to_frame(model_name).T
  cross_results.append(cross_result)


  df_cross=pd.concat(cross_results)

dummy DummyClassifier()


rf RandomForestClassifier(class_weight='balanced', random_state=101)
ada AdaBoostClassifier(random_state=101)
lr LogisticRegression(class_weight='balanced', random_state=101)
gbc GradientBoostingClassifier(random_state=101)
neural MLPClassifier(alpha=1e-05, hidden_layer_sizes=(5, 2), random_state=101,
              solver='lbfgs')
xgb XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None,

: 

In [None]:
df_cross=pd.concat(cross_results)
df_cross

Unnamed: 0,fit_time,score_time,test_precision,test_accuracy,test_f1,test_recall,test_roc_auc
dummy,0.119951,0.131765,0.599375,0.599375,0.749512,1.0,0.5
rf,32.762946,0.631021,0.599695,0.502975,0.553271,0.513556,0.498126
ada,1.584287,0.247888,0.599375,0.599375,0.749512,1.0,0.501387
lr,0.13232,0.076277,0.601099,0.48755,0.500929,0.432346,0.499922
gbc,4.1723,0.161201,0.59936,0.599225,0.749324,0.999374,0.502343
neural,1.322563,0.070534,0.599375,0.599375,0.749512,1.0,0.502348
xgb,0.901537,0.195572,0.59901,0.59025,0.736815,0.956997,0.498938


: 

In [None]:
pipe_xgb=Pipeline([
    ("preprocess",preprocessor),
    ("model",XGBClassifier(random_state=101, is_unbalanced=True))
])
pipe_xgb


params={
    'model__is_unbalaced':[True],
    'model__learning_rate' : [0.05,0.1, 0.3],
    'model__max_depth' : [ 2, 15],
  }

: 

In [None]:
final_random=rscv(pipe_xgb,param_distributions=params, cv=5, scoring="precision", verbose=True)
final_random.fit(X_train,y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


: 

In [None]:
df_random_final=pd.DataFrame(final_random.cv_results_).set_index("rank_test_score").sort_index()
df_random_final.loc[:,~df_random_final.columns.str.contains("split|time")].head(1)

Unnamed: 0_level_0,param_model__max_depth,param_model__learning_rate,param_model__is_unbalaced,params,mean_test_score,std_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,2,0.1,True,"{'model__max_depth': 2, 'model__learning_rate'...",0.599377,5e-06


: 

In [None]:
pipe_xgbfinal = Pipeline([
    ("preprocessor", preprocessor),
    ("xgb", XGBClassifier(random_state=101, is_unbalanced=True,
                          max_depth=15, learning_rate=0.05))
]).fit(X_train, y_train)

: 

In [None]:
calculate_metrics(pipe_xgbfinal, X_test,y_test)

{'accuracy': 0.58935,
 'balanced_accuracy': 0.49968884261195795,
 'precision': 0.5992428624007572,
 'recall': 0.9507007007007007,
 'f1': 0.735124326764924,
 'roc-auc': 0.4994656674064646}

: 

In [None]:
with open("pipe_xgbfinal.pkl","wb") as file_model: # pode ser joblib tb
    pickle.dump(pipe_xgbfinal,file_model)

: 