In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import (train_test_split as tts,
                                     cross_validate as cv,
                                     RandomizedSearchCV as rscv)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomForestClassifier,
                              AdaBoostClassifier,
                              GradientBoostingClassifier)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import (OneHotEncoder,
                                  
                                   StandardScaler)

from sklearn import metrics

import pickle 
import warnings
warnings.filterwarnings("ignore")




In [None]:
def calculate_metrics(model, X, y):
        y_pred = model.predict(X)

        model_metrics = {
        'accuracy': metrics.accuracy_score(y, y_pred),
        'balanced_accuracy': metrics.balanced_accuracy_score(y, y_pred),
        'precision': metrics.precision_score(y, y_pred),
        'recall': metrics.recall_score(y, y_pred),
        'f1': metrics.f1_score(y, y_pred),
    }

        try:
            y_pred_proba = model.predict_proba(X)[:,1]
            model_metrics['roc-auc'] = metrics.roc_auc_score(y, y_pred_proba)
            return model_metrics
        except:
            return model_metrics

: 

: 

In [None]:
df = pd.read_csv(r"dataset/sql_trabalho.csv",usecols=["paciente_id","medico_id","sexo_paciente","especialidade"])
df.head()

Unnamed: 0,paciente_id,medico_id,sexo_paciente,especialidade
0,2,100045/SP,Female,Psicologo
1,666,100038/RJ,Female,Oncologia
2,907,100085/SP,Female,Oftalmologia
3,607,100024/SP,Female,Cardiologista
4,275,100092/SP,Male,Ginecologista


: 

: 

In [None]:
#criando dados sinteticos
rand_num=np.random.choice(2,100_000,p=[0.4,0.6])
df_like=pd.DataFrame(rand_num)
df_like.columns=["recomenda"]

df=pd.concat([df,df_like],axis=1)
df.head()

Unnamed: 0,paciente_id,medico_id,sexo_paciente,especialidade,recomenda
0,2,100045/SP,Female,Psicologo,0
1,666,100038/RJ,Female,Oncologia,1
2,907,100085/SP,Female,Oftalmologia,0
3,607,100024/SP,Female,Cardiologista,0
4,275,100092/SP,Male,Ginecologista,1


: 

: 

In [None]:
age=np.random.randint(18,99,100_000)
df_age=pd.DataFrame(age)
df_age.columns=["idade"]

df=pd.concat([df,df_age],axis=1)
df.head()

Unnamed: 0,paciente_id,medico_id,sexo_paciente,especialidade,recomenda,idade
0,2,100045/SP,Female,Psicologo,0,28
1,666,100038/RJ,Female,Oncologia,1,65
2,907,100085/SP,Female,Oftalmologia,0,97
3,607,100024/SP,Female,Cardiologista,0,48
4,275,100092/SP,Male,Ginecologista,1,87


: 

: 

In [None]:
df.sexo_paciente.unique()

array(['Female', 'Male'], dtype=object)

: 

: 

In [None]:
# valor 1 para recomenda e 0 para nao recomenda
df.recomenda.value_counts()

1    60245
0    39755
Name: recomenda, dtype: int64

: 

: 

In [None]:
num_col=df.select_dtypes("number").columns.to_list()
cat_col=[col for col in df if col not in num_col]
cat_col=[
        #'medico_id',
         'sexo_paciente',
        'especialidade']

: 

: 

In [None]:
num_col=[#'paciente_id',
         #'recomenda',
         'idade']
x=cat_col+num_col
y='recomenda'

: 

: 

In [None]:
X_train, X_test,y_train, y_test=tts(df[x], df[y], stratify=df[y], random_state=101,test_size=0.2 )

: 

: 

In [None]:

preprocessor=ColumnTransformer([
    ("scaler",StandardScaler(), num_col),
    ("encoder", OneHotEncoder(handle_unknown="ignore"), cat_col)
])
pipe=Pipeline([
    ("preprocess", preprocessor),
    ("model",RandomForestClassifier(class_weight="balanced", random_state=101))
])

: 

: 

In [None]:
pipe.fit(X_train, y_train)

: 

: 

In [None]:
calculate_metrics(pipe, X_test,y_test)

{'accuracy': 0.5042,
 'balanced_accuracy': 0.49787142905621024,
 'precision': 0.6005278537091149,
 'recall': 0.5287575732425928,
 'f1': 0.5623620796186777,
 'roc-auc': 0.4920820945796532}

: 

: 

In [None]:
model={
    "dummy":DummyClassifier(),
    "rf":RandomForestClassifier(class_weight="balanced", random_state=101),
    "ada":AdaBoostClassifier(random_state=101),
    "lr":LogisticRegression(class_weight="balanced", random_state=101),
    "gbc":GradientBoostingClassifier(random_state=101),
    "neural":MLPClassifier(alpha=1e-05, hidden_layer_sizes=(5, 2), random_state=101,
              solver='lbfgs'),
    

}
results=[]
for model_name , modelo in model.items():
  print(model_name, modelo)
  pipe.set_params(model=modelo).fit(X_train,y_train)
  result=calculate_metrics(pipe, X_test,y_test)
  results.append(result)
  print(results)

dummy DummyClassifier()
[{'accuracy': 0.60245, 'balanced_accuracy': 0.5, 'precision': 0.60245, 'recall': 1.0, 'f1': 0.7519111360728884, 'roc-auc': 0.5}]
rf RandomForestClassifier(class_weight='balanced', random_state=101)
[{'accuracy': 0.60245, 'balanced_accuracy': 0.5, 'precision': 0.60245, 'recall': 1.0, 'f1': 0.7519111360728884, 'roc-auc': 0.5}, {'accuracy': 0.5042, 'balanced_accuracy': 0.49787142905621024, 'precision': 0.6005278537091149, 'recall': 0.5287575732425928, 'f1': 0.5623620796186777, 'roc-auc': 0.4920820945796532}]
ada AdaBoostClassifier(random_state=101)
[{'accuracy': 0.60245, 'balanced_accuracy': 0.5, 'precision': 0.60245, 'recall': 1.0, 'f1': 0.7519111360728884, 'roc-auc': 0.5}, {'accuracy': 0.5042, 'balanced_accuracy': 0.49787142905621024, 'precision': 0.6005278537091149, 'recall': 0.5287575732425928, 'f1': 0.5623620796186777, 'roc-auc': 0.4920820945796532}, {'accuracy': 0.60245, 'balanced_accuracy': 0.5, 'precision': 0.60245, 'recall': 1.0, 'f1': 0.7519111360728884, 

: 

: 

In [None]:
df_results=pd.DataFrame(results).T
df_results.columns=[
    'dummy',
    'rf',
    'ada',
    'lr',
    "gbc",
    "neural"
]

df_results


Unnamed: 0,dummy,rf,ada,lr,gbc,neural
accuracy,0.60245,0.5042,0.60245,0.50315,0.6025,0.60245
balanced_accuracy,0.5,0.497871,0.5,0.498283,0.50032,0.5
precision,0.60245,0.600528,0.60245,0.600879,0.602603,0.60245
recall,1.0,0.528758,1.0,0.522035,0.999004,1.0
f1,0.751911,0.562362,0.751911,0.558689,0.751749,0.751911
roc-auc,0.5,0.492082,0.496118,0.499274,0.495858,0.49987


: 

: 

In [None]:
model={
    "dummy":DummyClassifier(),
    "rf":RandomForestClassifier(class_weight="balanced", random_state=101),
    "ada":AdaBoostClassifier(random_state=101),
    "lr":LogisticRegression(class_weight="balanced", random_state=101),
    "gbc":GradientBoostingClassifier(random_state=101),
    "neural":MLPClassifier(alpha=1e-05, hidden_layer_sizes=(5, 2), random_state=101,
              solver='lbfgs'),
}

cross_results=[]
for model_name, modelo in model.items():
  print(model_name, modelo)
  pipe.set_params(model=modelo)
  cross=cv(pipe, X_train, y_train, cv=5, scoring=["precision", "accuracy", "f1","recall", "roc_auc" ])
  cross_result=pd.DataFrame(cross).mean().to_frame(model_name).T
  cross_results.append(cross_result)


  df_cross=pd.concat(cross_results)

dummy DummyClassifier()
rf RandomForestClassifier(class_weight='balanced', random_state=101)
ada AdaBoostClassifier(random_state=101)
lr LogisticRegression(class_weight='balanced', random_state=101)
gbc GradientBoostingClassifier(random_state=101)
neural MLPClassifier(alpha=1e-05, hidden_layer_sizes=(5, 2), random_state=101,
              solver='lbfgs')


: 

: 

In [None]:
df_cross=pd.concat(cross_results)
df_cross

Unnamed: 0,fit_time,score_time,test_precision,test_accuracy,test_f1,test_recall,test_roc_auc
dummy,0.088601,0.085711,0.60245,0.60245,0.751911,1.0,0.5
rf,44.120809,0.894387,0.60176,0.503413,0.557643,0.519587,0.500463
ada,2.204147,0.368156,0.60245,0.60245,0.751911,1.0,0.501753
lr,0.184976,0.091499,0.606444,0.495675,0.524594,0.46481,0.503408
gbc,5.891591,0.240441,0.602535,0.6024,0.751713,0.999066,0.501564
neural,0.637739,0.107931,0.60245,0.60245,0.751911,1.0,0.499317


: 

: 

In [None]:
pipe_xgb=Pipeline([
    ("preprocess",preprocessor),
    ("model",GradientBoostingClassifier(random_state=101))
])
pipe_xgb


params={
    'model__learning_rate' : [0.05,0.1, 0.3],
    'model__max_depth' : [ 2, 15],
  }

: 

: 

In [None]:
final_random=rscv(pipe_xgb,param_distributions=params, cv=3, scoring="precision", verbose=True)
final_random.fit(X_train,y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


: 

: 

In [None]:
df_random_final=pd.DataFrame(final_random.cv_results_).set_index("rank_test_score").sort_index()
df_random_final.loc[:,~df_random_final.columns.str.contains("split|time")].head(1)

Unnamed: 0_level_0,param_model__max_depth,param_model__learning_rate,params,mean_test_score,std_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2,0.05,"{'model__max_depth': 2, 'model__learning_rate'...",0.60245,1.5e-05


: 

: 

In [None]:
pipe_xgbfinal = Pipeline([
    ("preprocessor", preprocessor),
    ("gbc", GradientBoostingClassifier(random_state=101, 
                          max_depth=2, learning_rate=0.3))
]).fit(X_train, y_train)

: 

: 

In [None]:
calculate_metrics(pipe_xgbfinal, X_test,y_test)

{'accuracy': 0.60245,
 'balanced_accuracy': 0.5001283277119414,
 'precision': 0.6025115069041425,
 'recall': 0.9995020333637646,
 'f1': 0.7518182101944626,
 'roc-auc': 0.4956392063977972}

: 

: 

In [None]:
with open("pipe_xgbfinal.pkl","wb") as file_model: # pode ser joblib tb
    pickle.dump(pipe_xgbfinal,file_model)

: 

: 