In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import (train_test_split as tts,
                                     cross_validate as cv,
                                     RandomizedSearchCV as rscv)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomForestClassifier,
                              AdaBoostClassifier,
                              GradientBoostingClassifier)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import (OneHotEncoder,
                                  
                                   StandardScaler)

from sklearn import metrics

import pickle 
import warnings
warnings.filterwarnings("ignore")




In [28]:
def calculate_metrics(model, X, y):
        y_pred = model.predict(X)

        model_metrics = {
        'accuracy': metrics.accuracy_score(y, y_pred),
        'balanced_accuracy': metrics.balanced_accuracy_score(y, y_pred),
        'precision': metrics.precision_score(y, y_pred),
        'recall': metrics.recall_score(y, y_pred),
        'f1': metrics.f1_score(y, y_pred),
    }

        try:
            y_pred_proba = model.predict_proba(X)[:,1]
            model_metrics['roc-auc'] = metrics.roc_auc_score(y, y_pred_proba)
            return model_metrics
        except:
            return model_metrics

In [29]:
df = pd.read_csv(r"dataset/sql_trabalho.csv",usecols=["paciente_id","medico_id","sexo_paciente","especialidade"])
df.head()

Unnamed: 0,paciente_id,medico_id,sexo_paciente,especialidade
0,2,100045/SP,Female,Psicologo
1,666,100038/RJ,Female,Oncologia
2,907,100085/SP,Female,Oftalmologia
3,607,100024/SP,Female,Cardiologista
4,275,100092/SP,Male,Ginecologista


In [30]:
#criando dados sinteticos
rand_num=np.random.choice(2,100_000,p=[0.4,0.6])
df_like=pd.DataFrame(rand_num)
df_like.columns=["recomenda"]

df=pd.concat([df,df_like],axis=1)
df.head()

Unnamed: 0,paciente_id,medico_id,sexo_paciente,especialidade,recomenda
0,2,100045/SP,Female,Psicologo,0
1,666,100038/RJ,Female,Oncologia,0
2,907,100085/SP,Female,Oftalmologia,1
3,607,100024/SP,Female,Cardiologista,1
4,275,100092/SP,Male,Ginecologista,1


In [31]:
age=np.random.randint(18,99,100_000)
df_age=pd.DataFrame(age)
df_age.columns=["idade"]

df=pd.concat([df,df_age],axis=1)
df.head()

Unnamed: 0,paciente_id,medico_id,sexo_paciente,especialidade,recomenda,idade
0,2,100045/SP,Female,Psicologo,0,19
1,666,100038/RJ,Female,Oncologia,0,83
2,907,100085/SP,Female,Oftalmologia,1,93
3,607,100024/SP,Female,Cardiologista,1,54
4,275,100092/SP,Male,Ginecologista,1,40


In [32]:
df.sexo_paciente.unique()

array(['Female', 'Male'], dtype=object)

In [33]:
# valor 1 para recomenda e 0 para nao recomenda
df.recomenda.value_counts()

1    59798
0    40202
Name: recomenda, dtype: int64

In [34]:
num_col=df.select_dtypes("number").columns.to_list()
cat_col=[col for col in df if col not in num_col]
cat_col=[
        #'medico_id',
         'sexo_paciente',
        'especialidade']

In [35]:
num_col=[#'paciente_id',
         #'recomenda',
         'idade']
x=cat_col+num_col
y='recomenda'

In [36]:
X_train, X_test,y_train, y_test=tts(df[x], df[y], stratify=df[y], random_state=101,test_size=0.2 )

In [37]:

preprocessor=ColumnTransformer([
    ("scaler",StandardScaler(), num_col),
    ("encoder", OneHotEncoder(handle_unknown="ignore"), cat_col)
])
pipe=Pipeline([
    ("preprocess", preprocessor),
    ("model",RandomForestClassifier(class_weight="balanced", random_state=101))
])

In [38]:
pipe.fit(X_train, y_train)

In [39]:
calculate_metrics(pipe, X_test,y_test)

{'accuracy': 0.5074,
 'balanced_accuracy': 0.5040470723306545,
 'precision': 0.6017570959644719,
 'recall': 0.5211538461538462,
 'f1': 0.558562595214625,
 'roc-auc': 0.505010045924225}

In [40]:
model={
    "dummy":DummyClassifier(),
    "rf":RandomForestClassifier(class_weight="balanced", random_state=101),
    "ada":AdaBoostClassifier(random_state=101),
    "lr":LogisticRegression(class_weight="balanced", random_state=101),
    "gbc":GradientBoostingClassifier(random_state=101),
    "neural":MLPClassifier(alpha=1e-05, hidden_layer_sizes=(5, 2), random_state=101,
              solver='lbfgs'),
    

}
results=[]
for model_name , modelo in model.items():
  print(model_name, modelo)
  pipe.set_params(model=modelo).fit(X_train,y_train)
  result=calculate_metrics(pipe, X_test,y_test)
  results.append(result)
  print(results)

dummy DummyClassifier()
[{'accuracy': 0.598, 'balanced_accuracy': 0.5, 'precision': 0.598, 'recall': 1.0, 'f1': 0.7484355444305382, 'roc-auc': 0.5}]
rf RandomForestClassifier(class_weight='balanced', random_state=101)
[{'accuracy': 0.598, 'balanced_accuracy': 0.5, 'precision': 0.598, 'recall': 1.0, 'f1': 0.7484355444305382, 'roc-auc': 0.5}, {'accuracy': 0.5074, 'balanced_accuracy': 0.5040470723306545, 'precision': 0.6017570959644719, 'recall': 0.5211538461538462, 'f1': 0.558562595214625, 'roc-auc': 0.505010045924225}]
ada AdaBoostClassifier(random_state=101)
[{'accuracy': 0.598, 'balanced_accuracy': 0.5, 'precision': 0.598, 'recall': 1.0, 'f1': 0.7484355444305382, 'roc-auc': 0.5}, {'accuracy': 0.5074, 'balanced_accuracy': 0.5040470723306545, 'precision': 0.6017570959644719, 'recall': 0.5211538461538462, 'f1': 0.558562595214625, 'roc-auc': 0.505010045924225}, {'accuracy': 0.598, 'balanced_accuracy': 0.5, 'precision': 0.598, 'recall': 1.0, 'f1': 0.7484355444305382, 'roc-auc': 0.491173688

In [41]:
df_results=pd.DataFrame(results).T
df_results.columns=[
    'dummy',
    'rf',
    'ada',
    'lr',
    "gbc",
    "neural"
]

df_results


Unnamed: 0,dummy,rf,ada,lr,gbc,neural
accuracy,0.598,0.5074,0.598,0.4993,0.598,0.598
balanced_accuracy,0.5,0.504047,0.5,0.49489,0.5,0.5
precision,0.598,0.601757,0.598,0.593289,0.598,0.598
recall,1.0,0.521154,1.0,0.517391,1.0,1.0
f1,0.748436,0.558563,0.748436,0.552747,0.748436,0.748436
roc-auc,0.5,0.50501,0.491174,0.497624,0.50046,0.49433


In [42]:
model={
    "dummy":DummyClassifier(),
    "rf":RandomForestClassifier(class_weight="balanced", random_state=101),
    "ada":AdaBoostClassifier(random_state=101),
    "lr":LogisticRegression(class_weight="balanced", random_state=101),
    "gbc":GradientBoostingClassifier(random_state=101),
    "neural":MLPClassifier(alpha=1e-05, hidden_layer_sizes=(5, 2), random_state=101,
              solver='lbfgs'),
}

cross_results=[]
for model_name, modelo in model.items():
  print(model_name, modelo)
  pipe.set_params(model=modelo)
  cross=cv(pipe, X_train, y_train, cv=5, scoring=["precision", "accuracy", "f1","recall", "roc_auc" ])
  cross_result=pd.DataFrame(cross).mean().to_frame(model_name).T
  cross_results.append(cross_result)


  df_cross=pd.concat(cross_results)

dummy DummyClassifier()
rf RandomForestClassifier(class_weight='balanced', random_state=101)
ada AdaBoostClassifier(random_state=101)
lr LogisticRegression(class_weight='balanced', random_state=101)
gbc GradientBoostingClassifier(random_state=101)
neural MLPClassifier(alpha=1e-05, hidden_layer_sizes=(5, 2), random_state=101,
              solver='lbfgs')


In [43]:
df_cross=pd.concat(cross_results)
df_cross

Unnamed: 0,fit_time,score_time,test_precision,test_accuracy,test_f1,test_recall,test_roc_auc
dummy,0.093841,0.086949,0.597975,0.597975,0.748416,1.0,0.5
rf,36.265019,0.855461,0.598764,0.504875,0.557451,0.521489,0.497328
ada,2.229942,0.751495,0.597975,0.597975,0.748416,1.0,0.499626
lr,0.329844,0.12024,0.599714,0.506738,0.560136,0.526778,0.500384
gbc,6.147043,0.203199,0.597922,0.5978,0.748251,0.999561,0.494911
neural,1.166847,0.075711,0.597975,0.597975,0.748416,1.0,0.499581


In [44]:
pipe_xgb=Pipeline([
    ("preprocess",preprocessor),
    ("model",GradientBoostingClassifier(random_state=101))
])
pipe_xgb


params={
    'model__learning_rate' : [0.05,0.1, 0.3],
    'model__max_depth' : [ 2, 15],
  }

In [45]:
final_random=rscv(pipe_xgb,param_distributions=params, cv=3, scoring="precision", verbose=True)
final_random.fit(X_train,y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


In [46]:
df_random_final=pd.DataFrame(final_random.cv_results_).set_index("rank_test_score").sort_index()
df_random_final.loc[:,~df_random_final.columns.str.contains("split|time")].head(1)

Unnamed: 0_level_0,param_model__max_depth,param_model__learning_rate,params,mean_test_score,std_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2,0.05,"{'model__max_depth': 2, 'model__learning_rate'...",0.597975,1.1e-05


In [47]:
pipe_xgbfinal = Pipeline([
    ("preprocessor", preprocessor),
    ("gbc", GradientBoostingClassifier(random_state=101, 
                          max_depth=2, learning_rate=0.05))
]).fit(X_train, y_train)

In [48]:
calculate_metrics(pipe_xgbfinal, X_test,y_test)

{'accuracy': 0.598,
 'balanced_accuracy': 0.5,
 'precision': 0.598,
 'recall': 1.0,
 'f1': 0.7484355444305382,
 'roc-auc': 0.4952929749247076}

In [49]:
with open("pipe_xgbfinal.pkl","wb") as file_model: # pode ser joblib tb
    pickle.dump(pipe_xgbfinal,file_model)