In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import (train_test_split as tts,
                                     cross_validate as cv,
                                     RandomizedSearchCV as rscv)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomForestClassifier,
                              AdaBoostClassifier,
                              GradientBoostingClassifier)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import (OneHotEncoder,
                                   OrdinalEncoder,
                                   StandardScaler)

from sklearn import metrics





In [5]:
def calculate_metrics(model, X, y):
        y_pred = model.predict(X)

        model_metrics = {
        'accuracy': metrics.accuracy_score(y, y_pred),
        'balanced_accuracy': metrics.balanced_accuracy_score(y, y_pred),
        'precision': metrics.precision_score(y, y_pred),
        'recall': metrics.recall_score(y, y_pred),
        'f1': metrics.f1_score(y, y_pred),
    }

        try:
            y_pred_proba = model.predict_proba(X)[:,1]
            model_metrics['roc-auc'] = metrics.roc_auc_score(y, y_pred_proba)
            return model_metrics
        except:
            return model_metrics

In [6]:
df = pd.read_csv(r"dataset/sql_trabalho.csv",usecols=["paciente_id","medico_id","sexo_paciente","especialidade"])
df.head()

Unnamed: 0,paciente_id,medico_id,sexo_paciente,especialidade
0,2,100045/SP,Female,Psicologo
1,666,100038/RJ,Female,Oncologia
2,907,100085/SP,Female,Oftalmologia
3,607,100024/SP,Female,Cardiologista
4,275,100092/SP,Male,Ginecologista


In [7]:
#criando dados sinteticos
rand_num=np.random.choice(2,100_000,p=[0.4,0.6])
df_like=pd.DataFrame(rand_num)
df_like.columns=["recomenda"]

df=pd.concat([df,df_like],axis=1)
df.head()

Unnamed: 0,paciente_id,medico_id,sexo_paciente,especialidade,recomenda
0,2,100045/SP,Female,Psicologo,0
1,666,100038/RJ,Female,Oncologia,0
2,907,100085/SP,Female,Oftalmologia,1
3,607,100024/SP,Female,Cardiologista,1
4,275,100092/SP,Male,Ginecologista,0


In [8]:
age=np.random.randint(18,99,100_000)
df_age=pd.DataFrame(age)
df_age.columns=["idade"]

df=pd.concat([df,df_age],axis=1)
df.head()

Unnamed: 0,paciente_id,medico_id,sexo_paciente,especialidade,recomenda,idade
0,2,100045/SP,Female,Psicologo,0,86
1,666,100038/RJ,Female,Oncologia,0,52
2,907,100085/SP,Female,Oftalmologia,1,45
3,607,100024/SP,Female,Cardiologista,1,19
4,275,100092/SP,Male,Ginecologista,0,82


In [9]:
# valor 1 para recomenda e 0 para nao recomenda
df.recomenda.value_counts()

1    60076
0    39924
Name: recomenda, dtype: int64

In [10]:
num_col=df.select_dtypes("number").columns.to_list()
cat_col=[col for col in df if col not in num_col]
cat_col=[
        #'medico_id',
         'sexo_paciente',
        'especialidade']

In [11]:
num_col=[#'paciente_id',
         #'recomenda',
         'idade']
x=cat_col+num_col
y='recomenda'

In [12]:
X_train, X_test,y_train, y_test=tts(df[x], df[y], stratify=df[y], random_state=101,test_size=0.2 )

In [13]:

preprocessor=ColumnTransformer([
    ("scaler",StandardScaler(), num_col),
    ("encoder", OneHotEncoder(handle_unknown="ignore"), cat_col)
])
pipe=Pipeline([
    ("preprocess", preprocessor),
    ("model",RandomForestClassifier(class_weight="balanced", random_state=101))
])

In [14]:
pipe.fit(X_train, y_train)

In [15]:
calculate_metrics(pipe, X_test,y_test)

{'accuracy': 0.50125,
 'balanced_accuracy': 0.4970603433247577,
 'precision': 0.5980392156862745,
 'recall': 0.517852684144819,
 'f1': 0.5550649003077746,
 'roc-auc': 0.4955760163081475}

In [16]:
model={
    "dummy":DummyClassifier(),
    "rf":RandomForestClassifier(class_weight="balanced", random_state=101),
    "ada":AdaBoostClassifier(random_state=101),
    "lr":LogisticRegression(class_weight="balanced", random_state=101),
    "gbc":GradientBoostingClassifier(random_state=101),
    "neural":MLPClassifier(alpha=1e-05, hidden_layer_sizes=(5, 2), random_state=101,
              solver='lbfgs'),
    "xgb":XGBClassifier(random_state=101)

}
results=[]
for model_name , modelo in model.items():
  print(model_name, modelo)
  pipe.set_params(model=modelo).fit(X_train,y_train)
  result=calculate_metrics(pipe, X_test,y_test)
  results.append(result)
  print(results)

dummy DummyClassifier()
[{'accuracy': 0.60075, 'balanced_accuracy': 0.5, 'precision': 0.60075, 'recall': 1.0, 'f1': 0.7505856629704826, 'roc-auc': 0.5}]
rf RandomForestClassifier(class_weight='balanced', random_state=101)
[{'accuracy': 0.60075, 'balanced_accuracy': 0.5, 'precision': 0.60075, 'recall': 1.0, 'f1': 0.7505856629704826, 'roc-auc': 0.5}, {'accuracy': 0.50125, 'balanced_accuracy': 0.4970603433247577, 'precision': 0.5980392156862745, 'recall': 0.517852684144819, 'f1': 0.5550649003077746, 'roc-auc': 0.4955760163081475}]
ada AdaBoostClassifier(random_state=101)
[{'accuracy': 0.60075, 'balanced_accuracy': 0.5, 'precision': 0.60075, 'recall': 1.0, 'f1': 0.7505856629704826, 'roc-auc': 0.5}, {'accuracy': 0.50125, 'balanced_accuracy': 0.4970603433247577, 'precision': 0.5980392156862745, 'recall': 0.517852684144819, 'f1': 0.5550649003077746, 'roc-auc': 0.4955760163081475}, {'accuracy': 0.60075, 'balanced_accuracy': 0.5, 'precision': 0.60075, 'recall': 1.0, 'f1': 0.7505856629704826, 'r

In [17]:
df_results=pd.DataFrame(results).T
df_results.columns=[
    'dummy',
    'rf',
    'ada',
    'lr',
    "gbc",
    "neural",
    "xgboost"
]

df_results


Unnamed: 0,dummy,rf,ada,lr,gbc,neural,xgboost
accuracy,0.60075,0.50125,0.60075,0.50435,0.60075,0.60075,0.5927
balanced_accuracy,0.5,0.49706,0.5,0.501069,0.500105,0.5,0.497795
precision,0.60075,0.598039,0.60075,0.601742,0.6008,0.60075,0.59966
recall,1.0,0.517853,1.0,0.517353,0.999584,1.0,0.968789
f1,0.750586,0.555065,0.750586,0.556366,0.750508,0.750586,0.740788
roc-auc,0.5,0.495576,0.49589,0.501479,0.49456,0.499606,0.4982


In [18]:
model={
    "dummy":DummyClassifier(),
    "rf":RandomForestClassifier(class_weight="balanced", random_state=101),
    "ada":AdaBoostClassifier(random_state=101),
    "lr":LogisticRegression(class_weight="balanced", random_state=101),
    "gbc":GradientBoostingClassifier(random_state=101),
    "neural":MLPClassifier(alpha=1e-05, hidden_layer_sizes=(5, 2), random_state=101,
              solver='lbfgs'),
    "xgb":XGBClassifier(random_state=101)

}

cross_results=[]
for model_name, modelo in model.items():
  print(model_name, modelo)
  pipe.set_params(model=modelo)
  cross=cv(pipe, X_train, y_train, n_jobs=-1, cv=5, scoring=["precision", "accuracy", "f1","recall", "roc_auc" ])
  cross_result=pd.DataFrame(cross).mean().to_frame(model_name).T
  cross_results.append(cross_result)


  df_cross=pd.concat(cross_results)

dummy DummyClassifier()


rf RandomForestClassifier(class_weight='balanced', random_state=101)
ada AdaBoostClassifier(random_state=101)
lr LogisticRegression(class_weight='balanced', random_state=101)
gbc GradientBoostingClassifier(random_state=101)
neural MLPClassifier(alpha=1e-05, hidden_layer_sizes=(5, 2), random_state=101,
              solver='lbfgs')
xgb XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None,

In [19]:
df_cross=pd.concat(cross_results)
df_cross

Unnamed: 0,fit_time,score_time,test_precision,test_accuracy,test_f1,test_recall,test_roc_auc
dummy,0.170965,0.134675,0.600762,0.600762,0.750595,1.0,0.5
rf,61.284516,1.472302,0.599816,0.502962,0.556348,0.518757,0.498871
ada,4.425528,0.710525,0.600762,0.600762,0.750595,1.0,0.500929
lr,0.361638,0.175963,0.599399,0.500613,0.550359,0.508853,0.50114
gbc,13.391862,0.388331,0.600829,0.600625,0.750301,0.998772,0.500432
neural,1.384054,0.175329,0.600762,0.600762,0.750595,1.0,0.501411
xgb,2.097452,0.809344,0.600527,0.591875,0.738197,0.957762,0.500081


In [21]:
pipe_xgb=Pipeline([
    ("preprocess",preprocessor),
    ("model",XGBClassifier(random_state=101, is_unbalanced=True))
])
pipe_xgb


params={
    'model__is_unbalaced':[True],
    'model__learning_rate' : [0.05,0.1, 0.3],
    'model__max_depth' : [ 2, 15],
  }

In [23]:
final_random=rscv(pipe_xgb,param_distributions=params, cv=5, scoring="precision", n_jobs=-1, verbose=True)
final_random.fit(X_train,y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


Parameters: { "is_unbalaced", "is_unbalanced" } are not used.

Parameters: { "is_unbalaced", "is_unbalanced" } are not used.

Parameters: { "is_unbalaced", "is_unbalanced" } are not used.

Parameters: { "is_unbalaced", "is_unbalanced" } are not used.

Parameters: { "is_unbalaced", "is_unbalanced" } are not used.

Parameters: { "is_unbalaced", "is_unbalanced" } are not used.

Parameters: { "is_unbalaced", "is_unbalanced" } are not used.

Parameters: { "is_unbalaced", "is_unbalanced" } are not used.

Parameters: { "is_unbalaced", "is_unbalanced" } are not used.

Parameters: { "is_unbalaced", "is_unbalanced" } are not used.

Parameters: { "is_unbalaced", "is_unbalanced" } are not used.

Parameters: { "is_unbalaced", "is_unbalanced" } are not used.

Parameters: { "is_unbalaced", "is_unbalanced" } are not used.

Parameters: { "is_unbalaced", "is_unbalanced" } are not used.

Parameters: { "is_unbalaced", "is_unbalanced" } are not used.

Parameters: { "is_unbalaced", "is_unbalanced" } are not