In [None]:
%%capture
!pip install catboost
!pip install xgboost

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import (train_test_split as tts,
                                     cross_validate as cv)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomForestClassifier,
                              AdaBoostClassifier,
                              GradientBoostingClassifier)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import (OneHotEncoder,
                                   OrdinalEncoder,
                                   StandardScaler)

from sklearn import metrics





In [None]:
def calculate_metrics(model, X, y):
    """Score a fitted classifier on ``X`` against the true labels ``y``.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict`` and, optionally,
        ``predict_proba``.
    X : array-like
        Features passed to ``model.predict`` / ``model.predict_proba``.
    y : array-like
        True labels aligned with ``X``.

    Returns
    -------
    dict
        Always contains ``accuracy``, ``balanced_accuracy``, ``precision``,
        ``recall`` and ``f1``. ``roc-auc`` is added only when it can be
        computed from the probability column at index 1.
    """
    y_pred = model.predict(X)

    model_metrics = {
        'accuracy': metrics.accuracy_score(y, y_pred),
        'balanced_accuracy': metrics.balanced_accuracy_score(y, y_pred),
        'precision': metrics.precision_score(y, y_pred),
        'recall': metrics.recall_score(y, y_pred),
        'f1': metrics.f1_score(y, y_pred),
    }

    # ROC-AUC is best-effort: skip it when the model has no usable
    # `predict_proba` (AttributeError), when there is no second probability
    # column (IndexError), or when the score is undefined for the given
    # labels (ValueError). Anything else - including KeyboardInterrupt -
    # now propagates instead of being silently swallowed.
    try:
        y_pred_proba = model.predict_proba(X)[:, 1]
        model_metrics['roc-auc'] = metrics.roc_auc_score(y, y_pred_proba)
    except (AttributeError, IndexError, ValueError):
        pass

    return model_metrics

In [None]:
# Load only the four columns used in this notebook from the exported CSV.
df = pd.read_csv(
    r"/content/sql_trabalho.csv",
    usecols=[
        "paciente_id",
        "medico_id",
        "sexo_paciente",
        "especialidade",
    ],
)
df.head()

Unnamed: 0,paciente_id,medico_id,sexo_paciente,especialidade
0,2,100045/SP,Female,Psicologo
1,666,100038/RJ,Female,Oncologia
2,907,100085/SP,Female,Oftalmologia
3,607,100024/SP,Female,Cardiologista
4,275,100092/SP,Male,Ginecologista


In [None]:
# Create synthetic data: a binary "recomenda" label drawn at random with
# P(0) = 0.4 and P(1) = 0.6, one value per row of `df`.
# A seeded generator makes the draw reproducible across kernel restarts
# (101 matches the random_state used elsewhere in this notebook).
rng_recomenda = np.random.default_rng(101)
rand_num = rng_recomenda.choice(2, size=len(df), p=[0.4, 0.6])
df_like = pd.DataFrame({"recomenda": rand_num}, index=df.index)

# `assign` replaces the column if it already exists, so re-running this cell
# does not append a duplicate "recomenda" column.
df = df.assign(recomenda=rand_num)
df.head()

Unnamed: 0,paciente_id,medico_id,sexo_paciente,especialidade,recomenda
0,2,100045/SP,Female,Psicologo,1
1,666,100038/RJ,Female,Oncologia,0
2,907,100085/SP,Female,Oftalmologia,1
3,607,100024/SP,Female,Cardiologista,1
4,275,100092/SP,Male,Ginecologista,0


In [None]:
# Synthetic patient age: uniform integers from 18 to 98 inclusive
# (the upper bound 99 is exclusive), one value per row of `df`.
# A dedicated seeded generator makes the draw reproducible.
rng_idade = np.random.default_rng(102)
age = rng_idade.integers(18, 99, size=len(df))
df_age = pd.DataFrame({"idade": age}, index=df.index)

# `assign` replaces the column if it already exists, so re-running this cell
# does not append a duplicate "idade" column.
df = df.assign(idade=age)
df.head()

Unnamed: 0,paciente_id,medico_id,sexo_paciente,especialidade,recomenda,idade
0,2,100045/SP,Female,Psicologo,1,23
1,666,100038/RJ,Female,Oncologia,0,62
2,907,100085/SP,Female,Oftalmologia,1,42
3,607,100024/SP,Female,Cardiologista,1,38
4,275,100092/SP,Male,Ginecologista,0,62


In [None]:
# Class balance of the target: 1 = recommends, 0 = does not recommend.
df["recomenda"].value_counts()

1    60045
0    39955
Name: recomenda, dtype: int64

In [None]:
# Categorical feature columns used by the model.
# The lists are written out by hand: 'medico_id' is deliberately left out,
# so the dtype-based selection that used to precede this assignment was dead
# code (its result was overwritten immediately).
cat_col = [
    'sexo_paciente',
    'especialidade',
]

In [None]:
# Numeric feature columns; the patient identifier and the target are not features.
num_col = ['idade']

# Full feature list (categorical columns first) and the target column name.
x = [*cat_col, *num_col]
y = 'recomenda'

In [None]:
# Hold out 20% of the rows for testing; stratify on the target so both
# partitions keep the same label proportions.
X_train, X_test, y_train, y_test = tts(
    df[x],
    df[y],
    test_size=0.2,
    stratify=df[y],
    random_state=101,
)

In [None]:

# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones (handle_unknown="ignore" keeps unseen categories from raising).
preprocessor = ColumnTransformer(
    transformers=[
        ("scaler", StandardScaler(), num_col),
        ("encoder", OneHotEncoder(handle_unknown="ignore"), cat_col),
    ]
)

# Baseline pipeline: preprocessing followed by a class-weighted random forest.
pipe = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", RandomForestClassifier(class_weight="balanced", random_state=101)),
    ]
)

In [None]:
# Fit the preprocessing steps and the classifier on the training split.
pipe.fit(X=X_train, y=y_train)

In [None]:
# Score the fitted baseline pipeline on the hold-out split.
calculate_metrics(model=pipe, X=X_test, y=y_test)

{'accuracy': 0.5002,
 'balanced_accuracy': 0.4958853441573181,
 'precision': 0.5966580236243157,
 'recall': 0.5173619785161129,
 'f1': 0.5541878512175541,
 'roc-auc': 0.49812555070828235}

In [None]:
# Candidate classifiers keyed by a short name; each one is plugged into the
# "model" step of `pipe` in turn.
model = {
    "dummy": DummyClassifier(),
    "rf": RandomForestClassifier(class_weight="balanced", random_state=101),
    "ada": AdaBoostClassifier(random_state=101),
    "lr": LogisticRegression(class_weight="balanced", random_state=101),
    "gbc": GradientBoostingClassifier(random_state=101),
    "neural": MLPClassifier(alpha=1e-05, hidden_layer_sizes=(5, 2), random_state=101,
                            solver='lbfgs'),
    "cat": CatBoostClassifier(),
    "xgb": XGBClassifier(random_state=101),
}

# Fit every candidate on the training split and score it on the hold-out split.
# `results` holds one metrics dict per model, in the iteration order of `model`.
results = []
for model_name, modelo in model.items():
    print(model_name, modelo)
    # set_params swaps the final pipeline step in place and returns the pipeline.
    pipe.set_params(model=modelo).fit(X_train, y_train)
    result = calculate_metrics(pipe, X_test, y_test)
    results.append(result)
    # Print only this model's scores; printing the whole list repeated every
    # earlier result on each iteration.
    print(result)

dummy DummyClassifier()
[{'accuracy': 0.60045, 'balanced_accuracy': 0.5, 'precision': 0.60045, 'recall': 1.0, 'f1': 0.7503514636508483, 'roc-auc': 0.5}]
rf RandomForestClassifier(class_weight='balanced', random_state=101)
[{'accuracy': 0.60045, 'balanced_accuracy': 0.5, 'precision': 0.60045, 'recall': 1.0, 'f1': 0.7503514636508483, 'roc-auc': 0.5}, {'accuracy': 0.5002, 'balanced_accuracy': 0.4958853441573181, 'precision': 0.5966580236243157, 'recall': 0.5173619785161129, 'f1': 0.5541878512175541, 'roc-auc': 0.49812555070828235}]
ada AdaBoostClassifier(random_state=101)
[{'accuracy': 0.60045, 'balanced_accuracy': 0.5, 'precision': 0.60045, 'recall': 1.0, 'f1': 0.7503514636508483, 'roc-auc': 0.5}, {'accuracy': 0.5002, 'balanced_accuracy': 0.4958853441573181, 'precision': 0.5966580236243157, 'recall': 0.5173619785161129, 'f1': 0.5541878512175541, 'roc-auc': 0.49812555070828235}, {'accuracy': 0.60045, 'balanced_accuracy': 0.5, 'precision': 0.60045, 'recall': 1.0, 'f1': 0.7503514636508483, 

In [None]:
# One row per metric, one column per model. The labels are listed in the
# order in which the models were evaluated.
df_results = pd.DataFrame(
    results,
    index=[
        'dummy',
        'rf',
        'ada',
        'lr',
        "gbc",
        "neural",
        "cat",
        "xgboost",
    ],
).T

df_results


Unnamed: 0,dummy,rf,ada,lr,gbc,neural,cat,xgboost
accuracy,0.60045,0.5002,0.60045,0.49195,0.60045,0.60045,0.599,0.59525
balanced_accuracy,0.5,0.495885,0.5,0.495212,0.5,0.5,0.500677,0.501406
precision,0.60045,0.596658,0.60045,0.595692,0.60045,0.60045,0.600778,0.601147
recall,1.0,0.517362,1.0,0.478974,1.0,1.0,0.990091,0.968524
f1,0.750351,0.554188,0.750351,0.530995,0.750351,0.750351,0.747799,0.741844
roc-auc,0.5,0.498126,0.496314,0.494654,0.498567,0.496546,0.499229,0.499345


In [None]:
# Candidate classifiers keyed by a short name; each one is plugged into the
# "model" step of `pipe` in turn.
model = {
    "dummy": DummyClassifier(),
    "rf": RandomForestClassifier(class_weight="balanced", random_state=101),
    "ada": AdaBoostClassifier(random_state=101),
    "lr": LogisticRegression(class_weight="balanced", random_state=101),
    "gbc": GradientBoostingClassifier(random_state=101),
    "neural": MLPClassifier(alpha=1e-05, hidden_layer_sizes=(5, 2), random_state=101,
                            solver='lbfgs'),
    "cat": CatBoostClassifier(),
    "xgb": XGBClassifier(random_state=101),
}

# 5-fold cross-validation of every candidate on the training split.
# `cross_results` collects one single-row frame per model.
cross_results = []
for model_name, modelo in model.items():
    print(model_name, modelo)
    pipe.set_params(model=modelo)
    cross = cv(
        pipe,
        X_train,
        y_train,
        n_jobs=-1,
        cv=5,
        scoring=["precision", "accuracy", "f1", "recall", "roc_auc"],
    )
    # Average every returned column over the folds and label the row with the model name.
    cross_result = pd.DataFrame(cross).mean().to_frame(model_name).T
    cross_results.append(cross_result)

# Stack the per-model rows once, after the loop, instead of re-concatenating
# the growing list on every iteration.
df_cross = pd.concat(cross_results)

dummy DummyClassifier()
rf RandomForestClassifier(class_weight='balanced', random_state=101)
ada AdaBoostClassifier(random_state=101)
lr LogisticRegression(class_weight='balanced', random_state=101)
gbc GradientBoostingClassifier(random_state=101)
neural MLPClassifier(alpha=1e-05, hidden_layer_sizes=(5, 2), random_state=101,
              solver='lbfgs')
cat <catboost.core.CatBoostClassifier object at 0x787519d3b550>
xgb XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child

In [None]:
# Combine the per-model cross-validation summaries into one table (one row per model).
df_cross = pd.concat(cross_results, axis=0)
df_cross

Unnamed: 0,fit_time,score_time,test_precision,test_accuracy,test_f1,test_recall,test_roc_auc
dummy,0.136824,0.133215,0.60045,0.60045,0.750351,1.0,0.5
rf,40.270096,0.839194,0.602627,0.507563,0.562913,0.528146,0.503073
ada,2.602991,0.523971,0.60045,0.60045,0.750351,1.0,0.50291
lr,0.344237,0.153862,0.601719,0.498787,0.537902,0.487904,0.502795
gbc,5.059549,0.258687,0.600463,0.600375,0.750227,0.999521,0.499926
neural,0.973339,0.101587,0.60045,0.60045,0.750351,1.0,0.501668
cat,30.583044,0.172422,0.600331,0.597775,0.746756,0.987655,0.501091
xgb,0.95653,0.393744,0.60098,0.59215,0.737561,0.954492,0.500896
