In [1]:
import pandas as pd
import numpy as np

In [2]:
df_diabetes = pd.read_csv("diabetes.csv")

colunas = list( df_diabetes.columns )
colunas

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome']

In [3]:
df_diabetes.head(3)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1


In [4]:
df_diabetes.shape

(768, 9)

In [5]:
df_diabetes.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


## Dados 

In [7]:
0 in df_diabetes['Glucose'].value_counts().index

True

In [8]:
0 in df_diabetes['DiabetesPedigreeFunction'].value_counts().index

False

In [8]:
print("# # # Quantidade de zeros nas colunas # # #")
for column in df_diabetes.columns:
    listCount = df_diabetes[column].value_counts()
    if 0 in listCount:
        print(f"{column} => {listCount[0]}")

# # # Quantidade de zeros nas colunas # # #
Pregnancies => 111
Glucose => 5
BloodPressure => 35
SkinThickness => 227
Insulin => 374
BMI => 11
Outcome => 500


In [9]:
colunas_com_zero = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

In [11]:
#FUNCAO QUE NORMALIZA UM DATAFRAME
def normalizeDf( df ):
    dfn = pd.DataFrame( columns = list(df.columns) )

    for column in df.columns:
        max_value = df.loc[:, column].max()
        dfn.loc[:, column ] = df.loc[:, column] / max_value
        
    return dfn

In [12]:
df_diabetes_n = normalizeDf( df_diabetes )

In [13]:
df_diabetes_n.head(3)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.352941,0.743719,0.590164,0.353535,0.0,0.500745,0.259091,0.617284,1.0
1,0.058824,0.427136,0.540984,0.292929,0.0,0.396423,0.145041,0.382716,0.0
2,0.470588,0.919598,0.52459,0.0,0.0,0.347243,0.277686,0.395062,1.0


In [14]:
df_diabetes[ ( df_diabetes['Glucose'] == 0 )  ]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
75,1,0,48,20,0,24.7,0.14,22,0
182,1,0,74,20,23,27.7,0.299,21,0
342,1,0,68,35,0,32.0,0.389,22,0
349,5,0,80,32,0,41.0,0.346,37,1
502,6,0,68,41,0,39.0,0.727,41,1


### Media, Moda, Mediana e Ausentes 

In [17]:
mapMensures = {}
for key in ["media", "moda", "mediana"]:
    mapMensures[key] = {}

for colunaZero in colunas_com_zero:
    dfAux = df_diabetes[ df_diabetes[colunaZero] != 0 ][ colunaZero ]
    mapMensures["media"][ colunaZero ] = dfAux.mean()
    mapMensures["moda"][ colunaZero ] = dfAux.value_counts().index[0]
    mapMensures["mediana"][ colunaZero ] = dfAux.median()

In [18]:
mapMensures["media"]

{'Glucose': 121.6867627785059,
 'BloodPressure': 72.40518417462484,
 'SkinThickness': 29.153419593345657,
 'Insulin': 155.5482233502538,
 'BMI': 32.45746367239099}

In [19]:
mapMensures["moda"]

{'Glucose': 99,
 'BloodPressure': 70,
 'SkinThickness': 32,
 'Insulin': 105,
 'BMI': 32.0}

In [20]:
mapMensures["mediana"]

{'Glucose': 117.0,
 'BloodPressure': 72.0,
 'SkinThickness': 29.0,
 'Insulin': 125.0,
 'BMI': 32.3}

In [22]:
mapMensures["media"]

{'Glucose': 121.6867627785059,
 'BloodPressure': 72.40518417462484,
 'SkinThickness': 29.153419593345657,
 'Insulin': 155.5482233502538,
 'BMI': 32.45746367239099}

In [23]:
df_diabetes_mean = df_diabetes.copy()
df_diabetes_mode = df_diabetes.copy()
df_diabetes_median = df_diabetes.copy()
for column in ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']:
    df_diabetes_mean.loc[ df_diabetes_mean[column]==0, column] = mapMensures["media"][ column ]
    df_diabetes_mode.loc[ df_diabetes_mode[column]==0, column] = mapMensures["moda"][ column ]
    df_diabetes_median.loc[ df_diabetes_median[column]==0, column] = mapMensures["mediana"][ column ]

In [24]:
df_diabetes_nozero = df_diabetes.copy()
df_diabetes_nozero = df_diabetes_nozero [ 
                             ( df_diabetes['Glucose'] != 0 ) &
                             ( df_diabetes['BloodPressure'] != 0 ) &
                             ( df_diabetes['SkinThickness'] != 0 ) &
                             ( df_diabetes['Insulin'] != 0 ) &
                             ( df_diabetes['BMI'] != 0 )
                           ]

In [29]:
df_diabetes_nozero.describe().head(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,3.30102,122.627551,70.663265,29.145408,156.056122,33.086224,0.523046,30.864796,0.331633


In [30]:
df_diabetes_mean.describe().head(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,121.686763,72.405184,29.15342,155.548223,32.457464,0.471876,33.240885,0.348958


In [35]:
df_diabetes_mean.loc[ df_diabetes_mean['Glucose']==0 ]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome


### Split de Dados

In [38]:
from sklearn.model_selection import train_test_split

In [39]:
def getDfToSplit( df ):
    X = df.loc[:, 'Pregnancies':'Age']
    y = df.loc[:, 'Outcome']
    return X, y

In [47]:
X, y = getDfToSplit( df_diabetes )
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=13)

### Metricas 

In [10]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn import metrics

### GridSearch 

In [11]:
from sklearn.model_selection import GridSearchCV

In [12]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=1)

### Pycaret 

In [37]:
from pycaret.classification import *

In [46]:
help( get_config )

Help on function get_config in module pycaret.classification:

get_config(variable: str)
    This function retrieves the global variables created when initializing the
    ``setup`` function. Following variables are accessible:
    
    - X: Transformed dataset (X)
    - y: Transformed dataset (y)
    - X_train: Transformed train dataset (X)
    - X_test: Transformed test/holdout dataset (X)
    - y_train: Transformed train dataset (y)
    - y_test: Transformed test/holdout dataset (y)
    - seed: random state set through session_id
    - prep_pipe: Transformation pipeline
    - fold_shuffle_param: shuffle parameter used in Kfolds
    - n_jobs_param: n_jobs parameter used in model training
    - html_param: html_param configured through setup
    - create_model_container: results grid storage container
    - master_model_container: model storage container
    - display_container: results display container
    - exp_name_log: Name of experiment
    - logging_param: log_experiment param


In [56]:
# df_diabetes, df_diabetes_mean, df_diabetes_mode, df_diabetes_median, df_diabetes_nozero

In [111]:
X, y = getDfToSplit( df_diabetes_nozero )
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=13)
dfTrain = pd.concat( [X_train, y_train], axis=1) # concatenando X e Y de treino
dfTrain.head(3)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
747,1,81,74,41,57,46.3,1.096,32,0
419,3,129,64,29,115,26.4,0.219,28,1
576,6,108,44,20,130,24.0,0.813,35,0


In [105]:
s = setup( dfTrain, target='Outcome')

Unnamed: 0,Description,Value
0,session_id,6873
1,Target,Outcome
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(274, 9)"
5,Missing Values,False
6,Numeric Features,7
7,Categorical Features,1
8,Ordinal Features,False
9,High Cardinality Features,False


In [106]:
# df_diabetes_nozero .3
top3zero = compare_models( n_select=3 ) #top 3 algoritmos

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.8121,0.7841,0.5833,0.8098,0.6332,0.5202,0.5562,1.969
rf,Random Forest Classifier,0.8066,0.7865,0.53,0.8187,0.6018,0.4899,0.5308,0.14
et,Extra Trees Classifier,0.8013,0.79,0.5133,0.7913,0.597,0.4773,0.5122,0.126
ridge,Ridge Classifier,0.7963,0.0,0.5133,0.7781,0.5796,0.4625,0.497,0.008
xgboost,Extreme Gradient Boosting,0.7911,0.8218,0.5833,0.72,0.6196,0.4829,0.502,0.119
gbc,Gradient Boosting Classifier,0.7858,0.7853,0.56,0.6917,0.5908,0.4553,0.4794,0.054
lda,Linear Discriminant Analysis,0.7753,0.7619,0.4967,0.7288,0.553,0.4186,0.4484,0.013
ada,Ada Boost Classifier,0.7645,0.7585,0.5433,0.6479,0.5673,0.4108,0.4308,0.051
lightgbm,Light Gradient Boosting Machine,0.7595,0.7968,0.4967,0.6967,0.5428,0.3914,0.4207,0.091
knn,K Neighbors Classifier,0.7542,0.7355,0.4467,0.65,0.513,0.3608,0.3799,0.024


In [112]:
tune_model( top3zero[0] )

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7,0.8452,0.8333,0.5,0.625,0.4,0.4364
1,0.8421,0.5714,0.4,1.0,0.5714,0.4956,0.5739
2,0.8421,0.8714,0.6,0.75,0.6667,0.5649,0.5709
3,0.7895,0.8462,0.5,0.75,0.6,0.4648,0.4824
4,0.7368,0.6282,0.1667,1.0,0.2857,0.2149,0.3469
5,0.9474,0.9615,0.8333,1.0,0.9091,0.8725,0.8797
6,0.7895,0.7308,0.6667,0.6667,0.6667,0.5128,0.5128
7,0.7895,0.7179,0.3333,1.0,0.5,0.4062,0.5049
8,0.8421,0.8333,0.8333,0.7143,0.7692,0.6503,0.6548
9,0.8421,0.8462,0.6667,0.8,0.7273,0.6174,0.6225


LogisticRegression(C=2.458, class_weight={}, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=6873, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [114]:
model_lr = create_model('lr')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7,0.8571,0.8333,0.5,0.625,0.4,0.4364
1,0.8421,0.5857,0.4,1.0,0.5714,0.4956,0.5739
2,0.8421,0.8857,0.6,0.75,0.6667,0.5649,0.5709
3,0.8421,0.8718,0.5,1.0,0.6667,0.5778,0.6374
4,0.7368,0.6282,0.1667,1.0,0.2857,0.2149,0.3469
5,0.9474,0.9359,0.8333,1.0,0.9091,0.8725,0.8797
6,0.7895,0.7692,0.6667,0.6667,0.6667,0.5128,0.5128
7,0.7368,0.6538,0.3333,0.6667,0.4444,0.2963,0.3269
8,0.8421,0.8077,0.8333,0.7143,0.7692,0.6503,0.6548
9,0.8421,0.8462,0.6667,0.8,0.7273,0.6174,0.6225


In [116]:
model_lrt = tune_model( model_lr )

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7,0.8452,0.8333,0.5,0.625,0.4,0.4364
1,0.8421,0.5714,0.4,1.0,0.5714,0.4956,0.5739
2,0.8421,0.8714,0.6,0.75,0.6667,0.5649,0.5709
3,0.7895,0.8462,0.5,0.75,0.6,0.4648,0.4824
4,0.7368,0.6282,0.1667,1.0,0.2857,0.2149,0.3469
5,0.9474,0.9615,0.8333,1.0,0.9091,0.8725,0.8797
6,0.7895,0.7308,0.6667,0.6667,0.6667,0.5128,0.5128
7,0.7895,0.7179,0.3333,1.0,0.5,0.4062,0.5049
8,0.8421,0.8333,0.8333,0.7143,0.7692,0.6503,0.6548
9,0.8421,0.8462,0.6667,0.8,0.7273,0.6174,0.6225


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [117]:
model_lrt

LogisticRegression(C=2.458, class_weight={}, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=6873, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [120]:
dfTest = pd.concat( [X_test, y_test], axis=1)

prediction = predict_model( model_lrt, data=dfTest )

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.7203,0.801,0.4737,0.5806,0.5217,0.327,0.3304


### xgboost 

In [121]:
xgboostM = create_model('xgboost')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7,0.8571,0.8333,0.5,0.625,0.4,0.4364
1,0.8421,0.8143,0.4,1.0,0.5714,0.4956,0.5739
2,0.8947,0.9571,0.6,1.0,0.75,0.6885,0.7246
3,0.8421,0.7436,0.6667,0.8,0.7273,0.6174,0.6225
4,0.6842,0.8205,0.1667,0.5,0.25,0.1094,0.1359
5,0.8947,0.8718,0.8333,0.8333,0.8333,0.7564,0.7564
6,0.6842,0.8333,0.5,0.5,0.5,0.2692,0.2692
7,0.6316,0.5769,0.3333,0.4,0.3636,0.1074,0.1083
8,0.7895,0.8846,0.6667,0.6667,0.6667,0.5128,0.5128
9,0.9474,0.859,0.8333,1.0,0.9091,0.8725,0.8797


In [123]:
xgboostMT = tune_model( xgboostM )

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.65,0.8571,1.0,0.4615,0.6316,0.375,0.4804
1,0.7368,0.7571,0.4,0.5,0.4444,0.2748,0.2777
2,0.8947,0.9714,0.8,0.8,0.8,0.7286,0.7286
3,0.7895,0.7051,0.6667,0.6667,0.6667,0.5128,0.5128
4,0.7368,0.8205,0.3333,0.6667,0.4444,0.2963,0.3269
5,0.8947,0.8974,0.8333,0.8333,0.8333,0.7564,0.7564
6,0.7895,0.8333,0.8333,0.625,0.7143,0.5529,0.5673
7,0.5789,0.641,0.5,0.375,0.4286,0.1059,0.1086
8,0.7895,0.8205,0.8333,0.625,0.7143,0.5529,0.5673
9,0.7895,0.8462,0.6667,0.6667,0.6667,0.5128,0.5128


In [135]:
prediction = predict_model( xgboostMT, data=dfTrain )

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.938,0.9644,0.9239,0.8947,0.9091,0.862,0.8623


In [132]:
prediction = predict_model( xgboostMT, data=dfTest )

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.7627,0.7947,0.7105,0.6136,0.6585,0.4782,0.4812


In [136]:
prediction = predict_model( xgboostM, data=dfTrain )

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.9489,0.9617,0.9022,0.9432,0.9222,0.8842,0.8847


In [125]:
prediction = predict_model( xgboostM, data=dfTest )

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.8136,0.7918,0.6579,0.7353,0.6944,0.5609,0.5627


In [127]:
xgboostM

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=-1,
              num_parallel_tree=1, objective='binary:logistic',
              predictor='auto', random_state=6873, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='auto',
              use_label_encoder=True, validate_parameters=1, verbosity=0)

In [128]:
xgboostMT

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.9,
              enable_categorical=False, gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.3, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=240, n_jobs=-1, num_parallel_tree=1,
              objective='binary:logistic', predictor='auto', random_state=6873,
              reg_alpha=0.0001, reg_lambda=4, scale_pos_weight=42.2,
              subsample=1, tree_method='auto', use_label_encoder=True,
              validate_parameters=1, verbosity=0)

In [None]:
'''
This function tunes the hyperparameters of a given estimator. The output of
    997     this function is a score grid with CV scores by fold of the best selected
''';

In [142]:
#xgboostMT2 = tune_model( xgboostM, search_algorithm='grid' )
xgboostMT2 = tune_model( xgboostM, optimize='fFFF' )

ValueError: Optimize method not supported. See docstring for list of available parameters.

In [141]:
prediction = predict_model( xgboostM, data=dfTest )

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.8136,0.7918,0.6579,0.7353,0.6944,0.5609,0.5627


In [144]:
save_model( xgboostM, 'xgboostM8136')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[], target='Outcome',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_stra...
                                learning_rate=0.300000012, max_delta_step=0,
                                max_depth=6, min_child_weight=1, missing=nan,
                                monotone_constraints='()', n_estimators=100,
                                n_jobs=-1,

In [148]:
prediction = predict_model( lm, data=dfTest )

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.8136,0.7918,0.6579,0.7353,0.6944,0.5609,0.5627


In [146]:
lm = load_model('xgboostM8136')

Transformation Pipeline and Model Successfully Loaded


In [139]:
prediction = predict_model( xgboostMT2, data=dfTest )

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.6864,0.8066,0.7895,0.5085,0.6186,0.3729,0.399


  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [118]:
'''
prediction = predict_model( LogisticRegression(C=2.458, class_weight={}, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=6873, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False), data=dfTest )
''';                   

In [75]:
# df_diabetes_mode .3
top3mode = compare_models( n_select=3 ) #top 3 algoritmos

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.7737,0.7896,0.634,0.6726,0.6493,0.4826,0.4857,0.023
gbc,Gradient Boosting Classifier,0.7549,0.8044,0.5551,0.669,0.5946,0.4228,0.4347,0.062
ridge,Ridge Classifier,0.7548,0.0,0.5122,0.6762,0.5715,0.4064,0.421,0.008
lda,Linear Discriminant Analysis,0.7548,0.7963,0.5192,0.6745,0.574,0.408,0.4225,0.009
lr,Logistic Regression,0.7547,0.8067,0.5282,0.6767,0.5815,0.4132,0.4273,0.153
rf,Random Forest Classifier,0.7547,0.8159,0.5288,0.6637,0.5781,0.4105,0.4222,0.163
dt,Decision Tree Classifier,0.7546,0.7138,0.5942,0.6532,0.6055,0.4312,0.4442,0.008
xgboost,Extreme Gradient Boosting,0.7524,0.807,0.6038,0.6389,0.6094,0.4305,0.4381,0.146
lightgbm,Light Gradient Boosting Machine,0.7443,0.8133,0.5705,0.6386,0.5912,0.4074,0.4167,0.134
ada,Ada Boost Classifier,0.7307,0.7745,0.5699,0.599,0.575,0.3802,0.3869,0.062


In [83]:
# df_diabetes_mean .2
top3mean = compare_models( n_select=3 ) #top 3 algoritmos

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.774,0.0,0.6019,0.7207,0.6492,0.4861,0.4952,0.008
lr,Logistic Regression,0.7715,0.8429,0.6086,0.711,0.6482,0.4822,0.491,0.136
lightgbm,Light Gradient Boosting Machine,0.7715,0.8248,0.6429,0.6974,0.6629,0.4915,0.4974,0.141
lda,Linear Discriminant Analysis,0.7666,0.8399,0.5886,0.7096,0.6364,0.4686,0.4775,0.009
gbc,Gradient Boosting Classifier,0.7618,0.8292,0.6233,0.68,0.6438,0.4671,0.4731,0.064
et,Extra Trees Classifier,0.759,0.8156,0.5876,0.6918,0.6235,0.4513,0.4625,0.13
rf,Random Forest Classifier,0.7565,0.8285,0.581,0.6959,0.6254,0.4487,0.4583,0.143
knn,K Neighbors Classifier,0.7418,0.772,0.6167,0.6456,0.6273,0.4306,0.4338,0.022
xgboost,Extreme Gradient Boosting,0.7394,0.8131,0.6086,0.6373,0.6162,0.421,0.4253,0.152
ada,Ada Boost Classifier,0.7321,0.7795,0.6514,0.6124,0.6279,0.4195,0.4228,0.056


In [82]:
# df_diabetes_mean .2
top3mean = compare_models( n_select=3 ) #top 3 algoritmos

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.774,0.0,0.6019,0.7207,0.6492,0.4861,0.4952,0.009
lr,Logistic Regression,0.7715,0.8429,0.6086,0.711,0.6482,0.4822,0.491,1.215
lightgbm,Light Gradient Boosting Machine,0.7715,0.8248,0.6429,0.6974,0.6629,0.4915,0.4974,0.141
lda,Linear Discriminant Analysis,0.7666,0.8399,0.5886,0.7096,0.6364,0.4686,0.4775,0.008
gbc,Gradient Boosting Classifier,0.7618,0.8292,0.6233,0.68,0.6438,0.4671,0.4731,0.067
et,Extra Trees Classifier,0.759,0.8156,0.5876,0.6918,0.6235,0.4513,0.4625,0.146
rf,Random Forest Classifier,0.7565,0.8285,0.581,0.6959,0.6254,0.4487,0.4583,0.154
knn,K Neighbors Classifier,0.7418,0.772,0.6167,0.6456,0.6273,0.4306,0.4338,0.023
xgboost,Extreme Gradient Boosting,0.7394,0.8131,0.6086,0.6373,0.6162,0.421,0.4253,0.153
ada,Ada Boost Classifier,0.7321,0.7795,0.6514,0.6124,0.6279,0.4195,0.4228,0.056


In [88]:
# df_diabetes_mean .3
top3mean = compare_models( n_select=3 ) #top 3 algoritmos df de media

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.7974,0.8335,0.5417,0.7527,0.6245,0.4916,0.5076,0.64
ridge,Ridge Classifier,0.792,0.0,0.5242,0.7516,0.6094,0.4751,0.4942,0.008
lda,Linear Discriminant Analysis,0.7787,0.8236,0.5242,0.7059,0.5939,0.4481,0.4617,0.009
gbc,Gradient Boosting Classifier,0.7711,0.8186,0.578,0.6586,0.6067,0.449,0.456,0.064
dt,Decision Tree Classifier,0.7676,0.7436,0.6788,0.6251,0.6472,0.475,0.479,0.013
xgboost,Extreme Gradient Boosting,0.7656,0.8093,0.5591,0.6584,0.5969,0.4346,0.4423,0.147
rf,Random Forest Classifier,0.7602,0.8086,0.4924,0.6638,0.5596,0.4013,0.4126,0.143
et,Extra Trees Classifier,0.7602,0.7959,0.4659,0.6512,0.5391,0.3883,0.3977,0.136
lightgbm,Light Gradient Boosting Machine,0.7522,0.8353,0.5417,0.6242,0.5693,0.4,0.4076,0.138
ada,Ada Boost Classifier,0.7521,0.7779,0.5583,0.6327,0.5712,0.4049,0.4177,0.054


In [51]:
# df_diabetes
top3 = compare_models( n_select=3 ) #top 3 algoritmos

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.7616,0.0,0.5516,0.6667,0.5923,0.4317,0.4416,0.021
lda,Linear Discriminant Analysis,0.7591,0.8018,0.5516,0.6622,0.5895,0.4269,0.4374,0.018
lr,Logistic Regression,0.7567,0.8095,0.5516,0.6577,0.5914,0.4243,0.4321,2.421
gbc,Gradient Boosting Classifier,0.7517,0.811,0.5962,0.6448,0.6143,0.4331,0.4375,0.068
ada,Ada Boost Classifier,0.747,0.7919,0.5868,0.6373,0.6091,0.4227,0.4249,0.055
rf,Random Forest Classifier,0.7468,0.809,0.5055,0.6647,0.5685,0.3961,0.4069,0.157
lightgbm,Light Gradient Boosting Machine,0.7393,0.8067,0.55,0.6327,0.5817,0.3954,0.4016,0.163
xgboost,Extreme Gradient Boosting,0.7343,0.7925,0.5786,0.6206,0.5919,0.3963,0.4019,0.241
et,Extra Trees Classifier,0.717,0.7977,0.433,0.6106,0.4973,0.3134,0.3258,0.134
knn,K Neighbors Classifier,0.7146,0.742,0.4901,0.5867,0.529,0.3292,0.3343,0.123


In [94]:
evaluate_model( top3mean[0] )

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [100]:
tune_model( top3mean[1] ) 

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8158,0.0,0.5833,0.7778,0.6667,0.543,0.5537
1,0.7105,0.0,0.25,0.6,0.3529,0.2053,0.238
2,0.6842,0.0,0.4167,0.5,0.4545,0.2349,0.2368
3,0.8684,0.0,0.5833,1.0,0.7368,0.657,0.6995
4,0.8684,0.0,0.5833,1.0,0.7368,0.657,0.6995
5,0.8378,0.0,0.75,0.75,0.75,0.63,0.63
6,0.8378,0.0,0.5833,0.875,0.7,0.5949,0.6178
7,0.8108,0.0,0.5,0.8571,0.6316,0.5159,0.5498
8,0.7838,0.0,0.4545,0.7143,0.5556,0.4219,0.4407
9,0.7568,0.0,0.5455,0.6,0.5714,0.4022,0.403


RidgeClassifier(alpha=5.25, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=5172,
                solver='auto', tol=0.001)

In [102]:
dfTest = pd.concat( [X_test, y_test], axis=1)

prediction = predict_model( top3mean[0], data=dfTest )

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.7403,0.8088,0.5402,0.7015,0.6104,0.4205,0.4285


In [108]:
dfTest = pd.concat( [X_test, y_test], axis=1)

prediction = predict_model( tune_model( top3mean[0] ), data=dfTest )

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7,0.8452,0.8333,0.5,0.625,0.4,0.4364
1,0.8421,0.5714,0.4,1.0,0.5714,0.4956,0.5739
2,0.8421,0.8714,0.6,0.75,0.6667,0.5649,0.5709
3,0.7895,0.8462,0.5,0.75,0.6,0.4648,0.4824
4,0.7368,0.6282,0.1667,1.0,0.2857,0.2149,0.3469
5,0.9474,0.9615,0.8333,1.0,0.9091,0.8725,0.8797
6,0.7895,0.7308,0.6667,0.6667,0.6667,0.5128,0.5128
7,0.7895,0.7179,0.3333,1.0,0.5,0.4062,0.5049
8,0.8421,0.8333,0.8333,0.7143,0.7692,0.6503,0.6548
9,0.8421,0.8462,0.6667,0.8,0.7273,0.6174,0.6225


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.7203,0.801,0.4737,0.5806,0.5217,0.327,0.3304


# FIM PYCARET 2.0

In [143]:
lista_modelos = [ 'lr', 'gbc', 'rf', 'ada', 'nb', 'svm', 'xgboost']

#### Removendo Coluna De Funcao 

In [150]:
df_diabetes_aux = df_diabetes.loc[:, ['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','Age','Outcome']]

In [151]:
#df_diabetes
s = setup(df_diabetes_aux, target = 'Outcome')

Unnamed: 0,Description,Value
0,session_id,6436
1,Target,Outcome
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(768, 8)"
5,Missing Values,False
6,Numeric Features,6
7,Categorical Features,1
8,Ordinal Features,False
9,High Cardinality Features,False


In [152]:
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.7637,0.8292,0.5895,0.6965,0.6361,0.4636,0.4687,0.155
lightgbm,Light Gradient Boosting Machine,0.762,0.8164,0.6158,0.6866,0.6468,0.4684,0.4721,0.219
ridge,Ridge Classifier,0.7582,0.0,0.5632,0.7036,0.6182,0.446,0.4567,0.152
lda,Linear Discriminant Analysis,0.7563,0.8099,0.5684,0.6977,0.6188,0.444,0.4541,0.016
xgboost,Extreme Gradient Boosting,0.7545,0.8144,0.6316,0.6651,0.6442,0.4577,0.4611,0.26
lr,Logistic Regression,0.7489,0.8135,0.5684,0.6817,0.6116,0.43,0.4394,5.018
gbc,Gradient Boosting Classifier,0.7469,0.8168,0.5947,0.6792,0.6244,0.4363,0.4457,0.069
et,Extra Trees Classifier,0.7394,0.8016,0.5421,0.668,0.594,0.406,0.4139,0.13
ada,Ada Boost Classifier,0.7378,0.7933,0.5789,0.6499,0.6069,0.4123,0.4178,0.051
knn,K Neighbors Classifier,0.7302,0.7624,0.5632,0.6489,0.5955,0.396,0.4035,0.029


In [153]:
evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

### Predizendo 

In [154]:
predict_model(best)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.7446,0.7743,0.4872,0.6667,0.563,0.3886,0.3982


Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,Age,Pregnancies_0,Pregnancies_1,Pregnancies_10,Pregnancies_11,...,Pregnancies_3,Pregnancies_4,Pregnancies_5,Pregnancies_6,Pregnancies_7,Pregnancies_8,Pregnancies_9,Outcome,Label,Score
0,96.0,64.0,27.0,87.0,33.200001,21.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.98
1,126.0,60.0,0.0,0.0,30.100000,47.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,0.51
2,107.0,80.0,0.0,0.0,24.600000,34.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0,0.82
3,89.0,24.0,19.0,25.0,27.799999,21.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1.00
4,138.0,0.0,0.0,0.0,36.299999,25.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,0.64
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226,110.0,66.0,0.0,0.0,31.900000,29.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0,0.52
227,110.0,74.0,29.0,125.0,32.400002,27.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.69
228,0.0,68.0,35.0,0.0,32.000000,22.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.98
229,111.0,62.0,13.0,182.0,24.000000,23.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.96


In [21]:
predict_model(best)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.8095,0.8602,0.6163,0.8281,0.7067,0.5701,0.5837


Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Pregnancies_0,Pregnancies_1,Pregnancies_10,...,Pregnancies_3,Pregnancies_4,Pregnancies_5,Pregnancies_6,Pregnancies_7,Pregnancies_8,Pregnancies_9,Outcome,Label,Score
0,111.0,90.0,12.0,78.0,28.400000,0.495,29.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.7810
1,108.0,88.0,19.0,0.0,27.100000,0.400,24.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.9128
2,177.0,60.0,29.0,478.0,34.599998,1.072,21.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,0.6435
3,92.0,92.0,0.0,0.0,19.900000,0.188,28.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0,0.9476
4,108.0,60.0,46.0,178.0,35.500000,0.415,24.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.8321
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226,100.0,54.0,28.0,105.0,37.799999,0.498,24.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.8435
227,131.0,66.0,40.0,0.0,34.299999,0.196,22.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0.7801
228,77.0,82.0,41.0,42.0,35.799999,0.156,35.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,0.8995
229,95.0,85.0,25.0,36.0,37.400002,0.247,24.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0.9086


In [22]:
save_model(best, 'best_pycaret_model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[], target='Outcome',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_stra...
                 ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'),
                 ('dfs', 'passthrough'), ('pca', 'passthrough'),
                 ['trained_model',
                  LogisticRegression(C=1.0, class_weight=None, dual=False,
                 

In [25]:
type(best), best

(sklearn.linear_model._logistic.LogisticRegression,
 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=1000,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=2044, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False))

### xgboost 

In [12]:
import xgboost

### random forest 

In [13]:
from sklearn.ensemble import RandomForestClassifier

In [27]:
gridRF = {
    'kernel': ['linear', 'rbf', 'polynomial', 'sigmoid'],
    #'C': ['max_leaf_nodes', 'max_depth', 'bootstrap', 'ccp_alpha', 'class_weight', 'criterion'],
    'gamma': [0.01, 0.5, 1.0, 10, 100],
    'class_weight': ['balanced', 'unbalanced']
}

In [23]:
# C => ['bootstrap', 'ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start']

In [25]:
grid_search = GridSearchCV(estimator=RandomForestClassifier(), 
                           param_grid=gridRF, 
                           n_jobs=-1, 
                           cv=cv, 
                           scoring='accuracy')

In [None]:
grid_result = grid_search.fit(X, y)

### Testes 

In [143]:
ss = '''1	Region	Gender	Style	Ship date	Units	Price	Cost
2	East	Boy	Tee	2005-01-31	12	11.04	10.42
3	East	Boy	Golf	2005-01-31	12	13.00	12.60
4	East	Boy	Fancy	2005-01-31	12	11.96	11.74
5	East	Girl	Tee	2005-01-31	10	11.27	10.56
6	East	Girl	Golf	2005-01-31	10	12.12	11.95
7	East	Girl	Fancy	2005-01-31	10	13.74	13.33
8	West	Boy	Tee	2005-01-31	11	11.44	10.94
9	West	Boy	Golf	2005-01-31	11	12.63	11.73
10	West	Boy	Fancy	2005-01-31	11	12.06	11.51
11	West	Girl	Tee	2005-01-31	15	13.42	13.29
12	West	Girl	Golf	2005-01-31	15	11.48	10.67'''

dfa = pd.DataFrame( columns="Region	Gender	Style	Ship date	Units	Price	Cost".split('\t'))
for line in ss.split('\n')[1:]:
    dfa.loc[len(dfa)] = line.split("\t")[1:]
    
dfa['Units'] = dfa.Units.astype(int)
dfa['Price'] = dfa.Price.astype(float)
dfa['Cost']  = dfa.Cost.astype(float)
dfa['Ship date'] = pd.to_datetime( dfa['Ship date'] )

dfa.groupby(["Region", "Ship date", "Gender"]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Units,Price,Cost
Region,Ship date,Gender,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
East,2005-01-31,Boy,36,36.0,34.76
East,2005-01-31,Girl,30,37.13,35.84
West,2005-01-31,Boy,33,36.13,34.18
West,2005-01-31,Girl,30,24.9,23.96


In [144]:
dfa[ (dfa.Region == 'East') & (dfa['Ship date'] == '2005-01-31') ].loc[:,'Price'].sum()

73.13

In [145]:
dfa.groupby(["Region", "Ship date", "Gender"])['Units', 'Price'].agg(['sum', 'count', 'max', 'min'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Units,Units,Units,Units,Price,Price,Price,Price
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum,count,max,min,sum,count,max,min
Region,Ship date,Gender,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
East,2005-01-31,Boy,36,3,12,12,36.0,3,13.0,11.04
East,2005-01-31,Girl,30,3,10,10,37.13,3,13.74,11.27
West,2005-01-31,Boy,33,3,11,11,36.13,3,12.63,11.44
West,2005-01-31,Girl,30,2,15,15,24.9,2,13.42,11.48
