In [2]:
from pycaret.classification import *
import pandas as pd
import numpy as np

In [3]:
df_diabetes = pd.read_csv("diabetes.csv")

colunas = list( df_diabetes.columns )
colunas

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome']

In [4]:
df_diabetes.head(3)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1


In [5]:
#FUNCAO QUE NORMALIZA UM DATAFRAME
def normalizeDf( df ):
    dfn = pd.DataFrame( columns = list(df.columns) )

    for column in df.columns:
        max_value = df.loc[:, column].max()
        dfn.loc[:, column ] = df.loc[:, column] / max_value
        
    return dfn

#### Df Normalizado 

In [6]:
df_diabetes_n = normalizeDf( df_diabetes )

#### Media, Moda, Mediana e Ausentes 

In [7]:
colunas_com_zero = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
mapMensures = {}
for key in ["media", "moda", "mediana"]:
    mapMensures[key] = {}

for colunaZero in colunas_com_zero:
    dfAux = df_diabetes[ df_diabetes[colunaZero] != 0 ][ colunaZero ]
    mapMensures["media"][ colunaZero ] = dfAux.mean()
    mapMensures["moda"][ colunaZero ] = dfAux.value_counts().index[0]
    mapMensures["mediana"][ colunaZero ] = dfAux.median()

In [8]:
df_diabetes_mean = df_diabetes.copy()
df_diabetes_mode = df_diabetes.copy()
df_diabetes_median = df_diabetes.copy()
for column in ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']:
    df_diabetes_mean.loc[ df_diabetes_mean[column]==0, column] = mapMensures["media"][ column ]
    df_diabetes_mode.loc[ df_diabetes_mode[column]==0, column] = mapMensures["moda"][ column ]
    df_diabetes_median.loc[ df_diabetes_median[column]==0, column] = mapMensures["mediana"][ column ]

In [9]:
df_diabetes_nozero = df_diabetes.copy()
df_diabetes_nozero = df_diabetes_nozero [ 
                             ( df_diabetes['Glucose'] != 0 ) &
                             ( df_diabetes['BloodPressure'] != 0 ) &
                             ( df_diabetes['SkinThickness'] != 0 ) &
                             ( df_diabetes['Insulin'] != 0 ) &
                             ( df_diabetes['BMI'] != 0 )
                           ]

In [10]:
dicDf = {
    'normal':df_diabetes,
    'mean':df_diabetes_mean,
    'mode': df_diabetes_mode,
    'median':df_diabetes_median,
    'noZero':df_diabetes_nozero
}

In [11]:
mapMensures['media']

{'Glucose': 121.6867627785059,
 'BloodPressure': 72.40518417462484,
 'SkinThickness': 29.153419593345657,
 'Insulin': 155.5482233502538,
 'BMI': 32.45746367239099}

In [12]:
pd.concat([pd.DataFrame.from_dict( mapMensures['media'], orient='index', columns=['Media']).round(2),
           pd.DataFrame.from_dict( mapMensures['moda'], orient='index', columns=['Moda']),
           pd.DataFrame.from_dict( mapMensures['mediana'], orient='index', columns=['Mediana'])], axis= 1)

Unnamed: 0,Media,Moda,Mediana
Glucose,121.69,99.0,117.0
BloodPressure,72.41,70.0,72.0
SkinThickness,29.15,32.0,29.0
Insulin,155.55,105.0,125.0
BMI,32.46,32.0,32.3


#### Split

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
def getDfToSplit( df ):
    X = df.loc[:, 'Pregnancies':'Age']
    y = df.loc[:, 'Outcome']
    return X, y

### Models 

In [15]:
from pycaret.classification import *

In [16]:
lista_modelos = [ 'lr', 'gbc', 'rf', 'ada', 'nb', 'svm', 'knn', 'lda', 'xgboost']
dicM = {}

In [17]:
def getModelsDic():
    dic = {}
    for model in lista_modelos:
        dic[model] = create_model( model )
    
    return dic

#### Df Normal: df_diabetes

In [18]:
X, y = getDfToSplit( df_diabetes )
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=13)
dfTrain = pd.concat( [X_train, y_train], axis=1)
dfTest  = pd.concat( [ X_test, y_test], axis=1)

s = setup( dfTrain, target='Outcome')

Unnamed: 0,Description,Value
0,session_id,7152
1,Target,Outcome
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(537, 9)"
5,Missing Values,False
6,Numeric Features,7
7,Categorical Features,1
8,Ordinal Features,False
9,High Cardinality Features,False


In [19]:
dicM['normal'] = getModelsDic()

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7632,0.7846,0.6154,0.6667,0.64,0.4639,0.4648
1,0.7632,0.84,0.4615,0.75,0.5714,0.4203,0.444
2,0.7368,0.7938,0.5385,0.6364,0.5833,0.393,0.3959
3,0.7105,0.7477,0.6154,0.5714,0.5926,0.3686,0.3692
4,0.7632,0.8123,0.6154,0.6667,0.64,0.4639,0.4648
5,0.7027,0.7276,0.6923,0.5625,0.6207,0.3805,0.3861
6,0.7838,0.7788,0.6923,0.6923,0.6923,0.5256,0.5256
7,0.7297,0.7436,0.5385,0.6364,0.5833,0.3854,0.3883
8,0.7297,0.7167,0.4167,0.625,0.5,0.3248,0.3374
9,0.8108,0.91,0.5833,0.7778,0.6667,0.5383,0.5492


In [20]:
for model in dicM['normal'].keys():
    predict_model( dicM['normal'][model], data=dfTest );

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.7532,0.8251,0.5402,0.7344,0.6225,0.4455,0.4571


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.7316,0.8266,0.5517,0.6761,0.6076,0.4068,0.4117


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.7446,0.8178,0.5172,0.7258,0.604,0.4233,0.4365


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Ada Boost Classifier,0.7316,0.7629,0.5632,0.6712,0.6125,0.4096,0.4133


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Naive Bayes,0.6407,0.6803,0.2414,0.5526,0.336,0.1388,0.1612


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,SVM - Linear Kernel,0.6104,0.5897,0.5057,0.4835,0.4944,0.1777,0.1779


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,K Neighbors Classifier,0.71,0.737,0.5057,0.6471,0.5677,0.3544,0.3605


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Linear Discriminant Analysis,0.7446,0.8085,0.5517,0.7059,0.6194,0.4315,0.4389


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.7316,0.8082,0.5172,0.6923,0.5921,0.3983,0.4077


  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [100]:
for model in dicM['normal'].keys():
    predict_model( dicM['normal'][model], data=dfTest );

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.7792,0.8258,0.5977,0.7647,0.671,0.5086,0.5173


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.7576,0.8053,0.5747,0.7246,0.641,0.4617,0.4687


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.7446,0.8054,0.4828,0.75,0.5874,0.4148,0.4359


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Ada Boost Classifier,0.7532,0.7882,0.5747,0.7143,0.6369,0.4534,0.4595


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Naive Bayes,0.6537,0.7197,0.3333,0.5686,0.4203,0.1967,0.2109


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,SVM - Linear Kernel,0.6234,0.5,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,K Neighbors Classifier,0.7359,0.747,0.5402,0.6912,0.6065,0.4122,0.4193


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Linear Discriminant Analysis,0.7749,0.8231,0.5977,0.7536,0.6667,0.5001,0.5078


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.7229,0.8032,0.4943,0.6825,0.5733,0.3759,0.3866


  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  elif isinstance(data.columns, (pd.Int64Index, pd

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.

  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [95]:
for model in dicM['normal'].keys():
    predict_model( dicM['normal'][model], data=dfTestNormal );

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.7446,0.8123,0.5287,0.7188,0.6093,0.426,0.4371


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.7835,0.8348,0.6437,0.7467,0.6914,0.5261,0.5295


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.7619,0.8189,0.5977,0.7222,0.6541,0.475,0.48


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Ada Boost Classifier,0.7446,0.8147,0.6092,0.6795,0.6424,0.4447,0.4463


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Naive Bayes,0.632,0.707,0.3333,0.5179,0.4056,0.1569,0.1649


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,SVM - Linear Kernel,0.4199,0.4574,0.6092,0.3464,0.4417,-0.0741,-0.0873


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,K Neighbors Classifier,0.697,0.7397,0.5402,0.6104,0.5732,0.3396,0.3411


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Linear Discriminant Analysis,0.7446,0.8064,0.5517,0.7059,0.6194,0.4315,0.4389


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.7446,0.7617,0.6897,0.6522,0.6704,0.4622,0.4627


In [98]:
for model in dicM['normal'].keys():
    predict_model( dicM['normal'][model], data=dfTest );

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.7792,0.8258,0.5977,0.7647,0.671,0.5086,0.5173


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.7576,0.8053,0.5747,0.7246,0.641,0.4617,0.4687


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.7446,0.8054,0.4828,0.75,0.5874,0.4148,0.4359


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Ada Boost Classifier,0.7532,0.7882,0.5747,0.7143,0.6369,0.4534,0.4595


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Naive Bayes,0.6537,0.7197,0.3333,0.5686,0.4203,0.1967,0.2109


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,SVM - Linear Kernel,0.6234,0.5,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,K Neighbors Classifier,0.7359,0.747,0.5402,0.6912,0.6065,0.4122,0.4193


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Linear Discriminant Analysis,0.7749,0.8231,0.5977,0.7536,0.6667,0.5001,0.5078


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.7229,0.8032,0.4943,0.6825,0.5733,0.3759,0.3866


#### Df Mean: df_diabetes_mean

In [44]:
X, y = getDfToSplit( df_diabetes_mean )
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=13)
dfTrain = pd.concat( [X_train, y_train], axis=1)
dfTest  = pd.concat( [ X_test, y_test], axis=1)

s = setup( dfTrain, target='Outcome')

Unnamed: 0,Description,Value
0,session_id,5876
1,Target,Outcome
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(537, 9)"
5,Missing Values,False
6,Numeric Features,7
7,Categorical Features,1
8,Ordinal Features,False
9,High Cardinality Features,False


In [45]:
dicM['mean'] = getModelsDic()

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7632,0.7785,0.4615,0.75,0.5714,0.4203,0.444
1,0.7632,0.8677,0.8462,0.6111,0.7097,0.5183,0.5379
2,0.7368,0.8769,0.6154,0.6154,0.6154,0.4154,0.4154
3,0.6579,0.6122,0.3333,0.4444,0.381,0.1512,0.1542
4,0.7895,0.7788,0.4167,0.8333,0.5556,0.437,0.4821
5,0.7027,0.7367,0.5,0.5455,0.5217,0.3066,0.3073
6,0.6757,0.7733,0.5,0.5,0.5,0.26,0.26
7,0.7027,0.7833,0.5833,0.5385,0.56,0.3361,0.3367
8,0.8378,0.8667,0.75,0.75,0.75,0.63,0.63
9,0.7568,0.7933,0.5833,0.6364,0.6087,0.4327,0.4336


In [46]:
for model in dicM['mean'].keys():
    predict_model( dicM['mean'][model], data=dfTest );

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.7749,0.8368,0.5977,0.7536,0.6667,0.5001,0.5078


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.7489,0.8085,0.5977,0.6933,0.642,0.4503,0.4532


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.7489,0.8142,0.5402,0.7231,0.6184,0.4371,0.4474


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Ada Boost Classifier,0.7229,0.8089,0.5862,0.6456,0.6145,0.399,0.4002


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Naive Bayes,0.6667,0.7491,0.4828,0.5676,0.5217,0.2685,0.2705


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,SVM - Linear Kernel,0.6364,0.5582,0.2414,0.5385,0.3333,0.1306,0.1505


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,K Neighbors Classifier,0.7013,0.7361,0.5517,0.6154,0.5818,0.3506,0.3518


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Linear Discriminant Analysis,0.7489,0.8296,0.5517,0.7164,0.6234,0.4398,0.4482


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.7316,0.7821,0.6092,0.6543,0.631,0.4205,0.4212


#### Df Mode: df_diabetes_mode

In [48]:
X, y = getDfToSplit( df_diabetes_mode )
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=13)
dfTrain = pd.concat( [X_train, y_train], axis=1)
dfTest  = pd.concat( [ X_test, y_test], axis=1)

s = setup( dfTrain, target='Outcome')

Unnamed: 0,Description,Value
0,session_id,7415
1,Target,Outcome
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(537, 9)"
5,Missing Values,False
6,Numeric Features,7
7,Categorical Features,1
8,Ordinal Features,False
9,High Cardinality Features,False


In [49]:
dicM['mode'] = getModelsDic()

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7895,0.7981,0.5,0.75,0.6,0.4648,0.4824
1,0.7632,0.8429,0.4167,0.7143,0.5263,0.3827,0.4074
2,0.7368,0.7628,0.6667,0.5714,0.6154,0.4172,0.42
3,0.7368,0.859,0.5833,0.5833,0.5833,0.391,0.391
4,0.6842,0.734,0.6667,0.5,0.5714,0.3294,0.338
5,0.8108,0.7692,0.6364,0.7,0.6667,0.535,0.5362
6,0.7297,0.8042,0.3636,0.5714,0.4444,0.2773,0.2897
7,0.7568,0.8462,0.5455,0.6,0.5714,0.4022,0.403
8,0.7027,0.7203,0.1818,0.5,0.2667,0.1285,0.1544
9,0.7297,0.8067,0.6667,0.5714,0.6154,0.4089,0.4118


In [50]:
for model in dicM['mode'].keys():
    predict_model( dicM['mode'][model], data=dfTest );

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.7792,0.7992,0.5517,0.8,0.6531,0.499,0.5176


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.7446,0.8277,0.4828,0.75,0.5874,0.4148,0.4359


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.7446,0.8086,0.4713,0.7593,0.5816,0.4119,0.4362


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Ada Boost Classifier,0.7229,0.7789,0.4943,0.6825,0.5733,0.3759,0.3866


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Naive Bayes,0.6623,0.6629,0.4483,0.5652,0.5,0.2502,0.254


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,SVM - Linear Kernel,0.5801,0.5927,0.6437,0.459,0.5359,0.1717,0.1799


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,K Neighbors Classifier,0.7056,0.736,0.4598,0.6557,0.5405,0.3337,0.3451


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Linear Discriminant Analysis,0.7619,0.7866,0.5632,0.7424,0.6405,0.4675,0.4775


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.697,0.7731,0.4483,0.6393,0.527,0.3141,0.3248


#### Df Median: df_diabetes_median

In [55]:
X, y = getDfToSplit( df_diabetes_median )
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=13)
dfTrain = pd.concat( [X_train, y_train], axis=1)
dfTest  = pd.concat( [ X_test, y_test], axis=1)

s = setup( dfTrain, target='Outcome')

Unnamed: 0,Description,Value
0,session_id,6233
1,Target,Outcome
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(537, 9)"
5,Missing Values,False
6,Numeric Features,7
7,Categorical Features,1
8,Ordinal Features,False
9,High Cardinality Features,False


In [56]:
dicM['median'] = getModelsDic()

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7632,0.8974,0.6667,0.6154,0.64,0.4639,0.4648
1,0.8421,0.8558,0.5833,0.875,0.7,0.5986,0.6212
2,0.7368,0.7415,0.3846,0.7143,0.5,0.3426,0.3728
3,0.7105,0.7754,0.6923,0.5625,0.6207,0.3907,0.3962
4,0.7105,0.7662,0.3846,0.625,0.4762,0.2915,0.3079
5,0.7297,0.7533,0.4167,0.625,0.5,0.3248,0.3374
6,0.6486,0.7267,0.5,0.4615,0.48,0.2153,0.2157
7,0.7027,0.7967,0.5,0.5455,0.5217,0.3066,0.3073
8,0.7838,0.8967,0.8333,0.625,0.7143,0.546,0.5606
9,0.6486,0.81,0.6667,0.4706,0.5517,0.2767,0.2881


In [58]:
for model in dicM['median'].keys():
    predict_model( dicM['median'][model], data=dfTest );

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.7359,0.813,0.5402,0.6912,0.6065,0.4122,0.4193


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.7359,0.8148,0.5862,0.6711,0.6258,0.4232,0.4255


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.7403,0.8235,0.5402,0.7015,0.6104,0.4205,0.4285


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Ada Boost Classifier,0.7186,0.7674,0.5977,0.6341,0.6154,0.3938,0.3943


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Naive Bayes,0.6623,0.7234,0.2644,0.6216,0.371,0.1886,0.2208


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,SVM - Linear Kernel,0.6407,0.5389,0.1264,0.6111,0.2095,0.0923,0.1407


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,K Neighbors Classifier,0.7013,0.7299,0.5747,0.6098,0.5917,0.3565,0.3569


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Linear Discriminant Analysis,0.7446,0.802,0.5402,0.7121,0.6144,0.4288,0.4379


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.7489,0.8076,0.6322,0.679,0.6548,0.4579,0.4586


#### Df NoZero: df_diabetes_nozero

In [59]:
X, y = getDfToSplit( df_diabetes_nozero )
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=13)
dfTrain = pd.concat( [X_train, y_train], axis=1)
dfTest  = pd.concat( [ X_test, y_test], axis=1)

s = setup( dfTrain, target='Outcome')

Unnamed: 0,Description,Value
0,session_id,8152
1,Target,Outcome
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(274, 9)"
5,Missing Values,False
6,Numeric Features,7
7,Categorical Features,1
8,Ordinal Features,False
9,High Cardinality Features,False


In [60]:
dicM['noZero'] = getModelsDic()

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8,0.8462,0.7143,0.7143,0.7143,0.5604,0.5604
1,0.8421,0.9231,0.6667,0.8,0.7273,0.6174,0.6225
2,0.7895,0.859,0.8333,0.625,0.7143,0.5529,0.5673
3,0.8421,0.869,0.7143,0.8333,0.7692,0.6503,0.6548
4,0.5789,0.8333,0.1429,0.3333,0.2,-0.027,-0.0315
5,0.6842,0.8095,0.4286,0.6,0.5,0.2785,0.2869
6,0.6842,0.7143,0.5714,0.5714,0.5714,0.3214,0.3214
7,0.6842,0.8929,0.7143,0.5556,0.625,0.3596,0.368
8,0.6842,0.6667,0.5714,0.5714,0.5714,0.3214,0.3214
9,0.8947,0.9643,0.8571,0.8571,0.8571,0.7738,0.7738


In [62]:
for model in dicM['noZero'].keys():
    predict_model( dicM['noZero'][model], data=dfTest );

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.7373,0.7803,0.6053,0.5897,0.5974,0.4025,0.4026


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.7712,0.7712,0.5789,0.6667,0.6197,0.4572,0.4596


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.7881,0.7974,0.5789,0.7097,0.6377,0.4901,0.4952


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Ada Boost Classifier,0.7627,0.7523,0.5263,0.6667,0.5882,0.4248,0.4306


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Naive Bayes,0.6949,0.6997,0.2632,0.5556,0.3571,0.1893,0.212


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,SVM - Linear Kernel,0.5169,0.623,0.9211,0.3933,0.5512,0.182,0.267


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,K Neighbors Classifier,0.6525,0.7186,0.5263,0.4651,0.4938,0.2308,0.2319


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Linear Discriminant Analysis,0.7288,0.7826,0.5789,0.5789,0.5789,0.3789,0.3789


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.7881,0.8092,0.6579,0.6757,0.6667,0.5114,0.5115


# AFINAMENTO DE HIPERPARAMETROS 

In [18]:
'''
Sem tratamento
Substituição pela média
Substituição pela moda     Regressão Logística - acurácia 77.49%

Substituição pela mediana
Remoção dos dados          Extreme Gradient    - acurácia 78.81%

Substituição pela moda     Análise Discriminante de Linear - 76.19% 

Remoção dos dados          Random Forest                   - 78.81%


dicDf = {
    'normal':df_diabetes,
    'mean':df_diabetes_mean,
    'mode': df_diabetes_mode,
    'median':df_diabetes_median,
    'noZero':df_diabetes_nozero
}

[ 'lr', 'rf', 'lda' ]
''';

In [24]:
X, y = getDfToSplit( dicDf['mean'] )
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=13)
dfTrain = pd.concat( [X_train, y_train], axis=1 )
dfTest  = pd.concat( [ X_test, y_test], axis=1 )

s = setup( dfTrain, target='Outcome' )

Unnamed: 0,Description,Value
0,session_id,563
1,Target,Outcome
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(537, 9)"
5,Missing Values,False
6,Numeric Features,7
7,Categorical Features,1
8,Ordinal Features,False
9,High Cardinality Features,False


In [25]:
modelRf = create_model( 'rf' )

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8684,0.8365,0.6667,0.8889,0.7619,0.6735,0.6868
1,0.7895,0.8397,0.5833,0.7,0.6364,0.4899,0.494
2,0.8158,0.9231,0.5,0.8571,0.6316,0.5199,0.5534
3,0.5789,0.6354,0.1538,0.2857,0.2,-0.0519,-0.0565
4,0.7895,0.8908,0.5385,0.7778,0.6364,0.495,0.5116
5,0.8378,0.885,0.6667,0.8,0.7273,0.6132,0.6184
6,0.7027,0.79,0.4167,0.5556,0.4762,0.2745,0.28
7,0.7027,0.82,0.4167,0.5556,0.4762,0.2745,0.28
8,0.7568,0.75,0.5,0.6667,0.5714,0.4064,0.4146
9,0.5676,0.63,0.1667,0.25,0.2,-0.0803,-0.0834


In [26]:
predict_model( modelRf, data=dfTest );

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.7489,0.7966,0.4828,0.7636,0.5915,0.4233,0.4465


In [27]:
modelRfT = tune_model( modelRf )

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8158,0.8526,0.6667,0.7273,0.6957,0.5639,0.565
1,0.7895,0.8494,0.5833,0.7,0.6364,0.4899,0.494
2,0.8158,0.9199,0.5833,0.7778,0.6667,0.543,0.5537
3,0.5789,0.6338,0.2308,0.3333,0.2727,-0.01,-0.0103
4,0.8684,0.9292,0.6923,0.9,0.7826,0.6906,0.7028
5,0.8649,0.8733,0.6667,0.8889,0.7619,0.6702,0.6837
6,0.7297,0.8,0.4167,0.625,0.5,0.3248,0.3374
7,0.7838,0.8567,0.5833,0.7,0.6364,0.4843,0.4884
8,0.7568,0.74,0.5,0.6667,0.5714,0.4064,0.4146
9,0.5405,0.6433,0.25,0.2727,0.2609,-0.0716,-0.0717


In [28]:
predict_model( modelRfT, data=dfTest );

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.7489,0.8073,0.5057,0.7458,0.6027,0.4289,0.4462


### Análise Discriminante de Linear

In [51]:
modelLda = create_model( 'lda' )

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6842,0.7415,0.5385,0.5385,0.5385,0.2985,0.2985
1,0.7895,0.8423,0.5714,0.8,0.6667,0.519,0.5347
2,0.7632,0.7887,0.6429,0.6923,0.6667,0.4834,0.4842
3,0.7895,0.9107,0.7143,0.7143,0.7143,0.5476,0.5476
4,0.7105,0.7054,0.5714,0.6154,0.5926,0.3686,0.3692
5,0.7297,0.8494,0.4615,0.6667,0.5455,0.3621,0.3745
6,0.8649,0.8141,0.6154,1.0,0.7619,0.6749,0.7136
7,0.8108,0.8686,0.7692,0.7143,0.7407,0.5921,0.5931
8,0.8378,0.8365,0.7692,0.7692,0.7692,0.6442,0.6442
9,0.7838,0.8494,0.6154,0.7273,0.6667,0.5083,0.5122


In [52]:
predict_model( modelLda, data=dfTrain );

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Linear Discriminant Analysis,0.7877,0.8455,0.6298,0.7081,0.6667,0.5117,0.5136


In [53]:
predict_model( modelLda, data=dfTest );

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Linear Discriminant Analysis,0.7619,0.8198,0.6092,0.7162,0.6584,0.4775,0.4812


In [54]:
modelLdaT = tune_model( modelLda )

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7895,0.8123,0.7692,0.6667,0.7143,0.549,0.5525
1,0.7105,0.8452,0.3571,0.7143,0.4762,0.3056,0.3407
2,0.7632,0.7976,0.6429,0.6923,0.6667,0.4834,0.4842
3,0.8158,0.8988,0.7143,0.7692,0.7407,0.5982,0.5992
4,0.7105,0.7381,0.5714,0.6154,0.5926,0.3686,0.3692
5,0.7568,0.8718,0.4615,0.75,0.5714,0.4148,0.4386
6,0.8378,0.8333,0.6154,0.8889,0.7273,0.6172,0.6384
7,0.8649,0.8878,0.7692,0.8333,0.8,0.6982,0.6995
8,0.8649,0.8718,0.8462,0.7857,0.8148,0.7087,0.7099
9,0.7568,0.8558,0.5385,0.7,0.6087,0.4365,0.4445


In [55]:
predict_model( modelLdaT, data=dfTrain );

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Linear Discriminant Analysis,0.7784,0.845,0.6022,0.6987,0.6469,0.4867,0.4896


In [56]:
predict_model( modelLdaT, data=dfTest );

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Linear Discriminant Analysis,0.7749,0.8271,0.6207,0.7397,0.675,0.5048,0.5094


In [58]:
save_model(modelLdaT, 'modelLdaT7749')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[], target='Outcome',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_stra...
                 ('dummy', Dummify(target='Outcome')),
                 ('fix_perfect', Remove_100(target='Outcome')),
                 ('clean_names', Clean_Colum_Names()),
                 ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'),
                 (

In [59]:
save_model(modelLda, 'modelLda7619')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[], target='Outcome',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_stra...
                 ('dummy', Dummify(target='Outcome')),
                 ('fix_perfect', Remove_100(target='Outcome')),
                 ('clean_names', Clean_Colum_Names()),
                 ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'),
                 (

### Regressão Logística

In [73]:
modelLr = create_model( 'lr' )

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7561,0.7831,0.5714,0.6667,0.6154,0.4384,0.4411
1,0.7561,0.8042,0.6429,0.6429,0.6429,0.4577,0.4577
2,0.7561,0.8862,0.5,0.7,0.5833,0.4176,0.4294
3,0.825,0.8234,0.6923,0.75,0.72,0.593,0.594
4,0.75,0.8348,0.4615,0.6667,0.5455,0.3808,0.3931
5,0.8,0.8205,0.5385,0.7778,0.6364,0.5046,0.5209
6,0.725,0.7151,0.4615,0.6,0.5217,0.3333,0.339
7,0.725,0.8063,0.6923,0.5625,0.6207,0.4086,0.414
8,0.775,0.7088,0.5,0.7778,0.6087,0.4611,0.4832
9,0.675,0.7555,0.4286,0.5455,0.48,0.2486,0.2524


In [74]:
predict_model( modelLr, data=dfTrain );

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.776,0.8374,0.5657,0.7226,0.6346,0.4765,0.484


In [75]:
predict_model( modelLr, data=dfTest );

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.776,0.813,0.5714,0.7547,0.6504,0.4902,0.5005


In [76]:
lrMTuned = tune_model( modelLr )

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7561,0.7831,0.5714,0.6667,0.6154,0.4384,0.4411
1,0.7561,0.8042,0.6429,0.6429,0.6429,0.4577,0.4577
2,0.7561,0.8862,0.5,0.7,0.5833,0.4176,0.4294
3,0.825,0.8234,0.6923,0.75,0.72,0.593,0.594
4,0.75,0.8319,0.4615,0.6667,0.5455,0.3808,0.3931
5,0.8,0.8177,0.5385,0.7778,0.6364,0.5046,0.5209
6,0.725,0.7208,0.4615,0.6,0.5217,0.3333,0.339
7,0.725,0.8063,0.6923,0.5625,0.6207,0.4086,0.414
8,0.775,0.7088,0.5,0.7778,0.6087,0.4611,0.4832
9,0.675,0.7555,0.4286,0.5455,0.48,0.2486,0.2524


In [77]:
predict_model( lrMTuned, data=dfTrain );

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.7743,0.8376,0.5606,0.7208,0.6307,0.4718,0.4795


In [78]:
predict_model( lrMTuned, data=dfTest );

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.776,0.8133,0.5714,0.7547,0.6504,0.4902,0.5005


In [79]:
modelLr

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=7088, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [81]:
lrMTuned

LogisticRegression(C=0.96, class_weight={}, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=7088, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [82]:
!ls

HackerRank.ipynb
[34mImagens TCC2[m[m
Testes Diabetes.ipynb
[34m__pycache__[m[m
best_pycaret_model.pkl
bot.ipynb
diabetes.csv
[34mgit xgboost[m[m
[34minstagram[m[m
logs.log
python diabetes model final - cópia.ipynb
python diabetes model final.ipynb
python diabetes model.ipynb
testeInsta.ipynb
testePython.py
xgboostM8136.pkl
xgboostM8136.pkl - cópia


In [83]:
save_model(lrMTuned, 'lrMTuned7760')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[], target='Outcome',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_stra...
                 ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'),
                 ('dfs', 'passthrough'), ('pca', 'passthrough'),
                 ['trained_model',
                  LogisticRegression(C=0.96, class_weight={}, dual=False,
                  

In [37]:
modelLr

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=3123, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [35]:
lrMTuned

LogisticRegression(C=1.78, class_weight={}, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=3123, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [111]:
X, y = getDfToSplit( df_diabetes )
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=13)
dfTrain = pd.concat( [X_train, y_train], axis=1)
dfTest  = pd.concat( [ X_test, y_test], axis=1)

s = setup( dfTrain, target='Outcome')

Unnamed: 0,Description,Value
0,session_id,1494
1,Target,Outcome
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(537, 9)"
5,Missing Values,False
6,Numeric Features,7
7,Categorical Features,1
8,Ordinal Features,False
9,High Cardinality Features,False


In [112]:
lrM = create_model( 'lr' )

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7368,0.7692,0.6154,0.6154,0.6154,0.4154,0.4154
1,0.7105,0.8092,0.3077,0.6667,0.4211,0.2615,0.2962
2,0.6579,0.7938,0.5385,0.5,0.5185,0.2538,0.2542
3,0.6842,0.7815,0.3846,0.5556,0.4545,0.2425,0.2506
4,0.7632,0.8214,0.7143,0.6667,0.6897,0.4985,0.4993
5,0.8108,0.7821,0.5385,0.875,0.6667,0.5448,0.5761
6,0.8108,0.859,0.7692,0.7143,0.7407,0.5921,0.5931
7,0.8919,0.9135,0.7692,0.9091,0.8333,0.7542,0.7599
8,0.7838,0.8109,0.4615,0.8571,0.6,0.4695,0.5118
9,0.7297,0.7981,0.6154,0.6154,0.6154,0.4071,0.4071


In [113]:
predict_model( lrM, data=dfTrain );

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.7691,0.8342,0.5801,0.6863,0.6287,0.4629,0.4663


In [114]:
predict_model( lrM, data=dfTest );

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.7749,0.8163,0.5977,0.7536,0.6667,0.5001,0.5078


In [115]:
lrMTuned = tune_model( lrM )

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7368,0.7754,0.6154,0.6154,0.6154,0.4154,0.4154
1,0.7105,0.8185,0.3077,0.6667,0.4211,0.2615,0.2962
2,0.6579,0.7908,0.5385,0.5,0.5185,0.2538,0.2542
3,0.6842,0.7846,0.3846,0.5556,0.4545,0.2425,0.2506
4,0.7632,0.8274,0.7143,0.6667,0.6897,0.4985,0.4993
5,0.7838,0.7917,0.4615,0.8571,0.6,0.4695,0.5118
6,0.8108,0.8654,0.7692,0.7143,0.7407,0.5921,0.5931
7,0.8919,0.9071,0.7692,0.9091,0.8333,0.7542,0.7599
8,0.7838,0.8109,0.4615,0.8571,0.6,0.4695,0.5118
9,0.7297,0.7981,0.6154,0.6154,0.6154,0.4071,0.4071


In [116]:
predict_model( lrMTuned, data=dfTrain );

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.7691,0.834,0.5856,0.6839,0.631,0.4644,0.4673


In [117]:
predict_model( lrMTuned, data=dfTest );

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.7749,0.8184,0.5977,0.7536,0.6667,0.5001,0.5078


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [86]:
predict_model( dicM['noZero']['xgboost'], data=dfTrain );

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.797,0.8348,0.5525,0.7812,0.6472,0.5106,0.5257


In [87]:
predict_model( dicM['noZero']['xgboost'], data=dfTest );

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.8095,0.8559,0.6207,0.8308,0.7105,0.573,0.5865


In [74]:
dicM.keys()

dict_keys(['normal', 'mean', 'mode', 'median', 'noZero'])

In [78]:
tuned = tune_model( dicM['noZero']['xgboost'] )

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6667,0.8878,0.8571,0.5,0.6316,0.3636,0.4082
1,0.8095,0.7959,0.8571,0.6667,0.75,0.6,0.6124
2,0.7619,0.9439,1.0,0.5833,0.7368,0.5455,0.6124
3,0.7143,0.7959,1.0,0.5385,0.7,0.4706,0.5547
4,0.5714,0.8673,1.0,0.4375,0.6087,0.2703,0.3953
5,0.75,0.7262,0.6667,0.5714,0.6154,0.4318,0.4346
6,0.85,0.9011,0.7143,0.8333,0.7692,0.6591,0.6634
7,0.6,0.7033,0.7143,0.4545,0.5556,0.2233,0.2423
8,0.85,0.8242,0.8571,0.75,0.8,0.6809,0.6847
9,0.7,0.7473,0.5714,0.5714,0.5714,0.3407,0.3407


In [84]:
dfTrain.head(3)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
747,1,81,74,41,57,46.3,1.096,32,0
419,3,129,64,29,115,26.4,0.219,28,1
576,6,108,44,20,130,24.0,0.813,35,0


In [136]:
dfTest.head(3)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
99,1,122,90,51,220,49.7,0.325,31,1
208,1,96,64,27,87,33.2,0.289,21,0
88,15,136,70,32,110,37.1,0.153,43,1


In [85]:
dfTest.head(3)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
99,1,122,90,51,220,49.7,0.325,31,1
208,1,96,64,27,87,33.2,0.289,21,0
88,15,136,70,32,110,37.1,0.153,43,1


  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [63]:
0.02645833*558

14.763748139999999

In [110]:
tuned

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7,
              enable_categorical=False, gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=3,
              min_child_weight=2, missing=nan, monotone_constraints='()',
              n_estimators=40, n_jobs=-1, num_parallel_tree=1,
              objective='binary:logistic', predictor='auto', random_state=2393,
              reg_alpha=0.7, reg_lambda=0.7, scale_pos_weight=3.8, subsample=1,
              tree_method='auto', use_label_encoder=True, validate_parameters=1,
              verbosity=0)

In [132]:
tuned

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=-1,
              num_parallel_tree=1, objective='binary:logistic',
              predictor='auto', random_state=6873, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='auto',
              use_label_encoder=True, validate_parameters=1, verbosity=0)

### Tunando modelo 

In [128]:
lp = '''base_score=0.5, booster='gbtree', colsample_bylevel=1,
  colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
  gamma=0, gpu_id=-1, importance_type=None,
  interaction_constraints='', learning_rate=0.300000012,
  max_delta_step=0, max_depth=6, min_child_weight=1,
  monotone_constraints='()', n_estimators=100, n_jobs=-1,
  num_parallel_tree=1, objective='binary:logistic',
  predictor='auto', random_state=6873, reg_alpha=0, reg_lambda=1,
  scale_pos_weight=1, subsample=1, tree_method='auto',
  use_label_encoder=True, validate_parameters=1, verbosity=0'''

ll = [ l.split("=") for l in lp.replace("\n  ","").replace(" ","").split(",")]

In [131]:
for l in ll:
    exec(f"tuned.{l[0]} = {l[1]}")

In [None]:
predict_model( tuned, data=dfTrain );

# Melhor Modelo 0.81 81

In [29]:
mx = load_model('xgboostM8136')

Transformation Pipeline and Model Successfully Loaded


In [30]:
predict_model( mx, data=dfTest );

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.8139,0.8553,0.7241,0.7683,0.7456,0.599,0.5997


In [24]:
predict_model( mx, data=dfTest );

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.8136,0.7918,0.6579,0.7353,0.6944,0.5609,0.5627


In [None]:
for key in dicDf.keys():
    print(f"Analisando DataFrame {key}")
    df = dicDf[ key ]
    
    #Dividindo em treino e teste
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=13)
    dfTrain = pd.concat( [X_train, y_train], axis=1)
    dfTest  = pd.concat( [ X_test, y_test], axis=1)
    
    s = setup( dfTrain, target='Outcome')
    compare_models( n_select=3 )
    print("- - - - - - - - "*4)

In [37]:
'''
PEGAR O MELHOR MODELO DE CADA CENARIO E TUNAR ELE

FALAR DA CRIAÇÃO DO BOT
''';