# 0.0 Imports 

In [167]:
import random

import numpy   as np
import pandas  as pd
import seaborn as sns
import xgboost as xgb


from IPython.display      import Image
from pandas_profiling     import ProfileReport
from IPython.core.display import HTML

from boruta                        import BorutaPy
from sklearn                       import tree
from sklearn                       import svm
from lightgbm                      import LGBMClassifier
from sklearn.dummy                 import DummyClassifier
from sklearn.metrics               import classification_report
from sklearn.metrics               import precision_score, accuracy_score, f1_score, recall_score
from sklearn.ensemble              import RandomForestClassifier
from sklearn.neighbors             import NearestCentroid
from sklearn.naive_bayes           import GaussianNB
from sklearn.linear_model          import LogisticRegression,SGDClassifier
from sklearn.model_selection       import StratifiedKFold, cross_val_score, train_test_split, RandomizedSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


## 0.1 Helper functions

In [136]:
def calculate_model_metrics( y_test, y_pred ):
    
    #calculate precision
    precision = precision_score( y_test, y_pred)

    #calculate accuracy
    accuracy = accuracy_score( y_test, y_pred)

    #calculate f1_score
    f1 = f1_score( y_test, y_pred)

    #calculate recall
    recall = recall_score( y_test, y_pred)
    
    return precision, accuracy, f1, recall
    def get_classifiers_performance(X_train, X_test, y_train, y_test, classifiers):

        # creates empty data frame
        df_performance = pd.DataFrame()

        for clf in classifiers:
            print("Training " + type(clf).__name__ + "...")
            # fits the classifier to training data
            clf.fit(X_train, y_train)

            # predict the probabilities
            y_pred = clf.predict(X_test)

            # calculates model metrics
            clf_precision, clf_accuracy, clf_f1, clf_recall = calculate_model_metrics( y_test, y_pred)

            # creates a dict
            clf_dict = {
                'model': [type(clf).__name__, '---'],
                'precision': [clf_precision, np.nan],
                'recall': [clf_recall, np.nan],
                'f1-Score': [clf_f1, np.nan],
                'accuracy': [clf_accuracy, np.nan]}

            # concatenate Data Frames
            df_performance = pd.concat([df_performance, pd.DataFrame(clf_dict)])

        # resets Data Frame index
        df_performance = df_performance.reset_index()

        # drops index
        df_performance.drop('index', axis=1, inplace=True)

        # gets only the odd numbered rows
        rows_to_drop = np.arange(1, len(classifiers)*2, 2)

        # drops unwanted rows that have no data
        df_performance.drop(rows_to_drop, inplace=True)

        # returns performance summary
        return df_performance


In [137]:
def jupyter_settings():
    %matplotlib inline
    %pylab inline
    
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [20, 10]
    plt.rcParams['font.size'] = 24
    
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False )
    
    sns.set()

In [121]:
jupyter_settings()
pd.set_option('display.float_format', lambda x: '%.5f' % x)

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


## 0.2 Data Loading

In [4]:
#open csv file

df_raw = pd.read_csv('../data/cardio_train.csv')


# 1.0 Data Description

In [6]:
df1 = df_raw.copy()

## 1.1 Rename Columns 

In [None]:
df1.columns


## 1.2 Data Dimensions

In [None]:
print("Number of rows {}".format(df1.shape[0]))
print("Number of cols {}".format(df1.shape[1]))

## 1.3 Data Types 

In [None]:
df1.dtypes

## 1.4 Check NA 


In [None]:
df1.isna().sum()


## 1.5 Descriptive Statistical

In [None]:
#Central tendency - mean, median
ct1 = pd.DataFrame(df1.apply(np.mean)).T
ct2 = pd.DataFrame(df1.apply(np.median)).T

#dispersion - std, min, max,  range, skew, kurtosis
d1 = pd.DataFrame(df1.apply(np.std)).T
d2 = pd.DataFrame(df1.apply(min)).T
d3 = pd.DataFrame(df1.apply(max)).T
d4 = pd.DataFrame(df1.apply(lambda x: x.max() - x.min())).T
d5 = pd.DataFrame(df1.apply(lambda x: x.skew())).T
d6 = pd.DataFrame(df1.apply(lambda x: x.kurtosis())).T

#concatenate
m = pd.concat([d2,d3,d4,ct1,ct2,d1,d5,d6]).T.reset_index()

m.columns = ['Atrib.','min','max','range','mean','median','std','skew','kurtosis']

In [None]:
m

# 2.0 Feature Engineering

In [7]:
df2 =df1.copy()

## 2.1 Mind Map

In [None]:
Image('img/mind_map_cardio.png')

## 2.1 Criação de Hipóteses

1. pessoas acima do peso deveriam ser mais propensas a ter cardio disease

2. pessoas acima do peso e fumantes deveriam ser mais propensas a ter cardio disease

3. pessoas acima do peso,fumantes e que não fazem atividade fisica deveriam ser mais propensas a ter cardio disease

4. pessoas fumantes e com colesterol muito acima do normal deveriam ser mais propensas a ter cardio disease

5. Homens com colesterol acima do normal e que não praticam atividade fisica deveriam ter mais chance de ter cardio disease

6. pessoas que com glicose mto acima do normal e colesterol mto acima do normal e que não praticam atividade fisica deveriam ser mais propensas a ter cardio disease

7. pessoas mais velhas e acima do peso deveriam ser mais propensas a ter cardio disease

8. pessoas mais velhas e com o colesterol mto acima do normal deveriam ser mais propensas a ter cardio disease

9. Homens deveriam ter mais chance de adquirir cardio disease do que as mulheres

10. homens que fumam e bembe deveriam ter mais chances do que mulheres que fumam e bebem.

## 2.2 Feature Engineering

In [8]:
#blood_pressure
# 1 - ap_hi < 120 e ap_lo < 80 
# 2 -  120<= ap_hi <= 129 e ap_lo < 80
# 3 -  130<= ap_hi <= 139 e 80 <= ap_lo <= 89
# 4 -  139 < ap_hi  e 89 < ap_lo 

df2['blood_pressure'] = df2.apply( (lambda x: 1 if (x['ap_hi'] < 120) & (x['ap_lo'] <= 80) else 2 if (120 <= x['ap_hi'] <= 129) & (x['ap_lo'] < 80) else 3 if (130 <= x['ap_hi'] < 139) or ( 80 <= x['ap_lo'] <= 89) else 4 ),axis =1 ) 

 
    



In [9]:
#IMC = WEIGHT/HEIGHT^2
df2['height2'] = df2['height']/100

df2['imc'] = (df2['weight']/(df2['height2']**2))

In [10]:
#imc_type
df2['imc_type'] = df2.apply( (lambda x : 1 if x['imc'] < 18.5 else 2 if  18.5 <= x['imc'] < 25.0 else 3 if 25 <= x['imc'] < 30.0 else 4 if  30.0 <= x['imc'] < 35.0 else 5 if 35 <= x['imc'] < 40.0 else 6) ,axis=1)

In [12]:
#numero de pessoas com cardio por idade
df2['age_years'] = df2['age'].apply(lambda x: x/365)

#arredondar

df2['age_years'] = round(df2['age_years'], ndigits=0).astype(int)



## 2.3 Exclusão de colunas auxiliares

In [13]:
cols_drop = ['height2','imc']
df2 = df2.drop(cols_drop,axis=1)

# 3.0 EDA

In [14]:
df3 = df2.copy()

## 3.1 Analise Univariada

### 3.1.2 Variaveis Numericas 

In [None]:
df3.hist(bins=25);

In [None]:
#more detailed plot of variables

#ProfileReport(df3, title="Pandas Profiling Report")


## 3.2 Analise Bivariada 

### H1- Pessoas acima  do peso ideal(levando em consideração o IMC) deveriam ser mais propensas a ter cardio disease
**TRUE** pessoas acima do seu ideal são mais propensas a ter cardio disease

In [None]:

#plots graph
sns.countplot(x='imc_type', hue='cardio', data=df3);

#label
plt.title("Cardio por Tipo IMC");
plt.xlabel("Tipos de IMC");
plt.ylabel("Quantidade");

#plt.show()







### H2- Pessoas mais velhas  deveriam ser mais propensas a ter cardio disease
**TRUE** Pessoas mais velhas tem um indice maior de cardio disease

In [None]:

#plot graph
sns.countplot(x='age_years',hue='cardio',data=df3);



### H3- Pessoas que fumam deveriam ser mais propensas a ter cardio disease
**FALSE** pessoas que NÃO FUMAM tem um indice MAIOR de ter cardio diseas

In [None]:


#plots graph
sns.countplot(x='smoke',hue='cardio', data=df3);




### H4- Pessoas que bebem alcool deveriam ser mais propensas a ter cardio disease
**False** Pessoas que BEBEM NÃO TEM mais propensão para cardio disease

In [None]:
#plot graph
sns.countplot(x='alco',hue='cardio',data =df3);


plt.xlabel("Consome Alcool?");

### H5- Pessoas que não praticam atividade fisica deveriam ser mais propensas a ter cardio disease
**Verdade** 

In [None]:
#plot graph
sns.countplot(x='active', hue= 'cardio',data = df3);

plt.xlabel("pratica atividade fisica?");


### H6- Pessoas que tem o colesterol muito acima do normal deveriam ser mais propensas a ter cardio disease
**VERDADE**

In [None]:
#plot graph
sns.countplot(x = 'cholesterol', hue= 'cardio', data = df3);

plt.xticks(ticks=[0,1,2],labels=['Normal','Acima do Normal','Muito acima do normal']);
plt.xlabel("Nivel de colesterol");


### H7- Pessoas que tem o blood pressure 3 e 4 deveriam ser mais propensas a ter cardio disease
**Apenas pessoas com blood pressure 4 tem mais propensão**

In [None]:
#plot graph
sns.countplot(x='blood_pressure', hue = 'cardio' ,data = df3);



### H8- Pessoas que tem nivel de glicose mais alto deveriam ser mais propensas a ter cardio disease
**VERDADE**

In [None]:

#plot graph
sns.countplot(x = 'gluc' , hue = 'cardio',data = df3);

plt.title("Cardio por Nivel de Glicose",fontsize=20);
plt.xlabel("Nivel de Glicose");
plt.ylabel("Quantidade")
plt.xticks(ticks=[0,1,2],labels=['Normal','Acima do normal','Muito acima do normal']);


### H9 Homens deveriam ter mais chance de adquirir cardio disease do que as mulheres.
**FALSO**



In [None]:

sns.countplot(x = 'gender', hue = 'cardio', data = df3);
plt.title("Cardio Por Genero");
plt.xticks(ticks=[0,1],labels=['Homem','Mulher']);


### H10 homens que fumam deveriam ter mais chances do que mulheres que fumam 
**True**

In [None]:

sns.barplot(x = 'smoke', y = 'cardio', hue = 'gender', data = df3);



## 3.3 Analise Multivariada 

### 3.3.1 Variaveis numericas 

In [None]:
#Image('img/Pearson.png')
num = df3.select_dtypes(include = ['float64', 'int64'])
corre = num.corr(method = 'pearson')
sns.heatmap(corre,annot=True);

### 3.3.2 Variaveis Categoricas


In [None]:
Image('img/Cramer.png')

# 4.0 Data Preparation 

In [15]:
df4 = df3.copy()

# 5.0 Feature Selection

In [16]:
df5 = df4.copy()

## 5.1 split train and test dataset 

In [17]:
#training data set
x_train, x_test, y_train, y_test = train_test_split(df5,df5['cardio'], random_state = 42, test_size= 0.25)

In [18]:
#delete columns
x_train = x_train.drop( [ 'id','cardio','age_years'], axis=1 )
x_test = x_test.drop( ['id','cardio','age_years'], axis=1  )



In [19]:
#training dataset for cross validation
x_train_cv = x_train
y_train_cv = y_train

## 5.2 Boruta as Feature Selector

# 6.0 Machine Learning Models

In [20]:
df6 = df5.copy()

## 6.1 Dummy Classifier - Baseline 

## 6.2 Nearest Centroid Classifier

## 6.3 Decision Tree Classifier 




## 6.4 Naive Bayes


## 6.5 Logistic Regression

## 6.6 LDA classifier

## 6.7 SVM Classifier 

##  6.8 XGBClassifier 

## 6.9 SGDClassifier 

## 6.10 LGBMClassifier 

# 7.0 Cross validation

In [21]:
#kfold strategy
cv1 = StratifiedKFold( n_splits = 10)

## 7.1 Nearest Centroid Classifier - Cross validation

In [22]:
nc = NearestCentroid()
cv_scores = cross_val_score( nc, x_train_cv, np.ravel( y_train_cv), cv =cv1, scoring ='precision', n_jobs=-1 )
print(" média precisão %0.4f (+/- %0.4f)" %(cv_scores.mean(), cv_scores.std()*2 ) )

cv_scores = cross_val_score( nc, x_train_cv,  y_train_cv, cv =cv1, scoring ='recall', n_jobs=-1 )
print("\n média recall %0.4f (+/- %0.4f)" %(cv_scores.mean(), cv_scores.std()*2 ) )

cv_scores = cross_val_score( nc, x_train_cv, np.ravel( y_train_cv), cv =cv1, scoring ='f1', n_jobs=-1 )
print("\n média f1 %0.4f (+/- %0.4f)" %(cv_scores.mean(), cv_scores.std()*2 )  )
      

 média precisão 0.5867 (+/- 0.0148)

 média recall 0.6449 (+/- 0.0262)

 média f1 0.6144 (+/- 0.0182)


## 7.2 Decision Tree Classifier - Cross Validation




In [23]:
dtc = tree.DecisionTreeClassifier()
cv_scores = cross_val_score( dtc, x_train_cv, np.ravel( y_train_cv), cv =cv1, scoring ='precision', n_jobs=-1 )
print(" média precisão %0.4f (+/- %0.4f)" %(cv_scores.mean(), cv_scores.std()*2 ) )

cv_scores = cross_val_score( dtc, x_train_cv,  y_train_cv, cv =cv1, scoring ='recall', n_jobs=-1 )
print("\n média recall %0.4f (+/- %0.4f)" %(cv_scores.mean(), cv_scores.std()*2 ) )

cv_scores = cross_val_score( dtc, x_train_cv, np.ravel( y_train_cv), cv =cv1, scoring ='f1', n_jobs=-1 )
print("\n média f1 %0.4f (+/- %0.4f)" %(cv_scores.mean(), cv_scores.std()*2 )  )
      

 média precisão 0.6311 (+/- 0.0073)

 média recall 0.6309 (+/- 0.0199)

 média f1 0.6302 (+/- 0.0130)


## 7.3 Naive Bayes - Cross Validation


In [24]:
gnb = GaussianNB()
cv_scores = cross_val_score( gnb, x_train_cv, np.ravel( y_train_cv), cv =cv1, scoring ='precision', n_jobs=-1 )
print(" média precisão %0.4f (+/- %0.4f)" %(cv_scores.mean(), cv_scores.std()*2 ) )

cv_scores = cross_val_score( gnb, x_train_cv,  y_train_cv, cv =cv1, scoring ='recall', n_jobs=-1 )
print("\n média recall %0.4f (+/- %0.4f)" %(cv_scores.mean(), cv_scores.std()*2 ) )

cv_scores = cross_val_score( gnb, x_train_cv, np.ravel( y_train_cv), cv =cv1, scoring ='f1', n_jobs=-1 )
print("\n média f1 %0.4f (+/- %0.4f)" %(cv_scores.mean(), cv_scores.std()*2 )  )
      

 média precisão 0.7276 (+/- 0.0137)

 média recall 0.4271 (+/- 0.0635)

 média f1 0.5375 (+/- 0.0523)


## 7.4 Logistic Regression - Cross Validation

In [25]:
lr = LogisticRegression()
cv_scores = cross_val_score( lr, x_train_cv, np.ravel( y_train_cv), cv =cv1, scoring ='precision', n_jobs=-1 )
print(" média precisão %0.4f (+/- %0.4f)" %(cv_scores.mean(), cv_scores.std()*2 ) )

cv_scores = cross_val_score( lr, x_train_cv,  y_train_cv, cv =cv1, scoring ='recall', n_jobs=-1 )
print("\n média recall %0.4f (+/- %0.4f)" %(cv_scores.mean(), cv_scores.std()*2 ) )

cv_scores = cross_val_score( lr, x_train_cv, np.ravel( y_train_cv), cv =cv1, scoring ='f1', n_jobs=-1 )
print("\n média f1 %0.4f (+/- %0.4f)" %(cv_scores.mean(), cv_scores.std()*2 )  )

 média precisão 0.7025 (+/- 0.0163)

 média recall 0.6757 (+/- 0.0634)

 média f1 0.6884 (+/- 0.0292)


## 7.5 LDA classifier - Cross Validation

In [26]:
lda = LinearDiscriminantAnalysis()
cv_scores = cross_val_score( lda, x_train_cv, np.ravel( y_train_cv), cv =cv1, scoring ='precision', n_jobs=-1 )
print(" média precisão %0.4f (+/- %0.4f)" %(cv_scores.mean(), cv_scores.std()*2 ) )

cv_scores = cross_val_score( lda, x_train_cv,  y_train_cv, cv =cv1, scoring ='recall', n_jobs=-1 )
print("\n média recall %0.4f (+/- %0.4f)" %(cv_scores.mean(), cv_scores.std()*2 ) )

cv_scores = cross_val_score( lda, x_train_cv, np.ravel( y_train_cv), cv =cv1, scoring ='f1', n_jobs=-1 )
print("\n média f1 %0.4f (+/- %0.4f)" %(cv_scores.mean(), cv_scores.std()*2 )  )

 média precisão 0.6874 (+/- 0.0128)

 média recall 0.7253 (+/- 0.0221)

 média f1 0.7058 (+/- 0.0131)


##  7.6 SVM Classifier  - Cross Validation

In [27]:
svm = svm.SVC()
cv_scores = cross_val_score( svm, x_train_cv, np.ravel( y_train_cv), cv =cv1, scoring ='precision', n_jobs=-1 )
print(" média precisão %0.4f (+/- %0.4f)" %(cv_scores.mean(), cv_scores.std()*2 ) )

cv_scores = cross_val_score( svm, x_train_cv,  y_train_cv, cv =cv1, scoring ='recall', n_jobs=-1 )
print("\n média recall %0.4f (+/- %0.4f)" %(cv_scores.mean(), cv_scores.std()*2 ) )

cv_scores = cross_val_score( svm, x_train_cv, np.ravel( y_train_cv), cv =cv1, scoring ='f1', n_jobs=-1 )
print("\n média f1 %0.4f (+/- %0.4f)" %(cv_scores.mean(), cv_scores.std()*2 )  )



KeyboardInterrupt: 

##  7.7 XGBClassifier - Cross Validation


In [28]:
xgb = xgb.XGBClassifier(objective='binary:logistic')
cv_scores = cross_val_score( xgb, x_train_cv, np.ravel( y_train_cv), cv =cv1, scoring ='precision', n_jobs=-1 )
print(" média precisão %0.4f (+/- %0.4f)" %(cv_scores.mean(), cv_scores.std()*2 ) )

cv_scores = cross_val_score( xgb, x_train_cv,  y_train_cv, cv =cv1, scoring ='recall', n_jobs=-1 )
print("\n média recall %0.4f (+/- %0.4f)" %(cv_scores.mean(), cv_scores.std()*2 ) )

cv_scores = cross_val_score( xgb, x_train_cv, np.ravel( y_train_cv), cv =cv1, scoring ='f1', n_jobs=-1 )
print("\n média f1 %0.4f (+/- %0.4f)" %(cv_scores.mean(), cv_scores.std()*2 )  )

 média precisão 0.7522 (+/- 0.0094)

 média recall 0.6908 (+/- 0.0173)

 média f1 0.7202 (+/- 0.0109)


## 7.8 SGDClassifier - Cross Validation


In [29]:
sgd = SGDClassifier(max_iter=1000, tol=1e-3)
cv_scores = cross_val_score( sgd, x_train_cv, np.ravel( y_train_cv), cv =cv1, scoring ='precision', n_jobs=-1 )
print(" média precisão %0.4f (+/- %0.4f)" %(cv_scores.mean(), cv_scores.std()*2 ) )

cv_scores = cross_val_score( sgd, x_train_cv,  y_train_cv, cv =cv1, scoring ='recall', n_jobs=-1 )
print("\n média recall %0.4f (+/- %0.4f)" %(cv_scores.mean(), cv_scores.std()*2 ) )

cv_scores = cross_val_score( sgd, x_train_cv, np.ravel( y_train_cv), cv =cv1, scoring ='f1', n_jobs=-1 )
print("\n média f1 %0.4f (+/- %0.4f)" %(cv_scores.mean(), cv_scores.std()*2 )  )

 média precisão 0.6905 (+/- 0.2635)

 média recall 0.7176 (+/- 0.7902)

 média f1 0.3086 (+/- 0.5335)


## 7.9 LGBMClassifier - Cross Validation

In [31]:
lgbm_clf = LGBMClassifier()
cv_scores = cross_val_score( lgbm_clf, x_train_cv, np.ravel( y_train_cv), cv =cv1, scoring ='precision', n_jobs=-1 )
print("média precisão %0.4f (+/- %0.4f)" %(cv_scores.mean(), cv_scores.std()*2 ) )

cv_scores = cross_val_score( lgbm_clf, x_train_cv,  y_train_cv, cv =cv1, scoring ='recall', n_jobs=-1 )
print("\n média recall %0.4f (+/- %0.4f)" %(cv_scores.mean(), cv_scores.std()*2 ) )

cv_scores = cross_val_score( lgbm_clf, x_train_cv, np.ravel( y_train_cv), cv =cv1, scoring ='f1', n_jobs=-1 )
print("\n média f1 %0.4f (+/- %0.4f)" %(cv_scores.mean(), cv_scores.std()*2 )  )
      
cv_scores = cross_val_score( lgbm_clf, x_train_cv, np.ravel( y_train_cv), cv =cv1, scoring ='roc_auc', n_jobs=-1 )
print("\n média roc %0.4f (+/- %0.4f)" %(cv_scores.mean(), cv_scores.std()*2 ))

média precisão 0.7562 (+/- 0.0147)

 média recall 0.6934 (+/- 0.0167)

 média f1 0.7234 (+/- 0.0114)

 média roc 0.8021 (+/- 0.0121)


# 8.0 Hyperparemeter fine tuning

## 8.1 Random search


In [164]:
# instantiate the classifier

lgbm_clf = LGBMClassifier(random_state=33, n_jobs=-1)

In [165]:
 # set up space dictionary with specified hyperparameters
param = {'max_depth': np.arange(2, 12, 2), 
         'num_leaves': 2 ** np.arange(2, 10, 2),
         'min_data_in_leaf': np.arange(100, 1050, 50), 
         'learning_rate': np.linspace(0.001, 0.6, 15),
         'colsample_bytree': np.linspace(0.1, 1, 5),
         'subsample': np.linspace(0.25, 1, 15),
         'n_estimators': np.arange(10, 105, 15)}

In [173]:
# prepare RandomizedSearchCV 
lgbm_clf_cv = RandomizedSearchCV(estimator=lgbm_clf, param_distributions=param,
                                scoring='recall', n_iter=100, cv=5, verbose=2,
                                random_state=33, n_jobs=-1)

In [174]:
# fits 
lgbm_clf_cv.fit(x_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 150 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 353 tasks      | elapsed:   14.8s




[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   21.3s finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=LGBMClassifier(boosting_type='gbdt',
                                            class_weight=None,
                                            colsample_bytree=0.526,
                                            importance_type='split',
                                            learning_rate=0.2465, max_depth=-1,
                                            min_child_samples=20,
                                            min_child_weight=0.001,
                                            min_split_gain=0.0,
                                            n_estimators=5000, n_jobs=-1,
                                            num_leaves=31, objective=None,
                                            random_state=33, reg_alpha=0.0,
                                            reg_lambda=0.0,...
                                        'n_estimators': array([ 10,  25,  40,  55,  70,  85, 100]),
                          

In [178]:
#best parameters
lgbm_clf_cv.best_estimator_


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.775,
               importance_type='split', learning_rate=0.5144285714285713,
               max_depth=4, min_child_samples=20, min_child_weight=0.001,
               min_data_in_leaf=900, min_split_gain=0.0, n_estimators=40,
               n_jobs=-1, num_leaves=4, objective=None, random_state=33,
               reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=0.625,
               subsample_for_bin=200000, subsample_freq=0)

## 8.2 Final Model


In [181]:
#MODEL
lgbm = lgbm_clf_cv.best_estimator_



## 8.3 Metrics Final Model

In [186]:
#Perfomance cross validation

cv_scores = cross_val_score( lgbm, x_train_cv, np.ravel( y_train_cv), cv =cv1, scoring ='precision', n_jobs=-1 )
print("Average Precision %0.4f (+/- %0.4f)" %(cv_scores.mean(), cv_scores.std()*2 ) )

cv_scores = cross_val_score( lgbm, x_train_cv,  y_train_cv, cv =cv1, scoring ='recall', n_jobs=-1 )
print("\n Average Recall %0.4f (+/- %0.4f)" %(cv_scores.mean(), cv_scores.std()*2 ) )

cv_scores = cross_val_score( lgbm, x_train_cv, np.ravel( y_train_cv), cv =cv1, scoring ='f1', n_jobs=-1 )
print("\n Average f1-score %0.4f (+/- %0.4f)" %(cv_scores.mean(), cv_scores.std()*2 )  )

Average Precision 0.7505 (+/- 0.0148)

 Average Recall 0.6986 (+/- 0.0183)

 Average f1-score 0.7236 (+/- 0.0126)


## 8.4 Cross validation summary

|     Model                                  | Avg Precision             | Avg Recall                | Avg f1\-score             
|--------------------------------------------|---------------------------|---------------------------|---------------------------
| LGBM Classifier                            | 0\.7562 \(\+/\- 0\.0147\) | 0\.0934 \(\+/\- 0\.0167\) | 0\.7234 \(\+/\- 0\.0114\) 
| LGBM Classifier \(Tuned HP\)               | 0.7505 (+/- 0\.0148\)     | 0\.6989 \(\+/\- 0\.0183\) | 0\.7236 \(\+/\- 0\.0126\) 


## 8.5 Calculate Business perfomance

In [189]:
#model performance business
price_per_percent = 500/5

baseline = 50
num_pacients = 70000

model_accuracy = cv_scores.mean()
deviation = cv_scores.std() * 2


accuracy_lower = (model_accuracy - deviation) * 100
accuracy_upper = (model_accuracy + deviation) * 100

percent_difference_lower = accuracy_lower - baseline
percent_difference_upper = accuracy_upper - baseline

amount_best = percent_difference_upper * price_per_percent * num_pacients
amount_worst = percent_difference_lower * price_per_percent * num_pacients

print(f'Best:  ${round(amount_best, 2):,.2f}')
print(f'Worst: ${round(amount_worst, 2):,.2f}')

Best:  $165,305,455.95
Worst: $147,682,954.67


In [192]:
#today's performance

price_per_percent = 500/5

baseline = 50
num_pacients = 70000



accuracy_lower = 55.0
accuracy_upper = 65.0

percent_difference_lower = accuracy_lower - baseline
percent_difference_upper = accuracy_upper - baseline

amount_best = percent_difference_upper * price_per_percent * num_pacients
amount_worst = percent_difference_lower * price_per_percent * num_pacients

print(f'Best:  ${round(amount_best, 2):,.2f}')
print(f'Worst: ${round(amount_worst, 2):,.2f}')

Best:  $105,000,000.00
Worst: $35,000,000.00


# 9.0 Business performance




The price of the diagnosis, paid by the client, varies according to the precision achieved by the team of specialists.

| Exam Accuracy | Price          | Rules                                    | Example                         |
|:--------------|:---------------|:-----------------------------------------|:--------------------------------|
| Above 50%     | min \$500\.00  | \+\$500 for each additional 5% precision | Precision = 55% \-> \$1,000\.00 |
| Up to 50%     | $0\.00         | N/A                                      | N/A                             |




translating my model performance it to business numbers.

| Exam Accuracy | Best Scenario    | Worst Scenario         |
|:--------------|:-----------------|:---------------------- |
| Our Model     | \$165,305,455.95 | \$147,682,954\.67      |
| Today         | \$105,000,000.00 | \$35,000,000\.00       |  