In [11]:
import os

# Data Manipulation
import numpy as np
import pandas as pd

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# Modeling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

# MLflow
import mlflow

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("stroke-prediction")

<Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='stroke-prediction', tags={}>

# Global Variables

In [2]:
RANDOM_STATE = 12354

# Read Data

In [3]:
# Paths
root = os.getcwd()
data_folder_path = os.path.join("data")
data_file_path = os.path.join(data_folder_path, "healthcare-dataset-stroke-data.csv")

In [4]:
# Read Data
data = pd.read_csv(data_file_path)

# Preproccesing

1. Deal with missing values
2. Use SMOTE to deal with imbalanced data

In [5]:
# Selecting Features and Target
y_target = "stroke"
X_features = [c for c in data.columns if c != y_target]

y = data[y_target].copy()
X = data[X_features].copy()

In [6]:
# Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = RANDOM_STATE)

In [7]:
# Pipelines For Column Tranformation
numeric_columns = ['age', 'avg_glucose_level', 'bmi']
categorical_columns = ['Residence_type', 'ever_married', 'gender', 'heart_disease',
                       'hypertension', 'smoking_status', 'work_type']

numeric_pipeline = Pipeline(steps = 
                            [("imputer", SimpleImputer(strategy = "median"))])

categorical_pipeline = Pipeline(steps =
                                [("encoder", OrdinalEncoder(handle_unknown = "use_encoded_value",
                                                            unknown_value = np.nan)),
                                ("imputer", SimpleImputer(strategy = "most_frequent"))])

column_trans = ColumnTransformer(
    [("num_columns", numeric_pipeline, numeric_columns),
     ("cat_columns", categorical_pipeline, categorical_columns)])

# Model Trainning

Recall:
Precission.
Definir métrica
f1_score
roc

In [8]:
# Adding SMOTE and Random forest to the pipeline
rf_pipeline = Pipeline(steps = [
        ("transformations", column_trans),
        ("oversampling", SMOTE(random_state = RANDOM_STATE)),
        ("rf_estimator", RandomForestClassifier())
])

## Tracking a Single Experiment Run

In [32]:
with mlflow.start_run():
    mlflow.set_tag("developer", "Chris")
    
    rf_params = {"n_estimators": 1000,
                "max_features": 5,
                "max_depth": 4,
                "ccp_alpha": 0.1,
                "random_state": RANDOM_STATE,
                "n_jobs": -1}
    mlflow.log_params(rf_params)
    rf_pipeline.named_steps["rf_estimator"].set_params(**rf_params)
    
    rf_pipeline.fit(X_train, y_train)
    
    y_pred = rf_pipeline.predict(X_test)
    mlflow.log_metric("precission", precision_score(y_test, y_pred))
    mlflow.log_metric("recall", recall_score(y_test, y_pred))
    mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred))

## Tracking a Multiples Experiment Runs

In [41]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

# Without Cross Validation
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("developer", "Chris")
        mlflow.set_tag("model_type", "RandomForest")
        mlflow.set_tag("tunning_type", "tpe")
        
        mlflow.log_params(params)
        rf_pipeline.named_steps["rf_estimator"].set_params(**params)
        
        rf_pipeline.fit(X_train, y_train)
        
        y_pred = rf_pipeline.predict(X_test)
        recall = recall_score(y_test, y_pred)
        precission = precision_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        mlflow.log_metric("precission", precission)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1", f1)

    return {'loss': -f1, 'status': STATUS_OK}

search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    "n_estimators": scope.int(hp.quniform('n_estimators', 100, 1000, 1)),
    "max_features": scope.int(hp.quniform('max_features', 2, 8, 1)),
    "ccp_alpha": hp.uniform('ccp_alpha', 0.0000000001, 0.1),
    'random_state': RANDOM_STATE 
}

best_result = fmin(
    fn = objective,
    space = search_space,
    algo = tpe.suggest,
    max_evals = 50,
    trials = Trials()
)

100%|███████████████████████████████████████████████████████████████████████████████| 50/50 [00:57<00:00,  1.14s/trial, best loss: -0.20848056537102472]


In [None]:
# Mlflow colaborativo
# Tracking
# Despliegue Fácil

In [None]:
# Permisos

In [44]:
y_train

1731    0
2073    0
3444    0
4536    0
1428    0
       ..
2287    0
4731    0
3370    0
22      1
3172    0
Name: stroke, Length: 3832, dtype: int64

In [42]:
X_train

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
1731,54058,Female,22.0,0,0,No,Private,Urban,56.84,29.9,smokes
2073,49615,Female,12.0,0,0,No,children,Urban,58.14,21.3,never smoked
3444,51116,Female,40.0,0,0,Yes,Self-employed,Urban,64.66,25.0,formerly smoked
4536,59405,Female,68.0,1,0,Yes,Private,Urban,150.74,40.3,Unknown
1428,46373,Female,57.0,0,0,Yes,Private,Rural,169.97,25.8,never smoked
...,...,...,...,...,...,...,...,...,...,...,...
2287,15964,Female,64.0,1,0,Yes,Private,Rural,99.40,29.1,never smoked
4731,17130,Female,23.0,0,0,No,Private,Rural,76.56,30.1,never smoked
3370,46767,Female,8.0,0,0,No,children,Rural,67.84,24.0,Unknown
22,68794,Female,79.0,0,0,Yes,Self-employed,Urban,228.70,26.6,never smoked


In [None]:
# De los resultados de validación cruzada de sklearn como los puedo ver en el mlflow para ver la distribución de estos errores?

In [None]:
# No cross validation

In [9]:
# Poner que metrica queremos minimizar

In [None]:
# SMOTE

In [10]:
# False Negative: Avoid people with high stroke probability being negatively diagnosed
# True positives: Diagnose all people who are actually likely to have a stroke.