In [44]:
import os

# Data Manipulation
import numpy as np
import pandas as pd

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# Modeling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Model Tuning
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

# MLflow
import mlflow

# Point MLflow at the SQLite tracking database and select the experiment
# under which every run started in this notebook is recorded.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("stroke-prediction")

<Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='stroke-prediction', tags={}>

# Global Variables

In [17]:
# Single seed reused by the train/test split, SMOTE, the random forest,
# the cross-validation folds and the hyperparameter search space.
RANDOM_STATE = 12354

# Read Data

In [18]:
# Paths
# `root` records the working directory; the data locations themselves are
# relative paths, so they resolve against wherever the notebook is run from.
root = os.getcwd()
data_folder_path = "data"
data_file_path = os.path.join(
    data_folder_path,
    "healthcare-dataset-stroke-data.csv",
)

In [19]:
# Read Data
# Load the CSV at `data_file_path` into a DataFrame; the following cells
# split it into the feature matrix and the target column.
data = pd.read_csv(data_file_path)

# Preprocessing

1. Deal with missing values
2. Use SMOTE to deal with imbalanced data

In [20]:
# Selecting Features and Target
# The target is the "stroke" column; every other column is kept as a feature.
y_target = "stroke"

# Work on copies so later steps never write back into `data`.
y = data[y_target].copy()
X = data.drop(columns=[y_target])
X_features = X.columns.tolist()

In [21]:
# Train and Test Data
# Hold out 25% of the rows for testing. Stratifying on `y` keeps the class
# proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=RANDOM_STATE,
)

In [22]:
# Pipelines For Column Transformation
numeric_columns = ["age", "avg_glucose_level", "bmi"]
categorical_columns = [
    "Residence_type",
    "ever_married",
    "gender",
    "heart_disease",
    "hypertension",
    "smoking_status",
    "work_type",
]

# Numeric columns: fill missing values with the column median.
numeric_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
    ]
)

# Categorical columns: ordinal-encode first, mapping categories unseen at fit
# time to NaN, then fill every NaN with the most frequent encoded value.
categorical_pipeline = Pipeline(
    steps=[
        (
            "encoder",
            OrdinalEncoder(
                handle_unknown="use_encoded_value",
                unknown_value=np.nan,
            ),
        ),
        ("imputer", SimpleImputer(strategy="most_frequent")),
    ]
)

# Route each column group through its own pipeline.
column_trans = ColumnTransformer(
    [
        ("num_columns", numeric_pipeline, numeric_columns),
        ("cat_columns", categorical_pipeline, categorical_columns),
    ]
)

# Model Training

**Objectives:**

- We want to detect as many strokes as possible (**Recall** must be as large as possible).
- At the same time, we don't want too many false positives (**Precision** must be as large as possible).

**Metric to be optimized:**

- **f1_score**


<kbd>
  <img src="images/confusion_matrix.ppm">
</kbd>

In [51]:
# Adding SMOTE and Random forest to the pipeline
# Step order: column transformations -> SMOTE oversampling -> random forest.
# NOTE(review): SMOTE is applied to the ordinal-encoded categorical columns
# as well as the numeric ones; confirm this is intended.
rf_pipeline = Pipeline(
    steps=[
        ("transformations", column_trans),
        ("oversampling", SMOTE(random_state=RANDOM_STATE)),
        (
            "rf_estimator",
            RandomForestClassifier(
                n_estimators=1000,
                n_jobs=-1,
                class_weight="balanced",
                random_state=RANDOM_STATE,
            ),
        ),
    ]
)

## Tracking a Single Experiment Run

In [52]:
# Track one baseline run: fixed hyperparameters, scored on a validation split.
with mlflow.start_run():
    mlflow.set_tag("developer", "Chris")
    mlflow.set_tag("model_type", "RandomForest")

    rf_params = {"max_features": 3,
                 "max_depth": 4,
                 "ccp_alpha": 0.1}
    mlflow.log_params(rf_params)
    rf_pipeline.named_steps["rf_estimator"].set_params(**rf_params)

    # No validation set is defined earlier in the notebook, so carve one out
    # of the training data here. The test set stays untouched for the final
    # model, and X_train / y_train are left intact for the later cells.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train,
        y_train,
        test_size=0.25,
        stratify=y_train,
        random_state=RANDOM_STATE,
    )

    rf_pipeline.fit(X_fit, y_fit)

    y_pred = rf_pipeline.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    precission = precision_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    # Metric keys are kept exactly as before so runs stay comparable in MLflow.
    mlflow.log_metrics({
        "accuracy": accuracy,
        "precission": precission,
        "recall": recall,
        "f1": f1,
    })

## Tracking Multiple Experiment Runs

In [53]:
def objective(params):
    """Evaluate one hyperparameter candidate and record it as an MLflow run.

    The candidate is applied to the random-forest step of the shared
    ``rf_pipeline``, which is then scored with 5-fold stratified
    cross-validation on the training data using the F1 metric.

    Parameters
    ----------
    params : dict
        Keyword arguments for ``RandomForestClassifier.set_params``.

    Returns
    -------
    dict
        ``'loss'`` is the negated mean F1 across folds (the caller minimises
        it) and ``'status'`` is ``STATUS_OK``.
    """
    run_tags = (
        ("developer", "Chris"),
        ("model_type", "RandomForest"),
        ("tunning_type", "tpe"),
    )

    with mlflow.start_run():
        for tag_key, tag_value in run_tags:
            mlflow.set_tag(tag_key, tag_value)

        mlflow.log_params(params)
        rf_pipeline.named_steps["rf_estimator"].set_params(**params)

        folds = StratifiedKFold(
            n_splits=5,
            random_state=RANDOM_STATE,
            shuffle=True,
        )
        fold_scores = cross_val_score(
            rf_pipeline,
            X_train,
            y_train,
            cv=folds,
            scoring="f1",
            n_jobs=-1,
        )
        f1_mean = fold_scores.mean()

        mlflow.log_metric("f1_mean", f1_mean)

    # Negate the score because the optimiser minimises the loss.
    return {"loss": -f1_mean, "status": STATUS_OK}

# Hyperparameter search space. The quniform draws are wrapped in scope.int so
# the estimator receives integers; `random_state` is a constant entry, so it
# is passed to (and logged with) every candidate.
search_space = {
    "max_depth": scope.int(hp.quniform("max_depth", 4, 8, 1)),
    "max_features": scope.int(hp.quniform("max_features", 1, 6, 1)),
    "ccp_alpha": hp.uniform("ccp_alpha", 0.000001, 0.01),
    "random_state": RANDOM_STATE,
}

# Run 10 TPE-guided evaluations of `objective`.
# NOTE(review): no seed is passed to fmin, so the sampled candidates may
# differ between executions; confirm whether that is acceptable.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=10,
    trials=Trials(),
)

100%|██████████████████████████████████████████████| 10/10 [00:53<00:00,  5.32s/trial, best loss: -0.23660435660435658]


In [54]:
# Show the best parameters found by fmin. The quniform parameters are
# reported as floats (e.g. 6.0), so cast them to int before reusing them.
best_result

{'ccp_alpha': 0.0016218846126159721, 'max_depth': 6.0, 'max_features': 4.0}

# Best Model

In [55]:
# Refit the pipeline with the best hyperparameters on the full training set
# and record its performance on the held-out test set.
with mlflow.start_run():
    mlflow.set_tag("developer", "Chris")
    mlflow.set_tag("model_type", "RandomForest")

    # Values copied from the fmin result displayed above, with the integer
    # parameters written as ints.
    best_params = {'ccp_alpha': 0.0016218846126159721,
                   'max_depth': 6,
                   'max_features': 4}
    # Log the parameters that are actually applied to the estimator
    # (previously this logged the baseline run's `rf_params`).
    mlflow.log_params(best_params)
    rf_pipeline.named_steps["rf_estimator"].set_params(**best_params)

    rf_pipeline.fit(X_train, y_train)

    y_pred = rf_pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precission = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precission", precission)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1", f1)

In [None]:
# Use model registry 

In [None]:
# TODO: question for Dani -> is the ROC AUC score meant for comparing models?

In [None]:
# TODO: pipenv setup is still missing
# TODO: Docker setup is still missing