In [2]:
# !pip install -U tfx
# !pip install apache-beam==2.39.0
# !pip install pandas-tfrecords
# !pip install mlflow

In [3]:
from absl import logging
from getpass import getpass
from imblearn.over_sampling import RandomOverSampler 
import IPython
import mlflow
import mlflow.sklearn
import numpy as np
import os
import pandas as pd
import pandas_tfrecords as pdtfr
import pathlib
import pprint
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score
import tensorflow as tf
import tensorflow_transform as tft
from tfx import v1 as tfx
from urllib.parse import urlparse

# To ignore warnings from TF
tf.get_logger().setLevel('ERROR')
# Set default logging level
logging.set_verbosity(logging.INFO) 

# For formatting print statements
pp = pprint.PrettyPrinter()

# Display versions of TF and TFX related packages
print('TensorFlow version: {}'.format(tf.__version__))
print('TFX version: {}'.format(tfx.__version__))

TensorFlow version: 2.9.1
TFX version: 1.9.0


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Location of the data and model directory
DATA_DIR = '/content/drive/My Drive/Stroke Prediction ML System'
DATA_ROOT = f'{DATA_DIR}/data'

# Set the paths to the reduced dataset
DATA_DIR_SELECT = f'{DATA_ROOT}/select'
TRAINING_ROOT = f'{DATA_DIR}/training'
TESTING_ROOT = f'{DATA_DIR}/testing'
SERVING_ROOT = f'{DATA_DIR}/serving'

TRAINING_DATA = f'{TRAINING_ROOT}/stoke_prediction_training_dataset.csv'
TESTING_DATA = f'{TESTING_ROOT}/stoke_prediction_testing_dataset.csv'
SERVING_DATA = f'{SERVING_ROOT}/stoke_prediction_serving_dataset.csv'

# Names of transformed data files
TRANSFORMED_TRAIN_DATA = 'train_transformed'
TRANSFORMED_TEST_DATA = 'test_transformed'
OUTPUT_DIR = f'{DATA_ROOT}/transformed'

# Set random seed
RANDOM_SEED = 0

In [6]:
FEATURE_KEYS = ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status', 'stroke']

LABEL_KEY = 'stroke'

In [7]:
def get_dataset(working_dir, filebase):
  tf_transform_output = tft.TFTransformOutput(working_dir)

  data_path_pattern = os.path.join(working_dir, filebase + '-00000-of-00001')
  
  dataset = pdtfr.tfrecords_to_pandas(file_paths=data_path_pattern)

  return dataset

def build_model():
  """
  Creates a Classification model for classifying stroke data

  Returns:
    A Sklearn Model
  """

  model = LogisticRegressionCV(cv=5, max_iter=500, class_weight='balanced', verbose=1, random_state=RANDOM_SEED)

  return model
  
def train_model(model, X_train, y_train):

  # ros = RandomOverSampler(random_state=RANDOM_SEED)
  # X_res, y_res = ros.fit_resample(X_train, y_train)

  fitted_model = model.fit(X_train, y_train)
  logging.info(fitted_model)
  
  return fitted_model

def evaluate_model(fitted_model, X_eval, y_eval):

  predictions = fitted_model.predict(X_eval)
  score = fitted_model.score(X_eval, y_eval)

  accuracy = accuracy_score(y_eval, predictions)
  f1 = f1_score(y_eval, predictions)
  precision = precision_score(y_eval, predictions)
  recall = recall_score(y_eval, predictions)
  roc_auc = roc_auc_score(y_eval, predictions)

  logging.info('Accuracy: %f', accuracy)
  logging.info('F1: %f', f1)
  logging.info('Precision: %f', precision)
  logging.info('Recall: %f', recall)
  logging.info('ROC AUC: %f', roc_auc)

  return predictions, accuracy, f1, precision, recall, roc_auc

def train_and_evaluate(working_dir):
  """
  Train the model on training data and evaluate on test data

  Args:
    working_dir: The location of the Transform output

  Returns:
    The results from the estimator's 'evaluate' method
  """

  train_dataset = get_dataset(working_dir, TRANSFORMED_TRAIN_DATA)
  validation_dataset = get_dataset(working_dir, TRANSFORMED_TEST_DATA)

  X_train = train_dataset.drop(columns=['stroke'])
  y_train = train_dataset['stroke'].values
  X_eval = validation_dataset.drop(columns=['stroke'])
  y_eval = validation_dataset['stroke'].values

  os.environ['MLFLOW_TRACKING_USERNAME'] = input('Enter your DAGsHub username: ')
  os.environ['MLFLOW_TRACKING_PASSWORD'] = getpass('Enter your DAGsHub access token: ')
  os.environ['MLFLOW_TRACKING_PROJECTNAME'] = input('Enter your DAGsHub project name: ')

  mlflow.set_tracking_uri(f'https://dagshub.com/' + os.environ['MLFLOW_TRACKING_USERNAME'] 
                          + '/' + os.environ['MLFLOW_TRACKING_PROJECTNAME'] + '.mlflow')

  with mlflow.start_run(run_name='logistic_regression_cv'):

    model = build_model()
    fitted_model = train_model(model, X_train, y_train)
    (predictions, accuracy, f1, precision, recall, roc_auc) = evaluate_model(fitted_model, X_eval, y_eval)

    mlflow.log_param('Accuracy', accuracy)
    mlflow.log_param('F1', f1)
    mlflow.log_metric('Precision', precision)
    mlflow.log_metric('Recall', recall)
    mlflow.log_metric('ROC AUC', roc_auc)

    mlflow.sklearn.log_model(fitted_model, 'model')

  return fitted_model, predictions

In [8]:
fitted_model, predictions = train_and_evaluate(OUTPUT_DIR)

Enter your DAGsHub username: AniMadurkar
Enter your DAGsHub access token: ··········
Enter your DAGsHub project name: stroke-prediction


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.9s finished
INFO:absl:LogisticRegressionCV(class_weight='balanced', cv=5, max_iter=500,
                     random_state=0, verbose=1)
INFO:absl:Accuracy: 0.762443
INFO:absl:F1: 0.139344
INFO:absl:Precision: 0.079070
INFO:absl:Recall: 0.586207
INFO:absl:ROC AUC: 0.677314


In [9]:
display(IPython.display.IFrame("https://dagshub.com/"+ os.environ['MLFLOW_TRACKING_USERNAME'] + '/' + os.environ['MLFLOW_TRACKING_PROJECTNAME'] + "/experiments/#/",'100%',600))