In [1]:
import numpy as np
import pandas as pd

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
kidney_disease_medical_data = pd.read_csv('../data/kidney_disease.csv')

In [4]:
! pip install mlflow

Collecting mlflow
  Downloading mlflow-2.17.1-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==2.17.1 (from mlflow)
  Downloading mlflow_skinny-2.17.1-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.13.3-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4-py2.py3-none-any.whl.metadata (6.7 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting cachetools<6,>=5.0.0 (from mlflow-skinny==2.17.1->mlflow)
  Downloading cachetools-5.5.0-py3-none-any.whl.metadata (5.3 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.17.1->mlflow)
  Downloading databricks_sdk-0.36.0-py3-none-any.whl.metadata (38 kB)
Collecting opentelemetry-api<3,>=1.9.0 (from mlflow-skinny==2.17.1->mlflow)
  Downloading opentelemetry_api-1

In [5]:
import mlflow

# Set an experiment (this can be a name you choose for your project)
mlflow.set_experiment("model_comparison_experiment")


2024/10/26 01:36:25 INFO mlflow.tracking.fluent: Experiment with name 'model_comparison_experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///Users/david/Documents/Documents%20-%20David%E2%80%99s%20Mac%20Studio/Personal_Projects/Chronic-Kidney-Disease/backend/data_exploration_notebooks/mlruns/147228988099714012', creation_time=1729920985215, experiment_id='147228988099714012', last_update_time=1729920985215, lifecycle_stage='active', name='model_comparison_experiment', tags={}>

In [8]:
# fill na with mean depending on column
def replaceNAWithMean(df, column):
    df[column] = df[column].fillna(df[column].mean())

def replaceNAWithMode(df, column):
    df[column] = df[column].fillna(df[column].mode()[0])

In [9]:
# clean ckd\t
kidney_disease_medical_data.replace('ckd\t', 'ckd', inplace=True)

kidney_disease_medical_data['classification'].value_counts()

for c in kidney_disease_medical_data.select_dtypes('object').columns:
    print(f"---- {c} ---")
    print(kidney_disease_medical_data[c].unique())

# clean data with "\t" in it
kidney_disease_medical_data = kidney_disease_medical_data.replace(r'\t', '', regex=True)


# replace ? with 0

kidney_disease_medical_data = kidney_disease_medical_data.replace('?', '0')

for c in kidney_disease_medical_data.select_dtypes('object').columns:
    print(f"---- {c} ---")
    print(kidney_disease_medical_data[c].unique())

# remove space
kidney_disease_medical_data['dm'] = kidney_disease_medical_data['dm'].str.strip()

---- rbc ---
[nan 'normal' 'abnormal']
---- pc ---
['normal' 'abnormal' nan]
---- pcc ---
['notpresent' 'present' nan]
---- ba ---
['notpresent' 'present' nan]
---- pcv ---
['44' '38' '31' '32' '35' '39' '36' '33' '29' '28' nan '16' '24' '37' '30'
 '34' '40' '45' '27' '48' '\t?' '52' '14' '22' '18' '42' '17' '46' '23'
 '19' '25' '41' '26' '15' '21' '43' '20' '\t43' '47' '9' '49' '50' '53'
 '51' '54']
---- wc ---
['7800' '6000' '7500' '6700' '7300' nan '6900' '9600' '12100' '4500'
 '12200' '11000' '3800' '11400' '5300' '9200' '6200' '8300' '8400' '10300'
 '9800' '9100' '7900' '6400' '8600' '18900' '21600' '4300' '8500' '11300'
 '7200' '7700' '14600' '6300' '\t6200' '7100' '11800' '9400' '5500' '5800'
 '13200' '12500' '5600' '7000' '11900' '10400' '10700' '12700' '6800'
 '6500' '13600' '10200' '9000' '14900' '8200' '15200' '5000' '16300'
 '12400' '\t8400' '10500' '4200' '4700' '10900' '8100' '9500' '2200'
 '12800' '11200' '19100' '\t?' '12300' '16700' '2600' '26400' '8800'
 '7400' '4900'

In [10]:
# replace missing values in numeric columns with mean of column
for col in kidney_disease_medical_data[['bp', 'rbc', 'rc', 'wc', 'pcv', 'cad', 'appet', 'sc', 'sod', 'pot', 'hemo', 'htn', 'dm', 'ane', 'age', 'pe']].select_dtypes(exclude='object').columns:
    replaceNAWithMean(kidney_disease_medical_data, col)

In [11]:
# replace missing values in object columns with mode or most frequently occuring value
for col in kidney_disease_medical_data[['bp', 'rbc', 'rc', 'wc', 'pcv', 'cad', 'appet', 'sc', 'sod', 'pot', 'hemo', 'htn', 'dm', 'ane', 'age', 'pe']].select_dtypes('object').columns:
    replaceNAWithMode(kidney_disease_medical_data, col)

In [12]:
# columns to train model on and preprocess data
X_features = ['bp', 'rbc', 'rc', 'wc', 'pcv', 'cad', 'appet', 'sc', 'sod', 'pot', 'hemo', 'htn', 'dm', 'ane', 'age', 'pe']

In [18]:
# convert pcv, wc and rc into float
kidney_disease_medical_data[['pcv', 'wc', 'rc']] = kidney_disease_medical_data[['pcv', 'wc', 'rc']].astype(float)

In [19]:
X = kidney_disease_medical_data[X_features]
y = kidney_disease_medical_data['classification']

In [20]:
# feature engineer columns
X = pd.get_dummies(X, drop_first=True, columns=['rbc', 'cad', 'appet', 'htn', 'dm', 'ane', 'pe'])
y = kidney_disease_medical_data['classification']

In [21]:
from sklearn.preprocessing import LabelEncoder
label_enc = LabelEncoder()
y = label_enc.fit_transform(y) # convert classification to 0 and 1

In [26]:
import mlflow
import mlflow.sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB 
from sklearn.model_selection import train_test_split
# from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42) # split into train and test

In [27]:
from imblearn.over_sampling import SMOTE

In [28]:
sm = SMOTE()

In [29]:
X_train, y_train = sm.fit_resample(X_train, y_train) # resample training data to fix imbalance

In [35]:
from sklearn.metrics import precision_score, recall_score, f1_score



NameError: name 'predictions' is not defined

In [36]:
# Define a function to train, evaluate, and log models with MLflow
def train_and_log_model(model, model_name, params):
    with mlflow.start_run(run_name=model_name):
        # Log hyperparameters
        mlflow.log_params(params)
        
        # Train the model
        model.fit(X_train, y_train)
        
        # Make predictions and calculate accuracy
        predictions = model.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)

        precision = precision_score(y_test, predictions, average="macro")
        recall = recall_score(y_test, predictions, average="macro")
        f1 = f1_score(y_test, predictions, average="macro")
        
        # Log the accuracy metric
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)
        
        # Log the model itself
        mlflow.sklearn.log_model(model, model_name)
        
        print(f"{model_name} accuracy: {accuracy:.4f}")



In [37]:
# Train and log a Decision Tree model
dt_params = {"max_depth": 5}
dt_model = DecisionTreeClassifier(**dt_params)
train_and_log_model(dt_model, "DecisionTree", dt_params)

# Train and log a Random Forest model
rf_params = {"n_estimators": 100, "max_depth": 5}
rf_model = RandomForestClassifier(**rf_params)
train_and_log_model(rf_model, "RandomForest", rf_params)

# Train and log a Gradient Boosting model
gb_params = {"n_estimators": 100, "learning_rate": 0.1, "max_depth": 3}
gb_model = GradientBoostingClassifier(**gb_params)
train_and_log_model(gb_model, "GradientBoosting", gb_params)

# Train and log a Logistic Regression model
lr_params = {"solver": "liblinear", "penalty": "l2", "C": 1.0}
lr_model = LogisticRegression(**lr_params)
train_and_log_model(lr_model, "LogisticRegression", lr_params)

# Train and log a Naive Bayes model
nb_params = {}
nb_model = GaussianNB(**nb_params)
train_and_log_model(nb_model, "NaiveBayes", nb_params)




DecisionTree accuracy: 0.9667




RandomForest accuracy: 1.0000




GradientBoosting accuracy: 1.0000




LogisticRegression accuracy: 0.9917




NaiveBayes accuracy: 0.9333
