In [1]:
import sys
from pathlib import Path

# Add the project root (the parent of the current working directory) to
# sys.path so that the `src` package imported further down can be resolved.
# The membership check keeps the cell idempotent: re-running it does not
# stack duplicate entries onto sys.path.
project_root = str(Path().resolve().parent)
if project_root not in sys.path:
    sys.path.append(project_root)

In [14]:
import pandas as pd
import mlflow
from src.models import train_and_log_logistic_regression, train_and_log_random_forest
from src.metrics import get_metrics
from src.preprocessing import create_preprocessor
from src.config import LOG_REG_PARAMS, RF_PARAMS, EXPERIMENT_NAME, TRACKING_URI, NUMERIC_FEATURES, CATEGORICAL_FEATURES
from sklearn.model_selection import train_test_split

# Location of the diabetes dataset, relative to the notebook's working directory.
# The same path is later handed to the training helpers via `data_path=`.
data_path = "../dataset/diabetes.csv"

# Read the CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer=data_path)
data.head(n=5)

Unnamed: 0,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
0,1354778,0,171,80,34,23,43.509726,1.213191,21,0
1,1147438,8,92,93,47,36,21.240576,0.158365,23,0
2,1640031,7,115,47,52,35,41.511523,0.079019,23,0
3,1883350,9,103,78,25,304,29.582192,1.28287,43,1
4,1424119,1,85,59,27,35,42.604536,0.549542,22,0


In [15]:
# Select columns by position: column 0 is skipped, columns 1-8 become the
# feature set and column 9 is used as the label.
column_names = list(data.columns)
features = column_names[1:9]
labels = column_names[9]

# Extract the selected columns as NumPy arrays.
X = data[features].values
y = data[labels].values

# Print the first four rows as a quick sanity check of features vs. label.
for n in range(4):
    print("Patient", str(n + 1), "\n  Features:", list(X[n]), "\n  Label:", y[n])

Patient 1 
  Features: [np.float64(0.0), np.float64(171.0), np.float64(80.0), np.float64(34.0), np.float64(23.0), np.float64(43.50972593), np.float64(1.213191354), np.float64(21.0)] 
  Label: 0
Patient 2 
  Features: [np.float64(8.0), np.float64(92.0), np.float64(93.0), np.float64(47.0), np.float64(36.0), np.float64(21.24057571), np.float64(0.158364981), np.float64(23.0)] 
  Label: 0
Patient 3 
  Features: [np.float64(7.0), np.float64(115.0), np.float64(47.0), np.float64(52.0), np.float64(35.0), np.float64(41.51152348), np.float64(0.079018568), np.float64(23.0)] 
  Label: 0
Patient 4 
  Features: [np.float64(9.0), np.float64(103.0), np.float64(78.0), np.float64(25.0), np.float64(304.0), np.float64(29.58219193), np.float64(1.282869847), np.float64(43.0)] 
  Label: 1


In [4]:
from matplotlib import pyplot as plt
%matplotlib inline

for col in features:
    data.boxplot(column=col, by='Diabetic', figsize=(6,6))
    plt.title(col)
plt.show()

In [5]:

    
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

# Report how many rows ended up in each partition.
n_train_cases, n_test_cases = X_train.shape[0], X_test.shape[0]
print('Training cases: %d\nTest cases: %d' % (n_train_cases, n_test_cases))

## Using MLflow

In [None]:
# Build the preprocessing step from the configured feature lists.
preprocessor = create_preprocessor(NUMERIC_FEATURES, CATEGORICAL_FEATURES)

# Point MLflow at the configured tracking server and experiment before
# any training helper is called.
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

# NOTE(review): the splits below are plain NumPy arrays (built from `.values`),
# while the preprocessor is configured from NUMERIC_FEATURES /
# CATEGORICAL_FEATURES. If those are column names rather than positional
# indices, the preprocessor would need DataFrames instead -- TODO confirm.
train_test_arrays = (X_train, X_test, y_train, y_test)

# Train both models on the same split and preprocessor; each helper is given
# the dataset path and its own hyper-parameter set from the config module.
log_model = train_and_log_logistic_regression(
    *train_test_arrays,
    preprocessor,
    data_path=data_path,
    **LOG_REG_PARAMS,
)

rf_model = train_and_log_random_forest(
    *train_test_arrays,
    preprocessor,
    data_path=data_path,
    **RF_PARAMS,
)


Testing Logistic Regression Model

In [None]:
import os

# Artifact directory that is listed to discover a run id.
model_path = "../mlruns_server/artifacts/1"

# os.listdir returns entries in arbitrary order and may include stray files,
# so keep only sub-directories and sort them to make the choice reproducible.
run_ids = sorted(
    entry
    for entry in os.listdir(model_path)
    if os.path.isdir(os.path.join(model_path, entry))
)
if not run_ids:
    raise FileNotFoundError(
        f"No run directories found under {model_path!r}; "
        "run the training cell first."
    )

# NOTE(review): taking the first directory does not guarantee that this is the
# logistic-regression run (the training cell logs two models). Prefer an
# explicit run id or a registered-model URI once one is available -- TODO.
run_id = run_ids[0]
model_uri = f"runs:/{run_id}/model"
model = mlflow.sklearn.load_model(model_uri)

# Predict a single held-out sample and print it next to its true label.
sample_index = 121
prediction, label = model.predict(X_test[sample_index].reshape(1, -1)), y_test[sample_index]
print(prediction, label)

Testing Random Forest Classifier model

In [None]:

from mlflow.pyfunc import load_model

# Load version 1 of the registered "RandomForestClassifier" model through the
# generic pyfunc interface.
rf_model_uri = "models:/RandomForestClassifier/1"
random_forest_model = load_model(model_uri=rf_model_uri)

# Predict a single held-out sample (reshaped to one row) and print the
# prediction next to its true label.
rf_sample = X_test[69].reshape(1, -1)
prediction = random_forest_model.predict(rf_sample)
label = y_test[69]
print(prediction, label)
