In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'>
import numpy as np
from sklearn.model_selection import train_test_split
import collections
import sys
from mlxtend.evaluate import confusion_matrix
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_confusion_matrix
from matplotlib import cm as cmap
import plotly.express as px
import psycopg2
from psycopg2.extras import execute_values

import warnings
warnings.filterwarnings("ignore")
import mlflow
from sklearn.metrics import accuracy_score, recall_score, f1_score, average_precision_score
from urllib.parse import urlparse


The psycopg2 wheel package will be renamed from release 2.8; in order to keep installing from binary please use "pip install psycopg2-binary" instead. For details see: <http://initd.org/psycopg/docs/install.html#binary-install-from-pypi>.



# 1. BATCH  ETL

## 1.1 READ DATA FROM CSV

In [2]:
index_file = 0
files = ['/data/jupyter/healthcare-dataset-stroke-data.csv']
ds = pd.read_csv(files[index_file], index_col=0)
display(ds.head())
print(ds.shape)

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


(5110, 11)


In [3]:
ds.dtypes

gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

## 1.2 CREATE TABLES

In [4]:
def connect_postgress():
    """ open conexion to the PostgreSQL database"""
    conn = psycopg2.connect(
        host="postgres-mlflow",
        port=5432,
        database="mlflow_db",
        user="mlflow_user",
        password="mlflow")
    return conn

In [5]:
def create_tables():
    """ create tables in the PostgreSQL database"""
    command = (
        """
        CREATE TABLE stroke (
            id BIGINT PRIMARY KEY,
            gender VARCHAR(20),
            age INT,
            hypertension INT,
            heart_disease INT,
            ever_married VARCHAR(3),
            work_type VARCHAR(200),
            Residence_type VARCHAR(20),
            avg_glucose_level FLOAT,
            bmi FLOAT,
            smoking_status VARCHAR(20),
            stroke INT
        )
        """)
    conn = None
    a = False
    try:
        # read the connection parameters
        #params = config()
        # connect to the PostgreSQL server
        #conn = psycopg2.connect(**params)
        conn = connect_postgress()
        cur = conn.cursor()
        # create table one by one
        #for command in commands:
        cur.execute(command)
        # close communication with the PostgreSQL database server
        cur.close()
        # commit the changes
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        a = True
    finally:
        if conn is not None:
            conn.close()
    return a

In [6]:
a = create_tables()
print(a)

relation "stroke" already exists

True


## 1.3 INSERT DATA

In [6]:
def insert_values(table_name,values):
    """ insert a new vendor into the vendors table """
    sql = """INSERT INTO """+table_name+""" (id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke) VALUES %s """
    conn = None
    vendor_id = None
    print(sql)
    try:
        # read database configuration
        # params = config()
        # connect to the PostgreSQL database
        conn = connect_postgress()
        #conn = psycopg2.connect(**params)
        # create a new cursor
        cur = conn.cursor()
        # execute the INSERT statement
        # cur.execute(sql, (values,))
        # execute the SQL statement
        execute_values(cur, sql, values)

        # get the generated id back
        #cur.fetchone()
        # cur.fetchall()
        # commit the changes to the database
        conn.commit()
        # close communication with the database
        cur.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        id_values = None
    finally:
        if conn is not None:
            conn.close()


In [7]:
def prepare_values(ds,start,length):
    df = ds.reset_index()
    #index = ds.iloc[start:length].index.values
    value = df.iloc[start:length].values
    values = []
    for i in range(length-start):
        print(np.array(value[i][0]))
        values.append(np.array(value[i]))
    values = list(values)
    return values

In [8]:
length = 5100
start = 2
values = prepare_values(ds,start,length)
print(values)

31112
60182
1665
56669
53882
10434
27419
60491
12109
12095
12175
8213
5317
58202
56112
34120
27458
25226
70630
13861
68794
64778
4219
70822
38047
61843
54827
69160
43717
33879
39373
54401
14248
712
47269
24977
47306
62602
4651
1261
61960
1845
7937
19824
37937
47472
35626
36338
18587
15102
59190
47167
8752
25831
38829
66400
58631
5111
10710
55927
65842
19557
7356
17013
17004
72366
6118
7371
70676
2326
27169
50784
19773
66159
36236
71673
45805
42117
57419
26015
26727
66638
70042
32399
3253
71796
14499
49130
28291
51169
66315
37726
54385
2458
35512
56841
8154
4639
12363
63973
45277
4712
33175
2346
42072
12062
30456
59125
56546
48405
36706
41069
71639
53401
60744
7547
31720
5563
68798
72918
13491
44033
14164
50522
3352
70943
37132
48796
53440
16817
69551
31563
20387
71279
55824
11762
29281
30683
20439
45965
8045
37651
17308
67981
41241
62861
72081
58978
11933
46703
32503
12482
56939
24669
43054
59437
66258
34567
50931
16590
69768
20426
3512
42899
63453
43364
44993
210
28939
60739
67432
218

67911
16856
37972
62414
50485
47405
70928
4679
15070
25625
35123
20165
41730
38761
4797
19199
30402
25088
54756
19590
23332
16971
11727
60255
38796
46498
41042
35069
61103
25095
55607
63029
2919
60003
46256
23659
2952
49229
2457
23508
28364
31360
19335
40390
63936
24832
25219
42393
17951
17443
52242
45931
7828
21547
42305
9442
57047
2538
28461
16433
50681
71327
46699
25248
35315
63144
21517
29789
52207
19209
42041
58153
27717
35106
47730
20657
63411
18671
3843
1225
40264
72451
20292
31201
59359
57985
2885
59743
11544
11969
42929
72776
21438
51084
13440
15533
50903
35276
44472
23587
66794
35854
60907
12449
54371
8106
2013
61785
2707
49120
30752
64972
49537
315
62814
7665
28108
50536
8655
760
47501
16863
51342
35759
17270
53862
40951
56976
37299
33247
32560
10973
3816
50215
10351
69665
14976
28183
33085
13386
15601
22254
15539
58235
21162
67880
42545
48359
54815
6233
52225
50463
49084
61889
25525
9730
30622
26480
65895
4913
20676
52410
57944
20290
10875
2393
66464
40548
19699
205
54805
5

36837
57333
19826
37713
6278
15517
25326
57034
70718
47461
50091
62416
5288
66951
28335
67465
67426
19508
65405
49773
57159
69710
58834
42007
5121
44878
40220
19692
27616
19801
21467
25102
28788
29028
15581
16738
31836
43496
52677
11630
53478
38349
48425
64420
60271
38009
11184
68967
66684
7789
40112
65814
49598
15599
62425
52652
71957
17231
30379
63997
39935
8203
27446
42709
22691
37680
24552
72914
29540
53525
65411
26214
22190
56714
4211
6369
56799
32235
28048
68598
41512
64520
579
7293
[array([31112, 'Male', 80.0, 0, 1, 'Yes', 'Private', 'Rural', 105.92, 32.5,
       'never smoked', 1], dtype=object), array([60182, 'Female', 49.0, 0, 0, 'Yes', 'Private', 'Urban', 171.23,
       34.4, 'smokes', 1], dtype=object), array([1665, 'Female', 79.0, 1, 0, 'Yes', 'Self-employed', 'Rural',
       174.12, 24.0, 'never smoked', 1], dtype=object), array([56669, 'Male', 81.0, 0, 0, 'Yes', 'Private', 'Urban', 186.21, 29.0,
       'formerly smoked', 1], dtype=object), array([53882, 'Male', 74.0, 1, 

In [9]:
insert_values("stroke",values)

INSERT INTO stroke (id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke) VALUES %s 
duplicate key value violates unique constraint "stroke_pkey"
DETAIL:  Key (id)=(31112) already exists.



## 1.4 QUERY DATA

In [9]:
def get_data(table_name):
    """ query data from the cell tables table """
    conn = None
    try:
        #params = config()
        conn = connect_postgress()
        cur = conn.cursor()
        cur.execute("SELECT id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke FROM "+table_name+" ORDER BY id")
        rowcount = cur.rowcount
        row = cur.fetchall()
        cur.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        if conn is not None:
            conn.close()
    return row,rowcount

In [10]:
rows, rowcount = get_data("stroke")
rowcount

5100

In [11]:
pd.DataFrame(rows,columns=["id","gender","age","hypertension","heart_disease","ever_married","work_type","Residence_type","avg_glucose_level","bmi","smoking_status","stroke"])

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,67,Female,17,0,0,No,Private,Urban,92.97,,formerly smoked,0
1,77,Female,13,0,0,No,children,Rural,85.81,18.6,Unknown,0
2,84,Male,55,0,0,Yes,Private,Urban,89.17,31.5,never smoked,0
3,91,Female,42,0,0,No,Private,Urban,98.53,18.5,never smoked,0
4,99,Female,31,0,0,No,Private,Urban,108.89,52.3,Unknown,0
...,...,...,...,...,...,...,...,...,...,...,...,...
5095,72911,Female,57,1,0,Yes,Private,Rural,129.54,60.9,smokes,0
5096,72914,Female,19,0,0,No,Private,Urban,90.57,24.2,Unknown,0
5097,72915,Female,45,0,0,Yes,Private,Urban,172.33,45.3,formerly smoked,0
5098,72918,Female,53,1,0,Yes,Private,Urban,62.55,30.3,Unknown,1


# 2. TRAIN MODEL

## 2.1 PREPROCESS DATA

In [52]:
work_type_dict = {"Private":0, 
                  "Self-employed":1, 
                  "children":2, 
                  "Govt_job":3,
                  "Never_worked":4}
Residence_type_dict = {"Urban":0, 
                  "Rural":1}
smoking_status_dict = {"never smoked":0, 
                  "Unknown":1, 
                  "formerly smoked":2, 
                  "Govt_job":3,
                  "smokes":4}

def data_to_df(rows):
    df                 = pd.DataFrame(rows,columns=["id","gender","age","hypertension","heart_disease","ever_married","work_type","Residence_type","avg_glucose_level","bmi","smoking_status","stroke"]).set_index("id")
    return df

def change_on_dict(value,dict):
    return dict[value]

def preprocess(df_in):
    df                   = df_in[df_in["gender"] != "Other"].copy()
    df["bmi"]            = df["bmi"].fillna(df["bmi"].mean())
    df["ever_married"]   = np.where(df["ever_married"] == "Yes",1,0)
    df["ever_married"]   = df["ever_married"].astype("int")
    df["gender"]         = np.where(df["gender"] == "Female",1,0)
    df["gender"]         = df["gender"].astype("int")
    #stroke["work_type"]      = stroke["work_type"].apply(change_on_dict,dict=work_type_dict)
    #stroke["Residence_type"] = stroke["Residence_type"].apply(change_on_dict,dict=Residence_type_dict)
    #stroke["smoking_status"] = stroke["smoking_status"].apply(change_on_dict,dict=smoking_status_dict)
    df = pd.get_dummies(df,columns=["work_type","Residence_type","smoking_status"])
    
    return df

In [53]:
rows, rowcount = get_data("stroke")
stroke         = data_to_df(rows)
stroke         = preprocess(stroke)
stroke

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,stroke,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
67,1,17,0,0,0,92.97,28.89408,0,0,0,1,0,0,0,1,0,1,0,0
77,1,13,0,0,0,85.81,18.60000,0,0,0,0,0,1,1,0,1,0,0,0
84,0,55,0,0,1,89.17,31.50000,0,0,0,1,0,0,0,1,0,0,1,0
91,1,42,0,0,0,98.53,18.50000,0,0,0,1,0,0,0,1,0,0,1,0
99,1,31,0,0,0,108.89,52.30000,0,0,0,1,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72911,1,57,1,0,1,129.54,60.90000,0,0,0,1,0,0,1,0,0,0,0,1
72914,1,19,0,0,0,90.57,24.20000,0,0,0,1,0,0,0,1,1,0,0,0
72915,1,45,0,0,1,172.33,45.30000,0,0,0,1,0,0,0,1,0,1,0,0
72918,1,53,1,0,1,62.55,30.30000,1,0,0,1,0,0,0,1,1,0,0,0


In [16]:
def train_test(stroke):
    train_ds, test_ds = train_test_split(stroke, test_size=0.2)
    return train_ds, test_ds

In [17]:
train_ds, test_ds  = train_test(stroke)

In [18]:
train_ds

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,stroke,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
39661,0,18,0,0,1,140.52,27.4,0,0,0,1,0,0,1,0,0,0,1,0
12915,1,66,0,0,1,85.52,30.0,0,1,0,0,0,0,1,0,0,0,1,0
12095,1,61,0,1,1,120.46,36.8,1,1,0,0,0,0,1,0,0,0,0,1
17079,0,44,0,0,1,94.71,28.4,0,0,0,1,0,0,1,0,0,0,0,1
50009,1,17,0,0,0,81.51,19.5,0,0,0,1,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35565,0,43,0,0,1,111.43,21.9,0,0,0,1,0,0,0,1,0,0,0,1
54127,1,40,0,0,1,106.76,24.1,0,0,0,0,1,0,0,1,0,1,0,0
47269,0,74,0,0,1,219.72,33.7,1,0,0,1,0,0,1,0,0,1,0,0
2374,0,60,1,0,1,213.37,36.0,0,0,0,1,0,0,1,0,0,0,1,0


In [19]:
test_ds.columns

Index(['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'avg_glucose_level', 'bmi', 'stroke', 'work_type_Govt_job',
       'work_type_Never_worked', 'work_type_Private',
       'work_type_Self-employed', 'work_type_children', 'Residence_type_Rural',
       'Residence_type_Urban', 'smoking_status_Unknown',
       'smoking_status_formerly smoked', 'smoking_status_never smoked',
       'smoking_status_smokes'],
      dtype='object')

## 2.2 Autogluon

In [20]:
from autogluon.tabular import TabularPredictor as task

In [21]:
def get_best_model(train_ds):
    label="stroke"
    predictor = task(label=label).fit(train_ds)
    model = predictor._trainer.load_model(predictor.get_model_best())
    print(model.get_info())
    return model, predictor

In [23]:
model,predictor = get_best_model(train_ds)

No path specified. Models will be saved in: "AutogluonModels/ag-20210505_125034/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20210505_125034/"
AutoGluon Version:  0.2.0
Train Data Rows:    4079
Train Data Columns: 18
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    5289.07 MB
	Train Data (Original)  Memory Usage: 0.27 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of

{'name': 'LightGBMXT', 'model_type': 'LGBModel', 'problem_type': 'binary', 'eval_metric': 'accuracy', 'stopping_metric': 'accuracy', 'fit_time': 3.433985710144043, 'num_classes': 2, 'quantile_levels': None, 'predict_time': 0.0047948360443115234, 'val_score': 0.952, 'hyperparameters': {'num_boost_round': 10000, 'num_threads': -1, 'learning_rate': 0.05, 'objective': 'binary', 'verbose': -1, 'boosting_type': 'gbdt', 'two_round': True, 'extra_trees': True}, 'hyperparameters_fit': {'num_boost_round': 1}, 'hyperparameters_nondefault': ['extra_trees'], 'ag_args_fit': {'max_memory_usage_ratio': 1.0, 'max_time_limit_ratio': 1.0, 'max_time_limit': None, 'min_time_limit': 0, 'ignored_type_group_special': None, 'ignored_type_group_raw': ['object'], 'get_features_kwargs': None, 'get_features_kwargs_extra': None}, 'num_features': 18, 'features': ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'avg_glucose_level', 'bmi', 'work_type_Govt_job', 'work_type_Never_worked', 'work_type_Pr

## 2.3 Predictor vs model

In [22]:
predictor.predict(test_ds)

NameError: name 'predictor' is not defined

In [26]:
columns_test = list(test_ds.columns)
columns_test.remove("stroke")
test_ds_model = test_ds[columns_test]
model.predict(test_ds_model)

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


## 2.4 TRAIN ADN WRITE INTO MLFLOW

In [22]:
def eval_metrics(actual, pred):
        accuracy = accuracy_score(actual, pred)
        recall = recall_score(actual, pred, average='weighted')
        f1 = f1_score(actual, pred, average='weighted')
        return accuracy, recall, f1

In [23]:
def train_stroke():

    experiment_name = 'stroke_demo'

    warnings.filterwarnings("ignore")
    np.random.seed(40)

    
    mlflow.set_tracking_uri("http://nginx:80")
    mlflow.set_experiment(experiment_name)
    
    print(mlflow.get_tracking_uri())
    
    with mlflow.start_run():
        
        # Get data and train model with AutoML
        rows, rowcount = get_data("stroke")
        stroke         = data_to_df(rows)
        stroke         = preprocess(stroke)
        train_ds, test_ds  = train_test(stroke)
        model, predictor = get_best_model(train_ds)
        
        columns_test = list(test_ds.columns)
        columns_test.remove("stroke")
        test_ds_model = test_ds[columns_test]
        model.predict(test_ds_model)
        predicted_y = model.predict(test_ds_model)
        (accuracy, recall, f1) = eval_metrics(test_ds.stroke.values, predicted_y)

        print("  accuracy: %s" % accuracy)
        print("  recall: %s" % recall)
        print("  f1: %s" % f1)

        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1", f1)

        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

        # Model registry does not work with file store
        if tracking_url_type_store != "file":

            # Register the model
            # There are other ways to use the Model Registry, which depends on the use case,
            # please refer to the doc for more information:
            # https://mlflow.org/docs/latest/model-registry.html#api-workflow
            mlflow.sklearn.log_model(model, "model", registered_model_name=experiment_name)
        else:
            mlflow.sklearn.log_model(model, "model")

In [24]:
!echo $MLFLOW_S3_ENDPOINT_URL

http://s3server:9000


In [54]:
train_stroke()

No path specified. Models will be saved in: "AutogluonModels/ag-20210506_091830/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20210506_091830/"
AutoGluon Version:  0.2.0
Train Data Rows:    4079
Train Data Columns: 18
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0, 1]
	If 'binary' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...


http://nginx:80


	Available Memory:                    4789.3 MB
	Train Data (Original)  Memory Usage: 0.27 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) :  2 | ['avg_glucose_level', 'bmi']
		('int', [])   : 16 | ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married', ...]
	Types of features in processed data (raw dtype, special dtypes):
		('float', []) :  2 | ['avg_glucose_level', 'bmi']
		('int', [])   : 16 | ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married', ...]
	0.1s = Fit runtime
	18 features in original data used to generate 18 features in p

{'name': 'NeuralNetFastAI', 'model_type': 'NNFastAiTabularModel', 'problem_type': 'binary', 'eval_metric': 'accuracy', 'stopping_metric': 'accuracy', 'fit_time': 23.519787549972534, 'num_classes': 2, 'quantile_levels': None, 'predict_time': 0.02547168731689453, 'val_score': 0.954, 'hyperparameters': {'layers': None, 'emb_drop': 0.1, 'ps': 0.1, 'bs': 256, 'lr': 0.01, 'epochs': 30, 'early.stopping.min_delta': 0.0001, 'early.stopping.patience': 20, 'smoothing': 0.0}, 'hyperparameters_fit': {'best_epoch': 9}, 'hyperparameters_nondefault': [], 'ag_args_fit': {'max_memory_usage_ratio': 1.0, 'max_time_limit_ratio': 1.0, 'max_time_limit': None, 'min_time_limit': 0, 'ignored_type_group_special': None, 'ignored_type_group_raw': ['object'], 'get_features_kwargs': None, 'get_features_kwargs_extra': None}, 'num_features': 18, 'features': ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'avg_glucose_level', 'bmi', 'work_type_Govt_job', 'work_type_Never_worked', 'work_type_Private',

Registered model 'stroke_demo' already exists. Creating a new version of this model...
2021/05/06 09:19:16 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: stroke_demo, version 4
Created version '4' of model 'stroke_demo'.


## 2.5 Interacting with the model

In [55]:
def get_experiment_id(base_name,mlflow):
    n = 0
    experiment_id = 0
    while True:
        try:
            name = mlflow.get_experiment(n).name
        except:
            break
        print(name)
        if base_name == name:
            experiment_id = n
            break
        else:
            n = n + 1
    return experiment_id

In [None]:
mlflow.set_tracking_uri("http://nginx:80")
    
experiment_name     = 'stroke_demo_airflow'
model_registry_name = 'stroke_demo_airflow_register'


experiment_id = get_experiment_id(experiment_name,mlflow)
print(experiment_name,experiment_id)

# best artifact uri
#runs_metadata = mlflow.search_runs([experiment_id], filter_string=filter_string)
runs_metadata = mlflow.search_runs([experiment_id])
best_artifact_uri = runs_metadata.sort_values(by='metrics.accuracy', ascending=False)['artifact_uri'].values[0]
print("best_artifact_uri:",best_artifact_uri)

# register the best model into the MLflow registry
mlflow.register_model(best_artifact_uri, model_registry_name)

In [55]:
experiment_name     = 'soldering_cells_airflow'
experiment_id = get_experiment_id(experiment_name,mlflow)
print(experiment_name,experiment_id)

# best artifact uri
#runs_metadata = mlflow.search_runs([experiment_id], filter_string=filter_string)
runs_metadata = mlflow.search_runs([experiment_id])
best_artifact_uri = runs_metadata.sort_values(by='metrics.accuracy', ascending=False)['artifact_uri'].values[0]
print("best_artifact_uri:",best_artifact_uri)

# register the best model into the MLflow registry

Default
winequality_elasticnet_2
iris
winequality_elasticnet_3
iris_svm
iris_svm_airflow
soldering_cells
soldering_cells_airflow
soldering_cells_airflow 7
best_artifact_uri: s3://mlflow-bucket/mlflow/7/7492da7dcd9d4128ba84ab6d8475f48a/artifacts


In [59]:
mlflow.search_runs([1])

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.mae,metrics.rmse,metrics.r2,params.l1_ratio,params.alpha,tags.mlflow.user,tags.mlflow.source.type,tags.mlflow.source.name,tags.mlflow.log-model.history
0,cc27699c885d4a06a148ae4c9eb0afdc,1,FINISHED,s3://mlflow-bucket/mlflow/1/cc27699c885d4a06a1...,2021-04-12 10:17:11.332000+00:00,2021-04-12 10:17:11.921000+00:00,0.644257,0.809739,0.070982,0.6,0.5,root,LOCAL,/usr/local/lib/python3.6/dist-packages/ipykern...,"[{""run_id"": ""cc27699c885d4a06a148ae4c9eb0afdc""..."
1,13692c7a1b5b4d189fa9ad8028f24024,1,FINISHED,s3://mlflow-bucket/mlflow/1/13692c7a1b5b4d189f...,2021-04-12 10:17:10.599000+00:00,2021-04-12 10:17:11.275000+00:00,0.66765,0.832633,0.017708,0.8,0.6,root,LOCAL,/usr/local/lib/python3.6/dist-packages/ipykern...,"[{""run_id"": ""13692c7a1b5b4d189fa9ad8028f24024""..."
2,be3b0579287f4d91b750ae80c18b2e01,1,FINISHED,s3://mlflow-bucket/mlflow/1/be3b0579287f4d91b7...,2021-04-12 10:17:09.770000+00:00,2021-04-12 10:17:10.519000+00:00,0.627195,0.793164,0.108626,0.5,0.5,root,LOCAL,/usr/local/lib/python3.6/dist-packages/ipykern...,"[{""run_id"": ""be3b0579287f4d91b750ae80c18b2e01""..."
3,e0ad829ccfee4504bbe27a962fa996eb,1,FINISHED,s3://mlflow-bucket/mlflow/1/e0ad829ccfee4504bb...,2021-04-12 10:17:09.023000+00:00,2021-04-12 10:17:09.681000+00:00,0.623578,0.789393,0.117082,0.77,0.33,root,LOCAL,/usr/local/lib/python3.6/dist-packages/ipykern...,"[{""run_id"": ""e0ad829ccfee4504bbe27a962fa996eb""..."
4,fa3b7810d3944706b0992bd1a161dc8b,1,FINISHED,s3://mlflow-bucket/mlflow/1/fa3b7810d3944706b0...,2021-04-12 10:17:07.911000+00:00,2021-04-12 10:17:08.965000+00:00,0.56401,0.732794,0.239153,0.7,0.1,root,LOCAL,/usr/local/lib/python3.6/dist-packages/ipykern...,"[{""run_id"": ""fa3b7810d3944706b0992bd1a161dc8b""..."
5,b9c2e12eadd149db9226081df59ef803,1,FINISHED,s3://mlflow-bucket/mlflow/1/b9c2e12eadd149db92...,2021-04-12 10:16:21.946000+00:00,2021-04-12 10:16:25.316000+00:00,0.564384,0.73364,0.237395,0.2,0.2,root,LOCAL,/usr/local/lib/python3.6/dist-packages/ipykern...,"[{""run_id"": ""b9c2e12eadd149db9226081df59ef803""..."
