# 0.0 Imports

In [1]:
import os
import sys
sys.path.insert(0, '../src/')

from utill.utils import load_config
from sklearn.pipeline import Pipeline
from feature_engine.imputation import MeanMedianImputer
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import RobustScaler, StandardScaler
from feature_engine.discretisation import EqualFrequencyDiscretiser, EqualWidthDiscretiser


from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression


from data.data_load import DataLoad
from train.train import TrainModels
from data.data_validation import DataValidation
from data.data_transform import DataTransformation
from data.data_preprocess import DataPreprocess
from sklearn.linear_model import LogisticRegression
from evaluation.classifier_eval import ModelEvaluation


* 'schema_extra' has been renamed to 'json_schema_extra'


In [2]:
import mlflow
from mlflow.tracking import MlflowClient
import joblib

# 1.0 Data Load

In [3]:
# Load the training data through the project's DataLoad helper.
# 'train_dataset_name' is passed to load_data as-is; presumably it is a key the
# loader resolves to a file path — confirm in data/data_load.py.
dl = DataLoad()
df = dl.load_data('train_dataset_name')

[2m2023-12-22 15:26:33[0m [[32m[1minfo     [0m] [1mStarting data load: train_dataset_name[0m


In [4]:
# Preview the first rows of the loaded frame (the target column plus the feature columns).
df.head()

Unnamed: 0,target,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


# 2.0 Data Validation

In [5]:
# Run the project's validation on the loaded frame.
# NOTE(review): is_valid is stored but no visible cell checks it before continuing.
dv = DataValidation()
is_valid = dv.run(df)

Initial Validation
[2m2023-12-22 15:26:34[0m [[32m[1minfo     [0m] [1mValidation columns passed[0m
[2m2023-12-22 15:26:34[0m [[32m[1minfo     [0m] [1mSuccessful Validation.[0m


# 3.0 Data Transformation

In [6]:
# Split the data into training/validation features and targets.
# NOTE(review): split size and random seed are not visible in this notebook —
# confirm they are fixed inside DataTransformation so results are reproducible.
dt = DataTransformation(df)
X_train, X_valid, y_train, y_valid = dt.train_test_splitting()

In [7]:
# Preview the training features; the displayed sample contains a missing RendaMensal value.
X_train.head()

Unnamed: 0,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
146433,0.183822,30,0,0.176638,5983.0,5,0,2,0,1.0
15597,0.533493,40,0,0.146019,2800.0,2,0,0,0,0.0
111605,1.026997,34,0,0.065518,4700.0,1,0,0,0,0.0
85418,0.0173,83,0,19.0,,4,0,0,0,0.0
9652,0.68246,61,0,0.140232,5333.0,3,0,0,0,2.0


In [8]:
# Build the project trainer from the training split.
tm = TrainModels(X_train, y_train)

In [9]:
# Run the project's training routine. Per the captured output, this run's logs
# reference ../models/modelo.joblib and an MLflow model named 'final_model'
# was registered (version 1).
tm.run()

[2m2023-12-22 15:26:38[0m [[32m[1minfo     [0m] [1mSelecting best model on mflow[0m
[2m2023-12-22 15:26:38[0m [[32m[1minfo     [0m] [1mInitializing Model Training: ../models/modelo.joblib[0m
[2m2023-12-22 15:26:38[0m [[32m[1minfo     [0m] [1mInitilized Model Validation[0m


  input_schema = _infer_schema(input_example)
  output_schema = _infer_schema(prediction)
Successfully registered model 'final_model'.
2023/12/22 15:26:41 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: final_model, version 1
Created version '1' of model 'final_model'.


# 4.0 Experiments

In [16]:
# Point MLflow at the tracking server and select the 'prob_loan' experiment.
# Assumes an MLflow server is already listening on 127.0.0.1:5000.
# NOTE(review): the execution count (16) shows this cell was last run after
# later cells; on a fresh kernel it must run before any cell that logs or
# queries runs.
mlflow.set_tracking_uri('http://127.0.0.1:5000')
mlflow.set_experiment('prob_loan')

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1702725813091, experiment_id='1', last_update_time=1702725813091, lifecycle_stage='active', name='prob_loan', tags={}>

In [10]:
# Baseline experiment: logistic regression on imputed + standard-scaled
# features, tracked as a single MLflow run named 'baseline'.
with mlflow.start_run(run_name='baseline'):
    mlflow.set_tag('model_name', 'lr_baseline')


    # 1. Preprocessing step: impute the configured variables, then standard-scale.
    pipe = Pipeline([
                 ('imputer', MeanMedianImputer(variables=load_config().get('vars_imputer'))), 
                 ('scaler', SklearnTransformerWrapper(StandardScaler()))
                 ])
    
    # train() receives the training split only; both splits are then transformed
    # and the preprocessor object is serialized to disk.
    preprocessor = DataPreprocess(pipe)
    preprocessor.train(X_train)
    X_train_prep = preprocessor.transform(X_train)
    X_valid_prep = preprocessor.transform(X_valid)
    joblib.dump(preprocessor, '../models/preprocess.joblib')


    # 1.1 Log the serialized preprocessor as a run artifact
    mlflow.log_artifact('../models/preprocess.joblib')

    # 1.2 Log the preprocessing steps as run parameters (the estimator objects
    # themselves are passed; the runs table shows them stored as their repr text)
    mlflow.log_params(params={'imputer': pipe['imputer'],
                              'scaler': pipe['scaler']})
    
    # 2.0 Model: default LogisticRegression evaluated with 5 splits
    model = LogisticRegression()
    model_eval = ModelEvaluation(model,
                                 X_train_prep,
                                 y_train,
                                 n_splits=5)
    roc_auc_scores = model_eval.cross_val_evaluate()

    # Log the mean of the cross-validation scores under the name 'train_roc_auc'
    mlflow.log_metric('train_roc_auc', roc_auc_scores.mean())

    # Fit the model on the full training split
    model.fit(X_train_prep, y_train)

    # Score the validation split using the positive-class probability.
    # NOTE(review): model_eval.model is presumably the same estimator object
    # that was fitted on the line above — confirm in ModelEvaluation.
    y_val_preds = model_eval.model.predict_proba(X_valid_prep)[:, 1]
    val_roc_auc = model_eval.evaluate_predictions(y_valid, y_val_preds)

    mlflow.log_metric('valid_roc_auc', val_roc_auc)


    # Log the fitted model; the pyfunc flavor is configured to call predict_proba
    mlflow.sklearn.log_model(model, 'lr_model',
                             pyfunc_predict_fn='predict_proba')

[2m2023-12-22 15:19:42[0m [[32m[1minfo     [0m] [1mInitialized Preprocessing[0m
[2m2023-12-22 15:19:42[0m [[32m[1minfo     [0m] [1mData Transformation with preprocess started...[0m
[2m2023-12-22 15:19:42[0m [[32m[1minfo     [0m] [1mData Transformation with preprocess started...[0m
[2m2023-12-22 15:19:42[0m [[32m[1minfo     [0m] [1mEvaluation initialized[0m
[2m2023-12-22 15:19:43[0m [[32m[1minfo     [0m] [1mInitilized Model Validation[0m


# 4.1 Hyperparameter Tuning


In [2]:
from hyperopt import fmin, tpe, hp, STATUS_OK

In [3]:
from mlflow.models import MetricThreshold
from sklearn.dummy import DummyClassifier
from mlflow.models import infer_signature


* 'schema_extra' has been renamed to 'json_schema_extra'


In [10]:
# Preprocessing pipeline used by the hyperparameter search: impute, bin with
# equal-frequency discretisation, then standard-scale. The imputer and the
# discretiser both receive the same 'vars_imputer' list from the config.
pipe = Pipeline([
                    ('imputer', MeanMedianImputer(variables=load_config().get('vars_imputer'))),
                    ('discretizer', EqualFrequencyDiscretiser(variables=load_config().get('vars_imputer'))), 
                    ('scaler', SklearnTransformerWrapper(StandardScaler()))
                    ])

In [17]:
def objective(params):
    """Hyperopt objective: train, track and evaluate one LogisticRegression candidate.

    Everything happens inside a dedicated MLflow run. The notebook-level
    ``pipe`` is trained on ``X_train`` and applied to both splits, the
    candidate is evaluated with 5 splits via ``ModelEvaluation``, refit on the
    training split, scored on the validation split, and finally compared by
    ``mlflow.evaluate`` against a uniform-random ``DummyClassifier`` baseline.

    Relies on notebook globals: ``pipe``, ``X_train``, ``X_valid``,
    ``y_train`` and ``y_valid``.

    Args:
        params: Keyword arguments forwarded to ``LogisticRegression`` (one
            sample drawn by hyperopt from the search space).

    Returns:
        dict: ``{'loss': -mean(cross_val_evaluate scores), 'status': STATUS_OK}``.
        The mean score is negated because ``fmin`` minimises the loss.
    """
    # The context manager closes the run on exit (also when an exception is
    # raised), so no explicit mlflow.end_run() call is needed.
    with mlflow.start_run(run_name='with_discrtizer_hyperopt'):
        mlflow.set_tag('model_name', 'lr_dhyperopt')
        mlflow.log_params(params)

        # 1. Preprocessing step
        preprocessor = DataPreprocess(pipe)
        preprocessor.train(X_train)
        X_train_prep = preprocessor.transform(X_train)
        X_valid_prep = preprocessor.transform(X_valid)
        joblib.dump(preprocessor, '../models/preprocess.joblib')

        # 1.1 Log the serialized preprocessor as a run artifact
        mlflow.log_artifact('../models/preprocess.joblib')

        # 1.2 Log the preprocessing steps as run parameters
        mlflow.log_params(params={'imputer': pipe['imputer'],
                                  'discretizer': pipe['discretizer'],
                                  'scaler': pipe['scaler']})

        # 2. Model: evaluate with 5 splits and log the mean score
        model = LogisticRegression(**params)
        model_eval = ModelEvaluation(model,
                                     X_train_prep,
                                     y_train,
                                     n_splits=5)
        roc_auc_scores = model_eval.cross_val_evaluate()
        mlflow.log_metric('train_roc_auc', roc_auc_scores.mean())

        # Fit on the full training split
        model.fit(X_train_prep, y_train)

        # Validation score from the positive-class probability
        y_val_preds = model_eval.model.predict_proba(X_valid_prep)[:, 1]
        val_roc_auc = model_eval.evaluate_predictions(y_valid, y_val_preds)
        mlflow.log_metric('valid_roc_auc', val_roc_auc)

        # Log the candidate with the default pyfunc predict function, so the
        # classifier evaluation below works on the output of predict().
        candidate_model_uri = mlflow.sklearn.log_model(model, 'lr_model').model_uri

        # 3. Evaluate the candidate against a random baseline
        signature = infer_signature(X_valid_prep, y_valid)

        # Work on a copy: adding the label column directly to X_valid_prep
        # would silently turn the feature frame into features + target.
        eval_data = X_valid_prep.copy()
        eval_data['label'] = y_valid

        thresholds = {
            'accuracy_score': MetricThreshold(threshold=0.7,
                                              min_absolute_change=0.05,
                                              min_relative_change=0.05,
                                              greater_is_better=True)
        }

        baseline_model = DummyClassifier(strategy='uniform').fit(X_train_prep, y_train)
        baseline_model_uri = mlflow.sklearn.log_model(baseline_model,
                                                      'baseline_model',
                                                      signature=signature).model_uri

        mlflow.evaluate(candidate_model_uri,
                        eval_data,
                        targets='label',
                        model_type='classifier',
                        validation_thresholds=thresholds,
                        baseline_model=baseline_model_uri)

        return {'loss': -roc_auc_scores.mean(), 'status': STATUS_OK}

In [18]:
# Hyperopt search space for LogisticRegression. Every hp label matches the
# keyword argument it samples, so trial records and the dict returned by fmin
# are keyed by the real parameter name.
# Note: for hp.choice parameters fmin reports the index of the chosen option,
# not the option itself; hyperopt.space_eval(search_space, best_result)
# recovers the actual values.
search_space = {'warm_start': hp.choice('warm_start', [True, False]),
                'fit_intercept': hp.choice('fit_intercept', [True, False]),
                'tol': hp.uniform('tol', 0.00001, 0.0001),
                'C': hp.uniform('C', 0.05, 2),
                'solver': hp.choice('solver', ['newton-cg', 'lbfgs', 'liblinear']),
                'max_iter': hp.choice('max_iter', range(100, 1000)),
                'multi_class': 'auto',
                'class_weight': hp.choice('class_weight', [None, 'balanced'])}

In [19]:
# Run 5 TPE trials of `objective` over `search_space`; each trial creates its
# own MLflow run.
# NOTE(review): no `rstate` is passed, so the sampled trials are presumably not
# reproducible between kernel sessions — consider seeding the search.
best_result = fmin(fn=objective, space=search_space,
                   algo=tpe.suggest,
                   max_evals=5)

[2m2023-12-22 13:57:50[0m [[32m[1minfo     [0m] [1mInitialized Preprocessing[0m
[2m2023-12-22 13:57:50[0m [[32m[1minfo     [0m] [1mData Transformation with preprocess started...[0m
[2m2023-12-22 13:57:50[0m [[32m[1minfo     [0m] [1mData Transformation with preprocess started...[0m
[2m2023-12-22 13:57:50[0m [[32m[1minfo     [0m] [1mEvaluation initialized[0m
[2m2023-12-22 13:57:52[0m [[32m[1minfo     [0m] [1mInitilized Model Validation[0m
  0%|          | 0/5 [00:01<?, ?trial/s, best loss=?]

Distutils was imported before Setuptools, but importing Setuptools also replaces the `distutils` module in `sys.modules`. This may lead to undesirable behaviors or errors. To avoid these issues, avoid using distutils directly, ensure that setuptools is installed in the traditional way (e.g. not an editable install), and/or make sure that setuptools is always imported before distutils.

Setuptools is replacing distutils.

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/doc

[2m2023-12-22 13:57:59[0m [[32m[1minfo     [0m] [1mInitialized Preprocessing[0m
[2m2023-12-22 13:57:59[0m [[32m[1minfo     [0m] [1mData Transformation with preprocess started...[0m
[2m2023-12-22 13:57:59[0m [[32m[1minfo     [0m] [1mData Transformation with preprocess started...[0m
[2m2023-12-22 13:57:59[0m [[32m[1minfo     [0m] [1mEvaluation initialized[0m
[2m2023-12-22 13:58:00[0m [[32m[1minfo     [0m] [1mInitilized Model Validation[0m
 20%|██        | 1/5 [00:10<00:35,  8.97s/trial, best loss: -0.69666429288049]

Distutils was imported before Setuptools, but importing Setuptools also replaces the `distutils` module in `sys.modules`. This may lead to undesirable behaviors or errors. To avoid these issues, avoid using distutils directly, ensure that setuptools is installed in the traditional way (e.g. not an editable install), and/or make sure that setuptools is always imported before distutils.

Setuptools is replacing distutils.

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/doc

[2m2023-12-22 13:58:07[0m [[32m[1minfo     [0m] [1mInitialized Preprocessing[0m
[2m2023-12-22 13:58:07[0m [[32m[1minfo     [0m] [1mData Transformation with preprocess started...[0m
[2m2023-12-22 13:58:07[0m [[32m[1minfo     [0m] [1mData Transformation with preprocess started...[0m
[2m2023-12-22 13:58:07[0m [[32m[1minfo     [0m] [1mEvaluation initialized[0m
[2m2023-12-22 13:58:09[0m [[32m[1minfo     [0m] [1mInitilized Model Validation[0m
 40%|████      | 2/5 [00:19<00:24,  8.32s/trial, best loss: -0.7909351628596438]

Distutils was imported before Setuptools, but importing Setuptools also replaces the `distutils` module in `sys.modules`. This may lead to undesirable behaviors or errors. To avoid these issues, avoid using distutils directly, ensure that setuptools is installed in the traditional way (e.g. not an editable install), and/or make sure that setuptools is always imported before distutils.

Setuptools is replacing distutils.

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/doc

[2m2023-12-22 13:58:16[0m [[32m[1minfo     [0m] [1mInitialized Preprocessing[0m
[2m2023-12-22 13:58:16[0m [[32m[1minfo     [0m] [1mData Transformation with preprocess started...[0m
[2m2023-12-22 13:58:17[0m [[32m[1minfo     [0m] [1mData Transformation with preprocess started...[0m
[2m2023-12-22 13:58:17[0m [[32m[1minfo     [0m] [1mEvaluation initialized[0m
[2m2023-12-22 13:58:17[0m [[32m[1minfo     [0m] [1mInitilized Model Validation[0m
 60%|██████    | 3/5 [00:27<00:17,  8.71s/trial, best loss: -0.7909351628596438]

Distutils was imported before Setuptools, but importing Setuptools also replaces the `distutils` module in `sys.modules`. This may lead to undesirable behaviors or errors. To avoid these issues, avoid using distutils directly, ensure that setuptools is installed in the traditional way (e.g. not an editable install), and/or make sure that setuptools is always imported before distutils.

Setuptools is replacing distutils.

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/doc

[2m2023-12-22 13:58:25[0m [[32m[1minfo     [0m] [1mInitialized Preprocessing[0m
[2m2023-12-22 13:58:25[0m [[32m[1minfo     [0m] [1mData Transformation with preprocess started...[0m
[2m2023-12-22 13:58:25[0m [[32m[1minfo     [0m] [1mData Transformation with preprocess started...[0m
[2m2023-12-22 13:58:25[0m [[32m[1minfo     [0m] [1mEvaluation initialized[0m
[2m2023-12-22 13:58:27[0m [[32m[1minfo     [0m] [1mInitilized Model Validation[0m
 80%|████████  | 4/5 [00:36<00:08,  8.55s/trial, best loss: -0.8130256948501391]

Distutils was imported before Setuptools, but importing Setuptools also replaces the `distutils` module in `sys.modules`. This may lead to undesirable behaviors or errors. To avoid these issues, avoid using distutils directly, ensure that setuptools is installed in the traditional way (e.g. not an editable install), and/or make sure that setuptools is always imported before distutils.

Setuptools is replacing distutils.

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/doc

100%|██████████| 5/5 [00:43<00:00,  8.69s/trial, best loss: -0.8130256948501391]


# 5.0 Preprocessing Pipeline

In [None]:
# Same three-step pipeline (impute -> equal-frequency discretise -> standard-scale)
# as the one defined before the hyperparameter search.
# NOTE(review): this cell has no execution count and duplicates the earlier
# definition — consider keeping a single definition.
pipe = Pipeline([
                 ('imputer', MeanMedianImputer(variables=load_config().get('vars_imputer'))), 
                 ('discretizer', EqualFrequencyDiscretiser(variables=load_config().get('vars_imputer'))),
                 ('scaler', SklearnTransformerWrapper(StandardScaler()))
                 ])

In [None]:
# Wrap the pipeline in the project's DataPreprocess helper (not trained here).
preprocessor = DataPreprocess(pipe)

# Selecting the Best Model

In [10]:
# Fetch the 'prob_loan' experiment metadata and convert it to a plain dict for inspection.
current_experiment = dict(mlflow.get_experiment_by_name('prob_loan'))
current_experiment

{'artifact_location': 'mlflow-artifacts:/1',
 'creation_time': 1702725813091,
 'experiment_id': '1',
 'last_update_time': 1702725813091,
 'lifecycle_stage': 'active',
 'name': 'prob_loan',
 'tags': {}}

In [11]:
# ID of the 'prob_loan' experiment, taken from the metadata fetched above.
experiment_id = current_experiment['experiment_id']
# Misspelled original name, kept as an alias so existing references keep working.
exepriment_id = experiment_id

In [12]:
# Rank the experiment's runs by validation ROC AUC, best first.
# The search is scoped explicitly to the experiment fetched earlier instead of
# relying on whichever experiment happens to be active in the kernel; the
# filter keeps runs whose valid_roc_auc is below 1.
df_mlflow = (
    mlflow.search_runs(experiment_ids=[current_experiment['experiment_id']],
                       filter_string='metrics.valid_roc_auc < 1')
    .sort_values('metrics.valid_roc_auc', ascending=False)
)
df_mlflow[['run_id', 'metrics.valid_roc_auc']]

Unnamed: 0,run_id,metrics.valid_roc_auc
1,f134ec2b086d4ecd98dc7215c591a0d6,0.821449
0,b33be2e3332c4a489bb2d7831f3127dd,0.821339
3,ca0e18683f0a4d4ca16bbe84218213a2,0.800721
2,64130810d538443faa1072d1d2edbe95,0.800651
4,6cf73a631ff04677b43c68bdea0293d8,0.712063
5,34b34557f1ad451792348db3ef437d8a,0.711076
6,e12f3ec2894341c48d7ffe88aed24bc4,0.711076
7,9ca3c89d940441a6b3aba141a9157b71,0.711076
8,31bb52dd61734498a85cb4f82c664c49,0.707613


In [37]:
# List every column returned by search_runs (run info, metrics.*, params.*, tags.*).
df_mlflow.columns

Index(['run_id', 'experiment_id', 'status', 'artifact_uri', 'start_time',
       'end_time', 'metrics.accuracy_score', 'metrics.recall_score',
       'metrics.false_positives', 'metrics.true_positives', 'metrics.log_loss',
       'metrics.f1_score', 'metrics.valid_roc_auc', 'metrics.score',
       'metrics.train_roc_auc', 'metrics.precision_score',
       'metrics.example_count', 'metrics.roc_auc',
       'metrics.precision_recall_auc', 'metrics.false_negatives',
       'metrics.true_negatives', 'params.class_weight', 'params.tol',
       'params.imputer', 'params.warm_start', 'params.max_iter',
       'params.solver', 'params.multi_class', 'params.scaler',
       'params.fit_intercept', 'params.discretizer', 'params.C',
       'tags.mlflow.source.name', 'tags.mlflow.user',
       'tags.mlflow.log-model.history', 'tags.mlflow.runName',
       'tags.mlflow.datasets', 'tags.model_name', 'tags.mlflow.source.type'],
      dtype='object')

In [None]:
# Parameter columns of df_mlflow (hyperparameters plus preprocessing steps).
# The original cell was an unbalanced fragment pasted from the columns output
# and raised a SyntaxError; it is now a valid list.
PARAM_COLUMNS = ['params.class_weight', 'params.tol',
                 'params.imputer', 'params.warm_start', 'params.max_iter',
                 'params.solver', 'params.multi_class', 'params.scaler',
                 'params.fit_intercept', 'params.discretizer', 'params.C']
PARAM_COLUMNS

In [31]:
# Index label of the run with the highest validation ROC AUC.
df_mlflow['metrics.valid_roc_auc'].idxmax()

1

In [34]:
# Read the run_id stored on the row whose validation ROC AUC is the maximum.
run_id = df_mlflow.loc[df_mlflow['metrics.valid_roc_auc'].idxmax(), 'run_id']

'f134ec2b086d4ecd98dc7215c591a0d6'

In [36]:
# df_mlflow is sorted best-first, so the first row holds the best run.
run_id = df_mlflow['run_id'].iloc[0]
run_id

'f134ec2b086d4ecd98dc7215c591a0d6'

In [56]:
# Show only the params.* columns for every run.
df_mlflow.filter(like='params')

Unnamed: 0,params.class_weight,params.tol,params.imputer,params.warm_start,params.max_iter,params.solver,params.multi_class,params.scaler,params.fit_intercept,params.discretizer,params.C
1,,1.0037124436737777e-05,"MeanMedianImputer(variables=['RendaMensal', 'N...",False,469.0,lbfgs,auto,SklearnTransformerWrapper(transformer=Standard...,False,EqualFrequencyDiscretiser(variables=['RendaMen...,0.1916031348122706
0,,2.505992461591161e-05,"MeanMedianImputer(variables=['RendaMensal', 'N...",False,956.0,liblinear,auto,SklearnTransformerWrapper(transformer=Standard...,False,EqualFrequencyDiscretiser(variables=['RendaMen...,1.0890394817000022
3,balanced,6.452644561220985e-05,"MeanMedianImputer(variables=['RendaMensal', 'N...",True,563.0,lbfgs,auto,SklearnTransformerWrapper(transformer=Standard...,True,EqualFrequencyDiscretiser(variables=['RendaMen...,1.856178727893204
2,balanced,4.488938086498805e-05,"MeanMedianImputer(variables=['RendaMensal', 'N...",True,277.0,liblinear,auto,SklearnTransformerWrapper(transformer=Standard...,True,EqualFrequencyDiscretiser(variables=['RendaMen...,0.3033409704279228
4,,2.5155909349476943e-05,"MeanMedianImputer(variables=['RendaMensal', 'N...",False,321.0,newton-cg,auto,SklearnTransformerWrapper(transformer=Standard...,True,EqualFrequencyDiscretiser(variables=['RendaMen...,0.144369727422732
5,,,"MeanMedianImputer(variables=['RendaMensal', 'N...",,,,,SklearnTransformerWrapper(transformer=Standard...,,EqualFrequencyDiscretiser(variables=['RendaMen...,
6,,,"MeanMedianImputer(variables=['RendaMensal', 'N...",,,,,SklearnTransformerWrapper(transformer=Standard...,,EqualFrequencyDiscretiser(variables=['RendaMen...,
7,,,"MeanMedianImputer(variables=['RendaMensal', 'N...",,,,,SklearnTransformerWrapper(transformer=Standard...,,EqualFrequencyDiscretiser(variables=['RendaMen...,
8,,,"MeanMedianImputer(variables=['RendaMensal', 'N...",,,,,SklearnTransformerWrapper(transformer=Standard...,,,


In [59]:
# Parameters logged for the selected run only.
df_mlflow[df_mlflow['run_id'] == run_id].filter(like='params')

Unnamed: 0,params.class_weight,params.tol,params.imputer,params.warm_start,params.max_iter,params.solver,params.multi_class,params.scaler,params.fit_intercept,params.discretizer,params.C
1,,1.0037124436737777e-05,"MeanMedianImputer(variables=['RendaMensal', 'N...",False,469,lbfgs,auto,SklearnTransformerWrapper(transformer=Standard...,False,EqualFrequencyDiscretiser(variables=['RendaMen...,0.1916031348122706


In [49]:
# Re-train the preprocessing on the training split and transform the
# validation split so it can be fed to the logged model.
# NOTE(review): this uses whichever `pipe` is currently defined in the kernel
# rather than the preprocess.joblib artifact logged with the selected run —
# confirm the two match, or load the artifact instead.
preprocessor = DataPreprocess(pipe)
preprocessor.train(X_train)
X_valid_prep = preprocessor.transform(X_valid)

[2m2023-12-22 14:23:58[0m [[32m[1minfo     [0m] [1mInitialized Preprocessing[0m
[2m2023-12-22 14:23:58[0m [[32m[1minfo     [0m] [1mData Transformation with preprocess started...[0m


In [50]:
# URI of the 'lr_model' artifact logged by the selected run.
logged_model = f'runs:/{run_id}/lr_model'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

# Predict on a Pandas DataFrame. The captured output shows class labels
# (0/1), not probabilities.
# NOTE(review): `pd` is not used in this cell.
import pandas as pd
loaded_model.predict(X_valid_prep)

Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 144.41it/s]




array([0, 0, 0, ..., 0, 0, 0])

In [51]:
import numpy as np

In [52]:
# Check which distinct values the loaded model predicts on the validation split.
y = loaded_model.predict(X_valid_prep)
np.unique(y)

array([0, 1])

# 1.0 Data Validation

# 1.0 Data Validation