# 0.0 Imports

In [11]:
import os
import sys
sys.path.insert(0, '../src/')

from utill.utils import load_config
from sklearn.pipeline import Pipeline
from feature_engine.imputation import MeanMedianImputer
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import RobustScaler, StandardScaler
from feature_engine.discretisation import EqualFrequencyDiscretiser, EqualWidthDiscretiser


from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression


from data.data_load import DataLoad
from train.train import TrainModels
from data.data_validation import DataValidation
from data.data_transform import DataTransformation
from data.data_preprocess import DataPreprocess
from sklearn.linear_model import LogisticRegression
from evaluation.classifier_eval import ModelEvaluation

# 1.0 Data Load

In [12]:
# Load the training dataset. 'train_dataset_name' is a lookup key passed to
# DataLoad.load_data (presumably resolved through the project config — confirm).
dl = DataLoad()
df = dl.load_data('train_dataset_name')

[2m2023-12-16 08:39:02[0m [[32m[1minfo     [0m] [1mStarting data load: train_dataset_name[0m


In [13]:
# Preview the first rows to sanity-check the loaded columns and the target.
df.head()

Unnamed: 0,target,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


# 2.0 Data Validation

In [14]:
# Run the project's validation checks on the raw frame before splitting it.
# NOTE(review): `is_valid` is never checked later in this notebook — consider
# asserting on it so an invalid dataset stops the run (confirm what run() returns).
dv = DataValidation()
is_valid = dv.run(df)

Initial Validation
[2m2023-12-16 08:39:03[0m [[32m[1minfo     [0m] [1mValidation columns passed[0m
[2m2023-12-16 08:39:03[0m [[32m[1minfo     [0m] [1mSuccessful Validation.[0m


# 3.0 Data Transformation

In [15]:
# Split the frame into train/validation features (X_*) and targets (y_*).
# NOTE(review): split ratio, stratification and seed are decided inside
# DataTransformation.train_test_splitting and are not visible from this cell.
dt = DataTransformation(df)
X_train, X_valid, y_train, y_valid = dt.train_test_splitting()

In [16]:
# Preview the training features. The row with an empty RendaMensal shows that
# missing values are still present at this stage, before any imputation runs.
X_train.head()

Unnamed: 0,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
146433,0.183822,30,0,0.176638,5983.0,5,0,2,0,1.0
15597,0.533493,40,0,0.146019,2800.0,2,0,0,0,0.0
111605,1.026997,34,0,0.065518,4700.0,1,0,0,0,0.0
85418,0.0173,83,0,19.0,,4,0,0,0,0.0
9652,0.68246,61,0,0.140232,5333.0,3,0,0,0,2.0


# 4.0 Experiments

In [17]:
import mlflow
from mlflow.tracking import MlflowClient
import joblib

In [18]:
# Point MLflow at the tracking server and select the experiment used by every
# run below. The URI honours the standard MLFLOW_TRACKING_URI environment
# variable and falls back to the local development server, so the notebook can
# run against another server without editing this cell.
mlflow.set_tracking_uri(os.environ.get('MLFLOW_TRACKING_URI', 'http://127.0.0.1:5000'))
mlflow.set_experiment('prob_loan')

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1702725813091, experiment_id='1', last_update_time=1702725813091, lifecycle_stage='active', name='prob_loan', tags={}>

In [24]:
with mlflow.start_run(run_name='baseline'):
    # Baseline run: imputation + standard scaling feeding a default
    # LogisticRegression. Everything logged below lands in this single run.
    mlflow.set_tag('model_name', 'lr_baseline')

    # 1. Preprocessing step
    pipe = Pipeline([
        ('imputer', MeanMedianImputer(variables=load_config().get('vars_imputer'))),
        ('scaler', SklearnTransformerWrapper(StandardScaler())),
    ])

    # train() only sees X_train; both splits are then transformed with it.
    preprocessor = DataPreprocess(pipe)
    preprocessor.train(X_train)
    X_train_prep = preprocessor.transform(X_train)
    X_valid_prep = preprocessor.transform(X_valid)
    joblib.dump(preprocessor, '../models/preprocess.joblib')

    # 1.1 Log the serialized preprocessor as a run artifact
    mlflow.log_artifact('../models/preprocess.joblib')

    # 1.2 Log the preprocessing steps as params (MLflow stores their string form)
    mlflow.log_params(params={'imputer': pipe['imputer'],
                              'scaler': pipe['scaler']})

    # 2.0 Model
    model = LogisticRegression()
    model_eval = ModelEvaluation(model,
                                 X_train_prep,
                                 y_train,
                                 n_splits=5)
    roc_auc_scores = model_eval.cross_val_evaluate()

    # Log the mean of the scores returned by cross_val_evaluate
    mlflow.log_metric('train_roc_auc', roc_auc_scores.mean())

    # Train the final model on the full training split
    model.fit(X_train_prep, y_train)

    # Validation metric: score the estimator fitted just above (the same object
    # that is logged below) instead of reaching through `model_eval.model`, so
    # the reported metric always belongs to the logged model.
    y_val_preds = model.predict_proba(X_valid_prep)[:, 1]
    val_roc_auc = model_eval.evaluate_predictions(y_valid, y_val_preds)

    mlflow.log_metric('valid_roc_auc', val_roc_auc)

    # Log the model; pyfunc_predict_fn makes the pyfunc flavor call
    # predict_proba instead of predict at inference time.
    mlflow.sklearn.log_model(model, 'lr_model',
                             pyfunc_predict_fn='predict_proba')

[2m2023-12-16 08:41:47[0m [[32m[1minfo     [0m] [1mInitialized Preprocessing[0m


[2m2023-12-16 08:41:47[0m [[32m[1minfo     [0m] [1mData Transformation with preprocess started...[0m
[2m2023-12-16 08:41:47[0m [[32m[1minfo     [0m] [1mData Transformation with preprocess started...[0m
[2m2023-12-16 08:41:48[0m [[32m[1minfo     [0m] [1mEvaluation initialized[0m
[2m2023-12-16 08:41:49[0m [[32m[1minfo     [0m] [1mInitilized Model Validation[0m


# 4.1 Experiment 01

In [25]:
from mlflow.models import MetricThreshold
from sklearn.dummy import DummyClassifier
from mlflow.models import infer_signature

In [29]:
with mlflow.start_run(run_name='with_discrtizer'):
    # Experiment 01: same as the baseline plus an equal-frequency discretizer,
    # followed by an MLflow evaluation against a dummy baseline and a SHAP
    # explanation. The `with` block closes the run on exit, so no explicit
    # mlflow.end_run() is needed.
    mlflow.set_tag('model_name', 'lr_discretizer')

    # 1. Preprocessing step
    # Read the config once; the same variable list feeds imputer and discretizer.
    vars_imputer = load_config().get('vars_imputer')
    pipe = Pipeline([
        ('imputer', MeanMedianImputer(variables=vars_imputer)),
        ('discretizer', EqualFrequencyDiscretiser(variables=vars_imputer)),
        ('scaler', SklearnTransformerWrapper(StandardScaler())),
    ])

    # train() only sees X_train; both splits are then transformed with it.
    preprocessor = DataPreprocess(pipe)
    preprocessor.train(X_train)
    X_train_prep = preprocessor.transform(X_train)
    X_valid_prep = preprocessor.transform(X_valid)
    joblib.dump(preprocessor, '../models/preprocess.joblib')

    # 1.1 Log the serialized preprocessor as a run artifact
    mlflow.log_artifact('../models/preprocess.joblib')

    # 1.2 Log the preprocessing steps as params (MLflow stores their string form)
    mlflow.log_params(params={'imputer': pipe['imputer'],
                              'discretizer': pipe['discretizer'],
                              'scaler': pipe['scaler']})

    # 2.0 Model
    model = LogisticRegression()
    model_eval = ModelEvaluation(model,
                                 X_train_prep,
                                 y_train,
                                 n_splits=5)
    roc_auc_scores = model_eval.cross_val_evaluate()

    # Log the mean of the scores returned by cross_val_evaluate
    mlflow.log_metric('train_roc_auc', roc_auc_scores.mean())

    # Train the final model on the full training split
    model.fit(X_train_prep, y_train)

    # Validation metric: score the estimator fitted just above (the same object
    # that is logged below) instead of reaching through `model_eval.model`.
    y_val_preds = model.predict_proba(X_valid_prep)[:, 1]
    val_roc_auc = model_eval.evaluate_predictions(y_valid, y_val_preds)

    mlflow.log_metric('valid_roc_auc', val_roc_auc)

    # Log the candidate model with the default pyfunc predict function.
    # NOTE(review): unlike the baseline run, pyfunc_predict_fn='predict_proba' is
    # not set here — presumably because the classifier evaluation below needs
    # label predictions; confirm before re-enabling it.
    candidate_model_uri = mlflow.sklearn.log_model(model, 'lr_model').model_uri

    # Evaluation inputs. `assign` returns a new frame, so X_valid_prep keeps its
    # feature-only columns and stays reusable by later steps and cells.
    signature = infer_signature(X_valid_prep, y_valid)
    eval_data = X_valid_prep.assign(label=y_valid)
    thresholds = {
        'accuracy_score': MetricThreshold(threshold=0.7,
                                          min_absolute_change=0.05,
                                          min_relative_change=0.05,
                                          greater_is_better=True)
    }

    # Reference model the candidate has to beat.
    # NOTE(review): strategy='uniform' is random and no random_state is set, so
    # the baseline metrics differ between runs — consider seeding it.
    baseline_model = DummyClassifier(strategy='uniform').fit(X_train_prep, y_train)

    baseline_model_uri = mlflow.sklearn.log_model(baseline_model,
                                                  'baseline_model',
                                                  signature=signature).model_uri

    # Evaluate the candidate against the baseline and enforce the thresholds
    mlflow.evaluate(candidate_model_uri,
                    eval_data,
                    targets='label',
                    model_type='classifier',
                    validation_thresholds=thresholds,
                    baseline_model=baseline_model_uri)

    # Explainability with SHAP, computed on the feature-only validation frame
    mlflow.shap.log_explanation(model.predict, X_valid_prep)

[2m2023-12-16 09:03:27[0m [[32m[1minfo     [0m] [1mInitialized Preprocessing[0m


[2m2023-12-16 09:03:27[0m [[32m[1minfo     [0m] [1mData Transformation with preprocess started...[0m
[2m2023-12-16 09:03:27[0m [[32m[1minfo     [0m] [1mData Transformation with preprocess started...[0m
[2m2023-12-16 09:03:27[0m [[32m[1minfo     [0m] [1mEvaluation initialized[0m
[2m2023-12-16 09:03:30[0m [[32m[1minfo     [0m] [1mInitilized Model Validation[0m


  outputs = _infer_schema(model_output) if model_output is not None else None
Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 151.81it/s]
Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 126.50it/s]
  string_columns = trimmed_df.columns[(df.applymap(type) == str).all(0)]
  data = data.applymap(_hash_array_like_element_as_bytes)
  data = data.applymap(_hash_array_like_element_as_bytes)
  return _infer_schema(self._df)
2023/12/16 09:03:33 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/12/16 09:03:33 INFO mlflow.models.evaluation.default_evaluator: Evaluating candidate model:
2023/12/16 09:03:33 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2023/12/16 09:03:33 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/12/16 09:03:33 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first r

# 5.0 Final Preprocessing Pipeline

In [None]:
# Preprocessing pipeline with the same three steps used in Experiment 01,
# built here outside of any MLflow run:
# imputation -> equal-frequency discretisation -> standard scaling.
pipe = Pipeline(
    steps=[
        (
            'imputer',
            MeanMedianImputer(variables=load_config().get('vars_imputer')),
        ),
        (
            'discretizer',
            EqualFrequencyDiscretiser(variables=load_config().get('vars_imputer')),
        ),
        (
            'scaler',
            SklearnTransformerWrapper(StandardScaler()),
        ),
    ]
)

In [None]:
# Wrap the pipeline in the project's DataPreprocess helper; train() is not
# called in this cell, so the wrapped pipeline has not been fitted here.
preprocessor = DataPreprocess(pipe)

# 1.0 Data Validation

# 1.0 Data Validation