# Modélisation Méthode 3 - `E[S]`

## setup

### Import des fichiers

In [411]:
#Temps et fichiers
import os
import warnings
import time
from datetime import timedelta

#Manipulation de données
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
from functools import partial


#Modélisation
from sklearn.datasets import fetch_openml
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import PoissonRegressor, GammaRegressor
from sklearn.linear_model import TweedieRegressor
from sklearn.metrics import mean_tweedie_deviance
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, auc

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import LinearSVC
from sklearn.model_selection import RandomizedSearchCV# the keys can be accessed with final_pipeline.get_params().keys()
from sklearn.linear_model import LogisticRegression

from xgboost import XGBClassifier

#Text
import re

#Evaluation
from sklearn.metrics import f1_score, confusion_matrix


#Visualisation
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


#Tracking d'expérience
import mlflow
import mlflow.sklearn

### Utilisation du code du projet packagé

In [2]:
#Cette cellule permet d'appeler la version packagée du projet et d'en assurer le reload avant appel des fonctions
%load_ext autoreload
%autoreload 2

In [8]:
random_state=42

### Configuration de l'experiment MLFlow

In [77]:
mlflow.tracking.get_tracking_uri()

'/mnt/experiments'

## Chargement des données

In [229]:
# On Importe les données

#df
df_merged =pd.read_parquet('/mnt/data/interim/df_merged.gzip')
df_train=pd.read_parquet('/mnt/data/interim/df_train.gzip')
df_val=pd.read_parquet('/mnt/data/interim/df_val.gzip')

#X
X_train=pd.read_parquet('/mnt/data/interim/X_train.gzip')
X_val=pd.read_parquet('/mnt/data/interim/X_val.gzip')
X_test=pd.read_parquet('/mnt/data/interim/X_test.gzip')

#y
y_train=pd.read_parquet('/mnt/data/interim/y_train.gzip')
y_val=pd.read_parquet('/mnt/data/interim/y_val.gzip')

## Création du code

Laissé ici à titre de mémoire, ce code a ensuite été refactorisé dans le package du projet `hackathondsa_groupe4` pour plus de lisibilité entre les notebooks

### scores_to_dict

In [164]:
def scores_to_dict(score_df):
    d = score_df['train'].to_dict()
    d1 = dict(zip([x+'_train_' for x in  list(d.keys())], list(d.values())))
    d = score_df['test'].to_dict()
    d2 = dict(zip([x+'_test' for x in  list(d.keys())], list(d.values())))
    d1.update(d2)
    return d1

### score_estimator

In [186]:
def score_estimator(
    estimator, X_train, X_test, df_train, df_test, target, weights,
    tweedie_powers=None, use_weights=True
):
    """Evaluate an estimator on train and test sets with different metrics"""

    metrics = [
        ("D² explained", None),   # Use default scorer if it exists
        ("mean abs. error", mean_absolute_error),
        ("mean squared error", mean_squared_error),
    ]
    if tweedie_powers:
        metrics += [(
            "mean Tweedie dev p={:.4f}".format(power),
            partial(mean_tweedie_deviance, power=power)
        ) for power in tweedie_powers]

    res = []
    for subset_label, X, df in [
        ("train", X_train, df_train),
        ("test", X_test, df_test),
    ]:
        y = df[target]
        if use_weights:
            _weights =  df[weights]
        for score_label, metric in metrics:
            if isinstance(estimator, tuple) and len(estimator) == 2:
                # Score the model consisting of the product of frequency and
                # severity models.
                est_freq, est_sev = estimator
                y_pred = est_freq.predict(X) * est_sev.predict(X)
            else:
                y_pred = estimator.predict(X)

            if metric is None:
                if not hasattr(estimator, "score"):
                    continue
                if use_weights:
                    score = estimator.score(X, y, sample_weight=_weights)
                else:
                    score = estimator.score(X, y)
            else:
                if use_weights:
                    score = metric(y, y_pred, sample_weight=_weights)
                else:
                    score = metric(y, y_pred)

            res.append(
                {"subset": subset_label, "metric": score_label, "score": score}
            )

    res = (
        pd.DataFrame(res)
        .set_index(["metric", "subset"])
        .score.unstack(-1)
        .round(4)
        .loc[:, ['train', 'test']]
    )
    return res

On commence par définir une fonction générique qui sera en capacité d'ajuster, optimiser et logger dans MLFlow les résultats de pipelines qui seront produits pour chaque essai

In [67]:
def random_state_params(pipe, seed):
    """Crée un dictionnaire constitué de tous les paramètres 'random_state' d'un pipe et leur assigne une valeur unique"""
    rs = re.findall(r"[a-zA-Z\_]+_random_state", ' '.join(list(pipe.get_params().keys())))
    rs=dict.fromkeys(rs, seed)
    return rs

### train pipeline

In [425]:
def trainPipelineMlFlow(mlf_XP, 
                        xp_name_iter, 
                        pipeline, 
                        X_train, y_train, X_test, y_test, 
                        target_col='Frequency', 
                        weight_col='exposure', 
                        use_weights=False, 
                        fixed_params={}, 
                        opti=False, iterable_params={}):
    """
    Fonction générique permettant d'entrainer et d'optimiser un pipeline sklearn
    Les paramètres et résultats sont stockés dans MLFlow
    """
  
    mlflow.set_experiment(mlf_XP)

    with mlflow.start_run(run_name=xp_name_iter):
        
        start_time = time.monotonic()  
        
        warnings.filterwarnings("ignore")
        
        # fit pipeline
        pipeline.set_params(**fixed_params)
        if not opti:
            search = pipeline
        else:
            search = RandomizedSearchCV(pipeline, iterable_params)
        
        if use_weights:
            search.fit(X_train, y_train[target_col], sample_weight=X_train[weight_col])
        else:
            search.fit(X_train, y_train[target_col])
                
        # get params
        params_to_log = fixed_params #select initial params
        if opti:
            params_to_log.update(search.best_params_) #update for optimal solution
        mlflow.log_params(params_to_log)
        
        # Evaluate metrics
        y_pred=search.predict(X_test)
        score = score_estimator(estimator=search, 
                                         X_train=X_train, 
                                         X_test=X_test, 
                                         df_train=y_train, 
                                         df_test=y_test, 
                                         target=target_col, 
                                         weights=weight_col,
                                         use_weights=use_weights)
        
        # Print out metrics
        print(xp_name_iter)
        print("params:" % params_to_log)
        print(score)

        mlflow.log_metrics(scores_to_dict(score))
        mlflow.sklearn.log_model(pipeline, xp_name_iter)
        
        end_time = time.monotonic()
        elapsed_time = timedelta(seconds=end_time - start_time)
        print('elapsed time :', elapsed_time)
        mlflow.set_tag(key="elapsed_time", value=elapsed_time)   
        
    return search
        

La cellule suivante permet de créer des étapes de sélection de colonnes dans les Data Frame en entrée

In [69]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextSelector(BaseEstimator, TransformerMixin):
    def __init__(self, field):
        self.field = field
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.field]

class NumberSelector(BaseEstimator, TransformerMixin):
    def __init__(self, field):
        self.field = field
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.field]

## Essai pipeline

In [14]:

log_scale_transformer = make_pipeline(
    FunctionTransformer(func=np.log),
    StandardScaler()
)

In [119]:


column_trans = ColumnTransformer(
    [
        ("binned_numeric", KBinsDiscretizer(n_bins=10),
            ["VALEUR_DES_BIENS"]),
        ("onehot_categorical", OneHotEncoder(),
            ["FORMULE", "TYPE_RESIDENCE", "TYPE_RESIDENCE", "NB_PIECES", "SITUATION_JURIDIQUE",'NIVEAU_JURIDIQUE','OBJETS_DE_VALEUR', 'ZONIER', 'NBSIN_TYPE1_AN1', 'NBSIN_TYPE1_AN3', 'NBSIN_TYPE2_AN1', 'NBSIN_TYPE2_AN2', 'NBSIN_TYPE2_AN3']),
        ("passthrough_numeric", "passthrough",
            ["NB_PIECES"]),
        ("log_scaled_numeric", log_scale_transformer,
            [])
    ],
    remainder="drop",
)




inspiration : https://towardsdatascience.com/pipeline-columntransformer-and-featureunion-explained-f5491f815f

In [136]:
# Define categorical pipeline
cat_pipe = Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                     ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))])

# Define numerical pipeline
num_pipe = Pipeline([('imputer', SimpleImputer(strategy='median')),
                     ('scaler', MinMaxScaler())])

num_pipe_binned = Pipeline([('imputer', SimpleImputer(strategy='median')),
                     ('scaler', KBinsDiscretizer(n_bins=10))])

num_pipe_log = Pipeline([('imputer', SimpleImputer(strategy='median')),
                     ('scaler', KBinsDiscretizer(n_bins=10))])


# Fit column transformer to training data
preprocessor = ColumnTransformer(transformers=[('cat', cat_pipe, ["FORMULE", "TYPE_RESIDENCE", "SITUATION_JURIDIQUE",'NIVEAU_JURIDIQUE','OBJETS_DE_VALEUR', 'ZONIER', 'NBSIN_TYPE1_AN1', 'NBSIN_TYPE1_AN3', 'NBSIN_TYPE2_AN1',  'NBSIN_TYPE2_AN3']),
                                               ('num_binned', num_pipe_binned, ["VALEUR_DES_BIENS"])],remainder='drop')

In [166]:
pipeline_GLM = Pipeline(
    steps=[
#        ('na', SimpleImputer(strategy='median')),
#        ('preprocess', column_trans), #Sélection de la colonne à transformer (corpus)
        ('preprocess', preprocessor), 
        ("glm", PoissonRegressor(max_iter=400))
    ]
)

In [167]:
list(pipeline_GLM.get_params().keys())

['memory',
 'steps',
 'verbose',
 'preprocess',
 'glm',
 'preprocess__n_jobs',
 'preprocess__remainder',
 'preprocess__sparse_threshold',
 'preprocess__transformer_weights',
 'preprocess__transformers',
 'preprocess__verbose',
 'preprocess__cat',
 'preprocess__num_binned',
 'preprocess__cat__memory',
 'preprocess__cat__steps',
 'preprocess__cat__verbose',
 'preprocess__cat__imputer',
 'preprocess__cat__encoder',
 'preprocess__cat__imputer__add_indicator',
 'preprocess__cat__imputer__copy',
 'preprocess__cat__imputer__fill_value',
 'preprocess__cat__imputer__missing_values',
 'preprocess__cat__imputer__strategy',
 'preprocess__cat__imputer__verbose',
 'preprocess__cat__encoder__categories',
 'preprocess__cat__encoder__drop',
 'preprocess__cat__encoder__dtype',
 'preprocess__cat__encoder__handle_unknown',
 'preprocess__cat__encoder__sparse',
 'preprocess__num_binned__memory',
 'preprocess__num_binned__steps',
 'preprocess__num_binned__verbose',
 'preprocess__num_binned__imputer',
 'prepr

In [165]:
trainPipelineMlFlow(
    mlf_XP = 'Init',
    xp_name_iter= "Ini-GLM", 
    pipeline=pipeline_GLM, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    y_test=y_test, 
    target_col='PurePremium', 
    weight_col='EXPO', 
    fixed_params=random_state_params(pipeline_GLM,42))

Ini-GLM
params:
subset                    train         test
metric                                      
D² explained             0.0579       0.0294
mean abs. error         61.3388      59.7598
mean squared error  223222.3396  218523.5348
elapsed time : 0:00:11.605138


In [102]:
X_train.isna().sum()

EXPO                       0
FORMULE                    0
TYPE_RESIDENCE             0
TYPE_HABITATION            0
NB_PIECES               7458
SITUATION_JURIDIQUE        0
NIVEAU_JURIDIQUE           0
VALEUR_DES_BIENS           0
OBJETS_DE_VALEUR           0
ZONIER                     0
NBSIN_TYPE1_AN1            0
NBSIN_TYPE1_AN3            0
NBSIN_TYPE2_AN1            0
NBSIN_TYPE2_AN2        13558
NBSIN_TYPE2_AN3            0
id                         0
ANNEE                      0
dtype: int64

In [168]:
trainPipelineMlFlow(
    mlf_XP = 'Init',
    xp_name_iter= "Ini-GLM_alpha1e-4", 
    pipeline=pipeline_GLM, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    y_test=y_test, 
    target_col='PurePremium', 
    weight_col='EXPO', 
    fixed_params={'glm__alpha':1e-4})

Ini-GLM_alpha1e-4
params:
subset                   train         test
metric                                     
D² explained             0.058       0.0280
mean abs. error         61.324      59.7487
mean squared error  223221.747  218525.5891
elapsed time : 0:00:11.671008


## Modélisation prime pure par Tweedie

In [169]:
pipeline_Tweedie = Pipeline(
    steps=[
        ('preprocess', preprocessor), 
        ("Tweedie", TweedieRegressor(power=1.9, alpha=.1, max_iter=10000))
    ]
)

In [189]:
trainPipelineMlFlow(
    mlf_XP = 'Init',
    xp_name_iter= "Ini-Tweedie", 
    pipeline=pipeline_Tweedie, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    y_test=y_test, 
    target_col='PurePremium',
    use_weights=False)

Ini-Tweedie
params:
subset                    train         test
metric                                      
D² explained             0.0223       0.0171
mean abs. error         60.0233      61.5289
mean squared error  561186.4676  526904.9401
elapsed time : 0:00:02.694281


**ATTENTION**

On doit créer une nouvelle classe qui supercherge pipeline pour assayer de faire passer les sample weights

cf [erreur GitHub](https://github.com/scikit-learn/scikit-learn/issues/18159)

In [178]:
class PipelineSW(Pipeline):
    def fit(self, X, y, sample_weight=None):
        """Fit and pass sample weights only to the last step"""
        if sample_weight is not None:
            kwargs = {self.steps[-1][0] + '__sample_weight': sample_weight}
        else:
            kwargs = {}
        return super().fit(X, y, **kwargs)

In [179]:
pipeline_Tweedie_SW = PipelineSW(
    steps=[
        ('preprocess', preprocessor), 
        ("Tweedie", TweedieRegressor(power=1.9, alpha=.1, max_iter=10000))
    ]
)

In [180]:
list(pipeline_Tweedie_SW.get_params().keys())

['memory',
 'steps',
 'verbose',
 'preprocess',
 'Tweedie',
 'preprocess__n_jobs',
 'preprocess__remainder',
 'preprocess__sparse_threshold',
 'preprocess__transformer_weights',
 'preprocess__transformers',
 'preprocess__verbose',
 'preprocess__cat',
 'preprocess__num_binned',
 'preprocess__cat__memory',
 'preprocess__cat__steps',
 'preprocess__cat__verbose',
 'preprocess__cat__imputer',
 'preprocess__cat__encoder',
 'preprocess__cat__imputer__add_indicator',
 'preprocess__cat__imputer__copy',
 'preprocess__cat__imputer__fill_value',
 'preprocess__cat__imputer__missing_values',
 'preprocess__cat__imputer__strategy',
 'preprocess__cat__imputer__verbose',
 'preprocess__cat__encoder__categories',
 'preprocess__cat__encoder__drop',
 'preprocess__cat__encoder__dtype',
 'preprocess__cat__encoder__handle_unknown',
 'preprocess__cat__encoder__sparse',
 'preprocess__num_binned__memory',
 'preprocess__num_binned__steps',
 'preprocess__num_binned__verbose',
 'preprocess__num_binned__imputer',
 'p

In [283]:
model = trainPipelineMlFlow(
    mlf_XP = 'Init',
    xp_name_iter= "Ini-Tweedie-sampleWeight", 
    pipeline=pipeline_Tweedie_SW, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_val, 
    y_test=y_val, 
    target_col='PurePremium', 
    weight_col='EXPO',
    use_weights=True)

Ini-Tweedie-sampleWeight
params:
subset                    train         test
metric                                      
D² explained             0.0195       0.0156
mean abs. error         56.0804      53.9778
mean squared error  223595.3580  218201.8055
elapsed time : 0:00:02.420781


### Opti Tweedie

In [390]:
params = {
    'Tweedie__power': 0
}


In [385]:
model = trainPipelineMlFlow(
    mlf_XP = 'Init',
    xp_name_iter= "Ini-TweedieSW-optiPower-0", 
    pipeline=pipeline_Tweedie_SW, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_val, 
    y_test=y_val, 
    target_col='PurePremium', 
    weight_col='EXPO',
    use_weights=True,
    fixed_params=params
)

Ini-TweedieSW-optiPower-0
params:
subset                    train         test
metric                                      
D² explained             0.0017       0.0014
mean abs. error         58.2854      56.1170
mean squared error  223634.9062  218239.2274
elapsed time : 0:00:02.702185


In [391]:
params = {
    'Tweedie__power': 1
}


In [392]:
model = trainPipelineMlFlow(
    mlf_XP = 'Init',
    xp_name_iter= "Ini-TweedieSW-optiPower-1", 
    pipeline=pipeline_Tweedie_SW, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_val, 
    y_test=y_val, 
    target_col='PurePremium', 
    weight_col='EXPO',
    use_weights=True,
    fixed_params=params
)

Ini-TweedieSW-optiPower-1
params:
subset                    train         test
metric                                      
D² explained             0.0619       0.0441
mean abs. error         58.0435      56.1713
mean squared error  223267.8422  218175.6337
elapsed time : 0:00:04.353207


In [393]:
params = {
    'Tweedie__power': 1.5
}


In [394]:
model = trainPipelineMlFlow(
    mlf_XP = 'Init',
    xp_name_iter= "Ini-TweedieSW-optiPower-1.5", 
    pipeline=pipeline_Tweedie_SW, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_val, 
    y_test=y_val, 
    target_col='PurePremium', 
    weight_col='EXPO',
    use_weights=True,
    fixed_params=params
)

Ini-TweedieSW-optiPower-1.5
params:
subset                    train         test
metric                                      
D² explained             0.0715       0.0555
mean abs. error         57.0900      55.0937
mean squared error  223453.8596  218143.4718
elapsed time : 0:00:03.113444


In [404]:
params = {
    'Tweedie__power': 1.75
}


In [405]:
model = trainPipelineMlFlow(
    mlf_XP = 'Init',
    xp_name_iter= "Ini-TweedieSW-optiPower-1.75", 
    pipeline=pipeline_Tweedie_SW, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_val, 
    y_test=y_val, 
    target_col='PurePremium', 
    weight_col='EXPO',
    use_weights=True,
    fixed_params=params
)

Ini-TweedieSW-optiPower-1.75
params:
subset                    train         test
metric                                      
D² explained             0.0447       0.0355
mean abs. error         56.4804      54.4151
mean squared error  223542.1760  218171.6343
elapsed time : 0:00:02.757547


In [402]:
params = {
    'Tweedie__power': 1.99
}


In [403]:
params = {
    'Tweedie__power': 1.99
}

model = trainPipelineMlFlow(
    mlf_XP = 'Init',
    xp_name_iter= "Ini-TweedieSW-optiPower-1.99", 
    pipeline=pipeline_Tweedie_SW, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_val, 
    y_test=y_val, 
    target_col='PurePremium', 
    weight_col='EXPO',
    use_weights=True,
    fixed_params=params
)

Ini-TweedieSW-optiPower-1.99
params:
subset                    train         test
metric                                      
D² explained             0.0020       0.0016
mean abs. error         55.8333      53.7116
mean squared error  223628.7999  218224.7150
elapsed time : 0:00:02.527945


## Enseignements

La masse en 0 est beaucoup trop fort (98,3%) pour permettre une modélisation classique. On bascule sur l'option 2 : I(N>0)*E[S]

### On essaie de savoir si Tweedie peut déjà aider sur les sinistres

In [409]:
params = {
    'Tweedie__power': 1.99
}

model = trainPipelineMlFlow(
    mlf_XP = 'Charge>0',
    xp_name_iter= "Chg-TweedieSW-optiPower-1.99", 
    pipeline=pipeline_Tweedie_SW, 
    X_train=X_train[y_train['NB']>0], 
    y_train=y_train[y_train['NB']>0], 
    X_test=X_val[y_val['NB']>0], 
    y_test=y_val[y_val['NB']>0], 
    target_col='PurePremium', 
    weight_col='EXPO',
    use_weights=True,
    fixed_params=params
)

INFO: 'Charge>0' does not exist. Creating a new experiment
Chg-TweedieSW-optiPower-1.99
params:
subset                     train          test
metric                                        
D² explained        4.750000e-02  6.900000e-03
mean abs. error     1.494872e+03  1.495566e+03
mean squared error  9.068450e+06  9.489292e+06
elapsed time : 0:00:00.135901


### On passe à RandomForest

In [None]:
# Define categorical pipeline
cat_pipe = Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                     ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))])

# Define numerical pipeline
num_pipe = Pipeline([('imputer', SimpleImputer(strategy='median')),
                     ('scaler', MinMaxScaler())])

num_pipe_binned = Pipeline([('imputer', SimpleImputer(strategy='median')),
                     ('scaler', KBinsDiscretizer(n_bins=10))])

num_pipe_log = Pipeline([('imputer', SimpleImputer(strategy='median')),
                     ('scaler', KBinsDiscretizer(n_bins=10))])


# Fit column transformer to training data
preprocessor = ColumnTransformer(transformers=[('cat', cat_pipe, ["FORMULE", "TYPE_RESIDENCE", "SITUATION_JURIDIQUE",'NIVEAU_JURIDIQUE','OBJETS_DE_VALEUR', 'ZONIER', 'NBSIN_TYPE1_AN1', 'NBSIN_TYPE1_AN3', 'NBSIN_TYPE2_AN1',  'NBSIN_TYPE2_AN3']),
                                               ('num_binned', num_pipe_binned, ["VALEUR_DES_BIENS"])],remainder='drop')

In [422]:
pipeline_RFR = PipelineSW(
    steps=[
        ('preprocess', preprocessor), 
        ('rf', RandomForestRegressor(random_state=42, n_estimators=100))
    ]
)

In [423]:
list(pipeline_RFR.get_params().keys())

['memory',
 'steps',
 'verbose',
 'preprocess',
 'rf',
 'preprocess__n_jobs',
 'preprocess__remainder',
 'preprocess__sparse_threshold',
 'preprocess__transformer_weights',
 'preprocess__transformers',
 'preprocess__verbose',
 'preprocess__cat',
 'preprocess__num_binned',
 'preprocess__cat__memory',
 'preprocess__cat__steps',
 'preprocess__cat__verbose',
 'preprocess__cat__imputer',
 'preprocess__cat__encoder',
 'preprocess__cat__imputer__add_indicator',
 'preprocess__cat__imputer__copy',
 'preprocess__cat__imputer__fill_value',
 'preprocess__cat__imputer__missing_values',
 'preprocess__cat__imputer__strategy',
 'preprocess__cat__imputer__verbose',
 'preprocess__cat__encoder__categories',
 'preprocess__cat__encoder__drop',
 'preprocess__cat__encoder__dtype',
 'preprocess__cat__encoder__handle_unknown',
 'preprocess__cat__encoder__sparse',
 'preprocess__num_binned__memory',
 'preprocess__num_binned__steps',
 'preprocess__num_binned__verbose',
 'preprocess__num_binned__imputer',
 'prepro

In [None]:
model = trainPipelineMlFlow(
    mlf_XP = 'Charge>0',
    xp_name_iter= "Chg-RFR", 
    pipeline=pipeline_RFR, 
    X_train=X_train[y_train['NB']>0], 
    y_train=y_train[y_train['NB']>0], 
    X_test=X_val[y_val['NB']>0], 
    y_test=y_val[y_val['NB']>0], 
    target_col='PurePremium', 
    weight_col='EXPO',
    use_weights=True,
    fixed_params=params
)