# Modélisation Méthode 2 - `E[I]*E[S|I]`

## setup

### Import des fichiers

In [102]:
#Temps et fichiers
import os
import warnings
import time
from datetime import timedelta

#Manipulation de données
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
from functools import partial


#Modélisation
from sklearn.datasets import fetch_openml
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import PoissonRegressor, GammaRegressor
from sklearn.linear_model import TweedieRegressor
from sklearn.metrics import mean_tweedie_deviance
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, auc

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import LinearSVC
from sklearn.model_selection import RandomizedSearchCV# the keys can be accessed with final_pipeline.get_params().keys()
from sklearn.linear_model import LogisticRegression

from xgboost import XGBClassifier

#Text
import re

#Evaluation
from sklearn.metrics import f1_score, confusion_matrix


#Visualisation
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


#Tracking d'expérience
import mlflow
import mlflow.sklearn

### Utilisation du code du projet packagé

In [103]:
#Cette cellule permet d'appeler la version packagée du projet et d'en assurer le reload avant appel des fonctions
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [104]:
random_state=42

### Configuration de l'experiment MLFlow

In [105]:
mlflow.tracking.get_tracking_uri()

'/mnt/experiments'

## Chargement des données

In [106]:
# On Importe les données

#df
df_merged =pd.read_parquet('/mnt/data/interim/df_merged.gzip')
df_train=pd.read_parquet('/mnt/data/interim/df_train.gzip')
df_val=pd.read_parquet('/mnt/data/interim/df_val.gzip')

#X
X_train=pd.read_parquet('/mnt/data/interim/X_train.gzip')
X_val=pd.read_parquet('/mnt/data/interim/X_val.gzip')
X_test=pd.read_parquet('/mnt/data/interim/X_test.gzip')

#y
y_train=pd.read_parquet('/mnt/data/interim/y_train.gzip')
y_val=pd.read_parquet('/mnt/data/interim/y_val.gzip')

## Modélisation de l'indicatrice de sinistres

In [49]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline

In [55]:
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, precision_recall_fscore_support

In [72]:
from hackathondsa_groupe4.scripts.train import trainPipelineMlFlow

In [47]:
# Define categorical pipeline
cat_pipe = Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                     ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))])

# Define numerical pipeline
num_pipe = Pipeline([('imputer', SimpleImputer(strategy='median')),
                     ('scaler', MinMaxScaler())])

num_pipe_binned = Pipeline([('imputer', SimpleImputer(strategy='median')),
                     ('scaler', KBinsDiscretizer(n_bins=10))])

num_pipe_log = Pipeline([('imputer', SimpleImputer(strategy='median')),
                     ('scaler', KBinsDiscretizer(n_bins=10))])


# Fit column transformer to training data
preprocessor = ColumnTransformer(transformers=[('cat', cat_pipe, ["FORMULE", "TYPE_RESIDENCE", "SITUATION_JURIDIQUE",'NIVEAU_JURIDIQUE','OBJETS_DE_VALEUR', 'ZONIER', 'NBSIN_TYPE1_AN1', 'NBSIN_TYPE1_AN3', 'NBSIN_TYPE2_AN1',  'NBSIN_TYPE2_AN3']),
                                               ('num_binned', num_pipe_binned, ["VALEUR_DES_BIENS"])],remainder='drop')

In [73]:
class PipelineSW(Pipeline):
    def fit(self, X, y, sample_weight=None):
        """Fit and pass sample weights only to the last step"""
        if sample_weight is not None:
            kwargs = {self.steps[-1][0] + '__sample_weight': sample_weight}
        else:
            kwargs = {}
        return super().fit(X, y, **kwargs)

In [78]:
from imblearn.pipeline import make_pipeline


PipeClassif = PipelineSW(
                steps=[
                    ('preprocess', preprocessor), 
                    ('classif', RandomForestClassifier(n_estimators=50, max_depth=5,class_weight="balanced"))
                ]
)


In [79]:
from imblearn.pipeline import make_pipeline

PipeImb = make_pipeline(
    RandomUnderSampler(random_state=0),
    PipeClassif
)

In [81]:
model = trainPipelineMlFlow(
    mlf_XP = 'Isin',
    xp_name_iter= "Essai_imbalanced_2", 
    pipeline=PipeImb, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_val, 
    y_test=y_val, 
    target_col='Isin', 
#    weight_col='EXPO',
    use_weights=False
)

Essai_imbalanced_2
params:
subset               train    test
metric                            
D² explained        0.5389  0.5388
mean abs. error     0.4611  0.4612
mean squared error  0.4611  0.4612
elapsed time : 0:00:03.452119


### test sans undersampling

In [92]:
from imblearn.pipeline import make_pipeline


PipeClassif = PipelineSW(
                steps=[
                    ('preprocess', preprocessor), 
                    ('classif', RandomForestClassifier(n_estimators=50, max_depth=5,class_weight="balanced"))
                ]
)


In [87]:
from imblearn.pipeline import make_pipeline

PipeImbStd = make_pipeline(
#    RandomUnderSampler(random_state=0),
    PipeClassif
)

In [91]:
model = trainPipelineMlFlow(
    mlf_XP = 'Isin',
    xp_name_iter= "Essai_sans_imbalanced_3", 
    pipeline=PipeClassif, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_val, 
    y_test=y_val, 
    target_col='Isin', 
#    weight_col='EXPO',
    use_weights=False
)

Essai_sans_imbalanced_3
params:
subset               train    test
metric                            
D² explained        0.9827  0.9827
mean abs. error     0.0173  0.0173
mean squared error  0.0173  0.0173
elapsed time : 0:00:05.365246


### test BalancedRandomForestClassifier

In [94]:
from imblearn.ensemble import BalancedRandomForestClassifier

In [95]:
from imblearn.pipeline import make_pipeline


PipeClassifBRC = PipelineSW(
                steps=[
                    ('preprocess', preprocessor), 
                    ('classif', BalancedRandomForestClassifier(n_estimators=100, random_state=0))
                ]
)


In [87]:
from imblearn.pipeline import make_pipeline

PipeImbStd = make_pipeline(
#    RandomUnderSampler(random_state=0),
    PipeClassif
)

In [96]:
model = trainPipelineMlFlow(
    mlf_XP = 'Isin',
    xp_name_iter= "Essai_sans_imbalanced_BRC", 
    pipeline=PipeClassifBRC, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_val, 
    y_test=y_val, 
    target_col='Isin', 
#    weight_col='EXPO',
    use_weights=False
)

Essai_sans_imbalanced_BRC
params:
subset               train   test
metric                           
D² explained        0.5329  0.522
mean abs. error     0.4671  0.478
mean squared error  0.4671  0.478
elapsed time : 0:00:12.510962


## Modélisation de la charge

In [164]:
X_train_chg = X_train[y_train['Isin']==1]
X_val_chg = X_val[y_val['Isin']==1]
y_train_chg= y_train[y_train['Isin']==1]
y_val_chg = y_val[y_val['Isin']==1]

### Utilisation d'un BalancedRandomForestRegressor

In [94]:
from imblearn.ensemble import BalancedRandomForestClassifier

In [100]:
from imblearn.pipeline import make_pipeline


PipeRegRF = PipelineSW(
                steps=[
                    ('preprocess', preprocessor), 
                    ('classif', RandomForestRegressor(n_estimators=100, random_state=0))
                ]
)


In [101]:
model = trainPipelineMlFlow(
    mlf_XP = 'Chg',
    xp_name_iter= "Essai_RF", 
    pipeline=PipeRegRF, 
    X_train=X_train_chg, 
    y_train=y_train_chg, 
    X_test=X_val_chg, 
    y_test=y_val_chg, 
    target_col='COUT', 
#    weight_col='EXPO',
    use_weights=False
)

INFO: 'Chg' does not exist. Creating a new experiment
Essai_RF
params:
subset                     train          test
metric                                        
D² explained        5.653000e-01 -1.610000e-01
mean abs. error     8.767360e+02  1.502462e+03
mean squared error  2.456081e+06  7.839583e+06
elapsed time : 0:00:01.054774


## Utilisation des variables retravaillées

In [146]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextSelector(BaseEstimator, TransformerMixin):
    def __init__(self, field):
        self.field = field
    def fit(self, X, y=None):
        return str(self)
    def transform(self, X):
        return str(X[self.field])

In [142]:
"NB_PIECES", "TYPE_HABITATION", 

('NB_PIECES', 'TYPE_HABITATION')

In [159]:
# Define categorical pipeline
cat_pipe = Pipeline([
 #                       ('text', TextSelector),
#                        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                        ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))
                    ])

cat_ordinal_pipe = Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                     ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))])



# Define numerical pipeline
num_pipe = Pipeline([('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
                     ('scaler', MinMaxScaler())])

num_pipe_binned = Pipeline([('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
                     ('scaler', KBinsDiscretizer(n_bins=10))])

num_pipe_log = Pipeline([('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
                     ('scaler', KBinsDiscretizer(n_bins=10))])


# Fit column transformer to training data
preprocessor2 = ColumnTransformer(
                transformers=[
                                ('cat', cat_pipe, ["FORMULE", "TYPE_RESIDENCE", "NB_PIECES", "TYPE_HABITATION", "SITUATION_JURIDIQUE",'OBJETS_DE_VALEUR', 'ZONIER_2', 'NBSIN_TYPE1_AN1_RECODE', 'NBSIN_TYPE1_AN3_RECODE']),
                                #('ordinal', cat_ordinal_pipe, ["NB_PIECES"])
                                ('num_binned', num_pipe_binned, ["VALEUR_DES_BIENS"])
                ],remainder='drop'
                )

In [160]:
PipeClassif2 = PipelineSW(
                steps=[
                    ('preprocess', preprocessor2), 
                    ('classif', RandomForestClassifier(n_estimators=50, max_depth=5,class_weight="balanced"))
                ]
)


In [161]:
model = trainPipelineMlFlow(
    mlf_XP = 'Isin',
    xp_name_iter= "Classif2", 
    pipeline=PipeClassif2, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_val, 
    y_test=y_val, 
    target_col='Isin', 
#    weight_col='EXPO',
    use_weights=False
)

Classif2
params:
subset               train    test
metric                            
D² explained        0.5199  0.5174
mean abs. error     0.4801  0.4826
mean squared error  0.4801  0.4826
elapsed time : 0:00:03.965551


In [162]:
PipeRegRF2 = PipelineSW(
                steps=[
                    ('preprocess', preprocessor2), 
                    ('classif', RandomForestRegressor(n_estimators=100, random_state=0))
                ]
)


In [165]:
model = trainPipelineMlFlow(
    mlf_XP = 'Chg',
    xp_name_iter= "Essai_RF2", 
    pipeline=PipeRegRF2, 
    X_train=X_train_chg, 
    y_train=y_train_chg, 
    X_test=X_val_chg, 
    y_test=y_val_chg, 
    target_col='COUT', 
#    weight_col='EXPO',
    use_weights=False
)

Essai_RF2
params:
subset                     train          test
metric                                        
D² explained        3.307000e-01 -1.009000e-01
mean abs. error     1.139951e+03  1.469016e+03
mean squared error  3.781841e+06  7.433461e+06
elapsed time : 0:00:00.508914


## Essaie de rajoût de l'année pour absorber les effets d'inflation

In [167]:
# Define categorical pipeline
cat_pipe = Pipeline([
 #                       ('text', TextSelector),
#                        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                        ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))
                    ])

cat_ordinal_pipe = Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                     ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))])



# Define numerical pipeline
num_pipe = Pipeline([('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
                     ('scaler', MinMaxScaler())])

num_pipe_binned = Pipeline([('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
                     ('scaler', KBinsDiscretizer(n_bins=10))])

num_pipe_log = Pipeline([('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
                     ('scaler', KBinsDiscretizer(n_bins=10))])


# Fit column transformer to training data
preprocessor3 = ColumnTransformer(
                transformers=[
                                ('cat', cat_pipe, ["ANNEE","FORMULE", "TYPE_RESIDENCE", "NB_PIECES", "TYPE_HABITATION", "SITUATION_JURIDIQUE",'OBJETS_DE_VALEUR', 'ZONIER_2', 'NBSIN_TYPE1_AN1_RECODE', 'NBSIN_TYPE1_AN3_RECODE']),
                                #('ordinal', cat_ordinal_pipe, ["NB_PIECES"])
                                ('num_binned', num_pipe_binned, ["VALEUR_DES_BIENS"])
                ],remainder='drop'
                )

In [168]:
PipeClassif3 = PipelineSW(
                steps=[
                    ('preprocess', preprocessor2), 
                    ('classif', RandomForestClassifier(n_estimators=50, max_depth=5,class_weight="balanced"))
                ]
)


In [169]:
model = trainPipelineMlFlow(
    mlf_XP = 'Isin',
    xp_name_iter= "Classif3", 
    pipeline=PipeClassif3, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_val, 
    y_test=y_val, 
    target_col='Isin', 
#    weight_col='EXPO',
    use_weights=False
)

Classif3
params:
subset               train    test
metric                            
D² explained        0.5509  0.5475
mean abs. error     0.4491  0.4525
mean squared error  0.4491  0.4525
elapsed time : 0:00:03.881234


In [170]:
PipeRegRF3 = PipelineSW(
                steps=[
                    ('preprocess', preprocessor3), 
                    ('classif', RandomForestRegressor(n_estimators=100, random_state=0))
                ]
)


In [171]:
model = trainPipelineMlFlow(
    mlf_XP = 'Chg',
    xp_name_iter= "Essai_RF3", 
    pipeline=PipeRegRF3, 
    X_train=X_train_chg, 
    y_train=y_train_chg, 
    X_test=X_val_chg, 
    y_test=y_val_chg, 
    target_col='COUT', 
#    weight_col='EXPO',
    use_weights=False
)

Essai_RF3
params:
subset                     train          test
metric                                        
D² explained        4.942000e-01 -2.158000e-01
mean abs. error     9.867570e+02  1.576058e+03
mean squared error  2.858103e+06  8.209609e+06
elapsed time : 0:00:00.586785
