# *Pipeline de Prétraitement*

In [50]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, FunctionTransformer, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score, precision_score
from sklearn.ensemble import StackingClassifier
from sklearn.cluster import KMeans

**Marquage des âges invalides**

In [52]:
def mark_invalid_ages_as_nan(X):
    X[X <= 0] = pd.NA
    return X


In [54]:
def encode_classes_binary(y, class_mapping):
    # Convertir les étiquettes en codes numériques
    y_encoded = y.map(class_mapping)
    # Convertir les codes numériques en chaînes binaires
    y_encoded_binary = y_encoded.apply(lambda x: f"{x:02b}")
    return y_encoded_binary


**Encodage et desencodage des labels**

In [116]:
class CustomLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.label_encoders = {}

    def fit(self, X, y=None):
        for col in X.columns:
            le = LabelEncoder()
            if X[col].dtype == 'object' or pd.api.types.is_bool_dtype(X[col]):
                le.fit(X[col])
                self.label_encoders[col] = le
        return self
    
    def transform(self, X):
        X_copy = X.copy()
        for col, le in self.label_encoders.items():
            if col in X.columns:  # Vérifier si la colonne est présente dans X
                X_copy[col] = le.transform(X[col])
        return X_copy
        

In [146]:
def to_lower(x):
    df = pd.DataFrame(x)
    return df.map(lambda val: val.lower() if isinstance(val, str) else val)

**Pretraitement des données**

In [148]:
def preprocess_data(data, target_column):
    # Séparation des features et de la target
    X = data.drop(target_column, axis=1)
    y = data[target_column]

    # Identification des colonnes numériques et catégorielles
    numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object', 'bool']).columns

    # Pipeline pour les features numériques
    numeric_transformer = Pipeline(steps=[
        ('age_marker', FunctionTransformer(mark_invalid_ages_as_nan, validate=False)),  # Marque les âges non logiques comme NaN
        ('imputer', KNNImputer(n_neighbors=5)),  # Utilise KNN pour l'imputation
        ('to_integer', FunctionTransformer(to_lower)),
        ('scaler', MinMaxScaler())  # Normalise entre 0 et 1
    ])

    # Imputation et encodage LabelEncoder pour les features catégorielles
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # Convert NumPy array to DataFrame for the CustomLabelEncoder
        ('to_dataframe', FunctionTransformer(to_lower)),
        ('label_encoder', CustomLabelEncoder()),
        ('scaler', MinMaxScaler())
    ])

    # Transformation des données
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ], remainder='passthrough'
    )

    # Pipeline complet
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor)
    ])

    # Application du pipeline sur les données
    X_preprocessed = pipeline.fit_transform(X)

    # Encoder la cible (y) avec LabelEncoder
    #le_y = LabelEncoder()
   # y_encoded = le_y.fit_transform(y)  # Assurez-vous d'ajuster le label encoder ici
    # Dictionnaire pour la conversion des classes en codes binaires
    class_mapping = {
        'RE': 0,  # 00
        'RF': 1,  # 01
        'RM': 2   # 10
    }
    y_encoded_binary = encode_classes_binary(y, class_mapping)
    scaler = MinMaxScaler()
    # y_preprocessed = scaler.fit_transform(y_encoded.reshape(-1, 1)).flatten()
    y_preprocessed = y_encoded_binary
    return X_preprocessed, y_preprocessed, pipeline, preprocessor



### ***chargement des donnnées***



In [204]:
# Charger les données depuis un fichier CSV
file_path = 'clients.csv'
data = pd.read_csv(file_path, delimiter=';')
data.dtypes

# Spécifier la colonne cible pour la prédiction
target_column = 'riskLevel (Y)'

print("Valeurs uniques dans riskLevel (Y) :")
print(data['riskLevel (Y)'].unique())

# Prétraiter les données
X_preprocessed, y_preprocessed, pipeline, preprocessor = preprocess_data(data, target_column)
print(y_preprocessed[80])
data_df = pd.DataFrame(data, columns=data.drop(target_column, axis=1).columns)
X_preprocessed_df = pd.DataFrame(X_preprocessed, columns=data.drop(target_column, axis=1).columns)
y_preprocessed_df = pd.DataFrame(y_preprocessed)
print(y_preprocessed_df.iloc[116])
X_preprocessed_df.head(10)

Valeurs uniques dans riskLevel (Y) :
['RM' 'RE' 'RF']
00
riskLevel (Y)    01
Name: 116, dtype: object


Unnamed: 0,age,nationality,gender,Activites_label,Produits,Relation,Pays,isPEP,famCode,VoieDeDistribution
0,0.208333,0.9,1.0,0.0,0.166667,0.625,0.875,0.0,0.0,0.0
1,0.566667,0.9,1.0,0.12766,0.666667,0.0625,0.875,0.0,0.666667,0.0
2,0.291667,0.9,1.0,0.12766,0.0,0.5625,0.875,0.0,0.666667,0.0
3,0.375,0.9,1.0,0.12766,0.0,0.5625,0.875,0.0,0.0,0.0
4,0.458333,0.9,1.0,0.0,0.666667,0.0625,0.875,0.0,0.666667,0.0
5,0.325,0.9,1.0,0.191489,0.0,0.5625,0.875,0.0,0.666667,1.0
6,0.216667,0.9,0.0,0.723404,0.0,0.5625,0.875,0.0,0.0,0.0
7,0.183333,0.9,1.0,0.382979,0.166667,0.8125,0.875,0.0,0.0,0.0
8,0.491667,0.9,0.0,0.723404,0.0,0.5625,0.875,0.0,1.0,0.0
9,0.491667,0.9,1.0,0.12766,0.0,0.5625,0.875,0.0,0.666667,0.0


## ***Entraînement et Évaluation des Modèles***

**Modèles de Classification :**

In [152]:
# Split des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y_preprocessed, test_size=0.3, random_state=42)

# Définition des modèles et de leurs grilles de paramètres
models = {
    "Gradient Boosting": {
        "model": GradientBoostingClassifier(random_state=42),
        "params": {
            'classifier__n_estimators': [50, 100],
            'classifier__learning_rate': [0.01, 0.1],
            'classifier__max_depth': [3, 5],
            'classifier__min_samples_split': [2, 5],
            'classifier__min_samples_leaf': [1, 2]
        }
    },
    "Random Forest": {
        "model": RandomForestClassifier(class_weight='balanced', random_state=42),
        "params": {
            'classifier__n_estimators': [50, 100],
            'classifier__max_depth': [None, 10, 20],
            'classifier__min_samples_split': [2, 5],
            'classifier__min_samples_leaf': [1, 2],
            'classifier__bootstrap': [True, False]
        }
    },
    "Support Vector Machine": {
        "model": SVC(class_weight='balanced', random_state=42),
        "params": {
            'classifier__C': [0.1, 1.0, 10.0],
            'classifier__kernel': ['linear', 'rbf']
        }
    },
    "k-Nearest Neighbors": {
        "model": KNeighborsClassifier(),
        "params": {
            'classifier__n_neighbors': [3, 5, 7],
            'classifier__weights': ['uniform', 'distance']
        }
    },
    "Neural Network": {
        "model": MLPClassifier(max_iter=200, random_state=42),
        "params": {
            'classifier__hidden_layer_sizes': [(50,), (100,), (50, 50)],
            'classifier__activation': ['relu', 'tanh']
        }
    }
}


Equilibrage des classes avec SMOTE et Optimisation des paramétres avec GridSearchCV

In [154]:
# Dictionnaire pour la conversion des classes en codes binaires
class_mapping = {
    'RE': 0,  # 00
    'RF': 1,  # 01
    'RM': 2   # 10
}

# Entraînement et évaluation des modèles
for model_name, model_info in models.items():
    pipeline = ImbPipeline(steps=[
        ('smote', SMOTE(random_state=42)),
        ('classifier', model_info['model'])
    ])

    grid_search = GridSearchCV(estimator=pipeline,
                               param_grid=model_info['params'],
                               cv=3,
                               scoring='accuracy',
                               n_jobs=-1,
                               verbose=2)

    print(f"Training {model_name}...")
    grid_search.fit(X_train, y_train)

    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best score for {model_name}: {grid_search.best_score_}")

    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    print(f"Accuracy for {model_name}: {accuracy_score(y_test, y_pred)}")
    print(f"Classification Report for {model_name}:\n{classification_report(y_test, y_pred, target_names=class_mapping.keys())}\n")
    print(f"F1 Score for {model_name}: {f1_score(y_test, y_pred, average='weighted')}")
    print(f"Recall Score for {model_name}: {recall_score(y_test, y_pred, average='weighted')}")
    print(f"Precision Score for {model_name}: {precision_score(y_test, y_pred, average='weighted')}\n")


Training Gradient Boosting...
Fitting 3 folds for each of 32 candidates, totalling 96 fits
Best parameters for Gradient Boosting: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 5, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}
Best score for Gradient Boosting: 0.9882128618478072
Accuracy for Gradient Boosting: 0.9886777193691872
Classification Report for Gradient Boosting:
              precision    recall  f1-score   support

          RE       0.68      0.72      0.70        29
          RF       0.91      0.99      0.95        95
          RM       1.00      0.99      0.99      2349

    accuracy                           0.99      2473
   macro avg       0.86      0.90      0.88      2473
weighted avg       0.99      0.99      0.99      2473


F1 Score for Gradient Boosting: 0.9888686440439916
Recall Score for Gradient Boosting: 0.9886777193691872
Precision Score for Gradient Boosting: 0.9892057066051324

Trainin



In [156]:

# Split the original data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y_preprocessed, test_size=0.3, random_state=42)
# Create SMOTE object
smote = SMOTE(random_state=42)

# Apply SMOTE to the training data
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

gb_clf = GradientBoostingClassifier()
gb_clf.fit(X_resampled, y_resampled)

y_pred = gb_clf.predict(X_test)

# Calculate and print accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on test set: {accuracy:.8f}")

# Print Classification Report
print(f"Classification Report:\n{classification_report(y_test, y_pred, target_names=class_mapping.keys())}")

# Calculate and print F1 Score
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"F1 Score: {f1:.2f}")

# Calculate and print Recall Score
recall = recall_score(y_test, y_pred, average='weighted')
print(f"Recall Score: {recall:.2f}")

# Calculate and print Precision Score
precision = precision_score(y_test, y_pred, average='weighted')
print(f"Precision Score: {precision:.2f}")


Accuracy on test set: 0.98584715
Classification Report:
              precision    recall  f1-score   support

          RE       0.64      0.72      0.68        29
          RF       0.88      0.98      0.93        95
          RM       1.00      0.99      0.99      2349

    accuracy                           0.99      2473
   macro avg       0.84      0.90      0.87      2473
weighted avg       0.99      0.99      0.99      2473

F1 Score: 0.99
Recall Score: 0.99
Precision Score: 0.99


 Après avoir comparé les performances des différents modèles de classification (Gradient Boosting, Random Forest, SVM, k-Nearest Neighbors, et Neural Network), il est clair que chaque modèle présente des forces et des faiblesses. Les modèles Gradient Boosting et Random Forest montrent de bonnes performances globales avec des précisions de 98.79% et 98.38% respectivement. Cependant, tous les modèles éprouvent des difficultés à bien prédire la classe RE, qui est cruciale pour la classification des risques. => **nécessite d'empilement.**

## ***Classificateur d'Empilement (Stacking)***

In [158]:

# Define base models
base_models = [
    ('gb', GradientBoostingClassifier(learning_rate=0.1, max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=100, random_state=42)),
    ('rf', RandomForestClassifier(bootstrap=False, max_depth=20, min_samples_leaf=1, min_samples_split=10, n_estimators=50, random_state=42))
]

# Define the meta model
meta_model = SVC(kernel='linear', probability=True, random_state=42)

# Define the stacking classifier
stacking_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
    n_jobs=-1
)

# Fit the stacking classifier
stacking_clf.fit(X_train, y_train)

# Evaluate the final model
y_pred = stacking_clf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Classification Report:\n{classification_report(y_test, y_pred, target_names=class_mapping.keys())}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted')}")
print(f"Recall Score: {recall_score(y_test, y_pred, average='weighted')}")
print(f"Precision Score: {precision_score(y_test, y_pred, average='weighted')}")


Accuracy: 0.9943388596845936
Classification Report:
              precision    recall  f1-score   support

          RE       0.88      0.76      0.81        29
          RF       0.97      0.99      0.98        95
          RM       1.00      1.00      1.00      2349

    accuracy                           0.99      2473
   macro avg       0.95      0.92      0.93      2473
weighted avg       0.99      0.99      0.99      2473

F1 Score: 0.994198722881669
Recall Score: 0.9943388596845936
Precision Score: 0.9941725278142884


In [69]:
print (data_df)

      age nationality gender  \
0      26     Tunisie  homme   
1      69     Tunisie  homme   
2      36     Tunisie  homme   
3      46     Tunisie  homme   
4      56     Tunisie  homme   
...   ...         ...    ...   
8237   38     Tunisie  homme   
8238   35     Tunisie  homme   
8239   63     Tunisie  homme   
8240   44     Tunisie  femme   
8241   34     Tunisie  homme   

                                        Activites_label        Produits  \
0                  ADMINISTRATIONS  FONCTIONS PUBLIQUES  Autres risques   
1                                       AUTRES SERVICES      Prevoyance   
2                                       AUTRES SERVICES      AUTOMOBILE   
3                                       AUTRES SERVICES      AUTOMOBILE   
4                  ADMINISTRATIONS  FONCTIONS PUBLIQUES      Prevoyance   
...                                                 ...             ...   
8237               ADMINISTRATIONS  FONCTIONS PUBLIQUES      AUTOMOBILE   
8238           

 Cette approche a permis d'atteindre une précision globale de 99.41%, surpassant les performances de chaque modèle individuel et offrant une solution plus robuste et fiable pour la classification des niveaux de risque.

### Calcul du Coefficient de Risque

Identification des valeurs de caractéristiques à haut risque

In [160]:
print("Classes du modèle stacking classificateur :", stacking_clf.classes_)

Classes du modèle stacking classificateur : ['00' '01' '10']


In [162]:
print("Classes du modèle GB :", gb_clf.classes_)

Classes du modèle GB : ['00' '01' '10']


In [192]:
def calculate_risk_coefficient(model, X_preprocessed_df, y_preprocessed_df, data_df, feature_name, feature_value):
    # Convertir feature_value au même type que dans data_df
    feature_value = pd.Series([feature_value]).astype(data_df[feature_name].dtype)[0]

    # Trouver l'indice de feature_value dans les données originales
    if feature_value in data_df[feature_name].values:
        index = data_df[data_df[feature_name] == feature_value].index[0]
    else:
        raise ValueError(f"La valeur '{feature_value}' n'a pas été trouvée dans la colonne '{feature_name}' de data_df")

    # Prédire les probabilités en utilisant le modèle de stacking
    probas = model.predict_proba([X_preprocessed_df.iloc[index]])

    # Trouver l'indice de la classe avec la probabilité la plus élevée
    predicted_class_index = np.argmax(probas[0])

    # Trouver la classe prédite
    predicted_class = model.classes_[predicted_class_index]

    # Calculer le coefficient de risque en pourcentage basé sur la proportion de la classe prédite
    risk_percentage = probas[0][predicted_class_index] * 100

    # Déterminer le niveau de risque associé en fonction du coefficient de risque
    if predicted_class == '00':
        risk_suggestion = 'RE (Risque Élevé)'
    elif predicted_class == '01':
        risk_suggestion = 'RF (Risque Faible)'
    elif predicted_class == '10':
        risk_suggestion = 'RM (Risque Moyen)'
    else:
        risk_suggestion = 'Inconnu'

    return risk_percentage, risk_suggestion

# Utilisation de la fonction pour calculer le taux de risque en pourcentage et suggestion de risque
feature_name = 'Produits'
feature_value = 'Epargne'  # Remplacez par la valeur réelle que vous souhaitez interroger
risk_percentage, risk_suggestion = calculate_risk_coefficient(gb_clf, X_preprocessed_df, y_preprocessed_df, data_df, feature_name, feature_value)

print(f"Coefficient de risque pour {feature_name} = {feature_value}: {risk_percentage:.2f}%")
print(f"Suggestion de risque associée : {risk_suggestion}")


Coefficient de risque pour Produits = Epargne: 98.39%
Suggestion de risque associée : RE (Risque Élevé)


In [198]:
def calculate_risk_coefficients(model, X_preprocessed_df, y_preprocessed_df, data_df, risk_threshold=70):
    # Initialisation d'un dictionnaire pour stocker les résultats de haut risque
    high_risk_coefficients = {}

    # Itération sur toutes les colonnes de X_preprocessed_df
    for feature_name in X_preprocessed_df.columns:
        feature_values = data_df[feature_name].unique()
        feature_high_risk_coefficients = {}

        # Calculer le coefficient de risque pour chaque valeur de la colonne
        for feature_value in feature_values:
            try:
                risk_percentage, risk_suggestion = calculate_risk_coefficient(model, X_preprocessed_df, y_preprocessed_df, data_df, feature_name, feature_value)

                # Filtrer seulement les valeurs avec un risque élevé
                if risk_percentage >= risk_threshold:
                    feature_high_risk_coefficients[feature_value] = {
                        'risk_percentage': risk_percentage,
                        'risk_suggestion': risk_suggestion
                    }
            except ValueError as e:
                print(f"Erreur : {e}")

        # Stocker les résultats de haut risque pour la colonne actuelle
        if feature_high_risk_coefficients:
            high_risk_coefficients[feature_name] = feature_high_risk_coefficients

    return high_risk_coefficients

# Utilisation de la fonction pour calculer les coefficients de risque de haut risque
risk_threshold = 70  # Modifier ce seuil selon votre critère de haut risque
high_risk_coefficients = calculate_risk_coefficients(gb_clf, X_preprocessed_df, y_preprocessed_df, data_df, risk_threshold)

# Afficher les résultats de haut risque uniquement
print("=== Coefficient de risque de haut risque ===")
for feature_name, feature_values in high_risk_coefficients.items():
    print(f"=== Pour la colonne '{feature_name}' ===")
    for feature_value, risk_info in feature_values.items():
        print(f"Valeur: {feature_value} - Risque: {risk_info['risk_percentage']:.2f}% - Suggestion: {risk_info['risk_suggestion']}")
    print()


=== Coefficient de risque de haut risque ===
=== Pour la colonne 'age' ===
Valeur: 26 - Risque: 99.88% - Suggestion: RM (Risque Moyen)
Valeur: 69 - Risque: 91.74% - Suggestion: RM (Risque Moyen)
Valeur: 36 - Risque: 99.70% - Suggestion: RM (Risque Moyen)
Valeur: 46 - Risque: 99.61% - Suggestion: RM (Risque Moyen)
Valeur: 56 - Risque: 99.26% - Suggestion: RM (Risque Moyen)
Valeur: 40 - Risque: 97.72% - Suggestion: RM (Risque Moyen)
Valeur: 27 - Risque: 99.62% - Suggestion: RM (Risque Moyen)
Valeur: 23 - Risque: 99.89% - Suggestion: RM (Risque Moyen)
Valeur: 60 - Risque: 99.58% - Suggestion: RM (Risque Moyen)
Valeur: 41 - Risque: 99.81% - Suggestion: RM (Risque Moyen)
Valeur: 31 - Risque: 99.09% - Suggestion: RM (Risque Moyen)
Valeur: 55 - Risque: 77.94% - Suggestion: RM (Risque Moyen)
Valeur: 59 - Risque: 98.67% - Suggestion: RM (Risque Moyen)
Valeur: 39 - Risque: 98.96% - Suggestion: RM (Risque Moyen)
Valeur: 25 - Risque: 82.40% - Suggestion: RM (Risque Moyen)
Valeur: 47 - Risque: 98.7

In [202]:
def predict_risk_coefficient_with_proba(model, preprocess_pipeline, feature_dict):
    # Convertir le dictionnaire en DataFrame
    input_df = pd.DataFrame([feature_dict])

    # Prétraiter les données
    preprocessed_input = preprocess_pipeline.transform(input_df)

    # Prédire les probabilités pour chaque classe
    risk_probas = model.predict_proba(preprocessed_input)[0]

    # Mapper les probabilités aux labels
    risk_mapping = {'00': 'RE', '01': 'RF', '10': 'RM'}

    # Convertir les labels des classes du modèle en chaînes binaires pour correspondre au mappage
    class_labels = model.classes_.astype(str)  # Convertir les classes en chaînes
    risk_percentages = {risk_mapping[class_labels[i]]: prob * 100 for i, prob in enumerate(risk_probas)}

    # Déterminer la classe prédite
    predicted_class_index = risk_probas.argmax()
    predicted_risk = risk_mapping[class_labels[predicted_class_index]]

    return predicted_risk, risk_percentages

# Exemple d'utilisation
feature_dict = {
    'age': 83,
    'nationality': 'Tunisie',
    'gender': 'homme',
    'Activites_label': 'AUTRES SERVICES',
    'Produits': 'Prevoyance',
    'Relation': 'Membre',
    'Pays': 'TUNISIE',
    'isPEP': False,
    'famCode': 'V',
    'VoieDeDistribution': 'Agences'
}

# Appel de la fonction pour prédire le coefficient de risque pour les caractéristiques fournies
predicted_risk, risk_percentages = predict_risk_coefficient_with_proba(gb_clf, preprocessor, feature_dict)

print(f"La classe prédite est: {predicted_risk}")
print("Probabilités pour chaque classe:")
for risk, percentage in risk_percentages.items():
    print(f"{risk}: {percentage:.2f}%")


La classe prédite est: RE
Probabilités pour chaque classe:
RE: 91.49%
RF: 0.02%
RM: 8.49%


In [83]:
from sklearn.pipeline import Pipeline
import joblib

# Supposons que tu as un préprocesseur nommé 'preprocessor' et ton modèle 'gb_clf'

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', gb_clf)
])

joblib.dump(pipeline, 'models/risk_pipeline.pkl')

['models/risk_pipeline.pkl']