# **TRAINING MODEL**

## **LOADING DATA**

In [30]:
# Import dataset

file_score = "../DATAS/CleanALL_EHCVM_ML.csv"

## **IMPORTS LIBRARIES**

In [31]:
# Imports Libairies

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
import joblib

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, LabelBinarizer
from sklearn.model_selection import RandomizedSearchCV, cross_val_score

from sklearn.metrics import accuracy_score, precision_score, recall_score, r2_score, mean_squared_error, confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV 


In [32]:
# Import Modèles

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier 

from sklearn.linear_model import LinearRegression

from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)

from sklearn.ensemble import StackingClassifier, VotingClassifier

from sklearn.svm import SVC

from sklearn.svm import LinearSVC

from sklearn.inspection import DecisionBoundaryDisplay

from sklearn.ensemble import BaggingClassifier



import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

## **READ DATA**

In [33]:
df_score = pd.read_csv(file_score)
C_df_score = df_score.copy()
C_df_score

Unnamed: 0,mstat,rev_total_mois,age_grp,empl_formel,bancarise,a_assurance,logem
0,Célibataire,279244.0,25-34,1,1,0,Locataire
1,Célibataire,0.0,0-17,0,0,0,Locataire
2,Célibataire,0.0,0-17,0,0,0,Locataire
3,Célibataire,0.0,0-17,0,0,0,Locataire
4,Marié(e),0.0,25-34,0,0,0,Locataire
...,...,...,...,...,...,...,...
64469,Célibataire,0.0,0-17,0,0,0,Proprietaire titre
64470,Célibataire,0.0,0-17,0,0,0,Proprietaire titre
64471,Célibataire,0.0,0-17,0,0,0,Proprietaire titre
64472,Célibataire,0.0,0-17,0,0,0,Proprietaire titre


## **LABELING OF VARIABLES**

In [34]:
label_encoders = {}

# Variables catégorielles à encoder
cat_cols = ['mstat', 'age_grp', 'logem']

for col in cat_cols:
    le = LabelEncoder()
    C_df_score[col] = le.fit_transform(C_df_score[col])
    label_encoders[col] = le
C_df_score

Unnamed: 0,mstat,rev_total_mois,age_grp,empl_formel,bancarise,a_assurance,logem
0,0,279244.0,2,1,1,0,1
1,0,0.0,0,0,0,0,1
2,0,0.0,0,0,0,0,1
3,0,0.0,0,0,0,0,1
4,2,0.0,2,0,0,0,1
...,...,...,...,...,...,...,...
64469,0,0.0,0,0,0,0,3
64470,0,0.0,0,0,0,0,3
64471,0,0.0,0,0,0,0,3
64472,0,0.0,0,0,0,0,3


In [35]:
# Afficher les mappings pour chaque variable
for col in cat_cols:
    le = label_encoders[col]
    mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    print(f"Mapping '{col}':")
    for k, v in mapping.items():
        print(f"  {k} -> {v}")
    print("-" * 30)

Mapping 'mstat':
  Célibataire -> 0
  Divorcé(e) -> 1
  Marié(e) -> 2
  Séparé -> 3
  Union libre -> 4
  Veuf(ve) -> 5
------------------------------
Mapping 'age_grp':
  0-17 -> 0
  18-24 -> 1
  25-34 -> 2
  35-44 -> 3
  45-54 -> 4
  55-64 -> 5
  65+ -> 6
------------------------------
Mapping 'logem':
  Autre -> 0
  Locataire -> 1
  Proprietaire sans titre -> 2
  Proprietaire titre -> 3
------------------------------


## **CREATION OF THE TARGET**

In [36]:
C_df_score

Unnamed: 0,mstat,rev_total_mois,age_grp,empl_formel,bancarise,a_assurance,logem
0,0,279244.0,2,1,1,0,1
1,0,0.0,0,0,0,0,1
2,0,0.0,0,0,0,0,1
3,0,0.0,0,0,0,0,1
4,2,0.0,2,0,0,0,1
...,...,...,...,...,...,...,...
64469,0,0.0,0,0,0,0,3
64470,0,0.0,0,0,0,0,3
64471,0,0.0,0,0,0,0,3
64472,0,0.0,0,0,0,0,3


In [50]:
def score_inclusion_financiere(row):
    score = 0

    # Situation Matrimoniale (20 points)
    if row['mstat'] in ['2', '4']:
        score += 20
    elif row['mstat'] in ['0', '1', '3']:
        score += 8
    elif row['mstat'] == '5':
        score += 10
    
    # Groupe d'Âge (10 points)
    if row['age_grp'] in ['1', '6']:
        score += 5
    elif row['age_grp'] in ['3', '4']:
        score += 10
    elif row['age_grp'] in ['2', '5']:
        score += 7
    
    # Bancarisation (30 points)
    if row['bancarise'] == 1:
        score += 30
    
    # Assurance (10 points)
    if row['a_assurance'] == 1:
        score += 10
    
    # Emploi formel (10 points)
    if row['empl_formel'] == 1:
        score += 10
    
    # Logement (20 points)
    if row['logem'] in ['2', '3']:
        score += 30
    elif row['logem'] == '1':
        score += 10
    
    return score
C_df_score['Profil_Score'] = C_df_score.apply(score_inclusion_financiere, axis=1)

In [51]:
C_df_score['Profil_Score'].unique()

array([40,  0, 30, 10, 50, 20])

In [52]:
# Separation des caractérisques et de la cible

X = C_df_score.drop(columns=['Profil_Score'])
y = C_df_score['Profil_Score']

## **STANDARDIZATION OF VARIABLES**

In [53]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [54]:
# Données d'entrainement et de test

X_train_s, X_test_s, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=40)
print(f"X_train :{X_train_s.shape}")
print(f"y_train :{y_train.shape}")
print(f"X_test :{X_test_s.shape}")
print(f"y_test :{y_test.shape}")

X_train :(51579, 8)
y_train :(51579,)
X_test :(12895, 8)
y_test :(12895,)


## **MODEL TRAINING PROCESS**