In [65]:
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.tree import export_text

In [66]:
# Load the clustered dataset from CSV (relative path).
csv_path = "../DATAS/ANSTAT2021_clusters_PC.csv"
df = pd.read_csv(csv_path)

In [67]:
# Categorical variables to replace by integer codes
cat_vars = ['sex', 'marital_status', 'city', 'milieu_resid', 'region_name']

# Keep the encoders fitted by an earlier execution of this cell, if any.
# `df` is encoded in place, so blindly re-running the cell would refit every
# encoder on the already-encoded integers and silently replace the
# label -> code mappings with meaningless int -> int ones.
label_encoders = globals().get("label_encoders", {})

for col in cat_vars:
    # Skip columns that this cell has already encoded; a freshly reloaded
    # `df` (string labels again) is re-encoded as usual.
    already_encoded = col in label_encoders and pd.api.types.is_integer_dtype(df[col])
    if already_encoded:
        continue
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # kept so the codes can be mapped back to labels
df

Unnamed: 0,cluster,age_num,sex,marital_status,city,milieu_resid,region_name,bancarise
0,17,29,0,0,1,1,1,1
1,12,17,0,0,1,1,1,0
2,1,15,1,0,1,1,1,0
3,12,12,0,0,1,1,1,0
4,22,34,0,2,1,1,1,0
...,...,...,...,...,...,...,...,...
64469,89,11,1,0,395,0,23,0
64470,89,7,1,0,395,0,23,0
64471,89,10,1,0,395,0,23,0
64472,89,4,1,0,395,0,23,0


In [68]:
#df["region_name"].unique()

In [69]:
# Print, for every encoded variable, which integer code each label received
separator = "-" * 30
for col in cat_vars:
    le = label_encoders[col]
    labels = le.classes_
    mapping = dict(zip(labels, le.transform(labels)))
    print(f"Mapping '{col}':")
    for label, code in mapping.items():
        print(f"  {label} -> {code}")
    print(separator)

Mapping 'sex':
  Féminin -> 0
  Masculin -> 1
------------------------------
Mapping 'marital_status':
  Célibataire -> 0
  Divorcé(e) -> 1
  Marié(e) -> 2
  Séparé -> 3
  Union libre -> 4
  Veuf(ve) -> 5
------------------------------
Mapping 'city':
  ABENGOUROU -> 0
  ABIDJAN -> 1
  ABIGUI -> 2
  ABOISSO -> 3
  ABOISSO-COMOE -> 4
  ABONGOUA -> 5
  ABOUDE -> 6
  ADAOU -> 7
  ADIAKE -> 8
  ADJOUAN -> 9
  ADZOPE -> 10
  AFFERY -> 11
  AGBOVILLE -> 12
  AGNIBILEKROU -> 13
  AGOU -> 14
  AHOUANOU -> 15
  AKOBOISSUE -> 16
  AKOUPE -> 17
  ALEPE -> 18
  ALLOSSO -> 19
  AMELEKIA -> 20
  ANANDA -> 21
  ANANGUIE -> 22
  ANDE -> 23
  ANDO-KEKRENOU -> 24
  ANGODA -> 25
  ANIASSUE -> 26
  ANNEPE -> 27
  ANOUMABA -> 28
  ANYAMA -> 29
  ARRHA -> 30
  ASSAHARA -> 31
  ASSIE-KOUMASSI -> 32
  ASSIKOI -> 33
  ASSUEFRY -> 34
  ATTIEGOUAKRO -> 35
  ATTOBROU -> 36
  ATTOUTOU -> 37
  AYAME -> 38
  AYAOU-SRAN -> 39
  AZAGUIE -> 40
  BACANDA -> 41
  BADIKAHA -> 42
  BAGOHOUO -> 43
  BAKO -> 44
  BAKOUBLY ->

In [70]:
# Separate the target (cluster id) from the explanatory variables
target_col = 'cluster'
X = df.drop(columns=[target_col])
y = df[target_col]

In [71]:
# Names of the explanatory columns, in the order they appear in X
features = list(X.columns)
features

['age_num',
 'sex',
 'marital_status',
 'city',
 'milieu_resid',
 'region_name',
 'bancarise']

In [72]:
# (Optional) stratified hold-out split: 30% of the rows go to the test set,
# with a fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.3,
    random_state=42,
)

In [73]:
# Show the feature matrix (last expression of the cell) as a visual sanity check
X

Unnamed: 0,age_num,sex,marital_status,city,milieu_resid,region_name,bancarise
0,29,0,0,1,1,1,1
1,17,0,0,1,1,1,0
2,15,1,0,1,1,1,0
3,12,0,0,1,1,1,0
4,34,0,2,1,1,1,0
...,...,...,...,...,...,...,...
64469,11,1,0,395,0,23,0
64470,7,1,0,395,0,23,0
64471,10,1,0,395,0,23,0
64472,4,1,0,395,0,23,0


In [74]:
# Show the target series (cluster id per row) as a visual sanity check
y

0        17
1        12
2         1
3        12
4        22
         ..
64469    89
64470    89
64471    89
64472    89
64473    85
Name: cluster, Length: 64474, dtype: int64

# ================================================================
# Entraîner un arbre de décision pour "expliquer" les clusters
# ================================================================

In [75]:
# Deliberately constrained tree so the resulting rules stay readable
tree_params = {
    "max_depth": 4,          # cap the depth to keep the rules readable
    "min_samples_leaf": 30,  # avoid rules backed by only 2-3 individuals
    "random_state": 42,
}
tree_clf = DecisionTreeClassifier(**tree_params)

In [76]:
# Fit the constrained tree on the training split
tree_clf.fit(X=X_train, y=y_train)

In [77]:
# Accuracy of the constrained tree on the held-out test split
baseline_accuracy = tree_clf.score(X_test, y_test)
print("Score (accuracy) de l'arbre sur le test :", baseline_accuracy)

Score (accuracy) de l'arbre sur le test : 0.370263144289924


In [78]:
# Hyper-parameter combinations to evaluate for the decision tree
param_grid = dict(
    max_depth=[3, 4, 5, 6, 8, None],
    min_samples_leaf=[1, 5, 10, 20, 30],
    criterion=["gini", "entropy"],
)

# Base estimator; only the seed is fixed here
dt = DecisionTreeClassifier(random_state=42)

In [79]:
# 5-fold cross-validated grid search over `param_grid`, scored on accuracy
# and run in parallel (n_jobs=-1).
# NOTE(review): the grid allows max_depth=None and min_samples_leaf=1, so the
# selected tree may be much deeper than a human-readable one.
grid = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Keep the best estimator found by the search and report its parameters
best_dt = grid.best_estimator_
print(grid.best_params_)

{'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1}


In [80]:
# Evaluate the tuned tree on the held-out test split
y_pred = best_dt.predict(X_test)

tuned_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy :", tuned_accuracy)

# precision, recall and f1 for each cluster
per_cluster_report = classification_report(y_test, y_pred)
print(per_cluster_report)

Accuracy : 0.9925554464147237
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        97
           1       1.00      1.00      1.00       284
           2       0.97      1.00      0.99       107
           3       0.98      1.00      0.99       433
           4       1.00      1.00      1.00        84
           5       1.00      1.00      1.00        63
           6       1.00      0.99      1.00       671
           7       1.00      0.98      0.99       275
           8       0.98      0.98      0.98       275
           9       0.99      0.99      0.99       260
          10       1.00      0.99      1.00       235
          11       0.99      0.98      0.99       144
          12       1.00      0.99      0.99       299
          13       0.99      0.99      0.99       209
          14       1.00      1.00      1.00       410
          15       1.00      1.00      1.00       443
          16       0.99      1.00      1.00       1

In [81]:
# Confusion matrix of the tuned tree on the test split
# (rows: true cluster, columns: predicted cluster)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[ 97   0   0 ...   0   0   0]
 [  0 283   0 ...   0   0   0]
 [  0   0 107 ...   0   0   0]
 ...
 [  0   0   0 ... 156   0   0]
 [  0   0   0 ...   0 151   0]
 [  0   0   0 ...   0   0  79]]


# ================================
# 5. Extraction des règles
# ================================

In [82]:
# Human-readable if/else rules of the tuned tree.
# export_text only renders the first `max_depth` levels (default: 10) and
# replaces deeper branches by a "truncated branch" marker. The grid search can
# select an unbounded tree (max_depth=None), so pass the fitted depth to make
# sure every rule is exported in full.
tree_rules = export_text(
    best_dt,
    feature_names=features,
    max_depth=best_dt.get_depth(),
)
print("\n=== RÈGLES DE SEGMENTATION (ARBRE DE DÉCISION) ===")
print(tree_rules)


=== RÈGLES DE SEGMENTATION (ARBRE DE DÉCISION) ===
|--- marital_status <= 0.50
|   |--- milieu_resid <= 0.50
|   |   |--- bancarise <= 0.50
|   |   |   |--- sex <= 0.50
|   |   |   |   |--- region_name <= 2.50
|   |   |   |   |   |--- age_num <= 27.50
|   |   |   |   |   |   |--- class: 71
|   |   |   |   |   |--- age_num >  27.50
|   |   |   |   |   |   |--- age_num <= 58.00
|   |   |   |   |   |   |   |--- class: 80
|   |   |   |   |   |   |--- age_num >  58.00
|   |   |   |   |   |   |   |--- class: 7
|   |   |   |   |--- region_name >  2.50
|   |   |   |   |   |--- region_name <= 5.50
|   |   |   |   |   |   |--- region_name <= 4.50
|   |   |   |   |   |   |   |--- region_name <= 3.50
|   |   |   |   |   |   |   |   |--- age_num <= 33.00
|   |   |   |   |   |   |   |   |   |--- class: 27
|   |   |   |   |   |   |   |   |--- age_num >  33.00
|   |   |   |   |   |   |   |   |   |--- class: 80
|   |   |   |   |   |   |   |--- region_name >  3.50
|   |   |   |   |   |   |   |   |--- a

In [83]:
# Persist the extracted rules as a UTF-8 text file.
rules_path = Path("./Rules Clusters/regles_arbre_clusters.txt")
# No visible cell creates the output folder; create it on demand so the write
# does not fail with FileNotFoundError on a fresh checkout.
rules_path.parent.mkdir(parents=True, exist_ok=True)
with rules_path.open("w", encoding="utf-8") as f:
    f.write(tree_rules)

Exception ignored in: <function ResourceTracker.__del__ at 0x74e597d89c60>
Traceback (most recent call last):
  File "/home/didi/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/home/didi/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/home/didi/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x76b896191c60>
Traceback (most recent call last):
  File "/home/didi/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/home/didi/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/home/didi/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x7

# =========================================================
# 6. Application de la segmentation à un NOUVEL échantillon
# =========================================================
