In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import _tree
import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# ============================================================================
# 1. CHARGEMENT ET ENCODAGE DES DONNÉES
# ============================================================================

In [47]:
df = pd.read_csv("../DATAS/ANSTAT2021_clusters_PC.csv")

In [48]:
# Sauvegarde des données originales pour référence
df_original = df.copy()

In [49]:
# Variables catégorielles à encoder
cat_vars = ['sex', 'marital_status', 'city', 'milieu_resid', 'region_name']

# Dictionnaires pour stocker les encoders et les mappings inversés
label_encoders = {}
inverse_mappings = {}  # Pour reconvertir code → label original

for col in cat_vars:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

    # Créer le mapping inverse : {code_numérique: label_original}
    inverse_mappings[col] = dict(enumerate(le.classes_))


print("✓ Encodage terminé")
print(f"Mappings inversés disponibles pour: {list(inverse_mappings.keys())}")
df

✓ Encodage terminé
Mappings inversés disponibles pour: ['sex', 'marital_status', 'city', 'milieu_resid', 'region_name']


Unnamed: 0,cluster,age_num,sex,marital_status,city,milieu_resid,region_name,bancarise
0,17,29,0,0,1,1,1,1
1,12,17,0,0,1,1,1,0
2,1,15,1,0,1,1,1,0
3,12,12,0,0,1,1,1,0
4,22,34,0,2,1,1,1,0
...,...,...,...,...,...,...,...,...
64469,89,11,1,0,395,0,23,0
64470,89,7,1,0,395,0,23,0
64471,89,10,1,0,395,0,23,0
64472,89,4,1,0,395,0,23,0


# ============================================================================
# 2. PRÉPARATION DES DONNÉES POUR L'ENTRAÎNEMENT
# ============================================================================

In [50]:
y = df['cluster']
X = df.drop(columns=['cluster'])

In [51]:
features = X.columns.tolist()
features

['age_num',
 'sex',
 'marital_status',
 'city',
 'milieu_resid',
 'region_name',
 'bancarise']

In [52]:
# (Optionnel) split train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [53]:
print(f"\n✓ Données préparées:")
print(f"  - Features: {features}")
print(f"  - Train: {len(X_train)}, Test: {len(X_test)}")


✓ Données préparées:
  - Features: ['age_num', 'sex', 'marital_status', 'city', 'milieu_resid', 'region_name', 'bancarise']
  - Train: 45131, Test: 19343


# ================================================================
# 3. Entraîner un arbre de décision pour "expliquer" les clusters
# ================================================================

In [54]:
tree_clf = DecisionTreeClassifier(
    max_depth=4,   # limite la profondeur pour garder des règles lisibles
    min_samples_leaf=30,  # évite des règles sur 2-3 individus
    random_state=42
)

In [55]:
tree_clf.fit(X_train, y_train)

In [56]:
print("Score (accuracy) de l'arbre sur le test :", tree_clf.score(X_test, y_test))

Score (accuracy) de l'arbre sur le test : 0.370263144289924


In [57]:
param_grid = {
    "max_depth": [3, 4, 5, 6, 8, None],
    "min_samples_leaf": [1, 5, 10, 20, 30],
    "criterion": ["gini", "entropy"]
}

dt = DecisionTreeClassifier(random_state=42)

In [58]:
grid = GridSearchCV(
    dt,
    param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1
)
grid.fit(X_train, y_train)

best_dt = grid.best_estimator_
print(grid.best_params_)

{'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1}


In [59]:
y_pred = best_dt.predict(X_test)

print("Accuracy :", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))   # precision, recall, f1 par cluster

Accuracy : 0.9925554464147237
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        97
           1       1.00      1.00      1.00       284
           2       0.97      1.00      0.99       107
           3       0.98      1.00      0.99       433
           4       1.00      1.00      1.00        84
           5       1.00      1.00      1.00        63
           6       1.00      0.99      1.00       671
           7       1.00      0.98      0.99       275
           8       0.98      0.98      0.98       275
           9       0.99      0.99      0.99       260
          10       1.00      0.99      1.00       235
          11       0.99      0.98      0.99       144
          12       1.00      0.99      0.99       299
          13       0.99      0.99      0.99       209
          14       1.00      1.00      1.00       410
          15       1.00      1.00      1.00       443
          16       0.99      1.00      1.00       1

In [60]:
print(confusion_matrix(y_test, y_pred))

[[ 97   0   0 ...   0   0   0]
 [  0 283   0 ...   0   0   0]
 [  0   0 107 ...   0   0   0]
 ...
 [  0   0   0 ... 156   0   0]
 [  0   0   0 ...   0 151   0]
 [  0   0   0 ...   0   0  79]]


# ============================================================================
# 4. EXTRACTION DES RÈGLES LISIBLES (VALEURS ORIGINALES)
# ============================================================================

In [61]:
def decode_threshold_to_labels(feature, threshold, operator):
    """
    Convertit un threshold numérique en labels originaux
    """
    if feature not in inverse_mappings:
        # Pour age_num ou bancarise (pas de décodage nécessaire)
        if feature == "age_num":
            return f"{int(threshold)} ans"
        return f"{threshold:.2f}"
    
    mapping = inverse_mappings[feature]
    
    # Récupérer tous les codes possibles
    all_codes = sorted(mapping.keys())
    
    if operator == "<=":
        # Codes <= threshold
        selected_codes = [c for c in all_codes if c <= threshold]
    else:  # ">"
        # Codes > threshold
        selected_codes = [c for c in all_codes if c > threshold]
    
    # Convertir en labels originaux
    labels = [mapping[c] for c in selected_codes]
    
    return labels

In [62]:
def format_condition(feature, operator, threshold):
    """
    Formate une condition en langage naturel avec valeurs originales
    Optimise l'affichage en utilisant 'PAS DANS' quand c'est plus court
    """
    decoded = decode_threshold_to_labels(feature, threshold, operator)
    
    if isinstance(decoded, list):
        # Nettoyer les types numpy
        decoded_clean = [str(v).replace('np.int64(', '').replace(')', '') for v in decoded]
        
        if len(decoded_clean) == 0:
            return f"{feature} (aucune valeur)"
        elif len(decoded_clean) == 1:
            return f"{feature} = '{decoded_clean[0]}'"
        else:
            # Calculer si "PAS DANS" serait plus court
            if feature in inverse_mappings:
                all_values = list(inverse_mappings[feature].values())
                all_values_clean = [str(v).replace('np.int64(', '').replace(')', '') for v in all_values]
                excluded = [v for v in all_values_clean if v not in decoded_clean]
                #inclues = [v for v in all_values_clean if v in decoded_clean]
                
                # Si moins de valeurs exclues et que ça représente < 50% du total
                # if len(excluded) > 0 and len(excluded) < len(decoded_clean) and len(excluded) <= 10:
                #     values_str = "', '".join(excluded)
                #     return f"{feature} PAS DANS ['{values_str}']"
            
            # Sinon, afficher toutes les valeurs incluses
            values_str = "', '".join(decoded_clean)
            # print(f"values_str: {values_str}")
            # print(f"decoded: {decoded}")
            # print(f"Include: {inclues}")
            # print(f"Exclude: {excluded}")
            # print("------------------------------------------------------")
            return f"{feature} ∈ ['{values_str}']"
    else:
        return f"{feature} {operator} {decoded}"

In [63]:
def extract_rules_with_original_values(tree_model, feature_names):
    """
    Extrait toutes les règles de l'arbre avec les valeurs originales
    """
    tree = tree_model.tree_
    
    def recurse(node_id=0, current_conditions=None):
        if current_conditions is None:
            current_conditions = []
        
        rules = []
        
        # Feuille : on a trouvé une règle complète
        if tree.feature[node_id] == _tree.TREE_UNDEFINED:
            cluster = int(np.argmax(tree.value[node_id][0]))
            n_samples = int(tree.n_node_samples[node_id])
            rules.append({
                "conditions": current_conditions.copy(),
                "cluster": cluster,
                "n_samples": n_samples
            })
            return rules
        
        # Nœud interne : on continue la récursion
        feature = feature_names[tree.feature[node_id]]
        threshold = tree.threshold[node_id]
        
        # Branche gauche (<=)
        left_conditions = current_conditions + [(feature, "<=", threshold)]
        rules.extend(recurse(tree.children_left[node_id], left_conditions))
        
        # Branche droite (>)
        right_conditions = current_conditions + [(feature, ">", threshold)]
        rules.extend(recurse(tree.children_right[node_id], right_conditions))
        
        return rules
    
    return recurse()

In [64]:
# Extraction des règles
print("\n⏳ Extraction des règles avec valeurs originales...")
rules_raw = extract_rules_with_original_values(best_dt, features)
rules_raw


⏳ Extraction des règles avec valeurs originales...


[{'conditions': [('marital_status', '<=', np.float64(0.5)),
   ('milieu_resid', '<=', np.float64(0.5)),
   ('bancarise', '<=', np.float64(0.5)),
   ('sex', '<=', np.float64(0.5)),
   ('region_name', '<=', np.float64(2.5)),
   ('age_num', '<=', np.float64(27.5))],
  'cluster': 71,
  'n_samples': 388},
 {'conditions': [('marital_status', '<=', np.float64(0.5)),
   ('milieu_resid', '<=', np.float64(0.5)),
   ('bancarise', '<=', np.float64(0.5)),
   ('sex', '<=', np.float64(0.5)),
   ('region_name', '<=', np.float64(2.5)),
   ('age_num', '>', np.float64(27.5)),
   ('age_num', '<=', np.float64(58.0))],
  'cluster': 80,
  'n_samples': 11},
 {'conditions': [('marital_status', '<=', np.float64(0.5)),
   ('milieu_resid', '<=', np.float64(0.5)),
   ('bancarise', '<=', np.float64(0.5)),
   ('sex', '<=', np.float64(0.5)),
   ('region_name', '<=', np.float64(2.5)),
   ('age_num', '>', np.float64(27.5)),
   ('age_num', '>', np.float64(58.0))],
  'cluster': 7,
  'n_samples': 6},
 {'conditions': [('ma

In [65]:
rules_readable = []
for rule in rules_raw:
    # Regrouper les conditions par feature pour fusionner les répétitions
    conditions_by_feature = {}
    
    for feature, op, threshold in rule["conditions"]:
        if feature not in conditions_by_feature:
            conditions_by_feature[feature] = []
        conditions_by_feature[feature].append((op, threshold))
    
    # Fusionner les conditions multiples sur la même variable
    conditions_text = []
    for feature, ops_thresholds in conditions_by_feature.items():
        if len(ops_thresholds) == 1:
            # Une seule condition : format normal
            op, threshold = ops_thresholds[0]
            cond_str = format_condition(feature, op, threshold)
            conditions_text.append(cond_str)
        else:
            # Plusieurs conditions sur la même variable : FUSIONNER
            # Pour les variables catégorielles, calculer l'intersection
            if feature in inverse_mappings:
                all_possible = set(inverse_mappings[feature].values())
                current_set = all_possible.copy()
                
                for op, threshold in ops_thresholds:
                    decoded = decode_threshold_to_labels(feature, threshold, op)
                    if isinstance(decoded, list):
                        current_set &= set(decoded)
                
                # Créer la condition finale avec les valeurs fusionnées
                final_values = sorted(list(current_set))
                if len(final_values) == 0:
                    cond_str = f"{feature} (aucune valeur)"
                elif len(final_values) == 1:
                    cond_str = f"{feature} = '{final_values[0]}'"
                else:
                    values_str = "', '".join([str(v) for v in final_values])
                    cond_str = f"{feature} ∈ ['{values_str}']"
                
                conditions_text.append(cond_str)
            else:
                # Pour les variables numériques (age_num), garder les bornes min/max
                less_equal = [t for o, t in ops_thresholds if o == "<="]
                greater = [t for o, t in ops_thresholds if o == ">"]
                
                if less_equal and greater:
                    min_val = int(max(greater))
                    max_val = int(min(less_equal))
                    cond_str = f"{min_val} < {feature} <= {max_val} ans"
                elif less_equal:
                    max_val = int(min(less_equal))
                    cond_str = f"{feature} <= {max_val} ans"
                elif greater:
                    min_val = int(max(greater))
                    cond_str = f"{feature} > {min_val} ans"
                
                conditions_text.append(cond_str)
    
    rule_text = " ET ".join(conditions_text)
    
    rules_readable.append({
        "Règle_ID": len(rules_readable) + 1,
        "Conditions": rule_text,
        "Cluster_Assigné": rule["cluster"],
        "Nombre_Individus": rule["n_samples"]
    })

rules_df = pd.DataFrame(rules_readable)
print(f"✓ {len(rules_df)} règles extraites")
# rules_df

✓ 804 règles extraites


In [66]:
rules_df_sorted = rules_df.sort_values(
    by=["Cluster_Assigné", "Règle_ID"],
    ascending=[True, True]
).reset_index(drop=True)

rules_df_sorted.head(20)

Unnamed: 0,Règle_ID,Conditions,Cluster_Assigné,Nombre_Individus
0,61,marital_status = 'Célibataire' ET milieu_resid...,0,179
1,197,marital_status = 'Célibataire' ET milieu_resid...,0,3
2,200,marital_status = 'Célibataire' ET milieu_resid...,0,1
3,205,marital_status = 'Célibataire' ET milieu_resid...,0,24
4,274,marital_status = 'Célibataire' ET milieu_resid...,0,19
5,483,marital_status = 'Veuf(ve)' ET bancarise <= 0....,0,1
6,335,marital_status = 'Célibataire' ET milieu_resid...,1,367
7,338,marital_status = 'Célibataire' ET milieu_resid...,1,11
8,379,marital_status = 'Célibataire' ET milieu_resid...,1,274
9,390,marital_status = 'Célibataire' ET milieu_resid...,1,9


In [67]:
rules_df = rules_df.sort_values(
    by=["Cluster_Assigné", "Règle_ID"]
).reset_index(drop=True)
rules_df

Unnamed: 0,Règle_ID,Conditions,Cluster_Assigné,Nombre_Individus
0,61,marital_status = 'Célibataire' ET milieu_resid...,0,179
1,197,marital_status = 'Célibataire' ET milieu_resid...,0,3
2,200,marital_status = 'Célibataire' ET milieu_resid...,0,1
3,205,marital_status = 'Célibataire' ET milieu_resid...,0,24
4,274,marital_status = 'Célibataire' ET milieu_resid...,0,19
...,...,...,...,...
799,286,marital_status = 'Célibataire' ET milieu_resid...,95,3
800,295,marital_status = 'Célibataire' ET milieu_resid...,95,6
801,354,marital_status = 'Célibataire' ET milieu_resid...,95,103
802,420,marital_status = 'Célibataire' ET milieu_resid...,95,68


# ============================================================================
# 5. SAUVEGARDE DES RÈGLES ET DU MODÈLE
# ============================================================================

In [68]:
rules_df.to_csv("regles_segmentation_lisibles.csv", index=False, encoding="utf-8")
print("✓ Règles sauvegardées: regles_segmentation_lisibles.csv")

✓ Règles sauvegardées: regles_segmentation_lisibles.csv


In [69]:
# Sauvegarder le modèle et les encoders
with open("modele_clustering.pkl", "wb") as f:
    pickle.dump({
        "model": best_dt,
        "label_encoders": label_encoders,
        "inverse_mappings": inverse_mappings,
        "features": features
    }, f)
print("✓ Modèle sauvegardé: modele_clustering.pkl")

✓ Modèle sauvegardé: modele_clustering.pkl


In [70]:
# Afficher un échantillon de règles
print("\n" + "="*80)
print("EXEMPLE DE RÈGLES EXTRAITES (5 premières):")
print("="*80)
print(rules_df.head(10).to_string(index=False))


EXEMPLE DE RÈGLES EXTRAITES (5 premières):
 Règle_ID                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

# ============================================================================
# 6. FONCTION D'ASSIGNATION POUR NOUVELLES DONNÉES
# ============================================================================

In [71]:
def assigner_nouveaux_individus(csv_path, model_path="modele_clustering.pkl"):
    """
    Assigne des clusters à de nouveaux individus
    
    Args:
        csv_path: Chemin vers le CSV avec les nouvelles données
        model_path: Chemin vers le modèle sauvegardé
    
    Returns:
        DataFrame avec les clusters assignés
    """
    # Charger le modèle et les encoders
    with open(model_path, "rb") as f:
        saved_data = pickle.load(f)
    
    model = saved_data["model"]
    label_encoders = saved_data["label_encoders"]
    features = saved_data["features"]
    
    # Charger les nouvelles données
    new_data = pd.read_csv(csv_path)
    print(f"✓ {len(new_data)} nouveaux individus chargés")
    
    # Encoder les variables catégorielles avec les MÊMES encoders
    new_data_encoded = new_data.copy()
    for col in label_encoders.keys():
        if col in new_data_encoded.columns:
            le = label_encoders[col]
            # Gérer les valeurs inconnues
            new_data_encoded[col] = new_data_encoded[col].apply(
                lambda x: le.transform([x])[0] if x in le.classes_ else -1
            )
    
    # Prédiction
    X_new = new_data_encoded[features]
    clusters = model.predict(X_new)
    result = new_data.copy()
    result['Cluster_Assigné'] = clusters
    
    return result


In [None]:
# nouvelles_assignations = assigner_nouveaux_individus("./Paquets/Proxy.csv")
# nouvelles_assignations = assigner_nouveaux_individus("../DATAS/ANSTAT2021_clusters_PC.csv")
nouvelles_assignations = assigner_nouveaux_individus("../DATAS/ANSTAT2021_dataset_Clean.csv")

print(nouvelles_assignations['Cluster_Assigné'].nunique())
print(nouvelles_assignations['Cluster_Assigné'].unique())

# nouvelles_assignations.to_csv("resultats_assignation.csv", index=False)
nouvelles_assignations.to_csv("anstat_assignation.csv", index=False)

✓ 64474 nouveaux individus chargés
97
[17 12  1 22 56 94 23 11 33 34 91  3  9 60 81 55 92  8 10 71 88  7 31 13
 80 77 52 21 24 85 15  2 59 53 25 19  6 14 76 64 68 93 62 50 44 72 49 90
 96 83 84 57 78 29 36 42 38 32 26 54 40 61  0 74 45  4 30 79 66 48 70 39
 95 28 27 82 58 35 47 16 51 37 65 67  5 87 20 46 89 73 63 86 18 43 69 41
 75]


In [73]:
print("\n" + "="*80)
print("✓ PROCESSUS TERMINÉ")
print("="*80)
print("\nFichiers générés:")
print("  1. regles_segmentation_lisibles.csv - Règles en langage naturel")
print("  2. modele_clustering.pkl - Modèle + encoders pour nouvelles données")
print("\nPour assigner de nouveaux individus:")
print("  nouvelles = assigner_nouveaux_individus('votre_fichier.csv')")


✓ PROCESSUS TERMINÉ

Fichiers générés:
  1. regles_segmentation_lisibles.csv - Règles en langage naturel
  2. modele_clustering.pkl - Modèle + encoders pour nouvelles données

Pour assigner de nouveaux individus:
  nouvelles = assigner_nouveaux_individus('votre_fichier.csv')


# STATS SUR ALL DATAS ANSTAT

In [208]:
df_anstat = pd.read_csv("./anstat_assignation.csv")
df_anstat

Unnamed: 0,agro_zone,region_name,department,city,milieu_resid,hhweight,resid,sex,age_num,lien,...,cuisin,ordin,decod,car,superf,bancarise,volhor_manquant,salaire_mois_manquant,Stg_formel,Cluster_Assigné
0,ABIDJAN,AUTONOME D'ABIDJAN,ABIDJAN,ABIDJAN,Urbain,1098.1172,Oui,Féminin,29,"Fils, Fille",...,Non,Non,Oui,Non,0.0,1,0,0,1,17
1,ABIDJAN,AUTONOME D'ABIDJAN,ABIDJAN,ABIDJAN,Urbain,1098.1172,Oui,Féminin,17,"Fils, Fille",...,Non,Non,Oui,Non,0.0,0,1,1,0,12
2,ABIDJAN,AUTONOME D'ABIDJAN,ABIDJAN,ABIDJAN,Urbain,1098.1172,Oui,Masculin,15,"Fils, Fille",...,Non,Non,Oui,Non,0.0,0,1,1,0,1
3,ABIDJAN,AUTONOME D'ABIDJAN,ABIDJAN,ABIDJAN,Urbain,1098.1172,Oui,Féminin,12,"Fils, Fille",...,Non,Non,Oui,Non,0.0,0,1,1,0,12
4,ABIDJAN,AUTONOME D'ABIDJAN,ABIDJAN,ABIDJAN,Urbain,1098.1172,Oui,Féminin,34,Conjoint ( e ),...,Non,Non,Oui,Non,0.0,0,1,1,0,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64469,CENTRE,MORONOU,M'BATTO,TIEMELEKRO,Rural,242.7796,Oui,Masculin,11,Neveu/Nièce,...,Non,Non,Oui,Non,8566398.0,0,1,1,0,89
64470,CENTRE,MORONOU,M'BATTO,TIEMELEKRO,Rural,242.7796,Oui,Masculin,7,"Fils, Fille",...,Non,Non,Oui,Non,8566398.0,0,1,1,0,89
64471,CENTRE,MORONOU,M'BATTO,TIEMELEKRO,Rural,242.7796,Oui,Masculin,10,Neveu/Nièce,...,Non,Non,Oui,Non,8566398.0,0,1,1,0,89
64472,CENTRE,MORONOU,M'BATTO,TIEMELEKRO,Rural,242.7796,Oui,Masculin,4,"Fils, Fille",...,Non,Non,Oui,Non,8566398.0,0,1,1,0,89


In [209]:
df_anstat_sort = df_anstat.sort_values(
    by=["Cluster_Assigné"],
    ascending=[True]
).reset_index(drop=True)

df_anstat_sort

Unnamed: 0,agro_zone,region_name,department,city,milieu_resid,hhweight,resid,sex,age_num,lien,...,cuisin,ordin,decod,car,superf,bancarise,volhor_manquant,salaire_mois_manquant,Stg_formel,Cluster_Assigné
0,CENTRE,N'ZI,BOCANDA,BENGASSOU,Rural,288.38315,Oui,Féminin,4,Autres Parents du CM/Conjoint,...,Non,Non,Non,Non,0.000000e+00,0,1,1,0,0
1,CENTRE,N'ZI,BOCANDA,BENGASSOU,Rural,288.38315,Oui,Féminin,7,"Petit fils, petite fille",...,Non,Non,Non,Non,4.052600e+06,0,1,1,0,0
2,CENTRE,N'ZI,BOCANDA,BENGASSOU,Rural,315.77286,Oui,Féminin,3,"Petit fils, petite fille",...,Non,Non,Non,Non,3.400000e+00,0,1,1,0,0
3,CENTRE,N'ZI,BOCANDA,BENGASSOU,Rural,288.38315,Oui,Féminin,0,"Petit fils, petite fille",...,Non,Non,Non,Non,0.000000e+00,0,1,1,0,0
4,CENTRE,N'ZI,BOCANDA,BENGASSOU,Rural,288.38315,Oui,Féminin,0,"Petit fils, petite fille",...,Non,Non,Non,Non,0.000000e+00,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64469,SUD-EST,INDENIE-DJUABLIN,ABENGOUROU,ABENGOUROU,Rural,205.38361,Oui,Masculin,19,Autres Parents du CM/Conjoint,...,Non,Non,Non,Non,5.035685e+03,0,0,1,1,96
64470,SUD-EST,INDENIE-DJUABLIN,ABENGOUROU,ABENGOUROU,Rural,205.38361,Oui,Masculin,13,"Fils, Fille",...,Non,Non,Non,Non,1.491057e+07,0,1,1,0,96
64471,SUD-EST,INDENIE-DJUABLIN,ABENGOUROU,ABENGOUROU,Rural,205.38361,Oui,Masculin,27,"Frère, sœur",...,Non,Non,Non,Non,2.558464e+07,0,0,0,0,96
64472,SUD-EST,INDENIE-DJUABLIN,ABENGOUROU,ABENGOUROU,Rural,205.38361,Oui,Masculin,6,"Fils, Fille",...,Non,Non,Non,Non,3.001852e+05,0,1,1,0,96


In [210]:
df_anstat_sort["nb_lignes_cluster"] = (
    df_anstat_sort
        .groupby("Cluster_Assigné")["Cluster_Assigné"]
        .transform("size")
)
df_anstat_sort

Unnamed: 0,agro_zone,region_name,department,city,milieu_resid,hhweight,resid,sex,age_num,lien,...,ordin,decod,car,superf,bancarise,volhor_manquant,salaire_mois_manquant,Stg_formel,Cluster_Assigné,nb_lignes_cluster
0,CENTRE,N'ZI,BOCANDA,BENGASSOU,Rural,288.38315,Oui,Féminin,4,Autres Parents du CM/Conjoint,...,Non,Non,Non,0.000000e+00,0,1,1,0,0,324
1,CENTRE,N'ZI,BOCANDA,BENGASSOU,Rural,288.38315,Oui,Féminin,7,"Petit fils, petite fille",...,Non,Non,Non,4.052600e+06,0,1,1,0,0,324
2,CENTRE,N'ZI,BOCANDA,BENGASSOU,Rural,315.77286,Oui,Féminin,3,"Petit fils, petite fille",...,Non,Non,Non,3.400000e+00,0,1,1,0,0,324
3,CENTRE,N'ZI,BOCANDA,BENGASSOU,Rural,288.38315,Oui,Féminin,0,"Petit fils, petite fille",...,Non,Non,Non,0.000000e+00,0,1,1,0,0,324
4,CENTRE,N'ZI,BOCANDA,BENGASSOU,Rural,288.38315,Oui,Féminin,0,"Petit fils, petite fille",...,Non,Non,Non,0.000000e+00,0,1,1,0,0,324
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64469,SUD-EST,INDENIE-DJUABLIN,ABENGOUROU,ABENGOUROU,Rural,205.38361,Oui,Masculin,19,Autres Parents du CM/Conjoint,...,Non,Non,Non,5.035685e+03,0,0,1,1,96,266
64470,SUD-EST,INDENIE-DJUABLIN,ABENGOUROU,ABENGOUROU,Rural,205.38361,Oui,Masculin,13,"Fils, Fille",...,Non,Non,Non,1.491057e+07,0,1,1,0,96,266
64471,SUD-EST,INDENIE-DJUABLIN,ABENGOUROU,ABENGOUROU,Rural,205.38361,Oui,Masculin,27,"Frère, sœur",...,Non,Non,Non,2.558464e+07,0,0,0,0,96,266
64472,SUD-EST,INDENIE-DJUABLIN,ABENGOUROU,ABENGOUROU,Rural,205.38361,Oui,Masculin,6,"Fils, Fille",...,Non,Non,Non,3.001852e+05,0,1,1,0,96,266


In [211]:
df_anstat_sort.columns

Index(['agro_zone', 'region_name', 'department', 'city', 'milieu_resid',
       'hhweight', 'resid', 'sex', 'age_num', 'lien', 'marital_status',
       'religion', 'ethnie', 'nation', 'agemar', 'mal30j', 'aff30j', 'arrmal',
       'durarr', 'con30j', 'hos12m', 'couvmal', 'handit', 'handig', 'alfa',
       'alfa2', 'scol', 'educ_scol', 'educ_hi', 'diplome', 'telpor',
       'internet', 'activ7j', 'activ12m', 'branch', 'sectins', 'csp', 'volhor',
       'salaire', 'emploi_sec', 'sectins_sec', 'csp_sec', 'volhor_sec',
       'salaire_sec', 'serviceconsult', 'persconsult', 'salaire_mois',
       'salaire_sec_mois', 'rev_total_mois', 'log_revenu', 'sans_revenu',
       'age_grp', 'a_assurance', 'alphabete', 'logem', 'elec_ac', 'elec_ur',
       'elec_ua', 'tv', 'fer', 'frigo', 'cuisin', 'ordin', 'decod', 'car',
       'superf', 'bancarise', 'volhor_manquant', 'salaire_mois_manquant',
       'Stg_formel', 'Cluster_Assigné', 'nb_lignes_cluster'],
      dtype='object')

In [212]:
df_anstat_sort.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64474 entries, 0 to 64473
Data columns (total 72 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   agro_zone              64474 non-null  object 
 1   region_name            64474 non-null  object 
 2   department             64474 non-null  object 
 3   city                   64474 non-null  object 
 4   milieu_resid           64474 non-null  object 
 5   hhweight               64474 non-null  float64
 6   resid                  64474 non-null  object 
 7   sex                    64474 non-null  object 
 8   age_num                64474 non-null  int64  
 9   lien                   64474 non-null  object 
 10  marital_status         64474 non-null  object 
 11  religion               64474 non-null  object 
 12  ethnie                 64474 non-null  object 
 13  nation                 64474 non-null  object 
 14  agemar                 64474 non-null  float64
 15  ma

In [213]:
df_anstat_sort = df_anstat_sort.drop(columns=['agro_zone', 'region_name', 'department', 'city', 'milieu_resid',
       'hhweight', 'sex', 'age_num', 'marital_status', 'bancarise', 'salaire_mois', 'salaire_sec_mois', 'rev_total_mois', 'log_revenu',
       'sans_revenu', 'volhor_manquant',
       'salaire_mois_manquant'], axis=1)
df_anstat_sort

Unnamed: 0,resid,lien,religion,ethnie,nation,agemar,mal30j,aff30j,arrmal,durarr,...,fer,frigo,cuisin,ordin,decod,car,superf,Stg_formel,Cluster_Assigné,nb_lignes_cluster
0,Oui,Autres Parents du CM/Conjoint,Chrétien,MALINKE OU MANINKA,Cote d'ivoire,-1.0,Non,Aucun,Non,Aucun,...,Non,Non,Non,Non,Non,Non,0.000000e+00,0,0,324
1,Oui,"Petit fils, petite fille",Chrétien,AHIZI,Cote d'ivoire,-1.0,Oui,"Toux, rhume, grippe",Non,Aucun,...,Non,Non,Non,Non,Non,Non,4.052600e+06,0,0,324
2,Oui,"Petit fils, petite fille",Sans Réligion,BAOULE,Cote d'ivoire,-1.0,Non,Aucun,Non,Aucun,...,Non,Non,Non,Non,Non,Non,3.400000e+00,0,0,324
3,Oui,"Petit fils, petite fille",Chrétien,GOURO,Cote d'ivoire,-1.0,Non,Aucun,Non,Aucun,...,Non,Non,Non,Non,Non,Non,0.000000e+00,0,0,324
4,Oui,"Petit fils, petite fille",Chrétien,GOURO,Cote d'ivoire,-1.0,Non,Aucun,Non,Aucun,...,Non,Non,Non,Non,Non,Non,0.000000e+00,0,0,324
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64469,Oui,Autres Parents du CM/Conjoint,Chrétien,AGNI,Cote d'ivoire,-1.0,Non,Aucun,Non,Aucun,...,Non,Non,Non,Non,Non,Non,5.035685e+03,1,96,266
64470,Oui,"Fils, Fille",Musulman,Inconnue,Niger,-1.0,Non,Aucun,Non,Aucun,...,Non,Non,Non,Non,Non,Non,1.491057e+07,0,96,266
64471,Oui,"Frère, sœur",Musulman,Inconnue,Burkina Faso,-1.0,Oui,Douleurs/fatigue,Non,Aucun,...,Non,Non,Non,Non,Non,Non,2.558464e+07,0,96,266
64472,Oui,"Fils, Fille",Chrétien,BAOULE,Cote d'ivoire,-1.0,Non,Aucun,Non,Aucun,...,Non,Non,Non,Non,Non,Non,3.001852e+05,0,96,266


In [214]:
df_anstat_sort.columns.unique()

Index(['resid', 'lien', 'religion', 'ethnie', 'nation', 'agemar', 'mal30j',
       'aff30j', 'arrmal', 'durarr', 'con30j', 'hos12m', 'couvmal', 'handit',
       'handig', 'alfa', 'alfa2', 'scol', 'educ_scol', 'educ_hi', 'diplome',
       'telpor', 'internet', 'activ7j', 'activ12m', 'branch', 'sectins', 'csp',
       'volhor', 'salaire', 'emploi_sec', 'sectins_sec', 'csp_sec',
       'volhor_sec', 'salaire_sec', 'serviceconsult', 'persconsult', 'age_grp',
       'a_assurance', 'alphabete', 'logem', 'elec_ac', 'elec_ur', 'elec_ua',
       'tv', 'fer', 'frigo', 'cuisin', 'ordin', 'decod', 'car', 'superf',
       'Stg_formel', 'Cluster_Assigné', 'nb_lignes_cluster'],
      dtype='object')

In [224]:
df_anstat_sort["agemar"].unique()

array([-1., 20., 30., 27., 17., 19., 18., 36., 25., 23., 35., 24., 29.,
       22., 26., 15., 34., 28., 21., 33., 44., 16., 38., 37., 45., 31.,
       41., 54., 32., 52., 14., 13., 39., 12., 43., 40., 50., 42., 47.,
       10., 60., 46., 53., 48., 49., 59., 55., 56., 70., 51., 61., 66.,
       57., 11.])

In [217]:
variables_categorielles = ['lien', 'religion', 'ethnie', 'nation', 'aff30j', 'durarr', 'educ_scol', 'educ_hi', 'diplome', 'activ7j', 'activ12m', 'branch', 'sectins', 'csp', 'sectins_sec', 'csp_sec', 'serviceconsult', 'persconsult', 'age_grp', 'logem']
variables_numeriques = ['agemar', 'volhor', 'salaire', 'volhor_sec', 'salaire_sec', 'superf']
variables_binaires = ['resid', 'mal30j', 'arrmal', 'con30j', 'hos12m', 'couvmal', 'handit', 'handig', 'alfa', 'alfa2', 'scol', 'emploi_sec', 'telpor', 'internet', 'a_assurance', 'alphabete', 'elec_ac', 'elec_ur', 'elec_ua', 'tv', 'fer', 'frigo', 'cuisin', 'ordin', 'decod', 'car', 'Stg_formel']


## Preparations des proportions

In [218]:
print(df_anstat[variables_numeriques].dtypes)

agemar         float64
volhor         float64
salaire        float64
volhor_sec     float64
salaire_sec      int64
superf         float64
dtype: object


In [219]:
# Stats numériques
stats_num = (
    df_anstat
    .groupby("Cluster_Assigné")[variables_numeriques]
    .agg(["mean", "std", "min", "max"])
)
stats_num

Unnamed: 0_level_0,agemar,agemar,agemar,agemar,volhor,volhor,volhor,volhor,salaire,salaire,...,volhor_sec,volhor_sec,salaire_sec,salaire_sec,salaire_sec,salaire_sec,superf,superf,superf,superf
Unnamed: 0_level_1,mean,std,min,max,mean,std,min,max,mean,std,...,min,max,mean,std,min,max,mean,std,min,max
Cluster_Assigné,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,-0.935185,1.166667,-1.0,20.0,17.925926,159.230174,0.0,1920.0,925.925926,1.666667e+04,...,0.0,0.0,0.000000,0.000000,0,0,4.002500e+06,1.866459e+07,0.0,1.804329e+08
1,-0.967196,1.008430,-1.0,30.0,152.044966,615.053167,0.0,3900.0,40639.186921,2.328150e+05,...,0.0,480.0,634.920635,19518.001459,0,600000,1.481481e-03,3.218600e-02,0.0,7.000000e-01
2,25.232687,6.046223,15.0,54.0,1881.154615,882.071794,0.0,3600.0,334126.915402,8.283374e+05,...,0.0,1440.0,21814.404432,116698.430975,0,960000,7.718331e+06,7.621480e+07,0.0,1.290846e+09
3,21.185645,5.188242,12.0,50.0,1109.781714,1126.695938,0.0,3900.0,155094.746487,6.448905e+05,...,0.0,1560.0,910.973085,24612.992284,0,720000,5.965139e+06,1.001743e+08,0.0,2.557890e+09
4,-0.946237,0.898027,-1.0,14.0,35.053763,295.953369,0.0,3300.0,2365.591398,3.065465e+04,...,0.0,0.0,0.000000,0.000000,0,0,5.928433e+05,1.735837e+06,0.0,1.011823e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,19.507619,3.962941,11.0,36.0,617.135675,965.496778,0.0,3600.0,34179.847895,1.575839e+05,...,0.0,1200.0,1779.619048,34525.184075,0,960000,4.193878e+07,6.840394e+08,0.0,2.093170e+10
93,24.518519,6.072242,14.0,47.0,1433.074866,1120.802344,0.0,3960.0,442924.074007,8.926426e+05,...,0.0,1680.0,22043.097643,136055.792515,0,1212000,3.925528e+04,4.717115e+05,0.0,5.758084e+06
94,0.034549,5.249428,-1.0,32.0,1050.783056,1244.825280,0.0,4500.0,644114.586929,1.224697e+06,...,0.0,1680.0,4445.297505,69680.930663,0,1332000,5.758157e-04,7.573646e-03,0.0,1.000000e-01
95,-1.000000,0.000000,-1.0,-1.0,89.150685,488.524588,0.0,3744.0,7779.072407,8.735432e+04,...,0.0,0.0,0.000000,0.000000,0,0,1.846358e+05,7.388947e+05,0.0,3.565832e+06


In [220]:
print(df_anstat[variables_binaires].dtypes)


resid          object
mal30j         object
arrmal         object
con30j         object
hos12m         object
couvmal        object
handit         object
handig         object
alfa           object
alfa2          object
scol           object
emploi_sec     object
telpor         object
internet       object
a_assurance     int64
alphabete       int64
elec_ac        object
elec_ur        object
elec_ua        object
tv             object
fer            object
frigo          object
cuisin         object
ordin          object
decod          object
car            object
Stg_formel      int64
dtype: object


In [231]:
df_anstat = df_anstat.copy()

# pour chaque variable binaire : convertir Oui/Non en 1/0, puis en numérique
for col in variables_binaires:
    # si c'est du texte Oui/Non
    if df_anstat[col].dtype == "object":
        df_anstat[col] = df_anstat[col].str.strip().str.lower()
        df_anstat[col] = df_anstat[col].map({"oui": 1, "non": 0})
    # dans tous les cas, forcer en numérique (garde 0/1, convertit le reste en NaN)
    df_anstat[col] = pd.to_numeric(df_anstat[col], errors="coerce")


In [232]:
# Proportions pour les binaires (moyenne = part de 1)
props_bin = (
    df_anstat
    .groupby("Cluster_Assigné")[variables_binaires]
    .mean()   # valeurs entre 0 et 1
)
props_bin

Unnamed: 0_level_0,resid,mal30j,arrmal,con30j,hos12m,couvmal,handit,handig,alfa,alfa2,...,elec_ur,elec_ua,tv,fer,frigo,cuisin,ordin,decod,car,Stg_formel
Cluster_Assigné,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.981481,0.268519,0.154321,0.157407,0.009259,0.030864,0.033951,0.009259,0.527778,0.521605,...,0.679012,0.104938,0.287037,0.012346,0.067901,0.009259,0.006173,0.169753,0.000000,0.003086
1,0.996825,0.230688,0.122751,0.167196,0.020106,0.141799,0.033862,0.012698,0.688889,0.679365,...,0.938624,0.059259,0.847619,0.133333,0.470899,0.211640,0.126984,0.488889,0.096296,0.025397
2,1.000000,0.351801,0.246537,0.210526,0.030471,0.055402,0.227147,0.041551,0.642659,0.637119,...,0.531856,0.313019,0.365651,0.019391,0.060942,0.022161,0.013850,0.202216,0.005540,0.077562
3,0.998620,0.335404,0.228433,0.214631,0.048999,0.094548,0.162181,0.035197,0.472050,0.465839,...,0.864044,0.115942,0.747412,0.080055,0.263630,0.080745,0.047619,0.443064,0.030366,0.051760
4,0.992832,0.204301,0.118280,0.136201,0.010753,0.025090,0.025090,0.014337,0.512545,0.508961,...,0.713262,0.232975,0.641577,0.035842,0.172043,0.017921,0.017921,0.240143,0.000000,0.007168
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,1.000000,0.263810,0.162857,0.157143,0.031429,0.008571,0.060952,0.019048,0.220000,0.218095,...,0.680000,0.239048,0.534286,0.017143,0.083810,0.016190,0.007619,0.249524,0.003810,0.010476
93,0.996633,0.397306,0.286195,0.252525,0.060606,0.077441,0.202020,0.033670,0.744108,0.740741,...,0.808081,0.148148,0.703704,0.094276,0.178451,0.111111,0.043771,0.420875,0.013468,0.178451
94,0.990403,0.230326,0.138196,0.142035,0.013436,0.084453,0.094050,0.026871,0.921305,0.913628,...,0.907869,0.088292,0.856046,0.176583,0.500960,0.216891,0.213052,0.495202,0.126679,0.205374
95,0.992172,0.236791,0.129159,0.164384,0.015656,0.025440,0.025440,0.003914,0.551859,0.534247,...,0.827789,0.129159,0.702544,0.015656,0.156556,0.039139,0.013699,0.428571,0.000000,0.013699


## Proportions globales 

In [238]:
def proportions_par_cluster(df, col_cat):
    return (
        df
        .groupby("Cluster_Assigné")[col_cat]
        .value_counts(normalize=True)      # proportion dans le cluster
        .rename("prop")
        .reset_index()
    )

dict_props_cat = {
    col: proportions_par_cluster(df_anstat, col)
    for col in variables_categorielles
}

dict_props_num = {
    col: proportions_par_cluster(df_anstat, col)
    for col in variables_numeriques
}

dict_props_bin = {
    col: proportions_par_cluster(df_anstat, col)
    for col in variables_binaires
}

dict_props_num["salaire"]

Unnamed: 0,Cluster_Assigné,salaire,prop
0,0,0.00,0.996914
1,0,300000.00,0.003086
2,1,0.00,0.958730
3,1,720000.00,0.007407
4,1,600000.00,0.003175
...,...,...,...
2771,95,1500000.00,0.001957
2772,96,0.00,0.984962
2773,96,497349.00,0.007519
2774,96,580898.44,0.003759


## MODALITE MAJORITAIRE PAR CLUSTER 

In [234]:
def top_modalite_par_cluster(df, col):
    tmp = (
        df
        .groupby("Cluster_Assigné")[col]
        .value_counts(normalize=True)
        .rename("prop")
        .reset_index()
    )  # colonnes : Cluster_Assigné, col, prop

    # on garde, pour chaque cluster, la ligne où prop est max
    idx_max = tmp.groupby("Cluster_Assigné")["prop"].idxmax()
    top = tmp.loc[idx_max].reset_index(drop=True)

    # renommer les colonnes pour intégrer facilement dans un dataset global
    top = top.rename(columns={
        col: f"{col}_top",
        "prop": f"{col}_prop_top"
    })
    return top


In [237]:
# DataFrame de base : une ligne par cluster
clusters = (
    df_anstat[["Cluster_Assigné"]]
    .drop_duplicates()
    .sort_values("Cluster_Assigné")
    .reset_index(drop=True)
)

# 1) variables catégorielles
df_stats = clusters.copy()
for col in variables_categorielles:
    top = top_modalite_par_cluster(df_anstat, col)
    df_stats = df_stats.merge(top, on="Cluster_Assigné", how="left")

# 2) variables binaires : après normalisation en 0/1
#    ici, pour les binaires, on veut simplement la proportion de 1
props_bin = (
    df_anstat
    .groupby("Cluster_Assigné")[variables_binaires]
    .mean()
    .reset_index()
)

df_stats = df_stats.merge(props_bin, on="Cluster_Assigné", how="left")

# 3) variables numériques : max ou autre stat par cluster
stats_num = (
    df_anstat
    .groupby("Cluster_Assigné")[variables_numeriques]
    .agg(["mean", "std", "min", "max"])
)

# aplatir les colonnes multi-index de stats_num
stats_num.columns = [
    f"{col}_{stat}" for col, stat in stats_num.columns.to_flat_index()
]
stats_num = stats_num.reset_index()

df_stats = df_stats.merge(stats_num, on="Cluster_Assigné", how="left")
df_stats


Unnamed: 0,Cluster_Assigné,lien_top,lien_prop_top,religion_top,religion_prop_top,ethnie_top,ethnie_prop_top,nation_top,nation_prop_top,aff30j_top,...,volhor_sec_min,volhor_sec_max,salaire_sec_mean,salaire_sec_std,salaire_sec_min,salaire_sec_max,superf_mean,superf_std,superf_min,superf_max
0,0,"Fils, Fille",0.490741,Chrétien,0.737654,BAOULE,0.861111,Cote d'ivoire,0.987654,Aucun,...,0.0,0.0,0.000000,0.000000,0,0,4.002500e+06,1.866459e+07,0.0,1.804329e+08
1,1,"Fils, Fille",0.720635,Chrétien,0.505820,Inconnue,0.175661,Cote d'ivoire,0.824339,Aucun,...,0.0,480.0,634.920635,19518.001459,0,600000,1.481481e-03,3.218600e-02,0.0,7.000000e-01
2,2,Chef de ménage,0.916898,Chrétien,0.609418,BAOULE,0.385042,Cote d'ivoire,0.889197,Aucun,...,0.0,1440.0,21814.404432,116698.430975,0,960000,7.718331e+06,7.621480e+07,0.0,1.290846e+09
3,3,Conjoint ( e ),0.867495,Musulman,0.619738,SENOUFO,0.165631,Cote d'ivoire,0.864734,Aucun,...,0.0,1560.0,910.973085,24612.992284,0,720000,5.965139e+06,1.001743e+08,0.0,2.557890e+09
4,4,"Fils, Fille",0.688172,Musulman,0.505376,Inconnue,0.229391,Cote d'ivoire,0.770609,Aucun,...,0.0,0.0,0.000000,0.000000,0,0,5.928433e+05,1.735837e+06,0.0,1.011823e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,92,Conjoint ( e ),0.929524,Musulman,0.678095,Inconnue,0.320952,Cote d'ivoire,0.679048,Aucun,...,0.0,1200.0,1779.619048,34525.184075,0,960000,4.193878e+07,6.840394e+08,0.0,2.093170e+10
93,93,Chef de ménage,0.552189,Chrétien,0.713805,BAOULE,0.279461,Cote d'ivoire,0.922559,Aucun,...,0.0,1680.0,22043.097643,136055.792515,0,1212000,3.925528e+04,4.717115e+05,0.0,5.758084e+06
94,94,"Fils, Fille",0.403071,Chrétien,0.568138,Inconnue,0.151631,Cote d'ivoire,0.848369,Aucun,...,0.0,1680.0,4445.297505,69680.930663,0,1332000,5.758157e-04,7.573646e-03,0.0,1.000000e-01
95,95,"Fils, Fille",0.757339,Musulman,0.575342,Inconnue,0.223092,Cote d'ivoire,0.776908,Aucun,...,0.0,0.0,0.000000,0.000000,0,0,1.846358e+05,7.388947e+05,0.0,3.565832e+06


In [239]:
df_stats.to_csv("Statistiques_Clust_Anstat.csv", index=False)


In [240]:
# DataFrame de base : une ligne par cluster
clusters = (
    df_anstat[["Cluster_Assigné"]]
    .drop_duplicates()
    .sort_values("Cluster_Assigné")
    .reset_index(drop=True)
)

# 1) variables catégorielles
df_stats_1 = clusters.copy()
for col in variables_categorielles:
    top = top_modalite_par_cluster(df_anstat, col)
    df_stats_1 = df_stats_1.merge(top, on="Cluster_Assigné", how="left")

# 2) variables binaires : après normalisation en 0/1
#    ici, pour les binaires, on veut simplement la proportion de 1
props_bin = (
    df_anstat
    .groupby("Cluster_Assigné")[variables_binaires]
    .mean()
    .reset_index()
)

df_stats_1 = df_stats_1.merge(props_bin, on="Cluster_Assigné", how="left")

# 3) variables numériques : max ou autre stat par cluster

for col in variables_numeriques:
    top = top_modalite_par_cluster(df_anstat, col)
    df_stats_1 = df_stats_1.merge(top, on="Cluster_Assigné", how="left")

df_stats_1


Unnamed: 0,Cluster_Assigné,lien_top,lien_prop_top,religion_top,religion_prop_top,ethnie_top,ethnie_prop_top,nation_top,nation_prop_top,aff30j_top,...,volhor_top,volhor_prop_top,salaire_top,salaire_prop_top,volhor_sec_top,volhor_sec_prop_top,salaire_sec_top,salaire_sec_prop_top,superf_top,superf_prop_top
0,0,"Fils, Fille",0.490741,Chrétien,0.737654,BAOULE,0.861111,Cote d'ivoire,0.987654,Aucun,...,0.0,0.984568,0.0,0.996914,0.0,1.000000,0,1.000000,0.0,0.250000
1,1,"Fils, Fille",0.720635,Chrétien,0.505820,Inconnue,0.175661,Cote d'ivoire,0.824339,Aucun,...,0.0,0.930159,0.0,0.958730,0.0,0.998942,0,0.998942,0.0,0.997884
2,2,Chef de ménage,0.916898,Chrétien,0.609418,BAOULE,0.385042,Cote d'ivoire,0.889197,Aucun,...,2400.0,0.116343,0.0,0.775623,0.0,0.770083,0,0.955679,0.0,0.240997
3,3,Conjoint ( e ),0.867495,Musulman,0.619738,SENOUFO,0.165631,Cote d'ivoire,0.864734,Aucun,...,0.0,0.387164,0.0,0.900621,0.0,0.919255,0,0.998620,0.0,0.702553
4,4,"Fils, Fille",0.688172,Musulman,0.505376,Inconnue,0.229391,Cote d'ivoire,0.770609,Aucun,...,0.0,0.982079,0.0,0.992832,0.0,1.000000,0,1.000000,0.0,0.498208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,92,Conjoint ( e ),0.929524,Musulman,0.678095,Inconnue,0.320952,Cote d'ivoire,0.679048,Aucun,...,0.0,0.621905,0.0,0.940952,0.0,0.949524,0,0.996190,0.0,0.510476
93,93,Chef de ménage,0.552189,Chrétien,0.713805,BAOULE,0.279461,Cote d'ivoire,0.922559,Aucun,...,0.0,0.218855,0.0,0.686869,0.0,0.855219,0,0.966330,0.0,0.683502
94,94,"Fils, Fille",0.403071,Chrétien,0.568138,Inconnue,0.151631,Cote d'ivoire,0.848369,Aucun,...,0.0,0.485605,0.0,0.612284,0.0,0.980806,0,0.994242,0.0,0.994242
95,95,"Fils, Fille",0.757339,Musulman,0.575342,Inconnue,0.223092,Cote d'ivoire,0.776908,Aucun,...,0.0,0.964775,0.0,0.986301,0.0,1.000000,0,1.000000,0.0,0.739726


In [244]:
df_stats['volhor_sec_max'].unique()

array([   0.,  480., 1440., 1560., 1764., 1540., 1680., 1470., 1080.,
       1584., 1728.,  960.,  624.,  840., 1536.,  560.,  900.,  600.,
        240.,  825.,  720.,   60.,  300.,  420.,  756., 1200.,  910.,
         52.,   64.,  672., 1152.,  520.,  675.,  270.,   80., 1260.,
       1656., 1320.,  312.,  630., 1008.])