***Équilibrage des données***

***Pour option 2***

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# ---------------------------------------------------------------------------
# Option 2: temporal split -> scaling -> SMOTEENN resampling -> class weights.
# Produces X_train_final / y_train_final / X_test_final / y_test_final and
# class_weight_dict.
# ---------------------------------------------------------------------------

# Work on a copy so the source frame `df` is left untouched.
df_balanced = df.copy()

# Encode the string risk labels as integer codes.
label_encoder = LabelEncoder()
df_balanced['Risk_encoded'] = label_encoder.fit_transform(df_balanced['Risk'])

feature_cols = ['External_Debt_GDP', 'Debt_Service_Exports', 'GDP_Growth',
                'Current_Account', 'Exchange_Rate',
                'Corruption_Governance', 'Net_ODA_received']

# Temporal split: rows with Year <= 2021 are used for training, later years for testing.
train_mask = df_balanced['Year'] <= 2021
test_mask = df_balanced['Year'] > 2021

X_train_raw = df_balanced.loc[train_mask, feature_cols]
y_train = df_balanced.loc[train_mask, 'Risk_encoded']

X_test_raw = df_balanced.loc[test_mask, feature_cols]
y_test = df_balanced.loc[test_mask, 'Risk_encoded']

# Fit the scaler on the training rows only, then apply the same transform to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# Re-wrap the scaled arrays as DataFrames, keeping column names and the original row index.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=feature_cols, index=X_train_raw.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=feature_cols, index=X_test_raw.index)

# Oversampling target: every class below half of the largest class is raised to that level.
class_counts = Counter(y_train)
max_count = max(class_counts.values())
target_minority = max_count // 2  # 1:2 ratio (aggressive, but often needed for DISTRESS)

sampling_strategy_dict = {
    class_label: target_minority
    for class_label, count in class_counts.items()
    if count < target_minority
}

print(f"Distribution originale Train : {class_counts}")
print(f"Cible pour les minoritaires : {target_minority}")

# Kept as in the original cell; -1 means the label 'DISTRESS' is unknown to the encoder.
distress_code = label_encoder.transform(['DISTRESS'])[0] if 'DISTRESS' in label_encoder.classes_ else -1
if distress_code != -1:
    n_distress = class_counts[distress_code]

# SMOTE looks for neighbours inside each class it oversamples, so every such class
# needs at least k_neighbors + 1 samples. Derive k from the smallest class that is
# actually oversampled (not only DISTRESS) and never let it fall below 1.
if sampling_strategy_dict:
    smallest_oversampled = min(class_counts[label] for label in sampling_strategy_dict)
    if smallest_oversampled < 2:
        raise ValueError(
            "SMOTE nécessite au moins 2 observations par classe sur-échantillonnée "
            f"(plus petite classe : {smallest_oversampled})."
        )
    k = min(5, smallest_oversampled - 1)
else:
    k = 5

# The sampling strategy must be given to the SMOTE instance itself: when a `smote`
# object is supplied, SMOTEENN uses it as is and does not forward its own
# `sampling_strategy`, so SMOTE would fall back to 'auto' and raise every minority
# class up to the majority count instead of `target_minority`.
smote_enn = SMOTEENN(
    sampling_strategy=sampling_strategy_dict,
    smote=SMOTE(sampling_strategy=sampling_strategy_dict, k_neighbors=k, random_state=42),
    enn=EditedNearestNeighbours(sampling_strategy='all'),  # ENN cleaning applied to all classes
    random_state=42
)

print("\nApplication de SMOTEENN en cours...")
X_train_resampled, y_train_resampled = smote_enn.fit_resample(X_train_scaled, y_train)

print(f"Nouvelle distribution Train : {Counter(y_train_resampled)}")

# Residual 'balanced' class weights computed on the resampled labels.
classes_present = np.unique(y_train_resampled)
weights = compute_class_weight(class_weight='balanced', classes=classes_present, y=y_train_resampled)
class_weight_dict = dict(zip(classes_present, weights))

print("\nPoids résiduels (à passer au modèle) :")
for cls, w in class_weight_dict.items():
    print(f"Classe {label_encoder.inverse_transform([cls])[0]} : {w:.2f}")

# Final train/test objects.
X_train_final = pd.DataFrame(X_train_resampled, columns=feature_cols)
y_train_final = y_train_resampled
X_test_final = pd.DataFrame(X_test_scaled, columns=feature_cols)
y_test_final = y_test

Distribution originale Train : Counter({3: 217, 1: 157, 2: 128, 0: 50})
Cible pour les minoritaires : 108

Application de SMOTEENN en cours...
Nouvelle distribution Train : Counter({0: 198, 2: 145, 1: 128, 3: 66})

Poids résiduels (à passer au modèle) :
Classe DISTRESS : 0.68
Classe HIGH : 1.05
Classe LOW : 0.93
Classe MODERATE : 2.03


In [None]:
# NOTE(review): class_counts1 and target_minority1 are not defined in the cells shown
# here; presumably they come from an earlier cell. Confirm this cell still runs after
# Restart Kernel -> Run All.
print(f"{class_counts1} {target_minority1}")

Counter({3: 198, 2: 128, 1: 122, 0: 35}) 99


======================================================================