In [1]:
# ===========================
#   GENERATION BASE SCORING PME
# ===========================
import pandas as pd
import numpy as np
from faker import Faker
import random

fake = Faker("fr_FR")
np.random.seed(42)
random.seed(42)

n = 2500
data = []

secteurs = ["Commerce", "Services", "Agroalimentaire", "Transport", "Industrie", "BTP"]
formes = ["SARL", "SA", "SAS", "Entreprise Individuelle"]
regions = ["Ouaga", "Bobo", "Koudougou", "Ouahigouya", "Fada", "Banfora"]

for i in range(n):
    
    # Caractéristiques PME
    secteur = random.choice(secteurs)
    forme = random.choice(formes)
    region = random.choice(regions)
    anciennete = np.random.randint(1, 25)
    nb_employes = np.random.randint(3, 200)
    experience_dirigeant = np.random.randint(1, 30)
    
    # Données financières réalistes
    ca = np.random.normal(200_000_000, 120_000_000)  # 200 millions en moyenne
    ca = max(50_000_000, ca)  # minimum 50 MFCFA
    
    resultat_net = ca * np.random.uniform(0.02, 0.15) * np.random.choice([1, 1, 1, -1])
    fonds_propres = ca * np.random.uniform(0.1, 0.6)
    
    dettes = ca * np.random.uniform(0.2, 0.8)
    tresorerie = ca * np.random.uniform(-0.05, 0.2)
    
    ratio_endettement = dettes / ca
    marge = resultat_net / ca
    
    # Comportement bancaire
    incidents = np.random.poisson(0.4)
    utilisation_decouvert = np.random.uniform(0, 1)
    anciennete_banque = np.random.randint(1, 15)
    
    # Crédit demandé
    montant_credit = np.random.uniform(20_000_000, 300_000_000)
    duree = np.random.choice([12, 24, 36, 48, 60])
    ratio_credit_ca = montant_credit / ca
    
    # Garanties
    garantie = montant_credit * np.random.uniform(0.3, 1.2)
    ltv = montant_credit / garantie
    
    # Variable cible : défaut (probabilité basée sur risques réels)
    prob_default = (
        0.15 * (ratio_endettement > 0.6)
        + 0.12 * (incidents > 2)
        + 0.10 * (ratio_credit_ca > 1)
        + 0.08 * (tresorerie < 0)
        + 0.05 * (anciennete < 3)
        + np.random.uniform(0, 0.1)
    )
    default = np.random.binomial(1, min(prob_default, 0.65))
    
    data.append([
        secteur, forme, region, anciennete, nb_employes, experience_dirigeant,
        ca, resultat_net, fonds_propres, dettes, tresorerie,
        ratio_endettement, marge,
        incidents, utilisation_decouvert, anciennete_banque,
        montant_credit, duree, ratio_credit_ca,
        garantie, ltv,
        default
    ])

cols = [
    "secteur", "forme", "region", "anciennete", "nb_employes", "experience_dirigeant",
    "ca", "resultat_net", "fonds_propres", "dettes", "tresorerie",
    "ratio_endettement", "marge",
    "incidents", "utilisation_decouvert", "anciennete_banque",
    "montant_credit", "duree", "ratio_credit_ca",
    "garantie", "ltv",
    "default"
]

df = pd.DataFrame(data, columns=cols)
df.to_csv("scoring_pme_dataset.csv", index=False)

print("Dataset généré : scoring_pme_dataset.csv")
print(df.head())


Dataset généré : scoring_pme_dataset.csv
     secteur forme   region  anciennete  nb_employes  experience_dirigeant  \
0        BTP  SARL    Ouaga           7          182                    29   
1        BTP   SAS     Bobo           1           60                    22   
2   Services    SA  Banfora           5           53                     7   
3   Commerce  SARL     Fada          18           46                     2   
4  Transport  SARL    Ouaga          13           43                    29   

             ca  resultat_net  fonds_propres        dettes  ...     marge  \
0  2.652492e+08  2.588581e+07   4.721363e+07  6.229381e+07  ...  0.097591   
1  1.261484e+08  2.638851e+06   3.098389e+07  7.154025e+07  ...  0.020919   
2  1.279695e+08 -4.184268e+06   2.821891e+07  7.805606e+07  ... -0.032697   
3  3.136928e+08 -2.238811e+07   1.251477e+08  2.362497e+08  ... -0.071370   
4  2.698547e+08  3.097148e+07   1.087461e+08  2.039510e+08  ...  0.114771   

   incidents  utilisation_d

In [3]:
df

Unnamed: 0,secteur,forme,region,anciennete,nb_employes,experience_dirigeant,ca,resultat_net,fonds_propres,dettes,...,marge,incidents,utilisation_decouvert,anciennete_banque,montant_credit,duree,ratio_credit_ca,garantie,ltv,default
0,BTP,SARL,Ouaga,7,182,29,2.652492e+08,2.588581e+07,4.721363e+07,6.229381e+07,...,0.097591,0,0.708073,6,3.579524e+07,48,0.134949,4.097472e+07,0.873593,0
1,BTP,SAS,Bobo,1,60,22,1.261484e+08,2.638851e+06,3.098389e+07,7.154025e+07,...,0.020919,0,0.366362,14,4.536980e+07,36,0.359654,2.922794e+07,1.552275,0
2,Services,SA,Banfora,5,53,7,1.279695e+08,-4.184268e+06,2.821891e+07,7.805606e+07,...,-0.032697,1,0.391061,2,2.055062e+08,24,1.605900,1.402868e+08,1.464900,0
3,Commerce,SARL,Fada,18,46,2,3.136928e+08,-2.238811e+07,1.251477e+08,2.362497e+08,...,-0.071370,0,0.045227,8,2.564695e+08,60,0.817582,1.395743e+08,1.837512,0
4,Transport,SARL,Ouaga,13,43,29,2.698547e+08,3.097148e+07,1.087461e+08,2.039510e+08,...,0.114771,2,0.095410,7,1.070751e+08,60,0.396788,1.024329e+08,1.045319,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,Agroalimentaire,SARL,Banfora,11,7,20,3.051865e+08,-4.472498e+07,1.231717e+08,1.724109e+08,...,-0.146550,0,0.276316,9,2.120624e+08,48,0.694862,1.403259e+08,1.511214,0
2496,Services,Entreprise Individuelle,Bobo,22,166,8,2.124548e+08,9.034613e+06,2.876096e+07,1.665888e+08,...,0.042525,1,0.015797,2,2.746214e+08,36,1.292611,1.654005e+08,1.660342,0
2497,Services,SARL,Koudougou,20,195,25,1.059303e+08,1.518448e+07,2.005034e+07,2.443851e+07,...,0.143344,0,0.431246,4,1.433951e+08,36,1.353674,5.069676e+07,2.828487,0
2498,Transport,SA,Ouahigouya,21,57,13,3.380195e+08,-1.386754e+07,8.762158e+07,8.521420e+07,...,-0.041026,0,0.443763,13,1.998635e+08,60,0.591278,2.107508e+08,0.948340,0


In [4]:
# ========================================
# MODELE LOGISTIQUE SCORING PME
# ========================================

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

df = pd.read_csv("scoring_pme_dataset.csv")

# Variables catégorielles
cat_vars = ["secteur", "forme", "region"]

# Variables numériques
num_vars = [
    "anciennete", "nb_employes", "experience_dirigeant",
    "ca", "resultat_net", "fonds_propres", "dettes", "tresorerie",
    "ratio_endettement", "marge",
    "incidents", "utilisation_decouvert", "anciennete_banque",
    "montant_credit", "duree", "ratio_credit_ca",
    "garantie", "ltv"
]

X = df[cat_vars + num_vars]
y = df["default"]

# Préprocessing
preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(drop="first"), cat_vars),
        ("num", StandardScaler(), num_vars)
    ]
)

X_processed = preprocess.fit_transform(X)

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.3, random_state=42
)

# Modèle
model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)

# Prédictions
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print("=== Classification Report ===")
print(classification_report(y_test, y_pred))

print("AUC:", roc_auc_score(y_test, y_prob))

=== Classification Report ===
              precision    recall  f1-score   support

           0       0.85      1.00      0.92       636
           1       0.00      0.00      0.00       114

    accuracy                           0.85       750
   macro avg       0.42      0.50      0.46       750
weighted avg       0.72      0.85      0.78       750

AUC: 0.6686665563279267


In [5]:
import pickle

with open("model_logistic.pkl", "wb") as f:
    pickle.dump(model, f)

with open("preprocess.pkl", "wb") as f:
    pickle.dump(preprocess, f)