## MACHINE LEARNING achat_produit_fournisseur_stocks

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:


# Load the purchase / product / supplier / stock extract.
df = pd.read_csv("acaht_prod_fourn_stock.csv")

# Parse the purchase date so the .dt accessor can be used below.
df['date_achat'] = pd.to_datetime(df['date_achat'])

# Calendar features derived from the purchase date.
# NOTE(review): later cells read 'mois' and 'année' columns, which are not
# created here — confirm they already exist in the CSV.
purchase_dt = df['date_achat'].dt
df['mois_achat'] = purchase_dt.month
df['annee'] = purchase_dt.year
df['jour_semaine'] = purchase_dt.dayofweek
df['trimestre'] = purchase_dt.quarter


In [None]:
# Display the column index of df (raw CSV fields plus the calendar features added above).
df.columns

### Préprocessing 

In [None]:

# Columns treated as categories (one-hot encoded) and as numbers (standardised).
cat_features = [
    'id_produit', 'id_fournisseur', 'catégorie', 'marque',
    'nom_fournisseur', 'ville', 'pays', 'entrepot',
]
num_features = [
    'prix_unitaire', 'fiabilité', 'stock_minimum', 'niveau_stock',
    'délai_moyen_jours', 'prix',
]

# Generic preprocessor: scale the numeric columns, one-hot encode the
# categorical ones; categories unseen during fit are ignored at transform time.
numeric_step = ('num', StandardScaler(), num_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])


### Modèle de prévision de la demande


In [None]:
# Monthly demand: total quantity for every (product, year, month) combination.
# NOTE(review): 'année' and 'mois' are not created by the loading cell
# (it creates 'annee' and 'mois_achat') — confirm they exist in the CSV.
demand_keys = ['id_produit', 'année', 'mois']
demand_data = df.groupby(demand_keys)['quantité'].sum().reset_index()

# Lag features: the quantity found 1, 2, 3 and 6 rows earlier within the same
# product (rows are the months present in the data for that product).
lag_columns = {
    f'lag_{lag}': demand_data.groupby('id_produit')['quantité'].shift(lag)
    for lag in [1, 2, 3, 6]
}
demand_data = demand_data.assign(**lag_columns)

# Rows without a complete lag history contain NaN and are discarded.
demand_data = demand_data.dropna()


In [None]:
# Display the columns of the aggregated demand table (grouping keys, quantity and lag features).
demand_data.columns

In [None]:

# Feature roles for the demand model: the product id is one-hot encoded,
# the month and the lagged quantities are standardised.
cat_features_ = ['id_produit']
num_features_ = ['mois'] + [f'lag_{k}' for k in (1, 2, 3, 6)]

preprocessor1 = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_features_),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features_),
])

# Target is the monthly quantity; X keeps every other column (the
# ColumnTransformer only picks the columns listed above).
y = demand_data['quantité']
X = demand_data.drop(columns=['quantité'])

# Random 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# XGBoost regressor placed behind the preprocessing step.
demand_regressor = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5)
model_demand = Pipeline(steps=[
    ('preprocessor', preprocessor1),
    ('regressor', demand_regressor),
])

# Fit on the training split and predict the hold-out split.
model_demand.fit(X_train, y_train)
y_pred = model_demand.predict(X_test)

# Hold-out metrics.
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"MAE: {mae}")
print(f"RMSE: {rmse}")
print(f"R²: {r2}")

# Importance of each transformed feature, as reported by the fitted regressor.
feature_importance = model_demand.named_steps['regressor'].feature_importances_

### Modèle d'estimation des délais fournisseurs


In [None]:
# Inputs for the supplier-delay model: supplier, order and calendar attributes.
X_delay = df[['id_fournisseur', 'quantité', 'catégorie', 'prix_unitaire','prix','fiabilité', 'mois', 'jour_semaine', 'pays', 'ville','entrepot']]
# Target: delivery delay in days.
y_delay = df['délai_livraison_jours']

# Month and weekday are one-hot encoded as categories rather than scaled.
cat_featuresf = ['id_fournisseur', 'catégorie','mois', 'jour_semaine','entrepot','ville', 'pays']
# 'quantité' must carry its accent to match the X_delay column: the unaccented
# spelling is not a column of X_delay and makes ColumnTransformer fail at fit time.
num_featuresf = ['prix_unitaire', 'fiabilité', 'prix','quantité']

# Scale numeric columns, one-hot encode categorical ones (unknown categories ignored).
preprocessorf = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_featuresf),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_featuresf)
    ])

# Random 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X_delay, y_delay, test_size=0.2, random_state=42)

# Random-forest regressor behind the preprocessing step.
model_delay = Pipeline([
    ('preprocessor', preprocessorf),
    ('regressor', RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42))
])

# Fit on the training split and predict the hold-out split.
model_delay.fit(X_train, y_train)
y_pred = model_delay.predict(X_test)

# Hold-out metrics.
print(f"MAE: {mean_absolute_error(y_test, y_pred)}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}")
print(f"R²: {r2_score(y_test, y_pred)}")

In [None]:
from sklearn.model_selection import cross_val_score, TimeSeriesSplit

# Demand model (temporal validation).
# TimeSeriesSplit trains on the first rows and validates on the following ones,
# so the rows must be in chronological order. X comes out of a groupby keyed
# first on id_produit, i.e. it is ordered by product; re-order it by
# (year, month) so that each fold really validates on later periods.
chrono_order = X.sort_values(['année', 'mois'], kind='mergesort').index
X_chrono = X.loc[chrono_order]
y_chrono = y.loc[chrono_order]
tscv = TimeSeriesSplit(n_splits=5)
cv_scores = cross_val_score(model_demand, X_chrono, y_chrono, cv=tscv, scoring='neg_mean_absolute_error')
print(f"MAE CV: {-np.mean(cv_scores)}")

# Delay model (standard 5-fold validation).
cv_scores = cross_val_score(model_delay, X_delay, y_delay, cv=5, scoring='neg_mean_absolute_error')
print(f"MAE CV: {-np.mean(cv_scores)}")


## Random Forest

In [None]:
# Advanced feature engineering
# Rolling mean of the quantity over each product's last 30 purchase rows.
# The window is evaluated on rows sorted by purchase date (the file order is not
# guaranteed to be chronological); the result is aligned back on df's index, so
# df itself keeps its row order.
# NOTE(review): the window counts rows, not calendar days, despite the '30j' name.
rolling_demand = (
    df.sort_values('date_achat', kind='mergesort')
      .groupby('id_produit')['quantité']
      .transform(lambda x: x.rolling(window=30, min_periods=1).mean())
)
df['demande_rolling_30j'] = rolling_demand

# Stock pressure = stock level / minimum stock. A zero minimum is mapped to NaN
# so the ratio is missing rather than +/-inf, which downstream scalers reject.
df['pression_stock'] = df['niveau_stock'] / df['stock_minimum'].replace(0, np.nan)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, roc_auc_score

# 1. Data loading
date_columns = ['date_achat', 'date_expedition', 'date_livraison']
df = pd.read_csv(
    '.../code/achat_prod_fournisseur_stock.csv',
    sep=';',
    parse_dates=date_columns,
)
# Returns table, read from retoursV2.csv.
retours = pd.read_csv('.../base/retoursV2.csv', parse_dates=['date_retour'])

# 2. KPI computation
# 2.1 Stock-out: flag rows whose stock level is below the minimum, then average
# the flag per product (the value is a 0-1 fraction despite the '_pct' suffix).
df['rupture_flag'] = df['niveau_stock'].lt(df['stock_minimum']).astype(int)
kpi_rupture = (
    df.groupby('id_produit')['rupture_flag']
      .mean()
      .reset_index(name='taux_rupture_pct')
)

# 2.2 OTIF (on time and in full)
# 'date_attendue' is not part of the parse_dates list used when df is loaded, so
# it is converted explicitly before being compared with the parsed delivery date
# (pd.to_datetime is a no-op if the column is already datetime).
date_attendue = pd.to_datetime(df['date_attendue'])
# A row with a missing date compares as False and therefore counts as not on time.
df['on_time'] = (df['date_livraison'] <= date_attendue).astype(int)
df['full']    = (df['quant_livree'] >= df['quant_attendue']).astype(int)
# OTIF requires both conditions on the same row.
df['otif_flag'] = df['on_time'] * df['full']
kpi_otif     = df['otif_flag'].mean() * 100

# 2.3 Return rate: returned quantity over delivered quantity, restricted to
# the orders present in both tables (inner merge on id_commande).
ret_total = retours.groupby('id_commande', as_index=False)['quantite_retournee'].sum()
deliv_total = df.groupby('id_commande', as_index=False)['quant_livree'].sum()
ret = pd.merge(ret_total, deliv_total, on='id_commande')
returned_qty = ret['quantite_retournee'].sum()
delivered_qty = ret['quant_livree'].sum()
kpi_taux_retour = (returned_qty / delivered_qty) * 100

# 2.4 Mean shipping delay: whole days between shipment and delivery.
transit = df['date_livraison'] - df['date_expedition']
df['delai_jours'] = transit.dt.days
kpi_delai_moyen = df['delai_jours'].mean()

# 2.5 Stock turnover
# Quantity purchased during the latest year divided by the average of the
# opening and closing stock levels of that year.
purchase_year = df['date_achat'].dt.year
purchase_day = df['date_achat'].dt.normalize()  # drop any time-of-day component
in_last_year = purchase_year == purchase_year.max()
consommation = df.loc[in_last_year, 'quant_achetee'].sum()

# Opening / closing stock: use the first and last purchase days available in the
# latest year. When rows exist on 1 January and 31 December this is exactly the
# former behaviour; when they do not, the KPI no longer collapses to NaN.
first_day = purchase_day[in_last_year].min()
last_day = purchase_day[in_last_year].max()
stock_jan1 = df.loc[purchase_day == first_day, 'niveau_stock'].mean()
stock_dec31 = df.loc[purchase_day == last_day, 'niveau_stock'].mean()
stock_moyen = (stock_jan1 + stock_dec31) / 2
kpi_rotation = consommation / stock_moyen

# 2.6 Unit logistics cost: total transport cost per delivered unit.
total_transport_cost = df['cout_transport'].sum()
total_delivered = df['quant_livree'].sum()
kpi_cout_unitaire = total_transport_cost / total_delivered

# 2.7 Fill rate: delivered / expected per row, then averaged over rows.
df['fill_rate'] = df['quant_livree'].div(df['quant_attendue'])
kpi_fill_rate = df['fill_rate'].mean() * 100

# Global KPIs gathered in a one-row table.
glob_kpis = pd.DataFrame([{
    'otif_pct': kpi_otif,
    'taux_retour_pct': kpi_taux_retour,
    'delai_moyen_jours': kpi_delai_moyen,
    'rotation_stock': kpi_rotation,
    'cout_unitaire': kpi_cout_unitaire,
    'fill_rate_pct': kpi_fill_rate,
}])

# 3. Feature engineering for ML
def _rolling_mean_7(levels):
    """Mean of the current and up to six previous values of ``levels``."""
    return levels.rolling(7, min_periods=1).mean()

# Calendar features, then chronological ordering so the rolling window below
# runs in purchase-date order.
df = (
    df.assign(
        jour_semaine=df['date_achat'].dt.dayofweek,
        mois=df['date_achat'].dt.month,
    )
    .sort_values('date_achat')
)
# 7-row moving average of the stock level, computed per product.
df['stock_mm_7'] = df.groupby('id_produit')['niveau_stock'].transform(_rolling_mean_7)

# Attach the per-product stock-out rate to every row.
df = df.merge(kpi_rupture, on='id_produit', how='left')

# 4. Use case: supplier lead-time prediction (regression)
# The feature list used to end with the truncated placeholder 'niv...', which is
# not a column and raised a KeyError on df[features]. It is replaced by stock
# columns that this cell reads or creates above.
# NOTE(review): extend the list with further predictors as needed.
features = ['jour_semaine', 'mois', 'niveau_stock', 'stock_minimum', 'stock_mm_7']
X = df[features]
y = df['delai_jours']
# Random 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
reg = RandomForestRegressor(n_estimators=100, random_state=42)
reg.fit(X_train, y_train)
pred = reg.predict(X_test)
print('MAE délai:', mean_absolute_error(y_test, pred))

# 5. Use case: return classification
# Attach the returned quantity per order; rows without a match (and any other
# missing value in the merged frame) are filled with 0.
returned_per_order = ret[['id_commande','quantite_retournee']]
df_class = df.merge(returned_per_order, on='id_commande', how='left').fillna(0)
# Positive class: at least one unit returned.
df_class['retour_flag'] = df_class['quantite_retournee'].gt(0).astype(int)

Xc = df_class[features]
yc = df_class['retour_flag']
# Stratified 80/20 split so both classes appear in each part.
Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    Xc, yc, test_size=0.2, random_state=42, stratify=yc
)

clf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
clf.fit(Xc_train, yc_train)
# Probability of the positive class, scored with ROC AUC.
yc_pred = clf.predict_proba(Xc_test)[:,1]
print('ROC AUC retour:', roc_auc_score(yc_test, yc_pred))

# Export the global KPIs for reporting.
glob_kpis.to_csv('kpi_supply_chain.csv', index=False)


## Other model

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, TimeSeriesSplit, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import mean_absolute_error, roc_auc_score, roc_curve
from statsmodels.tsa.statespace.sarimax import SARIMAX

# 1. Loading & preparation
df = pd.read_csv('../stephanie/acaht_prod_fourn_stock.csv', parse_dates=['date_achat','date_expedition','date_livraison'])
retours = pd.read_csv('../stephanie/retoursV2.csv', parse_dates=['date_retour'])

# KPI flags
# Stock-out: stock level below the minimum.
df['rupture_flag'] = (df['niveau_stock'] < df['stock_minimum']).astype(int)
# 'date_attendue' is not in parse_dates above, so convert it before comparing it
# with the parsed delivery date (no-op if it is already datetime). Rows with a
# missing date compare as False, i.e. not on time.
date_attendue = pd.to_datetime(df['date_attendue'])
df['on_time']      = (df['date_livraison'] <= date_attendue).astype(int)
df['full']         = (df['quant_livree'] >= df['quant_attendue']).astype(int)
df['otif_flag']    = df['on_time'] * df['full']
# Returned vs delivered quantity per order, kept only for orders present in both.
ret_tot   = retours.groupby('id_commande')['quantite_retournee'].sum()
deliv_tot = df.groupby('id_commande')['quant_livree'].sum()
ret = ret_tot.to_frame('quantite_retournee').join(deliv_tot.to_frame('quant_livree'), how='inner')
# Whole days between shipment and delivery.
df['delai_jours']  = (df['date_livraison'] - df['date_expedition']).dt.days

# Stock turnover inputs, all restricted to the latest purchase year.
# The former filter used day-of-year 1 and 365 over every year at once: day 365
# is 30 December in leap years, the stock average mixed all years while the
# numerator covered the last year only, and a year without rows on those exact
# days produced NaN. The first and last purchase days of the latest year are
# used instead.
purchase_year = df['date_achat'].dt.year
purchase_day = df['date_achat'].dt.normalize()
in_last_year = purchase_year == purchase_year.max()
conso_last_year = df.loc[in_last_year, 'quant_achetee'].sum()
opening_stock = df.loc[purchase_day == purchase_day[in_last_year].min(), 'niveau_stock'].mean()
closing_stock = df.loc[purchase_day == purchase_day[in_last_year].max(), 'niveau_stock'].mean()

# One-row table of global KPIs.
glob_kpis = pd.DataFrame({
    'otif_pct':      [df['otif_flag'].mean()*100],
    'taux_retour_pct': [(ret['quantite_retournee'].sum()/ret['quant_livree'].sum())*100],
    'delai_moyen_jours': [df['delai_jours'].mean()],
    'rotation_stock': [conso_last_year / ((opening_stock + closing_stock) / 2)],
    'cout_unitaire': [df['cout_transport'].sum()/df['quant_livree'].sum()],
    'fill_rate_pct': [df['quant_livree'].sum()/df['quant_attendue'].sum()*100]
})

# 2. Feature engineering for ML
# Chronological order first: lags and moving averages below are row-based.
df = df.sort_values('date_achat')
df['jour_semaine'] = df['date_achat'].dt.dayofweek
df['mois']         = df['date_achat'].dt.month
# Purchased quantity 1, 7 and 14 rows earlier within the same product.
for lag in [1,7,14]:
    df[f'lag_{lag}'] = df.groupby('id_produit')['quant_achetee'].shift(lag)
# Moving averages over the PREVIOUS w rows of the same product. The series is
# shifted by one row first so the window never contains the current row's
# quant_achetee, which is the target of the forecasting models (no leakage).
for w in [7,30]:
    df[f'mm_{w}'] = df.groupby('id_produit')['quant_achetee'].transform(
        lambda x: x.shift(1).rolling(w, 1).mean()
    )
# Keep only rows where every lag / moving-average feature is available.
df_ml = df.dropna(subset=[f'lag_{l}' for l in [1,7,14]] + [f'mm_{w}' for w in [7,30]])

# 3. Forecasting
results = {}
# 3.1 SARIMAX time series per product
prods = df_ml['id_produit'].unique()[:3]  # example: first three products
for pid in prods:
    ts = df_ml[df_ml['id_produit']==pid].set_index('date_achat')['quant_achetee']
    # Hold out the last 30 observations.
    train, test = ts.iloc[:-30], ts.iloc[-30:]
    model = SARIMAX(train, order=(1,1,1), seasonal_order=(1,1,1,7))
    fit = model.fit(disp=False)
    # The index holds one entry per purchase row and carries no guaranteed
    # regular frequency, so out-of-sample dates cannot be looked up by label.
    # Forecast by step count instead and label the result with the test dates.
    pred = pd.Series(np.asarray(fit.forecast(steps=len(test))), index=test.index)
    mae = mean_absolute_error(test, pred)
    results[pid] = {'sarimax_mae': mae, 'actual':test, 'pred':pred}

# 3.2 ML-based forecasting (random forest)
features = (
    ['jour_semaine', 'mois']
    + [f'lag_{l}' for l in (1, 7, 14)]
    + [f'mm_{w}' for w in (7, 30)]
)
fore_mae = {}
# Expanding-window splitter: each fold trains on earlier rows, tests on later ones.
tscv = TimeSeriesSplit(n_splits=5)
for pid in prods:
    sub = df_ml[df_ml['id_produit']==pid]
    X = sub[features]
    y = sub['quant_achetee']
    maes = []
    for train_idx, test_idx in tscv.split(X):
        rf = RandomForestRegressor(n_estimators=100, random_state=0)
        rf.fit(X.iloc[train_idx], y.iloc[train_idx])
        fold_pred = rf.predict(X.iloc[test_idx])
        maes.append(mean_absolute_error(y.iloc[test_idx], fold_pred))
    # Average MAE over the folds, stored next to the SARIMAX result.
    fore_mae[pid] = np.mean(maes)
    results[pid]['rf_mae'] = fore_mae[pid]

# 4. Clustering
# 4.1 Products: one row per product with volume, mean price, mean stock and mean delay.
prod_feats = df.groupby('id_produit').agg({
    'quant_achetee':'sum','prix_unitaire':'mean','niveau_stock':'mean','delai_jours':'mean'
}).reset_index()
# drop(columns=...) replaces the positional-axis form drop('id_produit', 1),
# which pandas 2 rejects with a TypeError.
kmeans_p = KMeans(n_clusters=4, random_state=0).fit(prod_feats.drop(columns='id_produit'))
prod_feats['cluster_prod']=kmeans_p.labels_
# 4.2 Suppliers: one row per supplier with volume, mean price and mean delay.
fourn_feats = df.groupby('nom_fournisseur').agg({
    'quant_achetee':'sum','prix_unitaire':'mean','delai_jours':'mean'
}).reset_index()
kmeans_f = KMeans(n_clusters=3, random_state=0).fit(fourn_feats.drop(columns='nom_fournisseur'))
fourn_feats['cluster_fourn']=kmeans_f.labels_

# 5. Hyper-parameter search with cross-validation (random-forest regressor example)
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5, 10],
}
# 10 random candidates, 3-fold CV, scored by (negated) mean absolute error.
rs = RandomizedSearchCV(
    RandomForestRegressor(),
    param_dist,
    n_iter=10,
    cv=3,
    scoring='neg_mean_absolute_error',
    random_state=0,
)
X_all = df_ml[features]
y_all = df_ml['delai_jours']
rs.fit(X_all, y_all)
best_params = rs.best_params_

# 6. Visualisations
# Global KPIs as a bar chart. DataFrame.plot opens its own figure unless it is
# given an Axes, so one is passed explicitly instead of leaving an empty
# plt.figure() behind.
fig, ax = plt.subplots()
glob_kpis.T.plot(kind='bar', legend=False, ax=ax)
ax.set_title('KPI globaux')
fig.tight_layout()
plt.show()

# One figure per product: held-out actuals against the SARIMAX forecast.
for pid,res in results.items():
    fig, ax = plt.subplots()
    res['actual'].plot(ax=ax, label='actuel')
    res['pred'].plot(ax=ax, label='SARIMAX')
    ax.set_title(f'Forecast SARIMAX PID {pid} (MAE={res["sarimax_mae"]:.1f})')
    ax.legend()
    plt.show()

# MAE of both approaches per product. Built with comprehensions so that an
# empty `results` yields empty lists instead of a failed tuple unpacking.
ids = list(results)
maes_s = [results[pid]['sarimax_mae'] for pid in ids]
maes_rf = [results[pid]['rf_mae'] for pid in ids]
fig, ax = plt.subplots()
ax.plot(ids, maes_s, label='SARIMAX MAE')
ax.plot(ids, maes_rf, label='RF MAE')
ax.set_title('MAE comparatif')
ax.legend()
plt.show()

# ROC curve for the return classifier
# Attach the returned quantity per order (ret is keyed by id_commande); every
# missing value in the merged frame is filled with 0.
returned_qty = ret[['quantite_retournee']]
df_class = df.merge(returned_qty, on='id_commande', how='left').fillna(0)
df_class['retour_flag'] = df_class['quantite_retournee'].gt(0).astype(int)
Xc = df_class[features]
yc = df_class['retour_flag']
# Stratified 80/20 split.
Xct, Xcv, yct, ycv = train_test_split(Xc, yc, test_size=0.2, random_state=0, stratify=yc)
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(Xct, yct)
# Positive-class probabilities on the validation part.
probs = clf.predict_proba(Xcv)[:, 1]
fpr, tpr, _ = roc_curve(ycv, probs)
plt.figure()
plt.plot(fpr, tpr)
plt.title(f'ROC AUC={roc_auc_score(ycv,probs):.2f}')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.show()

# Export the results
for frame, path in (
    (glob_kpis, 'kpi_supply_chain.csv'),
    (prod_feats, 'clusters_produits.csv'),
    (fourn_feats, 'clusters_fournisseurs.csv'),
):
    frame.to_csv(path, index=False)


In [None]:
# Python Notebook pour optimisation de la chaîne d'approvisionnement

# 0. Imports et configuration
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, TimeSeriesSplit, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import mean_absolute_error, mean_squared_error, roc_auc_score, roc_curve, precision_score, recall_score, f1_score
from statsmodels.tsa.statespace.sarimax import SARIMAX
from prophet import Prophet
from xgboost import XGBRegressor, XGBClassifier

# 1. Data loading and preparation
# Each table is read from its extracted CSV; date columns are parsed on load.
df_cmd = pd.read_csv(
    '/base/commandesV2.csv',
    sep=";",
    parse_dates=['date_commande'],
)
df_exp = pd.read_csv(
    '/mnt/data/expeditions.csv',
    parse_dates=['date_expedition','date_livraison'],
)
df_ret = pd.read_csv('/mnt/data/retours.csv', parse_dates=['date_retour'])
df_stock = pd.read_csv('/mnt/data/stocks.csv', parse_dates=['date_stock'])
df_achats = pd.read_csv('/mnt/data/achats.csv', parse_dates=['date_achat'])
df_fourn = pd.read_csv('/mnt/data/fournisseurs.csv')
df_prod = pd.read_csv('/mnt/data/produits.csv')

# Text harmonisation: strip surrounding whitespace and lower-case every
# object-typed column, in place, in all tables. (The loop rebinds the name `df`.)
all_tables = [df_cmd, df_exp, df_ret, df_stock, df_achats, df_fourn, df_prod]
for df in all_tables:
    text_columns = df.select_dtypes(include=['object']).columns
    for col in text_columns:
        stripped = df[col].str.strip()
        df[col] = stripped.str.lower()

# 2. KPI computation
# 2.1 OTIF (on time and in full)
# 'date_attendue' is not in the parse_dates list used when df_exp is loaded, so
# it is converted explicitly before being compared with the parsed delivery date
# (no-op if already datetime). Rows with a missing date count as not on time.
date_attendue = pd.to_datetime(df_exp['date_attendue'])
df_exp['on_time'] = (df_exp['date_livraison'] <= date_attendue).astype(int)
df_exp['full']    = (df_exp['quant_livree'] >= df_exp['quant_attendue']).astype(int)
df_exp['otif_flag'] = df_exp['on_time'] * df_exp['full']
otif_pct = df_exp['otif_flag'].mean() * 100

# 2.2 Return rate: returned over delivered quantity, for orders found in both tables.
ret_tot   = df_ret.groupby('id_commande')['quantite_retournee'].sum()
deliv_tot = df_exp.groupby('id_commande')['quant_livree'].sum()
returned_frame = ret_tot.to_frame('quantite_retournee')
delivered_frame = deliv_tot.to_frame('quant_livree')
ret = returned_frame.join(delivered_frame, how='inner')
total_returned = ret['quantite_retournee'].sum()
total_delivered = ret['quant_livree'].sum()
retour_pct = (total_returned / total_delivered) * 100

# 2.3 Mean shipping delay: whole days between shipment and delivery.
transit = df_exp['date_livraison'] - df_exp['date_expedition']
df_exp['lead_time'] = transit.dt.days
delai_moyen = df_exp['lead_time'].mean()

# 2.4 Stock rotation
# Yearly consumption (delivered quantity) divided by the average of the opening
# and closing stock of the latest year found in the stock table.
stock_year = df_stock['date_stock'].dt.year
year = stock_year.max()
consommation = df_exp[df_exp['date_expedition'].dt.year==year]['quant_livree'].sum()

# Opening / closing stock: first and last snapshot days available in that year.
# With snapshots on 1 January and 31 December this matches the former exact-date
# lookup; without them the KPI no longer collapses to NaN.
stock_day = df_stock['date_stock'].dt.normalize()
days_in_year = stock_day[stock_year == year]
stock_deb = df_stock.loc[stock_day == days_in_year.min(), 'niveau_stock'].mean()
stock_fin = df_stock.loc[stock_day == days_in_year.max(), 'niveau_stock'].mean()
rotation_stock = consommation / ((stock_deb + stock_fin)/2)

# 2.5 Unit logistics cost: total transport cost per delivered unit.
transport_cost_total = df_exp['cout_transport'].sum()
delivered_total = df_exp['quant_livree'].sum()
cout_unitaire = transport_cost_total / delivered_total

# 2.6 Fill rate: total delivered over total expected, as a percentage.
expected_total = df_exp['quant_attendue'].sum()
fill_rate = delivered_total / expected_total * 100

# KPIs gathered in a labelled series.
kpi_labels = [
    'OTIF_%', 'Taux_retour_%', 'Delai_moyen_jours',
    'Rotation_stock', 'Cout_unitaire', 'Fill_rate_%',
]
kpi_values = [otif_pct, retour_pct, delai_moyen, rotation_stock, cout_unitaire, fill_rate]
glob_kpis = pd.Series(dict(zip(kpi_labels, kpi_values)))

# 3. Quick EDA: bar chart of the KPIs.
glob_kpis.plot(kind='bar', title='KPI Supply Chain')
plt.tight_layout()
plt.show()

# 4. Feature engineering for ML
# Start from the orders and attach the returned quantity per order (ret is keyed
# by id_commande); missing values in the merged frame are filled with 0.
df_ml = df_cmd.merge(ret[['quantite_retournee']], on='id_commande', how='left').fillna(0)
# Calendar features
df_ml['mois'] = df_ml['date_commande'].dt.month
df_ml['jour_semaine'] = df_ml['date_commande'].dt.dayofweek
# Returned quantity of the previous month.
# The monthly totals are labelled with the month START ('MS') so they line up
# with the month-start key derived from each order date. The former
# resample('M') labelled bins with the month END, so the lookup never matched
# and roll_1m was entirely NaN.
monthly = df_ml.set_index('date_commande')['quantite_retournee'].resample('MS').sum()
order_month = df_ml['date_commande'].dt.to_period('M').dt.to_timestamp()
# shift(1) moves each total to the following month; map() looks it up per order.
df_ml['roll_1m'] = order_month.map(monthly.shift(1))

# 5. Demand forecasting
# Daily quantity per product (days without orders count as 0).
df_ts = df_cmd.set_index('date_commande').groupby('id_produit')['quantite'].resample('D').sum().fillna(0)
results = {}
prods = df_ts.index.get_level_values(0).unique()[:3]  # example: first three products
lag_steps = [1, 7, 14]
for pid in prods:
    ts = df_ts.loc[pid]
    # Hold out the last 30 days.
    train, test = ts[:-30], ts[-30:]
    # SARIMAX
    model = SARIMAX(train, order=(1,1,1), seasonal_order=(1,1,1,7)).fit(disp=False)
    sar_pred = model.predict(start=test.index[0], end=test.index[-1])
    mae_s = mean_absolute_error(test, sar_pred)
    # Prophet expects columns named 'ds' (date) and 'y' (value).
    df_prop = train.reset_index().rename(columns={'date_commande':'ds','quantite':'y'})
    m = Prophet(); m.fit(df_prop)
    future = m.make_future_dataframe(periods=30, freq='D')
    pr = m.predict(future); pr_pred = pr.set_index('ds')['yhat'].loc[test.index]
    mae_p = mean_absolute_error(test, pr_pred)
    # Random forest on lagged values.
    # Lags are computed on the whole series so the test days get lag values too.
    # The former code indexed the test lags with the TRAIN dates, which are not
    # in the test index and raised a KeyError. Test lags use observed history,
    # i.e. this is a one-step-ahead evaluation.
    lags = pd.concat([ts.shift(lag) for lag in lag_steps], axis=1)
    lags.columns = ['lag1','lag7','lag14']
    train_lags = lags.loc[train.index].dropna()
    X = train_lags.values; y = train.loc[train_lags.index].values
    rf = RandomForestRegressor(n_estimators=100, random_state=0).fit(X,y)
    # NOTE(review): assumes the series is longer than 30 + 14 days so that no
    # test row has a missing lag.
    X_test = lags.loc[test.index].values
    mae_rf = mean_absolute_error(test, rf.predict(X_test))
    results[pid] = {'SARIMAX_MAE':mae_s,'Prophet_MAE':mae_p,'RF_MAE':mae_rf}

# Comparison chart: one group of bars per product.
res_df = pd.DataFrame(results).T
res_df.plot(kind='bar', title='MAE comparatif Forecast'); plt.tight_layout(); plt.show()

# 6. Product and supplier clustering
# Products: total purchased quantity and mean unit price per product, 4 k-means clusters.
product_agg = {'quant_achetee':'sum','prix_unitaire':'mean'}
df_p = df_achats.groupby('id_produit').agg(product_agg).reset_index()
k_p = KMeans(n_clusters=4, random_state=0).fit(df_p[['quant_achetee','prix_unitaire']])
df_p['cluster'] = k_p.labels_

# Suppliers: total purchased quantity and mean lead time per supplier, clustered with DBSCAN.
supplier_cols = ['quant_achetee','delai_moyen_jours']
df_f = df_achats.merge(df_fourn[['id_fournisseur','delai_moyen_jours']], on='id_fournisseur')
df_f = (
    df_f.groupby('id_fournisseur')
        .agg({'quant_achetee':'sum','delai_moyen_jours':'mean'})
        .reset_index()
)
k_f = DBSCAN(eps=10, min_samples=2).fit(df_f[supplier_cols])
df_f['db_cluster'] = k_f.labels_

# PCA projection of the two supplier features, used to plot the clusters.
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
comp = pca.fit_transform(df_f[supplier_cols])
df_f['pca1'] = comp[:, 0]
df_f['pca2'] = comp[:, 1]
plt.figure()
plt.scatter(df_f['pca1'], df_f['pca2'], c=df_f['db_cluster'], cmap='tab10')
plt.title('Clusters Fournisseurs (DBSCAN)')
plt.show()

# 7. Delay classification models
df_delay = df_exp.copy()
# A shipment is "late" when its lead time exceeds 3 days.
df_delay['late'] = df_delay['lead_time'].gt(3).astype(int)
delay_inputs = df_delay[['quant_livree','cout_transport','entrepôt']]
# One-hot encode non-numeric inputs, dropping the first level of each.
X = pd.get_dummies(delay_inputs, drop_first=True)
y = df_delay['late']
# Stratified 70/30 split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42
)


def eval_clf(m):
    """Fit ``m`` on the training split and print its test-split metrics.

    Prints the class name, then precision, recall, F1 (from hard predictions)
    and ROC AUC (from positive-class probabilities).
    """
    m.fit(X_train, y_train)
    pred = m.predict(X_test)
    prob = m.predict_proba(X_test)[:, 1]
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    auc = roc_auc_score(y_test, prob)
    print(m.__class__.__name__)
    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1: {f1:.2f}, AUC: {auc:.2f}")


candidates = [
    RandomForestClassifier(n_estimators=100),
    XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
]
for model in candidates:
    eval_clf(model)

# ROC curve for a freshly fitted random forest.
rf_clf = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
rf_probs = rf_clf.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, rf_probs)
plt.figure()
plt.plot(fpr, tpr)
plt.title('ROC RF')
plt.show()

# 8. Hyper-parameter search example for a random-forest regressor
param_dist = {'n_estimators':[50,100,200],'max_depth':[None,5,10]}
# 5 random candidates, 3-fold CV, scored by (negated) mean absolute error.
rs = RandomizedSearchCV(RandomForestRegressor(), param_dist, n_iter=5, cv=3, scoring='neg_mean_absolute_error',random_state=42)
# Predict the purchased quantity from the mean unit price. The former call used
# quant_achetee both as the only feature and as the target, so the search had
# nothing meaningful to discriminate between candidates.
rs.fit(df_p[['prix_unitaire']], df_p['quant_achetee'])
print('Best Params Reg:', rs.best_params_)
'''
_Notes_ :
- Remplacement des connexions SQL par lecture CSV.
- Fusion des tables selon votre schéma de projet.
- Intégration forecasting (SARIMAX, Prophet, RF), clustering (KMeans, DBSCAN), classification retards.
- Visualisations matplotlib intégrées.
- Exemples d’hyperparamétrage avec RandomizedSearchCV.
'''
