In [11]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Number of synthetic engine records (rows) generated for the dataset.
N = 20000

def random_date(start, end):
    """Return a random datetime at or after ``start`` and no later than ``end``.

    The offset from ``start`` is a whole number of seconds drawn with
    ``random.randint`` (both bounds inclusive); any fractional second in the
    span between the two dates is truncated.
    """
    span_seconds = int((end - start).total_seconds())
    offset_seconds = random.randint(0, span_seconds)
    return start + timedelta(seconds=offset_seconds)

def random_specification():
    """Build a random specification string such as ``"300 HP, Moyen, 200 kg"``.

    Power, size and weight are drawn independently, in that order, with
    ``random.choice``.
    """
    ratings = [100, 200, 300, 400]
    sizes = ["Petit", "Moyen", "Grand"]

    horsepower = random.choice(ratings)
    frame_size = random.choice(sizes)
    mass_kg = random.choice(ratings)
    return f"{horsepower} HP, {frame_size}, {mass_kg} kg"

def incidents_fabrication(Localisation_Usine):
    """Draw a random number of manufacturing incidents for one plant.

    "Usine C" and "Usine D" have fewer incidents (0 or 1); every other plant
    gets between 0 and 3.
    """
    low_incident_plants = ("Usine C", "Usine D")
    # np.random.randint excludes its upper bound.
    upper_exclusive = 2 if Localisation_Usine in low_incident_plants else 4
    return np.random.randint(0, upper_exclusive)

def fabrication_duration(Localisation_Usine, incidents):
    """Return a random manufacturing duration, in days, for one engine.

    A base duration of 1 to 29 days is drawn with ``np.random.randint`` and
    each incident adds 2 days. For "Usine C" the resulting total is doubled.
    """
    base_days = np.random.randint(1, 30)  # upper bound is exclusive
    total_days = base_days + incidents * 2
    if Localisation_Usine != "Usine C":
        return total_days
    return total_days * 2
    
def cout_fabrication(type_moteur):
    """Return the base manufacturing cost for an engine type.

    Diesel -> 1000, Électrique -> 1500, Hybride -> 1200. Any other value
    returns ``None``.
    """
    base_costs = (("Diesel", 1000), ("Électrique", 1500), ("Hybride", 1200))
    for engine_type, cost in base_costs:
        if type_moteur == engine_type:
            return cost
    return None


def delai_livraison(mode_expedition):
    """Return the base delivery delay for a shipping mode.

    Terrestre -> 7, Aérien -> 3, Maritime -> 14. Any other value returns
    ``None``.
    """
    base_delays = (("Terrestre", 7), ("Aérien", 3), ("Maritime", 14))
    for shipping_mode, delay in base_delays:
        if mode_expedition == shipping_mode:
            return delay
    return None


def demande_client(type_moteur):
    """Return the base customer demand figure for an engine type.

    Diesel -> 80, Électrique -> 95, Hybride -> 85. Any other value returns
    ``None``.
    """
    base_demand = (("Diesel", 80), ("Électrique", 95), ("Hybride", 85))
    for engine_type, demand in base_demand:
        if type_moteur == engine_type:
            return demand
    return None


# Seed both random generators used by this cell (and by the helper functions it
# calls) so that re-running the notebook regenerates exactly the same dataset.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Base table: one row per engine with a random type, specification, plant and
# manufacturing start date between 2020-01-01 and 2020-12-31.
df = pd.DataFrame({
    "ID_Moteur": range(1, N + 1),
    "Type_Moteur": np.random.choice(["Diesel", "Électrique", "Hybride"], N),
    "Spécifications": [random_specification() for _ in range(N)],
    "Localisation_Usine": np.random.choice(["Usine A", "Usine B", "Usine C", "Usine D"], N),
    "Date_Début_Fabrication": [random_date(datetime(2020, 1, 1), datetime(2020, 12, 31)) for _ in range(N)],
})

# Incidents are generated first because the manufacturing duration depends on them.
df["Incidents_Fabrication"] = df["Localisation_Usine"].apply(incidents_fabrication)

# Simulated anomaly: incidents at "Usine D" are multiplied by 10 for start dates
# strictly between 2020-08-01 and 2020-09-01.
df["Incidents_Fabrication"] = np.where(
    (
        (df["Localisation_Usine"] == "Usine D")
        & (df["Date_Début_Fabrication"] > "2020-08-01")
        & (df["Date_Début_Fabrication"] < "2020-09-01")
    ),
    df["Incidents_Fabrication"] * 10,
    df["Incidents_Fabrication"],
)

# Manufacturing duration, taking the incidents into account.
df["Durée_Fabrication"] = df.apply(
    lambda x: fabrication_duration(x["Localisation_Usine"], x["Incidents_Fabrication"]),
    axis=1,
)
df["Mode_Expédition"] = np.random.choice(["Terrestre", "Aérien", "Maritime"], N, p=[0.5, 0.2, 0.3])

# Extra structure layered on the duration. The multipliers below stack, so the
# order of these statements matters only in that each one reads the result of
# the previous one.
df["Durée_Fabrication"] = np.where(
    df["Localisation_Usine"] == "Usine B",
    df["Durée_Fabrication"] * 2,
    df["Durée_Fabrication"],
)
df["Durée_Fabrication"] = np.where(
    df["Type_Moteur"] == "Diesel",
    df["Durée_Fabrication"] * 0.7,
    df["Durée_Fabrication"],
)
df["Durée_Fabrication"] = np.where(
    df["Type_Moteur"] == "Hybride",
    df["Durée_Fabrication"] * 0.5,
    df["Durée_Fabrication"],
)
df["Durée_Fabrication"] = np.where(
    (df["Localisation_Usine"] == "Usine B") & (df["Date_Début_Fabrication"] > "2020-08-16"),
    df["Durée_Fabrication"] * 0.5,
    df["Durée_Fabrication"],
)
df["Durée_Fabrication"] = np.where(
    (df["Localisation_Usine"] == "Usine B") & (df["Date_Début_Fabrication"] > "2020-10-16"),
    df["Durée_Fabrication"] * 1.3,
    df["Durée_Fabrication"],
)

df["Durée_Fabrication"] = np.where(
    (
        (df["Localisation_Usine"] == "Usine A")
        & (df["Date_Début_Fabrication"] > "2020-02-16")
        & (df["Mode_Expédition"] == "Aérien")
    ),
    df["Durée_Fabrication"] * 3,
    df["Durée_Fabrication"],
)
df["Durée_Fabrication"] = np.where(
    (df["Localisation_Usine"] == "Usine B") & (df["Date_Début_Fabrication"] > "2020-09-02"),
    df["Durée_Fabrication"] * 0.5,
    df["Durée_Fabrication"],
)

# Seasonal (sinusoidal) variation driven by the month number (1-12) of the start date.
df['Mois'] = df['Date_Début_Fabrication'].dt.month

# One full sine period over 12 months; the factor swings between -0.5 and +0.5,
# i.e. up to +/-50% of the manufacturing duration.
amplitude = 0.5
fréquence = 2 * np.pi / 12
df['Variation_Sinus'] = np.sin(df['Mois'] * fréquence) * amplitude

# The seasonal variation is applied to electric engines only.
df['Durée_Fabrication'] = np.where(df['Type_Moteur'] == "Électrique",
                                   df['Durée_Fabrication'] * (1 + df['Variation_Sinus']),
                                   df['Durée_Fabrication'])

# Durée_Fabrication is expressed in (possibly fractional) days.
df["Date_Fin_Fabrication"] = df["Date_Début_Fabrication"] + pd.to_timedelta(df["Durée_Fabrication"], unit='d')

# NOTE(review): np.random.rand(1)[0] is a single scalar, so each of the three
# columns built with it below (cost, delivery delay, demand) is scaled by ONE
# random factor shared by all rows. Draw N factors instead if per-row noise is
# what is wanted.
df["Coût_Fabrication"] = df["Type_Moteur"].apply(cout_fabrication)*(1+np.random.rand(1)[0])
df["Contrôles_Qualité_Passés"] = np.random.randint(0, 5, N)

df["Délai_Livraison"] = df["Mode_Expédition"].apply(delai_livraison)*(1+np.random.rand(1)[0])
df["Coût_Expédition"] = np.random.uniform(100, 1000, N).round(2)
df["Demande_Client"] = df["Type_Moteur"].apply(demande_client)*(1+np.random.rand(1)[0])
df["Capacité_Production_Usine"] = df["Localisation_Usine"].map({"Usine A": 400, "Usine B": 300, "Usine C": 200, "Usine D": 100})
df["Stocks_Disponibles"] = np.random.randint(0, 50, N)

# DLT = manufacturing duration + delivery delay.
df['DLT'] = df['Durée_Fabrication'] + df['Délai_Livraison']
df.to_pickle("dataset_moteurs_enrichi.pkl")


In [2]:
# Preview the first five rows of the generated dataset.
df.head(5)

Unnamed: 0,ID_Moteur,Type_Moteur,Spécifications,Localisation_Usine,Date_Début_Fabrication,Incidents_Fabrication,Durée_Fabrication,Mode_Expédition,Date_Fin_Fabrication,Coût_Fabrication,Contrôles_Qualité_Passés,Délai_Livraison,Coût_Expédition,Demande_Client,Capacité_Production_Usine,Stocks_Disponibles,DLT
0,1,Diesel,"100 HP, Petit, 100 kg",Usine B,2020-01-07 01:45:09,0,32.2,Maritime,2020-02-08 06:33:09,1881.452271,0,21.39843,929.55,94.870098,300,46,53.59843
1,2,Diesel,"100 HP, Moyen, 100 kg",Usine C,2020-12-08 05:15:09,0,19.6,Maritime,2020-12-27 19:39:09,1881.452271,2,21.39843,383.89,94.870098,200,10,40.99843
2,3,Diesel,"100 HP, Moyen, 300 kg",Usine C,2020-03-19 08:43:35,1,11.2,Terrestre,2020-03-30 13:31:35,1881.452271,2,10.699215,240.88,94.870098,200,48,21.899215
3,4,Diesel,"400 HP, Moyen, 400 kg",Usine C,2020-02-05 08:49:38,0,22.4,Terrestre,2020-02-27 18:25:38,1881.452271,2,10.699215,959.27,94.870098,200,38,33.099215
4,5,Diesel,"200 HP, Petit, 400 kg",Usine C,2020-12-20 04:37:58,1,19.6,Aérien,2021-01-08 19:01:58,1881.452271,3,4.585378,538.33,94.870098,200,19,24.185378


In [12]:
# List the column names currently present in df.
df.columns

Index(['ID_Moteur', 'Type_Moteur', 'Spécifications', 'Localisation_Usine',
       'Date_Début_Fabrication', 'Incidents_Fabrication', 'Durée_Fabrication',
       'Mode_Expédition', 'Mois', 'Variation_Sinus', 'Date_Fin_Fabrication',
       'Coût_Fabrication', 'Contrôles_Qualité_Passés', 'Délai_Livraison',
       'Coût_Expédition', 'Demande_Client', 'Capacité_Production_Usine',
       'Stocks_Disponibles', 'DLT'],
      dtype='object')

In [13]:

# Inspect the dtypes of the categorical and date columns (the same columns the
# modelling cell uses as input features).
df[['Type_Moteur', 'Spécifications', 'Localisation_Usine',
       'Date_Début_Fabrication',
       'Mode_Expédition']].dtypes

Type_Moteur                       object
Spécifications                    object
Localisation_Usine                object
Date_Début_Fabrication    datetime64[ns]
Mode_Expédition                   object
dtype: object

In [None]:
# Scratch cell: a bare list of the candidate feature column names. The list is
# not assigned to any variable, so running this cell only displays it.
['Type_Moteur', 'Spécifications', 'Localisation_Usine',
       'Date_Début_Fabrication',
       'Mode_Expédition']

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator, TransformerMixin
import random

class DateFeatureExtractor(BaseEstimator, TransformerMixin):
    """Transformer that expands one datetime column into Year/Month/Day columns.

    Parameters
    ----------
    date_col_name : str
        Name of the datetime column to read from the input DataFrame.
    """

    def __init__(self, date_col_name):
        self.date_col_name = date_col_name

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a new DataFrame with ``Year``, ``Month`` and ``Day`` columns.

        Raises
        ------
        TypeError
            If ``X`` is not a pandas DataFrame.
        """
        if not isinstance(X, pd.DataFrame):
            raise TypeError("Input must be a DataFrame")

        date_parts = X[self.date_col_name].dt
        return pd.DataFrame({
            "Year": date_parts.year,
            "Month": date_parts.month,
            "Day": date_parts.day,
        })

def bootstrap_intervals(X, y, model, n_bootstraps=1000,
                        lower_pct=5, upper_pct=95, random_state=None):
    """Estimate per-row bootstrap percentile bounds on a model's predictions.

    For each of ``n_bootstraps`` rounds the rows of ``X``/``y`` are resampled
    with replacement, ``model`` is refit on the resample and then asked to
    predict on the full ``X``. The bounds are percentiles, taken across rounds,
    of those predictions, so they describe how much the refit model's
    prediction moves, not the spread of individual observations.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; indexed positionally with ``.iloc``.
    y : pandas.Series
        Target aligned positionally with ``X``; indexed with ``.iloc``.
    model : object
        Anything with ``fit(X, y)`` and ``predict(X)``. It is refit in place
        and is left fitted on the last resample.
    n_bootstraps : int, default 1000
        Number of resampling rounds (at least 1).
    lower_pct, upper_pct : float, default 5 and 95
        Percentiles (0-100) used for the lower and upper bound.
    random_state : int or None, default None
        Seed for the resampling. ``None`` draws from NumPy's global generator,
        which is what this function has always done.

    Returns
    -------
    (lower_bound, upper_bound) : tuple of numpy.ndarray
        One value per row of ``X`` for each bound.

    Raises
    ------
    ValueError
        If ``n_bootstraps`` < 1, the percentiles are out of order or outside
        [0, 100], or ``X`` has no rows.
    """
    if n_bootstraps < 1:
        raise ValueError("n_bootstraps must be at least 1")
    if not 0 <= lower_pct <= upper_pct <= 100:
        raise ValueError("percentiles must satisfy 0 <= lower_pct <= upper_pct <= 100")
    n_rows = len(X)
    if n_rows == 0:
        raise ValueError("X must contain at least one row")

    # The np.random module and a RandomState instance both expose randint, so
    # the unseeded path keeps consuming the global generator exactly as before.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    predictions = []
    for _ in range(n_bootstraps):
        # Sample row positions with replacement.
        rows = rng.randint(0, n_rows, n_rows)
        model.fit(X.iloc[rows], y.iloc[rows])
        predictions.append(model.predict(X))

    # A single percentile call returns both bounds stacked along the first axis.
    lower_bound, upper_bound = np.percentile(
        np.asarray(predictions), [lower_pct, upper_pct], axis=0
    )
    return lower_bound, upper_bound

# Preprocessing
# Input features. Every other column of X is discarded by the ColumnTransformer
# below (remainder='drop').
cat_features = ['Type_Moteur', 'Spécifications', 'Localisation_Usine', 'Mode_Expédition']
date_feature = ['Date_Début_Fabrication']

preprocessor = ColumnTransformer([
    # handle_unknown='ignore': bootstrap_intervals refits the pipeline on
    # resampled rows and then predicts on the full X, so a category that is
    # missing from a resample must not make the encoder raise.
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
    # The date transformer receives the date column and expands it itself.
    ('date', DateFeatureExtractor(date_col_name=date_feature[0]), date_feature)
], remainder='drop')

# Target is the total lead time 'DLT'; all remaining columns stay in X.
X, y = df.drop(columns=['DLT']), df['DLT']
# A held-out split is created here, but the bootstrap below uses the full X and y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing + gradient boosting regressor in a single pipeline.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42))
])

# bootstrap_intervals resamples with NumPy's global generator; seed it so the
# bounds are reproducible from one run to the next.
np.random.seed(42)
lower_bound, upper_bound = bootstrap_intervals(X, y, model)

# Store the per-row bounds next to the data.
df['DLT_lower_bound'] = lower_bound
df['DLT_upper_bound'] = upper_bound


In [16]:
# Overwrite the pickle written by the generation cell (same file name) so that
# it also contains the DLT bound columns added above.
df.to_pickle("dataset_moteurs_enrichi.pkl")


In [17]:
# Preview the first ten rows, including the new DLT bound columns.
df.head(10)

Unnamed: 0,ID_Moteur,Type_Moteur,Spécifications,Localisation_Usine,Date_Début_Fabrication,Incidents_Fabrication,Durée_Fabrication,Mode_Expédition,Mois,Variation_Sinus,...,Coût_Fabrication,Contrôles_Qualité_Passés,Délai_Livraison,Coût_Expédition,Demande_Client,Capacité_Production_Usine,Stocks_Disponibles,DLT,DLT_lower_bound,DLT_upper_bound
0,1,Diesel,"300 HP, Petit, 300 kg",Usine D,2020-07-12 07:24:14,0,18.2,Maritime,7,-0.25,...,1992.843606,1,16.557448,482.14,146.904682,100,41,34.757448,27.699947,29.357837
1,2,Hybride,"400 HP, Moyen, 400 kg",Usine D,2020-08-04 17:36:29,10,17.5,Terrestre,8,-0.433013,...,2391.412327,4,8.278724,722.06,156.086224,100,44,25.778724,16.362241,18.512632
2,3,Diesel,"200 HP, Grand, 200 kg",Usine D,2020-04-10 02:32:20,0,4.2,Terrestre,4,0.433013,...,1992.843606,3,8.278724,523.01,146.904682,100,34,12.478724,19.294129,20.801249
3,4,Diesel,"300 HP, Moyen, 100 kg",Usine D,2020-02-08 21:54:04,0,20.3,Terrestre,2,0.433013,...,1992.843606,1,8.278724,552.07,146.904682,100,27,28.578724,19.043308,21.323842
4,5,Électrique,"400 HP, Petit, 400 kg",Usine A,2020-04-07 01:33:53,2,17.196152,Terrestre,4,0.433013,...,2989.265409,4,8.278724,482.12,174.44931,400,14,25.474877,38.098227,41.79738
5,6,Diesel,"100 HP, Petit, 400 kg",Usine B,2020-04-27 22:28:15,2,32.2,Aérien,4,0.433013,...,1992.843606,3,3.548025,144.94,146.904682,300,14,35.748025,29.455779,32.733917
6,7,Électrique,"400 HP, Petit, 400 kg",Usine B,2020-09-03 15:38:28,2,1.75,Aérien,9,-0.5,...,2989.265409,4,3.548025,459.61,174.44931,300,49,5.298025,13.158691,16.335869
7,8,Diesel,"200 HP, Grand, 200 kg",Usine A,2020-02-25 21:46:06,0,1.4,Terrestre,2,0.433013,...,1992.843606,0,8.278724,290.07,146.904682,400,35,9.678724,19.519003,21.761343
8,9,Diesel,"100 HP, Moyen, 300 kg",Usine B,2020-03-20 14:17:45,3,22.4,Terrestre,3,0.5,...,1992.843606,3,8.278724,155.52,146.904682,300,12,30.678724,32.40426,34.398976
9,10,Hybride,"300 HP, Moyen, 300 kg",Usine D,2020-02-07 01:59:05,1,3.0,Maritime,2,0.433013,...,2391.412327,2,16.557448,550.2,156.086224,100,34,19.557448,24.801422,26.782222


In [37]:
# Clustering scatter plot: DLT against manufacturing incidents, coloured by
# cluster, with one marker symbol per engine type.
# NOTE(review): `px` (presumably plotly.express) and the 'Cluster' column are
# not defined in the cells visible here; confirm they are created earlier in
# the notebook so this cell survives Restart & Run All.
fig = px.scatter(df, x='DLT', y='Incidents_Fabrication',
                 color='Cluster', symbol='Type_Moteur',
                 hover_data=['Localisation_Usine'],
                 title="Clustering des Types de Moteur et des Usines selon DLT et Incidents de Fabrication")


In [38]:
# Display the figure. The recorded output shows that rendering fails unless
# nbformat>=4.2.0 is installed in the kernel's environment.
fig

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
# Mean DLT per plant.
# NOTE(review): 'ID_Usine' does not appear in the df.columns listing shown
# earlier (the plant column there is 'Localisation_Usine'); the recorded output
# presumably comes from a different version of df. Confirm the column exists.
df.groupby("ID_Usine").agg({"DLT":"mean"})

Unnamed: 0_level_0,DLT
ID_Usine,Unnamed: 1_level_1
1,25.49453
2,25.078782
3,27.315287
4,45.655385
