# Création du Dataset

In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the charging-session table; `charge_date` is parsed into datetimes at
# load time so that date arithmetic and time-based sorting work downstream.
sessions = pd.read_csv(
    "data/sessions.csv",
    parse_dates=["charge_date"],
)

# Preview the first ten rows.
sessions.head(n=10)

Unnamed: 0,charge_date,charger_id,connector_id,sessions,failed_sessions
0,2023-01-16,34406128,5,4,0
1,2023-01-23,37A80B4D,3,3,3
2,2023-01-18,AA92D18E,2,2,0
3,2023-01-28,FF15DC0B,3,2,0
4,2023-01-19,06B95C56,2,1,0
5,2023-01-28,11A94CC8,2,2,0
6,2023-01-07,1D3FBF8A,5,3,0
7,2023-01-16,DF5FD46E,1,2,0
8,2023-01-20,6392F834,2,2,1
9,2023-01-06,24DB5287,5,3,0


In [18]:
# Read the OCPP error log; `error_at` is parsed into datetimes at load time.
ocpp_logs = pd.read_csv(
    "data/ocpp.csv",
    parse_dates=["error_at"],
)

# Print the column index, then preview the first five rows.
print(ocpp_logs.columns)
ocpp_logs.head(n=5)


Index(['supplier', 'charger_id', 'connector_id', 'error_code',
       'vendor_error_code', 'total_error_notifications', 'error_at'],
      dtype='object')


Unnamed: 0,supplier,charger_id,connector_id,error_code,vendor_error_code,total_error_notifications,error_at
0,513B5F6A,4365B1B8,8,InternalError,sessmgr.emvPaymentController,1,2023-09-07
1,513B5F6A,790A884E,3,OtherError,RemoteStart,1,2023-08-10
2,513B5F6A,4365B1B8,8,InternalError,ServiceFail,1,2023-09-07
3,513B5F6A,123468DF,5,EVCommunicationError,COMMUNICATION_ERROR,1,2023-07-24
4,513B5F6A,02D469F0,7,InternalError,ServiceFail,2,2023-06-16


In [None]:
# Count the distinct error codes reported per charger and per `error_at` value.
#
# Rows without an `error_code` carry nothing to count, so they are dropped
# first. Note that this rebinds `ocpp_logs` to the filtered frame.
ocpp_logs = ocpp_logs.dropna(subset=["error_code"])

# One row per (charger_id, error_at); the aggregated column is named
# `unique_error_count` directly instead of being renamed afterwards.
error_counts_clean = (
    ocpp_logs
    .groupby(["charger_id", "error_at"])["error_code"]
    .nunique()
    .reset_index(name="unique_error_count")
)

# Quick sanity check of the result.
print(error_counts_clean.head())

KeyError: ['error_message']

In [None]:
# Sort both frames by charger, then chronologically within each charger, so
# that per-charger time-based transformations see rows in date order.
sessions = sessions.sort_values(by=["charger_id", "charge_date"])
error_counts_clean = error_counts_clean.sort_values(by=["charger_id", "error_at"])

# Verify the ordering *per charger*. After a sort keyed on charger_id first,
# the date column as a whole is generally not monotonic (dates restart at each
# new charger), so checking the entire column would report False even though
# the sort is correct.
sessions_sorted = sessions.groupby("charger_id")["charge_date"].is_monotonic_increasing.all()
errors_sorted = error_counts_clean.groupby("charger_id")["error_at"].is_monotonic_increasing.all()

print("Sessions trié ?", bool(sessions_sorted))
print("Erreurs trié ?", bool(errors_sorted))

In [2]:
# Load the data.
# The date columns are parsed at load time: without `parse_dates` they come
# back as plain strings, and the later `pd.merge_asof` call (which uses a
# Timedelta tolerance) requires real datetime keys.
sessions = pd.read_csv('data/sessions.csv', parse_dates=["charge_date"])
ocpp = pd.read_csv('data/ocpp.csv', parse_dates=["error_at"])

In [4]:
# Step 1: order each frame by charger, then by date within a charger, ahead of
# the per-charger temporal transformations that follow.
sessions = sessions.sort_values(["charger_id", "charge_date"])
ocpp_logs = ocpp.sort_values(["charger_id", "error_at"])

In [5]:
# Step 2: rolling 7-day session features, computed independently per charger.
#
# The window is time-based ("7D" on `charge_date`) rather than a fixed number
# of rows: a row-count window of 7 only equals seven days when every charger
# has exactly one row per calendar day, which does not hold when a charger has
# several rows for the same date or is missing days.
ROLLING_WINDOW = "7D"


def _rolling_sum_per_charger(frame, column, window=ROLLING_WINDOW):
    """Return the per-charger rolling sum of `column` over `window`.

    `frame` must be sorted by `charge_date` within each charger and
    `charge_date` must be a datetime column. The returned Series keeps the
    row index of `frame`, so it can be assigned back by index alignment.
    """
    return (
        frame.groupby("charger_id")
        .rolling(window, on="charge_date")[column]
        .sum()
        .reset_index(level=0, drop=True)  # drop the charger_id index level
    )


# Work on a date-typed, sorted copy so this cell does not depend on how
# `sessions` was loaded or ordered; results are aligned back on the row index
# (assumed unique, as produced by `pd.read_csv`).
_sessions_by_date = (
    sessions.assign(charge_date=pd.to_datetime(sessions["charge_date"]))
    .sort_values(["charger_id", "charge_date"])
)

sessions["sessions_7d"] = _rolling_sum_per_charger(_sessions_by_date, "sessions")
_failed_7d = _rolling_sum_per_charger(_sessions_by_date, "failed_sessions")

# Failure *rate* = failed sessions / total sessions over the same window.
# Windows containing no sessions at all get a rate of 0 instead of NaN/inf.
_has_sessions_7d = sessions["sessions_7d"] > 0
sessions["failure_rate_7d"] = (
    _failed_7d / sessions["sessions_7d"].where(_has_sessions_7d)
).fillna(0.0)

In [7]:
# Consecutive rows without any session, per charger.
def _consecutive_zero_run(counts):
    """Length of the current run of zero-valued rows, for every row of `counts`.

    Each non-zero row starts a new run and itself gets 0; every following
    zero row increments the counter until the next non-zero row.
    NOTE(review): this counts rows, which equals days only if each charger has
    exactly one row per calendar day - confirm against the data.
    """
    is_idle = (counts == 0).astype(int)
    run_id = (counts != 0).cumsum()
    return is_idle.groupby(run_id).cumsum()


sessions["no_session_days"] = (
    sessions.groupby("charger_id")["sessions"].transform(_consecutive_zero_run)
)

In [None]:
# Step 3: rolling 7-day error volume per charger.
# `errors_7d` is needed below but was not created anywhere in the visible
# notebook, so it is derived here from the OCPP log: notifications are first
# summed per (charger, error_at), then accumulated over a 7-day window.
ERROR_WINDOW = "7D"
MATCH_TOLERANCE = pd.Timedelta(days=1)  # one-day tolerance, as in the original merge

# merge_asof needs datetime keys on both sides; converting here keeps this
# cell independent of how the CSVs were loaded (no-op if already datetime).
sessions_keyed = sessions.assign(charge_date=pd.to_datetime(sessions["charge_date"]))

daily_errors = (
    ocpp_logs.assign(error_at=pd.to_datetime(ocpp_logs["error_at"]))
    .dropna(subset=["charger_id", "error_at"])  # merge_asof rejects null keys
    .groupby(["charger_id", "error_at"], as_index=False)["total_error_notifications"]
    .sum()
    .sort_values(["charger_id", "error_at"])
)
daily_errors["errors_7d"] = (
    daily_errors.groupby("charger_id")
    .rolling(ERROR_WINDOW, on="error_at")["total_error_notifications"]
    .sum()
    .reset_index(level=0, drop=True)  # back to the row index for alignment
)

# Step 4: attach to each session row the most recent error record at or before
# the session date. merge_asof requires both inputs to be sorted by the `on`
# key itself (not by charger first); `by` handles the per-charger matching.
# Only the needed right-hand columns are kept to avoid name collisions
# (e.g. `connector_id` exists in both frames).
# NOTE(review): with a one-day tolerance, errors older than one day before the
# session are not matched and `errors_7d` falls back to 0 - confirm intended.
merged_df = pd.merge_asof(
    sessions_keyed.sort_values("charge_date"),
    daily_errors[["charger_id", "error_at", "errors_7d"]].sort_values("error_at"),
    left_on="charge_date",
    right_on="error_at",
    by="charger_id",
    direction="backward",
    tolerance=MATCH_TOLERANCE,
).sort_values(["charger_id", "charge_date"], ignore_index=True)

# No matching error record means no errors: replace NaN with 0.
merged_df["errors_7d"] = merged_df["errors_7d"].fillna(0)

# Step 5: target (failure probability).
# Same formula as before - observed failure ratio plus a 1 / (1 + sessions)
# term that grows when few sessions were observed - but made a valid
# probability: rows with zero sessions no longer divide by zero (their observed
# ratio is taken as 0), and the result is clipped to [0, 1] (the raw sum
# exceeds 1 when every session failed, e.g. 3/3 + 1/4 = 1.25).
session_count = merged_df["sessions"]
has_sessions = session_count > 0
observed_rate = (
    merged_df["failed_sessions"] / session_count.where(has_sessions)
).where(has_sessions, 0.0)
merged_df["failure_probability"] = (
    observed_rate + 1 / (1 + session_count)
).clip(lower=0.0, upper=1.0)

# Step 6: keep only the model inputs and the target.
final_features = [
    "charger_id", "charge_date", "sessions_7d", "failure_rate_7d",
    "no_session_days", "errors_7d", "failure_probability"
]
final_df = merged_df[final_features]

# Export to CSV for training.
final_df.to_csv("dataset_model.csv", index=False)

print("✅ Dataset prêt pour le modèle !")