# Préparation des données d'erreurs

On crée une colonne par type d'erreur, puis on calcule des sommes glissantes sur 3 heures.

In [1]:
import pandas as pd
import numpy  as np
def erreur_glissante(window, path="../data/raw/PdM_errors.csv"):
    """Count errors per type over a sliding window, per machine and timestamp.

    Every error record is replicated at its own timestamp and at each of the
    following ``window - 1`` hours. After summing, the value stored at
    (machineID, datetime) is the number of errors of that type recorded for
    the machine at ``datetime`` or during the ``window - 1`` preceding hours.

    Args:
        window: Window length in hours; must be >= 1.
        path: Anything ``pandas.read_csv`` accepts (path or file-like object)
            holding the columns ``datetime``, ``machineID`` and ``errorID``.

    Returns:
        DataFrame indexed by (machineID, datetime) with float columns
        ``error1`` .. ``error5`` plus ``total_error`` (their row-wise sum).
        Only timestamps covered by at least one error are present.

    Raises:
        ValueError: If ``window`` is smaller than 1, or if ``errorID``
            contains a value outside ``error1`` .. ``error5``.
    """
    if window < 1:
        raise ValueError("window must be >= 1, got {!r}".format(window))

    error = pd.read_csv(path)
    # Parse the timestamps so that hour offsets can be added to them.
    error["datetime"] = pd.to_datetime(error["datetime"])

    error_names = ['error1', 'error2', 'error3', 'error4', 'error5']
    unknown = set(error["errorID"].dropna().unique()) - set(error_names)
    if unknown:
        raise ValueError("unexpected errorID values: {}".format(sorted(unknown)))

    # One indicator column per error type. Columns are selected by name (not
    # by position) so a type missing from the data still gets a zero column,
    # and floats are requested so that sums count occurrences.
    dummies = pd.get_dummies(error["errorID"], dtype=float).reindex(
        columns=error_names, fill_value=0.0
    )
    dummies["machineID"] = error["machineID"]

    # Replicate every record at +0h, +1h, ..., +(window - 1)h.
    shifted = [
        dummies.assign(datetime=error["datetime"] + pd.Timedelta(hours=i))
        for i in range(window)
    ]

    # A single grouped sum handles several errors sharing the same machine
    # and timestamp, which index-aligned addition does not do reliably.
    errors_glissant = pd.concat(shifted).groupby(["machineID", "datetime"]).sum()
    errors_glissant["total_error"] = errors_glissant.sum(axis=1)
    return errors_glissant

# Sliding error counts over a 3-hour window, then display the last 20 rows.
errors_g = erreur_glissante(window=3)
errors_g.iloc[-20:]

Unnamed: 0_level_0,Unnamed: 1_level_0,error1,error2,error3,error4,error5,total_error
machineID,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
100,2015-11-05 03:00:00,0.0,0.0,1.0,0.0,0.0,1.0
100,2015-11-05 04:00:00,0.0,0.0,1.0,0.0,0.0,1.0
100,2015-11-07 17:00:00,1.0,0.0,0.0,0.0,0.0,1.0
100,2015-11-07 18:00:00,1.0,0.0,0.0,0.0,0.0,1.0
100,2015-11-07 19:00:00,1.0,0.0,0.0,0.0,0.0,1.0
100,2015-11-12 01:00:00,1.0,0.0,0.0,0.0,0.0,1.0
100,2015-11-12 02:00:00,1.0,0.0,0.0,0.0,0.0,1.0
100,2015-11-12 03:00:00,1.0,0.0,0.0,0.0,0.0,1.0
100,2015-11-21 08:00:00,0.0,1.0,0.0,0.0,0.0,1.0
100,2015-11-21 09:00:00,0.0,1.0,0.0,0.0,0.0,1.0


On crée aussi un index avec un point toutes les 3 heures pour chaque machine.

In [2]:
# Machine table; only its machineID column is used here.
machines = pd.read_csv("../data/raw/PdM_machines.csv")

# One timestamp every 3 hours between the two bounds.
# NOTE(review): recent pandas versions deprecate the uppercase "H" alias in
# favour of "h" -- confirm the targeted pandas version before changing it.
time_series = pd.date_range(
    start="2015-01-01 06:00:00",
    end="2016-01-01 06:00:00",
    freq="3H",
)

# Cartesian product machine x timestamp, used as the target index.
machine_ids = machines["machineID"].values
timestamps = time_series.values
indexes = pd.MultiIndex.from_product(
    [machine_ids, timestamps],
    names=['machineID', 'datetime'],
)

Avec une jointure, on associe les erreurs aux bonnes dates pour chaque machine, puis on calcule une somme glissante sur 24 h (8 pas de 3 h).

In [3]:
# Put the sliding counts on the (machineID, datetime) index; index entries
# with no matching row in errors_g get 0 in every column.
erreurs_3 = pd.DataFrame(index=indexes).join(errors_g).fillna(0)

# Per-machine rolling sum over the last 8 rows (8 x 3 h = 24 h).
# min_periods=1 yields partial sums for the first rows of each machine.
# group_keys=False keeps the original (machineID, datetime) index: recent
# pandas versions otherwise prepend the group key as an extra index level,
# which breaks the join below.
erreurs_24 = erreurs_3.groupby(level="machineID", group_keys=False).apply(
    lambda x: x.rolling(window=8, min_periods=1).sum()
)

# Suffix every 24 h column with "_24" so it can sit next to the 3 h columns.
erreurs = erreurs_3.join(erreurs_24.add_suffix("_24"))

In [5]:
# Display the first 30 rows of the combined 3 h / 24 h error table.
erreurs.iloc[:30]

Unnamed: 0_level_0,Unnamed: 1_level_0,error1,error2,error3,error4,error5,total_error,error1_24,error2_24,error3_24,error4_24,error5_24,total_error_24
machineID,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,2015-01-01 06:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2015-01-01 09:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2015-01-01 12:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2015-01-01 15:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2015-01-01 18:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2015-01-01 21:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2015-01-02 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2015-01-02 03:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2015-01-02 06:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2015-01-02 09:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
