In [127]:
#run this script to install all the required packages
#!pip install -r requirements.txt

In [128]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [132]:
# Import Wartung.csv data

data = pd.read_csv('wartung.csv', sep=';', decimal=',', encoding='latin-1')

Hier muss noch eine Datenbank angebunden werden, um die Daten zu speichern.

Primary Key: ID, Datentypen müssen gegenenfalls noch angepasst werden nach der Preparation

## Datenaufbereiten

In [133]:
#Datentypen ausgeben
print("Datentypen:")
print(data.info())

Datentypen:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1095 entries, 0 to 1094
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ï»¿MesswertID     1095 non-null   int64  
 1   MaschinenID       1095 non-null   object 
 2   Datum             1095 non-null   object 
 3   Zeit              1095 non-null   object 
 4   Druck             911 non-null    float64
 5   Temperatur        917 non-null    float64
 6   Vibration         915 non-null    float64
 8   Ausschuss         803 non-null    float64
 9   Produktionsindex  1095 non-null   int64  
dtypes: float64(4), int64(3), object(3)
memory usage: 85.7+ KB
None


In [134]:
#Anzahl missing data
print("Datentypen:")
print(data.isnull().sum())

Datentypen:
ï»¿MesswertID         0
MaschinenID           0
Datum                 0
Zeit                  0
Druck               184
Temperatur          178
Vibration           180
Ausschuss           292
Produktionsindex      0
dtype: int64


In [135]:
#Datensätze mit Duplikaten entfernen
data.drop_duplicates(inplace=True)
print(data.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1095 entries, 0 to 1094
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ï»¿MesswertID     1095 non-null   int64  
 1   MaschinenID       1095 non-null   object 
 2   Datum             1095 non-null   object 
 3   Zeit              1095 non-null   object 
 4   Druck             911 non-null    float64
 5   Temperatur        917 non-null    float64
 6   Vibration         915 non-null    float64
 8   Ausschuss         803 non-null    float64
 9   Produktionsindex  1095 non-null   int64  
dtypes: float64(4), int64(3), object(3)
memory usage: 85.7+ KB
None


### Fehlerhafte Daten  beim Datum identifizieren und korrigieren

In [136]:
# Convert date column to datetime format if value is not usable as date format remove row
data['Datum'] = pd.to_datetime(data['Datum'], format='%d.%m.%Y', errors='coerce')

In [137]:
# Convert time column to datetime format if value is not usable as time format remove row
data['Zeit'] = pd.to_datetime(data['Zeit'], format='%H:%M:%S', errors='coerce')
d1 = data

### Neue Spalten hinzufügen

In [138]:
# Add new Column 'Ausfall' with 0 or 1 for each row. Ausfall = 1 means Column Druck, Temperatur Vibration and Ausschuss are all NaN
d1['Ausfall'] = np.where( d1['Druck'].isnull() & d1['Temperatur'].isnull() & d1['Vibration'].isnull() & d1['Ausschuss'].isnull(), 1, 0)
print(d1.head(10))
d2 = d1

   ï»¿MesswertID MaschinenID      Datum Zeit      Druck  Temperatur  \
0          12478          A1 2022-02-01  NaT  47.619048   61.904762   
1          12479          A1 2022-02-01  NaT  48.571429   62.857143   
2          12480          A1 2022-02-01  NaT  57.142857   76.190476   
3          12481          A1 2022-02-01  NaT        NaN         NaN   
4          12482          A1 2022-02-01  NaT        NaN         NaN   
5          12483          A1 2022-02-01  NaT        NaN         NaN   
6          12484          A1 2022-02-01  NaT        NaN         NaN   
7          12485          A1 2022-02-01  NaT        NaN         NaN   
8          12486          A1 2022-02-01  NaT        NaN         NaN   
9          12487          A1 2022-02-01  NaT        NaN         NaN   

0  28.571429              3        2.0                85        0  
1  47.619048              5        3.0                87        0  
2  33.333333             12       11.0                91        0  
3        NaN  

In [139]:
d1

Unnamed: 0,ï»¿MesswertID,MaschinenID,Datum,Zeit,Druck,Temperatur,Vibration,Anzahlwarning,Ausschuss,Produktionsindex,Ausfall
0,12478,A1,2022-02-01,NaT,47.619048,61.904762,28.571429,3,2.0,85,0
1,12479,A1,2022-02-01,NaT,48.571429,62.857143,47.619048,5,3.0,87,0
2,12480,A1,2022-02-01,NaT,57.142857,76.190476,33.333333,12,11.0,91,0
3,12481,A1,2022-02-01,NaT,,,,2,,84,1
4,12482,A1,2022-02-01,NaT,,,,2,,85,1
...,...,...,...,...,...,...,...,...,...,...,...
1090,12521,A1,NaT,NaT,61.904762,68.571429,38.095238,7,6.0,86,0
1091,12522,A1,NaT,NaT,63.809524,69.523810,42.857143,8,7.0,85,0
1092,12523,A1,NaT,NaT,63.809524,69.523810,42.857143,8,7.0,83,0
1093,12524,A1,NaT,NaT,63.809524,69.523810,42.857143,9,7.0,83,0


In [141]:
# if one of druck, vibration or temperatur is NaN and Ausfall = 0 then new column 'Error' = 1
d2['Error'] = np.where((d2['Ausfall'] == 0) & (d2['Druck'].isnull() | d2['Temperatur'].isnull() | d2['Vibration'].isnull()), 1, 0)


In [142]:
d2

Unnamed: 0,ï»¿MesswertID,MaschinenID,Datum,Zeit,Druck,Temperatur,Vibration,Anzahlwarning,Ausschuss,Produktionsindex,Ausfall,Error
0,12478,A1,2022-02-01,NaT,47.619048,61.904762,28.571429,3,2.0,85,0,0
1,12479,A1,2022-02-01,NaT,48.571429,62.857143,47.619048,5,3.0,87,0,0
2,12480,A1,2022-02-01,NaT,57.142857,76.190476,33.333333,12,11.0,91,0,0
3,12481,A1,2022-02-01,NaT,,,,2,,84,1,0
4,12482,A1,2022-02-01,NaT,,,,2,,85,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1090,12521,A1,NaT,NaT,61.904762,68.571429,38.095238,7,6.0,86,0,0
1091,12522,A1,NaT,NaT,63.809524,69.523810,42.857143,8,7.0,85,0,0
1092,12523,A1,NaT,NaT,63.809524,69.523810,42.857143,8,7.0,83,0,0
1093,12524,A1,NaT,NaT,63.809524,69.523810,42.857143,9,7.0,83,0,0


In [143]:
#count the number of errors
print(d2['Error'].value_counts())

Error
0    1075
1      20
Name: count, dtype: int64


In [None]:
#20 errors können gelöscht werden, da unter 1% der Daten

### Fehlende Daten ergänzen

Gegebenenfalls können anstelle von Entfernen auch Werte durch synthetische Daten ersetzt werden

In [None]:
#creating requirements.txt
#!pip freeze > requirements.txt