In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [47]:
# Functions to load all data: signals, logs and failures

def load_signals() -> pd.DataFrame:
    """Load the signals from the wind farm dataset."""
    df = pd.read_csv('../data/raw/wind-farm-1-signals-2016.csv', sep=';')
    aux = pd.read_csv('../data/raw/wind-farm-1-signals-2017.csv', sep=';')

    df = pd.concat([df, aux], axis=0).reset_index(drop=True)
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])
    df = df.set_index('Timestamp').sort_index()
    return df

def load_logs() -> pd.DataFrame:
    """Load the logs from the wind farm dataset."""
    df = pd.read_csv('../data/raw/wind-farm-1-logs-2016.csv', sep=';')
    aux = pd.read_csv('../data/raw/wind-farm-1-logs-2017.csv', sep=';')
    aux.columns = df.columns

    df = pd.concat([df, aux], axis=0).reset_index(drop=True)
    df['TimeDetected'] = pd.to_datetime(df['TimeDetected'])
    return df

def load_failures() -> pd.DataFrame:
    """Load the failures from the wind farm dataset."""
    df = pd.read_csv('../data/raw/htw-failures-2016.csv', sep=';')
    aux = pd.read_csv('../data/raw/htw-failures-2017.csv', sep=';')

    df = pd.concat([df, aux], axis=0).reset_index(drop=True)
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])
    df = df.set_index('Timestamp').sort_index()
    return df


In [63]:
signals = load_signals()
logs = load_logs()
failures = load_failures()

In [66]:
# Check for missing values in signals
signals_na = signals.isna().sum()
signals_na = signals_na[signals_na > 0]
print('Columns with missing values:', list(signals_na.index))

signals[(signals.index >= '2017-08-17 11:20:00+00:00')][['Turbine_ID'] + list(signals_na.index)].head(10)

Columns with missing values: ['Gen_Bear_Temp_Avg', 'Grd_Prod_CosPhi_Avg']


Unnamed: 0_level_0,Turbine_ID,Gen_Bear_Temp_Avg,Grd_Prod_CosPhi_Avg
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-08-17 11:20:00+00:00,T07,48.0,0.6
2017-08-17 11:20:00+00:00,T01,35.0,1.0
2017-08-17 11:20:00+00:00,T06,50.0,0.5
2017-08-17 11:20:00+00:00,T11,44.0,0.6
2017-08-17 11:30:00+00:00,T11,44.0,0.6
2017-08-17 11:30:00+00:00,T01,35.0,1.0
2017-08-17 11:30:00+00:00,T06,,
2017-08-17 11:30:00+00:00,T07,,
2017-08-17 11:40:00+00:00,T11,43.0,0.6
2017-08-17 11:40:00+00:00,T01,35.0,1.0


In [62]:
# Only 4 values are missing in the dataset. But we can't just drop them, because we will lose the information about the other features.
# So we will fill the missing values with the last value for that turbine. That is the 4th value before the missing value.
# This is a very simple approach, but it works for this dataset as the SCADA from the turbines only record the changes in the components.
# So if a value is missing, it means that the component didn't change.

# We will use the interpolate method from pandas to fill the missing values.
# The method='pad' will fill the missing values with the last value for that turbine.
# The limit=4 will limit the number of consecutive NaN values to 4.

signals = signals.interpolate(method='pad', limit=4)

In [58]:
signals[signals.index >= '2017-08-17 11:20:00+00:00'][['Turbine_ID'] + list(signals_na.index)].head(10)

Unnamed: 0_level_0,Turbine_ID,Gen_Bear_Temp_Avg,Grd_Prod_CosPhi_Avg
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-08-17 11:20:00+00:00,T07,48.0,0.6
2017-08-17 11:20:00+00:00,T01,35.0,1.0
2017-08-17 11:20:00+00:00,T06,50.0,0.5
2017-08-17 11:20:00+00:00,T11,44.0,0.6
2017-08-17 11:30:00+00:00,T11,44.0,0.6
2017-08-17 11:30:00+00:00,T01,35.0,1.0
2017-08-17 11:30:00+00:00,T06,35.0,1.0
2017-08-17 11:30:00+00:00,T07,35.0,1.0
2017-08-17 11:40:00+00:00,T11,43.0,0.6
2017-08-17 11:40:00+00:00,T01,35.0,1.0
