In [1]:
import sys
import os

import pandas as pd
import numpy as np

In [2]:
load_ext autoreload

In [3]:
autoreload 2

This library is being developed for the purpose of this work

In [4]:
folderpath = '../data/interim/'
data_op_0 = pd.read_csv(folderpath + 'data_op_00.csv')
data_op_0.head()

FileNotFoundError: File b'../data/interim/data_op_00.csv' does not exist

The dataset is comes with the package. To load it, just call the following.

In [None]:
data = load_data.load(type='train')
data.head()

### What represents each one of columns in this dataset?

- Column 0: engine unit or even the aircraft number
- Column 1: time step
- Column 2, 3 and 4: operational condition. See readme of dataset for futher details.
- Colum 5 to 26: sensor readings

## Let's take a look how the data looks like

In [None]:
from phm08ds.data.preprocessing import Data_per_unit
from phm08ds.data.preprocessing import Data_per_sensor

tf_unit_1 = Data_per_unit(unit=1)
unit_1 = tf_unit_1.fit_transform(data)

unit_1.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

sns.set_style("whitegrid")
sns.set_context("notebook")

### The data is a multivariate timeseries

In [None]:
def plot_series(unit, sensor):
    tf_data_unit = Data_per_unit(unit=unit)
    tf_data_sensor = Data_per_sensor(sensor=sensor)
    
    data_unit = tf_data_unit.fit_transform(data)
    data_unit_sensor = tf_data_sensor.fit_transform(data_unit)  
    plt.figure(figsize=(30*0.39, 10*0.39))
    plt.plot(data_unit_sensor.iloc[:,-1])
    
interact(plot_series, unit=(1,100,1), sensor=(1,20,1))

### Defining Health States for the data, i.e. classes

According to Tamilselvan (2013): "The sensory signals of each engine unit is first arranged in descending order based on the operation cycle index and the first 50 data points are termed as HS 4 (failure HS); the region between 75 and 125 data points is termed as HS 3; the region between 150 and 200 data points is termed as HS 2; and the region greater than 220 data points is termed as HS 1 (healthy HS).”

Before, lable date, I am curious if all time series, form units or sensors, have the same number os points. Let's take a look into it.

In [None]:
ax = sns.boxplot(y=data.groupby(by='unit')['time_step'].count(),orient="v")

It varies according to the Figure above. 25% of all engines have been degrated before 180 time steps. The experimental scenarios is reported " Each engine starts with different degrees of initial wear and manufacturing variation which is unknown to the user. This wear and variation is considered normal, i.e., it is not considered a fault condition.". So, the variation is due to its initial start condition.

In [None]:
from phm08ds.data.preprocessing import HealthState

tf_set_health_state = HealthState()

data_with_hs = tf_set_health_state.fit_transform(data)

In [None]:
data_with_hs.head()

In [None]:
def plot_series(unit, sensor):
    data_unit_sensor_buffer = data_with_hs.loc[data['unit'] == unit,:]
    
    plt.figure(figsize=(30*0.39, 10*0.39))    
    sns.lineplot(x='time_step', y='Sensor_' + str(sensor), data=data_unit_sensor_buffer, hue='Health_state', palette='Wistia')
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    
interact(plot_series, unit=(1,100,1), sensor=(1,20,1))

# Grouping data by operational conditions

According to Wang, 2008 the three operational conditions could be clustered into 6 groups like that

In [None]:
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=(8,7))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(data['operational_setting_1'], data['operational_setting_2'], data['operational_setting_3'], s=100)
plt.title('Clusters of operational conditions')
plt.show()

To find the cluster centers automatically we are going to use kmeans. Just for convinience.

That is the operational condition for each sample given by kmeans.

In [None]:
from phm08ds.data.preprocessing import OperationalCondition

tf_op_cond = OperationalCondition().fit(data_with_hs)
op_cond = tf_op_cond.transform(data_with_hs)
op_cond

In [None]:
data_with_op_cond = data_with_hs.copy()
data_with_op_cond['Operational_condition'] = op_cond
data_with_op_cond.head()

In [None]:
from phm08ds.data.preprocessing import Data_per_op_cond

tf_data_op_1 = Data_per_op_cond(operational_condition=1)
data_op_1 = tf_data_op_1.fit_transform(data_with_op_cond)
data_op_1.head()

Create the transformers

In [None]:
tf_data_op_0 = Data_per_op_cond(operational_condition=0)
tf_data_op_2 = Data_per_op_cond(operational_condition=2)
tf_data_op_3 = Data_per_op_cond(operational_condition=3)
tf_data_op_4 = Data_per_op_cond(operational_condition=4)
tf_data_op_5 = Data_per_op_cond(operational_condition=5)

Get data from operational conditions in different dataframes

In [None]:
data_op_0 = tf_data_op_0.fit_transform(data_with_op_cond)
data_op_2 = tf_data_op_2.fit_transform(data_with_op_cond)
data_op_3 = tf_data_op_3.fit_transform(data_with_op_cond)
data_op_4 = tf_data_op_4.fit_transform(data_with_op_cond)
data_op_5 = tf_data_op_5.fit_transform(data_with_op_cond)

### That raises a questions: How the data from one unit and one sensor looks like clustered by one operational condition?

In [None]:
data_buffer = Data_per_sensor(sensor=19).fit_transform((Data_per_unit(unit=100).fit_transform(data)))
data_buffer.head()

In [None]:
op_cond = tf_op_cond.transform(data_buffer)
data_buffer.insert(data_buffer.shape[-1], 'Operational_condition', op_cond)
data_buffer.head()

In [None]:
sns.palplot(sns.color_palette("Blues_d"))

In [None]:
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})

fig = plt.figure(figsize=(15,6))
ax = sns.scatterplot('time_step', 24, data=data_buffer, hue='Operational_condition', palette='Set2')
# ax = sns.lineplot(x='time_step', y=24, data=data_buffer)

# Save dataset:

In [None]:
data_op_0.head()

In [None]:
folderpath = '../data/interim/'
data_op_0.to_csv(folderpath + 'data_op_00.csv', index_label=False)
data_op_1.to_csv(folderpath + 'data_op_01.csv', index_label=False)
data_op_2.to_csv(folderpath + 'data_op_02.csv', index_label=False)
data_op_3.to_csv(folderpath + 'data_op_03.csv', index_label=False)
data_op_4.to_csv(folderpath + 'data_op_04.csv', index_label=False)
data_op_5.to_csv(folderpath + 'data_op_05.csv', index_label=False)