Slice the Kaggle pump-sensor data found in sensor.csv into partitions, each starting 36 hours before a failure and ending at that failure. All columns are saved.
These partitions are used to simulate real-time data that is generated for model prediction.
Code provided by Eli Guidera and modified for our needs in a notebook.

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Load the Kaggle pump-sensor dataset, using the 'timestamp' column as a
# date-parsed index.
# Dataset source: https://www.kaggle.com/datasets/nphantawee/pump-sensor-data
# NOTE: the CSV is kept in a separate repository, hence the home-relative path.
sensor_path = '~/sensor-failure-predication/synthetic-data-generator/sensor.csv'

df = pd.read_csv(
    sensor_path,
    parse_dates=True,
    index_col='timestamp',
)


In [3]:
# 'machine_status' holds text labels; recode them as numbers so the target
# column is numeric: NORMAL -> 0, BROKEN -> 1, RECOVERING -> 0.5.
# Any value that matches none of the three labels falls back to 0.
status_codes = {'NORMAL': 0, 'BROKEN': 1, 'RECOVERING': 0.5}

# One boolean mask per label, in the same order as the numeric codes.
status_values = [df['machine_status'] == label for label in status_codes]
numeric_status_values = list(status_codes.values())

df['machine_status'] = np.select(
    status_values, numeric_status_values, default=0
)

In [4]:
# Collect the index labels (timestamps) of every row whose machine_status
# equals 1, i.e. the rows marking a failure.
is_failure = df['machine_status'].eq(1)
failure_times = df.loc[is_failure].index

In [5]:
# Write one CSV per failure, containing every column for the 36 hours
# leading up to (and including) the failure timestamp.
SLICE_WINDOW = pd.Timedelta(hours=36)  # look-back period before each failure
output_dir = Path('../data/kagglePump')

# Create the target directory up front: to_csv does not create missing
# parent directories and would otherwise fail on the first write.
output_dir.mkdir(parents=True, exist_ok=True)

for i, failure_time in enumerate(failure_times):
    # Label-based slicing with .loc includes both end points.
    window = df.loc[failure_time - SLICE_WINDOW:failure_time, :]
    window.to_csv(output_dir / f'full_slice{i}.csv')