In [None]:
!pip install sdv

In [29]:
from sdv.timeseries import PAR
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from pickle import dump, load

In [30]:
# Raw sensor readings, read from 'sensor.csv' in the current working directory.
# Source: https://www.kaggle.com/code/xiaxiaxu/predictmachinefailureinadvance/data
sensor = pd.read_csv(filepath_or_buffer='sensor.csv')

In [31]:
# Keep the timestamp, the sensor columns that showed high variance in the PCA
# analysis, and the machine status label.
kept_columns = ['timestamp', 'sensor_25', 'sensor_11', 'sensor_36', 'sensor_34', 'machine_status']
data = sensor[kept_columns]

In [32]:
# Which values can machine_status take?

# Let pandas infer the best dtypes, and show them before the timestamp is parsed.
data = data.convert_dtypes()
print(data.dtypes, '\n')

# Parse the timestamp strings into datetimes.
data = data.assign(timestamp=pd.to_datetime(data['timestamp']))

data['machine_status'].unique()

timestamp          string
sensor_25         Float64
sensor_11         Float64
sensor_36         Float64
sensor_34         Float64
machine_status     string
dtype: object 



<StringArray>
['NORMAL', 'BROKEN', 'RECOVERING']
Length: 3, dtype: string

In [33]:
# Rows where the machine is flagged as BROKEN.
is_broken = data['machine_status'] == 'BROKEN'
data.loc[is_broken]

Unnamed: 0,timestamp,sensor_25,sensor_11,sensor_36,sensor_34,machine_status
17155,2018-04-12 21:55:00,653.9383,3.625588,301.1411,261.7709,BROKEN
24510,2018-04-18 00:30:00,650.46,30.43471,434.2556,177.4708,BROKEN
69318,2018-05-19 03:18:00,719.8438,43.62322,285.3491,171.3203,BROKEN
77790,2018-05-25 00:30:00,746.036,44.79362,250.9113,171.749,BROKEN
128040,2018-06-28 22:00:00,143.1029,1.960537,367.8615,343.9342,BROKEN
141131,2018-07-08 00:11:00,759.8998,23.97027,616.8983,287.9145,BROKEN
166440,2018-07-25 14:00:00,740.6738,50.04619,812.8298,363.032,BROKEN


Alright, we've got 7 broken instances. The first two are within ~7,000 rows of each other, so let's select a window of 10,000 rows around them.

Then we'll scale the data with `MinMaxScaler` (we should be able to apply it directly to our DataFrame), and then pivot (melt) our dataset into long format.

In [34]:
# Window of 10,000 rows that contains the first two BROKEN readings
# (index 17155 and 24510 in the table of broken rows).
# `.copy()` detaches the window from `data`, so cells that modify
# `data_around_failures` work on an independent frame rather than on a slice of
# the original (this avoids pandas' SettingWithCopyWarning).
data_around_failures = data.iloc[16000:26000].copy()
len(data_around_failures)

10000

In [35]:
# Peek at the first five rows of the selected window.
data_around_failures.head(n=5)

Unnamed: 0,timestamp,sensor_25,sensor_11,sensor_36,sensor_34,machine_status
16000,2018-04-12 02:40:00,649.0801,49.93274,299.4172,160.6844,NORMAL
16001,2018-04-12 02:41:00,651.8054,51.08754,296.376,162.5435,NORMAL
16002,2018-04-12 02:42:00,650.1391,51.24662,305.7853,164.4021,NORMAL
16003,2018-04-12 02:43:00,649.2632,51.93151,294.0909,166.5538,NORMAL
16004,2018-04-12 02:44:00,643.1567,51.71158,298.7494,168.0882,NORMAL


In [None]:
# Show every row that has at least one missing value.
print(data_around_failures[data_around_failures.isna().any(axis=1)])

# we only have one row with NA, let's simply drop that.
# Reassign instead of using `inplace=True`: the frame was created by slicing
# `data`, and mutating such a slice in place triggers SettingWithCopyWarning.
data_around_failures = data_around_failures.dropna(axis=0)

# Re-run the same check; it should now print an empty frame.
print(data_around_failures[data_around_failures.isna().any(axis=1)])

In [37]:
# Encode the machine status label as an integer code.
# The mapping is small, so it is written out by hand.
cleanup_nums = {"machine_status": {"NORMAL": 0, "BROKEN": 1, "RECOVERING": 2}}

# Map the column explicitly and cast it to int64 rather than calling
# `DataFrame.replace`: the column has the `string` extension dtype, and replacing
# its values with ints relies on implicit dtype conversion inside `replace`.
# A status missing from the mapping becomes NaN, so the cast fails loudly
# instead of leaving an unencoded label behind.
data_around_failures = data_around_failures.assign(
    machine_status=data_around_failures['machine_status']
    .map(cleanup_nums['machine_status'])
    .astype('int64')
)
data_around_failures.head()


Unnamed: 0,timestamp,sensor_25,sensor_11,sensor_36,sensor_34,machine_status
16000,2018-04-12 02:40:00,649.0801,49.93274,299.4172,160.6844,0
16001,2018-04-12 02:41:00,651.8054,51.08754,296.376,162.5435,0
16002,2018-04-12 02:42:00,650.1391,51.24662,305.7853,164.4021,0
16003,2018-04-12 02:43:00,649.2632,51.93151,294.0909,166.5538,0
16004,2018-04-12 02:44:00,643.1567,51.71158,298.7494,168.0882,0


In [38]:
# Scale the sensor columns with a MinMaxScaler fitted on this window, then save
# the fitted scaler object for future use.
scaler = MinMaxScaler()
sensor_cols = ['sensor_25', 'sensor_11', 'sensor_36', 'sensor_34']

data_around_failures[sensor_cols] = scaler.fit_transform(data_around_failures[sensor_cols])

# Use a context manager so the file handle is closed (and the pickle flushed to
# disk) as soon as the dump finishes.
with open('scaler.pkl', 'wb') as scaler_file:
    dump(scaler, scaler_file)

In [39]:
# After scaling, the sensor values lie within 0-1.
print(data_around_failures.shape)

data_around_failures.head(n=5)

(9999, 6)


Unnamed: 0,timestamp,sensor_25,sensor_11,sensor_36,sensor_34,machine_status
16000,2018-04-12 02:40:00,0.317054,0.876782,0.142022,0.288687,0
16001,2018-04-12 02:41:00,0.330166,0.897059,0.138068,0.298528,0
16002,2018-04-12 02:42:00,0.322149,0.899853,0.150302,0.308366,0
16003,2018-04-12 02:43:00,0.317935,0.911879,0.135097,0.319756,0
16004,2018-04-12 02:44:00,0.288555,0.908017,0.141154,0.327878,0


In [40]:
# Reshape from wide to long ("melt"): one row per (timestamp, variable) pair,
# with the former column name in `variable` and its reading in `value`.
# Rows are then ordered by timestamp and given a fresh 0..n-1 index.
# `reset_index(drop=True)` discards the old index directly; the earlier
# `drop('index', 1)` passed `axis` positionally, which pandas 2.0 rejects.
melted = (
    data_around_failures
    .melt('timestamp')
    .sort_values(by='timestamp')
    .reset_index(drop=True)
)

In [41]:
# Shape and first rows of the long-format frame.
print(melted.shape)
melted.head(n=12)

(49995, 3)


Unnamed: 0,timestamp,variable,value
0,2018-04-12 02:40:00,sensor_25,0.317054
1,2018-04-12 02:40:00,sensor_34,0.288687
2,2018-04-12 02:40:00,machine_status,0.0
3,2018-04-12 02:40:00,sensor_11,0.876782
4,2018-04-12 02:40:00,sensor_36,0.142022
5,2018-04-12 02:41:00,machine_status,0.0
6,2018-04-12 02:41:00,sensor_36,0.138068
7,2018-04-12 02:41:00,sensor_34,0.298528
8,2018-04-12 02:41:00,sensor_25,0.330166
9,2018-04-12 02:41:00,sensor_11,0.897059


In [None]:
# Sanity check: pivot the long frame back to wide to see how that works.
pivoted = melted.pivot(
    index='timestamp',
    columns='variable',
    values='value',
)

print(pivoted.shape)
pivoted.head(n=10)

# Looks fine. Because the pivot uses index='timestamp', the timestamps are now
# the index, so there is no separate index column.

In [43]:
# Encode the 'variable' column (admittedly not the best name) by hand.
# The codes are strings rather than ints, since ints seemed to cause problems.
variable_codes = {
    "machine_status": '0',
    "sensor_25": '1',
    "sensor_34": '2',
    "sensor_11": '3',
    "sensor_36": '4',
}
cleanup_nums_var = {"variable": variable_codes}

melted = melted.replace(cleanup_nums_var)
melted.head()

Unnamed: 0,timestamp,variable,value
0,2018-04-12 02:40:00,1,0.317054
1,2018-04-12 02:40:00,2,0.288687
2,2018-04-12 02:40:00,0,0.0
3,2018-04-12 02:40:00,3,0.876782
4,2018-04-12 02:40:00,4,0.142022


In [44]:
# Make sure the encoded variable codes are stored as strings.
melted['variable'] = melted['variable'].astype(str)

melted['variable']

0        1
1        2
2        0
3        3
4        4
        ..
49990    4
49991    3
49992    1
49993    2
49994    0
Name: variable, Length: 49995, dtype: object

In [45]:
# Recap: we selected the data around 2 anomalies, scaled the numerical values,
# and melted the frame so it is 'longer' rather than 'wider'.
# The machine_status variable is left in; perhaps it will help with training.

sequence_index = 'timestamp'
entity_columns = ['variable']

model = PAR(
    entity_columns=entity_columns,
    sequence_index=sequence_index,
    epochs=64,
    verbose=True,
)

# Show the dtypes the model is about to be fitted on.
print(melted.dtypes, '\n')

timestamp    datetime64[ns]
variable             object
value               float64
dtype: object 



In [46]:
# Fit the PAR model on the long-format frame. This is the slow step: the
# recorded run took about 22 minutes for the 64 epochs.
model.fit(melted)

PARModel(epochs=64, sample_size=1, cuda='cpu', verbose=True) instance created


Epoch 64 | Loss 78.10054016113281: 100%|██████████| 64/64 [21:58<00:00, 20.60s/it] 


OK, this took ~24 mins on a large notebook image.

Trained for 64 epochs, as 128 seemed to overfit greatly; however, I'm not sure what their loss metric is.

In [47]:
# Save the trained model to 'melted_model.pkl' so it can be reused later.
model.save('melted_model.pkl')

In [48]:
# Distinct variable codes present in the training frame.
melted['variable'].unique()

array(['1', '2', '0', '3', '4'], dtype=object)