In [None]:
!pip install sdv==0.18.0

In [None]:
from sdv.timeseries import PAR
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from pickle import dump, load

In [None]:
# Raw sensor readings, downloaded from:
# https://www.kaggle.com/code/xiaxiaxu/predictmachinefailureinadvance/data
sensor = pd.read_csv(filepath_or_buffer="sensor.csv")

In [None]:
# Keep the timestamp, the four sensors that showed high variance in the PCA
# analysis, and the machine status label; every other column is discarded.
data = sensor.loc[
    :,
    [
        "timestamp",
        "sensor_25",
        "sensor_11",
        "sensor_36",
        "sensor_34",
        "machine_status",
    ],
]

In [None]:
# Which values can machine_status take?

# Let pandas infer the best available dtypes, and print them before the
# timestamp column is parsed.
data = data.convert_dtypes()
print(data.dtypes, "\n")

# Parse the timestamp column into datetimes.
data["timestamp"] = pd.to_datetime(data.timestamp)

# Distinct machine_status labels (displayed as the cell output).
data["machine_status"].unique()

In [None]:
# Display every row whose machine_status is "BROKEN".
data.loc[data.machine_status == "BROKEN"]

Alright, we've got 7 broken instances. The first two are within ~7,000 rows of each other, so let's select the 10,000 rows around them.

Then we'll scale the data with `MinMaxScaler` (we should be able to apply it directly to our DataFrame), then pivot our dataset.

In [None]:
# Take rows 16000-25999 of `data` (the 10,000-row window chosen around the first
# two failures) as an explicit copy. This frame is modified later on (rows are
# dropped and columns overwritten); without .copy() those writes would target a
# slice of `data` and pandas would emit SettingWithCopyWarning.
data_around_failures = data.iloc[16000:26000].copy()
len(data_around_failures)

In [None]:
# Preview the first five rows of the selected window.
data_around_failures.head(n=5)

In [None]:
# Show the rows that contain at least one missing value.
print(data_around_failures[data_around_failures.isna().any(axis=1)])

# we only have one row with NA, let's simply drop that.
# Reassign instead of using inplace=True: the frame may be a slice of `data`, and
# an in-place drop on a slice raises SettingWithCopyWarning. Reassignment also
# keeps this cell safe to re-run.
data_around_failures = data_around_failures.dropna(axis=0)

# Same filter again: this should now print an empty frame.
print(data_around_failures[data_around_failures.isna().any(axis=1)])

In [None]:
# Encode machine_status as integers. The mapping is small enough to spell out
# by hand rather than reaching for an encoder.
status_codes = {"NORMAL": 0, "BROKEN": 1, "RECOVERING": 2}
cleanup_nums = {"machine_status": status_codes}

# The nested dict restricts the replacement to the machine_status column.
data_around_failures = data_around_failures.replace(to_replace=cleanup_nums)
data_around_failures.head()

In [None]:
scaler = MinMaxScaler()
sensor_cols = ["sensor_25", "sensor_11", "sensor_36", "sensor_34"]

# Fit the scaler on the four sensor columns and overwrite them with the scaled
# values; timestamp and machine_status are not passed to the scaler.
data_around_failures[sensor_cols] = scaler.fit_transform(
    data_around_failures[sensor_cols]
)

# Save the fitted scaler for future use. The context manager guarantees the
# file handle is flushed and closed, which a bare open() inside dump() does not.
with open("scaler.pkl", "wb") as scaler_file:
    dump(scaler, scaler_file)

In [None]:
# The sensor columns should now be scaled to lie within 0-1: print the frame's
# dimensions, then display its first rows to check.
print(data_around_failures.shape)

data_around_failures.head(n=5)

In [None]:
# Melt the wide frame into long format: one row per (timestamp, variable) pair,
# with the column name in `variable` and the reading in `value`.
# The rows are then ordered by timestamp and given a fresh 0..n-1 index.
# reset_index(drop=True) discards the old index directly, replacing the former
# reset_index() + drop("index", 1) round trip, whose positional axis argument is
# deprecated in pandas 1.3 and rejected by pandas 2.x.
melted = (
    data_around_failures.melt("timestamp")
    .sort_values(by="timestamp")
    .reset_index(drop=True)
)

In [None]:
# Dimensions of the long-format frame, followed by its first 12 rows.
print(melted.shape)
melted.head(n=12)

In [None]:
# Sanity check: pivot the long frame back to wide format, with one column per
# entry of `variable` and one row per timestamp.
pivoted = melted.pivot(
    values="value",
    columns="variable",
    index="timestamp",
)

print(pivoted.shape)
pivoted.head(n=10)

# Looks fine. Note that the original index column is missing here, because the
# pivot uses timestamp as the index.

In [None]:
# Encode the entries of the `variable` column (admittedly not the best name).
# The mapping is written out by hand since it is so small.
#
# The codes are strings rather than ints on purpose: ints seemed to have
# messed things up.
variable_codes = {
    "machine_status": "0",
    "sensor_25": "1",
    "sensor_34": "2",
    "sensor_11": "3",
    "sensor_36": "4",
}
cleanup_nums_var = {"variable": variable_codes}

# The nested dict restricts the replacement to the `variable` column.
melted = melted.replace(to_replace=cleanup_nums_var)
melted.head()

In [None]:
# Cast the `variable` column to str, then display it to confirm.
melted["variable"] = melted["variable"].astype(str)

melted.variable

In [None]:
# Recap: the data is a window around 2 anomalies, the numerical values have been
# scaled, and the frame has been melted to be 'longer' rather than 'wider'.
# machine_status is deliberately left in; perhaps it will help with training : )

sequence_index = "timestamp"
entity_columns = ["variable"]

# Collect the PAR settings in one place, then build the model from them.
par_options = {
    "sequence_index": sequence_index,
    "entity_columns": entity_columns,
    "verbose": True,
    "epochs": 64,
}
model = PAR(**par_options)


# Print the column dtypes of the melted frame.
print(melted.dtypes, "\n")

In [None]:
# Train the PAR model on the long-format (melted) frame.
model.fit(melted)

OK, this took ~24 minutes on a large notebook image.

I trained for 64 epochs, as 128 seemed to overfit greatly; however, I'm not sure what their loss metric is.

In [None]:
# Save the fitted model under the file name "melted_model.pkl".
model.save("melted_model.pkl")

In [None]:
# List the distinct labels in the `variable` column (the entity column of the model).
melted["variable"].unique()