https://github.com/gretelai/gretel-synthetics/blob/master/examples/timeseries_dgan.ipynb

In [None]:
# version recommended by source
!pip install torch==1.11.0

In [None]:
!pip install git+https://github.com/gretelai/gretel-synthetics.git

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.dates as md
from sklearn.preprocessing import MinMaxScaler
from pickle import dump, load

import torch

from gretel_synthetics.timeseries_dgan.dgan import DGAN
from gretel_synthetics.timeseries_dgan.config import DGANConfig, OutputType

In [None]:
# Source: https://www.kaggle.com/code/xiaxiaxu/predictmachinefailureinadvance/data
sensor = pd.read_csv("sensor.csv")

# data columns
COLS = ["sensor_25", "sensor_11", "sensor_36", "sensor_34", "machine_status"]

# Keep only the columns with high variance in the PCA analysis, plus machine status.
# .copy() makes `data` an independent frame instead of a slice of `sensor`, so the
# column assignments in later cells do not raise SettingWithCopyWarning.
data = sensor[["timestamp"] + COLS].copy()
data.head()

In [None]:
# Convert the columns to their proper dtypes.

print(data.dtypes, "\n")
# Parse `timestamp` into datetime64. Building a new frame with .assign() (instead
# of assigning a column on what may be a slice of `sensor`) avoids pandas'
# SettingWithCopyWarning, so the cell runs cleanly the first time and is safe to
# re-run.
data = data.assign(timestamp=pd.to_datetime(data["timestamp"]))

In [None]:
# What values can machine_status take?

# Confirm the dtypes after the timestamp conversion done in the previous cell
# (the conversion itself does not need to be repeated here).
print(data.dtypes, "\n")

data["machine_status"].unique()

In [None]:
# Plot every sensor column (everything in COLS except machine_status) over time.

sensor_names = [name for name in COLS if name != "machine_status"]
for name in sensor_names:
    plt.plot(data["timestamp"], data[name], label=name)

plt.xticks(rotation=90)
plt.xlabel("Date")
plt.ylabel("Sensor Value")
plt.legend()
plt.show()

In [None]:
# Count the rows that contain at least one missing value.

rows_with_nan = data.isna().any(axis=1)
print(len(data[rows_with_nan]))

In [None]:
# Drop every row that has a missing value.
# Rebinding `data` to the result (rather than dropna(inplace=True)) avoids mutating
# a frame that may be a slice of `sensor`, which can raise SettingWithCopyWarning,
# and keeps the cell safe to re-run.
data = data.dropna(axis=0)

# Should now print 0.
print(len(data[data.isna().any(axis=1)]))

In [None]:
# Restrict the data to a window centred on two failures (10,080 rows).
FAILURE_WINDOW_START = 16000
FAILURE_WINDOW_END = 26080

# .copy() so that the scaling cell further down assigns into an independent frame
# and not into a slice of `data` (which raises SettingWithCopyWarning).
data_around_failures = data.iloc[FAILURE_WINDOW_START:FAILURE_WINDOW_END].copy()
print(len(data_around_failures))
data_around_failures.head()

In [None]:
# Plot the four sensors for the window around the failures.

window_timestamps = data_around_failures["timestamp"]
for col in COLS:
    if col != "machine_status":
        plt.plot(window_timestamps, data_around_failures[col], label=col)

plt.xticks(rotation=90)
plt.xlabel("Date")
plt.ylabel("Sensor Value")
plt.legend()
plt.show()

In [None]:
# Scale the sensor readings into the [0, 1] range.

scaler = MinMaxScaler((0, 1))
sensor_cols = ["sensor_25", "sensor_11", "sensor_36", "sensor_34"]

# Work on an independent copy so the column assignment below never writes into a
# slice of `data` (SettingWithCopyWarning).
data_around_failures = data_around_failures.copy()

# NOTE: this overwrites the raw readings with the scaled ones, so re-running this
# cell on its own re-fits the scaler on already-scaled values. Re-run from the
# slicing cell instead.
data_around_failures[sensor_cols] = scaler.fit_transform(
    data_around_failures[sensor_cols]
)

# Persist the fitted scaler for future use. The context manager guarantees the
# file is flushed and closed before anything reads it back.
with open("dGAN_scaler.pkl", "wb") as scaler_file:
    dump(scaler, scaler_file)

In [None]:
# Plot the four sensors again, now on the scaled 0-1 range.

for scaled_col in (col for col in COLS if col != "machine_status"):
    plt.plot(
        data_around_failures["timestamp"],
        data_around_failures[scaled_col],
        label=scaled_col,
    )

plt.xticks(rotation=90)
plt.xlabel("Date")
plt.ylabel("Sensor Value")
plt.legend()
plt.show()

In [None]:
# Sanity check: invert the scaling so the data can be re-plotted.

# Only unpickle files this notebook wrote itself: pickle.load can execute
# arbitrary code from an untrusted file. The context manager closes the handle.
with open("dGAN_scaler.pkl", "rb") as scaler_file:
    sc = load(scaler_file)

unscaled_data = sc.inverse_transform(data_around_failures[sensor_cols])

unscaled_data_df = pd.DataFrame(unscaled_data, columns=sensor_cols)
unscaled_data_df.head()

In [None]:
# Plot the un-scaled readings against the original timestamps.
unscaled_names = [name for name in COLS if name != "machine_status"]
for name in unscaled_names:
    plt.plot(data_around_failures["timestamp"], unscaled_data_df[name], label=name)

plt.xticks(rotation=90)
plt.xlabel("Date")
plt.ylabel("Sensor Value")
plt.legend()
plt.show()

Okay, the scaling sanity check passed. Let's start splitting the data to prepare it for training with DGAN.

NOTE: generated data has the same shape as the samples passed in for training, so if we train on samples of 10 rows each, the model will only generate samples of 10 rows.

We now have 10,080 rows of data, which corresponds to 10,080 minutes = 168 hours = 7 days of data.

I'm going to try splitting the data into two-hour segments, giving us 84 samples of 120 rows each (one row per minute).

In [None]:
# Keep only the sensor readings (drop the timestamp and machine_status columns).
features = data_around_failures.drop(columns=["timestamp", "machine_status"]).to_numpy()
print(features.shape)

# Rows per training example (the 2-hour segments described above).
ROWS_PER_SAMPLE = 120

# Number of complete samples; any trailing partial sample is discarded.
n = features.shape[0] // ROWS_PER_SAMPLE
print(n)

# Reshape to (# examples, # time points, # features).
features = features[: (n * ROWS_PER_SAMPLE), :].reshape(
    -1, ROWS_PER_SAMPLE, features.shape[1]
)
print(features.shape)

In [None]:
# Preview a handful of the 2-hour training samples.
# The x-axis is only approximate: every sample is drawn against the timestamps
# of the first 120 rows.
xaxis_2hr = data_around_failures["timestamp"].iloc[:120]


def plot_hours(f, timestamps=None):
    """Plot one sample, drawing one line per sensor in ``sensor_cols``.

    Args:
        f: 2-D array indexed as ``f[time_step, sensor]``; column ``i`` is drawn
            with the label ``sensor_cols[i]``.
        timestamps: x-axis values, one per row of ``f``. Defaults to the
            module-level ``xaxis_2hr``, in which case the times shown are only
            nominal for any sample other than the first.
    """
    if timestamps is None:
        timestamps = xaxis_2hr

    _fig, ax = plt.subplots()
    for i, c in enumerate(sensor_cols):
        ax.plot(timestamps, f[:, i], label=c)
    # Tick every 15 minutes. The previous every-3-hours HourLocator could place
    # at most one tick inside a window that is only about two hours wide.
    ax.xaxis.set_major_locator(md.MinuteLocator(byminute=range(0, 60, 15)))
    ax.xaxis.set_major_formatter(md.DateFormatter("%H:%M"))
    ax.legend()
    ax.set_xlabel("Time")
    ax.set_ylabel("Sensor Readings")
    plt.show()


# Draw three of the real training samples.
for sample_idx in (80, 3, 21):
    plot_hours(features[sample_idx, :, :])

In [None]:
# Check whether torch can see a CUDA GPU.
# Training on a GPU is recommended; this run did not use one.
torch.cuda.is_available()

In [None]:
# Set up the DGAN config.

# Seed the RNGs so training and generation are as repeatable as possible across
# Restart & Run All.
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)

# Time steps generated per step of the generator.
# NOTE(review): DGAN's config docs state sample_len must divide
# max_sequence_len; the assert surfaces a mismatch here - confirm against the
# installed version.
SAMPLE_LEN = 12
assert features.shape[1] % SAMPLE_LEN == 0, "sample_len must divide max_sequence_len"

config = DGANConfig(
    max_sequence_len=features.shape[1],
    sample_len=SAMPLE_LEN,
    batch_size=min(1000, features.shape[0]),
    apply_feature_scaling=False,  # already scaled with MinMaxScaler above
    apply_example_scaling=False,
    use_attribute_discriminator=False,
    generator_learning_rate=1e-4,
    discriminator_learning_rate=1e-4,
    epochs=10000,
)

model = DGAN(config)

In [None]:
# Train on the real samples, declaring every feature column as continuous.
n_features = features.shape[2]
model.train_numpy(
    features,
    feature_types=[OutputType.CONTINUOUS for _ in range(n_features)],
)

Training finished in around 6-7 minutes on the large notebook image - not bad!

In [None]:
# Generate 100 synthetic samples - this ran near instantly.
# generate_numpy returns a pair; only the second element (the features) is used.
# The first element is bound to a named variable rather than `_`, because
# assigning to `_` in a notebook shadows IPython's "last output" variable.
_synthetic_attributes, synthetic_features = model.generate_numpy(100)

In [None]:
# Draw three of the synthetic 2-hour samples.
for synthetic_idx in (10, 42, 6):
    plot_hours(synthetic_features[synthetic_idx, :, :])

In [None]:
# Compare the (non-temporal) correlations between the four sensors.
# The synthetic samples are flattened to one row per time step first.
flat_synthetic = synthetic_features.reshape(-1, synthetic_features.shape[2])
synthetic_df = pd.DataFrame(flat_synthetic, columns=sensor_cols)

real_corr = data_around_failures.drop(columns=["timestamp", "machine_status"]).corr()
synthetic_corr = synthetic_df.corr()

print("Correlation in real data:")
print(real_corr)
print()
print("Correlation in synthetic data:")
print(synthetic_corr)

# Correlations between sensor variables are similar

In [None]:
# Real vs. synthetic distribution of sensor_34 (feature index 3).
real_sensor_34 = features[:, :, 3].flatten()
synthetic_sensor_34 = synthetic_features[:, :, 3].flatten()
plt.hist(
    [real_sensor_34, synthetic_sensor_34],
    bins=25,
    density=True,
    label=["real", "synthetic"],
)
plt.xlabel("Sensor 34 Values")
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# Real vs. synthetic distribution of sensor_25 (feature index 0).
real_sensor_25 = features[:, :, 0].flatten()
synthetic_sensor_25 = synthetic_features[:, :, 0].flatten()
plt.hist(
    [real_sensor_25, synthetic_sensor_25],
    bins=25,
    density=True,
    label=["real", "synthetic"],
)
plt.xlabel("Sensor 25 Values")
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# Real vs. synthetic distribution of sensor_11 (feature index 1).
real_sensor_11 = features[:, :, 1].flatten()
synthetic_sensor_11 = synthetic_features[:, :, 1].flatten()
plt.hist(
    [real_sensor_11, synthetic_sensor_11],
    bins=25,
    density=True,
    label=["real", "synthetic"],
)
plt.xlabel("Sensor 11 Values")
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# Real vs. synthetic distribution of sensor_36 (feature index 2).
real_sensor_36 = features[:, :, 2].flatten()
synthetic_sensor_36 = synthetic_features[:, :, 2].flatten()
plt.hist(
    [real_sensor_36, synthetic_sensor_36],
    bins=25,
    density=True,
    label=["real", "synthetic"],
)
plt.xlabel("Sensor 36 Values")
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# Save the trained model to disk for future use.
model.save("dgan_model_0.pt")

# Reloading sketch (left commented out; not run in this notebook):
# X = model.load("dgan_model_0.pt")

# X

### NOTES ON RESULTS

- This is really promising! Training was fast, data generation is near-instantaneous, and overall the synthetic data looks solid by the eye test and on simple metrics and plots.

- In future attempts, I'll pass more data to the model, likely using longer samples and more of them, either by selecting a larger initial slice of the data, by using overlapping windows, or both. We'll see how that affects training time and performance.

- We will need to discuss with the team how long the generated samples should ideally be. If longer samples (such as days of data) are needed, we could try thinning the data out again by keeping only every 2nd or 3rd row.

- Also, I scaled the data manually beforehand, but it looks like we could pass in the unscaled data and let the model scale it for training and unscale it when generating synthetic data - this could be convenient if it works well.

- We could also try passing in 'machine_status' and seeing how that changes the results.