Reference notebook: https://github.com/gretelai/gretel-synthetics/blob/master/examples/timeseries_dgan.ipynb

In [None]:
# version recommended by source
!pip install torch==1.11.0

In [None]:
!pip install git+https://github.com/gretelai/gretel-synthetics.git

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.dates as md
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
from pickle import dump, load

import torch

from gretel_synthetics.timeseries_dgan.dgan import DGAN
from gretel_synthetics.timeseries_dgan.config import DGANConfig, OutputType

In [None]:
# Data source: https://www.kaggle.com/code/xiaxiaxu/predictmachinefailureinadvance/data
sensor = pd.read_csv("sensor.csv")

# Show the columns with the most missing values. This expression used to sit in
# the middle of the cell, where its result was computed and silently discarded.
null_counts = sensor.isnull().sum()
print(null_counts.sort_values(ascending=False).head(10), "\n")

# Columns removed before modelling.
# NOTE(review): presumably chosen because of their missing values (plus the
# unnamed leading CSV column) - confirm against the counts printed above.
COLS_TO_DROP = ["Unnamed: 0", "sensor_00", "sensor_15", "sensor_50", "sensor_51"]
sensor = sensor.drop(columns=COLS_TO_DROP)

print(sensor.shape)

In [None]:
# Convert the timestamp column from text to a proper datetime dtype.
sensor["timestamp"] = pd.to_datetime(sensor["timestamp"])

# Report dtypes AFTER the conversion. Printing first (as this cell originally
# did) shows the stale dtypes, which made it look as though the cell had to be
# run twice before the conversion took effect.
print(sensor.dtypes, "\n")

In [None]:
# Count the rows that contain at least one missing value.
row_has_nan = sensor.isna().any(axis=1)
print(int(row_has_nan.sum()))

In [None]:
# Discard every row with a missing value, then confirm that none are left.
sensor = sensor.dropna(axis=0)
remaining_nan_rows = int(sensor.isna().any(axis=1).sum())
print(remaining_nan_rows)

In [None]:
# Restrict the data to a window centred around 2 failures.
# Positional row bounds of that window, counted after the NaN rows were dropped.
WINDOW_START, WINDOW_STOP = 16000, 26080

# .copy() yields an independent frame: later cells modify data_around_failures,
# and modifying a bare slice of `sensor` triggers pandas' SettingWithCopyWarning.
data_around_failures = sensor.iloc[WINDOW_START:WINDOW_STOP].copy()
print(len(data_around_failures))

# Preview only the first rows rather than dumping the whole frame.
data_around_failures.head()

In [None]:
# Grid of line plots: one panel per column of the full `sensor` frame, skipping
# the first and last columns (presumably timestamp and machine_status - the
# selection is purely positional).
plotted_columns = sensor.columns[1:-1]
fig, axes = plt.subplots(10, 5, figsize=(20, 20))

for col, ax in zip(plotted_columns, axes.flat):
    sensor[col].plot.line(ax=ax)
    ax.set_title(col)
# disable leftover axes
for ax in axes.flat[plotted_columns.size :]:
    ax.set_axis_off()

# tight_layout has to run once the panels are drawn and titled; calling it on
# the empty grid (as before) cannot account for the titles and tick labels.
fig.tight_layout()

Okay, the scaling sanity check passed. Let's start splitting the data to prepare it for training with DGAN.

NOTE: the generated data has the same shape as the examples passed in for training. So if we pass in samples of 10 rows each, the model will only generate samples of 10 rows.

We now have 10,080 rows of data, which corresponds to 10,080 minutes = 168 hours = 7 days of data (assuming one reading per minute and no rows dropped inside the selected window).

I'm going to try splitting the data into two-hour segments, giving us 84 examples of 120 rows (one row per minute) each.

In [None]:
# Drop the timestamp and machine_status columns so only sensor readings remain.
# Re-binding (instead of drop(..., inplace=True)) avoids mutating a slice of
# `sensor`, and errors="ignore" keeps the cell re-runnable once the columns are
# already gone.
data_around_failures = data_around_failures.drop(
    columns=["timestamp", "machine_status"], errors="ignore"
)
features = data_around_failures.to_numpy()
print(features.shape)

# Number of rows (time points) in each training example.
SEQUENCE_LEN = 120

# number of complete examples available; a trailing partial example is discarded
n = features.shape[0] // SEQUENCE_LEN
print(n)
if n == 0:
    raise ValueError(
        f"need at least {SEQUENCE_LEN} rows to build one example, got {features.shape[0]}"
    )

# reshape the data accordingly
features = features[: n * SEQUENCE_LEN, :].reshape(-1, SEQUENCE_LEN, features.shape[1])
# Shape is now (# examples, # time points, # features)
print(features.shape)

In [None]:
# Training on a GPU is recommended; report whether CUDA is usable here.
# (The original run was done without one.)
cuda_ready = torch.cuda.is_available()
cuda_ready

In [None]:
# Seed the RNGs so that weight initialisation and training noise are repeatable
# across runs of the notebook.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# set up DGAN config.
config = DGANConfig(
    # Length of every training example (second axis of `features`).
    max_sequence_len=features.shape[1],
    # NOTE(review): per the DGAN docs max_sequence_len should be a multiple of
    # sample_len (120 / 12 = 10 here) - confirm if either value changes.
    sample_len=12,
    batch_size=min(1000, features.shape[0]),
    # `features` is passed in without any scaling step in this notebook, so the
    # flag is left on; NOTE(review): expected to make DGAN rescale continuous
    # features itself - confirm against the DGANConfig documentation.
    apply_feature_scaling=True,
    apply_example_scaling=False,
    use_attribute_discriminator=False,
    generator_learning_rate=1e-4,
    discriminator_learning_rate=1e-4,
    epochs=10000,
)

model = DGAN(config)

In [None]:
# Every feature column is declared as a continuous output, one entry per
# feature along the last axis of `features`.
n_features = features.shape[2]
feature_types = [OutputType.CONTINUOUS for _ in range(n_features)]
model.train_numpy(features, feature_types=feature_types)

Training finished in around 6-7 minutes on the large notebook image - not bad!

In [None]:
# Sample 100 synthetic sequences (this ran near instantly).
# generate_numpy returns a pair; only the second element is used below.
# NOTE(review): the first element is presumably the generated attributes.
_unused_first, synthetic_features = model.generate_numpy(100)

In [None]:
# Column-wise correlation between the real window and the synthetic samples.
# corrwith aligns the two frames on row label and column name, so it pairs row i
# of the real data with row i of the flattened synthetic data.
# NOTE(review): generated sequences are not tied to specific real time steps, so
# weak correlations here are expected and say little about synthetic quality.
sensor_cols = data_around_failures.columns
synthetic_df = pd.DataFrame(
    synthetic_features.reshape(-1, synthetic_features.shape[2]), columns=sensor_cols
)
# drop=True discards the old row labels. The default would insert them as an
# extra "index" column, which then shows up as NaN in corrwith, leaks into the
# later correlation heatmap, and breaks this cell when it is re-run.
data_around_failures = data_around_failures.reset_index(drop=True)
data_around_failures.corrwith(synthetic_df)

In [None]:
# Multicollinearity seems to be a problem in both the real and synthetic data;
# but at least this means that the synthetic data is mimicking the real data well.

fig, (ax_real, ax_synth) = plt.subplots(1, 2, figsize=(13, 5))

# Left panel: feature-to-feature correlations of the real window.
sns.heatmap(data_around_failures.corr(), cmap="Greens", ax=ax_real)
ax_real.set_title("Real data: feature correlations")

# Right panel: the same matrix computed on the synthetic samples.
sns.heatmap(synthetic_df.corr(), cmap="Blues", ax=ax_synth)
ax_synth.set_title("Synthetic data: feature correlations")

plt.show()

In [None]:
# Grid of line plots for the synthetic sensor data, one panel per column.
# synthetic_df holds the generated sequences flattened row-wise, so each panel
# shows them back to back along the x axis.

fig, axes = plt.subplots(10, 5, figsize=(20, 20))

for col, ax in zip(synthetic_df.columns, axes.flat):
    synthetic_df[col].plot.line(ax=ax)
    ax.set_title(col)
# disable leftover axes
for ax in axes.flat[synthetic_df.columns.size :]:
    ax.set_axis_off()

# tight_layout has to run once the panels are drawn and titled; calling it on
# the empty grid (as before) cannot account for the titles and tick labels.
fig.tight_layout()

In [None]:
def plot_real_vs_synthetic_hist(
    real_df: pd.DataFrame, synth_df: pd.DataFrame, column: str, bins: int = 25
):
    """Draw side-by-side density histograms of one column for real and synthetic data.

    Args:
        real_df: Frame holding the real readings; must contain ``column``.
        synth_df: Frame holding the synthetic readings; must contain ``column``.
        column: Name of the column to compare, e.g. ``"sensor_34"``.
        bins: Number of histogram bins shared by both series.

    Returns:
        The matplotlib Axes the histogram was drawn on.
    """
    fig, ax = plt.subplots()
    ax.hist(
        [real_df[column], synth_df[column]],
        label=["real", "synthetic"],
        bins=bins,
        density=True,
    )
    ax.legend()
    # "sensor_34" -> "Sensor 34 Values", the label format used for every plot.
    ax.set_xlabel(f"{column.replace('_', ' ').capitalize()} Values")
    ax.set_ylabel("Density")
    return ax


# Compare the real and synthetic value distributions for a handful of sensors.
# One helper plus a loop replaces four copy-pasted cells that differed only in
# the sensor name.
for sensor_name in ["sensor_34", "sensor_25", "sensor_11", "sensor_36"]:
    plot_real_vs_synthetic_hist(data_around_failures, synthetic_df, sensor_name)
    plt.show()

In [None]:
# Persist the trained model so it can be reused later without retraining.
MODEL_PATH = "dgan_model_3_48_sensors.pt"
model.save(MODEL_PATH)

# To restore it in a later session, load from the same path, e.g.
# DGAN.load(MODEL_PATH)  (NOTE(review): confirm the loader's signature).