In [None]:
!pip install torch==1.11.0  # version recommended by source
!pip install git+https://github.com/gretelai/gretel-synthetics.git

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.dates as md
from sklearn.preprocessing import MinMaxScaler
from pickle import dump, load
from skimage.util.shape import view_as_windows
import time

import torch

from gretel_synthetics.timeseries_dgan.dgan import DGAN
from gretel_synthetics.timeseries_dgan.config import DGANConfig, OutputType

In [None]:
# Paths of the seven slice CSVs, in slice order.
slice_paths = [
    "../data/kagglePump/slice0.csv",
    "../data/kagglePump/slice1.csv",
    "../data/kagglePump/slice2.csv",
    "../data/kagglePump/slice3.csv",
    "../data/kagglePump/slice4.csv",
    "../data/kagglePump/slice5.csv",
    "../data/kagglePump/slice6.csv",
]

# Read each file into its own DataFrame; later cells refer to these names.
slice0, slice1, slice2, slice3, slice4, slice5, slice6 = map(pd.read_csv, slice_paths)

# Print the (rows, columns) shape of every slice on one line.
print(
    *(
        frame.shape
        for frame in (slice0, slice1, slice2, slice3, slice4, slice5, slice6)
    )
)

In [None]:
# Gather the loaded slices and name the sensor columns used for plotting.
slices_list = [slice0, slice1, slice2, slice3, slice4, slice5, slice6]
sensor_cols = ["sensor_25", "sensor_11", "sensor_36", "sensor_34"]

# Convert the timestamp column to datetime, in place on each loaded frame.
for frame in slices_list:
    frame["timestamp"] = pd.to_datetime(frame["timestamp"])

# Drop the first row of every slice so the data matches what the previous
# notebook used, then report how many rows each slice keeps.
slices_list = [frame.iloc[1:, :] for frame in slices_list]
for trimmed in slices_list:
    print(len(trimmed))

# Stack all trimmed slices into one DataFrame (each slice keeps its own row
# index, so index labels repeat across slices).
all_slices = pd.concat(slices_list)

# NumPy view of the combined frame, addressed by column position later on.
all_slices_np = all_slices.to_numpy()

In [None]:
# Show the concatenated DataFrame of all slices as this cell's output.
all_slices

In [None]:
# DGAN settings, kept in the notebook for reference.
# NOTE(review): per the gretel-synthetics documentation, DGAN.load() rebuilds
# the model from the configuration saved inside the checkpoint, so the values
# below presumably do not affect the loaded model - confirm they match the
# settings the checkpoint was trained with.
config = DGANConfig(
    max_sequence_len=720,  # hard coded from prev shape
    sample_len=20,  # trying a larger sample_len
    batch_size=300,  # was written as min(1000, 300), which is always 300
    apply_feature_scaling=True,
    apply_example_scaling=False,
    use_attribute_discriminator=False,
    generator_learning_rate=1e-4,
    discriminator_learning_rate=1e-4,
    epochs=10000,
)

# Load the trained model straight from the checkpoint. `load` is called on the
# class, so no throwaway DGAN(config) instance is constructed first.
# map_location forces the saved tensors onto the CPU.
model = DGAN.load("../models/dgan_model_2.pt", map_location=torch.device("cpu"))

In [None]:
# Generate synthetic data - this ran near instantly
# generate_numpy(1000) is unpacked into two values: the first is discarded and
# the second is kept as synthetic_features. Later cells index that array with
# three axes (synthetic_features[:, :, k]) and use shape[2] as the number of
# sensor columns; presumably the layout is (window, time step, sensor) - TODO confirm.
# NOTE(review): no random seed is set in the visible cells, so the generated
# windows will likely differ between runs.
_, synthetic_features = model.generate_numpy(1000)

In [None]:
def plot_12hr_slice(slice, ind):
    """Draw one line per sensor for a single window on the current axes.

    Args:
        slice: 2-D array indexed as [row, column]; column i is plotted and
            labelled with sensor_cols[i]. The name shadows the builtin
            `slice` but is kept so existing callers keep working.
        ind: Identifier appended to the word "Slice" to form the axes title.
    """
    ax = plt.gca()
    for col_idx, sensor_name in enumerate(sensor_cols):
        ax.plot(slice[:, col_idx], label=sensor_name)
    ax.title.set_text("Slice" + str(ind))
    # Small legend font so it fits inside a subplot.
    ax.legend(prop={"size": 7})

In [None]:
# Plot a 3x3 grid of randomly chosen synthetic windows.

figure = plt.figure(figsize=(10, 10))
figure.suptitle("Synthetic 12hr Window plots", fontsize=25, fontweight="roman")

# Sample indices from the number of windows actually generated rather than a
# hard-coded 1000, so this cell stays correct if the generation count changes
# (a smaller count would raise IndexError, a larger one would never be sampled).
n_windows = len(synthetic_features)

for i in range(9):
    figure.add_subplot(3, 3, i + 1)
    # Windows are drawn independently, so the same index can appear twice.
    index = np.random.choice(n_windows)
    sl = synthetic_features[index]
    plot_12hr_slice(sl, index)

plt.show()

In [None]:
# Compare (non-temporal) correlations between the 4 sensors.
# Collapse the first two axes of the synthetic array so every row is one
# observation of all sensor columns.
n_sensor_columns = synthetic_features.shape[2]
synthetic_flat = synthetic_features.reshape(-1, n_sensor_columns)
synthetic_df = pd.DataFrame(synthetic_flat, columns=sensor_cols)

# Real data: keep only the sensor columns before correlating.
real_sensor_df = all_slices.drop(columns=["timestamp", "machine_status"])

print("Correlation in real data:", real_sensor_df.corr(), "", sep="\n")
print("Correlation in synthetic data:", synthetic_df.corr(), sep="\n")

# Author's observation: correlations between sensor variables are worse now.
# Explanation offered: 6 different slices are used rather than 1.
# NOTE(review): this notebook loads 7 slices (slice0..slice6) - confirm the count.
# Whether this is acceptable still has to be discussed.

In [None]:
# Distribution of sensor_34 values: real vs. synthetic.
# Real values are taken by position (column 4 of all_slices_np); this assumes
# that column is sensor_34 - TODO confirm against the CSV column order.
real_sensor_34 = all_slices_np[:, 4].flatten()
# Synthetic values: feature index 3, flattened over all windows and steps.
synthetic_sensor_34 = synthetic_features[:, :, 3].flatten()

plt.hist(
    [real_sensor_34, synthetic_sensor_34],
    bins=25,
    density=True,
    label=["real", "synthetic"],
)
plt.xlabel("Sensor 34 Values")
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# Distribution of sensor_36 values: real vs. synthetic.
# Real values are taken by position (column 3 of all_slices_np); this assumes
# that column is sensor_36 - TODO confirm against the CSV column order.
real_sensor_36 = all_slices_np[:, 3].flatten()
# Synthetic values: feature index 2, flattened over all windows and steps.
synthetic_sensor_36 = synthetic_features[:, :, 2].flatten()

plt.hist(
    [real_sensor_36, synthetic_sensor_36],
    bins=25,
    density=True,
    label=["real", "synthetic"],
)
plt.xlabel("Sensor 36 Values")
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# Distribution of sensor_11 values: real vs. synthetic.
# Real values are taken by position (column 2 of all_slices_np); this assumes
# that column is sensor_11 - TODO confirm against the CSV column order.
real_sensor_11 = all_slices_np[:, 2].flatten()
# Synthetic values: feature index 1, flattened over all windows and steps.
synthetic_sensor_11 = synthetic_features[:, :, 1].flatten()

plt.hist(
    [real_sensor_11, synthetic_sensor_11],
    bins=25,
    density=True,
    label=["real", "synthetic"],
)
plt.xlabel("Sensor 11 Values")
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# Compare distribution of sensor_25 values: real vs. synthetic.
# Real values are taken by position (column 1 of all_slices_np); this assumes
# that column is sensor_25 - TODO confirm against the CSV column order.
# Synthetic values use feature index 0, flattened over all windows and steps.
plt.hist(
    [all_slices_np[:, 1].flatten(), synthetic_features[:, :, 0].flatten()],
    label=["real", "synthetic"],
    bins=25,
    density=True,
)
plt.legend()
# The x-axis label previously read "Sensor 11 Values" (copy-paste slip from
# the sensor_11 cell); this cell plots sensor_25.
plt.xlabel("Sensor 25 Values")
plt.ylabel("Density")
plt.show()