In [None]:
!pip install torch==1.11.0  # version recommended by source
!pip install git+https://github.com/gretelai/gretel-synthetics.git

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.dates as md
from sklearn.preprocessing import MinMaxScaler
from pickle import dump, load
from skimage.util.shape import view_as_windows
import time
import seaborn as sns


import torch

from gretel_synthetics.timeseries_dgan.dgan import DGAN
from gretel_synthetics.timeseries_dgan.config import DGANConfig, OutputType

In [None]:
# Read the seven slice CSVs from the kagglePump data folder, in order.
_slice_paths = (
    "../data/kagglePump/full_slice0.csv",
    "../data/kagglePump/full_slice1.csv",
    "../data/kagglePump/full_slice2.csv",
    "../data/kagglePump/full_slice3.csv",
    "../data/kagglePump/full_slice4.csv",
    "../data/kagglePump/full_slice5.csv",
    "../data/kagglePump/full_slice6.csv",
)
slice0, slice1, slice2, slice3, slice4, slice5, slice6 = map(pd.read_csv, _slice_paths)

# Show the (rows, columns) shape of every slice on a single line.
print(
    *(
        frame.shape
        for frame in (slice0, slice1, slice2, slice3, slice4, slice5, slice6)
    )
)

In [None]:
# Collect the loaded frames in one list so later cells can filter and
# transform them together.
slices_list = [slice0, slice1, slice2, slice3, slice4, slice5, slice6]

In [None]:
# Exclude slice4: it is too different from the other slices,
# as found in the plots in notebook 10.
# Filtering by identity (rather than popping index 4) keeps this cell
# idempotent: re-running it cannot remove a second slice by accident.
slices_list = [sl for sl in slices_list if sl is not slice4]

print(len(slices_list))

In [None]:
COLS_TO_DROP = ["Unnamed: 0", "sensor_00", "sensor_15", "sensor_50", "sensor_51"]
cleanup_nums = {"machine_status": {"NORMAL": 0, "BROKEN": 1, "RECOVERING": 2}}

# Every step mutates the frames in place on purpose: later cells read the
# cleaned columns through the original `slice0` / `slice2` names, so the
# objects held in `slices_list` must be the ones that change.
for sl in slices_list:
    sl["timestamp"] = pd.to_datetime(sl["timestamp"])  # cast to datetime
    # errors="ignore" lets the cell be re-run once the columns are already gone.
    sl.drop(columns=COLS_TO_DROP, errors="ignore", inplace=True)
    # Encode machine_status as integers. This has to happen in place:
    # assigning the result of replace() to the loop variable would leave the
    # frame in the list untouched and the status column as strings.
    sl.replace(cleanup_nums, inplace=True)
    sl.dropna(axis=0, inplace=True)  # drop rows containing NaNs

In [None]:
# Shapes after the cleanup cell. slice4 was taken out of slices_list before
# the cleanup loop ran, so the shape printed for it is still the raw one.
print(
    *[
        frame.shape
        for frame in (slice0, slice1, slice2, slice3, slice4, slice5, slice6)
    ]
)

In [None]:
# These plots are very large (one panel per column), but they give a complete
# overview of a slice.
def plot_slice(one_slice):
    """Draw one line plot per column of a slice, skipping the first and last.

    Parameters
    ----------
    one_slice : pandas.DataFrame
        Frame to plot. Every column in ``one_slice.columns[1:-1]`` is drawn
        against the frame's index in its own panel, titled with the column
        name. Presumably the first column is the timestamp and the last one
        is machine_status -- confirm against the caller.

    Returns
    -------
    None
        The figure is created through pyplot and left for the notebook to
        render.
    """
    n_cols = 5
    plot_columns = one_slice.columns[1:-1]
    # Keep the usual 10 x 5 grid, but grow it when there are more than 50
    # columns so that zip() below cannot silently skip any of them.
    n_rows = max(10, -(-len(plot_columns) // n_cols))
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 2 * n_rows))

    for col, ax in zip(plot_columns, axes.flat):
        one_slice[col].plot.line(ax=ax)
        ax.set_title(col)
    # disable leftover axes
    for ax in axes.flat[len(plot_columns) :]:
        ax.set_axis_off()

    # Run the layout pass last, once titles and tick labels exist, so the
    # spacing accounts for them.
    fig.tight_layout()

In [None]:
# plot_slice(slice0)

In [None]:
# plot_slice(slice1)

In [None]:
# Replace each frame in the list with its value array (timestamp removed).
for idx, frame in enumerate(slices_list):
    values = frame.drop(columns=["timestamp"]).to_numpy()
    # NOTE(review): the first row is discarded here; the reason is not evident
    # from this notebook -- confirm it is intentional.
    slices_list[idx] = np.array(values[1:])
    print(slices_list[idx].shape)

In [None]:
# Turn the list of per-slice arrays into a single numpy array.
# NOTE(review): this yields one regular 3-D array only if every slice has the
# same number of rows after cleaning -- confirm.
slices_list = np.asarray(slices_list)
np.shape(slices_list)

In [None]:
# Cut the first slice into overlapping training windows.
# Input (per the shape shown above): 6, 2160, 49 with machine status.
# Target: something closer to (300, 720, n_features), i.e. about 300 training
# samples of 720 datapoints each (12 hrs), keeping every feature column.
# The window width is read from the data so it stays correct if the set of
# dropped columns changes.
window_shape = (720, slices_list[0].shape[1])
windowed_data = view_as_windows(slices_list[0], window_shape, step=36)
# view_as_windows returns (n_windows, 1, 720, n_features). Index away exactly
# that singleton axis; a bare np.squeeze would also collapse the window axis
# when a slice yields a single window.
windowed_data = windowed_data[:, 0]
windowed_data.shape

In [None]:
# Window every slice and stack the windows along the sample axis.
# The array is rebuilt from all slices (the first one included) rather than
# appended to, so re-running this cell cannot duplicate windows, the number of
# slices is not hard-coded, and the data is copied once instead of per slice.
windowed_data = np.concatenate(
    [view_as_windows(sl, window_shape, step=36)[:, 0] for sl in slices_list],
    axis=0,
)

In [None]:
# Build the DGAN configuration from the windowed training array.
n_windows, window_len = windowed_data.shape[:2]

dgan_settings = {
    "max_sequence_len": window_len,
    "sample_len": 20,  # trying a larger sample_len
    # Never ask for a batch larger than the number of training windows.
    "batch_size": min(1000, n_windows),
    "apply_feature_scaling": True,
    "apply_example_scaling": False,
    "use_attribute_discriminator": False,
    "generator_learning_rate": 1e-4,
    "discriminator_learning_rate": 1e-4,
    "epochs": 10000,
}
config = DGANConfig(**dgan_settings)

model = DGAN(config)

In [None]:
# Recommended to train with a GPU.
# Displays True when torch can see a CUDA device, False otherwise.
torch.cuda.is_available()

In [None]:
# Train on the windowed data and record wall-clock timestamps around the run.
start = time.time()
# Every feature column is continuous except the final one, which is discrete.
n_feature_cols = windowed_data.shape[2]
column_types = [OutputType.CONTINUOUS] * (n_feature_cols - 1)
column_types.append(OutputType.DISCRETE)
model.train_numpy(windowed_data, feature_types=column_types)
stop = time.time()

In [None]:
# Elapsed time between the two time.time() readings taken around training.
print(f"Training time: {stop - start}s")

In [None]:
# Save the trained model for future use.
# NOTE(review): the path is relative to the notebook's working directory and
# assumes ../models already exists -- confirm.
model.save("../models/dgan_model_5_48_sensors.pt")

In [None]:
# Generate one synthetic sample - this ran near instantly.
# generate_numpy returns two values; only the second (features) is used in
# the following cells. The first (presumably the attributes -- confirm) gets a
# real name because assigning to `_` in a notebook shadows IPython's
# last-output variable.
synthetic_attributes, synthetic_features = model.generate_numpy(1)

In [None]:
# Shape of the first (and only requested) synthetic sample.
np.shape(synthetic_features[0])

In [None]:
# Column labels for the synthetic frame: every column of slice0 except the
# first (presumably the timestamp, once the cleanup cell has run -- confirm).
sensor_cols = slice0.columns[1:]

In [None]:
# Collapse the leading axes so each row holds one set of feature values, then
# label the columns with the names taken from slice0.
feature_count = synthetic_features.shape[2]
flat_features = synthetic_features.reshape(-1, feature_count)
synthetic_df = pd.DataFrame(flat_features, columns=sensor_cols)
synthetic_df.shape

In [None]:
# Side-by-side correlation heatmaps: one real slice next to the synthetic sample.
# Numeric columns are selected explicitly because slice2 still carries the
# datetime `timestamp` column, and from pandas 2.0 on DataFrame.corr() no
# longer drops non-numeric columns silently.
real_corr = slice2.select_dtypes(include="number").corr()
synthetic_corr = synthetic_df.select_dtypes(include="number").corr()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 5))
sns.heatmap(real_corr, cmap="Greens", ax=ax1)
ax1.set_title("Real data (slice2)")
sns.heatmap(synthetic_corr, cmap="Blues", ax=ax2)
ax2.set_title("Synthetic data")
plt.show()

Further testing of the model's efficacy will be done in a dedicated notebook.