In [None]:
!pip install torch==1.11.0
!pip install git+https://github.com/gretelai/gretel-synthetics.git

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.dates as md
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from pickle import dump, load
from skimage.util.shape import view_as_windows

import torch

from gretel_synthetics.timeseries_dgan.dgan import DGAN
from gretel_synthetics.timeseries_dgan.config import DGANConfig, OutputType

In [None]:
# Read the seven CSV slices (full_slice0.csv .. full_slice6.csv) in order and
# bind each one to its own name, since later cells refer to them individually.
slice0, slice1, slice2, slice3, slice4, slice5, slice6 = (
    pd.read_csv(f"../data/kagglePump/full_slice{slice_no}.csv")
    for slice_no in range(7)
)

# Show the (rows, columns) shape of every slice on a single line.
print(
    *(
        frame.shape
        for frame in (slice0, slice1, slice2, slice3, slice4, slice5, slice6)
    )
)

In [None]:
# Keep every slice except the one at position 4 (slice4).
all_slices = [slice0, slice1, slice2, slice3, slice4, slice5, slice6]
slices_list = all_slices[:4] + all_slices[5:]

print(len(slices_list))

In [None]:
# Columns removed from every slice before analysis.
COLS_TO_DROP = ["Unnamed: 0", "sensor_00", "sensor_15", "sensor_50", "sensor_51"]
# Integer codes for the categorical machine_status column.
cleanup_nums = {"machine_status": {"NORMAL": 0, "BROKEN": 1, "RECOVERING": 2}}

# All steps mutate the frames in place on purpose: the entries of slices_list
# are the same objects as slice0..slice6, and a later cell reads slice0.columns
# expecting the cleaned layout. The loop variable is never reassigned, because
# rebinding it (`sl = sl.replace(...)`) would discard the result and leave the
# frame in the list untouched.
for sl in slices_list:
    sl["timestamp"] = pd.to_datetime(sl["timestamp"])  # cast to datetime
    # errors="ignore" keeps the cell re-runnable after the columns are gone.
    sl.drop(columns=COLS_TO_DROP, inplace=True, errors="ignore")
    # Encode machine_status labels as integers (previously a silent no-op).
    sl.replace(cleanup_nums, inplace=True)
    sl.dropna(axis=0, inplace=True)  # drop rows that contain any NaN

In [None]:
# DGAN hyperparameters, collected in one mapping and expanded into the config.
dgan_settings = {
    "max_sequence_len": 720,
    "sample_len": 20,  # trying a larger sample_len
    "batch_size": 300,
    "apply_feature_scaling": True,
    "apply_example_scaling": False,
    "use_attribute_discriminator": False,
    "generator_learning_rate": 1e-4,
    "discriminator_learning_rate": 1e-4,
    "epochs": 10000,
}
config = DGANConfig(**dgan_settings)

# Model instance built from the config; the next cell calls `load` on it.
model = DGAN(config)

In [None]:
# Restore the saved model 4 and model 5 checkpoints, mapping tensors to the CPU.
cpu_device = torch.device("cpu")
model4 = model.load("../models/dgan_model_4_48_sensors.pt", map_location=cpu_device)
model5 = model.load("../models/dgan_model_5_48_sensors.pt", map_location=cpu_device)

In [None]:
# Generate synthetic data - this ran near instantly.
# generate_numpy returns two values; only the second (the feature array) is
# used. The first is bound to a named throwaway rather than `_`, because `_`
# is IPython's "last output" variable and rebinding it disables that feature.
N_SYNTHETIC_SEQUENCES = 1000  # number of sequences sampled from each model
_attributes4, synthetic_features4 = model4.generate_numpy(N_SYNTHETIC_SEQUENCES)
_attributes5, synthetic_features5 = model5.generate_numpy(N_SYNTHETIC_SEQUENCES)

In [None]:
# Each array is indexed as (sample, time step, feature); the expected shape is
# (1000, 720, 49), i.e. 1000 samples of 720 rows x 49 columns.
# Both shapes are returned as one tuple because a cell only displays its last
# expression — two separate lines would hide the first shape.
synthetic_features4.shape, synthetic_features5.shape

In [None]:
# Column labels for the synthetic frames: every column of the cleaned slice0
# except the first one.
# NOTE(review): this relies on the cleanup cell having dropped COLS_TO_DROP from
# slice0 in place, and presumes the remaining first column is "timestamp" — verify.
sensor_cols = slice0.columns[1:]

In [None]:
# Flatten each 3-D synthetic array to 2-D by stacking all sequences row-wise
# (the last axis, the features, is kept as columns), then label the columns.
n_features4 = synthetic_features4.shape[2]
flat_features4 = synthetic_features4.reshape(-1, n_features4)
synthetic_df_4 = pd.DataFrame(flat_features4, columns=sensor_cols)

n_features5 = synthetic_features5.shape[2]
flat_features5 = synthetic_features5.reshape(-1, n_features5)
synthetic_df_5 = pd.DataFrame(flat_features5, columns=sensor_cols)

In [None]:
# Stack the cleaned slices into one frame and remove the timestamp column.
# ignore_index=True gives the result a fresh 0..n-1 index; without it every
# slice keeps its own row labels, so the combined frame carries duplicate
# labels, which index-aligned operations (e.g. corrwith) then match on.
real_slices = pd.concat(slices_list, ignore_index=True).drop(columns=["timestamp"])
real_slices.head()

Comparing how closely synthetic data 4 and synthetic data 5 match the real data:
I. Correlations
II. Distributions
III. PCA

I. Correlations 
a) Correlation matrices for real, synthetic 4, and synthetic 5 data
b) Correlation matrices for the differences between real and synthetic data
c) Correlations between real and synthetic

In [None]:
# Side-by-side correlation heatmaps: real data, synthetic 4, synthetic 5.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 5))

heatmap_panels = [
    (ax1, real_slices, "Greens", "Real Data"),
    (ax2, synthetic_df_4, "Blues", "Synthetic Data 4"),
    (ax3, synthetic_df_5, "BuPu", "Synthetic Data 5"),
]
for panel_ax, frame, colormap, panel_title in heatmap_panels:
    sns.heatmap(frame.corr(), cmap=colormap, ax=panel_ax)
    panel_ax.set_title(panel_title)

plt.show()

In [None]:
# Absolute element-wise difference between the real correlation matrix and each
# synthetic correlation matrix (smaller values mean a closer match).
real_corr = real_slices.corr()  # computed once and reused for both comparisons
real_synthetic4_corr = (real_corr - synthetic_df_4.corr()).abs()
# Previously this line subtracted synthetic_df_4 again, so both panels showed
# the same comparison; it now uses synthetic_df_5.
real_synthetic5_corr = (real_corr - synthetic_df_5.corr()).abs()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
sns.heatmap(real_synthetic4_corr, cmap="viridis", ax=ax1)
sns.heatmap(real_synthetic5_corr, cmap="magma", ax=ax2)

ax1.set_title("|Real - Synthetic 4| correlation difference")
ax2.set_title("|Real - Synthetic 5| correlation difference")
plt.show()

In [None]:
# Column-by-column correlation between the real data and synthetic data 4.
# NOTE(review): corrwith pairs rows by index label, not by any real/synthetic
# correspondence, so these values depend on how the two frames are indexed.
real_slices.corrwith(other=synthetic_df_4)

In [None]:
# Column-by-column correlation between the real data and synthetic data 5.
# NOTE(review): corrwith pairs rows by index label, not by any real/synthetic
# correspondence, so these values depend on how the two frames are indexed.
real_slices.corrwith(other=synthetic_df_5)

II. Distributions of real and synthetic data
a) Sensor 34
b) Sensor 25 
c) Sensor 11
d) Sensor 36

In [None]:
# Compare the distribution of sensor_34 values: real data against each
# synthetic data set, one panel per synthetic model.
sensor = "sensor_34"
panels = [
    (" Real vs. Synthetic 4", synthetic_df_4),
    (" Real vs. Synthetic 5", synthetic_df_5),
]

fig, axes = plt.subplots(1, 2)
for ax, (panel_title, synthetic_df) in zip(axes, panels):
    ax.hist(
        [real_slices[sensor], synthetic_df[sensor]],
        label=["real", "synthetic"],
        bins=25,
        density=True,
    )
    ax.set_xlabel("Sensor 34 Values")
    ax.set_ylabel("Density")
    ax.set_title(panel_title)
    ax.legend()  # legend on every panel (the left one previously had none)

fig.tight_layout()
plt.show()

In [None]:
# Compare the distribution of sensor_25 values: real data against each
# synthetic data set, one panel per synthetic model.
sensor = "sensor_25"
panels = [
    (" Real vs. Synthetic 4", synthetic_df_4),
    (" Real vs. Synthetic 5", synthetic_df_5),
]

fig, axes = plt.subplots(1, 2)
for ax, (panel_title, synthetic_df) in zip(axes, panels):
    ax.hist(
        [real_slices[sensor], synthetic_df[sensor]],
        label=["real", "synthetic"],
        bins=25,
        density=True,
    )
    ax.set_xlabel("Sensor 25 Values")
    ax.set_ylabel("Density")
    ax.set_title(panel_title)
    ax.legend()  # legend on every panel (the left one previously had none)

fig.tight_layout()
plt.show()

In [None]:
# Compare the distribution of sensor_11 values: real data against each
# synthetic data set, one panel per synthetic model.
sensor = "sensor_11"
panels = [
    (" Real vs. Synthetic 4", synthetic_df_4),
    (" Real vs. Synthetic 5", synthetic_df_5),
]

fig, axes = plt.subplots(1, 2)
for ax, (panel_title, synthetic_df) in zip(axes, panels):
    ax.hist(
        [real_slices[sensor], synthetic_df[sensor]],
        label=["real", "synthetic"],
        bins=25,
        density=True,
    )
    ax.set_xlabel("Sensor 11 Values")
    ax.set_ylabel("Density")
    ax.set_title(panel_title)
    ax.legend()  # legend on every panel (the left one previously had none)

fig.tight_layout()
plt.show()

In [None]:
# Compare the distribution of sensor_36 values: real data against each
# synthetic data set, one panel per synthetic model.
sensor = "sensor_36"
panels = [
    (" Real vs. Synthetic 4", synthetic_df_4),
    (" Real vs. Synthetic 5", synthetic_df_5),
]

fig, axes = plt.subplots(1, 2)
for ax, (panel_title, synthetic_df) in zip(axes, panels):
    ax.hist(
        [real_slices[sensor], synthetic_df[sensor]],
        label=["real", "synthetic"],
        bins=25,
        density=True,
    )
    ax.set_xlabel("Sensor 36 Values")
    ax.set_ylabel("Density")
    ax.set_title(panel_title)
    ax.legend()  # legend on every panel (the left one previously had none)

fig.tight_layout()
plt.show()

III. PCA, based on this [article](https://medium.com/@clever.tech.memes/comparing-similarity-of-two-datasets-using-pca-a-technical-review-of-principal-component-analysis-94e528e4b191)

In [None]:
# PCA preparation: min-max scale each data set independently.
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
pca = PCA(n_components=2)  # fitted in the next cell

# The single scaler is refit on every frame, so each data set is scaled to its
# own column minima/maxima.
# NOTE(review): .iloc[:-1] drops the final ROW of each frame, not a column —
# confirm that this is what was intended.
real_slices_scaled = scaler.fit_transform(real_slices.iloc[:-1])
synthetic_df_4_scaled = scaler.fit_transform(synthetic_df_4.iloc[:-1])
synthetic_df_5_scaled = scaler.fit_transform(synthetic_df_5.iloc[:-1])

In [None]:
# Feature-by-feature covariance matrix of each scaled data set
# (rowvar=False: columns are the variables, rows are the observations).
real_slices_covar, synthetic_df_4_covar, synthetic_df_5_covar = (
    np.cov(scaled_values, rowvar=False)
    for scaled_values in (
        real_slices_scaled,
        synthetic_df_4_scaled,
        synthetic_df_5_scaled,
    )
)

# The shared PCA object is refit on each covariance matrix in turn and that
# same matrix is then projected onto its two leading components.
pca_real_slices, pca_synthetic_df_4, pca_synthetic_df_5 = (
    pca.fit(covar).transform(covar)
    for covar in (real_slices_covar, synthetic_df_4_covar, synthetic_df_5_covar)
)

In [None]:
def components_to_corr(pca_array1, pca_array2):
    """Correlate two PCA projections after stacking their components.

    Each input is a 2-D array-like with one column per principal component.
    The columns of each input are stacked end to end (component 1, then
    component 2, ...) into a single vector, and the Pearson correlation
    between the two stacked vectors is returned.

    Implemented with numpy only: the previous version relied on
    ``Series.append``, which was removed in pandas 2.0.

    Args:
        pca_array1: 2-D array-like of shape (n_rows, n_components).
        pca_array2: 2-D array-like with the same shape as ``pca_array1``.

    Returns:
        The Pearson correlation coefficient of the two stacked vectors.

    Raises:
        ValueError: if either input is not 2-D or the shapes differ.
    """
    projections1 = np.asarray(pca_array1, dtype=float)
    projections2 = np.asarray(pca_array2, dtype=float)
    if projections1.ndim != 2 or projections2.ndim != 2:
        raise ValueError("both inputs must be 2-D (rows x components)")
    if projections1.shape != projections2.shape:
        raise ValueError(
            "inputs must have the same shape, got {} and {}".format(
                projections1.shape, projections2.shape
            )
        )

    # Column-major flattening places all of component 1 before component 2,
    # which matches vertically appending the component columns.
    stacked1 = projections1.ravel(order="F")
    stacked2 = projections2.ravel(order="F")
    return np.corrcoef(stacked1, stacked2)[0, 1]

In [None]:
# Report how strongly each synthetic data set's PCA projection correlates with
# the real data's projection; the two lines are separated by a blank line.
corr_real_vs_4 = components_to_corr(pca_real_slices, pca_synthetic_df_4)
corr_real_vs_5 = components_to_corr(pca_real_slices, pca_synthetic_df_5)

print(
    f"The correlation between real data and synthetic data 4 is {corr_real_vs_4}",
    f"The correlation between real data and synthetic data 5 is {corr_real_vs_5}",
    sep="\n\n",
)

Based on the PCA component correlations printed above, synthetic data 5 appears to be more strongly correlated with the real data than synthetic data 4.