In [8]:
import pandas as pd
from ctgan import CTGAN
from sklearn.preprocessing import StandardScaler
from scipy.stats import ks_2samp
import numpy as np

# Load the dataset
data = pd.read_csv('diabetes.csv')

# Separate features and target
features = data.drop(columns=['Outcome'])
target = data['Outcome']

# Normalize the features
scaler = StandardScaler()
normalized_features = scaler.fit_transform(features)

# Combine normalized features with target for CTGAN
normalized_data = pd.DataFrame(normalized_features, columns=features.columns)
normalized_data['Outcome'] = target.values

# Initialize and train CTGAN
ctgan = CTGAN()
ctgan.fit(normalized_data, epochs=1000)

# Sample synthetic data
num_samples = len(data)
synthetic_data = ctgan.sample(10000)

# Inverse transform the normalized features
synthetic_features = synthetic_data.drop(columns=['Outcome'])
synthetic_features = scaler.inverse_transform(synthetic_features)
synthetic_data[features.columns] = synthetic_features

# Convert synthetic data to the same format as original data
synthetic_data = synthetic_data[normalized_data.columns]

# Statistical comparison
def compare_statistics(original, synthetic):
    stats = {
        "mean": {},
        "std": {},
        "min": {},
        "max": {},
    }

    for column in original.columns:
        orig_stats = original[column].describe()
        synth_stats = synthetic[column].describe()

        stats["mean"][column] = (orig_stats["mean"], synth_stats["mean"])
        stats["std"][column] = (orig_stats["std"], synth_stats["std"])
        stats["min"][column] = (orig_stats["min"], synth_stats["min"])
        stats["max"][column] = (orig_stats["max"], synth_stats["max"])

    return stats

def percent_difference(orig_val, synth_val):
    return abs(orig_val - synth_val) / orig_val * 100

# Compute statistics
stats = compare_statistics(data, synthetic_data)

# Print results and percent differences
for stat_type, columns in stats.items():
    print(f"\n{stat_type.upper()}:")
    for column, values in columns.items():
        orig_val, synth_val = values
        diff = percent_difference(orig_val, synth_val)
        print(f"{column} - Original: {orig_val:.4f}, Synthetic: {synth_val:.4f}, Percent Difference: {diff:.2f}%")

print("\nKOLMOGOROV-SMIRNOV TEST:")
for column in features.columns:
    stat, p_value = ks_2samp(data[column], synthetic_data[column])
    print(f"{column} - K-S Statistic: {stat:.4f}, P-value: {p_value:.4f}")



MEAN:
Pregnancies - Original: 3.8451, Synthetic: 2.9332, Percent Difference: 23.72%
Glucose - Original: 120.8945, Synthetic: 107.6811, Percent Difference: 10.93%
BloodPressure - Original: 69.1055, Synthetic: 65.4201, Percent Difference: 5.33%
SkinThickness - Original: 20.5365, Synthetic: 15.0024, Percent Difference: 26.95%
Insulin - Original: 79.7995, Synthetic: 70.3147, Percent Difference: 11.89%
BMI - Original: 31.9926, Synthetic: 30.4713, Percent Difference: 4.76%
DiabetesPedigreeFunction - Original: 0.4719, Synthetic: 0.2709, Percent Difference: 42.60%
Age - Original: 33.2409, Synthetic: 39.2300, Percent Difference: 18.02%
Outcome - Original: 0.3490, Synthetic: 0.1824, Percent Difference: 47.73%

STD:
Pregnancies - Original: 3.3696, Synthetic: 3.5410, Percent Difference: 5.09%
Glucose - Original: 31.9726, Synthetic: 31.1526, Percent Difference: 2.56%
BloodPressure - Original: 19.3558, Synthetic: 20.8618, Percent Difference: 7.78%
SkinThickness - Original: 15.9522, Synthetic: 17.05

  return abs(orig_val - synth_val) / orig_val * 100
  return abs(orig_val - synth_val) / orig_val * 100


In [7]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from scipy.stats import ks_2samp

# Load the dataset
data = pd.read_csv('diabetes.csv')

# Separate features and target
features = data.drop(columns=['Outcome'])
target = data['Outcome']

# Normalize the features
scaler = StandardScaler()
normalized_features = scaler.fit_transform(features)

# Convert to torch tensors
X = torch.tensor(normalized_features, dtype=torch.float32)
y = torch.tensor(target.values, dtype=torch.float32).unsqueeze(1)

# Define the neural network
class Generator(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(Generator, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim)
        )

    def forward(self, x):
        return self.network(x)

input_dim = X.shape[1]
output_dim = X.shape[1]

generator = Generator(input_dim, output_dim)

# Define the optimizer and loss function
optimizer = optim.Adam(generator.parameters(), lr=0.001)
criterion = nn.MSELoss()

# Train the neural network
num_epochs = 1000
batch_size = 64

for epoch in range(num_epochs):
    permutation = torch.randperm(X.size()[0])
    for i in range(0, X.size()[0], batch_size):
        optimizer.zero_grad()

        indices = permutation[i:i + batch_size]
        batch_x, batch_y = X[indices], y[indices]

        outputs = generator(batch_x)
        loss = criterion(outputs, batch_x)
        loss.backward()
        optimizer.step()

# Sample synthetic data
with torch.no_grad():
    synthetic_data = generator(X).numpy()

# Inverse transform the normalized features
synthetic_features = scaler.inverse_transform(synthetic_data)
synthetic_df = pd.DataFrame(synthetic_features, columns=features.columns)
synthetic_df['Outcome'] = target.values

# Statistical comparison
def compare_statistics(original, synthetic):
    stats = {
        "mean": {},
        "std": {},
        "min": {},
        "max": {},
    }

    for column in original.columns:
        orig_stats = original[column].describe()
        synth_stats = synthetic[column].describe()

        stats["mean"][column] = (orig_stats["mean"], synth_stats["mean"])
        stats["std"][column] = (orig_stats["std"], synth_stats["std"])
        stats["min"][column] = (orig_stats["min"], synth_stats["min"])
        stats["max"][column] = (orig_stats["max"], synth_stats["max"])

    return stats

def percent_difference(orig_val, synth_val):
    return abs(orig_val - synth_val) / orig_val * 100

# Compute statistics
stats = compare_statistics(data, synthetic_df)

# Print results and percent differences
for stat_type, columns in stats.items():
    print(f"\n{stat_type.upper()}:")
    for column, values in columns.items():
        orig_val, synth_val = values
        diff = percent_difference(orig_val, synth_val)
        print(f"{column} - Original: {orig_val:.4f}, Synthetic: {synth_val:.4f}, Percent Difference: {diff:.2f}%")

# Kolmogorov-Smirnov Test
print("\nKOLMOGOROV-SMIRNOV TEST:")
for column in features.columns:
    stat, p_value = ks_2samp(data[column], synthetic_df[column])
    print(f"{column} - K-S Statistic: {stat:.4f}, P-value: {p_value:.4f}")



MEAN:
Pregnancies - Original: 3.8451, Synthetic: 3.8466, Percent Difference: 0.04%
Glucose - Original: 120.8945, Synthetic: 121.3848, Percent Difference: 0.41%
BloodPressure - Original: 69.1055, Synthetic: 69.0705, Percent Difference: 0.05%
SkinThickness - Original: 20.5365, Synthetic: 20.6119, Percent Difference: 0.37%
Insulin - Original: 79.7995, Synthetic: 80.6122, Percent Difference: 1.02%
BMI - Original: 31.9926, Synthetic: 31.9545, Percent Difference: 0.12%
DiabetesPedigreeFunction - Original: 0.4719, Synthetic: 0.4703, Percent Difference: 0.34%
Age - Original: 33.2409, Synthetic: 33.2985, Percent Difference: 0.17%
Outcome - Original: 0.3490, Synthetic: 0.3490, Percent Difference: 0.00%

STD:
Pregnancies - Original: 3.3696, Synthetic: 3.3737, Percent Difference: 0.12%
Glucose - Original: 31.9726, Synthetic: 31.9484, Percent Difference: 0.08%
BloodPressure - Original: 19.3558, Synthetic: 19.3886, Percent Difference: 0.17%
SkinThickness - Original: 15.9522, Synthetic: 15.9914, Per

  return abs(orig_val - synth_val) / orig_val * 100
  return abs(orig_val - synth_val) / orig_val * 100


In [10]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from scipy.stats import ks_2samp

# Load the dataset
data = pd.read_csv('diabetes.csv')

# Separate features and target
features = data.drop(columns=['Outcome'])
target = data['Outcome']

# Normalize the features
scaler = StandardScaler()
normalized_features = scaler.fit_transform(features)

# Convert to torch tensors
X = torch.tensor(normalized_features, dtype=torch.float32)
y = torch.tensor(target.values, dtype=torch.float32).unsqueeze(1)

# Define the Generator
class Generator(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(Generator, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim)
        )

    def forward(self, x):
        return self.network(x)

# Define the Discriminator
class Discriminator(nn.Module):
    def __init__(self, input_dim):
        super(Discriminator, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.network(x)

input_dim = X.shape[1]
latent_dim = 20  # Size of the noise vector

generator = Generator(latent_dim, input_dim)
discriminator = Discriminator(input_dim)

# Optimizers
optimizer_G = optim.Adam(generator.parameters(), lr=0.0002)
optimizer_D = optim.Adam(discriminator.parameters(), lr=0.0002)

# Loss function
adversarial_loss = nn.BCELoss()

# Training the GAN
num_epochs = 1000
batch_size = 64

for epoch in range(num_epochs):
    permutation = torch.randperm(X.size()[0])
    for i in range(0, X.size()[0], batch_size):
        # Train Discriminator
        optimizer_D.zero_grad()
        
        real_data = X[permutation[i:i + batch_size]]
        real_labels = torch.ones((real_data.size(0), 1))
        fake_labels = torch.zeros((real_data.size(0), 1))
        
        noise = torch.randn((real_data.size(0), latent_dim))
        fake_data = generator(noise)
        
        real_loss = adversarial_loss(discriminator(real_data), real_labels)
        fake_loss = adversarial_loss(discriminator(fake_data.detach()), fake_labels)
        d_loss = real_loss + fake_loss
        d_loss.backward()
        optimizer_D.step()

        # Train Generator
        optimizer_G.zero_grad()
        
        noise = torch.randn((real_data.size(0), latent_dim))
        fake_data = generator(noise)
        g_loss = adversarial_loss(discriminator(fake_data), real_labels)
        g_loss.backward()
        optimizer_G.step()

    if epoch % 100 == 0:
        print(f"Epoch [{epoch}/{num_epochs}] Discriminator Loss: {d_loss.item():.4f}, Generator Loss: {g_loss.item():.4f}")

# Sample synthetic data
num_samples = len(data)
noise = torch.randn((num_samples, latent_dim))
with torch.no_grad():
    synthetic_data = generator(noise).numpy()

# Inverse transform the normalized features
synthetic_features = scaler.inverse_transform(synthetic_data)
synthetic_df = pd.DataFrame(synthetic_features, columns=features.columns)
synthetic_df['Outcome'] = target.values

# Statistical comparison
def compare_statistics(original, synthetic):
    stats = {
        "mean": {},
        "std": {},
        "min": {},
        "max": {},
    }

    for column in original.columns:
        orig_stats = original[column].describe()
        synth_stats = synthetic[column].describe()

        stats["mean"][column] = (orig_stats["mean"], synth_stats["mean"])
        stats["std"][column] = (orig_stats["std"], synth_stats["std"])
        stats["min"][column] = (orig_stats["min"], synth_stats["min"])
        stats["max"][column] = (orig_stats["max"], synth_stats["max"])

    return stats

def percent_difference(orig_val, synth_val):
    return abs(orig_val - synth_val) / orig_val * 100

# Compute statistics
stats = compare_statistics(data, synthetic_df)

# Print results and percent differences
for stat_type, columns in stats.items():
    print(f"\n{stat_type.upper()}:")
    for column, values in columns.items():
        orig_val, synth_val = values
        diff = percent_difference(orig_val, synth_val)
        print(f"{column} - Original: {orig_val:.4f}, Synthetic: {synth_val:.4f}, Percent Difference: {diff:.2f}%")

# Kolmogorov-Smirnov Test
print("\nKOLMOGOROV-SMIRNOV TEST:")
for column in features.columns:
    stat, p_value = ks_2samp(data[column], synthetic_df[column])
    print(f"{column} - K-S Statistic: {stat:.4f}, P-value: {p_value:.4f}")


Epoch [0/1000] Discriminator Loss: 1.3351, Generator Loss: 0.6770
Epoch [100/1000] Discriminator Loss: 0.8685, Generator Loss: 0.9934
Epoch [200/1000] Discriminator Loss: 0.7726, Generator Loss: 1.6361
Epoch [300/1000] Discriminator Loss: 0.6548, Generator Loss: 1.4359
Epoch [400/1000] Discriminator Loss: 0.6083, Generator Loss: 1.8280
Epoch [500/1000] Discriminator Loss: 0.7406, Generator Loss: 1.4959
Epoch [600/1000] Discriminator Loss: 1.0855, Generator Loss: 1.5295
Epoch [700/1000] Discriminator Loss: 1.0166, Generator Loss: 1.3697
Epoch [800/1000] Discriminator Loss: 1.2088, Generator Loss: 0.9091
Epoch [900/1000] Discriminator Loss: 1.3762, Generator Loss: 0.9508

MEAN:
Pregnancies - Original: 3.8451, Synthetic: 3.7287, Percent Difference: 3.03%
Glucose - Original: 120.8945, Synthetic: 106.3909, Percent Difference: 12.00%
BloodPressure - Original: 69.1055, Synthetic: 67.7193, Percent Difference: 2.01%
SkinThickness - Original: 20.5365, Synthetic: 21.0729, Percent Difference: 2.61

  return abs(orig_val - synth_val) / orig_val * 100
  return abs(orig_val - synth_val) / orig_val * 100


### Arbitrary dataset script below

In [7]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from scipy.stats import ks_2samp

class GAN:
    def __init__(self, input_dim, latent_dim=20, lr=0.0002):
        self.latent_dim = latent_dim
        self.input_dim = input_dim
        
        self.generator = self.Generator(input_dim, latent_dim)
        self.discriminator = self.Discriminator(input_dim)
        
        self.optimizer_G = optim.Adam(self.generator.parameters(), lr=lr)
        self.optimizer_D = optim.Adam(self.discriminator.parameters(), lr=lr)
        self.adversarial_loss = nn.BCELoss()

    class Generator(nn.Module):
        def __init__(self, input_dim, latent_dim):
            super(GAN.Generator, self).__init__()
            self.network = nn.Sequential(
                nn.Linear(latent_dim, 128),
                nn.ReLU(),
                nn.Linear(128, 128),
                nn.ReLU(),
                nn.Linear(128, input_dim)
            )

        def forward(self, x):
            return self.network(x)

    class Discriminator(nn.Module):
        def __init__(self, input_dim):
            super(GAN.Discriminator, self).__init__()
            self.network = nn.Sequential(
                nn.Linear(input_dim, 128),
                nn.ReLU(),
                nn.Linear(128, 128),
                nn.ReLU(),
                nn.Linear(128, 1),
                nn.Sigmoid()
            )

        def forward(self, x):
            return self.network(x)

    def train(self, X, num_epochs=1000, batch_size=64):
        for epoch in range(num_epochs):
            permutation = torch.randperm(X.size()[0])
            for i in range(0, X.size()[0], batch_size):
                # Train Discriminator
                self.optimizer_D.zero_grad()
                
                real_data = X[permutation[i:i + batch_size]]
                real_labels = torch.ones((real_data.size(0), 1))
                fake_labels = torch.zeros((real_data.size(0), 1))
                
                noise = torch.randn((real_data.size(0), self.latent_dim))
                fake_data = self.generator(noise)
                
                real_loss = self.adversarial_loss(self.discriminator(real_data), real_labels)
                fake_loss = self.adversarial_loss(self.discriminator(fake_data.detach()), fake_labels)
                d_loss = real_loss + fake_loss
                d_loss.backward()
                self.optimizer_D.step()

                # Train Generator
                self.optimizer_G.zero_grad()
                
                noise = torch.randn((real_data.size(0), self.latent_dim))
                fake_data = self.generator(noise)
                g_loss = self.adversarial_loss(self.discriminator(fake_data), real_labels)
                g_loss.backward()
                self.optimizer_G.step()

            if epoch % 100 == 0:
                print(f"Epoch [{epoch}/{num_epochs}] Discriminator Loss: {d_loss.item():.4f}, Generator Loss: {g_loss.item():.4f}")

    def sample(self, num_samples):
        noise = torch.randn((num_samples, self.latent_dim))
        with torch.no_grad():
            return self.generator(noise).numpy()

def compare_statistics(original, synthetic):
    stats = {
        "mean": {},
        "std": {},
        "min": {},
        "max": {},
    }

    for column in original.columns:
        orig_stats = original[column].describe()
        synth_stats = synthetic[column].describe()

        stats["mean"][column] = (orig_stats["mean"], synth_stats["mean"])
        stats["std"][column] = (orig_stats["std"], synth_stats["std"])
        stats["min"][column] = (orig_stats["min"], synth_stats["min"])
        stats["max"][column] = (orig_stats["max"], synth_stats["max"])

    return stats

def percent_difference(orig_val, synth_val):
    return abs(orig_val - synth_val) / orig_val * 100

def perform_ks_test(original, synthetic):
    ks_results = {}
    for column in original.columns:
        stat, p_value = ks_2samp(original[column], synthetic[column])
        ks_results[column] = (stat, p_value)
    return ks_results

def produce_synthetic_data(data, num_epochs=1000, batch_size=64, latent_dim=20):
    # Separate features and target if 'Outcome' column exists
    if 'Outcome' in data.columns:
        features = data.drop(columns=['Outcome'])
        target = data['Outcome']
    else:
        features = data
        target = None

    # Normalize the features
    scaler = StandardScaler()
    normalized_features = scaler.fit_transform(features)

    # Convert to torch tensors
    X = torch.tensor(normalized_features, dtype=torch.float32)

    # Initialize and train GAN
    gan = GAN(input_dim=X.shape[1], latent_dim=latent_dim)
    gan.train(X, num_epochs=num_epochs, batch_size=batch_size)

    # Sample synthetic data
    synthetic_data = gan.sample(len(data))

    # Inverse transform the normalized features
    synthetic_features = scaler.inverse_transform(synthetic_data)
    synthetic_df = pd.DataFrame(synthetic_features, columns=features.columns)
    if target is not None:
        synthetic_df['Outcome'] = target.values

    # Statistical comparison
    stats = compare_statistics(data, synthetic_df)

    # Print results and percent differences
    for stat_type, columns in stats.items():
        print(f"\n{stat_type.upper()}:")
        for column, values in columns.items():
            orig_val, synth_val = values
            diff = percent_difference(orig_val, synth_val)
            print(f"{column} - Original: {orig_val:.4f}, Synthetic: {synth_val:.4f}, Percent Difference: {diff:.2f}%")

    # Kolmogorov-Smirnov Test
    print("\nKOLMOGOROV-SMIRNOV TEST:")
    ks_results = perform_ks_test(data, synthetic_df)
    for column, (stat, p_value) in ks_results.items():
        print(f"{column} - K-S Statistic: {stat:.4f}, P-value: {p_value:.4f}")

    return synthetic_df



### Runing on an arbitrary dataset below

In [8]:
data = pd.read_csv('diabetes.csv')
synthetic_data = produce_synthetic_data(data, num_epochs=1000, batch_size=64, latent_dim=20)



Epoch [0/1000] Discriminator Loss: 1.3730, Generator Loss: 0.6557
Epoch [100/1000] Discriminator Loss: 1.1334, Generator Loss: 0.6959
Epoch [200/1000] Discriminator Loss: 1.0905, Generator Loss: 0.9412
Epoch [300/1000] Discriminator Loss: 1.1402, Generator Loss: 1.0711
Epoch [400/1000] Discriminator Loss: 1.1135, Generator Loss: 1.0673
Epoch [500/1000] Discriminator Loss: 1.2576, Generator Loss: 0.9438
Epoch [600/1000] Discriminator Loss: 1.3259, Generator Loss: 0.6600
Epoch [700/1000] Discriminator Loss: 0.9721, Generator Loss: 1.0521
Epoch [800/1000] Discriminator Loss: 1.2036, Generator Loss: 0.8459
Epoch [900/1000] Discriminator Loss: 1.3629, Generator Loss: 0.9694

MEAN:
Pregnancies - Original: 3.8451, Synthetic: -2.0452, Percent Difference: 153.19%
Glucose - Original: 120.8945, Synthetic: 125.3675, Percent Difference: 3.70%
BloodPressure - Original: 69.1055, Synthetic: 94.9516, Percent Difference: 37.40%
SkinThickness - Original: 20.5365, Synthetic: 18.6549, Percent Difference: 9

  return abs(orig_val - synth_val) / orig_val * 100
  return abs(orig_val - synth_val) / orig_val * 100


# This is Preprocessing the data

In [1]:
from preprocess import drop_invalid_zeros
import pandas as pd

# Placeholder path for the CSV file
csv_path = 'diabetes.csv'

# Load the original dataset for detection of indices
original = pd.read_csv(csv_path)

original_set = drop_invalid_zeros(original)

print(len(original_set))


336


# THIS IS TESTING THE DATA FROM THE SCRIPT

In [2]:


from GAN_Architecture import train_GAN, Sample_Synthetic_Data, CalculateKS





# Train the GAN using the CSV file
generator = train_GAN(original_set)
# Sample synthetic data from the trained generator



Epoch [0/10000] | D Loss: 12.9826 | G Loss: 0.7516
Epoch [100/10000] | D Loss: 0.5902 | G Loss: 0.8320
Epoch [200/10000] | D Loss: 0.4013 | G Loss: 1.1778
Epoch [300/10000] | D Loss: 0.3037 | G Loss: 1.4885
Epoch [400/10000] | D Loss: 0.2969 | G Loss: 1.8221
Epoch [500/10000] | D Loss: 0.1334 | G Loss: 2.3834
Epoch [600/10000] | D Loss: 0.1278 | G Loss: 2.5706
Epoch [700/10000] | D Loss: 0.2035 | G Loss: 2.4562
Epoch [800/10000] | D Loss: 0.1488 | G Loss: 2.6397
Epoch [900/10000] | D Loss: 0.2139 | G Loss: 2.6655
Epoch [1000/10000] | D Loss: 0.3020 | G Loss: 2.9195
Epoch [1100/10000] | D Loss: 0.1039 | G Loss: 3.4978
Epoch [1200/10000] | D Loss: 0.4738 | G Loss: 2.7603
Epoch [1300/10000] | D Loss: 0.2658 | G Loss: 3.3404
Epoch [1400/10000] | D Loss: 1.0131 | G Loss: 2.4546
Epoch [1500/10000] | D Loss: 0.7587 | G Loss: 1.8092
Epoch [1600/10000] | D Loss: 0.5085 | G Loss: 2.4381
Epoch [1700/10000] | D Loss: 0.4758 | G Loss: 2.7024
Epoch [1800/10000] | D Loss: 0.3856 | G Loss: 3.0196
Epoc

In [6]:
num_samples = 336  # Specify the number of samples you want to generate

latent_dim = generator.model[0].in_features  # Extract latent dimension from generator
synthetic_data = Sample_Synthetic_Data(generator, num_samples, latent_dim)

In [7]:
synthetic_set = pd.DataFrame(synthetic_data, columns=original_set.columns)

# Calculate KS test p-values
p_values = CalculateKS(original_set, synthetic_set)

# Print the p-values for verification
print("Kolmogorov-Smirnov Test p-values:")
print(p_values)

Kolmogorov-Smirnov Test p-values:
{'Pregnancies': 0.00023951249463972682, 'Glucose': 9.60069543747106e-06, 'BloodPressure': 0.00012232462921451157, 'SkinThickness': 0.027126269279608848, 'Insulin': 0.2671047678553526, 'BMI': 1.295924993150936e-06, 'DiabetesPedigreeFunction': 1.8168377029864056e-18, 'Age': 3.8081723854136984e-08, 'Outcome': 1.5767867419856404e-23}


In [8]:
import pandas as pd
from IPython.display import HTML



# Function to render a scrollable DataFrame
def display_scrollable_dataframe(df, max_height=400):
    """
    Display a scrollable pandas DataFrame in a Jupyter Notebook.
    
    Parameters:
    df (pd.DataFrame): The DataFrame to display.
    max_height (int): The maximum height of the scrollable area in pixels.
    """
    style = f"""
    <style>
    .scrollable-dataframe {{
        max-height: {max_height}px;
        overflow-y: scroll;
        display: inline-block;
    }}
    </style>
    """
    
    html = df.to_html(classes='scrollable-dataframe')
    display(HTML(style + html))

# Display the DataFrame
display_scrollable_dataframe(synthetic_set, max_height=400)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1.250329,68.82074,61.731644,11.789831,41.212143,19.330612,0.53929,17.053885,0.058614
1,0.477909,61.336441,49.586208,19.857014,83.561493,23.396233,-0.363853,15.579238,0.026052
2,2.19509,72.292831,65.834343,14.794847,98.234734,27.121115,-1.212707,21.766598,-0.420209
3,5.66298,135.091766,75.773361,20.288935,115.174294,30.890062,1.291963,40.254356,1.101456
4,3.870366,170.670837,84.659081,21.989534,80.119011,35.224125,-1.665321,23.778173,-0.171568
5,2.241613,119.559319,64.428001,17.083977,52.647579,23.734818,-0.140029,21.755125,0.226905
6,5.402974,163.710907,95.123215,43.26302,238.229187,48.214119,0.64773,42.374786,0.908738
7,-2.036818,52.538963,26.917259,16.670734,95.075562,16.223255,-0.572568,16.587057,-0.821022
8,4.623677,105.939713,87.503403,23.492962,73.607071,30.3876,0.714781,27.996876,-1.448582
9,4.946786,160.340652,86.848213,49.231419,158.550644,49.478062,1.117576,32.578537,0.532403


## Sampling for values only in acceptable range

In [9]:
from GAN_Architecture import selective_sample

num_samples = 336  # Specify the number of samples you want to generate

# Assuming the generator is already trained and available
latent_dim = generator.model[0].in_features  # Extract latent dimension from generator
selected_synthetic_data = selective_sample(generator, num_samples, latent_dim, original_set)

In [10]:
# Generate the synthetic data
selected_synthetic_data = selective_sample(generator, num_samples, latent_dim, original_set)

selected_synthetic_data = pd.DataFrame(selected_synthetic_data, columns=original_set.columns)
# Calculate KS test p-values
p_values = CalculateKS(original_set, selected_synthetic_data)

# Convert the p-values dictionary to a DataFrame for columnar representation
p_values_df = pd.DataFrame(list(p_values.items()), columns=['Column', 'KS p-value'])

# Print the p-values for verification
print("Kolmogorov-Smirnov Test p-values:")
print(p_values_df)


Kolmogorov-Smirnov Test p-values:
                     Column    KS p-value
0               Pregnancies  3.320900e-04
1                   Glucose  9.548827e-08
2             BloodPressure  8.522462e-07
3             SkinThickness  1.702725e-02
4                   Insulin  1.043338e-02
5                       BMI  3.617746e-07
6  DiabetesPedigreeFunction  4.433584e-20
7                       Age  1.958394e-06
8                   Outcome  9.984241e-01


In [11]:
display_scrollable_dataframe(selected_synthetic_data, max_height=400)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,8.384535,197.0,110.0,28.38007,113.00824,44.4165,0.085,40.032551,1
1,4.752225,125.025986,110.0,23.977978,127.222603,34.824406,0.086246,33.010025,0
2,1.0,91.18853,38.674927,20.497398,123.59314,24.914305,0.929296,21.0,0
3,3.911759,104.496872,54.800423,16.660975,117.647751,24.577709,0.085,22.924479,1
4,2.683282,126.040909,73.057129,33.627453,235.458374,31.720219,1.39814,36.066174,0
5,7.363426,173.690109,108.924553,46.67461,297.131958,50.654675,0.085,66.695442,1
6,1.0,92.077301,88.000031,19.607395,46.182255,27.782207,1.499533,28.603529,1
7,3.413773,75.370209,65.927986,26.945938,107.735985,30.818577,0.085,36.689468,0
8,5.987989,197.0,87.966812,30.750233,369.308472,40.164925,0.46822,29.952574,1
9,1.0,69.479416,56.066181,7.0,62.453495,18.2,1.04859,28.549755,0


### Utility function

In [1]:
import pandas as pd
from IPython.display import HTML



# Function to render a scrollable DataFrame
def display_scrollable_dataframe(df, max_height=400):
    """
    Display a scrollable pandas DataFrame in a Jupyter Notebook.
    
    Parameters:
    df (pd.DataFrame): The DataFrame to display.
    max_height (int): The maximum height of the scrollable area in pixels.
    """
    style = f"""
    <style>
    .scrollable-dataframe {{
        max-height: {max_height}px;
        overflow-y: scroll;
        display: inline-block;
    }}
    </style>
    """
    
    html = df.to_html(classes='scrollable-dataframe')
    display(HTML(style + html))

### Testing preprocessing for determining batch_size, epochs, learning rate

In [2]:
from preprocess import clean_set, analyze_dataset
from GAN_Architecture import train_GAN, CalculateKS, selective_sample
import pandas as pd

# Placeholder path for the CSV file
csv_path = 'diabetes.csv'

# Load the original dataset for detection of indices
original = pd.read_csv(csv_path)

original_set = clean_set(original)

print(len(original_set))

epochs, lr, batch_size, beta1 = analyze_dataset(original_set)

#
#
#
# Train the GAN using the CSV file
generator = train_GAN(original_set, epochs, batch_size, lr, beta1)





336
Epoch [0/1000] | D Loss: 1.4161 | G Loss: 0.6359
Epoch [100/1000] | D Loss: 0.2035 | G Loss: 2.7809
Epoch [200/1000] | D Loss: 0.4453 | G Loss: 2.3810
Epoch [300/1000] | D Loss: 1.5776 | G Loss: 1.8450
Epoch [400/1000] | D Loss: 1.4648 | G Loss: 1.9553
Epoch [500/1000] | D Loss: 1.2432 | G Loss: 0.9560
Epoch [600/1000] | D Loss: 1.2075 | G Loss: 1.1195
Epoch [700/1000] | D Loss: 1.3278 | G Loss: 0.5104
Epoch [800/1000] | D Loss: 1.3888 | G Loss: 0.5090
Epoch [900/1000] | D Loss: 1.3937 | G Loss: 0.8825


In [4]:
num_samples = 336  # Specify the number of samples you want to generate

# Assuming the generator is already trained and available
latent_dim = generator.model[0].in_features  # Extract latent dimension from generator
selected_synthetic_data = selective_sample(generator, num_samples, latent_dim, original_set)

selected_synthetic_data = pd.DataFrame(selected_synthetic_data, columns=original_set.columns)
# Calculate KS test p-values
p_values = CalculateKS(original_set, selected_synthetic_data)

# Convert the p-values dictionary to a DataFrame for columnar representation
p_values_df = pd.DataFrame(list(p_values.items()), columns=['Column', 'KS p-value'])

# Print the p-values for verification
print("Kolmogorov-Smirnov Test p-values:")
print(p_values_df)

display_scrollable_dataframe(selected_synthetic_data, max_height=400)

Kolmogorov-Smirnov Test p-values:
                     Column    KS p-value
0               Pregnancies  5.569963e-07
1                   Glucose  2.671048e-01
2             BloodPressure  2.633089e-10
3             SkinThickness  3.643383e-03
4                   Insulin  6.240471e-03
5                       BMI  1.295925e-06
6  DiabetesPedigreeFunction  6.287346e-17
7                       Age  6.240471e-03
8                   Outcome  9.938941e-01


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1.0,97.097076,56.971806,28.329041,119.387413,31.683418,0.422339,21.0,0
1,1.798794,115.731453,55.903606,26.483402,93.777718,32.985443,0.118465,29.053768,1
2,3.347377,160.886505,53.492184,31.933296,171.083664,37.988464,0.085,40.426968,1
3,1.0,86.800713,40.859573,19.12438,130.463043,25.144203,1.187088,21.0,0
4,3.813842,155.39592,79.130852,11.99234,223.713135,37.14304,1.598097,39.879234,0
5,3.707112,133.224503,72.365829,41.667755,38.005917,42.119061,0.085,32.934891,0
6,1.0,102.187111,47.905964,14.664574,244.078506,27.600821,2.093288,21.0,0
7,6.84903,197.0,110.0,52.0,193.991852,57.3,0.085,68.041008,1
8,1.0,89.436447,60.090195,14.985869,95.484627,26.38471,1.26794,21.0,0
9,1.0,110.44957,42.030918,21.44841,101.273346,27.985964,0.085,21.439375,0


## Testing model class

In [1]:
# Import necessary modules
import pandas as pd
from model import Model

# Define the path to the CSV file and the number of samples to generate
csv_path = 'diabetes.csv'
num_samples = 336

# Instantiate the Model class
model = Model(csv_path, num_samples)

# Display the first few rows of the original dataset
print("\nOriginal Data")
display_scrollable_dataframe(model.original_set)

# Display the first few rows of the generated synthetic data
print("\nSelected Synthetic Data:")
display_scrollable_dataframe(model.selected_synthetic_data)

# Display the KS test p-values
print("\nKolmogorov-Smirnov Test p-values:")
print(model.p_values_df)


Epoch [0/5000] | D Loss: 0.9331 | G Loss: -0.3300


KeyboardInterrupt: 

# Testing 7/15

# Working Model

#### 5000 EPOCHS, Lambda_GP = 15, Batch size = 64, 7*D Neurons per hidden layer with decaying by half for each successive layer, math.ceil(Log(D)) number of layers, learning rate = 0.000025, and lr_gen = lr_discrim

In [1]:
import pandas as pd
from IPython.display import HTML



# Function to render a scrollable DataFrame
def display_scrollable_dataframe(df, max_height=400):
    """
    Display a scrollable pandas DataFrame in a Jupyter Notebook.
    
    Parameters:
    df (pd.DataFrame): The DataFrame to display.
    max_height (int): The maximum height of the scrollable area in pixels.
    """
    style = f"""
    <style>
    .scrollable-dataframe {{
        max-height: {max_height}px;
        overflow-y: scroll;
        display: inline-block;
    }}
    </style>
    """
    
    html = df.to_html(classes='scrollable-dataframe')
    display(HTML(style + html))



# Import necessary modules
import pandas as pd
from model import Model

# Define the path to the CSV file and the number of samples to generate
csv_path = 'diabetes.csv'
num_samples = 336

# Instantiate the Model class
model = Model(csv_path, num_samples)

# Display the first few rows of the original dataset
# print("\nOriginal Data")
# display_scrollable_dataframe(model.original_set)

# Display the first few rows of the generated synthetic data
print("\nSelected Synthetic Data:")
display_scrollable_dataframe(model.selected_synthetic_data)

#print("\nNondiscriminatory Data")
#display_scrollable_dataframe(model.nondiscriminatory_data)

# Display the KS test p-values
print("\nKolmogorov-Smirnov Test p-values:")
print(model.p_values_df)

Epoch [0/5000] | D Loss: 13.8993 | G Loss: -0.0029
Epoch [100/5000] | D Loss: 9.5345 | G Loss: 0.0652
Epoch [200/5000] | D Loss: 3.5494 | G Loss: 0.1266
Epoch [300/5000] | D Loss: 1.7108 | G Loss: 0.0591
Epoch [400/5000] | D Loss: 0.8890 | G Loss: -0.0875
Epoch [500/5000] | D Loss: 0.2438 | G Loss: -0.1008
Epoch [600/5000] | D Loss: 0.1858 | G Loss: -0.1764
Epoch [700/5000] | D Loss: -0.2721 | G Loss: -0.0219
Epoch [800/5000] | D Loss: -0.3758 | G Loss: 0.0985
Epoch [900/5000] | D Loss: -0.2663 | G Loss: 0.3389
Epoch [1000/5000] | D Loss: -0.5845 | G Loss: 0.4227
Epoch [1100/5000] | D Loss: -0.5759 | G Loss: 0.4160
Epoch [1200/5000] | D Loss: -0.2859 | G Loss: 0.2670
Epoch [1300/5000] | D Loss: -0.2590 | G Loss: 0.0965
Epoch [1400/5000] | D Loss: -0.2381 | G Loss: -0.0912
Epoch [1500/5000] | D Loss: -0.0532 | G Loss: -0.2517
Epoch [1600/5000] | D Loss: -0.1087 | G Loss: -0.2917
Epoch [1700/5000] | D Loss: -0.0524 | G Loss: -0.1771
Epoch [1800/5000] | D Loss: -0.0509 | G Loss: 0.1572
Ep

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,5.38399,113.224014,76.428177,33.856052,148.620102,37.946587,0.181987,27.451103,0.340843
1,4.265149,117.952072,81.904144,36.326332,157.204758,40.59341,0.55734,28.0634,0.329855
2,3.41339,144.008087,73.449753,22.901825,157.508575,36.400711,0.673555,23.752756,0.231127
3,3.386013,70.643456,74.859398,31.940657,79.400345,29.471792,0.276708,37.872185,0.139543
4,6.705456,141.749725,83.501694,27.708437,332.347961,40.899006,0.360172,32.105625,0.541917
5,6.536658,85.257233,72.672394,33.695011,239.285583,34.979641,0.372437,35.786671,0.489751
6,3.225298,130.621902,69.450768,24.078619,120.598282,42.922253,0.379892,30.078817,0.131557
7,3.157681,124.556908,79.080872,24.498192,162.339233,35.96468,0.643227,27.380407,0.167883
8,6.317359,118.662491,72.282204,32.242031,274.008881,36.577255,0.597698,34.268974,0.545457
9,4.658329,72.027969,79.681946,31.434746,81.753281,33.925114,0.176896,30.038975,0.220343



Kolmogorov-Smirnov Test p-values:
                     Column KS p-value
0               Pregnancies   0.000000
1                   Glucose   0.000853
2             BloodPressure   0.000000
3             SkinThickness   0.002759
4                   Insulin   0.000000
5                       BMI   0.000000
6  DiabetesPedigreeFunction   0.006240
7                       Age   0.000000
8                   Outcome   0.000000


# Testing 7/18 And NEWER

In [1]:
import pandas as pd
from IPython.display import HTML



# Function to render a scrollable DataFrame
def display_scrollable_dataframe(df, max_height=400):
    """
    Display a scrollable pandas DataFrame in a Jupyter Notebook.
    
    Parameters:
    df (pd.DataFrame): The DataFrame to display.
    max_height (int): The maximum height of the scrollable area in pixels.
    """
    style = f"""
    <style>
    .scrollable-dataframe {{
        max-height: {max_height}px;
        overflow-y: scroll;
        display: inline-block;
    }}
    </style>
    """
    
    html = df.to_html(classes='scrollable-dataframe')
    display(HTML(style + html))



# Import necessary modules
import pandas as pd
from model import Model

# Define the path to the CSV file and the number of samples to generate
csv_path = 'diabetes.csv'
num_samples = 336

# Instantiate the Model class
model = Model(csv_path, num_samples)

# Display the first few rows of the original dataset
# print("\nOriginal Data")
# display_scrollable_dataframe(model.original_set)

# Display the first few rows of the generated synthetic data
print("\nSelected Synthetic Data:")
display_scrollable_dataframe(model.selected_synthetic_data)

#print("\nNondiscriminatory Data")
#display_scrollable_dataframe(model.nondiscriminatory_data)

# Display the KS test p-values
print("\nKolmogorov-Smirnov Test p-values:")
print(model.p_values_df)

Epoch [0/1000] | D Loss: 0.0885 | G Loss: -0.1719 | MSE Loss: 0.2353 | Feature Loss: 0.0001
Epoch [100/1000] | D Loss: -0.6973 | G Loss: -0.3751 | MSE Loss: 0.1626 | Feature Loss: 0.0615
Epoch [200/1000] | D Loss: -0.4195 | G Loss: -0.8679 | MSE Loss: 0.1055 | Feature Loss: 0.0270
Epoch [300/1000] | D Loss: -0.3112 | G Loss: -0.9153 | MSE Loss: 0.0850 | Feature Loss: 0.0182
Epoch [400/1000] | D Loss: -0.3421 | G Loss: -0.8837 | MSE Loss: 0.0892 | Feature Loss: 0.0240
Epoch [500/1000] | D Loss: -0.2152 | G Loss: -0.8512 | MSE Loss: 0.0701 | Feature Loss: 0.0151
Epoch [600/1000] | D Loss: -0.2939 | G Loss: -0.8585 | MSE Loss: 0.0775 | Feature Loss: 0.0199
Epoch [700/1000] | D Loss: -0.2033 | G Loss: -0.8099 | MSE Loss: 0.0766 | Feature Loss: 0.0182
Epoch [800/1000] | D Loss: -0.2000 | G Loss: -0.8587 | MSE Loss: 0.0769 | Feature Loss: 0.0177
Epoch [900/1000] | D Loss: -0.2767 | G Loss: -0.8240 | MSE Loss: 0.0592 | Feature Loss: 0.0105

Selected Synthetic Data:


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,4.035756,123.375465,56.770824,24.914425,205.554733,29.987101,0.232789,31.347157,0.147974
1,3.153913,113.78978,59.597752,22.502003,140.856461,29.969971,0.83301,27.122236,0.385938
2,3.496922,129.357208,74.094894,30.422983,157.00386,32.817875,0.598945,29.451843,0.370638
3,3.097616,118.766914,75.978165,36.44302,120.571526,31.666647,0.590473,32.512779,0.392437
4,3.982127,160.114212,57.107246,31.237049,164.790466,35.560947,0.559877,27.063725,0.110357
5,3.676184,159.271332,86.360992,38.315941,235.510803,33.123463,0.439149,29.779068,0.315727
6,3.297296,124.421188,74.827385,32.611618,173.226334,31.55299,0.287052,33.476509,0.286576
7,2.97872,97.779175,46.329613,25.949476,86.423363,29.88357,0.43741,28.148348,0.083589
8,4.189851,151.560791,70.841606,25.571768,163.850662,33.059879,0.709554,27.79356,0.196043
9,3.57862,134.887024,82.884987,33.60239,265.859863,35.262115,1.021558,26.528013,0.567977



Kolmogorov-Smirnov Test p-values:
                     Column KS p-value
0               Pregnancies   0.000000
1                   Glucose   0.052137
2             BloodPressure   0.003643
3             SkinThickness   0.000007
4                   Insulin   0.000000
5                       BMI   0.000020
6  DiabetesPedigreeFunction   0.358503
7                       Age   0.000000
8                   Outcome   0.000000


# Testing 7/23 onwards

In [1]:
import pandas as pd
from IPython.display import HTML



# Function to render a scrollable DataFrame
def display_scrollable_dataframe(df, max_height=400):
    """
    Display a scrollable pandas DataFrame in a Jupyter Notebook.
    
    Parameters:
    df (pd.DataFrame): The DataFrame to display.
    max_height (int): The maximum height of the scrollable area in pixels.
    """
    style = f"""
    <style>
    .scrollable-dataframe {{
        max-height: {max_height}px;
        overflow-y: scroll;
        display: inline-block;
    }}
    </style>
    """
    
    html = df.to_html(classes='scrollable-dataframe')
    display(HTML(style + html))



# Import necessary modules
import pandas as pd
from model import Model

# Define the path to the CSV file and the number of samples to generate
csv_path = 'diabetes.csv'
num_samples = 336

# Instantiate the Model class
model = Model(csv_path, num_samples)

# Display the first few rows of the original dataset
# print("\nOriginal Data")
# display_scrollable_dataframe(model.original_set)

# Display the first few rows of the generated synthetic data
print("\nSelected Synthetic Data:")
display_scrollable_dataframe(model.selected_synthetic_data)

#print("\nNondiscriminatory Data")
#display_scrollable_dataframe(model.nondiscriminatory_data)

# Display the KS test p-values
print("\nKolmogorov-Smirnov Test p-values:")
print(model.p_values_df)

Epoch [0/1500] | D Loss: 1.4923 | G Loss: 0.6251 | MSE Loss: 0.2268 | Feature Loss: 0.0012
Epoch [100/1500] | D Loss: 1.3831 | G Loss: 0.7281 | MSE Loss: 0.1157 | Feature Loss: 0.0120
Epoch [200/1500] | D Loss: 1.4004 | G Loss: 0.7139 | MSE Loss: 0.1093 | Feature Loss: 0.0246
Epoch [300/1500] | D Loss: 1.3708 | G Loss: 0.7179 | MSE Loss: 0.0793 | Feature Loss: 0.0212
Epoch [400/1500] | D Loss: 1.3871 | G Loss: 0.7220 | MSE Loss: 0.0759 | Feature Loss: 0.0195
Epoch [500/1500] | D Loss: 1.3715 | G Loss: 0.7295 | MSE Loss: 0.0703 | Feature Loss: 0.0154
Epoch [600/1500] | D Loss: 1.3713 | G Loss: 0.7239 | MSE Loss: 0.0763 | Feature Loss: 0.0196
Epoch [700/1500] | D Loss: 1.3882 | G Loss: 0.7213 | MSE Loss: 0.0898 | Feature Loss: 0.0181
Epoch [800/1500] | D Loss: 1.4041 | G Loss: 0.7261 | MSE Loss: 0.0590 | Feature Loss: 0.0101
Epoch [900/1500] | D Loss: 1.3820 | G Loss: 0.7220 | MSE Loss: 0.0789 | Feature Loss: 0.0192
Epoch [1000/1500] | D Loss: 1.3956 | G Loss: 0.7251 | MSE Loss: 0.0639 |

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,2.546964,140.994507,72.539658,36.558681,137.020096,38.389923,0.360606,40.074245,0.272728
1,3.846745,124.790154,63.012619,32.140671,131.646317,37.13464,0.46757,30.280731,0.170809
2,4.482731,151.203552,59.375942,36.610813,131.646591,36.628563,0.290154,31.362892,0.194892
3,2.324383,141.128937,71.966354,35.590141,169.300797,37.808971,0.405876,40.417358,0.364904
4,4.421372,129.167801,43.815521,35.242897,123.143013,33.469303,0.253956,29.851267,0.076725
5,3.080114,190.563675,101.698601,40.698269,83.922264,43.085072,0.247314,32.529934,0.665363
6,2.629597,117.797768,51.836666,33.012672,155.803329,37.97657,0.264896,31.788036,0.325713
7,2.842696,95.382401,62.197754,33.641071,150.015076,34.207611,0.401197,36.604103,0.285741
8,3.941001,121.536171,71.871033,34.891541,89.927803,35.943295,0.418688,31.648235,0.096868
9,4.321942,115.195473,59.834927,27.819958,144.459549,35.756344,0.540314,33.573547,0.206945



Kolmogorov-Smirnov Test p-values:
                     Column KS p-value
0               Pregnancies   0.000000
1                   Glucose   0.000001
2             BloodPressure   0.008093
3             SkinThickness   0.000000
4                   Insulin   0.000000
5                       BMI   0.000000
6  DiabetesPedigreeFunction   0.000000
7                       Age   0.000000
8                   Outcome   0.000000
