In [8]:
import pandas as pd
from ctgan import CTGAN
from sklearn.preprocessing import StandardScaler
from scipy.stats import ks_2samp
import numpy as np

# Load the dataset
data = pd.read_csv('diabetes.csv')

# Separate features and target
features = data.drop(columns=['Outcome'])
target = data['Outcome']

# Normalize the features
scaler = StandardScaler()
normalized_features = scaler.fit_transform(features)

# Combine normalized features with target for CTGAN
normalized_data = pd.DataFrame(normalized_features, columns=features.columns)
normalized_data['Outcome'] = target.values

# Initialize and train CTGAN
ctgan = CTGAN()
ctgan.fit(normalized_data, epochs=1000)

# Sample synthetic data
num_samples = len(data)
synthetic_data = ctgan.sample(10000)

# Inverse transform the normalized features
synthetic_features = synthetic_data.drop(columns=['Outcome'])
synthetic_features = scaler.inverse_transform(synthetic_features)
synthetic_data[features.columns] = synthetic_features

# Convert synthetic data to the same format as original data
synthetic_data = synthetic_data[normalized_data.columns]

# Statistical comparison
def compare_statistics(original, synthetic):
    stats = {
        "mean": {},
        "std": {},
        "min": {},
        "max": {},
    }

    for column in original.columns:
        orig_stats = original[column].describe()
        synth_stats = synthetic[column].describe()

        stats["mean"][column] = (orig_stats["mean"], synth_stats["mean"])
        stats["std"][column] = (orig_stats["std"], synth_stats["std"])
        stats["min"][column] = (orig_stats["min"], synth_stats["min"])
        stats["max"][column] = (orig_stats["max"], synth_stats["max"])

    return stats

def percent_difference(orig_val, synth_val):
    return abs(orig_val - synth_val) / orig_val * 100

# Compute statistics
stats = compare_statistics(data, synthetic_data)

# Print results and percent differences
for stat_type, columns in stats.items():
    print(f"\n{stat_type.upper()}:")
    for column, values in columns.items():
        orig_val, synth_val = values
        diff = percent_difference(orig_val, synth_val)
        print(f"{column} - Original: {orig_val:.4f}, Synthetic: {synth_val:.4f}, Percent Difference: {diff:.2f}%")

print("\nKOLMOGOROV-SMIRNOV TEST:")
for column in features.columns:
    stat, p_value = ks_2samp(data[column], synthetic_data[column])
    print(f"{column} - K-S Statistic: {stat:.4f}, P-value: {p_value:.4f}")



MEAN:
Pregnancies - Original: 3.8451, Synthetic: 2.9332, Percent Difference: 23.72%
Glucose - Original: 120.8945, Synthetic: 107.6811, Percent Difference: 10.93%
BloodPressure - Original: 69.1055, Synthetic: 65.4201, Percent Difference: 5.33%
SkinThickness - Original: 20.5365, Synthetic: 15.0024, Percent Difference: 26.95%
Insulin - Original: 79.7995, Synthetic: 70.3147, Percent Difference: 11.89%
BMI - Original: 31.9926, Synthetic: 30.4713, Percent Difference: 4.76%
DiabetesPedigreeFunction - Original: 0.4719, Synthetic: 0.2709, Percent Difference: 42.60%
Age - Original: 33.2409, Synthetic: 39.2300, Percent Difference: 18.02%
Outcome - Original: 0.3490, Synthetic: 0.1824, Percent Difference: 47.73%

STD:
Pregnancies - Original: 3.3696, Synthetic: 3.5410, Percent Difference: 5.09%
Glucose - Original: 31.9726, Synthetic: 31.1526, Percent Difference: 2.56%
BloodPressure - Original: 19.3558, Synthetic: 20.8618, Percent Difference: 7.78%
SkinThickness - Original: 15.9522, Synthetic: 17.05

  return abs(orig_val - synth_val) / orig_val * 100
  return abs(orig_val - synth_val) / orig_val * 100


In [7]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from scipy.stats import ks_2samp

# Load the dataset
data = pd.read_csv('diabetes.csv')

# Separate features and target
features = data.drop(columns=['Outcome'])
target = data['Outcome']

# Normalize the features
scaler = StandardScaler()
normalized_features = scaler.fit_transform(features)

# Convert to torch tensors
X = torch.tensor(normalized_features, dtype=torch.float32)
y = torch.tensor(target.values, dtype=torch.float32).unsqueeze(1)

# Define the neural network
class Generator(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(Generator, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim)
        )

    def forward(self, x):
        return self.network(x)

input_dim = X.shape[1]
output_dim = X.shape[1]

generator = Generator(input_dim, output_dim)

# Define the optimizer and loss function
optimizer = optim.Adam(generator.parameters(), lr=0.001)
criterion = nn.MSELoss()

# Train the neural network
num_epochs = 1000
batch_size = 64

for epoch in range(num_epochs):
    permutation = torch.randperm(X.size()[0])
    for i in range(0, X.size()[0], batch_size):
        optimizer.zero_grad()

        indices = permutation[i:i + batch_size]
        batch_x, batch_y = X[indices], y[indices]

        outputs = generator(batch_x)
        loss = criterion(outputs, batch_x)
        loss.backward()
        optimizer.step()

# Sample synthetic data
with torch.no_grad():
    synthetic_data = generator(X).numpy()

# Inverse transform the normalized features
synthetic_features = scaler.inverse_transform(synthetic_data)
synthetic_df = pd.DataFrame(synthetic_features, columns=features.columns)
synthetic_df['Outcome'] = target.values

# Statistical comparison
def compare_statistics(original, synthetic):
    stats = {
        "mean": {},
        "std": {},
        "min": {},
        "max": {},
    }

    for column in original.columns:
        orig_stats = original[column].describe()
        synth_stats = synthetic[column].describe()

        stats["mean"][column] = (orig_stats["mean"], synth_stats["mean"])
        stats["std"][column] = (orig_stats["std"], synth_stats["std"])
        stats["min"][column] = (orig_stats["min"], synth_stats["min"])
        stats["max"][column] = (orig_stats["max"], synth_stats["max"])

    return stats

def percent_difference(orig_val, synth_val):
    return abs(orig_val - synth_val) / orig_val * 100

# Compute statistics
stats = compare_statistics(data, synthetic_df)

# Print results and percent differences
for stat_type, columns in stats.items():
    print(f"\n{stat_type.upper()}:")
    for column, values in columns.items():
        orig_val, synth_val = values
        diff = percent_difference(orig_val, synth_val)
        print(f"{column} - Original: {orig_val:.4f}, Synthetic: {synth_val:.4f}, Percent Difference: {diff:.2f}%")

# Kolmogorov-Smirnov Test
print("\nKOLMOGOROV-SMIRNOV TEST:")
for column in features.columns:
    stat, p_value = ks_2samp(data[column], synthetic_df[column])
    print(f"{column} - K-S Statistic: {stat:.4f}, P-value: {p_value:.4f}")



MEAN:
Pregnancies - Original: 3.8451, Synthetic: 3.8466, Percent Difference: 0.04%
Glucose - Original: 120.8945, Synthetic: 121.3848, Percent Difference: 0.41%
BloodPressure - Original: 69.1055, Synthetic: 69.0705, Percent Difference: 0.05%
SkinThickness - Original: 20.5365, Synthetic: 20.6119, Percent Difference: 0.37%
Insulin - Original: 79.7995, Synthetic: 80.6122, Percent Difference: 1.02%
BMI - Original: 31.9926, Synthetic: 31.9545, Percent Difference: 0.12%
DiabetesPedigreeFunction - Original: 0.4719, Synthetic: 0.4703, Percent Difference: 0.34%
Age - Original: 33.2409, Synthetic: 33.2985, Percent Difference: 0.17%
Outcome - Original: 0.3490, Synthetic: 0.3490, Percent Difference: 0.00%

STD:
Pregnancies - Original: 3.3696, Synthetic: 3.3737, Percent Difference: 0.12%
Glucose - Original: 31.9726, Synthetic: 31.9484, Percent Difference: 0.08%
BloodPressure - Original: 19.3558, Synthetic: 19.3886, Percent Difference: 0.17%
SkinThickness - Original: 15.9522, Synthetic: 15.9914, Per

  return abs(orig_val - synth_val) / orig_val * 100
  return abs(orig_val - synth_val) / orig_val * 100


In [10]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from scipy.stats import ks_2samp

# Load the dataset
data = pd.read_csv('diabetes.csv')

# Separate features and target
features = data.drop(columns=['Outcome'])
target = data['Outcome']

# Normalize the features
scaler = StandardScaler()
normalized_features = scaler.fit_transform(features)

# Convert to torch tensors
X = torch.tensor(normalized_features, dtype=torch.float32)
y = torch.tensor(target.values, dtype=torch.float32).unsqueeze(1)

# Define the Generator
class Generator(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(Generator, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim)
        )

    def forward(self, x):
        return self.network(x)

# Define the Discriminator
class Discriminator(nn.Module):
    def __init__(self, input_dim):
        super(Discriminator, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.network(x)

input_dim = X.shape[1]
latent_dim = 20  # Size of the noise vector

generator = Generator(latent_dim, input_dim)
discriminator = Discriminator(input_dim)

# Optimizers
optimizer_G = optim.Adam(generator.parameters(), lr=0.0002)
optimizer_D = optim.Adam(discriminator.parameters(), lr=0.0002)

# Loss function
adversarial_loss = nn.BCELoss()

# Training the GAN
num_epochs = 1000
batch_size = 64

for epoch in range(num_epochs):
    permutation = torch.randperm(X.size()[0])
    for i in range(0, X.size()[0], batch_size):
        # Train Discriminator
        optimizer_D.zero_grad()
        
        real_data = X[permutation[i:i + batch_size]]
        real_labels = torch.ones((real_data.size(0), 1))
        fake_labels = torch.zeros((real_data.size(0), 1))
        
        noise = torch.randn((real_data.size(0), latent_dim))
        fake_data = generator(noise)
        
        real_loss = adversarial_loss(discriminator(real_data), real_labels)
        fake_loss = adversarial_loss(discriminator(fake_data.detach()), fake_labels)
        d_loss = real_loss + fake_loss
        d_loss.backward()
        optimizer_D.step()

        # Train Generator
        optimizer_G.zero_grad()
        
        noise = torch.randn((real_data.size(0), latent_dim))
        fake_data = generator(noise)
        g_loss = adversarial_loss(discriminator(fake_data), real_labels)
        g_loss.backward()
        optimizer_G.step()

    if epoch % 100 == 0:
        print(f"Epoch [{epoch}/{num_epochs}] Discriminator Loss: {d_loss.item():.4f}, Generator Loss: {g_loss.item():.4f}")

# Sample synthetic data
num_samples = len(data)
noise = torch.randn((num_samples, latent_dim))
with torch.no_grad():
    synthetic_data = generator(noise).numpy()

# Inverse transform the normalized features
synthetic_features = scaler.inverse_transform(synthetic_data)
synthetic_df = pd.DataFrame(synthetic_features, columns=features.columns)
synthetic_df['Outcome'] = target.values

# Statistical comparison
def compare_statistics(original, synthetic):
    stats = {
        "mean": {},
        "std": {},
        "min": {},
        "max": {},
    }

    for column in original.columns:
        orig_stats = original[column].describe()
        synth_stats = synthetic[column].describe()

        stats["mean"][column] = (orig_stats["mean"], synth_stats["mean"])
        stats["std"][column] = (orig_stats["std"], synth_stats["std"])
        stats["min"][column] = (orig_stats["min"], synth_stats["min"])
        stats["max"][column] = (orig_stats["max"], synth_stats["max"])

    return stats

def percent_difference(orig_val, synth_val):
    return abs(orig_val - synth_val) / orig_val * 100

# Compute statistics
stats = compare_statistics(data, synthetic_df)

# Print results and percent differences
for stat_type, columns in stats.items():
    print(f"\n{stat_type.upper()}:")
    for column, values in columns.items():
        orig_val, synth_val = values
        diff = percent_difference(orig_val, synth_val)
        print(f"{column} - Original: {orig_val:.4f}, Synthetic: {synth_val:.4f}, Percent Difference: {diff:.2f}%")

# Kolmogorov-Smirnov Test
print("\nKOLMOGOROV-SMIRNOV TEST:")
for column in features.columns:
    stat, p_value = ks_2samp(data[column], synthetic_df[column])
    print(f"{column} - K-S Statistic: {stat:.4f}, P-value: {p_value:.4f}")


Epoch [0/1000] Discriminator Loss: 1.3351, Generator Loss: 0.6770
Epoch [100/1000] Discriminator Loss: 0.8685, Generator Loss: 0.9934
Epoch [200/1000] Discriminator Loss: 0.7726, Generator Loss: 1.6361
Epoch [300/1000] Discriminator Loss: 0.6548, Generator Loss: 1.4359
Epoch [400/1000] Discriminator Loss: 0.6083, Generator Loss: 1.8280
Epoch [500/1000] Discriminator Loss: 0.7406, Generator Loss: 1.4959
Epoch [600/1000] Discriminator Loss: 1.0855, Generator Loss: 1.5295
Epoch [700/1000] Discriminator Loss: 1.0166, Generator Loss: 1.3697
Epoch [800/1000] Discriminator Loss: 1.2088, Generator Loss: 0.9091
Epoch [900/1000] Discriminator Loss: 1.3762, Generator Loss: 0.9508

MEAN:
Pregnancies - Original: 3.8451, Synthetic: 3.7287, Percent Difference: 3.03%
Glucose - Original: 120.8945, Synthetic: 106.3909, Percent Difference: 12.00%
BloodPressure - Original: 69.1055, Synthetic: 67.7193, Percent Difference: 2.01%
SkinThickness - Original: 20.5365, Synthetic: 21.0729, Percent Difference: 2.61

  return abs(orig_val - synth_val) / orig_val * 100
  return abs(orig_val - synth_val) / orig_val * 100


### Arbitrary dataset script below

In [7]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from scipy.stats import ks_2samp

class GAN:
    def __init__(self, input_dim, latent_dim=20, lr=0.0002):
        self.latent_dim = latent_dim
        self.input_dim = input_dim
        
        self.generator = self.Generator(input_dim, latent_dim)
        self.discriminator = self.Discriminator(input_dim)
        
        self.optimizer_G = optim.Adam(self.generator.parameters(), lr=lr)
        self.optimizer_D = optim.Adam(self.discriminator.parameters(), lr=lr)
        self.adversarial_loss = nn.BCELoss()

    class Generator(nn.Module):
        def __init__(self, input_dim, latent_dim):
            super(GAN.Generator, self).__init__()
            self.network = nn.Sequential(
                nn.Linear(latent_dim, 128),
                nn.ReLU(),
                nn.Linear(128, 128),
                nn.ReLU(),
                nn.Linear(128, input_dim)
            )

        def forward(self, x):
            return self.network(x)

    class Discriminator(nn.Module):
        def __init__(self, input_dim):
            super(GAN.Discriminator, self).__init__()
            self.network = nn.Sequential(
                nn.Linear(input_dim, 128),
                nn.ReLU(),
                nn.Linear(128, 128),
                nn.ReLU(),
                nn.Linear(128, 1),
                nn.Sigmoid()
            )

        def forward(self, x):
            return self.network(x)

    def train(self, X, num_epochs=1000, batch_size=64):
        for epoch in range(num_epochs):
            permutation = torch.randperm(X.size()[0])
            for i in range(0, X.size()[0], batch_size):
                # Train Discriminator
                self.optimizer_D.zero_grad()
                
                real_data = X[permutation[i:i + batch_size]]
                real_labels = torch.ones((real_data.size(0), 1))
                fake_labels = torch.zeros((real_data.size(0), 1))
                
                noise = torch.randn((real_data.size(0), self.latent_dim))
                fake_data = self.generator(noise)
                
                real_loss = self.adversarial_loss(self.discriminator(real_data), real_labels)
                fake_loss = self.adversarial_loss(self.discriminator(fake_data.detach()), fake_labels)
                d_loss = real_loss + fake_loss
                d_loss.backward()
                self.optimizer_D.step()

                # Train Generator
                self.optimizer_G.zero_grad()
                
                noise = torch.randn((real_data.size(0), self.latent_dim))
                fake_data = self.generator(noise)
                g_loss = self.adversarial_loss(self.discriminator(fake_data), real_labels)
                g_loss.backward()
                self.optimizer_G.step()

            if epoch % 100 == 0:
                print(f"Epoch [{epoch}/{num_epochs}] Discriminator Loss: {d_loss.item():.4f}, Generator Loss: {g_loss.item():.4f}")

    def sample(self, num_samples):
        noise = torch.randn((num_samples, self.latent_dim))
        with torch.no_grad():
            return self.generator(noise).numpy()

def compare_statistics(original, synthetic):
    stats = {
        "mean": {},
        "std": {},
        "min": {},
        "max": {},
    }

    for column in original.columns:
        orig_stats = original[column].describe()
        synth_stats = synthetic[column].describe()

        stats["mean"][column] = (orig_stats["mean"], synth_stats["mean"])
        stats["std"][column] = (orig_stats["std"], synth_stats["std"])
        stats["min"][column] = (orig_stats["min"], synth_stats["min"])
        stats["max"][column] = (orig_stats["max"], synth_stats["max"])

    return stats

def percent_difference(orig_val, synth_val):
    return abs(orig_val - synth_val) / orig_val * 100

def perform_ks_test(original, synthetic):
    ks_results = {}
    for column in original.columns:
        stat, p_value = ks_2samp(original[column], synthetic[column])
        ks_results[column] = (stat, p_value)
    return ks_results

def produce_synthetic_data(data, num_epochs=1000, batch_size=64, latent_dim=20):
    # Separate features and target if 'Outcome' column exists
    if 'Outcome' in data.columns:
        features = data.drop(columns=['Outcome'])
        target = data['Outcome']
    else:
        features = data
        target = None

    # Normalize the features
    scaler = StandardScaler()
    normalized_features = scaler.fit_transform(features)

    # Convert to torch tensors
    X = torch.tensor(normalized_features, dtype=torch.float32)

    # Initialize and train GAN
    gan = GAN(input_dim=X.shape[1], latent_dim=latent_dim)
    gan.train(X, num_epochs=num_epochs, batch_size=batch_size)

    # Sample synthetic data
    synthetic_data = gan.sample(len(data))

    # Inverse transform the normalized features
    synthetic_features = scaler.inverse_transform(synthetic_data)
    synthetic_df = pd.DataFrame(synthetic_features, columns=features.columns)
    if target is not None:
        synthetic_df['Outcome'] = target.values

    # Statistical comparison
    stats = compare_statistics(data, synthetic_df)

    # Print results and percent differences
    for stat_type, columns in stats.items():
        print(f"\n{stat_type.upper()}:")
        for column, values in columns.items():
            orig_val, synth_val = values
            diff = percent_difference(orig_val, synth_val)
            print(f"{column} - Original: {orig_val:.4f}, Synthetic: {synth_val:.4f}, Percent Difference: {diff:.2f}%")

    # Kolmogorov-Smirnov Test
    print("\nKOLMOGOROV-SMIRNOV TEST:")
    ks_results = perform_ks_test(data, synthetic_df)
    for column, (stat, p_value) in ks_results.items():
        print(f"{column} - K-S Statistic: {stat:.4f}, P-value: {p_value:.4f}")

    return synthetic_df



### Runing on an arbitrary dataset below

In [8]:
data = pd.read_csv('diabetes.csv')
synthetic_data = produce_synthetic_data(data, num_epochs=1000, batch_size=64, latent_dim=20)



Epoch [0/1000] Discriminator Loss: 1.3730, Generator Loss: 0.6557
Epoch [100/1000] Discriminator Loss: 1.1334, Generator Loss: 0.6959
Epoch [200/1000] Discriminator Loss: 1.0905, Generator Loss: 0.9412
Epoch [300/1000] Discriminator Loss: 1.1402, Generator Loss: 1.0711
Epoch [400/1000] Discriminator Loss: 1.1135, Generator Loss: 1.0673
Epoch [500/1000] Discriminator Loss: 1.2576, Generator Loss: 0.9438
Epoch [600/1000] Discriminator Loss: 1.3259, Generator Loss: 0.6600
Epoch [700/1000] Discriminator Loss: 0.9721, Generator Loss: 1.0521
Epoch [800/1000] Discriminator Loss: 1.2036, Generator Loss: 0.8459
Epoch [900/1000] Discriminator Loss: 1.3629, Generator Loss: 0.9694

MEAN:
Pregnancies - Original: 3.8451, Synthetic: -2.0452, Percent Difference: 153.19%
Glucose - Original: 120.8945, Synthetic: 125.3675, Percent Difference: 3.70%
BloodPressure - Original: 69.1055, Synthetic: 94.9516, Percent Difference: 37.40%
SkinThickness - Original: 20.5365, Synthetic: 18.6549, Percent Difference: 9

  return abs(orig_val - synth_val) / orig_val * 100
  return abs(orig_val - synth_val) / orig_val * 100


# THIS IS TESTING THE DATA FROM THE SCRIPT

In [4]:

import pandas as pd
from GAN_Architecture import train_GAN, Sample_Synthetic_Data

# Placeholder path for the CSV file
csv_path = 'diabetes.csv'

# Train the GAN using the CSV file
generator = train_GAN(csv_path)

# Sample synthetic data from the trained generator
num_samples = 770  # Specify the number of samples you want to generate
latent_dim = generator.model[0].in_features  # Extract latent dimension from generator
synthetic_data = Sample_Synthetic_Data(generator, num_samples, latent_dim)

# Print the synthetic data for verification
print("Synthetic Data Samples:")
print(synthetic_data)


Epoch [0/10000] | D Loss: 2.9159 | G Loss: 0.7344
Epoch [100/10000] | D Loss: 0.4892 | G Loss: 0.9901
Epoch [200/10000] | D Loss: 0.2099 | G Loss: 1.8156
Epoch [300/10000] | D Loss: 0.1980 | G Loss: 2.2111
Epoch [400/10000] | D Loss: 0.2097 | G Loss: 2.3641
Epoch [500/10000] | D Loss: 0.1574 | G Loss: 3.3679
Epoch [600/10000] | D Loss: 0.5958 | G Loss: 2.3976
Epoch [700/10000] | D Loss: 0.4589 | G Loss: 2.5062
Epoch [800/10000] | D Loss: 0.1851 | G Loss: 2.9810
Epoch [900/10000] | D Loss: 0.5667 | G Loss: 2.5878
Epoch [1000/10000] | D Loss: 0.2720 | G Loss: 2.7033
Epoch [1100/10000] | D Loss: 0.4058 | G Loss: 1.9419
Epoch [1200/10000] | D Loss: 0.5325 | G Loss: 2.1189
Epoch [1300/10000] | D Loss: 0.6326 | G Loss: 1.5234
Epoch [1400/10000] | D Loss: 0.6277 | G Loss: 1.7014
Epoch [1500/10000] | D Loss: 0.6030 | G Loss: 1.7390
Epoch [1600/10000] | D Loss: 0.6916 | G Loss: 1.4433
Epoch [1700/10000] | D Loss: 0.8433 | G Loss: 1.2444
Epoch [1800/10000] | D Loss: 0.9056 | G Loss: 1.2478
Epoch

In [5]:
from GAN_Architecture import CalculateKS
import pandas as pd

csv_path = 'diabetes.csv'
# Load the original dataset for comparison
original_set = pd.read_csv(csv_path)

# Convert the synthetic data to a DataFrame with the same column names as the original dataset
synthetic_set = pd.DataFrame(synthetic_data, columns=original_set.columns)

# Calculate KS test p-values
p_values = CalculateKS(original_set, synthetic_set)

# Print the p-values for verification
print("Kolmogorov-Smirnov Test p-values:")
print(p_values)


Kolmogorov-Smirnov Test p-values:
{'Pregnancies': 2.400710997605451e-05, 'Glucose': 2.6827081809641225e-19, 'BloodPressure': 2.3956763779666205e-22, 'SkinThickness': 1.0294699434185567e-28, 'Insulin': 1.750334756421661e-110, 'BMI': 4.0969225213211506e-27, 'DiabetesPedigreeFunction': 1.5771831112367153e-37, 'Age': 8.396320472837098e-23, 'Outcome': 1.7087651952193262e-123}
