In [49]:
from IPython import get_ipython
from IPython.display import display
# %%
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from google.colab import files
# %%

In [50]:
# 1. Load and preprocess data
data = pd.read_csv('/content/Dr Ogunjubohun ML data.csv')

# Partition the columns by dtype: numeric columns are scaled, every other
# column is treated as categorical and one-hot encoded.
numerical_features = data.select_dtypes(include=['number']).columns
categorical_features = data.select_dtypes(exclude=['number']).columns

# Single-step pipelines, one per column family.
numerical_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))]
)

# Declaration order fixes the layout of the encoded matrix: scaled numeric
# columns first, one-hot columns after them.
column_plan = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_plan)

# The preprocessor is fitted on the complete table; the encoded rows are then
# split 80/20 with a fixed seed.
processed_data = preprocessor.fit_transform(data)
X_train, X_test = train_test_split(processed_data, test_size=0.2, random_state=42)

# %%


In [53]:
# 2. Define GAN Architecture
noise_dim = 100  # length of the latent noise vector fed to the generator

def make_generator_model():
    """Build the generator: latent noise -> one synthetic row in preprocessed space.

    The network is a three-layer MLP (128 -> 64 -> n_features) where
    n_features is the width of `processed_data`.

    Returns:
        tf.keras.Sequential taking inputs of shape (batch, noise_dim) and
        producing outputs of shape (batch, processed_data.shape[1]) with
        every value in (0, 1).
    """
    model = tf.keras.Sequential()
    model.add(layers.Dense(128, activation='relu', input_shape=(noise_dim,)))
    model.add(layers.Dense(64, activation='relu'))
    # The real rows are MinMax-scaled / one-hot encoded, i.e. bounded in
    # [0, 1]. A sigmoid output matches that range; relu is unbounded above
    # and its units can get stuck emitting exactly 0.
    model.add(layers.Dense(processed_data.shape[1], activation='sigmoid'))
    return model

def make_discriminator_model():
    """Build the discriminator: one preprocessed row -> a single sigmoid score.

    Returns:
        tf.keras.Sequential (64 -> 32 -> 1) whose input width equals the
        number of columns in `processed_data`.
    """
    n_features = processed_data.shape[1]
    return tf.keras.Sequential([
        layers.Dense(64, activation='relu', input_shape=(n_features,)),
        layers.Dense(32, activation='relu'),
        layers.Dense(1, activation='sigmoid'),
    ])

# Instantiate both networks once; the training and sampling cells use these
# module-level objects.
generator = make_generator_model()
discriminator = make_discriminator_model()
# %%

In [54]:
# 3. Compile and Train GAN
# Define optimizers (one per network so each keeps its own Adam state)
generator_optimizer = tf.keras.optimizers.Adam(1e-4)
discriminator_optimizer = tf.keras.optimizers.Adam(1e-4)

# Define loss function
# The discriminator's last layer already applies a sigmoid, so it emits
# probabilities rather than logits. from_logits must therefore be False;
# with True the loss would apply a second sigmoid on top of the first and
# distort the gradients for both networks.
cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=False)
def discriminator_loss(real_output, fake_output):
    """Sum of the discriminator's cross-entropy on real and on generated rows.

    Args:
        real_output: discriminator scores for real rows (target label 1).
        fake_output: discriminator scores for generated rows (target label 0).

    Returns:
        Scalar loss: cross_entropy(ones, real) + cross_entropy(zeros, fake).
    """
    loss_on_real = cross_entropy(tf.ones_like(real_output), real_output)
    loss_on_fake = cross_entropy(tf.zeros_like(fake_output), fake_output)
    return loss_on_real + loss_on_fake

def generator_loss(fake_output):
    """Cross-entropy of the discriminator's scores on generated rows against label 1.

    The generator is rewarded when the discriminator scores its samples as real.

    Args:
        fake_output: discriminator scores for generated rows.

    Returns:
        Scalar loss.
    """
    wanted_labels = tf.ones_like(fake_output)
    return cross_entropy(wanted_labels, fake_output)


# Define training step
BATCH_SIZE = 32  # rows per mini-batch
EPOCHS = 50  # full passes over X_train


@tf.function
def train_step(images):
    """Run one simultaneous generator/discriminator update on a batch of real rows.

    Args:
        images: batch of preprocessed real rows (the name is kept from the
            original signature); its second dimension must equal the
            discriminator's input width.

    Returns:
        Tuple (gen_loss, disc_loss) of scalar tensors for this step.
    """
    # Draw exactly as many fake rows as there are real ones, so the final
    # (possibly shorter) batch of an epoch is still balanced.
    noise = tf.random.normal([tf.shape(images)[0], noise_dim])

    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        generated_images = generator(noise, training=True)

        real_output = discriminator(images, training=True)
        fake_output = discriminator(generated_images, training=True)

        gen_loss = generator_loss(fake_output)
        disc_loss = discriminator_loss(real_output, fake_output)

    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
    gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables)

    generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))

    return gen_loss, disc_loss


def train(dataset, epochs):
    """Train the GAN for `epochs` passes over `dataset`.

    Mean losses are printed every 10th epoch and on the final epoch.

    Args:
        dataset: iterable of batches accepted by `train_step`.
        epochs: number of passes over `dataset`.
    """
    for epoch in range(1, epochs + 1):
        gen_total, disc_total, n_batches = 0.0, 0.0, 0
        for batch in dataset:
            gen_loss, disc_loss = train_step(batch)
            gen_total += float(gen_loss)
            disc_total += float(disc_loss)
            n_batches += 1
        if n_batches and (epoch % 10 == 0 or epoch == epochs):
            print(
                f'epoch {epoch}/{epochs}  '
                f'gen_loss={gen_total / n_batches:.4f}  '
                f'disc_loss={disc_total / n_batches:.4f}'
            )


# Actually run the training: without this call the generator sampled in the
# next cell would still carry its random initial weights.
train_dataset = (
    tf.data.Dataset.from_tensor_slices(X_train.astype('float32'))
    .shuffle(len(X_train))
    .batch(BATCH_SIZE)
)
train(train_dataset, EPOCHS)

In [56]:
# 4. Generate Synthetic Data
num_samples = 3000  # how many synthetic rows to draw
# One standard-normal latent vector per requested row; the generator maps
# each vector to a row in the preprocessed feature space.
noise = np.random.normal(loc=0, scale=1, size=(num_samples, noise_dim))
synthetic_data = generator.predict(noise)

[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


In [57]:
# 5. Convert back to original form and units
num_features = preprocessor.transformers_[0][2].tolist()
cat_features = preprocessor.transformers_[1][2].tolist()

# Reuse the transformers that were fitted inside `preprocessor` instead of
# fitting fresh copies: these are the exact objects that produced the
# encoding the GAN was trained on.
num_pipeline = preprocessor.named_transformers_['num']
cat_pipeline = preprocessor.named_transformers_['cat']

# The encoded matrix is laid out as [numeric columns | one-hot columns].
n_num = len(num_features)

# Undo the MinMax scaling. The inverse is x * data_range_ + data_min_;
# multiplying by the range alone would leave every column shifted by its
# minimum, so the fitted scaler's own inverse_transform is used.
synthetic_data_num = num_pipeline.inverse_transform(synthetic_data[:, :n_num])

# Collapse each block of one-hot columns back to a category label.
synthetic_data_cat = cat_pipeline.inverse_transform(synthetic_data[:, n_num:])

# Rebuild labelled frames for both column families.
synthetic_df_num = pd.DataFrame(synthetic_data_num, columns=num_features)
synthetic_df_cat = pd.DataFrame(synthetic_data_cat, columns=cat_features)

# Join them side by side and restore the column order of the original table.
synthetic_df_original = pd.concat([synthetic_df_num, synthetic_df_cat], axis=1)
synthetic_df_original = synthetic_df_original[data.columns]

In [59]:
# 6. Download the Synthetic Data
# Write the table to the working directory (without the index column), then
# hand that file to Colab's download helper.
output_csv = 'synthetic_data_original.csv'
synthetic_df_original.to_csv(output_csv, index=False)
files.download(output_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>