<a href="https://colab.research.google.com/github/Chandanlokesh/python-lab/blob/master/mini_proj_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Input, Lambda
from tensorflow.keras.optimizers import Adam
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.losses import MeanSquaredError

# Preprocessing function with get_dummies
scaler = None  # Global scaler to maintain consistency

def preprocess_csv_with_dummies(file_path):
    """Load a CSV and turn it into an all-float32 feature table.

    Columns of dtype int64/float64 are min-max scaled with a freshly fitted
    ``MinMaxScaler``; columns of dtype object/category are one-hot encoded
    with ``pd.get_dummies``.  Columns of any other dtype are only cast to
    float32.

    Side effect: rebinds the module-level ``scaler``.  It holds the scaler
    fitted on this file, or ``None`` when the file has no int64/float64
    columns, so that a scaler fitted on an earlier file is never reused for
    denormalising data derived from this one.

    Args:
        file_path: Path (or anything ``pd.read_csv`` accepts) of the CSV.

    Returns:
        A tuple ``(frame, numerical_columns, categorical_columns)`` where
        ``frame`` is the processed float32 DataFrame and the two lists hold
        the original names of the scaled and the one-hot encoded columns.
    """
    global scaler
    df = pd.read_csv(file_path)

    # Detect numerical and categorical columns
    numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()

    # Normalize numerical columns; drop any stale scaler when there are none
    if numerical_columns:
        scaler = MinMaxScaler()
        df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
    else:
        scaler = None

    # One-hot encode categorical columns using pd.get_dummies
    if categorical_columns:
        df = pd.get_dummies(df, columns=categorical_columns)

    return df.astype(np.float32), numerical_columns, categorical_columns

# Read the traffic CSV and convert it into a float32 feature matrix
file_path = 'simulated_traffic.csv'
(
    processed_data,
    numerical_columns,
    categorical_columns,
) = preprocess_csv_with_dummies(file_path)
data = processed_data.to_numpy()

# Hold out 40% of the rows, then halve the hold-out into validation and test
# (60/20/20 overall); the fixed seed keeps the partition reproducible.
X_train, X_temp = train_test_split(data, random_state=42, test_size=0.4)
X_valid, X_test = train_test_split(X_temp, random_state=42, test_size=0.5)

# Sampling layer for VAE
class Sampling(tf.keras.layers.Layer):
    """Reparameterisation layer: draws ``z = mean + exp(0.5 * log_var) * eps``.

    ``eps`` is standard-normal noise with the same dynamic shape and dtype as
    ``z_mean``.  The noise is drawn with ``tf.random.normal`` rather than the
    legacy ``tf.keras.backend.random_normal`` helper.
    """

    def call(self, inputs):
        """Return one latent sample per row.

        Args:
            inputs: A pair ``(z_mean, z_log_var)`` of equally shaped tensors.

        Returns:
            A tensor shaped like ``z_mean``.
        """
        z_mean, z_log_var = inputs
        # tf.shape() is evaluated at run time, so a batch dimension that is
        # unknown while the model is being built is handled correctly.
        epsilon = tf.random.normal(shape=tf.shape(z_mean), dtype=z_mean.dtype)
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

# Variational Autoencoder Model
class VAE(Model):
    """Dense variational autoencoder over flat feature vectors.

    The encoder maps an ``original_dim`` vector to ``(z_mean, z_log_var, z)``
    and the decoder maps ``z`` back to ``original_dim`` values through a
    sigmoid.  The training loss is registered inside ``call`` via
    ``add_loss``.
    """

    def __init__(self, original_dim, latent_dim):
        super().__init__()
        self.encoder = self.build_encoder(original_dim, latent_dim)
        self.decoder = self.build_decoder(original_dim, latent_dim)

    def build_encoder(self, original_dim, latent_dim):
        """Build the 128 -> 64 encoder returning ``[z_mean, z_log_var, z]``."""
        encoder_input = Input(shape=(original_dim,))
        hidden = encoder_input
        for units in (128, 64):
            hidden = Dense(units, activation="relu")(hidden)
        z_mean = Dense(latent_dim)(hidden)
        z_log_var = Dense(latent_dim)(hidden)
        z = Sampling()([z_mean, z_log_var])
        return Model(encoder_input, [z_mean, z_log_var, z], name="encoder")

    def build_decoder(self, original_dim, latent_dim):
        """Build the 64 -> 128 decoder with a sigmoid output layer."""
        decoder_input = Input(shape=(latent_dim,))
        hidden = decoder_input
        for units in (64, 128):
            hidden = Dense(units, activation="relu")(hidden)
        decoded = Dense(original_dim, activation="sigmoid")(hidden)
        return Model(decoder_input, decoded, name="decoder")

    def call(self, inputs):
        """Reconstruct ``inputs`` and register the VAE loss for this batch."""
        z_mean, z_log_var, z = self.encoder(inputs)
        reconstructed = self.decoder(z)

        # mse() averages over features; multiplying by the feature count
        # turns it back into a per-sample sum of squared errors.
        feature_count = tf.cast(tf.shape(inputs)[1], tf.float32)
        reconstruction_loss = tf.keras.losses.mse(inputs, reconstructed) * feature_count

        # KL divergence between the encoded Gaussian and a standard normal.
        kl_terms = 1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
        kl_loss = -0.5 * tf.reduce_sum(kl_terms, axis=-1)

        self.add_loss(tf.reduce_mean(reconstruction_loss + kl_loss))
        return reconstructed

# WGAN Generator
class Generator(Model):
    """Dense generator mapping latent noise to ``data_dim`` feature values.

    The output activation defaults to ``sigmoid`` so generated values lie in
    [0, 1], the same range as the MinMax-scaled and one-hot encoded training
    rows in this file (and the same choice the VAE decoder makes).  A
    ``tanh`` output would span [-1, 1] and could emit values that fall
    outside the scaler's range when inverse-transformed.

    Args:
        data_dim: Number of features in one generated row.
        output_activation: Activation of the last layer.  Pass ``'tanh'`` to
            get a [-1, 1] output range instead.
    """

    def __init__(self, data_dim, output_activation='sigmoid'):
        super().__init__()
        self.dense1 = Dense(128, activation='relu')
        self.dense2 = Dense(256, activation='relu')
        self.dense3 = Dense(data_dim, activation=output_activation)

    def call(self, inputs):
        """Run the noise batch through the 128 -> 256 -> data_dim stack."""
        x = self.dense1(inputs)
        x = self.dense2(x)
        return self.dense3(x)

# WGAN Discriminator
class Discriminator(Model):
    """Critic network: a 256 -> 128 -> 1 dense stack with a linear output."""

    def __init__(self):
        super().__init__()
        self.dense1 = Dense(256, activation='relu')
        self.dense2 = Dense(128, activation='relu')
        self.dense3 = Dense(1)

    def call(self, inputs):
        """Return one unbounded score per input row."""
        hidden = inputs
        for layer in (self.dense1, self.dense2, self.dense3):
            hidden = layer(hidden)
        return hidden

# WGAN Training Loop
def train_wgan(generator, discriminator, X_train, X_valid, latent_dim, batch_size=64, epochs=10, learning_rate=0.0001, clip_value=0.01):
    """Train a generator/critic pair with the Wasserstein GAN objective.

    For every mini-batch the critic takes one gradient step on
    ``mean(D(fake)) - mean(D(real))``, its weights are clipped to
    ``[-clip_value, clip_value]``, and then the generator takes one step on
    ``-mean(D(fake))``.  Clipping keeps the critic's weights bounded; without
    such a constraint the critic loss has no lower bound.  After each epoch
    the losses of the last mini-batch are printed.

    Args:
        generator: Model mapping ``(batch, latent_dim)`` noise to fake rows.
        discriminator: Critic model returning one score per row.
        X_train: 2-D array of real rows; consumed in order, not shuffled.
        X_valid: Accepted for interface compatibility; currently unused.
        latent_dim: Size of the noise vector fed to the generator.
        batch_size: Number of rows per mini-batch (must be >= 1).
        epochs: Number of passes over ``X_train``.
        learning_rate: Learning rate for both Adam optimizers.
        clip_value: Bound for critic weight clipping; ``None`` disables it.

    Raises:
        ValueError: If ``X_train`` has no rows or ``batch_size`` is below 1.
    """
    num_rows = X_train.shape[0]
    if num_rows == 0:
        raise ValueError("X_train must contain at least one row")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    gen_optimizer = Adam(learning_rate)
    disc_optimizer = Adam(learning_rate)

    for epoch in range(epochs):
        for start in range(0, num_rows, batch_size):
            real_data = X_train[start:start + batch_size]
            # The final batch may be shorter than batch_size.
            batch_size_real = real_data.shape[0]

            # Critic step
            with tf.GradientTape() as disc_tape:
                z = tf.random.normal((batch_size_real, latent_dim))
                fake_data = generator(z, training=True)
                real_output = discriminator(real_data, training=True)
                fake_output = discriminator(fake_data, training=True)
                disc_loss = tf.reduce_mean(fake_output) - tf.reduce_mean(real_output)

            grads_disc = disc_tape.gradient(disc_loss, discriminator.trainable_variables)
            disc_optimizer.apply_gradients(zip(grads_disc, discriminator.trainable_variables))

            # Keep the critic's weights inside a fixed box after every update
            if clip_value is not None:
                for weight in discriminator.trainable_variables:
                    weight.assign(tf.clip_by_value(weight, -clip_value, clip_value))

            # Generator step on fresh noise
            with tf.GradientTape() as gen_tape:
                z = tf.random.normal((batch_size_real, latent_dim))
                fake_data = generator(z, training=True)
                fake_output = discriminator(fake_data, training=True)
                gen_loss = -tf.reduce_mean(fake_output)

            grads_gen = gen_tape.gradient(gen_loss, generator.trainable_variables)
            gen_optimizer.apply_gradients(zip(grads_gen, generator.trainable_variables))

        print(f"Epoch {epoch + 1}/{epochs}, Generator Loss: {gen_loss.numpy()}, Discriminator Loss: {disc_loss.numpy()}")

# Build the VAE on the full feature width and fit it to reconstruct its input.
# No loss is passed to compile(): the model registers its own via add_loss().
original_dim = data.shape[-1]
vae_latent_dim = 5
vae = VAE(original_dim=original_dim, latent_dim=vae_latent_dim)
vae.compile(
    optimizer=Adam(learning_rate=0.0001),
    metrics=[MeanSquaredError()],
)
vae.fit(
    X_train,
    X_train,
    validation_data=(X_valid, X_valid),
    epochs=10,
    batch_size=64,
)

# Build the WGAN pair and train it on the same training rows
latent_dim = 10
generator = Generator(data_dim=original_dim)
discriminator = Discriminator()
train_wgan(
    generator,
    discriminator,
    X_train,
    X_valid,
    latent_dim=latent_dim,
)

# Generate synthetic data
def generate_synthetic_data(generator, num_samples, latent_dim):
    """Sample ``num_samples`` rows from ``generator`` as a NumPy array.

    Standard-normal noise of shape ``(num_samples, latent_dim)`` is passed
    through the generator in inference mode.
    """
    latent_noise = tf.random.normal((num_samples, latent_dim))
    return generator(latent_noise, training=False).numpy()

# Draw 1000 synthetic rows from the trained generator
synthetic_samples = generate_synthetic_data(
    generator,
    latent_dim=latent_dim,
    num_samples=1000,
)

def postprocess_synthetic_data(synthetic_data, original_df, numerical_columns, categorical_columns):
    """Convert raw generator output back into a readable traffic table.

    Steps, in order:

    1. Label the columns of ``synthetic_data`` with ``original_df.columns``.
    2. If numerical columns exist and the module-level ``scaler`` is set,
       inverse-transform them; ``src_port`` / ``dst_port`` are then clipped
       to 0..65535 and rounded.
    3. Collapse every group of one-hot columns named ``<category>_<value>``
       into a single ``<category>`` column holding the value whose column
       has the highest score in that row.
    4. If an ``Attack Type`` column exists, derive ``Label``: ``Benign`` for
       ``normal`` / ``benign`` (case-insensitive), otherwise ``Malicious``.
    5. Cast ``src_port`` / ``dst_port`` to int (NaN becomes 0) when present.

    Args:
        synthetic_data: 2-D array with one column per column of
            ``original_df``.
        original_df: Processed (scaled / one-hot) frame; only its column
            labels are used.
        numerical_columns: Names of the columns that were min-max scaled.
        categorical_columns: Original names of the one-hot encoded columns.

    Returns:
        A new DataFrame with denormalised numeric columns, decoded
        categorical columns and, where applicable, a ``Label`` column.
    """
    global scaler
    df = pd.DataFrame(synthetic_data, columns=original_df.columns)

    # Denormalize numerical columns
    if numerical_columns and scaler is not None:
        df[numerical_columns] = scaler.inverse_transform(df[numerical_columns])

        # Clip and round ports to the valid range; the int cast happens at
        # the end, after NaNs have been filled.
        for port_col in ('src_port', 'dst_port'):
            if port_col in numerical_columns:
                df[port_col] = df[port_col].clip(0, 65535).round()

    # Convert one-hot encoded columns back to original categories.
    # Numeric columns and already-decoded columns are never treated as
    # dummies, so a category called 'src' cannot swallow 'src_port'.
    not_dummies = set(numerical_columns)
    for cat_col in categorical_columns:
        prefix = cat_col + '_'
        dummy_cols = [
            col for col in df.columns
            if col not in not_dummies and isinstance(col, str) and col.startswith(prefix)
        ]
        if not dummy_cols:
            continue
        winners = df[dummy_cols].idxmax(axis=1)
        # Strip the exact '<category>_' prefix; splitting on the first '_'
        # would corrupt labels of categories whose own name contains '_'.
        df[cat_col] = winners.map(lambda name: name[len(prefix):])
        df = df.drop(columns=dummy_cols)
        not_dummies.add(cat_col)

    if 'Attack Type' in df.columns:
        df['Label'] = df['Attack Type'].apply(lambda x: 'Benign' if x.lower() in ['normal', 'benign'] else 'Malicious')

    # Port columns are optional; only cast the ones that are present.
    for port_col in ('src_port', 'dst_port'):
        if port_col in df.columns:
            df[port_col] = df[port_col].fillna(0).astype(int)

    return df

synthetic_data_df = postprocess_synthetic_data(
    synthetic_data=synthetic_samples,
    original_df=processed_data,
    numerical_columns=numerical_columns,
    categorical_columns=categorical_columns,
)

# Write the decoded table to disk without the index column
synthetic_data_df.to_csv('synthetic_traffic_vae_wgan.csv', index=False)
print("✅ Postprocessed synthetic data saved to 'synthetic_traffic_vae_wgan.csv'.")


Epoch 1/10
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 83ms/step - loss: 3690.6145 - mean_squared_error: 0.2457 - val_loss: 3433.0825 - val_mean_squared_error: 0.2286
Epoch 2/10
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 91ms/step - loss: 3285.4609 - mean_squared_error: 0.2188 - val_loss: 2699.9519 - val_mean_squared_error: 0.1796
Epoch 3/10
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 87ms/step - loss: 2373.3816 - mean_squared_error: 0.1580 - val_loss: 1281.1080 - val_mean_squared_error: 0.0851
Epoch 4/10
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 75ms/step - loss: 945.5228 - mean_squared_error: 0.0628 - val_loss: 297.3910 - val_mean_squared_error: 0.0194
Epoch 5/10
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 85ms/step - loss: 219.1094 - mean_squared_error: 0.0140 - val_loss: 121.7327 - val_mean_squared_error: 0.0073
Epoch 6/10
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m