# GAN


We imported packages and pre-processing our data.

In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import torch
from torch.utils.data import DataLoader, TensorDataset


df = pd.read_csv('/content/real_train2.csv')


float_columns = df.select_dtypes(include=[np.float64]).columns
str_column = df.select_dtypes(include=[object]).columns[0]


encoder = OneHotEncoder(sparse=False)
encoded_str_column = encoder.fit_transform(df[[str_column]])


data_preprocessed = np.hstack((df[float_columns].values, encoded_str_column))


scaler = StandardScaler()
data_preprocessed[:, :len(float_columns)] = scaler.fit_transform(data_preprocessed[:, :len(float_columns)])


data_tensor = torch.tensor(data_preprocessed, dtype=torch.float32)


batch_size = 64
dataset = TensorDataset(data_tensor)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


print(f'Data tensor shape: {data_tensor.shape}')


NameError: name '_C' is not defined

Set up the generator

In [None]:
import torch
import torch.nn as nn

class Generator(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(Generator, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, output_dim)
        )

    def forward(self, x):
        return self.model(x)


NameError: name '_C' is not defined

Set up the Discriminator

In [None]:
class Discriminator(nn.Module):
    def __init__(self, input_dim):
        super(Discriminator, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.LeakyReLU(0.2),
            nn.Linear(256, 128),
            nn.LeakyReLU(0.2),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)


Train the GAN

In [None]:
import torch.optim as optim


latent_dim = 100
data_dim = data_tensor.shape[1]


generator = Generator(input_dim=latent_dim, output_dim=data_dim)
discriminator = Discriminator(input_dim=data_dim)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
generator.to(device)
discriminator.to(device)


criterion = nn.BCELoss()
optimizer_g = optim.Adam(generator.parameters(), lr=0.0002, betas=(0.5, 0.999))
optimizer_d = optim.Adam(discriminator.parameters(), lr=0.0002, betas=(0.5, 0.999))


num_epochs = 500

for epoch in range(num_epochs):
    for real_data, in data_loader:
        real_data = real_data.to(device)


        optimizer_d.zero_grad()

        real_labels = torch.ones(real_data.size(0), 1).to(device)
        real_output = discriminator(real_data)
        d_loss_real = criterion(real_output, real_labels)

        noise = torch.randn(real_data.size(0), latent_dim).to(device)
        fake_data = generator(noise)
        fake_labels = torch.zeros(real_data.size(0), 1).to(device)
        fake_output = discriminator(fake_data.detach())
        d_loss_fake = criterion(fake_output, fake_labels)

        d_loss = d_loss_real + d_loss_fake
        d_loss.backward()
        optimizer_d.step()

        optimizer_g.zero_grad()

        fake_output = discriminator(fake_data)
        g_loss = criterion(fake_output, real_labels)

        g_loss.backward()
        optimizer_g.step()

    if epoch % 100 == 0:
        print(f'Epoch [{epoch}/{num_epochs}], d_loss: {d_loss.item()}, g_loss: {g_loss.item()}')


Epoch [0/3000], d_loss: 1.3429065942764282, g_loss: 0.6377258896827698
Epoch [1000/3000], d_loss: 1.0203325748443604, g_loss: 1.1935430765151978
Epoch [2000/3000], d_loss: 1.0568904876708984, g_loss: 1.0611470937728882


Generate data

In [None]:
with torch.no_grad():
    noise = torch.randn(10000, latent_dim).to(device)
    generated_data = generator(noise)
    generated_data = generated_data.cpu().numpy()


    print(generated_data)


[[ 5.0770098e-01 -4.6693957e-01  7.9082124e-02  5.7008654e-01
   7.1634054e-03  4.1246203e-01]
 [ 8.2942450e-01  3.9270771e-01  7.9414085e-02  9.9133980e-01
   1.9644350e-03  8.6638331e-03]
 [-4.3289700e-01  2.1215244e-01  7.6903053e-02  5.1950291e-04
   6.8396330e-06  1.0003190e+00]
 ...
 [ 3.8019094e-01  2.6412874e-01  8.5333198e-02  9.9283844e-01
  -3.2498837e-03  9.2441440e-03]
 [ 6.3754147e-01  6.3058197e-01  7.6629348e-02  9.8323923e-01
  -2.1055341e-03  1.6853839e-02]
 [ 1.1958599e+00 -5.4331112e-01  9.3212388e-02  3.8133573e-02
   9.2195010e-01  2.4225190e-02]]


Clean and turn the data into a pandas dataframe.

In [None]:
import numpy as np


num_float_features = len(float_columns)
num_cat_features = encoded_str_column.shape[1]

generated_floats = generated_data[:, :num_float_features]
generated_cats = generated_data[:, num_float_features:]


denormalized_floats = scaler.inverse_transform(generated_floats)


decoded_cats = encoder.inverse_transform(generated_cats)


final_data = np.hstack((denormalized_floats, decoded_cats))


print(final_data)


[[2.0058276653289795 0.633722186088562 4.4294843673706055 'elliptical']
 [2.2573702335357666 0.7857985496520996 4.694533348083496 'elliptical']
 [1.2704122066497803 0.7538573145866394 2.6896352767944336 'spiral']
 ...
 [1.906132698059082 0.763052225112915 9.420561790466309 'elliptical']
 [2.107344627380371 0.827879786491394 2.4710988998413086 'elliptical']
 [2.5438711643218994 0.620211660861969 15.711592674255371 'merger']]


In [None]:
df = pd.DataFrame(final_data)
#df.groupby('3').count()
df.to_csv('/content/final_data4.csv')

KeyError: '3'

In [None]:
df

Unnamed: 0,0,1,2,3
0,2.005828,0.633722,4.429484,elliptical
1,2.25737,0.785799,4.694533,elliptical
2,1.270412,0.753857,2.689635,spiral
3,1.578997,0.76376,6.275672,merger
4,1.544504,0.704402,5.251128,spiral
...,...,...,...,...
9995,1.149211,0.785728,0.503364,spiral
9996,1.391582,0.743018,8.68048,spiral
9997,1.906133,0.763052,9.420562,elliptical
9998,2.107345,0.82788,2.471099,elliptical


10000