# GAN 

In [5]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler

class CovidDataset(Dataset):
    def __init__(self, data):
        self.data = torch.FloatTensor(data.values)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

class Generator(nn.Module):
    def __init__(self, latent_dim, output_dim):
        super(Generator, self).__init__()

        self.model = nn.Sequential(
            nn.Linear(latent_dim, 256),
            nn.LeakyReLU(0.2),
            nn.BatchNorm1d(256),
            nn.Linear(256, 512),
            nn.LeakyReLU(0.2),
            nn.BatchNorm1d(512),
            nn.Linear(512, 1024),
            nn.LeakyReLU(0.2),
            nn.BatchNorm1d(1024),
            nn.Linear(1024, output_dim),
            nn.Sigmoid()  # Use Sigmoid to output values between 0 and 1
        )

    def forward(self, z):
        return self.model(z)

class Discriminator(nn.Module):
    def __init__(self, input_dim):
        super(Discriminator, self).__init__()

        self.model = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.LeakyReLU(0.2),
            nn.Linear(512, 256),
            nn.LeakyReLU(0.2),
            nn.Linear(256, 128),
            nn.LeakyReLU(0.2),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)

def train_gan(data_path, num_epochs=500, batch_size=32, latent_dim=200):
    # Load and prepare data
    df = pd.read_csv(data_path)

    # Convert all columns to numeric, forcing errors to NaN
    df = df.apply(pd.to_numeric, errors='coerce')

    # Drop rows with NaN values
    df = df.dropna()

    # Normalize the data
    scaler = MinMaxScaler()
    df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

    dataset = CovidDataset(df_scaled)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    generator = Generator(latent_dim, len(df.columns))
    discriminator = Discriminator(len(df.columns))

    criterion = nn.BCELoss()
    g_optimizer = optim.Adam(generator.parameters(), lr=0.0001, betas=(0.5, 0.999))
    d_optimizer = optim.Adam(discriminator.parameters(), lr=0.0001, betas=(0.5, 0.999))

    for epoch in range(num_epochs):
        epoch_g_loss = 0
        epoch_d_loss = 0
        batches = 0

        for i, real_data in enumerate(dataloader):
            batch_size = real_data.size(0)
            batches += 1

            # Train Discriminator
            d_optimizer.zero_grad()
            label_real = torch.ones(batch_size, 1)
            label_fake = torch.zeros(batch_size, 1)

            z = torch.randn(batch_size, latent_dim)
            fake_data = generator(z)

            d_loss_real = criterion(discriminator(real_data), label_real)
            d_loss_fake = criterion(discriminator(fake_data.detach()), label_fake)
            d_loss = d_loss_real + d_loss_fake

            d_loss.backward()
            d_optimizer.step()

            # Train Generator
            g_optimizer.zero_grad()
            z = torch.randn(batch_size, latent_dim)
            fake_data = generator(z)
            g_loss = criterion(discriminator(fake_data), label_real)
            g_loss.backward()
            g_optimizer.step()

            epoch_g_loss += g_loss.item()
            epoch_d_loss += d_loss.item()

        avg_g_loss = epoch_g_loss / batches
        avg_d_loss = epoch_d_loss / batches

        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], d_loss: {avg_d_loss:.4f}, g_loss: {avg_g_loss:.4f}')

    return generator, scaler

def generate_samples(generator, scaler, num_samples=100, latent_dim=200):
    with torch.no_grad():
        z = torch.randn(num_samples, latent_dim)
        generated_data = generator(z).numpy()

    # Inverse transform the generated data
    df_generated = pd.DataFrame(scaler.inverse_transform(generated_data), columns=scaler.feature_names_in_)

    # Round the generated data to get binary values (0 or 1)
    df_generated = df_generated.round().clip(0, 1)  # Ensure values are either 0 or 1
    return df_generated

def main():
    data_path = '/Users/ishaanpothapragada/Desktop/Desktop_Documents/Maize/encoded_data.csv'  # Update this path

    generator, scaler = train_gan(data_path, num_epochs=100, latent_dim=200)

    original_data = pd.read_csv(data_path)

    # Generate synthetic samples
    synthetic_data = generate_samples(generator, scaler, num_samples=20000)  # Increase the number of samples

    # Ensure the synthetic data has the same columns as the original data
    synthetic_data.columns = original_data.columns

    # Add source column
    original_data['data_source'] = 'original'
    synthetic_data['data_source'] = 'synthetic'

    # Merge datasets
    merged_data = pd.concat([original_data, synthetic_data], axis=0, ignore_index=True)

    # Save datasets
    synthetic_data.to_csv('synthetic_covid_data.csv', index=False)
    merged_data.to_csv('merged_covid_data.csv', index=False)

    print("Synthetic and merged datasets have been saved.")

if __name__ == "__main__": 
    main()

Epoch [10/100], d_loss: 0.3770, g_loss: 3.0416
Epoch [20/100], d_loss: 0.2656, g_loss: 4.1275
Epoch [30/100], d_loss: 0.2642, g_loss: 4.7423
Epoch [40/100], d_loss: 0.1993, g_loss: 5.3193
Epoch [50/100], d_loss: 0.2050, g_loss: 5.7866
Epoch [60/100], d_loss: 0.1557, g_loss: 6.3521
Epoch [70/100], d_loss: 0.1675, g_loss: 6.4343
Epoch [80/100], d_loss: 0.1745, g_loss: 6.7328
Epoch [90/100], d_loss: 0.1348, g_loss: 7.1126
Epoch [100/100], d_loss: 0.1672, g_loss: 7.2631
Synthetic and merged datasets have been saved.


In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler

class CovidDataset(Dataset):
    def __init__(self, data):
        self.data = torch.FloatTensor(data.values)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

class Generator(nn.Module):
    def __init__(self, latent_dim, output_dim):
        super(Generator, self).__init__()

        self.model = nn.Sequential(
            nn.Linear(latent_dim, 256),
            nn.LeakyReLU(0.2),
            nn.BatchNorm1d(256),
            nn.Linear(256, 512),
            nn.LeakyReLU(0.2),
            nn.BatchNorm1d(512),
            nn.Linear(512, 1024),
            nn.LeakyReLU(0.2),
            nn.BatchNorm1d(1024),
            nn.Linear(1024, output_dim),
            nn.Sigmoid()  # Use Sigmoid to output values between 0 and 1
        )

    def forward(self, z):
        return self.model(z)

class Discriminator(nn.Module):
    def __init__(self, input_dim):
        super(Discriminator, self).__init__()

        self.model = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.LeakyReLU(0.2),
            nn.Linear(512, 256),
            nn.LeakyReLU(0.2),
            nn.Linear(256, 128),
            nn.LeakyReLU(0.2),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)

def train_gan(data_path, num_epochs=500, batch_size=32, latent_dim=200):
    # Load and prepare data
    df = pd.read_csv(data_path)

    # Convert all columns to numeric, forcing errors to NaN
    df = df.apply(pd.to_numeric, errors='coerce')

    # Drop rows with NaN values
    df = df.dropna()

    # Normalize the data
    scaler = MinMaxScaler()
    df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

    dataset = CovidDataset(df_scaled)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    generator = Generator(latent_dim, len(df.columns))
    discriminator = Discriminator(len(df.columns))

    criterion = nn.BCELoss()
    g_optimizer = optim.Adam(generator.parameters(), lr=0.0001, betas=(0.5, 0.999))
    d_optimizer = optim.Adam(discriminator.parameters(), lr=0.0001, betas=(0.5, 0.999))

    for epoch in range(num_epochs):
        epoch_g_loss = 0
        epoch_d_loss = 0
        batches = 0

        for i, real_data in enumerate(dataloader):
            batch_size = real_data.size(0)
            batches += 1

            # Train Discriminator
            d_optimizer.zero_grad()
            label_real = torch.ones(batch_size, 1)
            label_fake = torch.zeros(batch_size, 1)

            z = torch.randn(batch_size, latent_dim)
            fake_data = generator(z)

            d_loss_real = criterion(discriminator(real_data), label_real)
            d_loss_fake = criterion(discriminator(fake_data.detach()), label_fake)
            d_loss = d_loss_real + d_loss_fake

            d_loss.backward()
            d_optimizer.step()

            # Train Generator
            g_optimizer.zero_grad()
            z = torch.randn(batch_size, latent_dim)
            fake_data = generator(z)
            g_loss = criterion(discriminator(fake_data), label_real)
            g_loss.backward()
            g_optimizer.step()

            epoch_g_loss += g_loss.item()
            epoch_d_loss += d_loss.item()

        avg_g_loss = epoch_g_loss / batches
        avg_d_loss = epoch_d_loss / batches

        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], d_loss: {avg_d_loss:.4f}, g_loss: {avg_g_loss:.4f}')

    return generator, scaler

def generate_samples(generator, scaler, num_samples=100, latent_dim=200):
    with torch.no_grad():
        z = torch.randn(num_samples, latent_dim)
        generated_data = generator(z).numpy()

    # Inverse transform the generated data
    df_generated = pd.DataFrame(scaler.inverse_transform(generated_data), columns=scaler.feature_names_in_)

    # Round the generated data to get binary values (0 or 1)
    df_generated = df_generated.round().clip(0, 1)  # Ensure values are either 0 or 1
    return df_generated

def main():
    data_path = 'encoded_data.csv' # Update this path

    generator, scaler = train_gan(data_path, num_epochs=500, latent_dim=200)

    original_data = pd.read_csv(data_path)

    # Generate synthetic samples
    synthetic_data = generate_samples(generator, scaler, num_samples=20000)  # Increase the number of samples

    # Ensure the synthetic data has the same columns as the original data
    synthetic_data.columns = original_data.columns

    # Add source column
    original_data['data_source'] = 'original'
    synthetic_data['data_source'] = 'synthetic'

    # Merge datasets
    merged_data = pd.concat([original_data, synthetic_data], axis=0, ignore_index=True)

    # Save datasets
    synthetic_data.to_csv('synthetic500_covid_data.csv', index=False)
    merged_data.to_csv('merged500_covid_data.csv', index=False)

    print("Synthetic and merged datasets have been saved.")

if __name__ == "__main__": 
    main()

Epoch [10/500], d_loss: 0.3709, g_loss: 3.1858
Epoch [20/500], d_loss: 0.2807, g_loss: 3.8733
Epoch [30/500], d_loss: 0.2380, g_loss: 4.5520
Epoch [40/500], d_loss: 0.1949, g_loss: 5.2693
Epoch [50/500], d_loss: 0.1686, g_loss: 5.8676
Epoch [60/500], d_loss: 0.2153, g_loss: 6.1037
Epoch [70/500], d_loss: 0.1723, g_loss: 6.2785
Epoch [80/500], d_loss: 0.1533, g_loss: 6.2547
Epoch [90/500], d_loss: 0.1852, g_loss: 6.2511
Epoch [100/500], d_loss: 0.1696, g_loss: 6.2088
Epoch [110/500], d_loss: 0.1840, g_loss: 6.2395
Epoch [120/500], d_loss: 0.1730, g_loss: 6.2680
Epoch [130/500], d_loss: 0.1543, g_loss: 6.1716
Epoch [140/500], d_loss: 0.1573, g_loss: 6.2521
Epoch [150/500], d_loss: 0.1468, g_loss: 6.0427
Epoch [160/500], d_loss: 0.1505, g_loss: 5.7467
Epoch [170/500], d_loss: 0.1621, g_loss: 5.7485


KeyboardInterrupt: 

In [8]:
!pip3 install --upgrade pandas numpy torch scikit-learn sympy

INFO: pip is looking at multiple versions of torch to determine which version is compatible with other requirements. This could take a while.

The conflict is caused by:
    The user requested sympy
    torch 2.5.1+cu118 depends on sympy==1.13.1; python_version >= "3.9"

To fix this you could try to:
1. loosen the range of package versions you've specified
2. remove package versions to allow pip to attempt to solve the dependency conflict



ERROR: Cannot install sympy and torch==2.5.1+cu118 because these package versions have conflicting dependencies.

[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: ResolutionImpossible: for help visit https://pip.pypa.io/en/latest/topics/dependency-resolution/#dealing-with-dependency-conflicts


In [5]:
!pip3 install pandas numpy torch scikit-learn




[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
!pip3 uninstall sympy --y
!pip3 install sympy

Found existing installation: sympy 1.13.1
Uninstalling sympy-1.13.1:
  Successfully uninstalled sympy-1.13.1
Collecting sympy
  Downloading sympy-1.13.3-py3-none-any.whl.metadata (12 kB)
Downloading sympy-1.13.3-py3-none-any.whl (6.2 MB)
   ---------------------------------------- 0.0/6.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/6.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/6.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/6.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/6.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/6.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/6.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/6.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/6.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/6.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/6.2 MB ? eta -:--:--
   

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.5.1+cu118 requires sympy==1.13.1; python_version >= "3.9", but you have sympy 1.13.3 which is incompatible.

[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip
