In [None]:
#Imports
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

#### Creating a synthetic dataset using WGAN-GP

In [19]:
# Load the GOOG table from the shared Data directory and print a summary of
# its columns, dtypes and non-null counts.
df = pd.read_csv('../Data/GOOG.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2497 entries, 0 to 2496
Data columns (total 22 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Date        2497 non-null   object 
 1   Open        2497 non-null   float64
 2   High        2497 non-null   float64
 3   Low         2497 non-null   float64
 4   Adj Close   2497 non-null   float64
 5   MSFT Close  2497 non-null   float64
 6   AMZN Close  2497 non-null   float64
 7   META Close  2497 non-null   float64
 8   AAPL Close  2497 non-null   float64
 9   7ma         2497 non-null   float64
 10  14ma        2497 non-null   float64
 11  21ma        2497 non-null   float64
 12  7atr        2497 non-null   float64
 13  14atr       2497 non-null   float64
 14  21atr       2497 non-null   float64
 15  7upper      2497 non-null   float64
 16  7lower      2497 non-null   float64
 17  14upper     2497 non-null   float64
 18  14lower     2497 non-null   float64
 19  21upper     2497 non-null  

In [20]:
# Set to False (or 0) to force CPU execution even when a GPU is present.
use_cuda = True
# Logical `and`, not bitwise `&`: `True & 2` evaluates to 0, so any truthy flag
# other than 1/True would silently have selected the CPU.
device = torch.device("cuda" if (use_cuda and torch.cuda.is_available()) else "cpu")

### Data Prep

In [21]:
# Encode every date as an integer ID so it can index an nn.Embedding.
# This mutates `df` in place: 'Date' becomes categorical and 'DateID' is added;
# later cells read both columns.
df['Date'] = df['Date'].astype('category')
df['DateID'] = df['Date'].cat.codes
max_date_id = df['DateID'].max()

# Feature matrix layout: column 0 holds the DateID, the remaining columns hold
# everything in `df` except the target ('Close') and the raw 'Date'.
# NOTE(review): 'DateID' was added to `df` above and is not dropped here, so it
# also appears as the last feature column (and is min-max scaled below). The
# downstream cells rely on the resulting column count - confirm before removing.
x = df.drop(['Close', 'Date'], axis=1).values
dates = df['DateID'].values.reshape(-1, 1)
x = np.hstack([dates, x])
y = df['Close'].values

# Chronological 80/20 split (no shuffling).
split = int(df.shape[0] * 0.8)
x_train, x_test = x[:split, :], x[split:, :]
y_train, y_test = y[:split], y[split:]

# Scale the features to [-1, 1], leaving the DateID in column 0 untouched.
# `scaler` stays fitted on the FEATURE columns because it is used later to
# inverse-transform the generator output.
scaler = MinMaxScaler(feature_range=(-1, 1))
x_train[:, 1:] = scaler.fit_transform(x_train[:, 1:])
x_test[:, 1:] = scaler.transform(x_test[:, 1:])

# The target gets its own scaler. Re-fitting `scaler` on `y` would overwrite the
# per-feature statistics and make any later scaler.inverse_transform(features)
# apply the Close column's min/max to every feature.
y_scaler = MinMaxScaler(feature_range=(-1, 1))
y_train = y_scaler.fit_transform(y_train.reshape(-1, 1))
y_test = y_scaler.transform(y_test.reshape(-1, 1))

In [22]:
# Wrap the training split as three aligned tensors: integer date IDs
# (column 0), float features (all remaining columns) and the float target.
train_dataset = TensorDataset(
    torch.LongTensor(x_train[:, 0]),
    torch.FloatTensor(x_train[:, 1:]),
    torch.FloatTensor(y_train),
)
# Mini-batches of 128 rows, reshuffled on every pass over the data.
train_loader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=True)

### Defining the Generator and Discriminator

In [23]:
class Generator(nn.Module):
    """Date-conditioned generator: (date ID, noise) -> one synthetic feature row.

    The date ID is embedded into 10 dimensions, concatenated with the noise
    vector and passed through three stacked GRUs (1024 -> 512 -> 256) followed
    by three linear layers (128 -> 64 -> ``input_size``).

    Args:
        input_size: Width of the noise vector and of the generated row.
        max_date_id: Largest date ID the embedding must accept; the embedding
            table has ``max_date_id + 1`` rows.
    """

    def __init__(self, input_size, max_date_id):
        super().__init__()
        self.embedding = nn.Embedding(max_date_id + 1, 10)  # Embedding layer for DateID
        self.gru_1 = nn.GRU(input_size + 10, 1024, batch_first=True)  # +10 for embedding dimension
        self.gru_2 = nn.GRU(1024, 512, batch_first=True)
        self.gru_3 = nn.GRU(512, 256, batch_first=True)
        self.linear_1 = nn.Linear(256, 128)
        self.linear_2 = nn.Linear(128, 64)
        # Output width comes from the constructor argument instead of the
        # notebook-global `x_train`, so the class no longer depends on hidden state.
        self.linear_3 = nn.Linear(64, input_size)
        self.dropout = nn.Dropout(0.2)
        # Kept only for backward compatibility; forward() follows the device of
        # its inputs, so the module also works when it is deliberately kept on
        # the CPU of a CUDA-capable machine.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def forward(self, date_ids, noise):
        """Generate one feature row per date ID.

        Args:
            date_ids: Integer tensor of shape ``(batch,)``.
            noise: Float tensor of shape ``(batch, 1, input_size)``.

        Returns:
            Float tensor of shape ``(batch, input_size)``.
        """
        date_embedding = self.embedding(date_ids).unsqueeze(1)  # Add seq_len dimension
        x = torch.cat([date_embedding, noise], dim=-1)  # (batch, 1, input_size + 10)

        batch_size = x.size(0)

        # Zero initial hidden states, created on the same device/dtype as the
        # input rather than on a mix of `self.device` and a global `device`.
        h0 = torch.zeros(1, batch_size, 1024, device=x.device, dtype=x.dtype)
        out_1, _ = self.gru_1(x, h0)
        out_1 = self.dropout(out_1)
        h1 = torch.zeros(1, batch_size, 512, device=x.device, dtype=x.dtype)
        out_2, _ = self.gru_2(out_1, h1)
        out_2 = self.dropout(out_2)
        h2 = torch.zeros(1, batch_size, 256, device=x.device, dtype=x.dtype)
        out_3, _ = self.gru_3(out_2, h2)
        out_3 = self.dropout(out_3)

        # Only the last time step feeds the linear head.
        out_4 = self.linear_1(out_3[:, -1, :])
        out_5 = self.linear_2(out_4)
        out_6 = self.linear_3(out_5)
        return out_6

class Discriminator(nn.Module):
    """Date-conditioned critic: (date ID, feature row) -> one unbounded score.

    The date ID is embedded into 10 dimensions and concatenated with the
    feature row; the result is treated as channels of a length-1 sequence and
    passed through three 1-D convolutions and three linear layers.

    Args:
        max_date_id: Largest date ID the embedding must accept; the embedding
            table has ``max_date_id + 1`` rows.
        input_size: Number of feature columns in each row. When omitted it is
            taken from the notebook-level ``x_train`` (``x_train.shape[1] - 1``),
            which is what the original constructor always did.
    """

    def __init__(self, max_date_id, input_size=None):
        super().__init__()
        if input_size is None:
            # Backward-compatible fallback to the notebook's training matrix.
            input_size = x_train.shape[1] - 1
        self.embedding = nn.Embedding(max_date_id + 1, 10)  # Embedding layer for DateID
        # Input channels = feature columns + 10 embedding dimensions.
        self.conv1 = nn.Conv1d(input_size + 10, 32, kernel_size=5, stride=1, padding=2)
        self.conv2 = nn.Conv1d(32, 64, kernel_size=5, stride=1, padding='same')
        self.conv3 = nn.Conv1d(64, 128, kernel_size=5, stride=1, padding='same')
        # linear1 takes 128 inputs, i.e. 128 channels x sequence length 1 after
        # flattening; longer sequences would not fit this layer.
        self.linear1 = nn.Linear(128, 220)
        self.linear2 = nn.Linear(220, 220)
        self.linear3 = nn.Linear(220, 1)
        self.leaky = nn.LeakyReLU(0.01)
        self.relu = nn.ReLU()

    def forward(self, date_ids, x):
        """Score a batch of feature rows.

        Args:
            date_ids: Integer tensor of shape ``(batch,)``.
            x: Float tensor of shape ``(batch, 1, input_size)``.

        Returns:
            Float tensor of shape ``(batch, 1)``.
        """
        date_embedding = self.embedding(date_ids).unsqueeze(1)
        x = torch.cat([date_embedding, x], dim=-1)
        x = x.transpose(1, 2)  # Change shape to [batch_size, channels, length]

        conv1 = self.conv1(x)
        conv1 = self.leaky(conv1)
        conv2 = self.conv2(conv1)
        conv2 = self.leaky(conv2)
        conv3 = self.conv3(conv2)
        conv3 = self.leaky(conv3)
        flatten_x = conv3.reshape(conv3.shape[0], -1)  # Flattens all dimensions after batch size
        out_1 = self.linear1(flatten_x)
        out_1 = self.leaky(out_1)
        out_2 = self.linear2(out_1)
        out_2 = self.relu(out_2)
        out = self.linear3(out_2)
        return out

In [24]:
#defining gradient penalty calculation
def compute_gradient_penalty(D, date_ids, real_samples, fake_samples):
    """Return the WGAN-GP gradient penalty for one batch.

    A random point is drawn on the straight line between each real/fake pair,
    the critic is evaluated there, and the squared deviation of the gradient's
    L2 norm from 1 is averaged over the batch.

    Args:
        D: Critic, called as ``D(date_ids, samples)``.
        date_ids: Conditioning IDs forwarded unchanged to ``D``.
        real_samples: Real batch; its first dimension is the batch size.
        fake_samples: Generated batch with the same shape as ``real_samples``.

    Returns:
        Scalar tensor, differentiable with respect to the parameters of ``D``.
    """
    batch_size = real_samples.size(0)

    # One mixing coefficient per sample, broadcast over all remaining dims.
    # It is drawn with torch (so torch.manual_seed controls it) on the device of
    # the data, instead of with NumPy on a notebook-global `device`.
    alpha_shape = (batch_size,) + (1,) * (real_samples.dim() - 1)
    alpha = torch.rand(alpha_shape, device=real_samples.device, dtype=real_samples.dtype)

    interpolates = (alpha * real_samples + (1 - alpha) * fake_samples).requires_grad_(True)
    d_interpolates = D(date_ids, interpolates)

    # Weighting every critic output by 1 yields d(sum of outputs)/d(input).
    grad_outputs = torch.ones_like(d_interpolates)
    gradients = torch.autograd.grad(
        outputs=d_interpolates,
        inputs=interpolates,
        grad_outputs=grad_outputs,
        create_graph=True,  # the penalty itself is back-propagated through
        retain_graph=True,
    )[0]

    gradients = gradients.reshape(batch_size, -1)
    gradient_penalty = ((gradients.norm(2, dim=1) - 1) ** 2).mean()
    return gradient_penalty

In [25]:
# Optimisation settings: Adam learning rate, number of passes over the
# training data, and the weight of the gradient-penalty term.
lr = 2e-4
n_epochs = 2_000
lambda_gp = 10

# Date-embedding settings. From here on `max_date_id` holds the NUMBER of
# distinct dates (one more than the largest DateID code).
embedding_dim = 10
max_date_id = len(pd.unique(df['Date']))
date_embedding_layer = nn.Embedding(
    num_embeddings=max_date_id,
    embedding_dim=embedding_dim,
).to(device)

In [26]:
# Width of one generated row: every column of x_train except column 0, which
# holds the DateID used as an embedding index rather than as a feature.
input_size = x_train.shape[1] - 1

# Build both networks on the selected device (generator first, then critic).
generator = Generator(input_size, max_date_id).to(device)
discriminator = Discriminator(max_date_id).to(device)

# One Adam optimiser per network, both with the same learning rate and betas.
optimizer_G, optimizer_D = (
    torch.optim.Adam(net.parameters(), lr=lr, betas=(0.5, 0.9))
    for net in (generator, discriminator)
)

### Training loop

In [16]:

# WGAN-GP training. The discriminator (critic) is updated on every batch; the
# generator is updated only on batches whose index within the epoch is a
# multiple of 5.
for epoch in range(n_epochs):
    for batch_idx, (date_ids, x_real, y_real) in enumerate(train_loader):
        date_ids = date_ids.to(device)
        x_real = x_real.to(device)
        y_real = y_real.to(device)  # moved to the device but not used below

        #train Discriminator
        optimizer_D.zero_grad()

        # One noise vector per real row, conditioned on that row's date ID.
        z = torch.randn((x_real.shape[0], 1, input_size)).to(device)
        gen_data = generator(date_ids, z)
        #add a length-1 middle dimension so both batches are (batch, 1, features)
        real_data = x_real.unsqueeze(1)
        gen_data_with_features = gen_data.unsqueeze(1)

        #critic scores for real and generated rows; detach() keeps the critic
        #loss from back-propagating into the generator
        d_real = discriminator(date_ids, real_data)
        d_fake = discriminator(date_ids, gen_data_with_features.detach())

        #gradient penalty
        gradient_penalty = compute_gradient_penalty(discriminator, date_ids, real_data, gen_data_with_features.detach())
        # Critic loss: mean fake score minus mean real score, plus the weighted penalty.
        d_loss = -torch.mean(d_real) + torch.mean(d_fake) + lambda_gp * gradient_penalty

        d_loss.backward()
        optimizer_D.step()

        optimizer_G.zero_grad()

        if batch_idx % 5 == 0:
            #train Generator
            # Reuses `gen_data` from before the critic step (not detached this
            # time) and scores it with the just-updated critic.
            gen_data_with_features = gen_data.unsqueeze(1)
            d_fake = discriminator(date_ids, gen_data_with_features)
            g_loss = -torch.mean(d_fake)

            g_loss.backward()
            optimizer_G.step()

    # Reports the critic loss of the last batch of the epoch and the generator
    # loss of the most recent generator step.
    print(f"[Epoch {epoch}/{n_epochs}] [D loss: {d_loss.item()}] [G loss: {g_loss.item()}]")


[Epoch 0/2000] [D loss: -0.4212631285190582] [G loss: -29.218046188354492]
[Epoch 1/2000] [D loss: -0.49134179949760437] [G loss: -31.93711280822754]
[Epoch 2/2000] [D loss: -0.3286817669868469] [G loss: -28.7783145904541]
[Epoch 3/2000] [D loss: -0.4855578541755676] [G loss: -29.305410385131836]
[Epoch 4/2000] [D loss: -0.5687747597694397] [G loss: -29.171741485595703]
[Epoch 5/2000] [D loss: -0.38936614990234375] [G loss: -28.124967575073242]
[Epoch 6/2000] [D loss: -0.44582033157348633] [G loss: -29.790756225585938]
[Epoch 7/2000] [D loss: -0.5597432851791382] [G loss: -28.997989654541016]
[Epoch 8/2000] [D loss: -0.5521508455276489] [G loss: -30.713176727294922]
[Epoch 9/2000] [D loss: -0.3950687348842621] [G loss: -30.45928955078125]
[Epoch 10/2000] [D loss: -0.44600942730903625] [G loss: -28.244060516357422]
[Epoch 11/2000] [D loss: -0.5003863573074341] [G loss: -30.45521354675293]
[Epoch 12/2000] [D loss: -0.7103862166404724] [G loss: -29.88578987121582]
[Epoch 13/2000] [D loss:

### Generating the data

In [27]:
#setting no. of samples to same as original
n_samples = len(df)

# Put the generator in inference mode so its Dropout layers are disabled;
# sampling in train mode would randomly zero activations in every generated row.
generator.eval()

#generating synthetic data: one row per original date, no gradients needed
with torch.no_grad():
    z = torch.randn((n_samples, 1, input_size)).to(device)
    date_ids_all = torch.LongTensor(df['DateID'].values).to(device)  # Take date IDs from the entire dataset
    generated_data = generator(date_ids_all, z).cpu().numpy()

# Map the integer IDs back to the original date labels.
synthetic_dates = df['Date'].cat.categories[date_ids_all.cpu().numpy()].values

# Undo the [-1, 1] scaling. `scaler` must be the scaler that was fitted on the
# feature columns, in the same column order as the generator output - verify it
# has not been re-fitted on a different array before this cell runs.
synthetic_data = np.column_stack([synthetic_dates, scaler.inverse_transform(generated_data)])

#saving (columns are written with integer headers here)
pd.DataFrame(synthetic_data).to_csv('WGANGP_synth_data.csv', index=False)

In [31]:
#adding column names from original dataset
df = pd.read_csv('WGANGP_synth_data.csv')
# Read the original table from the same location used at the top of the
# notebook instead of a second copy in the working directory.
df2 = pd.read_csv('../Data/GOOG.csv')

# Names are assigned purely by position (pandas raises if the counts differ).
# NOTE(review): the generated columns follow the training feature order, which
# was built with 'Close' dropped - confirm each label lines up with the column
# it is attached to.
df.columns = df2.columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2497 entries, 0 to 2496
Data columns (total 22 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Date        2497 non-null   object 
 1   Open        2497 non-null   float64
 2   High        2497 non-null   float64
 3   Low         2497 non-null   float64
 4   Adj Close   2497 non-null   float64
 5   MSFT Close  2497 non-null   float64
 6   AMZN Close  2497 non-null   float64
 7   META Close  2497 non-null   float64
 8   AAPL Close  2497 non-null   float64
 9   7ma         2497 non-null   float64
 10  14ma        2497 non-null   float64
 11  21ma        2497 non-null   float64
 12  7atr        2497 non-null   float64
 13  14atr       2497 non-null   float64
 14  21atr       2497 non-null   float64
 15  7upper      2497 non-null   float64
 16  7lower      2497 non-null   float64
 17  14upper     2497 non-null   float64
 18  14lower     2497 non-null   float64
 19  21upper     2497 non-null  

In [32]:
# Write the relabelled synthetic table next to the original data.
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra unnamed leading column - confirm that is intended.
df.to_csv('../Data/WGANGP_synth_data.csv')