In [None]:
#Imports
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler

from ctgan import CTGAN

In [2]:
#loading the data
# Reads the source dataset from a CSV located relative to the notebook's working
# directory. Later cells expect a 'Date' column plus the numeric columns listed
# in continuous_columns.
real_data = pd.read_csv('../Data/GOOG.csv')

In [3]:
#function to make sequences of the time series data
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a time series as a table of lagged windows.

    Each output row contains the ``n_in`` previous observations followed by
    the current observation and ``n_out - 1`` future observations.

    Parameters
    ----------
    data : list, numpy.ndarray, pandas.Series or pandas.DataFrame
        Observations ordered in time; one row per timestep. 1-D input is
        treated as a single variable.
    n_in : int, default 1
        Number of lagged timesteps (t-n_in ... t-1) to include.
    n_out : int, default 1
        Number of forecast timesteps (t ... t+n_out-1) to include.
    dropnan : bool, default True
        Drop the rows that contain NaN (which includes the NaN introduced by
        shifting at the start/end of the series).

    Returns
    -------
    pandas.DataFrame
        Columns are named ``var<j>(t-<i>)``, ``var<j>(t)`` and ``var<j>(t+<i>)``
        with ``j`` counting variables from 1. Row labels of the input frame are
        preserved, so surviving rows keep their original positions as index.
    """
    df = pd.DataFrame(data)
    # Count variables from the frame itself rather than from the input's type:
    # a nested list has more than one column, and 1-D arrays/Series have no
    # shape[1], so the frame is the only reliable source.
    n_vars = df.shape[1]
    cols, names = [], []

    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += ['var%d(t-%d)' % (j + 1, i) for j in range(n_vars)]

    # forecast sequence (t, t+1, ... t+n)
    for i in range(n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, i) for j in range(n_vars)]

    # put all shifted copies side by side
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg = agg.dropna()
    return agg

In [4]:
#separating the Date column
# Keep the dates on the side (re-attached after sampling) and build a
# date-free copy of the frame for scaling and sequence construction.
date_column = real_data['Date']
real_data_without_date = real_data.drop(columns=['Date'])

In [10]:
#normalizing the continuous data columns
continuous_columns = ['Open', 'High', 'Low', 'Adj Close', 'MSFT Close', 'AMZN Close',
       'META Close', 'AAPL Close', '7ma', '14ma', '21ma', '7atr', '14atr',
       '21atr', '7upper', '7lower', '14upper', '14lower', '21upper', '21lower',
       'Close']

# Fit on the untouched raw frame (real_data) rather than on
# real_data_without_date, which this cell overwrites. That keeps the cell
# idempotent: re-running it would otherwise refit the scaler on already-scaled
# values, and the later scaler.inverse_transform would no longer restore the
# original units.
scaler = MinMaxScaler()
real_data_without_date[continuous_columns] = scaler.fit_transform(real_data[continuous_columns])

In [11]:
#convert the data into 30 day sequences
# Each resulting row holds the sequence_length previous timesteps followed by
# the current timestep (n_out keeps its default of 1).
sequence_length = 30
data_sequences = series_to_supervised(
    real_data_without_date.to_numpy(),
    n_in=sequence_length,
)

In [13]:
#training CTGAN on the sequence data
# Builds a CTGAN model configured for 500 epochs and fits it on the windowed
# frame; only the data is passed to fit (no other arguments).
# NOTE(review): no random seed is set in the visible cells, so training and
# sampling results presumably differ between runs - confirm whether that is intended.
ctgan = CTGAN(epochs=500)
ctgan.fit(data_sequences)

In [None]:
#generating synthetic sequences (as many rows as there are real sequences)
synthetic_sequences = ctgan.sample(len(data_sequences))

#converting sequences back to time series format
# Only the current-timestep columns are kept; they are named 'var<j>(t)',
# while lagged columns are named 'var<j>(t-<i>)'.
current_timestep_cols = [col for col in synthetic_sequences.columns if col.endswith('(t)')]
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of synthetic_sequences.
synthetic_data = synthetic_sequences[current_timestep_cols].copy()
synthetic_data.columns = real_data_without_date.columns  # renaming columns back to original

#assigning dates
# data_sequences keeps the original row positions of the rows that survived
# dropna, i.e. the rows whose '(t)' values are complete. With n_in lags the
# first such row is position sequence_length, not sequence_length-1.
# Positional lookup plus to_numpy() avoids any index alignment on assignment.
synthetic_data['Date'] = date_column.iloc[data_sequences.index.to_numpy()].to_numpy()

#reversing the normalization
synthetic_data[continuous_columns] = scaler.inverse_transform(synthetic_data[continuous_columns])

#reordering the columns to original order
synthetic_data = synthetic_data[real_data.columns]

In [15]:
#saving the synthetic data as CSV (path is relative to the notebook's working directory)
# NOTE(review): to_csv is called without an index argument, so pandas' default
# applies and the row index is presumably written as an extra leading column -
# confirm that downstream readers expect it.
synthetic_data.to_csv("../Data/CTGAN_synth_data.csv")

#seeing what it looks like: column names, non-null counts and dtypes
synthetic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2467 entries, 0 to 2466
Data columns (total 22 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Date        2467 non-null   object 
 1   Open        2467 non-null   float64
 2   High        2467 non-null   float64
 3   Low         2467 non-null   float64
 4   Adj Close   2467 non-null   float64
 5   MSFT Close  2467 non-null   float64
 6   AMZN Close  2467 non-null   float64
 7   META Close  2467 non-null   float64
 8   AAPL Close  2467 non-null   float64
 9   7ma         2467 non-null   float64
 10  14ma        2467 non-null   float64
 11  21ma        2467 non-null   float64
 12  7atr        2467 non-null   float64
 13  14atr       2467 non-null   float64
 14  21atr       2467 non-null   float64
 15  7upper      2467 non-null   float64
 16  7lower      2467 non-null   float64
 17  14upper     2467 non-null   float64
 18  14lower     2467 non-null   float64
 19  21upper     2467 non-null  