In [1]:
import numpy as np
import json
import pandas as pd
from sdgym.synthesizers.tgan import *
from sdgym.synthesizers.tablegan import *
from sdgym.synthesizers.medgan import *
from sdgym.synthesizers.utils import *

In [14]:
class GeneralTransformer(Transformer):
    """Continuous and ordinal columns are normalized to [0, 1].
    Discrete columns are converted to a one-hot vector.
    """
    def __init__(self, meta, act='sigmoid'):
        self.act = act
        self.meta = meta
        self.output_dim = None

    def fit(self, data):
        if self.meta is None:
            self.meta = self.get_metadata(data)
#         self.columns = data.columns
        self.columns = [col['name'] for col in self.meta]
        self.output_dim = 0
        for info in self.meta:
            if info['type'] in [CONTINUOUS, ORDINAL]:
                self.output_dim += 1
            else:
                self.output_dim += info['size']

    def transform(self, data):
        data_t = []
        self.output_info = []
        for id_, info in enumerate(self.meta):
            col = data[:, id_]
            if info['type'] == CONTINUOUS:
                col = (col - (info['min'])) / (info['max'] - info['min'])
                if self.act == 'tanh':
                    col = col * 2 - 1
                data_t.append(col.reshape([-1, 1]))
                self.output_info.append((1, self.act))

            elif info['type'] == ORDINAL:
                col = col / info['size']
                if self.act == 'tanh':
                    col = col * 2 - 1
                data_t.append(col.reshape([-1, 1]))
                self.output_info.append((1, self.act))

            else:
                col_t = np.zeros([len(data), info['size']])
                col_t[np.arange(len(data)), col.astype('int32')] = 1
                data_t.append(col_t)
                self.output_info.append((info['size'], 'softmax'))

        return np.concatenate(data_t, axis=1)

    def inverse_transform(self, data):
        data_t = np.zeros([len(data), len(self.meta)])

        data = data.copy()
        for id_, info in enumerate(self.meta):
            if info['type'] == CONTINUOUS:
                current = data[:, 0]
                data = data[:, 1:]

                if self.act == 'tanh':
                    current = (current + 1) / 2

                current = np.clip(current, 0, 1)
                data_t[:, id_] = current * (info['max'] - info['min']) + info['min']

            elif info['type'] == ORDINAL:
                current = data[:, 0]
                data = data[:, 1:]

                if self.act == 'tanh':
                    current = (current + 1) / 2

                current = current * info['size']
                current = np.round(current).clip(0, info['size'] - 1)
                data_t[:, id_] = current
            else:
                current = data[:, :info['size']]
                data = data[:, info['size']:]
                data_t[:, id_] = np.argmax(current, axis=1)

        return data_t

In [12]:
class TableganTransformer(Transformer):

    def __init__(self, meta, side):
        self.meta = meta
        self.minn = np.zeros(len(meta))
        self.maxx = np.zeros(len(meta))
        for i in range(len(meta)):
            if meta[i]['type'] == CONTINUOUS:
                self.minn[i] = meta[i]['min'] - 1e-3
                self.maxx[i] = meta[i]['max'] + 1e-3
            else:
                self.minn[i] = -1e-3
                self.maxx[i] = meta[i]['size'] - 1 + 1e-3

        self.height = side

    def fit(self, data):
        pass

    def transform(self, data):
        data = data.copy().astype('float32')
        data = (data - self.minn) / (self.maxx - self.minn) * 2 - 1
        if self.height * self.height > len(data[0]):
            padding = np.zeros((len(data), self.height * self.height - len(data[0])))
            data = np.concatenate([data, padding], axis=1)
        return data.reshape(-1, 1, self.height, self.height)

    def inverse_transform(self, data):
        data = data.reshape(-1, self.height * self.height)

        data_t = np.zeros([len(data), len(self.meta)])

        for id_, info in enumerate(self.meta):
            numerator = (data[:, id_].reshape([-1]) + 1)
            # denominator = 2
            # addition = (self.maxx[id_] - self.minn[id_]) + self.minn[id_]

            data_t[:, id_] = (numerator / 2) * (self.maxx[id_] - self.minn[id_]) + self.minn[id_]
            if info['type'] in [CATEGORICAL, ORDINAL]:
                data_t[:, id_] = np.round(data_t[:, id_])

        return data_t

In [None]:
a = (x - min) / (max - min)
b = a / (max - min) + min

In [2]:
def init(meta):
#     self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    sides = [4, 8, 16, 24, 32]
    side = -1
    for i in sides:
        if i * i >= len(meta):
            side = i
            break
    return side

In [19]:
dataset = 'berka'
data = pd.read_csv(f'../data/{dataset}/{dataset}_sdgym.csv')
meta = json.load(open(f'data/real/{dataset}.json', 'r'))
fake_data = pd.read_csv(f'/mnt/samples/sample_berka_<sdgym.synthesizers.tablegan.TableganSynthesizer object at 0x7fec32eccda0>_91.csv')
side = init(meta)

In [None]:
                data_t[:, id_] = [info['i2s'][x] for x in [data_t[:, id_]]]


In [20]:
meta

[{'max': 3682987.0, 'min': 1.0, 'name': 'trans_id', 'type': 'continuous'},
 {'max': 11382.0, 'min': 1.0, 'name': 'account_id', 'type': 'continuous'},
 {'max': 87400.0, 'min': 0.0, 'name': 'trans_amount', 'type': 'continuous'},
 {'max': 209637.0,
  'min': -41125.7,
  'name': 'balance_after_trans',
  'type': 'continuous'},
 {'i2s': ['WITHDRAWAL', 'CREDIT', 'UNKNOWN'],
  'name': 'trans_type',
  'size': 3,
  'type': 'categorical'},
 {'i2s': ['WITHDRAWAL_IN_CASH',
   'REMITTANCE_TO_OTHER_BANK',
   'UNKNOWN',
   'CREDIT_IN_CASH',
   'COLLECTION_FROM_OTHER_BANK',
   'CC_WITHDRAWAL'],
  'name': 'trans_operation',
  'size': 6,
  'type': 'categorical'},
 {'i2s': ['UNKNOWN',
   'INTEREST_CREDITED',
   'PAYMENT_FOR_STATEMENT',
   'HOUSEHOLD',
   'OLD_AGE_PENSION',
   'INSURANCE_PAYMENT',
   'LOAN_PAYMENT',
   'SANCTION_INTEREST'],
  'name': 'trans_k_symbol',
  'size': 8,
  'type': 'categorical'},
 {'max': 2190.0, 'min': 0.0, 'name': 'trans_date', 'type': 'continuous'}]

In [24]:
[meta[5]['i2s'][x] for x in [2, 2, 3, 4, 2]]

['UNKNOWN',
 'UNKNOWN',
 'CREDIT_IN_CASH',
 'COLLECTION_FROM_OTHER_BANK',
 'UNKNOWN']

In [16]:
transformer = GeneralTransformer(meta)
train_data = transformer.transform(data.values)
# print(train_data[:5])
real_data = transformer.inverse_transform(train_data)
print(data.head().to_string())
pd.DataFrame(real_data).head()

           0       1        2        3    4    5    6       7
0   806940.0  2749.0   2200.0  31144.2  0.0  0.0  0.0   807.0
1   181540.0   616.0     14.6  19985.6  0.0  0.0  2.0  1915.0
2  2292564.0  7562.0   6450.2  73820.0  0.0  1.0  6.0  1503.0
3   501012.0  1707.0  11969.0  26113.6  1.0  4.0  0.0  2013.0
4   515967.0  1763.0  17200.0  40477.2  0.0  0.0  0.0  2060.0


Unnamed: 0,0,1,2,3,4,5,6,7
0,806940.0,2749.0,2200.0,31144.2,0.0,0.0,0.0,807.0
1,181540.0,616.0,14.6,19985.6,0.0,0.0,2.0,1915.0
2,2292564.0,7562.0,6450.2,73820.0,0.0,1.0,6.0,1503.0
3,501012.0,1707.0,11969.0,26113.6,1.0,4.0,0.0,2013.0
4,515967.0,1763.0,17200.0,40477.2,0.0,0.0,0.0,2060.0


In [17]:
transformer = TableganTransformer(meta, side)
train_data = transformer.transform(data.values)
train_data
real_data = transformer.inverse_transform(train_data)
print(data.head().to_string())
pd.DataFrame(real_data).head()

           0       1        2        3    4    5    6       7
0   806940.0  2749.0   2200.0  31144.2  0.0  0.0  0.0   807.0
1   181540.0   616.0     14.6  19985.6  0.0  0.0  2.0  1915.0
2  2292564.0  7562.0   6450.2  73820.0  0.0  1.0  6.0  1503.0
3   501012.0  1707.0  11969.0  26113.6  1.0  4.0  0.0  2013.0
4   515967.0  1763.0  17200.0  40477.2  0.0  0.0  0.0  2060.0


Unnamed: 0,0,1,2,3,4,5,6,7
0,806940.0,2749.0,2200.0,31144.199219,-0.0,0.0,-0.0,807.0
1,181540.0,616.0,14.6,19985.599609,-0.0,0.0,2.0,1915.0
2,2292564.0,7562.0,6450.200195,73820.0,-0.0,1.0,6.0,1503.0
3,501012.0,1707.0,11969.0,26113.599609,1.0,4.0,-0.0,2013.0
4,515967.0,1763.0,17200.0,40477.199219,-0.0,0.0,-0.0,2060.0
