In [9]:
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
import torch.nn.functional as F

In [3]:
# Load the source table from CSV into data_df.
# NOTE(review): this is an absolute, machine-specific Windows path, while the
# files written later in this notebook use paths relative to './data'.
# Consider a configurable, relative data directory so the notebook can be
# re-run on another machine — confirm the intended project layout first.
data_df = pd.read_csv('C:/Users/chanyoung/Desktop/RBFfitting/data/nonlinear15/non_linear15.csv')

In [4]:
# Preview the loaded frame (a bare last expression uses the notebook's rich display).
data_df

Unnamed: 0,V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14
0,1.361462,0.868128,1.119836,0.770762,1.543228,-0.484979,1.202227,1.812575,-1.328527,-0.452863,1.329738,1.401647,0.991044,-0.811218,1.467842
1,1.575887,0.502778,1.091785,0.330353,1.899189,-0.116958,0.987593,1.239788,-1.252347,-1.719436,0.306399,0.155447,1.327731,-1.011604,0.255159
2,1.333128,0.269990,1.162955,0.159699,1.646715,1.511054,1.234128,1.932573,-1.371554,-0.929414,0.733253,-1.374973,1.109616,-0.876114,0.516849
3,1.546209,0.297732,1.116530,0.482296,1.882979,-0.754731,0.762650,0.752762,-1.433568,-1.254173,1.054452,0.357797,1.062463,-0.555984,0.641373
4,1.661727,0.587748,1.013359,0.635559,1.464764,-0.546585,1.445773,1.857547,-1.568815,-0.723346,1.086332,1.656669,1.188205,-1.103284,1.055944
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-2.679979,-1.521583,0.961226,1.383197,-1.392430,-0.701504,-1.148679,0.558870,-0.177923,0.470893,0.907261,0.241810,-0.787315,-1.079895,1.162237
96,-1.977749,-1.478321,1.206590,0.352901,-0.829146,2.016453,-0.940043,-1.517751,-0.011627,-0.044789,0.697491,1.705942,-1.024586,-0.860960,0.453126
97,-2.235777,-1.149230,1.102117,0.655368,-1.112439,1.523969,-0.893636,-1.060217,0.021164,0.303327,1.165866,0.894137,-0.852924,-0.960449,0.966107
98,-2.246968,-0.699402,1.122137,0.350573,-1.382963,-0.901993,-0.138510,0.038130,-0.155820,0.145202,0.447543,0.029613,-0.210601,-1.063089,-0.596316


In [6]:
# Template "blank" record: a single all-zero row that shares data_df's columns.
# It is interleaved after every real row further down. The width is derived
# from data_df instead of being hard-coded, so this cell keeps working when the
# input table has a different number of columns.
empty_row = pd.DataFrame(np.zeros((1, len(data_df.columns))), columns=data_df.columns)
empty_row

Unnamed: 0,V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Interleave every real row of data_df with the all-zero template row, giving
# [row 0, zeros, row 1, zeros, ...] and therefore twice as many rows.
# The pieces are collected first and concatenated once: calling pd.concat
# inside the loop re-copies the growing frame on every iteration (quadratic).
# The original index labels are kept at this point (so they repeat).
interleaved_parts = []
for i in range(len(data_df)):
    interleaved_parts.append(data_df[i:i+1])  # one-row slice stays a DataFrame
    interleaved_parts.append(empty_row)
con_df = pd.concat(interleaved_parts)

In [8]:
# Renumber the interleaved rows with a fresh 0..N-1 index (old labels are
# dropped), then write the frame to disk without an index column.
con_df = con_df.reset_index(drop=True)
con_df.to_csv('./data/nonlinear_GAIN_200.csv', index=False)

In [10]:
# Device toggle: True runs the model on the GPU, False keeps everything on the CPU.
use_gpu = False
if use_gpu:
    # Make CUDA device 0 the current device.
    torch.cuda.set_device(0)

In [11]:
#%% System Parameters
mb_size = 128      # 1. mini-batch size (rows requested per training iteration)
p_miss = 0.5       # 2. missing rate used when drawing the missingness masks
p_hint = 0.9       # 3. hint rate used when drawing the hint matrix
alpha = 10         # 4. weight of the reconstruction (MSE) term in the generator loss
train_rate = 0.8   # 5. fraction of rows assigned to the training split

#%% Data

# Work on an independent float copy of the interleaved frame. `con_df.values`
# may hand back a view of the frame's own buffer, in which case the in-place
# scaling below would silently rewrite con_df (and re-running this cell would
# rescale already-scaled data); with pandas copy-on-write enabled that view is
# read-only and the in-place writes fail outright.
Data = con_df.to_numpy(dtype=float, copy=True)

# Parameters: number of rows and number of features
No, Dim = Data.shape

# Hidden state dimensions
H_Dim1 = Dim
H_Dim2 = Dim

# Normalization (0 to 1), applied column by column.
# Min_Val keeps each column's minimum; Max_Val keeps each column's maximum
# *after* the minimum was subtracted (i.e. the column's range).
Min_Val = Data.min(axis=0)
Data -= Min_Val
Max_Val = Data.max(axis=0)
Data /= (Max_Val + 1e-6)  # small epsilon avoids dividing by zero for constant columns

#%% Missing introducing
# One missing-rate entry per feature, stored as a (Dim, 1) column vector.
p_miss_vec = np.ones((Dim, 1)) * p_miss

# Mask matrix: 1.0 where the uniform draw exceeds the feature's missing rate,
# 0.0 otherwise. Downstream, entries with 1 keep the data value and entries
# with 0 are replaced (New_X = M * X + (1 - M) * Z).
# Draws are made one column at a time.
Missing = np.zeros((No, Dim))
for col in range(Dim):
    draws = np.random.uniform(0., 1., size = [len(Data),])
    Missing[:, col] = (draws > p_miss_vec[col]).astype(float)

    
#%% Train Test Division

# Shuffle the row order once, then cut it at the training fraction.
idx = np.random.permutation(No)

Train_No = int(No * train_rate)
Test_No = No - Train_No

train_rows = idx[:Train_No]
test_rows = idx[Train_No:]

# Train / Test Features
trainX, testX = Data[train_rows, :], Data[test_rows, :]

# Train / Test Missing Indicators (same row selection as the features)
trainM, testM = Missing[train_rows, :], Missing[test_rows, :]

#%% Necessary Functions

# 1. Xavier Initialization Definition
def xavier_init(size):
    """Draw an initial weight array from a zero-mean normal distribution.

    NumPy counterpart of the earlier TensorFlow helper, which called
    tf.random_normal with the same standard deviation.

    Args:
        size: Shape of the array to create; size[0] is used as the input
            dimension.

    Returns:
        np.ndarray of shape `size`, sampled with standard deviation
        1 / sqrt(size[0] / 2).
    """
    fan_in = size[0]
    sigma = 1. / np.sqrt(fan_in / 2.)
    return np.random.normal(loc = 0., scale = sigma, size = size)
    
# Hint Vector Generation
def sample_M(m, n, p):
    """Return an (m, n) float matrix of random 0/1 entries.

    Each entry is 1.0 when an independent uniform draw on [0, 1) is greater
    than `p`, and 0.0 otherwise.

    Args:
        m: Number of rows.
        n: Number of columns.
        p: Threshold compared against each uniform draw.

    Returns:
        np.ndarray of shape (m, n) containing 0.0 / 1.0 values.
    """
    draws = np.random.uniform(0., 1., size = [m, n])
    return (draws > p).astype(float)

In [12]:
#%% Discriminator parameters
# The target device is chosen once so the CPU and GPU paths share a single set
# of definitions. Truthiness is used so the choice agrees with the
# `if use_gpu:` check that selects the CUDA device.
device = "cuda" if use_gpu else "cpu"

# Three-layer network: (data + hint) -> H_Dim1 -> H_Dim2 -> Dim.
# Weights use xavier_init, biases start at zero; every tensor tracks gradients.
D_W1 = torch.tensor(xavier_init([Dim*2, H_Dim1]), requires_grad=True, device=device)     # Data + Hint as inputs
D_b1 = torch.tensor(np.zeros(shape = [H_Dim1]), requires_grad=True, device=device)

D_W2 = torch.tensor(xavier_init([H_Dim1, H_Dim2]), requires_grad=True, device=device)
D_b2 = torch.tensor(np.zeros(shape = [H_Dim2]), requires_grad=True, device=device)

D_W3 = torch.tensor(xavier_init([H_Dim2, Dim]), requires_grad=True, device=device)
D_b3 = torch.tensor(np.zeros(shape = [Dim]), requires_grad=True, device=device)       # Output is multi-variate

# Parameter list handed to the discriminator's optimizer.
theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]

#%% 2. Generator parameters
# The target device is chosen once so the CPU and GPU paths share a single set
# of definitions. Truthiness is used so the choice agrees with the
# `if use_gpu:` check that selects the CUDA device.
device = "cuda" if use_gpu else "cpu"

# Three-layer network: (data + mask) -> H_Dim1 -> H_Dim2 -> Dim.
# Weights use xavier_init, biases start at zero; every tensor tracks gradients.
G_W1 = torch.tensor(xavier_init([Dim*2, H_Dim1]), requires_grad=True, device=device)     # Data + Mask as inputs (Random Noises are in Missing Components)
G_b1 = torch.tensor(np.zeros(shape = [H_Dim1]), requires_grad=True, device=device)

G_W2 = torch.tensor(xavier_init([H_Dim1, H_Dim2]), requires_grad=True, device=device)
G_b2 = torch.tensor(np.zeros(shape = [H_Dim2]), requires_grad=True, device=device)

G_W3 = torch.tensor(xavier_init([H_Dim2, Dim]), requires_grad=True, device=device)
G_b3 = torch.tensor(np.zeros(shape = [Dim]), requires_grad=True, device=device)

# Parameter list handed to the generator's optimizer.
theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3]

#%% 1. Generator
def generator(new_x,m):
    """Run the generator network on data concatenated with its mask.

    Args:
        new_x: 2-D tensor of feature values.
        m: 2-D mask tensor with the same number of rows as `new_x`; it is
            concatenated to `new_x` along dim 1.

    Returns:
        Tensor of sigmoid outputs (values in [0, 1]); its width is set by
        G_W3 / G_b3.
    """
    net_in = torch.cat([new_x, m], dim = 1)  # Data + Mask concatenate
    hidden1 = F.relu(net_in @ G_W1 + G_b1)
    hidden2 = F.relu(hidden1 @ G_W2 + G_b2)
    return torch.sigmoid(hidden2 @ G_W3 + G_b3)  # [0,1] normalized output

#%% 2. Discriminator
def discriminator(new_x, h):
    """Run the discriminator network on data concatenated with its hint.

    Args:
        new_x: 2-D tensor of feature values.
        h: 2-D hint tensor with the same number of rows as `new_x`; it is
            concatenated to `new_x` along dim 1.

    Returns:
        Tensor of per-entry probabilities (sigmoid of the final logits).
    """
    net_in = torch.cat([new_x, h], dim = 1)  # Data + Hint concatenate
    hidden1 = F.relu(net_in @ D_W1 + D_b1)
    hidden2 = F.relu(hidden1 @ D_W2 + D_b2)
    logits = hidden2 @ D_W3 + D_b3
    return torch.sigmoid(logits)  # [0,1] probability output

#%% 3. Other functions
# Random sample generator for Z
def sample_Z(m, n):
    """Return an (m, n) array of noise drawn uniformly from [0, 0.01)."""
    noise = np.random.uniform(low = 0., high = 0.01, size = (m, n))
    return noise

# Mini-batch generation
def sample_idx(m, n):
    """Pick up to `n` distinct indices from range(m) in random order.

    A random permutation of 0..m-1 is drawn and its first `n` entries are
    returned, so fewer than `n` indices come back when m < n.
    """
    return np.random.permutation(m)[:n]

def discriminator_loss(M, New_X, H):
    """Cross-entropy loss that trains the discriminator to reproduce the mask.

    Args:
        M: Mask tensor; entries equal to 1 keep the value from `New_X`,
            entries equal to 0 take the generator's output.
        New_X: Input tensor fed to the generator together with `M`.
        H: Hint tensor passed to the discriminator.

    Returns:
        Scalar tensor: mean binary cross-entropy between the discriminator's
        output and `M`.
    """
    # The generator's output is a constant for this loss: only the
    # discriminator is updated from it, so the generator forward pass is run
    # without autograd tracking to avoid a needless backward pass through it.
    with torch.no_grad():
        G_sample = generator(New_X,M)
    # Combine with original data
    Hat_New_X = New_X * M + G_sample * (1-M)

    # Discriminator
    D_prob = discriminator(Hat_New_X, H)

    #%% Loss (1e-8 keeps the logarithms finite)
    D_loss = -torch.mean(M * torch.log(D_prob + 1e-8) + (1-M) * torch.log(1. - D_prob + 1e-8))
    return D_loss

def generator_loss(X, M, New_X, H):
    """Generator objective plus two reconstruction-error diagnostics.

    Args:
        X: Reference data tensor, used only for the error on replaced entries.
        M: Mask tensor; 1 keeps the value from `New_X`, 0 takes the generated one.
        New_X: Input tensor fed to the generator together with `M`.
        H: Hint tensor passed to the discriminator.

    Returns:
        Tuple (G_loss, MSE_train_loss, MSE_test_loss):
        G_loss is the adversarial term plus `alpha` times MSE_train_loss;
        MSE_train_loss is the squared error on entries where M == 1;
        MSE_test_loss is the squared error against `X` on entries where M == 0.
    """
    fake = generator(New_X, M)
    replaced = 1 - M  # 1 on the entries filled in by the generator

    # Keep masked-in values, fill the rest with generated ones.
    completed = New_X * M + fake * replaced
    d_out = discriminator(completed, H)

    # Adversarial term, evaluated on the generated entries only.
    adversarial = -torch.mean(replaced * torch.log(d_out + 1e-8))
    # Reconstruction error on the entries where M == 1.
    MSE_train_loss = torch.mean((M * New_X - M * fake)**2) / torch.mean(M)

    G_loss = adversarial + alpha * MSE_train_loss

    # Performance metric: error against X on the generated entries.
    MSE_test_loss = torch.mean((replaced * X - replaced * fake)**2) / torch.mean(replaced)
    return G_loss, MSE_train_loss, MSE_test_loss
    
def test_loss(X, M, New_X):
    """Evaluate the generator: error on replaced entries plus the raw output.

    Runs without autograd tracking, since nothing is optimized from this
    function; the returned tensors therefore carry no gradient history.

    Args:
        X: Reference data tensor.
        M: Mask tensor; entries equal to 0 are the ones scored.
        New_X: Input tensor fed to the generator together with `M`.

    Returns:
        Tuple (MSE_test_loss, G_sample): the squared error against `X` on
        entries where M == 0 (normalized by the mean of 1 - M), and the
        generator output.
    """
    with torch.no_grad():
        # Generator
        G_sample = generator(New_X,M)

        #%% MSE Performance metric
        MSE_test_loss = torch.mean(((1-M) * X - (1-M)*G_sample)**2) / torch.mean(1-M)
    return MSE_test_loss, G_sample

In [13]:
# One Adam optimizer per network, each with the default settings.
optimizer_D = torch.optim.Adam(params=theta_D)
optimizer_G = torch.optim.Adam(params=theta_G)

# Batch tensors are created on the device selected by use_gpu.
device = "cuda" if use_gpu else "cpu"

#%% Start Iterations
for it in tqdm(range(500)):

    #%% Inputs
    mb_idx = sample_idx(Train_No, mb_size)
    # sample_idx returns at most Train_No indices, so the actual batch can be
    # smaller than mb_size; size the noise and hint matrices from the batch
    # itself so their shapes always match X_mb / M_mb.
    batch_rows = len(mb_idx)
    X_mb = trainX[mb_idx,:]

    Z_mb = sample_Z(batch_rows, Dim)
    M_mb = trainM[mb_idx,:]
    H_mb1 = sample_M(batch_rows, Dim, 1-p_hint)
    H_mb = M_mb * H_mb1  # hint = mask with a random subset of entries zeroed

    New_X_mb = M_mb * X_mb + (1-M_mb) * Z_mb  # Missing Data Introduce

    X_mb = torch.tensor(X_mb, device=device)
    M_mb = torch.tensor(M_mb, device=device)
    H_mb = torch.tensor(H_mb, device=device)
    New_X_mb = torch.tensor(New_X_mb, device=device)

    # Discriminator step
    optimizer_D.zero_grad()
    D_loss_curr = discriminator_loss(M=M_mb, New_X=New_X_mb, H=H_mb)
    D_loss_curr.backward()
    optimizer_D.step()

    # Generator step
    optimizer_G.zero_grad()
    G_loss_curr, MSE_train_loss_curr, MSE_test_loss_curr = generator_loss(X=X_mb, M=M_mb, New_X=New_X_mb, H=H_mb)
    G_loss_curr.backward()
    optimizer_G.step()

    #%% Intermediate Losses (reported as RMSE every 100 iterations)
    if it % 100 == 0:
        print('Iter: {}'.format(it))
        print('Train_loss: {:.4}'.format(np.sqrt(MSE_train_loss_curr.item())))
        print('Test_loss: {:.4}'.format(np.sqrt(MSE_test_loss_curr.item())))
        print()

  4%|███▌                                                                            | 22/500 [00:00<00:03, 121.23it/s]

Iter: 0
Train_loss: 0.2317
Test_loss: 0.2248



 28%|██████████████████████                                                         | 140/500 [00:00<00:01, 217.20it/s]

Iter: 100
Train_loss: 0.1844
Test_loss: 0.2072



 47%|█████████████████████████████████████▎                                         | 236/500 [00:01<00:01, 213.42it/s]

Iter: 200
Train_loss: 0.1668
Test_loss: 0.2081



 67%|█████████████████████████████████████████████████████                          | 336/500 [00:01<00:00, 221.34it/s]

Iter: 300
Train_loss: 0.1413
Test_loss: 0.1986



 86%|███████████████████████████████████████████████████████████████████▊           | 429/500 [00:02<00:00, 191.62it/s]

Iter: 400
Train_loss: 0.1205
Test_loss: 0.192



100%|███████████████████████████████████████████████████████████████████████████████| 500/500 [00:02<00:00, 208.92it/s]


In [14]:
# Augmentation pass: run the trained generator over the whole normalized data
# set with a freshly drawn random mask (independent of the training mask).
augmentationX = Data
augmentationM = np.zeros_like(Data)
for i in range(Dim):
    A = np.random.uniform(0., 1., size = [len(Data),])
    B = A > p_miss_vec[i]
    augmentationM[:,i] = 1.*B

# The noise must have exactly the data's shape; derive it from Data rather
# than hard-coding the row/column counts.
Z_mb = sample_Z(*Data.shape)

M_mb = augmentationM
X_mb = augmentationX

New_X_mb = M_mb * X_mb + (1-M_mb) * Z_mb  # Missing Data Introduce

# Move the inputs to the device selected by use_gpu.
device = "cuda" if use_gpu else "cpu"
X_mb = torch.tensor(X_mb, device=device)
M_mb = torch.tensor(M_mb, device=device)
New_X_mb = torch.tensor(New_X_mb, device=device)

MSE_final, Sample = test_loss(X=X_mb, M=M_mb, New_X=New_X_mb)

# Keep the entries where the mask is 1 and fill the rest with generated values.
# NOTE(review): the result stays on the normalized 0-1 scale; Min_Val / Max_Val
# are not applied to undo the scaling in this cell — confirm that is intended.
imputed_data = M_mb * X_mb + (1-M_mb) * Sample

In [15]:
# Convert the result tensor to a DataFrame with the original column names.
# `.cpu()` is required before `.numpy()` when the tensor lives on the GPU
# (use_gpu = True) and is a no-op for CPU tensors.
imputed_df = pd.DataFrame(imputed_data.detach().cpu().numpy(), columns = con_df.columns)
imputed_df.to_csv('./data/augmentation_nonlinear_200.csv', index = False)

In [16]:
# Preview the augmented frame (a bare last expression uses the notebook's rich display).
imputed_df

Unnamed: 0,V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14
0,0.902328,0.744631,0.961920,0.764719,0.802341,0.410103,0.492393,0.871903,0.237426,0.217737,0.929120,0.401731,0.873216,0.067608,0.356340
1,0.661297,0.507563,0.555601,0.518825,0.459027,0.515734,0.673324,0.405531,0.504884,0.404061,0.267801,0.569488,0.500028,0.103868,0.483817
2,0.950203,0.932666,0.951742,0.586985,0.623344,0.363981,0.730417,0.724526,0.111115,0.092070,0.193591,0.474337,1.000000,0.086022,0.548998
3,0.598356,0.485068,0.376139,0.482834,0.333248,0.296855,0.467389,0.359262,0.262642,0.494785,0.205853,0.464228,0.500028,0.365989,0.483817
4,0.691889,0.529037,0.977566,0.879460,0.929429,0.734687,0.407692,0.902778,0.192902,0.288328,0.759700,0.293532,0.099020,0.043843,0.615848
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.418495,0.565607,0.336184,0.688447,0.469148,0.515734,0.467389,0.405531,0.385440,0.456917,0.551435,0.569488,0.500028,0.160769,0.350501
196,0.096678,0.669449,0.962755,0.492757,0.670638,0.575864,0.458032,0.558005,0.372999,0.528793,0.678550,0.592507,0.420724,0.146065,0.474865
197,0.492259,0.485068,0.555601,0.518825,0.380697,0.328721,0.450895,0.405531,0.457953,0.494785,0.551435,0.456688,0.500028,0.398280,0.483817
198,0.152876,0.175880,0.324928,0.693180,0.093709,0.947943,0.295060,0.202725,0.375984,0.563760,0.550035,0.916185,0.290232,0.375856,0.267174
