In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# create binary classification data
x_train, y_train = make_classification(n_samples=2000, n_features=20, n_informative=10)
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train)

# split data by class
x_train_0, y_train_0 = x_train[y_train == 0], y_train[y_train == 0]
x_train_1, y_train_1 = x_train[y_train == 1], y_train[y_train == 1]

# model params
g_input_size = 20
g_hidden_size = 150
g_output_size = 20
d_input_size = 20
d_hidden_size = 50
d_output_size = 1

# training params
batch_size = 20
d_learning_rate = 2e-4
g_learning_rate = 1e-4
optim_betas = (0.9, 0.999)
num_epochs = 2000
print_interval = 100
g_steps = 2

# batch generators
def batch_gen(batch_size, x, y=None):
    
    # create list of batches
    size = x.shape[0]
    idx_array = np.arange(size)
    n_batch = int(np.ceil(size / float(batch_size)))
    batches = [(int(i * batch_size), int(min(size, (i + 1) * batch_size))) for i in range(n_batch)]
    
    # use list of batches to yield from input data
    for batch_index, (start, end) in enumerate(batches):
        batch_ids = idx_array[start:end]
        if y is not None:
            yield Variable(torch.from_numpy(x[batch_ids])), Variable(torch.from_numpy(y[batch_ids]))
        else:
            yield Variable(torch.from_numpy(x[batch_ids]))

# generator
class Generator(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Generator, self).__init__()
        self.map1 = nn.Linear(input_size, hidden_size)
        self.map2 = nn.Linear(hidden_size, hidden_size)
        self.map3 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = F.elu(self.map1(x))
        x = F.sigmoid(self.map2(x))
        return self.map3(x)
    
# discriminator
class Discriminator(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Discriminator, self).__init__()
        self.map1 = nn.Linear(input_size, hidden_size)
        self.map2 = nn.Linear(hidden_size, hidden_size)
        self.map3 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = F.elu(self.map1(x))
        x = F.elu(self.map2(x))
        return F.sigmoid(self.map3(x))

# loss fn
loss = nn.BCELoss()

# models to predict class 0
G_0 = Generator(input_size=g_input_size, hidden_size=g_hidden_size, output_size=g_output_size)
D_0 = Discriminator(input_size=d_input_size, hidden_size=d_hidden_size, output_size=d_output_size)
d_opt_0 = optim.Adam(D_0.parameters(), lr=d_learning_rate, betas=optim_betas)
g_opt_0 = optim.Adam(G_0.parameters(), lr=g_learning_rate, betas=optim_betas)

# models to predict class 1
G_1 = Generator(input_size=g_input_size, hidden_size=g_hidden_size, output_size=g_output_size)
D_1 = Discriminator(input_size=d_input_size, hidden_size=d_hidden_size, output_size=d_output_size)
d_opt_1 = optim.Adam(D_1.parameters(), lr=d_learning_rate, betas=optim_betas)
g_opt_1 = optim.Adam(G_1.parameters(), lr=g_learning_rate, betas=optim_betas)

In [2]:
# train class 0 generator
for epoch in range(num_epochs):
    
    # instantiate batch loaders
    batch_loader = batch_gen(batch_size, x_train_0, y_train_0)
    for x, y in batch_loader:
        D_0.zero_grad()

        #  train D on real class 0 data
        d_real_decision = D_0(x.float())
        d_real_error = loss(d_real_decision, Variable(torch.ones((len(x), 1)))) # ones = true
        d_real_error.backward() # compute and store gradients, don't change params

        #  train D on fake class 0 data
        d_gen_input = Variable(torch.randn(len(x), g_input_size))
        d_fake_data = G_0(d_gen_input).detach()  # detach to avoid training G on these labels
        d_fake_decision = D_0(d_fake_data)
        d_fake_error = loss(d_fake_decision, Variable(torch.zeros((len(x), 1)))) # zeros = fake
        d_fake_error.backward()

        # optimize D params, based on stored gradients from backward() with real + fake
        d_opt_0.step()

        for _ in range(g_steps):
            G_0.zero_grad()

            gen_input = Variable(torch.randn(len(x), g_input_size))
            g_fake_data = G_0(gen_input)
            dg_fake_decision = D_0(g_fake_data)
            g_error = loss(dg_fake_decision, Variable(torch.ones((len(x), 1))))
            g_error.backward()
            g_opt_0.step()
    
    if epoch % print_interval == 0:
        print('Epoch {} \td_real_loss: {}\td_fake_loss: {}\tg_loss: {}'.format(
            epoch, 
            round(d_real_error.data[0], 4),
            round(d_fake_error.data[0], 4),
            round(g_error.data[0], 4)))

Epoch 0 	d_real_loss: 0.5579	d_fake_loss: 0.729	g_loss: 0.6619
Epoch 100 	d_real_loss: 0.3329	d_fake_loss: 0.1904	g_loss: 2.5496
Epoch 200 	d_real_loss: 0.8028	d_fake_loss: 0.4723	g_loss: 1.1961
Epoch 300 	d_real_loss: 0.6949	d_fake_loss: 0.5818	g_loss: 0.6019
Epoch 400 	d_real_loss: 0.6896	d_fake_loss: 0.7037	g_loss: 0.8548
Epoch 500 	d_real_loss: 0.9074	d_fake_loss: 0.4758	g_loss: 0.968
Epoch 600 	d_real_loss: 0.6603	d_fake_loss: 0.6009	g_loss: 0.839
Epoch 700 	d_real_loss: 0.8257	d_fake_loss: 0.7165	g_loss: 0.6754
Epoch 800 	d_real_loss: 0.7572	d_fake_loss: 0.6193	g_loss: 0.8509
Epoch 900 	d_real_loss: 0.7763	d_fake_loss: 0.8032	g_loss: 0.7087
Epoch 1000 	d_real_loss: 0.4685	d_fake_loss: 0.6639	g_loss: 0.6641
Epoch 1100 	d_real_loss: 1.0368	d_fake_loss: 0.7257	g_loss: 0.668
Epoch 1200 	d_real_loss: 0.4567	d_fake_loss: 0.654	g_loss: 0.8452
Epoch 1300 	d_real_loss: 0.4706	d_fake_loss: 0.6806	g_loss: 0.6873
Epoch 1400 	d_real_loss: 0.4858	d_fake_loss: 0.6761	g_loss: 0.8807
Epoch 1500 	

In [3]:
# train class 1 generator
for epoch in range(num_epochs):
    
    # instantiate batch loaders
    batch_loader = batch_gen(batch_size, x_train_1, y_train_1)
    for x, y in batch_loader:
        
        D_1.zero_grad()

        #  train D on real class 0 data
        d_real_decision = D_1(x.float())
        d_real_error = loss(d_real_decision, Variable(torch.ones((len(x), 1)))) # ones = true
        d_real_error.backward() # compute and store gradients, don't change params

        #  train D on fake class 0 data
        d_gen_input = Variable(torch.randn(len(x), g_input_size))
        d_fake_data = G_1(d_gen_input).detach()  # detach to avoid training G on these labels
        d_fake_decision = D_1(d_fake_data)
        d_fake_error = loss(d_fake_decision, Variable(torch.zeros((len(x), 1)))) # zeros = fake
        d_fake_error.backward()

        # optimize D params, based on stored gradients from backward() with real + fake
        d_opt_1.step()

        for _ in range(g_steps):
            G_1.zero_grad()

            gen_input = Variable(torch.randn(len(x), g_input_size))
            g_fake_data = G_1(gen_input)
            dg_fake_decision = D_1(g_fake_data)
            g_error = loss(dg_fake_decision, Variable(torch.ones((len(x), 1))))
            g_error.backward()
            g_opt_1.step()
    
    if epoch % print_interval == 0:
        print('Epoch {} \td_real_loss: {}\td_fake_loss: {}\tg_loss: {}'.format(
            epoch, 
            round(d_real_error.data[0], 4),
            round(d_fake_error.data[0], 4),
            round(g_error.data[0], 4)))

Epoch 0 	d_real_loss: 0.6441	d_fake_loss: 0.7062	g_loss: 0.683
Epoch 100 	d_real_loss: 1.0428	d_fake_loss: 0.4461	g_loss: 1.5447
Epoch 200 	d_real_loss: 0.4023	d_fake_loss: 0.4523	g_loss: 0.9873
Epoch 300 	d_real_loss: 0.9556	d_fake_loss: 0.6086	g_loss: 0.819
Epoch 400 	d_real_loss: 0.5581	d_fake_loss: 0.6403	g_loss: 0.7663
Epoch 500 	d_real_loss: 0.5373	d_fake_loss: 0.8199	g_loss: 0.6224
Epoch 600 	d_real_loss: 0.7304	d_fake_loss: 0.4689	g_loss: 0.9775
Epoch 700 	d_real_loss: 0.9036	d_fake_loss: 0.7135	g_loss: 0.7506
Epoch 800 	d_real_loss: 0.6516	d_fake_loss: 0.5061	g_loss: 0.9862
Epoch 900 	d_real_loss: 0.8509	d_fake_loss: 0.4352	g_loss: 1.0231
Epoch 1000 	d_real_loss: 0.7484	d_fake_loss: 0.6967	g_loss: 0.6285
Epoch 1100 	d_real_loss: 0.6737	d_fake_loss: 0.8008	g_loss: 0.623
Epoch 1200 	d_real_loss: 0.5728	d_fake_loss: 0.5715	g_loss: 0.8846
Epoch 1300 	d_real_loss: 0.6785	d_fake_loss: 0.7411	g_loss: 0.7032
Epoch 1400 	d_real_loss: 0.5944	d_fake_loss: 0.8443	g_loss: 0.575
Epoch 1500 

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# use logistic regression on original data
model = LogisticRegression()
preds = model.fit(x_train, y_train).predict(x_test)

# print results
print(classification_report(preds, y_test))
print(accuracy_score(preds, y_test))

             precision    recall  f1-score   support

          0       0.90      0.89      0.89       260
          1       0.88      0.89      0.88       240

avg / total       0.89      0.89      0.89       500

0.888


In [5]:
def generate_data(n):
    # generate synthetic data
    xf_train_0 = G_0(Variable(torch.randn(n, 20))).data.numpy()
    xf_train_1 = G_1(Variable(torch.randn(n, 20))).data.numpy()

    # create beefed up trainset
    x_train_syn = np.vstack([x_train_0, xf_train_0, x_train_1, xf_train_1])
    y_train_syn = np.hstack([np.zeros((len(x_train_0) + n)), np.ones((len(x_train_1) + n))]).reshape(-1, 1)
    
    # shuffle data
    data = np.hstack([x_train_syn, y_train_syn])
    data = pd.DataFrame(data).iloc[np.random.permutation(len(data))]
    return data.iloc[:, :-1].values, data.iloc[:, -1].values

# generate synthetic data
x_train_syn, y_train_syn = generate_data(500)

# use logistic regression on augmented data
preds = model.fit(x_train_syn, y_train_syn).predict(x_test)

# print results
print(classification_report(preds, y_test))
print(accuracy_score(preds, y_test))

             precision    recall  f1-score   support

        0.0       0.86      0.88      0.87       252
        1.0       0.88      0.85      0.87       248

avg / total       0.87      0.87      0.87       500

0.868
