# Adversarial Network Tuning

Definitions:
* Network parameters: $\Theta$
* CLgen model: $G(\Theta)$
* Discriminator model: $D(G(\Theta))$

From candidate params $\Theta = \{\Theta_1, \Theta_2, \ldots, \Theta_n\}$,
find the params $\Theta_{i}$ which minimize the accuracy of the discriminator, i.e. bring it as close as possible to chance ($0.5$).

Discriminator functions:
* Human-or-robot? Distinguish between programs from GitHub and synthesized code.
* Is it *useful*? Determine closest distance to benchmark features (would have to be a different set of benchmarks).


1. $\epsilon = 0.05$
1. $\Theta = \mathrm{newParams}()$
1. while $|D(G(\Theta)) - 0.5| > \epsilon$:
1. `    ` $\Theta = \mathrm{newParams}()$

## GitHub Corpus

In [1]:
from clgen.corpus import Corpus

# Describe the GitHub corpus, then load it. The bare name on the last
# line makes the notebook display the corpus summary.
_corpus_spec = {
    "path": "~/data/github",
    "vocabulary": "greedy",
}
corpus = Corpus.from_json(_corpus_spec)
corpus

corpus of 3950 files

In [5]:
import sqlite3

def corpus_iter(kernels_db):
    """
    Read the contents of every PreprocessedFiles row whose status is 0.

    Arguments:
        kernels_db (str): Path to an SQLite database containing a
            PreprocessedFiles table with `contents` and `status` columns.

    Returns:
        list: The `contents` value of each matching row, in the order
            returned by the query.

    Raises:
        sqlite3.Error: If the database cannot be opened or queried. The
            cursor and connection are closed before the error propagates.
    """
    from contextlib import closing

    # closing() guarantees both handles are released even when the query
    # fails, e.g. because the table is missing.
    with closing(sqlite3.connect(kernels_db)) as db, closing(db.cursor()) as c:
        c.execute("SELECT contents FROM PreprocessedFiles WHERE status=0")
        return [row[0] for row in c.fetchall()]

def encode_srcs(srcs, atomizer):
    """
    Encode each source string with the given atomizer.

    Arguments:
        srcs (iterable of str): Source texts to encode.
        atomizer: Object providing an `atomize(text)` method, such as
            `corpus.atomizer`.

    Returns:
        list: One `atomizer.atomize(src)` result per input, in input
            order. An empty input gives an empty list.
    """
    return [atomizer.atomize(src) for src in srcs]

In [None]:
import pickle

from labm8 import fs

# Raw source text of every status-0 file in the corpus kernel database.
github_srcs = corpus_iter(fs.path(corpus.contentcache.path, "kernels.db"))

# The encoded sequences are cached on disk under a name derived from the
# corpus hash, so a later run reuses them instead of re-atomizing.
inpath = fs.path("data", "encoded-" + corpus.hash + ".pkl")
if fs.exists(inpath):
    # NOTE: pickle is only safe on files this notebook wrote itself.
    with open(inpath, "rb") as infile:
        github_seqs = pickle.load(infile)
else:
    github_seqs = [corpus.atomizer.atomize(x) for x in github_srcs]
    with open(inpath, "wb") as outfile:
        pickle.dump(github_seqs, outfile)
    # Report the cache location (the path, not the file object repr).
    print("cached", inpath)

In [None]:
import numpy as np
import pandas as pd

# Length of every encoded GitHub sequence.
lens = np.array(list(map(len, github_seqs)))

# One table row per decile (0, 10, ..., 100): the percentile and the
# sequence length at that percentile, rounded to the nearest integer.
_data = [
    {
        "Percentile": pct,
        "Sequence Length": int(round(np.percentile(lens, pct))),
    }
    for pct in range(0, 101, 10)
]
data = pd.DataFrame(_data, columns=["Percentile", "Sequence Length"])
data

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Sequence length (log-scaled y axis) against percentile for the corpus.
_xs, _ys = data["Percentile"], data["Sequence Length"]
plt.semilogy(_xs, _ys)
plt.title("GitHub Corpus")
plt.xlabel("Percentile")
plt.ylabel("Sequence Length")

In [None]:
import scipy.stats

seq_length = 1024
# kind="weak" is the percentage of sequences with length <= seq_length.
# Those fit without truncation, so the remainder is exactly the share
# that will be cut. The default kind="rank" would count sequences of
# exactly seq_length only by half.
p1 = scipy.stats.percentileofscore(lens, seq_length, kind="weak")
p2 = 100 - p1
print("""\
A sequence length of {seq_length} is the {p1:.1f}% percentile of the GitHub corpus.
{p2:.1f}% of sequences will be truncated.""".format(
    seq_length=seq_length, p1=p1, p2=p2))

In [None]:
# Model input: the corpus vocabulary size plus one extra slot for the
# pad value.
vocab_size = 1 + corpus.vocab_size

# Network shape: embedding width and the fixed input sequence length.
embedding_vector_length = 64
seq_length = 1024

# Training schedule.
batch_size = 64
nb_epoch = 50

## Discriminator Model

In [None]:
from keras.layers import Input, Dropout, Embedding, merge, LSTM, Dense
from keras.layers.normalization import BatchNormalization
from keras.models import Model, Sequential, load_model
from keras.preprocessing.sequence import pad_sequences
from keras.utils.visualize_util import model_to_dot
from keras.wrappers.scikit_learn import KerasClassifier

def create_model():
    """
    Build and compile the discriminator network.

    The architecture is an embedding layer followed by two stacked
    64-unit LSTMs and a 2-way softmax output. It reads the module-level
    `seq_length`, `vocab_size` and `embedding_vector_length` values at
    call time.

    Returns:
        Model: A compiled Keras model taking the integer sequence input
            "code_in" of shape (seq_length,) and producing the 2-unit
            output "out".
    """
    code_in = Input(shape=(seq_length,), dtype="int32", name="code_in")
    x = Embedding(output_dim=embedding_vector_length, input_dim=vocab_size,
                  input_length=seq_length)(code_in)
    # The first LSTM returns the full sequence so the second can consume it.
    x = LSTM(64, consume_less="mem", return_sequences=True)(x)
    x = LSTM(64, consume_less="mem")(x)
    # categorical_crossentropy expects one probability distribution over
    # the two classes, so the output uses softmax rather than independent
    # sigmoids.
    out = Dense(2, activation="softmax", name="out")(x)

    model = Model(input=code_in, output=out)
    model.compile(optimizer="adam", loss="categorical_crossentropy",
                  metrics=['accuracy'])
    return model

# Instantiate the compiled (not yet trained) discriminator.
model = create_model()

# TODO: train, evaluate and save the model. Sketch below; X_train, y_train
# and outpath are not defined in this part of the notebook.
# # train model
# model.fit(X_train, y_train,
#           nb_epoch=nb_epoch, batch_size=batch_size,
#           verbose=1, shuffle=True)

# # predict with model (predict on the inputs; one argmax per output row)
# predictions = np.array(model.predict(X_train, batch_size=batch_size, verbose=0))
# predictions = [np.argmax(x) for x in predictions]

# model.save(outpath)