# Sampling Efficiency and Throughput
Investigating sample success rate (efficiency) and sampling throughput as a function of network size.

In [1]:
# preamble
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from time import time
from labm8 import fs
from labm8 import system
from labm8.time import nowstr

import clgen
from clgen import corpus
from clgen import dbutil
from clgen import sampler
from clgen import model
from clgen import preprocess

**Experimental setup:**

In [2]:
# Layer widths to sweep: successive powers of two, 64 through 2048.
rnn_sizes = [2 ** exponent for exponent in range(6, 12)]

# Corpus and training options shared by every model in the sweep.
corpus_opts = {"path": "~/data/github"}
training_opts = {"model_type": "lstm", "num_layers": 3, "max_epochs": 3}

# Options for the sampled kernels: argument types, length cap, temperature.
kernel_opts = {
    "args": ["__global float*"] * 3 + ["const int"],
    "max_length": 5000,
    "temperature": 1,
}
# A single batch of 5000 samples, with both checkers switched off.
sampling_opts = {
    "batch_size": 5000,
    "max_batches": 1,
    "static_checker": False,
    "dynamic_checker": False,
}
s = sampler.from_json({"kernels": kernel_opts, "sampler": sampling_opts})

# Load the corpus and report its size alongside the host platform details.
c = corpus.Corpus.from_json(corpus_opts)
print("Corpus size:", c.size)
print("Vocab size: ", c.vocab_size)
print()
clgen.platform_info()

Corpus size: 20271852
Vocab size:  92

CLgen:      0.2.1 (with CUDA)
Platform:   Linux
Memory:     32057 MB

Device:     GPU GeForce GTX 1080
Compute #.: 20
Frequency:  1733 HZ
Memory:     8113 MB
Driver:     367.57

Device:     GPU GeForce GTX 1080
Compute #.: 20
Frequency:  1733 HZ
Memory:     8113 MB
Driver:     367.57


**Experimental methodology:**

In [None]:
def _safe_div(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def evaluate(model, sampler):
    """
    Train a model from scratch, sample from it, and measure sampling efficiency.

    Note that the parameter names shadow the imported ``model`` and
    ``sampler`` modules inside this function.

    Arguments:
        model: Model to train. Its checkpoint cache is emptied first, so
            training time is always measured.
        sampler: Sampler to draw kernels with. Its sample cache for
            ``model`` is emptied before sampling.

    Returns:
        dict: Timings ("training_time", "sampling_time", in seconds),
            kernel counts ("num_kernels", "num_good_kernels"), rates
            ("discard_rate" = fraction of kernels that are not good,
            "ugly_rate" = fraction of kernels with preprocess status 2),
            character counts ("total_charcount", "good_charcount"),
            "efficiency" (good chars / total chars), "throughput"
            (good chars / second of sampling), and the cache paths
            ("corpus_dir", "model_dir", "sampler_dir"). Ratios whose
            denominator is zero are reported as 0.0.
    """
    # train from a cold checkpoint cache and time it
    model.cache.empty()
    tstart = time()
    model.train(quiet=True)
    training_time = time() - tstart

    # sample from a cold sample cache and time it
    sampler.cache(model).empty()
    tstart = time()
    sampler.sample(model, quiet=True)
    elapsed = time() - tstart

    # preprocess the sampled kernels so they can be classified
    sample_db = sampler.cache(model)["kernels.db"]
    preprocess.preprocess_db(sample_db)

    num_kernels = dbutil.num_rows_in(sample_db, "ContentFiles")
    num_good_kernels = dbutil.num_good_kernels(sample_db)
    num_ugly_kernels = dbutil.num_rows_in(sample_db, "PreprocessedFiles",
                                          "WHERE status=2")

    # Both rates are fractions of all sampled kernels. With no kernels there
    # is nothing to discard, so both rates are 0 rather than a division error.
    if num_kernels:
        discard_rate = 1 - (num_good_kernels / num_kernels)
    else:
        discard_rate = 0.0
    ugly_rate = _safe_div(num_ugly_kernels, num_kernels)

    total_charcount = dbutil.cc(sample_db, "ContentFiles")
    good_charcount = dbutil.cc(sample_db, "PreprocessedFiles",
                               condition="WHERE status=0")

    efficiency = _safe_div(good_charcount, total_charcount)
    throughput = _safe_div(good_charcount, elapsed)

    return {
        "training_time": training_time,
        "sampling_time": elapsed,
        "num_kernels": num_kernels,
        "num_good_kernels": num_good_kernels,
        "discard_rate": discard_rate,
        "ugly_rate": ugly_rate,
        "total_charcount": total_charcount,
        "good_charcount": good_charcount,
        "efficiency": efficiency,  # good_chars / total_chars
        "throughput": throughput,  # good_chars / second
        "corpus_dir": model.corpus.cache.path,
        "model_dir": model.cache.path,
        "sampler_dir": sampler.cache(model).path,
    }

**Experimental results:**

In [None]:
# Results are written to a per-host JSON file, which is deleted up front so
# that every run starts from an empty result set.
data_file = "./sampling-{host}.json".format(host=system.HOSTNAME)
fs.rm(data_file)  # reset experimental data
data = clgen.load_json_file(data_file, must_exist=False)

for rnn_size in rnn_sizes:
    size_key = str(rnn_size)  # results are keyed by the size as a string
    if size_key not in data:
        print("collecting result for", rnn_size, "...")
        training_opts["rnn_size"] = rnn_size
        model_spec = {"corpus": corpus_opts, "train_opts": training_opts}
        m = model.from_json(model_spec)
        data[size_key] = evaluate(m, s)
    result_json = clgen.format_json(data[size_key])
    print("result", rnn_size, result_json)
    # write everything collected so far back to disk after each size
    clgen.write_file(data_file, clgen.format_json(data))
print("done", nowstr())

collecting result for 64 ...


**Visualisation of results:**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from labm8 import viz
plt.style.use(["seaborn-white", "seaborn-paper"])


def plot_metric(ax, results, metric, title, ylabel):
    """
    Draw one bar per network size for a single metric.

    Arguments:
        ax: Matplotlib axes to draw on.
        results (dict): Maps network size (as a string) to the result dict
            stored for that size.
        metric (callable): Takes one result dict, returns the bar height.
        title (str): Panel title.
        ylabel (str): Y-axis label.
    """
    # keys are strings, so sort numerically rather than lexically
    sizes = sorted(results, key=int)
    heights = [metric(results[size]) for size in sizes]
    # keyword arguments: newer seaborn releases reject positional x/y
    sns.barplot(x=[int(size) for size in sizes], y=heights, ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Nodes per layer")
    ax.set_ylabel(ylabel)


max_epochs = training_opts["max_epochs"]

# (title, y-axis label, metric) for each panel, in row-major order
panels = [
    ("Time per. epoch", "Time (seconds)",
     lambda r: r["training_time"] / max_epochs),
    ("Time per. sample", "Time (seconds)",
     lambda r: r["sampling_time"] / r["num_kernels"]),
    ("Efficiency", "% good chars",
     lambda r: r["efficiency"] * 100),
    ("Throughput", "good chars / second",
     lambda r: r["throughput"]),
]

fig, axes = plt.subplots(2, 2)
for ax, (title, ylabel, metric) in zip(axes.flat, panels):
    plot_metric(ax, data, metric, title, ylabel)

viz.finalise(figsize=(8, 8))