# CLgen Sampling
-----
Dec 2016.

In [1]:
# preamble
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from labm8 import fs
from labm8 import viz

import clgen
from clgen import corpus

## Experimental methodology

Experimental platforms: cc[123] (i.e. cc1, cc2, and cc3). NVIDIA GTX 1080.

* Train on GitHub corpus.
* Generate 1000 samples.
* Reject bad samples.

Commands:

```
$ export CUDA_VISIBLE_DEVICES=0
$ export MODEL=model-128x3x50
$ cd ~/phd/lab/clgen/inference
$ python ./inference.py $MODEL.json 2>&1 | tee inference-$MODEL.log
```

In [2]:
# Build the corpus object from its JSON-style description, then print the
# CLgen version together with the corpus' size and vocab_size attributes.
c = corpus.Corpus.from_json({"path": "~/data/github"})
for heading, value in (
    ("CLgen:      ", clgen.version()),
    ("Corpus size:", c.size),
    ("Vocab size: ", c.vocab_size),
):
    # Headings are padded by hand so that the printed values line up.
    print(heading, value)

CLgen:       0.2.1
Corpus size: 20271852
Vocab size:  92


In [3]:
# Ask the corpus module for the most common prototypes (20 requested) and
# the total prototype count, then print one row per returned prototype.
p, num_prototypes = corpus.most_common_prototypes(c, 20)
for ratio, prototype in p:
    # Each row is a (ratio, prototype) pair; the ratio is scaled by 100
    # before printing (presumably to read as a percentage).
    print(ratio * 100, '\t', prototype)

print("# prototypes", num_prototypes)

13.22049405306496 	 const int a, __global int* b, __global int* c
9.423604757548034 	 
3.476669716376944 	 __global int* a
3.43092406221409 	 __global float* a, __global float* b, __global float* c
2.2872827081427265 	 __global int* a, __global int* b
1.9670631290027447 	 __global float* a, __global float* b, __global int* c
1.8298261665141813 	 __global float* a, __global float* b
1.7840805123513266 	 __global float4* a, __global float4* b, __global float4* c, __global float4* d, __global float4* e, float f
1.6925892040256174 	 __local int* a
1.2808783165599267 	 __global float* a
1.0064043915827996 	 __global uint* a, __global uint* b
0.869167429094236 	 __global int* a, __global int* b, __global int* c
0.777676120768527 	 __global float4* a, __global float4* b, __global float4* c, __global float4* d, int e, int f, float g, float h
0.777676120768527 	 __global float4* q, __global float4* r, __global float4* s, __global float4* t, int u, int e, int v, float f, float g, float h, float 

## Experimental results

In [7]:
def parse_file(path):
    """
    Extract the model size, number of layers, and epochs from a path.

    The path must contain a "<size>x<layers>x<epochs>" component made of
    decimal digits, e.g. "benchmark/benchmark-128x3x50.json". If several
    such components are present, the leftmost one is used.

    Arguments:
        path (str): Path or file name to parse.

    Returns:
        (str, str, str): Model size, num layers, and epochs, as the digit
            strings found in the path (they are not converted to int).

    Raises:
        ValueError: If the path contains no such component.
    """
    import re
    m = re.search(r"([0-9]+)x([0-9]+)x([0-9]+)", path)
    if m is None:
        # Without this guard a non-matching name surfaces as an opaque
        # "'NoneType' object has no attribute 'group'" error.
        raise ValueError(
            "no '<size>x<layers>x<epochs>' component in path: {!r}"
            .format(path))
    return m.groups()

# Collect the benchmark result files ("benchmark-*.json") found in the
# "benchmark" directory, then index each file's loaded JSON by the tuple
# that parse_file() extracts from its path.
data_files = [
    "benchmark/" + name
    for name in fs.ls("benchmark")
    if name.startswith("benchmark-") and name.endswith(".json")
]
data = {parse_file(path): clgen.load_json_file(path) for path in data_files}
print("read {} data files".format(len(data)))

read 8 data files


## Evaluation

In [5]:
import matplotlib.pyplot as plt
import seaborn as sns
from labm8 import viz
# matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old names. Use the legacy name when it is
# still available, otherwise fall back to the renamed style.
plt.style.use([
    name if name in plt.style.available
    else name.replace("seaborn-", "seaborn-v0_8-", 1)
    for name in ("seaborn-white", "seaborn-paper")
])

def label(path):
    """ return the elements of path as strings, joined by '-' """
    return '-'.join(map(str, path))

# Order the keys of `data` numerically: every component of a key is
# converted to int for the comparison, so that e.g. "128" sorts before "1024".
sorted_keys = sorted(data, key=lambda key: list(map(int, key)))