In [14]:
import pathlib
import typing

import pandas as pd

from deeplearning.clgen import clgen
from deeplearning.clgen.corpuses import corpuses
from deeplearning.clgen.proto import corpus_pb2
from deeplearning.clgen.proto import clgen_pb2
from deeplearning.clgen.proto import model_pb2
from deeplearning.clgen.proto import sampler_pb2
from deeplearning.clgen import clgen
from lib.labm8 import bazelutil
from lib.labm8 import pbutil

In [15]:
def LoadProtos(directory, proto_class):
    """Load every entry of a directory as a proto and wrap it in a Corpus.

    Args:
        directory: A pathlib.Path-like directory; each entry it contains is
            parsed with pbutil.FromFile.
        proto_class: The protocol buffer class; a fresh instance is created
            as the parse target for every entry.

    Returns:
        A dict mapping each entry's stem (its name without the final suffix)
        to a corpuses.Corpus constructed from the parsed proto.
    """
    # NOTE(review): every parsed message is wrapped in corpuses.Corpus no
    # matter which proto_class is supplied — confirm this is intended before
    # using the helper for non-corpus protos.
    return {
        # iterdir() already yields paths prefixed with `directory`; joining
        # them onto `directory` again doubles the prefix whenever `directory`
        # is relative, so the yielded path is used as-is. Sorting makes the
        # resulting key order independent of the filesystem listing order.
        path.stem: corpuses.Corpus(pbutil.FromFile(path, proto_class()))
        for path in sorted(directory.iterdir())
    }

# Data directories for the polyglot baseline experiment, resolved through
# bazelutil.DataPath.
# NOTE(review): clone_lists_dir resolves exactly the same path as corpuses_dir
# ('.../baselines/corpuses'); this looks like a copy-paste leftover — confirm
# the intended clone-lists location.
clone_lists_dir = bazelutil.DataPath('phd/experimental/polyglot/baselines/corpuses')
corpuses_dir = bazelutil.DataPath('phd/experimental/polyglot/baselines/corpuses')
models_dir = bazelutil.DataPath('phd/experimental/polyglot/baselines/models')
samplers_dir = bazelutil.DataPath('phd/experimental/polyglot/baselines/samplers')

In [16]:
# Value passed as `working_dir` to the clgen_pb2.Instance built by
# InstanceFromConfigs.
# NOTE(review): hard-coded absolute path; the notebook only runs unchanged on
# a machine where this location is usable — consider making it configurable.
working_dir = '/var/phd/clgen/baseline'

def InstanceConfigs(language: str, model: str, temperature: str) -> typing.Dict[str, str]:
    """Resolve the three config files that together describe one instance.

    Args:
        language: Language name; selects '<language>-char.pbtxt' under the
            corpuses directory and prefixes the sampler file name.
        model: Model config name; selects '<model>.pbtxt' under the models
            directory.
        temperature: Sampler temperature as text; selects
            '<language>-<temperature>.pbtxt' under the samplers directory.

    Returns:
        A dict with the keys 'corpus', 'model' and 'sampler', each holding the
        result of bazelutil.DataPath for the corresponding file.
    """
    corpus_path = bazelutil.DataPath(
        f'phd/experimental/polyglot/baselines/corpuses/{language}-char.pbtxt')
    model_path = bazelutil.DataPath(
        f'phd/experimental/polyglot/baselines/models/{model}.pbtxt')
    sampler_path = bazelutil.DataPath(
        f'phd/experimental/polyglot/baselines/samplers/{language}-{temperature}.pbtxt')
    return dict(corpus=corpus_path, model=model_path, sampler=sampler_path)

# Model config names evaluated for the OpenCL baseline, in table order.
opencl_model_names = (
    '32-512x2-adam',
    '32-1024x2-adam',
    '64-512x2-adam',
    '64-1024x2-adam',
    '128-512x2-adam',
    '128-1024x2-adam',
)

# One config-path dict per model, all using the 'opencl' corpus and the '1.0'
# temperature sampler.
instance_configs = [
    InstanceConfigs('opencl', model_name, '1.0')
    for model_name in opencl_model_names
]

def InstanceFromConfigs(config_paths):
    """Build a clgen.Instance from a dict of config file paths.

    Args:
        config_paths: A mapping with the keys 'corpus', 'model' and 'sampler',
            each naming a file that pbutil.FromFile parses into the matching
            proto (corpus_pb2.Corpus, model_pb2.Model, sampler_pb2.Sampler).

    Returns:
        A clgen.Instance wrapping a clgen_pb2.Instance whose working_dir is the
        module-level `working_dir`, and whose model has the parsed corpus
        copied into its `corpus` field.
    """
    corpus_proto = pbutil.FromFile(config_paths['corpus'], corpus_pb2.Corpus())
    model_proto = pbutil.FromFile(config_paths['model'], model_pb2.Model())
    sampler_proto = pbutil.FromFile(config_paths['sampler'], sampler_pb2.Sampler())

    # The corpus config is stored in its own file; embed it in the model proto
    # before the instance is assembled.
    model_proto.corpus.CopyFrom(corpus_proto)

    instance_proto = clgen_pb2.Instance(
        working_dir=working_dir,
        model=model_proto,
        sampler=sampler_proto,
    )
    return clgen.Instance(instance_proto)

# Build one instance per config-path dict, preserving the order of
# instance_configs.
instances = list(map(InstanceFromConfigs, instance_configs))
print("Loaded {} instances".format(len(instances)))

Loaded 6 instances


In [17]:
def InstanceStats(instance):
    """Summarise one instance as a flat dict of table-ready values.

    The language is looked up from the stem of the corpus config's
    `local_directory` ('opencl' or 'java'; any other stem raises KeyError).
    `instance.model.corpus.Create()` is then called before the atomizer and
    encoded corpus are read.
    NOTE(review): Create() appears to be expensive on a cold corpus — the
    captured output for this cell shows preprocessing workers running.

    Args:
        instance: An object exposing `model.corpus`, `model.config` and
            `model.is_trained` as read below.

    Returns:
        A dict with the keys 'Language', 'Encoding', 'Vocab size',
        'Corpus size', 'Model size', 'Dropout', 'Optimizer', 'Learning rate',
        'Decay' and 'Trained?'.
    """
    model = instance.model
    corpus = model.corpus

    display_names = {
        'opencl': 'OpenCL',
        'java': 'Java',
    }
    stats = {
        'Language': display_names[pathlib.Path(corpus.config.local_directory).stem],
    }

    corpus.Create()

    # An atomizer whose string form mentions 'Ascii' is reported as
    # character-level; anything else is reported as token-level.
    if 'Ascii' in str(corpus.atomizer):
        stats['Encoding'] = 'Character'
    else:
        stats['Encoding'] = 'Token'
    stats['Vocab size'] = corpus.atomizer.vocab_size
    stats['Corpus size'] = '{:.1f}M'.format(corpus.encoded.token_count / 1e6)

    architecture = model.config.architecture
    stats['Model size'] = f'{architecture.neurons_per_layer}x{architecture.num_layers}'
    # Fields with a `_micros` suffix are divided by 1e6 before display.
    stats['Dropout'] = architecture.post_layer_dropout_micros / 1e6

    training = model.config.training
    if training.HasField('adam_optimizer'):
        optimizer_label, optimizer = 'Adam', training.adam_optimizer
    else:
        optimizer_label, optimizer = 'RMSProp', training.rmsprop_optimizer
    stats['Optimizer'] = optimizer_label
    stats['Learning rate'] = optimizer.initial_learning_rate_micros / 1e6
    stats['Decay'] = optimizer.learning_rate_decay_per_epoch_micros / 1e6

    stats['Trained?'] = model.is_trained
    # TODO:
    # * Time / epoch
    # * Min loss
    # * Time / 1k samples
    # * Post process discard ratio
    # * Time / 1k good samples
    return stats

# Column order for the summary table; matches the keys produced by InstanceStats.
stats_columns = [
    'Language',
    'Encoding',
    'Vocab size',
    'Corpus size',
    'Model size',
    'Dropout',
    'Optimizer',
    'Learning rate',
    'Decay',
    'Trained?',
]

# One row per instance, built in a single DataFrame construction.
stats_rows = [InstanceStats(instance) for instance in instances]
df = pd.DataFrame(stats_rows, columns=stats_columns)
df

  0% (8 of 11927) |                      | Elapsed Time: 0:00:20 ETA:   8:23:24Process ForkPoolWorker-13:
Process ForkPoolWorker-16:
Process ForkPoolWorker-12:
Process ForkPoolWorker-5:
Process ForkPoolWorker-4:
Process ForkPoolWorker-6:
Process ForkPoolWorker-14:
Process ForkPoolWorker-3:
Process ForkPoolWorker-10:
Process ForkPoolWorker-8:
Process ForkPoolWorker-7:
Process ForkPoolWorker-9:
Process ForkPoolWorker-1:
Process ForkPoolWorker-2:
Process ForkPoolWorker-11:
Process ForkPoolWorker-15:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/linuxbrew/.linuxbrew/opt/python/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most rece

KeyboardInterrupt: 

  File "/home/linuxbrew/.linuxbrew/opt/python/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/home/linuxbrew/.linuxbrew/opt/python/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/home/linuxbrew/.linuxbrew/opt/python/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/home/linuxbrew/.linuxbrew/opt/python/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/linuxbrew/.linuxbrew/opt/python/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/home/linuxbrew/.linuxbrew/opt/python/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/home/linuxbrew/.linuxbrew/opt/python/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
 

  File "/home/cec/phd/bazel-bin/experimental/polyglot/baselines/notebooks/run.runfiles/phd/deeplearning/clgen/corpuses/preprocessed.py", line 120, in PreprocessorWorker
    pathlib.Path(job.contentfile_root), job.relpath, job.preprocessors)
  File "/home/cec/phd/bazel-bin/experimental/polyglot/baselines/notebooks/run.runfiles/phd/deeplearning/clgen/corpuses/preprocessed.py", line 91, in FromContentFile
    text = preprocessors.Preprocess(input_text, preprocessors_)
  File "/home/cec/phd/bazel-bin/experimental/polyglot/baselines/notebooks/run.runfiles/phd/deeplearning/clgen/preprocessors/preprocessors.py", line 67, in Preprocess
    text = preprocessor(text)
  File "/home/cec/phd/bazel-bin/experimental/polyglot/baselines/notebooks/run.runfiles/phd/deeplearning/clgen/corpuses/preprocessed.py", line 91, in FromContentFile
    text = preprocessors.Preprocess(input_text, preprocessors_)
  File "/home/cec/phd/bazel-bin/experimental/polyglot/baselines/notebooks/run.runfiles/phd/deeplearning/c

  File "/home/cec/phd/bazel-bin/experimental/polyglot/baselines/notebooks/run.runfiles/phd/deeplearning/clgen/preprocessors/opencl.py", line 48, in _ClangPreprocess
    return clang.Preprocess(text, GetClangArgs(use_shim=use_shim))
  File "/home/cec/phd/bazel-bin/experimental/polyglot/baselines/notebooks/run.runfiles/phd/deeplearning/clgen/preprocessors/normalizer.py", line 62, in NormalizeIdentifiers
    stdout, stderr = process.communicate()
  File "/home/cec/phd/bazel-bin/experimental/polyglot/baselines/notebooks/run.runfiles/phd/deeplearning/clgen/preprocessors/clang.py", line 136, in CompileLlvmBytecode
    stdout, stderr = process.communicate()
  File "/home/cec/phd/bazel-bin/experimental/polyglot/baselines/notebooks/run.runfiles/phd/deeplearning/clgen/preprocessors/clang.py", line 136, in CompileLlvmBytecode
    stdout, stderr = process.communicate()
  File "/home/linuxbrew/.linuxbrew/opt/python/lib/python3.6/subprocess.py", line 1514, in _communicate
    ready = selector.select

  File "/home/linuxbrew/.linuxbrew/opt/python/lib/python3.6/subprocess.py", line 1514, in _communicate
    ready = selector.select(timeout)
KeyboardInterrupt
  File "/home/linuxbrew/.linuxbrew/opt/python/lib/python3.6/selectors.py", line 376, in select
    fd_event_list = self._poll.poll(timeout)
  File "/home/cec/phd/bazel-bin/experimental/polyglot/baselines/notebooks/run.runfiles/phd/deeplearning/clgen/preprocessors/clang.py", line 136, in CompileLlvmBytecode
    stdout, stderr = process.communicate()
  File "/home/linuxbrew/.linuxbrew/opt/python/lib/python3.6/selectors.py", line 376, in select
    fd_event_list = self._poll.poll(timeout)
  File "/home/linuxbrew/.linuxbrew/opt/python/lib/python3.6/selectors.py", line 376, in select
    fd_event_list = self._poll.poll(timeout)
  File "/home/linuxbrew/.linuxbrew/opt/python/lib/python3.6/subprocess.py", line 1514, in _communicate
    ready = selector.select(timeout)
KeyboardInterrupt
  File "/home/linuxbrew/.linuxbrew/opt/python/lib/pyt