In [6]:
import pathlib
import typing

import pandas as pd

from deeplearning.clgen import clgen
from deeplearning.clgen.corpuses import corpuses
from deeplearning.clgen.proto import corpus_pb2
from deeplearning.clgen.proto import clgen_pb2
from deeplearning.clgen.proto import model_pb2
from deeplearning.clgen.proto import sampler_pb2
from deeplearning.clgen import clgen
from lib.labm8 import bazelutil
from lib.labm8 import pbutil

In [7]:
def LoadProtos(directory, proto_class):
    """Load every entry of a directory as a proto, keyed by file stem.

    Args:
        directory: Directory to scan; must provide `iterdir()` (e.g. a
            `pathlib.Path`). Every entry is parsed, with no filtering.
        proto_class: Callable returning an empty proto message. A fresh
            instance is passed to `pbutil.FromFile` for each entry.

    Returns:
        A dict mapping each entry's stem (file name without its final suffix)
        to `corpuses.Corpus(...)` built from the parsed proto.
    """
    # `iterdir()` already yields paths that include `directory`, so they are
    # used as-is. Joining them onto `directory` again duplicates the prefix
    # whenever `directory` is relative (e.g. 'protos/protos/x.pbtxt').
    # NOTE(review): every message is wrapped in corpuses.Corpus regardless of
    # `proto_class` -- confirm this is intended for model and sampler protos.
    return {
        path.stem: corpuses.Corpus(pbutil.FromFile(path, proto_class()))
        for path in directory.iterdir()
    }

# Resolve the baseline experiment's data directories through bazelutil.
# NOTE(review): clone_lists_dir resolves the same 'corpuses' path as
# corpuses_dir -- possibly a copy-paste slip; confirm the intended directory.
clone_lists_dir, corpuses_dir, models_dir, samplers_dir = map(bazelutil.DataPath, (
    'phd/experimental/polyglot/baselines/corpuses',
    'phd/experimental/polyglot/baselines/corpuses',
    'phd/experimental/polyglot/baselines/models',
    'phd/experimental/polyglot/baselines/samplers',
))

In [12]:
# Value passed as `working_dir` to every clgen_pb2.Instance built by
# InstanceFromConfigs.
# NOTE(review): hardcoded absolute path -- the notebook only runs unmodified on
# machines where this location is usable; consider making it configurable.
working_dir = '/var/phd/clgen/baseline'

def InstanceConfigs(language: str, model: str, temperature: str) -> typing.Dict[str, str]:
    """Resolve the three config file paths that describe one baseline instance.

    Args:
        language: Corpus language; selects `corpuses/{language}-char.pbtxt` and
            is the prefix of the sampler file name.
        model: Model config name; selects `models/{model}.pbtxt`.
        temperature: Sampler temperature as it appears in the file name;
            selects `samplers/{language}-{temperature}.pbtxt`.

    Returns:
        A dict with keys 'corpus', 'model' and 'sampler', each holding the
        value returned by `bazelutil.DataPath` for the corresponding file.
    """
    corpus_path = bazelutil.DataPath(
        f'phd/experimental/polyglot/baselines/corpuses/{language}-char.pbtxt')
    model_path = bazelutil.DataPath(
        f'phd/experimental/polyglot/baselines/models/{model}.pbtxt')
    sampler_path = bazelutil.DataPath(
        f'phd/experimental/polyglot/baselines/samplers/{language}-{temperature}.pbtxt')
    return {'corpus': corpus_path, 'model': model_path, 'sampler': sampler_path}

# One config-path dict per baseline model; all use the OpenCL corpus and the
# '1.0' sampler.
instance_configs = [
    InstanceConfigs('opencl', model_name, '1.0')
    for model_name in (
        '32-512x2-adam',
        '32-1024x2-adam',
        '64-512x2-adam',
        '64-1024x2-adam',
        '128-512x2-adam',
        '128-1024x2-adam',
    )
]

def InstanceFromConfigs(config_paths):
    """Build a `clgen.Instance` from separate corpus, model and sampler files.

    Args:
        config_paths: Mapping with 'corpus', 'model' and 'sampler' keys whose
            values are the paths handed to `pbutil.FromFile`.

    Returns:
        A `clgen.Instance` wrapping a `clgen_pb2.Instance` that uses the
        module-level `working_dir`, the loaded model (with the loaded corpus
        copied into its `corpus` field) and the loaded sampler.
    """
    corpus_proto = pbutil.FromFile(config_paths['corpus'], corpus_pb2.Corpus())
    model_proto = pbutil.FromFile(config_paths['model'], model_pb2.Model())
    sampler_proto = pbutil.FromFile(config_paths['sampler'], sampler_pb2.Sampler())

    # The corpus is loaded from its own file, so copy it into the model proto
    # before assembling the instance.
    model_proto.corpus.CopyFrom(corpus_proto)

    instance_proto = clgen_pb2.Instance(
        working_dir=working_dir,
        model=model_proto,
        sampler=sampler_proto,
    )
    return clgen.Instance(instance_proto)

# Instantiate one CLgen instance per config-path dict and report the count.
instances = list(map(InstanceFromConfigs, instance_configs))
print("Loaded {} instances".format(len(instances)))

Loaded 6 instances


In [13]:
def InstanceStats(instance):
    """Summarise one CLgen instance as a flat dict of table-ready values.

    Calls `instance.model.corpus.Create()` before reading the atomizer and the
    encoded corpus.

    Args:
        instance: Object exposing `model.corpus`, `model.config` and
            `model.is_trained`.

    Returns:
        A dict with the keys 'Language', 'Encoding', 'Vocab size',
        'Corpus size', 'Model size', 'Dropout', 'Optimizer', 'Learning rate',
        'Decay' and 'Trained?'.

    Raises:
        KeyError: If the stem of the corpus' `local_directory` is neither
            'opencl' nor 'java'.
    """
    model = instance.model
    corpus = model.corpus

    # The language is derived from the corpus directory name.
    display_names = {'opencl': 'OpenCL', 'java': 'Java'}
    language = display_names[pathlib.Path(corpus.config.local_directory).stem]

    corpus.Create()
    atomizer = corpus.atomizer
    # An atomizer whose string form mentions 'Ascii' is reported as
    # character-level; anything else is reported as token-level.
    encoding = 'Character' if 'Ascii' in str(atomizer) else 'Token'
    vocab_size = atomizer.vocab_size
    corpus_size = '{:.1f}M'.format(corpus.encoded.token_count / 1e6)

    architecture = model.config.architecture
    model_size = f'{architecture.neurons_per_layer}x{architecture.num_layers}'
    # Values stored in *_micros fields are divided by 1e6 for display.
    dropout = architecture.post_layer_dropout_micros / 1e6

    training = model.config.training
    # Anything without the adam_optimizer field set is reported as RMSProp.
    if training.HasField('adam_optimizer'):
        optimizer_name, optimizer = 'Adam', training.adam_optimizer
    else:
        optimizer_name, optimizer = 'RMSProp', training.rmsprop_optimizer

    # TODO:
    # * Time / epoch
    # * Min loss
    # * Time / 1k samples
    # * Post process discard ratio
    # * Time / 1k good samples
    return {
        'Language': language,
        'Encoding': encoding,
        'Vocab size': vocab_size,
        'Corpus size': corpus_size,
        'Model size': model_size,
        'Dropout': dropout,
        'Optimizer': optimizer_name,
        'Learning rate': optimizer.initial_learning_rate_micros / 1e6,
        'Decay': optimizer.learning_rate_decay_per_epoch_micros / 1e6,
        'Trained?': model.is_trained,
    }

# Collect the per-instance summaries into one table; the explicit column list
# fixes the display order.
df = pd.DataFrame(
    list(map(InstanceStats, instances)),
    columns=[
        'Language',
        'Encoding',
        'Vocab size',
        'Corpus size',
        'Model size',
        'Dropout',
        'Optimizer',
        'Learning rate',
        'Decay',
        'Trained?',
    ],
)
df

OperationalError: (sqlite3.OperationalError) no such column: preprocessed_contentfiles.wall_time_ms [SQL: 'SELECT count(*) AS count_1 \nFROM (SELECT preprocessed_contentfiles.id AS preprocessed_contentfiles_id, preprocessed_contentfiles.input_relpath AS preprocessed_contentfiles_input_relpath, preprocessed_contentfiles.input_sha256 AS preprocessed_contentfiles_input_sha256, preprocessed_contentfiles.input_charcount AS preprocessed_contentfiles_input_charcount, preprocessed_contentfiles.input_linecount AS preprocessed_contentfiles_input_linecount, preprocessed_contentfiles.sha256 AS preprocessed_contentfiles_sha256, preprocessed_contentfiles.charcount AS preprocessed_contentfiles_charcount, preprocessed_contentfiles.linecount AS preprocessed_contentfiles_linecount, preprocessed_contentfiles.text AS preprocessed_contentfiles_text, preprocessed_contentfiles.preprocessing_succeeded AS preprocessed_contentfiles_preprocessing_succeeded, preprocessed_contentfiles.preprocess_time_ms AS preprocessed_contentfiles_preprocess_time_ms, preprocessed_contentfiles.wall_time_ms AS preprocessed_contentfiles_wall_time_ms, preprocessed_contentfiles.date_added AS preprocessed_contentfiles_date_added \nFROM preprocessed_contentfiles) AS anon_1'] (Background on this error at: http://sqlalche.me/e/e3q8)