In [1]:
import pathlib

import pandas as pd

from deeplearning.clgen import clgen
from deeplearning.clgen.corpuses import corpuses
from deeplearning.clgen.proto import corpus_pb2
from deeplearning.clgen.proto import clgen_pb2
from deeplearning.clgen.proto import model_pb2
from deeplearning.clgen.proto import sampler_pb2
from deeplearning.clgen import clgen
from lib.labm8 import bazelutil
from lib.labm8 import pbutil

In [2]:
clone_lists_dir = bazelutil.DataPath('phd/experimental/polyglot/baselines/corpuses')
corpuses_dir = bazelutil.DataPath('phd/experimental/polyglot/baselines/corpuses')
models_dir = bazelutil.DataPath('phd/experimental/polyglot/baselines/models')
samplers_dir = bazelutil.DataPath('phd/experimental/polyglot/baselines/samplers')

In [3]:
def LoadProtos(directory, proto_class):
    return {
        name.stem: corpuses.Corpus(pbutil.FromFile(directory / name, proto_class()))
        for name in directory.iterdir()
    }

corpuses = LoadProtos(corpuses_dir, corpus_pb2.Corpus)

In [4]:
working_dir = '/var/phd/clgen/baseline'
instance_configs = [
    {
        'corpus': bazelutil.DataPath('phd/experimental/polyglot/baselines/corpuses/opencl-char.pbtxt'),
        'model': bazelutil.DataPath('phd/experimental/polyglot/baselines/models/1024x2x50-adam.pbtxt'),
        'sampler': bazelutil.DataPath('phd/experimental/polyglot/baselines/samplers/opencl-1.0.pbtxt'),
    },
    {
        'corpus': bazelutil.DataPath('phd/experimental/polyglot/baselines/corpuses/opencl-char.pbtxt'),
        'model': bazelutil.DataPath('phd/experimental/polyglot/baselines/models/512x2x50-adam.pbtxt'),
        'sampler': bazelutil.DataPath('phd/experimental/polyglot/baselines/samplers/opencl-1.0.pbtxt'),
    },
    {
        'corpus': bazelutil.DataPath('phd/experimental/polyglot/baselines/corpuses/opencl-tok.pbtxt'),
        'model': bazelutil.DataPath('phd/experimental/polyglot/baselines/models/1024x2x50-adam.pbtxt'),
        'sampler': bazelutil.DataPath('phd/experimental/polyglot/baselines/samplers/opencl-1.0.pbtxt'),
    },
    {
        'corpus': bazelutil.DataPath('phd/experimental/polyglot/baselines/corpuses/opencl-tok.pbtxt'),
        'model': bazelutil.DataPath('phd/experimental/polyglot/baselines/models/1024x2x50-adam.pbtxt'),
        'sampler': bazelutil.DataPath('phd/experimental/polyglot/baselines/samplers/opencl-1.0.pbtxt'),
    },
    {
        'corpus': bazelutil.DataPath('phd/experimental/polyglot/baselines/corpuses/java-char.pbtxt'),
        'model': bazelutil.DataPath('phd/experimental/polyglot/baselines/models/1024x2x50-adam.pbtxt'),
        'sampler': bazelutil.DataPath('phd/experimental/polyglot/baselines/samplers/java-1.0.pbtxt'),
    },
    {
        'corpus': bazelutil.DataPath('phd/experimental/polyglot/baselines/corpuses/java-char.pbtxt'),
        'model': bazelutil.DataPath('phd/experimental/polyglot/baselines/models/512x2x50-adam.pbtxt'),
        'sampler': bazelutil.DataPath('phd/experimental/polyglot/baselines/samplers/java-1.0.pbtxt'),
    },
    {
        'corpus': bazelutil.DataPath('phd/experimental/polyglot/baselines/corpuses/java-tok.pbtxt'),
        'model': bazelutil.DataPath('phd/experimental/polyglot/baselines/models/1024x2x50-adam.pbtxt'),
        'sampler': bazelutil.DataPath('phd/experimental/polyglot/baselines/samplers/java-1.0.pbtxt'),
    },
    {
        'corpus': bazelutil.DataPath('phd/experimental/polyglot/baselines/corpuses/java-tok.pbtxt'),
        'model': bazelutil.DataPath('phd/experimental/polyglot/baselines/models/1024x2x50-adam.pbtxt'),
        'sampler': bazelutil.DataPath('phd/experimental/polyglot/baselines/samplers/java-1.0.pbtxt'),
    },
]

def InstanceFromConfigs(config_paths):
    corpus_config = pbutil.FromFile(config_paths['corpus'], corpus_pb2.Corpus())
    model_config = pbutil.FromFile(config_paths['model'], model_pb2.Model())
    sampler_config = pbutil.FromFile(config_paths['sampler'], sampler_pb2.Sampler())
    model_config.corpus.CopyFrom(corpus_config)
    return clgen.Instance(
        clgen_pb2.Instance(working_dir=working_dir, model=model_config, sampler=sampler_config))


instances = [InstanceFromConfigs(config_paths) for config_paths in instance_configs]
print("Loaded {} instances".format(len(instances)))

Loaded 8 instances


In [13]:
def InstanceStats(instance):
    stats = {}
    stats['Language'] = {
        'opencl': 'OpenCL',
        'java': 'Java',
    }[pathlib.Path(instance.model.corpus.config.local_directory).stem]
    instance.model.corpus.Create()
    stats['Encoding'] = 'Character' if 'Ascii' in str(instance.model.corpus.atomizer) else 'Token'
    stats['Vocab size'] = instance.model.corpus.atomizer.vocab_size
    stats['Corpus size'] = '{:.1f}M'.format(instance.model.corpus.encoded.token_count / 1e6)
    stats['Model size'] = f'{instance.model.config.architecture.neurons_per_layer}x{instance.model.config.architecture.num_layers}'
    stats['Dropout'] = instance.model.config.architecture.post_layer_dropout_micros / 1e6
    if instance.model.config.training.HasField('adam_optimizer'):
        stats['Optimizer'] = 'Adam'
        stats['Learning rate'] = instance.model.config.training.adam_optimizer.initial_learning_rate_micros / 1e6
        stats['Decay'] = instance.model.config.training.adam_optimizer.learning_rate_decay_per_epoch_micros / 1e6
    else:
        stats['Optimizer'] = 'RMSProp'
        stats['Learning rate'] = instance.model.config.training.rmsprop_optimizer.initial_learning_rate_micros / 1e6
        stats['Decay'] = instance.model.config.training.rmsprop_optimizer.learning_rate_decay_per_epoch_micros / 1e6
    stats['Trained?'] = instance.model.is_trained
    # TODO: 
    # * Time / epoch
    # * Min loss
    # * Time / 1k samples
    # * Post process discard ratio
    # * Time / 1k good samples
    return stats

df = pd.DataFrame([InstanceStats(i) for i in instances],
                  columns=['Language', 'Encoding', 'Vocab size', 'Corpus size', 'Model size', 'Dropout',
                           'Optimizer', 'Learning rate', 'Decay', 'Trained?'])
df

Unnamed: 0,Language,Encoding,Vocab size,Corpus size,Model size,Dropout,Optimizer,Learning rate,Decay,Trained?
0,OpenCL,Character,95,21.8M,1024x2,0.002,Adam,0.002,0.0,True
1,OpenCL,Character,95,21.8M,512x2,0.002,Adam,0.002,0.0,True
2,OpenCL,Token,181,18.9M,1024x2,0.002,Adam,0.002,0.0,True
3,OpenCL,Token,181,18.9M,1024x2,0.002,Adam,0.002,0.0,True
4,Java,Character,3085,286.9M,1024x2,0.002,Adam,0.002,0.0,False
5,Java,Character,3085,286.9M,512x2,0.002,Adam,0.002,0.0,False
6,Java,Token,3133,262.2M,1024x2,0.002,Adam,0.002,0.0,False
7,Java,Token,3133,262.2M,1024x2,0.002,Adam,0.002,0.0,False
