In [1]:
import datetime
import pathlib
import typing
import collections

from absl import flags
from absl import logging
from labm8 import humanize
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from datasets.github.scrape_repos import contentfiles
from deeplearning.clgen import clgen
from deeplearning.clgen import errors
from deeplearning.clgen.corpuses import corpuses
from deeplearning.clgen.proto import corpus_pb2
from deeplearning.clgen.proto import clgen_pb2
from deeplearning.clgen.proto import model_pb2
from deeplearning.clgen.proto import sampler_pb2
from labm8 import bazelutil
from labm8 import pbutil
from labm8 import viz

In [2]:
instances_config = pathlib.Path(
    '~/data/experimental/deeplearning/polyglot/instances.pbtxt').expanduser()
# Read the Instances proto from the config file and wrap every entry of its
# `instance` field in a clgen.Instance.
instances = list(map(
    clgen.Instance,
    pbutil.FromFile(instances_config, clgen_pb2.Instances()).instance))
print("Loaded {} instances".format(len(instances)))

Loaded 48 instances


In [4]:
def GetContentfileDirectories(instances: typing.List[clgen.Instance]) -> typing.Set[pathlib.Path]:
    """Return the set of contentfiles directories used by the given instances.

    For every instance, the directory holding its preprocessed corpus database
    is taken, 'contentfiles' is appended, and the result is resolved.
    Instances which share a directory contribute a single entry because the
    result is a set.

    Args:
        instances: The instances whose corpuses should be inspected.

    Returns:
        A set of resolved paths, one per distinct contentfiles directory.
    """
    preprocessed_dirs = {
        i.model.corpus.preprocessed.database_path.parent for i in instances
    }
    # The result is built without a local called `contentfiles` so that the
    # `contentfiles` module imported by this notebook is not shadowed.
    return {(d / 'contentfiles').resolve() for d in preprocessed_dirs}

# Show the resolved contentfiles directories for the loaded instances.
GetContentfileDirectories(instances)

{PosixPath('/mnt/cc/data/datasets/github/corpuses/java'),
 PosixPath('/mnt/cc/data/datasets/github/corpuses/opencl')}

In [6]:
def GetContentfileDatabase(
        local_directory: pathlib.Path,
        repos_dir: typing.Union[str, pathlib.Path] = '/mnt/cc/data/datasets/github/repos_by_lang',
) -> pathlib.Path:
    """Return the path of the contentfiles database for a corpus directory.

    The database is looked up as `<repos_dir>/<stem of local_directory>.db`.

    Args:
        local_directory: A corpus directory. Only its stem (the final path
            component without its suffix) is used to name the database.
        repos_dir: Directory that contains the `.db` files. Defaults to the
            location that used to be hard-coded in this function.

    Returns:
        The path of the database file, which is guaranteed to exist as a file
        at the time of the check.

    Raises:
        FileNotFoundError: If there is no file at the expected path. The
            missing path is the exception's argument.
    """
    path = pathlib.Path(repos_dir) / f'{local_directory.stem}.db'
    if not path.is_file():
        raise FileNotFoundError(path)
    return path
        
# Look up the database that backs each contentfiles directory; a missing
# database raises FileNotFoundError from GetContentfileDatabase.
contentfiles_dbs = list(map(GetContentfileDatabase, GetContentfileDirectories(instances)))
contentfiles_dbs

[PosixPath('/mnt/cc/data/datasets/github/repos_by_lang/java.db'),
 PosixPath('/mnt/cc/data/datasets/github/repos_by_lang/opencl.db')]

In [None]:
def GetOutputCorpus(instance: clgen.Instance) -> typing.Optional[corpuses.Corpus]:
    """Build a corpus over the samples produced by an instance, if any exist.

    The corpus config is a copy of the instance's training corpus config with
    `local_directory` pointed at `<sampler cache dir>.contentfiles`.

    Args:
        instance: The instance whose sampler output should be wrapped.

    Returns:
        A Corpus for the sampler output, or None when either the sampler cache
        directory or its '.contentfiles' sibling directory does not exist.
    """
    with instance.Session():
        out_dir = instance.model.SamplerCache(instance.sampler)
        local_directory = str(out_dir) + '.contentfiles'
        # Both the sample cache and the extracted contentfiles must be present
        # on disk before a corpus can be described.
        if not (out_dir.is_dir() and pathlib.Path(local_directory).is_dir()):
            return None
        output_corpus_config = corpus_pb2.Corpus()
        output_corpus_config.CopyFrom(instance.model.corpus.config)
        output_corpus_config.local_directory = local_directory
        return corpuses.Corpus(output_corpus_config)

# One entry per instance, in the same order as `instances`; entries are None
# for instances without sampler output.
output_corpuses = list(map(GetOutputCorpus, instances))
print("Loaded {} output corpuses".format(sum(1 for corpus in output_corpuses if corpus)))

In [16]:
def InstanceStats(instance: clgen.Instance, output_corpus: corpuses.Corpus) -> typing.Dict[str, typing.Any]:
    """Summarise the corpus, model, training and sampling stats of an instance.

    Args:
        instance: The instance to describe.
        output_corpus: The corpus built from the instance's samples, or a falsy
            value (e.g. None) when there is none, in which case all sampling
            columns are omitted.

    Returns:
        An ordered mapping from column name to value. Training columns are
        only present when training telemetry exists, and sampling columns only
        when `output_corpus` is truthy, so different instances may produce
        different key sets. Timing columns are additionally skipped when the
        sample directory is empty, and throughput columns when the mean sample
        time is not positive.

    Raises:
        KeyError: If the stem of the corpus `local_directory` is neither
            'opencl' nor 'java'.
    """
    corpus = instance.model.corpus
    architecture = instance.model.config.architecture
    training = instance.model.config.training

    stats = collections.OrderedDict()
    stats['Language'] = {
        'opencl': 'OpenCL',
        'java': 'Java',
    }[pathlib.Path(corpus.config.local_directory).stem]
    stats['Encoding'] = 'Character' if 'Ascii' in str(corpus.atomizer) else 'Token'
    stats['Vocab size'] = corpus.atomizer.vocab_size
    stats['Corpus size'] = '{:.1f}M'.format(corpus.encoded.token_count / 1e6)
    # stats['Embedding'] = architecture.embedding_size
    stats['Model size'] = f'{architecture.neurons_per_layer}x{architecture.num_layers}'
    # stats['Dropout'] = architecture.post_layer_dropout_micros / 1e6

    # Anything that is not Adam is reported as RMSProp.
    if training.HasField('adam_optimizer'):
        stats['Optimizer'] = 'Adam'
        optimizer = training.adam_optimizer
    else:
        stats['Optimizer'] = 'RMSProp'
        optimizer = training.rmsprop_optimizer
    # The *_micros fields are divided by 1e6 to report plain values.
    stats['Learning rate'] = optimizer.initial_learning_rate_micros / 1e6
    stats['Decay'] = optimizer.learning_rate_decay_per_epoch_micros / 1e6

    telemetry = instance.model.TrainingTelemetry()
    if telemetry:
        stats['Epochs'] = len(telemetry)
        stats['Final Loss'] = '{:.3f}'.format(telemetry[-1].loss)
        stats['Training time'] = humanize.Duration(sum(t.epoch_wall_time_ms for t in telemetry) / 1e3)
        stats['Time / epoch'] = humanize.Duration(np.array([t.epoch_wall_time_ms for t in telemetry]).mean() / 1e3)

    if output_corpus:
#         with instance.Session():
#             try:
#                 output_corpus.Create()
#             except errors.EmptyCorpusException:
#                 pass
        samples_dir = instance.model.SamplerCache(instance.sampler)
        # iterdir() already yields paths that include samples_dir, so they are
        # used directly. Joining them onto samples_dir again would duplicate
        # the prefix whenever samples_dir is a relative path.
        # NOTE(review): the Sample class itself (not an instance) is passed to
        # pbutil.FromFile here, whereas the instances config is loaded into an
        # instance -- confirm FromFile accepts both.
        sample_times = np.array([
            pbutil.FromFile(sample_path, model_pb2.Sample, uninitialized_okay=True).wall_time_ms
            for sample_path in samples_dir.iterdir()
        ], dtype=np.int64)
        # TODO(cec): Use the number of extracted kernels, not the number of samples themselves.
        stats['Sample temperature'] = humanize.Commas(instance.sampler.temperature)
        # stats['Output samples'] = humanize.Commas(output_corpus.preprocessed.input_size)
        stats['Output vocab size'] = humanize.Commas(output_corpus.vocab_size)

        # Without any samples the mean is NaN, which cannot be rounded to an
        # int; with a zero mean the throughput would be infinite. Both cases
        # simply leave the affected columns out.
        sample_throughput = None
        if sample_times.size:
            stats['Time / sample (ms)'] = int(round(sample_times.mean()))
            # Sample times are in milliseconds; convert the mean to seconds.
            sample_time_seconds = sample_times.mean() / 1000
            if sample_time_seconds > 0:
                sample_throughput = (24 * 3600) / sample_time_seconds
                stats['Samples / day'] = '{:.1f}k'.format(sample_throughput / 1000)
                # stats['Time / 1k samples'] = humanize.Duration(sample_time_seconds * 1000)

        if output_corpus.preprocessed.size:
            # Fraction of the input samples that survived preprocessing; the
            # `or 1` avoids dividing by a zero input size.
            efficiency = (output_corpus.preprocessed.size /
                          (output_corpus.preprocessed.input_size or 1))
            stats['Efficiency'] = '{:.2%}'.format(efficiency)
            if sample_throughput is not None:
                good_sample_throughput = efficiency * sample_throughput
                stats['Throughput / day'] = '{:.1f}k'.format(good_sample_throughput / 1000)
    print(stats)
    return stats

# One row per (instance, output corpus) pair; columns a row lacks are shown
# as '-'.
stats = pd.DataFrame(list(map(InstanceStats, instances, output_corpuses))).fillna('-')
stats

OrderedDict([('Language', 'OpenCL'), ('Encoding', 'Character'), ('Vocab size', 92), ('Corpus size', '21.8M'), ('Model size', '512x2'), ('Optimizer', 'Adam'), ('Learning rate', 0.002), ('Decay', 0.05)])
OrderedDict([('Language', 'OpenCL'), ('Encoding', 'Character'), ('Vocab size', 92), ('Corpus size', '21.8M'), ('Model size', '512x2'), ('Optimizer', 'Adam'), ('Learning rate', 0.002), ('Decay', 0.05)])
OrderedDict([('Language', 'OpenCL'), ('Encoding', 'Character'), ('Vocab size', 92), ('Corpus size', '21.8M'), ('Model size', '1024x2'), ('Optimizer', 'Adam'), ('Learning rate', 0.002), ('Decay', 0.05), ('Epochs', 50), ('Final Loss', '0.187'), ('Training time', '12 hours'), ('Time / epoch', '15 minutes'), ('Sample temperature', '1.0'), ('Output samples', '10,034'), ('Output vocab size', '71'), ('Time / sample (ms)', 385), ('Samples / day', '224.3k'), ('Efficiency', '3.27%'), ('Throughput / day', '7.3k')])
OrderedDict([('Language', 'OpenCL'), ('Encoding', 'Character'), ('Vocab size', 92), ('

Unnamed: 0,Language,Encoding,Vocab size,Corpus size,Model size,Optimizer,Learning rate,Decay,Epochs,Final Loss,Training time,Time / epoch,Sample temperature,Output samples,Output vocab size,Time / sample (ms),Samples / day,Efficiency,Throughput / day
0,OpenCL,Character,92,21.8M,512x2,Adam,0.002,0.05,-,-,-,-,-,-,-,-,-,-,-
1,OpenCL,Character,92,21.8M,512x2,Adam,0.002,0.05,-,-,-,-,-,-,-,-,-,-,-
2,OpenCL,Character,92,21.8M,1024x2,Adam,0.002,0.05,50,0.187,12 hours,15 minutes,1.0,10034,71,385,224.3k,3.27%,7.3k
3,OpenCL,Character,92,21.8M,1024x2,Adam,0.002,0.05,50,0.187,12 hours,15 minutes,0.5,10027,69,473,182.8k,5.83%,10.7k
4,OpenCL,Token,166,18.9M,512x2,Adam,0.002,0.05,50,0.269,4 hours,4 minutes,1.0,10059,112,520,166.0k,2.22%,3.7k
5,OpenCL,Token,166,18.9M,512x2,Adam,0.002,0.05,50,0.269,4 hours,4 minutes,0.5,10002,132,511,169.1k,1.92%,3.2k
6,OpenCL,Token,166,18.9M,1024x2,Adam,0.002,0.05,50,0.236,11 hours,13 minutes,1.0,10040,109,524,164.7k,3.68%,6.1k
7,OpenCL,Token,166,18.9M,1024x2,Adam,0.002,0.05,50,0.236,11 hours,13 minutes,0.5,10040,100,535,161.5k,3.75%,6.0k
8,Java,Character,3085,286.9M,512x2,Adam,0.002,0.05,38,1.081,2 days,an hour,-,-,-,-,-,-,-
9,Java,Character,3085,286.9M,512x2,Adam,0.002,0.05,38,1.081,2 days,an hour,-,-,-,-,-,-,-


In [None]:
# Plot the per-epoch training loss of every instance's model on shared axes.
for (_, row), instance in zip(stats.iterrows(), instances):
    # Read the telemetry once per instance: this halves the work and ensures
    # the x and y series come from the same snapshot, so they cannot differ
    # in length if telemetry grows between two separate reads.
    telemetry = instance.model.TrainingTelemetry()
    plt.plot([t.epoch_num for t in telemetry],
             [t.loss for t in telemetry],
             label=f"{row['Language']}-{row['Model size']}")

ax = plt.gca()
plt.title('Training Losses')

# X axis.
# plt.xlim((0, 50 - 1))
# ax.set_xticklabels([i + 1 for i in ax.get_xticks()])
plt.xlabel('Epochs')

# Y axis.
plt.ylabel('Loss')

plt.legend()
viz.finalise(size=(10, 8))