In [1]:
import datetime
import pathlib
import typing
import collections

from absl import flags
from absl import logging
import humanize
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from datasets.github.scrape_repos import contentfiles
from deeplearning.clgen import clgen
from deeplearning.clgen import errors
from deeplearning.clgen.corpuses import corpuses
from deeplearning.clgen.proto import corpus_pb2
from deeplearning.clgen.proto import clgen_pb2
from deeplearning.clgen.proto import model_pb2
from deeplearning.clgen.proto import sampler_pb2
from labm8 import bazelutil
from labm8 import pbutil
from labm8 import viz

In [2]:
# Location of the config file that lists the instances to analyse.
instances_config = pathlib.Path(
    '~/data/experimental/deeplearning/polyglot/instances.pbtxt').expanduser()
# Load the config with pbutil.FromFile and wrap every entry of its `instance`
# field in a clgen.Instance.
instances = list(map(
    clgen.Instance,
    pbutil.FromFile(instances_config, clgen_pb2.Instances()).instance))
print("Loaded {} instances".format(len(instances)))

Loaded 48 instances


In [3]:
def GetContentfileDirectories(instances: typing.List[clgen.Instance]) -> typing.Set[pathlib.Path]:
    """Return the set of distinct contentfiles directories used by the instances.

    For every instance, the 'contentfiles' entry that sits next to its
    preprocessed corpus database is resolved to an absolute path (symlinks are
    followed). Duplicate paths are collapsed.

    Args:
        instances: The instances whose corpus directories are inspected.

    Returns:
        A set holding one resolved contentfiles path per distinct location.
    """
    preprocessed_dirs = {i.model.corpus.preprocessed.database_path.parent for i in instances}
    # Named so that it does not shadow the imported `contentfiles` module.
    contentfiles_dirs = {(p / 'contentfiles').resolve() for p in preprocessed_dirs}
    return contentfiles_dirs

# Show the contentfiles directories that the loaded instances refer to.
GetContentfileDirectories(instances)

{PosixPath('/mnt/cc/data/datasets/github/corpuses/java'),
 PosixPath('/mnt/cc/data/datasets/github/corpuses/opencl')}

In [4]:
def GetContentfileDatabase(
        local_directory: pathlib.Path,
        repos_by_lang_dir: pathlib.Path = pathlib.Path('/mnt/cc/data/datasets/github/repos_by_lang'),
) -> pathlib.Path:
    """Return the path of the database file that matches a contentfiles directory.

    The database is looked up as '<repos_by_lang_dir>/<stem>.db', where <stem>
    is the final component of local_directory without its suffix.

    Args:
        local_directory: A contentfiles directory, e.g. one of the paths
            returned by GetContentfileDirectories().
        repos_by_lang_dir: Directory that holds the '.db' files. Defaults to
            the location that used to be hard-coded in this function.

    Returns:
        The path to the existing database file.

    Raises:
        FileNotFoundError: If there is no file at the expected location. The
            missing path is the exception's argument.
    """
    path = pathlib.Path(repos_by_lang_dir) / f'{local_directory.stem}.db'
    if not path.is_file():
        raise FileNotFoundError(path)
    return path
        
# Look up the database file for each contentfiles directory in use.
contentfiles_dbs = list(map(GetContentfileDatabase, GetContentfileDirectories(instances)))
contentfiles_dbs

[PosixPath('/mnt/cc/data/datasets/github/repos_by_lang/opencl.db'),
 PosixPath('/mnt/cc/data/datasets/github/repos_by_lang/java.db')]

In [None]:
def GetOutputCorpus(instance: clgen.Instance) -> typing.Optional[corpuses.Corpus]:
    """Build a corpus from the samples that an instance has produced.

    The corpus reuses a copy of the configuration of the instance's training
    corpus, with local_directory pointing at '<sampler cache dir>.contentfiles'.

    Args:
        instance: The instance whose sampler output should be loaded.

    Returns:
        The corpus, or None when either the sampler cache directory or its
        '.contentfiles' sibling directory does not exist.
    """
    with instance.Session():
        out_dir = instance.model.SamplerCache(instance.sampler)
        if not out_dir.is_dir():
            return None
        # Check for the sibling directory before building any config.
        contentfiles_dir = str(out_dir) + '.contentfiles'
        if not pathlib.Path(contentfiles_dir).is_dir():
            return None

        output_corpus_config = corpus_pb2.Corpus()
        output_corpus_config.CopyFrom(instance.model.corpus.config)
        output_corpus_config.local_directory = contentfiles_dir
        corpus = corpuses.Corpus(output_corpus_config)

        # Progress line: the matching 'done' is printed once Create() returns.
        print(corpus, '... ', end='')
        # NOTE(review): a session for this instance is already open at this
        # point, so this inner one looks redundant. It is kept as-is; confirm
        # the semantics of Session() before removing it.
        with instance.Session():
            try:
                corpus.Create()
            except errors.EmptyCorpusException:
                # Deliberately tolerated: the corpus object is still returned
                # when Create() reports it as empty.
                pass
        print('done')
        return corpus

# Build the output corpus of every instance, then report how many entries are truthy.
output_corpuses = list(map(GetOutputCorpus, instances))
print("Loaded {} output corpuses".format(sum(1 for corpus in output_corpuses if corpus)))

<deeplearning.clgen.corpuses.corpuses.Corpus object at 0x7f450eb939e8> ... done
<deeplearning.clgen.corpuses.corpuses.Corpus object at 0x7f450ebe8fd0> ... done
<deeplearning.clgen.corpuses.corpuses.Corpus object at 0x7f450eb93828> ... done
<deeplearning.clgen.corpuses.corpuses.Corpus object at 0x7f450eb52b00> ... done
<deeplearning.clgen.corpuses.corpuses.Corpus object at 0x7f450eb93860> ... done
<deeplearning.clgen.corpuses.corpuses.Corpus object at 0x7f454e21c5c0> ... done
<deeplearning.clgen.corpuses.corpuses.Corpus object at 0x7f450ebb5b00> ... done
<deeplearning.clgen.corpuses.corpuses.Corpus object at 0x7f454e21c710> ... done
<deeplearning.clgen.corpuses.corpuses.Corpus object at 0x7f450eb2e080> ... 

100% (9779 of 9779) |####################| Elapsed Time: 0:05:47 Time:  0:05:47
100% (283 of 283) |######################| Elapsed Time: 0:00:00 Time:  0:00:00


done
<deeplearning.clgen.corpuses.corpuses.Corpus object at 0x7f450eb9c9e8> ... 

100% (10054 of 10054) |##################| Elapsed Time: 0:05:56 Time:  0:05:56
100% (276 of 276) |######################| Elapsed Time: 0:00:00 Time:  0:00:00


done
<deeplearning.clgen.corpuses.corpuses.Corpus object at 0x7f450eb83e48> ... done
<deeplearning.clgen.corpuses.corpuses.Corpus object at 0x7f454e1eedd8> ... done
<deeplearning.clgen.corpuses.corpuses.Corpus object at 0x7f454e233b70> ... done
<deeplearning.clgen.corpuses.corpuses.Corpus object at 0x7f454dd87dd8> ... 

100% (10047 of 10047) |##################| Elapsed Time: 0:00:19 Time:  0:00:19
100% (10047 of 10047) |##################| Elapsed Time: 0:00:16 Time:  0:00:16


done
<deeplearning.clgen.corpuses.corpuses.Corpus object at 0x7f450ec223c8> ... 

100% (10023 of 10023) |##################| Elapsed Time: 0:05:36 Time:  0:05:36
100% (9950 of 9950) |####################| Elapsed Time: 0:00:19 Time:  0:00:19


done
<deeplearning.clgen.corpuses.corpuses.Corpus object at 0x7f450c266438> ... 

 36% (3703 of 10011) |#######            | Elapsed Time: 0:00:46 ETA:   0:02:01

In [None]:
def InstanceStats(instance: clgen.Instance,
                  output_corpus: typing.Optional[corpuses.Corpus]) -> typing.Dict[str, typing.Any]:
    """Summarise one instance as an ordered mapping of column name to value.

    Args:
        instance: The instance to describe.
        output_corpus: The corpus built from the instance's samples, or a falsy
            value (e.g. None) when there is none.

    Returns:
        An OrderedDict of table columns. The training columns are present only
        when training telemetry exists. The sampling columns are present only
        when output_corpus is truthy, and the timing columns among them only
        when at least one sample file was found.
    """
    stats = collections.OrderedDict()
    corpus = instance.model.corpus
    architecture = instance.model.config.architecture
    training = instance.model.config.training

    # The language is the name of the directory that the 'contentfiles' entry
    # next to the preprocessed database resolves to.
    language = (corpus.preprocessed.database_path.parent / 'contentfiles').resolve().name
    # Languages without a display name fall back to the raw directory name
    # instead of raising KeyError.
    stats['Language'] = {
        'opencl': 'OpenCL',
        'java': 'Java',
    }.get(language, language)
    stats['Encoding'] = 'Character' if 'Ascii' in str(corpus.atomizer) else 'Token'
    stats['Vocab size'] = corpus.atomizer.vocab_size
    stats['Corpus size'] = '{:.1f}M'.format(corpus.encoded.token_count / 1e6)
    # stats['Embedding'] = instance.model.config.architecture.embedding_size
    stats['Model size'] = f'{architecture.neurons_per_layer}x{architecture.num_layers}'
    # stats['Dropout'] = instance.model.config.architecture.post_layer_dropout_micros / 1e6

    # Both optimizer messages expose the same two fields, stored in millionths.
    if training.HasField('adam_optimizer'):
        optimizer_name, optimizer = 'Adam', training.adam_optimizer
    else:
        optimizer_name, optimizer = 'RMSProp', training.rmsprop_optimizer
    stats['Optimizer'] = optimizer_name
    stats['Learning rate'] = optimizer.initial_learning_rate_micros / 1e6
    stats['Decay'] = optimizer.learning_rate_decay_per_epoch_micros / 1e6

    telemetry = instance.model.TrainingTelemetry()
    if telemetry:
        epoch_times_ms = [t.epoch_wall_time_ms for t in telemetry]
        stats['Epochs'] = len(telemetry)
        stats['Final Loss'] = '{:.3f}'.format(telemetry[-1].loss)
        stats['Training time'] = humanize.naturaldelta(
            datetime.timedelta(seconds=sum(epoch_times_ms) / 1e3))
        stats['Time / epoch'] = humanize.naturaldelta(
            datetime.timedelta(seconds=np.mean(epoch_times_ms) / 1e3))

    if output_corpus:
        samples_dir = instance.model.SamplerCache(instance.sampler)
        # iterdir() yields paths that already include samples_dir, so each
        # entry is opened as-is; joining it onto samples_dir again would double
        # the prefix for a relative directory. FromFile is handed a Sample()
        # instance (not the class) to read into.
        sample_times = np.array([
            pbutil.FromFile(sample_path, model_pb2.Sample(), uninitialized_okay=True).wall_time_ms
            for sample_path in samples_dir.iterdir()
        ], dtype=np.int32)
        stats['Sample temperature'] = humanize.intcomma(instance.sampler.temperature)
        # stats['Output samples'] = humanize.intcomma(output_corpus.preprocessed.input_size)
        stats['Output vocab size'] = humanize.intcomma(output_corpus.vocab_size)

        # TODO(cec): Use the number of extracted kernels, not the number of samples themselves.
        # Without any sample file the mean is NaN and cannot be rounded, so the
        # timing columns are left out in that case.
        sample_throughput = None
        if sample_times.size:
            mean_sample_time_ms = sample_times.mean()
            stats['Time / sample (ms)'] = int(round(mean_sample_time_ms))
            # Samples per day = seconds per day / mean seconds per sample.
            sample_throughput = (24 * 3600) / (mean_sample_time_ms / 1000)
            stats['Samples / day'] = '{:.1f}k'.format(sample_throughput / 1000)
            # Milliseconds per sample equals seconds per thousand samples.
            # stats['Time / 1k samples'] = humanize.naturaldelta(
            #     datetime.timedelta(seconds=mean_sample_time_ms))
        if output_corpus.preprocessed.size:
            # preprocessed.size / preprocessed.input_size: presumably the share
            # of inputs that survived preprocessing (verify against Corpus).
            # `or 1` avoids dividing by zero.
            efficiency = (output_corpus.preprocessed.size /
                          (output_corpus.preprocessed.input_size or 1))
            stats['Efficiency'] = '{:.2%}'.format(efficiency)
            if sample_throughput is not None:
                good_sample_throughput = efficiency * sample_throughput
                stats['Throughput / day'] = '{:.1f}k'.format(good_sample_throughput / 1000)
    print(stats)
    return stats

# One table row per instance; cells that are missing for an instance show '-'.
stats = pd.DataFrame(list(map(InstanceStats, instances, output_corpuses))).fillna('-')
stats

In [None]:
# Plot one loss curve per instance, labelled with its language and model size.
for (_, row), instance in zip(stats.iterrows(), instances):
    # Fetch the telemetry once per instance so that the x and y series come
    # from the same call and therefore always have the same length.
    telemetry = instance.model.TrainingTelemetry()
    plt.plot([t.epoch_num for t in telemetry],
             [t.loss for t in telemetry],
             label=f"{row['Language']}-{row['Model size']}")

ax = plt.gca()
plt.title('Training Losses')

# X axis.
# plt.xlim((0, 50 - 1))
# ax.set_xticklabels([i + 1 for i in ax.get_xticks()])
plt.xlabel('Epochs')

# Y axis.
plt.ylabel('Loss')

plt.legend()
viz.finalise(size=(10, 8))