In [38]:
# Imports.

import sqlalchemy as sql

from labm8 import app
from labm8 import sqlutil
from labm8 import humanize
from deeplearning.clgen import clgen
from deeplearning.clgen import samples_database
from deeplearning.clgen.corpuses import encoded
from deeplearning.clgen.corpuses import corpuses
from deeplearning.clgen.proto import clgen_pb2
from deeplearning.clgen.proto import corpus_pb2
from deeplearning.clgen.proto import model_pb2
from deeplearning.clgen.proto import sampler_pb2
from experimental.deeplearning.deepsmith.java_fuzz import sample_java_model
from experimental.deeplearning.deepsmith.java_fuzz import sample_opencl_model
from research.cummins_2017_cgo import generative_model as opencl

app.FLAGS(['argv0', '--clgen_corpus_dir=/var/phd/datasets/github/corpuses/opencl', '--clgen_multichar_tokenizer'])

FLAGS = app.FLAGS

## Create pre-encoded OpenCL corpus database

In [20]:
# The OpenCL instance to export pre-encoded corpus from.
config = sample_java_model.MakeClgenInstanceConfig(
      working_dir='/var/phd/experimental/deeplearning/deepsmith/java_fuzz/opencl_clgen_cache',
      encoded_db=encoded.EncodedContentFiles('file:///var/phd/db/cc1.mysql?github_java_methods_enc_2019.07.16?charset=utf8'), num_training_epochs=50, seed_text='kernel void A (',
      neurons_per_layer=512)

# Replace the Java corpus with an OpenCL one.
config.model.corpus.CopyFrom(opencl.CreateCorpusProtoFromFlags())

output_db = encoded.EncodedContentFiles('file:///var/phd/db/cc1.mysql?opencl_enc_2019.07.29?charset=utf8')

In [32]:
def ExportPreEncodedCorpus(instance: clgen.Instance, dst: encoded.EncodedContentFiles):
    with instance.Session() as instance_session:
        src_corpus = instance_session.model.corpus
        src_corpus.Create()
        with dst.Session(commit=True) as dst_session:
            print("Storing vocab with", len(src_corpus.atomizer.vocab), "entries")
            corpuses.StoreVocabInMetaTable(dst_session, src_corpus.atomizer.vocab)
            print("Copying encoded content files ...")
            dst_session.query(encoded.EncodedContentFile).delete()
            with src_corpus.encoded.Session() as src_session:
                query = src_session.query(encoded.EncodedContentFile)
                for i, row in enumerate(query):
                    dst_session.merge(row)
                print("Imported", i, "contentfiles")

ExportPreEncodedCorpus(clgen.Instance(config), output_db)

with output_db.Session() as session:
    query = session.query(sql.func.sum(encoded.EncodedContentFile.tokencount))
    print("Exported", query.one()[0], "token corpus")

Storing vocab with 166 entries
Copying encoded content files ...
Imported 4015 contentfiles
Exported 19171581 token corpus


## Export subset of Java corpus

In [43]:
target_token_count = 19171581

input_db = encoded.EncodedContentFiles('file:///var/phd/db/cc1.mysql?github_java_methods_enc_2019.07.16?charset=utf8')
output_db = encoded.EncodedContentFiles('file:///var/phd/db/cc1.mysql?github_java_methods_enc_2019.07.16_mini?charset=utf8')

In [44]:
def ExportMetaTable(input_session, output_session):
    output_session.query(encoded.Meta).delete()
    for row in input_session.query(encoded.Meta):
        output_session.merge(row)

def ExportTokenCount(input_db, output_db, target_token_count):
    """Iteratively build-up corpus to export."""
    row_batch_size = 100

    with input_db.Session() as input_session, output_db.Session(commit=True) as output_session:
        ExportMetaTable(input_session, output_session)
        
        output_session.query(encoded.EncodedContentFile).delete()
        query = input_session.query(encoded.EncodedContentFile)
        batches = sqlutil.OffsetLimitBatchedQuery(query=query, batch_size=row_batch_size, compute_max_rows=True)
        actual_token_count, method_count = 0, 0
        for batch in batches:
            for row in batch.rows:
                method_count += 1
                actual_token_count += row.tokencount
                output_session.merge(row)
                # Periodically print a progress update.
                if method_count % 1000 == 0:
                    print('processed', humanize.Commas(method_count), 
                          'methods, token count', humanize.Commas(actual_token_count), 
                          'of', humanize.Commas(target_token_count),'...')
                # We're done.
                if actual_token_count >= target_token_count:
                    print("Exported", humanize.Commas(actual_token_count), "token corpus of", 
                          humanize.Commas(method_count), 
                          "methods from a possible", humanize.Commas(batch.max_rows), "methods")
                    return

ExportTokenCount(input_db, output_db, target_token_count)

processed 1,000 methods, token count 247,292 of 19,171,581 ...
processed 2,000 methods, token count 517,808 of 19,171,581 ...
processed 3,000 methods, token count 756,441 of 19,171,581 ...
processed 4,000 methods, token count 1,001,015 of 19,171,581 ...
processed 5,000 methods, token count 1,232,256 of 19,171,581 ...
processed 6,000 methods, token count 1,504,923 of 19,171,581 ...
processed 7,000 methods, token count 1,727,197 of 19,171,581 ...
processed 8,000 methods, token count 2,026,271 of 19,171,581 ...
processed 9,000 methods, token count 2,286,418 of 19,171,581 ...
processed 10,000 methods, token count 2,571,847 of 19,171,581 ...
processed 11,000 methods, token count 2,812,830 of 19,171,581 ...
processed 12,000 methods, token count 3,081,203 of 19,171,581 ...
processed 13,000 methods, token count 3,366,257 of 19,171,581 ...
processed 14,000 methods, token count 3,609,967 of 19,171,581 ...
processed 15,000 methods, token count 3,893,045 of 19,171,581 ...
processed 16,000 methods,