In [52]:
# Imports.

import sqlalchemy as sql

from labm8 import app
from labm8 import sqlutil
from labm8 import humanize
from deeplearning.clgen import clgen
from deeplearning.clgen import samples_database
from deeplearning.clgen.corpuses import encoded
from deeplearning.clgen.corpuses import corpuses
from deeplearning.clgen.proto import clgen_pb2
from deeplearning.clgen.proto import corpus_pb2
from deeplearning.clgen.proto import model_pb2
from deeplearning.clgen.proto import sampler_pb2
from experimental.deeplearning.deepsmith.java_fuzz import sample_java_model
from experimental.deeplearning.deepsmith.java_fuzz import sample_opencl_model
from research.cummins_2017_cgo import generative_model as opencl

app.FLAGS(['argv0', '--clgen_corpus_dir=/var/phd/datasets/github/corpuses/opencl', '--clgen_multichar_tokenizer'])

FLAGS = app.FLAGS

## Create pre-encoded OpenCL corpus database

In [53]:
# The OpenCL instance to export pre-encoded corpus from.
config = sample_java_model.MakeClgenInstanceConfig(
      working_dir='/var/phd/experimental/deeplearning/deepsmith/java_fuzz/opencl_clgen_cache',
      encoded_db=encoded.EncodedContentFiles('file:///var/phd/db/cc1.mysql?github_java_methods_enc_2019.07.16?charset=utf8'), num_training_epochs=50, seed_text='kernel void A (',
      neurons_per_layer=512)

# Replace the Java corpus with an OpenCL one.
config.model.corpus.CopyFrom(opencl.CreateCorpusProtoFromFlags())

output_db = encoded.EncodedContentFiles('file:///var/phd/db/cc1.mysql?opencl_enc_2019.07.29?charset=utf8')

In [32]:
def ExportPreEncodedCorpus(instance: clgen.Instance, dst: encoded.EncodedContentFiles):
    with instance.Session() as instance_session:
        src_corpus = instance_session.model.corpus
        src_corpus.Create()
        with dst.Session(commit=True) as dst_session:
            print("Storing vocab with", len(src_corpus.atomizer.vocab), "entries")
            corpuses.StoreVocabInMetaTable(dst_session, src_corpus.atomizer.vocab)
            print("Copying encoded content files ...")
            dst_session.query(encoded.EncodedContentFile).delete()
            with src_corpus.encoded.Session() as src_session:
                query = src_session.query(encoded.EncodedContentFile)
                for i, row in enumerate(query):
                    dst_session.merge(row)
                print("Imported", i, "contentfiles")

ExportPreEncodedCorpus(clgen.Instance(config), output_db)

with output_db.Session() as session:
    query = session.query(sql.func.sum(encoded.EncodedContentFile.tokencount))
    print("Exported", query.one()[0], "token corpus")

Storing vocab with 166 entries
Copying encoded content files ...
Imported 4015 contentfiles
Exported 19171581 token corpus


## Export subset of Java corpus

In [59]:
target_token_count = int(50e6)

input_db = encoded.EncodedContentFiles('file:///var/phd/db/cc1.mysql?github_java_methods_enc_2019.07.16?charset=utf8')
output_db = encoded.EncodedContentFiles('file:///var/phd/db/cc1.mysql?github_java_methods_enc_2019.07.16_T50M?charset=utf8')

In [60]:
def ExportMetaTable(input_session, output_session):
    output_session.query(encoded.Meta).delete()
    for row in input_session.query(encoded.Meta):
        output_session.merge(row)

def ExportTokenCount(input_db, output_db, target_token_count):
    """Iteratively build-up corpus to export."""
    row_batch_size = 500

    with input_db.Session() as input_session, output_db.Session(commit=True) as output_session:
        ExportMetaTable(input_session, output_session)
        
        output_session.query(encoded.EncodedContentFile).delete()
        query = input_session.query(encoded.EncodedContentFile)
        batches = sqlutil.OffsetLimitBatchedQuery(query=query, batch_size=row_batch_size, compute_max_rows=True)
        actual_token_count, method_count = 0, 0
        for batch in batches:
            for row in batch.rows:
                method_count += 1
                actual_token_count += row.tokencount
                output_session.merge(row)
                # Periodically print a progress update.
                if method_count % 5000 == 0:
                    print('processed', humanize.Commas(method_count), 
                          'methods, token count', humanize.Commas(actual_token_count), 
                          'of', humanize.Commas(target_token_count),'...')
                # We're done.
                if actual_token_count >= target_token_count:
                    print("Exported", humanize.Commas(actual_token_count), "token corpus of", 
                          humanize.Commas(method_count), 
                          "methods from a possible", humanize.Commas(batch.max_rows), "methods")
                    return

ExportTokenCount(input_db, output_db, target_token_count)

processed 5,000 methods, token count 1,232,256 of 50,000,000 ...
processed 10,000 methods, token count 2,571,847 of 50,000,000 ...
processed 15,000 methods, token count 3,893,045 of 50,000,000 ...
processed 20,000 methods, token count 5,089,660 of 50,000,000 ...
processed 25,000 methods, token count 6,466,949 of 50,000,000 ...
processed 30,000 methods, token count 7,665,186 of 50,000,000 ...
processed 35,000 methods, token count 8,949,218 of 50,000,000 ...
processed 40,000 methods, token count 10,233,516 of 50,000,000 ...
processed 45,000 methods, token count 11,673,493 of 50,000,000 ...
processed 50,000 methods, token count 12,992,139 of 50,000,000 ...
processed 55,000 methods, token count 14,326,308 of 50,000,000 ...
processed 60,000 methods, token count 15,826,717 of 50,000,000 ...
processed 65,000 methods, token count 17,118,088 of 50,000,000 ...
processed 70,000 methods, token count 18,322,733 of 50,000,000 ...
processed 75,000 methods, token count 19,327,871 of 50,000,000 ...
pro

## Evaluating OpenCL models

In [45]:
# Model with corpus dir.
config = sample_java_model.MakeClgenInstanceConfig(
    working_dir='/var/phd/experimental/deeplearning/deepsmith/java_fuzz/opencl_clgen_cache',
    encoded_db=encoded.EncodedContentFiles('file:///var/phd/db/cc1.mysql?github_java_methods_enc_2019.07.16?charset=utf8'), 
    num_training_epochs=50, 
    seed_text='kernel void A (',
    neurons_per_layer=512)
config.model.corpus.CopyFrom(opencl.CreateCorpusProtoFromFlags())
opencl_model_with_corpus_dir = clgen.Instance(config)

# Model with corpus database.
config = sample_java_model.MakeClgenInstanceConfig(
    working_dir='/var/phd/experimental/deeplearning/deepsmith/java_fuzz/opencl_clgen_cache',
    encoded_db=encoded.EncodedContentFiles('file:///var/phd/db/cc1.mysql?opencl_enc_2019.07.29?charset=utf8'), 
    num_training_epochs=50, 
    seed_text='kernel void A (',
    neurons_per_layer=512)
opencl_model_with_corpus_db = clgen.Instance(config)

In [51]:
with opencl_model_with_corpus_dir.Session() as session:
    print('Corpus dir trained?', session.model.is_trained)
    
with opencl_model_with_corpus_db.Session() as session:
    print('Corpus db trained?', session.model.is_trained)

Corpus dir trained? False
Corpus db trained? False
