In [47]:
import pathlib

import pandas as pd
from sqlalchemy.sql import func

from datasets.github.scrape_repos import contentfiles
from deeplearning.clgen.corpuses import corpuses
from deeplearning.clgen.proto import corpus_pb2
from labm8 import bazelutil
from labm8 import pbutil

In [26]:
def LoadCorpuses(directory, proto_class):
    """Build a Corpus for every config file found in a directory.

    Args:
        directory: A pathlib.Path to the directory whose entries are read.
        proto_class: A class that is instantiated with no arguments to produce
            the message object handed to pbutil.FromFile() for each entry.

    Returns:
        A dict mapping each entry's stem (file name without its final suffix)
        to a corpuses.Corpus constructed from the loaded message. Entries
        whose stem starts with 'c-' are skipped.
    """
    # iterdir() already yields paths that include `directory`, so each path is
    # used as-is; joining it onto `directory` again would double the prefix
    # whenever `directory` is relative. The listing is sorted so the mapping's
    # order does not depend on filesystem enumeration order.
    return {
        path.stem: corpuses.Corpus(pbutil.FromFile(path, proto_class()))
        for path in sorted(directory.iterdir())
        if not path.stem.startswith('c-')
    }

# Locate the directory of baseline corpus configs.
corpuses_dir = bazelutil.DataPath('phd/experimental/polyglot/baselines/corpuses')
# LoadCorpuses() is the loader defined in this notebook; the previously called
# `LoadProtos` is not defined here, so it raised NameError on a fresh kernel.
corps = LoadCorpuses(corpuses_dir, corpus_pb2.Corpus)
print('Loaded', len(corps), 'corpuses')

Loaded 4 corpuses


In [34]:
def InputContentfiles(
        local_directory: pathlib.Path,
        db_root: pathlib.Path = pathlib.Path('/var/phd/datasets/github/repos_by_lang'),
) -> pathlib.Path:
    """Return the path of the contentfiles database matching a directory.

    The database file name is the stem of `local_directory` with a '.db'
    suffix, looked up directly inside `db_root`.

    Args:
        local_directory: Path whose stem selects the database file.
        db_root: Directory containing the '<stem>.db' files. Defaults to the
            location this notebook has always read from; override it to run
            against databases stored elsewhere.

    Returns:
        The path to the existing database file.

    Raises:
        FileNotFoundError: If '<db_root>/<stem>.db' is not an existing file.
            The missing path is the exception's sole argument.
    """
    path = pathlib.Path(db_root) / f'{local_directory.stem}.db'
    if not path.is_file():
        raise FileNotFoundError(path)
    return path

In [51]:
def CorpusStats(corpus: corpuses.Corpus) -> dict:
    """Collect one row of summary statistics for a corpus.

    Calls corpus.Create() before reading anything else from the corpus
    (presumably so the encoded corpus exists -- confirm against the Corpus
    class), then queries the input contentfiles database selected by
    corpus.config.local_directory.

    Args:
        corpus: The corpus to summarize.

    Returns:
        A dict with the keys 'Language', 'Repositories', 'Files', 'Lines',
        'Encoding', 'Vocab size' and 'Corpus size'. Counts are preformatted
        strings ('k' = thousands, 'M' = millions), except 'Vocab size', which
        is corpus.vocab_size unmodified.

    Raises:
        KeyError: If the stem of corpus.config.local_directory is not one of
            'opencl', 'c' or 'java'.
        FileNotFoundError: If the matching contentfiles database is missing.
    """
    corpus.Create()
    language_dir = pathlib.Path(corpus.config.local_directory)

    stats = {}
    stats['Language'] = {
        'opencl': 'OpenCL',
        'c': 'C',
        'java': 'Java'
    }[language_dir.stem]

    input_db = contentfiles.ContentFiles(InputContentfiles(language_dir))
    with input_db.Session() as session:
        repo_count = session.query(
            contentfiles.ContentFile.clone_from_url).distinct().count()
        file_count = session.query(contentfiles.ContentFile).count()
        # SUM() yields NULL (None here) when there are no rows or every
        # linecount is NULL; treat that as zero lines rather than failing on
        # `None / 1e6`.
        line_count = session.query(
            func.sum(contentfiles.ContentFile.linecount)).scalar() or 0

    stats['Repositories'] = '{:.1f}k'.format(repo_count / 1e3)
    stats['Files'] = '{:.1f}k'.format(file_count / 1e3)
    stats['Lines'] = '{:.1f}M'.format(line_count / 1e6)

    # The encoding is inferred from the atomizer's string form: anything
    # mentioning 'Ascii' is reported as character-level, everything else as
    # token-level.
    stats['Encoding'] = 'Character' if 'Ascii' in str(corpus.atomizer) else 'Token'
    stats['Vocab size'] = corpus.vocab_size
    stats['Corpus size'] = '{:.1f}M'.format(corpus.encoded.token_count / 1e6)
    return stats

# Summary table: one row per loaded corpus, columns in presentation order.
stats_columns = [
    'Language',
    'Repositories',
    'Files',
    'Lines',
    'Encoding',
    'Vocab size',
    'Corpus size',
]
stats_rows = [CorpusStats(corpus) for corpus in corps.values()]
df = pd.DataFrame(stats_rows, columns=stats_columns)
df

ValueError: not enough values to unpack (expected 2, got 0)