# Java Corpus

In [1]:
## Imports

import collections
import pathlib
import shutil
import typing
import math

import matplotlib
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import sqlalchemy as sql

from matplotlib import pyplot as plt

from labm8.py import app
from labm8.py import prof
from labm8.py import sqlutil
from labm8.py import pdutil
from labm8.py import viz
from labm8.py import labtypes
from labm8.py import humanize

from datasets.github.scrape_repos import contentfiles
from deeplearning.clgen.corpuses import preprocessed
from deeplearning.clgen.corpuses import encoded

FLAGS = app.FLAGS(['argv0'])

In [2]:
scraper_db = 'file:///var/phd/db/cc1.mysql?github_java?charset=utf8'
methods_db = 'file:///var/phd/db/cc1.mysql?github_java_methods_2019.06.28?charset=utf8'
pp_db = 'file:///var/phd/db/cc1.mysql?github_java_methods_pp_2019.07.02?charset=utf8'
enc_db = 'file:///var/phd/db/cc1.mysql?github_java_methods_enc_2019.07.02?charset=utf8'

# Connect to databases.
scraper_db = contentfiles.ContentFiles(scraper_db, must_exist=True)
methods_db = contentfiles.ContentFiles(methods_db, must_exist=True)
pp_db = preprocessed.PreprocessedContentFiles(pp_db, must_exist=True)
enc_db = encoded.EncodedContentFiles(enc_db, must_exist=True)

In [3]:
# Print a table of file and line counts.

def CfOverview(db):
  with db.Session() as s:
    repo_count = s.query(sql.func.count(contentfiles.GitHubRepository.clone_from_url)).one()[0]
    contentfile_count = s.query(sql.func.count(contentfiles.ContentFile.id)).one()[0]
    avg_charcount = int(round(s.query(sql.func.avg(contentfiles.ContentFile.charcount)).one()[0]))
    avg_linecount = int(round(s.query(sql.func.avg(contentfiles.ContentFile.linecount)).one()[0]))
    return {
        'Repo_count': humanize.Commas(repo_count),
        'Contentfile count': humanize.Commas(contentfile_count),
        'Average charcount': humanize.Commas(avg_charcount),
        'Average linecount': humanize.Commas(avg_linecount),
    }

def PpOverview(db):
  with db.Session() as s:
    cf_count = s.query(sql.func.count(preprocessed.PreprocessedContentFile.id))\
        .filter(preprocessed.PreprocessedContentFile.preprocessing_succeeded == True).one()[0]
    avg_charcount = int(round(s.query(sql.func.avg(preprocessed.PreprocessedContentFile.charcount))\
                              .filter(preprocessed.PreprocessedContentFile.preprocessing_succeeded == True).one()[0]))
    avg_linecount = int(round(s.query(sql.func.avg(preprocessed.PreprocessedContentFile.linecount))\
                              .filter(preprocessed.PreprocessedContentFile.preprocessing_succeeded == True).one()[0]))
    return {
        'Repo_count': '-',
        'Contentfile count': humanize.Commas(cf_count),
        'Average charcount': humanize.Commas(avg_charcount),
        'Average linecount': humanize.Commas(avg_linecount),
    }

def EnvOverview(db):
  with db.Session() as s:
    cf_count = s.query(sql.func.count(encoded.EncodedContentFile.id))[0]
    avg_charcount = int(round(s.query(sql.func.avg(encoded.EncodedContentFile.tokencount)).one()[0]))
    return {
        'Repo_count': '-',
        'Contentfile count': humanize.Commas(cf_count),
        'Average charcount': humanize.Commas(avg_charcount),
        'Average linecount': '-',
    }

with prof.ProfileToStdout('queries'):
  df = pd.DataFrame([
      # CfOverview(scraper_db),
      CfOverview(methods_db),
      PpOverview(pp_db),
      EncOverview(enc_db),
  ], index=[
      # 'scraper_db', 
      'static_methods',
      'preprocessed_methods',
      'encoded_methods',
  ])
df

queries in 3m 29s 667ms


Unnamed: 0,Average charcount,Average linecount,Contentfile count,Repo_count
static_methods,428,12,533083,13983
preprocessed_methods,313,14,460072,-


In [None]:
def GetReposFeatures(db):
  """Get a table of repo features."""
  with db.Session() as s:
    q = s.query(sql.func.count(contentfiles.ContentFile.id).label('file_count'),
                contentfiles.GitHubRepository.owner,
                contentfiles.GitHubRepository.name,
                contentfiles.GitHubRepository.clone_from_url,
                contentfiles.GitHubRepository.num_stars,
                contentfiles.GitHubRepository.num_forks,
                contentfiles.GitHubRepository.num_watchers)\
        .join(contentfiles.GitHubRepository)\
        .group_by(contentfiles.ContentFile.clone_from_url)
    df = pdutil.QueryToDataFrame(s, q)
    df.set_index('clone_from_url', inplace=True)
  return df

def GetPpRepoFeatures(db, df):
  """Get the subset of repos used in a preprocessed db."""
  with db.Session() as s:
    q = s.query(preprocessed.PreprocessedContentFile.input_relpath)
    clone_from_urls = {":".join(x[0].split(":")[:2]) for x in q}
  return df.loc[list(clone_from_urls)]

scraper_db_repo_features = GetReposFeatures(scraper_db)
methods_db_repo_features = GetReposFeatures(methods_db)
pp_db_repo_features = GetPpRepoFeatures(pp_db, methods_db_repo_features)

In [None]:
# Plot Github Stargazers

import math
import matplotlib.ticker as ticker

@ticker.FuncFormatter
def y_formatter(x, pos):
    return humanize.DecimalPrefix(x, '', precision=2)

@ticker.FuncFormatter
def x_formatter(x, pos):
    return humanize.DecimalPrefix(np.expm1(x), '', precision=2)
  
def PlotStarCount(df, ax):
  viz.Distplot(x='num_stars', data=df, log1p_x=True, nbins=12)
  plt.xlabel('Log Github Stargazers')
  plt.ylabel('Frequencey')

  ax.xaxis.set_major_formatter(x_formatter)
  ax.yaxis.set_major_formatter(y_formatter)
  
ax = plt.subplot(1, 3, 1)
PlotStarCount(scraper_db_repo_features, ax=ax)
ax.set_title(f'{humanize.Commas(len(scraper_db_repo_features))} scraped repos')
  
ax = plt.subplot(1, 3, 2)
PlotStarCount(methods_db_repo_features, ax=ax)
ax.set_title(f'{humanize.Commas(len(methods_db_repo_features))} "usable" repos')

ax = plt.subplot(1, 3, 3)
PlotStarCount(pp_db_repo_features, ax=ax)
ax.set_title(f'{humanize.Commas(len(pp_db_repo_features))} preprocessed repos')

viz.Finalize(figsize=(12, 4))

In [None]:
# Load data

def GetSecretsFeatures():
  with pp_db.Session() as s:
    q = s.query(preprocessed.PreprocessedContentFile.input_relpath, 
                preprocessed.PreprocessedContentFile.text) \
        .filter(preprocessed.PreprocessedContentFile.preprocessing_succeeded == False)\
        .filter(preprocessed.PreprocessedContentFile.text.like('Text contains secrets: %'))\
        .all()

    clone_from_urls = [':'.join(r[0].split(':')[:2]) for r in q]
    relpaths = [r[0].split(':')[2] for r in q]
    artifact_indices = [int(r[0].split(':')[-1]) for r in q]
    secret_types = [r[1][len('Text contains secrets: '):] for r in q]

  tuples = list(zip(clone_from_urls, relpaths, artifact_indices, secret_types))
  
  with methods_db.Session() as s:
    q = s.query(contentfiles.ContentFile).filter(sql.tuple_(
        contentfiles.ContentFile.clone_from_url, 
        contentfiles.ContentFile.relpath,
        contentfiles.ContentFile.artifact_index
    ).in_(tuples)).all()

    texts = [r.text for r in q]
    assert len(q) == len(tuples)
    
  return pd.DataFrame(zip(clone_from_urls, relpaths, artifact_indices, secret_types, texts), columns=[
    'clone_from_url',
    'relpath',
    'artifact_index',
    'secret_type',
    'text',
])

secrets_df = GetSecretsFeatures()

In [None]:
#@title Plot data  { form-width: "30%" }

secrets_df.groupby('secret_type').count()[['text']].plot.bar()
plt.gca().get_legend().remove()
plt.xlabel('')
plt.ylabel('#. methods filtered')
viz.Finalize(figsize=(5, 5))