In [1]:
# %%
# Full-pass line counting for DISL: decomposed
# - Streams HF dataset and writes a parquet/jsonl with: sha256, loc_total, loc_nonempty (+ optional fields)

import os, json, hashlib
from typing import Optional, List, Dict

from datasets import load_dataset
from tqdm import tqdm

# Parquet output is optional: it is used only when pyarrow imports cleanly,
# otherwise the cell falls back to JSONL (see OUT_PATH below).
try:
    import pyarrow as pa
    import pyarrow.parquet as pq
except Exception:
    HAVE_PARQUET = False
else:
    HAVE_PARQUET = True

# ---- Run configuration ----
DATASET_NAME = "ASSERT-KTH/DISL"
CONFIG = "decomposed"  # deduplicated split
SPLIT = "train"
OUTPUT_DIR = "disl_token_stats"  # same dir you used for tokens
OUT_BASE = "loc_" + CONFIG
# The output extension follows the available backend: .parquet or .jsonl.
OUT_PATH = os.path.join(OUTPUT_DIR, OUT_BASE + (".parquet" if HAVE_PARQUET else ".jsonl"))
# Extra dataset columns copied into each output row when present and not None.
PERSIST_FIELDS = ["file_path", "contract_address", "compiler_version"]
# First non-empty value of the two environment variables, else None.
HF_TOKEN = os.environ.get("HUGGING_FACE_HUB_TOKEN") or os.environ.get("HF_TOKEN")

os.makedirs(OUTPUT_DIR, exist_ok=True)

def sha256_text(s: str) -> str:
    """Return the hex SHA-256 digest of ``s`` encoded as UTF-8.

    The encoder runs with ``errors="ignore"``, so characters that cannot be
    encoded (for example lone surrogates) are dropped before hashing rather
    than raising.
    """
    payload = s.encode("utf-8", errors="ignore")
    hasher = hashlib.sha256()
    hasher.update(payload)
    return hasher.hexdigest()

def count_lines(s: str):
    """Count total and non-blank lines in ``s``.

    Lines are delimited as by ``str.splitlines()``. A line counts as
    non-empty when it still has characters after ``strip()``.

    Returns:
        ``(loc_total, loc_nonempty)``; ``(0, 0)`` for empty or falsy input.
    """
    total = 0
    nonblank = 0
    for line in (s.splitlines() if s else ()):
        total += 1
        if line.strip():
            nonblank += 1
    return total, nonblank

class RowWriter:
    """Buffered sink that persists dict rows as Parquet or JSONL.

    Rows are collected in memory and written out every ``batch_size`` rows.
    Parquet is used when it is requested and pyarrow imported successfully
    (``HAVE_PARQUET``); otherwise rows are appended to a JSONL file, one JSON
    object per line.

    Parquet schema handling: a Parquet file has a single schema, so the column
    set and types are pinned by the first flushed batch (union of the keys of
    all rows in that batch, in order of first appearance). Every later batch
    is conformed to that schema: keys missing from a row are written as nulls
    and keys that are not part of the pinned schema are not written. Inferring
    the schema separately for each batch could produce a different schema
    whenever an optional field was absent, which ``ParquetWriter`` rejects.

    The writer can be used as a context manager so the output is finalised
    even when the producing loop raises. Do not append rows after ``close()``.

    Args:
        path: Output file path.
        use_parquet: Request Parquet output; ignored when pyarrow is missing.
        batch_size: Number of buffered rows that triggers a flush.
    """

    def __init__(self, path: str, use_parquet: bool = True, batch_size: int = 5000):
        self.path = path
        self.use_parquet = use_parquet and HAVE_PARQUET
        self.batch_size = batch_size
        self.rows = []
        self.writer = None   # pq.ParquetWriter, created lazily on first flush
        self.schema = None   # pyarrow schema pinned by the first flushed batch
        if not self.use_parquet:
            open(self.path, "w").close()  # truncate JSONL

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        # Finalise the file whether or not the body raised; never swallow errors.
        self.close()
        return False

    @staticmethod
    def _column_names(rows: List[Dict]) -> List[str]:
        """Return the union of row keys, ordered by first appearance."""
        seen: Dict[str, None] = {}
        for row in rows:
            for key in row:
                seen.setdefault(key, None)
        return list(seen)

    def _to_table(self):
        """Build a pyarrow Table from the buffered rows using the pinned schema, if any."""
        if self.schema is not None:
            names = self.schema.names
        else:
            names = self._column_names(self.rows)
        # row.get() turns a missing optional field into a null cell.
        columns = {name: [row.get(name) for row in self.rows] for name in names}
        return pa.Table.from_pydict(columns, schema=self.schema)

    def append(self, row: Dict):
        """Buffer one row and flush once ``batch_size`` rows are pending."""
        self.rows.append(row)
        if len(self.rows) >= self.batch_size:
            self.flush()

    def flush(self):
        """Write all buffered rows to the output file and clear the buffer."""
        if not self.rows:
            return
        if self.use_parquet:
            table = self._to_table()
            if self.writer is None:
                self.schema = table.schema
                self.writer = pq.ParquetWriter(self.path, self.schema, compression="zstd")
            self.writer.write_table(table)
        else:
            with open(self.path, "a", encoding="utf-8") as f:
                for r in self.rows:
                    f.write(json.dumps(r, ensure_ascii=False) + "\n")
        self.rows = []

    def close(self):
        """Flush pending rows and finalise the output; safe to call repeatedly."""
        try:
            self.flush()
        finally:
            # Release the Parquet handle even if the last flush failed.
            if self.writer is not None:
                self.writer.close()
                self.writer = None

# ---- Stream the dataset once and persist per-file line counts ----
print(f"[info] Loading {DATASET_NAME}:{CONFIG}/{SPLIT} (streaming)")
ds = load_dataset(DATASET_NAME, CONFIG, split=SPLIT, streaming=True)

writer = RowWriter(OUT_PATH, use_parquet=True, batch_size=5000)

written = 0
pbar = tqdm(ds, desc="Counting lines")
try:
    for ex in pbar:
        src = ex.get("source_code")
        if not src:
            continue  # nothing to hash or count for missing/empty sources
        loc_total, loc_nonempty = count_lines(src)
        row = {
            "sha256": sha256_text(src),
            "loc_total": int(loc_total),
            "loc_nonempty": int(loc_nonempty),
        }
        # Carry over the optional metadata columns that this example provides.
        for fld in PERSIST_FIELDS:
            if fld in ex and ex[fld] is not None:
                row[fld] = ex[fld]
        writer.append(row)
        written += 1
        if written % 20000 == 0:
            pbar.set_postfix_str(f"written={written}")
finally:
    # Always finalise the output: without close() a Parquet file has no footer
    # and cannot be read back if the stream or the loop fails part-way.
    # The exception (if any) still propagates, so "[done]" is only printed on success.
    pbar.close()
    writer.close()

print(f"[done] Wrote {written:,} rows to {OUT_PATH}")


  from .autonotebook import tqdm as notebook_tqdm


[info] Loading ASSERT-KTH/DISL:decomposed/train (streaming)


Counting lines: 514506it [02:48, 3048.64it/s, written=500000]

[done] Wrote 514,506 rows to disl_token_stats/loc_decomposed.parquet





In [None]:
# %%
import os
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

# ---- Configure paths ----
# Both input tables are joined on their shared ``sha256`` column later in this cell.
OUTPUT_DIR          = "disl_token_stats"
TOKENS_PARQUET_PATH = os.path.join(OUTPUT_DIR, "lengths_decomposed.parquet")  # from your previous run (--persist)
# NOTE: the line-counting cell writes ``loc_decomposed.jsonl`` instead when pyarrow is
# unavailable; point this at the .jsonl file in that case (load_table accepts both suffixes).
LINES_PARQUET_PATH  = os.path.join(OUTPUT_DIR, "loc_decomposed.parquet")      # from cell above
# Token threshold: drawn as the vertical marker and used for the "share exceeding" statistics.
CONTEXT_TOKENS      = 4096
BIN_WIDTH_LOC       = 50       # lines per bin for the share plot
SAMPLE_HEXBIN       = 200_000  # subsample for hexbin speed (set None to use all)

# ---- Load data ----
def load_table(path: str) -> pd.DataFrame:
    """Read a persisted table into a DataFrame, choosing the reader by suffix.

    ``.parquet`` files go through ``pd.read_parquet`` and ``.jsonl`` files
    through ``pd.read_json(..., lines=True)``.

    Raises:
        ValueError: if ``path`` ends with neither supported suffix.
    """
    readers = (
        (".parquet", pd.read_parquet),
        (".jsonl", lambda p: pd.read_json(p, lines=True)),
    )
    for suffix, reader in readers:
        if path.endswith(suffix):
            return reader(path)
    raise ValueError(f"Unsupported file type: {path}")

# ---- Join token counts and line counts on the content hash ----
# Keep one row per distinct sha256 on each side before joining: a repeated hash
# would otherwise be matched against every repeat on the other side (k copies
# -> k*k merged rows) and skew every statistic computed below.
df_tokens = load_table(TOKENS_PARQUET_PATH)[["sha256", "tokens"]].drop_duplicates(subset="sha256")
df_lines  = load_table(LINES_PARQUET_PATH)[["sha256", "loc_total", "loc_nonempty"]].drop_duplicates(subset="sha256")
df = df_tokens.merge(df_lines, on="sha256", how="inner", validate="one_to_one").dropna()
print(f"[merge] {len(df):,} rows")

# ---- Style for IEEE single column ----
FIGSIZE = (3.4, 2.4)  # inches (w,h)
mpl.rcParams.update({
    "font.family": "sans-serif",
    "font.size": 8,
    "axes.titlesize": 8,
    "axes.labelsize": 8,
    "xtick.labelsize": 7,
    "ytick.labelsize": 7,
    "legend.fontsize": 7,
    "axes.linewidth": 0.8,
    "pdf.fonttype": 42,  # embed TrueType fonts in the PDF
    "ps.fonttype": 42,
})

# ---- (A) Hexbin: tokens vs non-empty LOC ----
plot_df = df
if SAMPLE_HEXBIN and len(df) > SAMPLE_HEXBIN:
    plot_df = df.sample(SAMPLE_HEXBIN, random_state=42).copy()

x = plot_df["tokens"].to_numpy()
y = plot_df["loc_nonempty"].to_numpy()

# Crop at the 99th percentile of the FULL frame (not the subsample) so the axis
# limits do not depend on SAMPLE_HEXBIN; this matches the later hexbin cells.
x_cap = np.percentile(df["tokens"], 99.0)
y_cap = np.percentile(df["loc_nonempty"], 99.0)

fig = plt.figure(figsize=FIGSIZE)
ax = fig.add_subplot(111)
hb = ax.hexbin(x, y, gridsize=50, extent=(0, x_cap, 0, y_cap), mincnt=5, bins='log')
ax.axvline(CONTEXT_TOKENS, linestyle="--", linewidth=1.0)  # context-window marker

ax.set_xlim(0, x_cap)
ax.set_ylim(0, y_cap)
ax.set_xlabel("Tokens")
ax.set_ylabel("Non-empty LoC")
ax.set_title("Token vs. LoC")

# compact colorbar
cbar = fig.colorbar(hb, ax=ax, fraction=0.08, pad=0.02)
cbar.ax.tick_params(labelsize=7)

fig.tight_layout()
hexbin_pdf = os.path.join(OUTPUT_DIR, "hexbin_tokens_vs_loc_ieee.pdf")
fig.savefig(hexbin_pdf, format="pdf", bbox_inches="tight")
plt.close(fig)
print(f"[saved] {hexbin_pdf}")

# ---- (B) Share exceeding CONTEXT_TOKENS by LoC bins ----
# Each bin is labelled by its lower edge: [loc_bin, loc_bin + BIN_WIDTH_LOC).
df["loc_bin"] = (df["loc_nonempty"] // BIN_WIDTH_LOC) * BIN_WIDTH_LOC
g = df.groupby("loc_bin", as_index=False).agg(
    n=("tokens", "size"),
    share_exceed=("tokens", lambda s: float((s > CONTEXT_TOKENS).mean())),
)

# filter low-support bins (optional)
g = g[g["n"] >= 50]  # keep bins with at least 50 samples for stability

fig = plt.figure(figsize=FIGSIZE)
ax = fig.add_subplot(111)
ax.plot(g["loc_bin"], g["share_exceed"] * 100.0, linewidth=1.2)

# annotate approx LoC where ≥50% exceed context (first, i.e. smallest, such bin)
half = g[g["share_exceed"] >= 0.5]
if not half.empty:
    loc50 = int(half["loc_bin"].iloc[0])
    ax.axvline(loc50, linestyle="--", linewidth=1.0)
    ax.text(loc50 * 1.02, 52, f"≥50% exceed at ~{loc50} LoC", va="bottom", ha="left")

ax.set_xlabel("Non-empty LoC (binned)")
# Derive the label from the configured budget so it cannot drift from CONTEXT_TOKENS.
ax.set_ylabel(f"Share > {CONTEXT_TOKENS:,} tokens (%)")
ax.set_title("Where contexts exceed token budget")

# no grid, tight
fig.tight_layout()
share_pdf = os.path.join(OUTPUT_DIR, "share_exceed_by_loc_ieee.pdf")
fig.savefig(share_pdf, format="pdf", bbox_inches="tight")
plt.close(fig)
print(f"[saved] {share_pdf}")

# ---- Print quick headline numbers ----
overall_exceed = (df["tokens"] > CONTEXT_TOKENS).mean() * 100
print(f"[headline] Overall > {CONTEXT_TOKENS} tokens: {overall_exceed:.1f}%")
if not half.empty:
    print(f"[headline] LoC where ≥50% exceed: ~{loc50}")


[merge] 514,506 rows
[saved] disl_token_stats/hexbin_tokens_vs_loc_ieee.pdf
[saved] disl_token_stats/share_exceed_by_loc_ieee.pdf
[headline] Overall > 4096 tokens: 31.3%
[headline] LoC where ≥50% exceed: ~300


In [None]:
# %%
# Hexbin with tokens/line guide-lines + IEEE-ready PDF
# Requires: your persisted parquet from the earlier runs:
#   - disl_token_stats/lengths_decomposed.parquet  (sha256, tokens, ...)
#   - disl_token_stats/loc_decomposed.parquet      (sha256, loc_total, loc_nonempty, ...)

import os
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

# --- Paths and parameters for this cell ---
OUTPUT_DIR          = "disl_token_stats"
# Input tables; they are joined on their shared ``sha256`` column below.
TOKENS_PARQUET_PATH = os.path.join(OUTPUT_DIR, "lengths_decomposed.parquet")
LINES_PARQUET_PATH  = os.path.join(OUTPUT_DIR,  "loc_decomposed.parquet")
# NOTE(review): a later cell in this notebook saves to this same OUT_PDF path,
# so running both leaves only the later figure on disk.
OUT_PDF             = os.path.join(OUTPUT_DIR,  "hexbin_tokens_vs_loc_guides_ieee.pdf")

# Token threshold: drawn as the vertical marker and used for the overflow share.
CONTEXT_TOKENS      = 4096
SAMPLE_HEXBIN       = 200_000   # for speed; set None to plot all rows

# --- Load & join ---
def load_table(path: str) -> pd.DataFrame:
    """Load a ``.parquet`` or ``.jsonl`` table from ``path`` into a DataFrame.

    Raises:
        ValueError: if ``path`` has any other suffix.
    """
    is_parquet = path.endswith(".parquet")
    is_jsonl = path.endswith(".jsonl")
    if not (is_parquet or is_jsonl):
        raise ValueError(f"Unsupported file: {path}")
    if is_parquet:
        return pd.read_parquet(path)
    return pd.read_json(path, lines=True)

# Keep one row per distinct sha256 on each side before joining: a repeated hash
# would otherwise be matched against every repeat on the other side (k copies
# -> k*k merged rows) and skew the quantiles and shares computed below.
df_tokens = load_table(TOKENS_PARQUET_PATH)[["sha256", "tokens"]].drop_duplicates(subset="sha256")
df_lines  = load_table(LINES_PARQUET_PATH)[["sha256", "loc_nonempty"]].drop_duplicates(subset="sha256")
df = df_tokens.merge(df_lines, on="sha256", how="inner", validate="one_to_one").dropna()
df = df[df["loc_nonempty"] > 0].copy()  # avoid divide-by-zero
print(f"[merge] {len(df):,} rows")

# --- tokens per line stats ---
df["tpl"] = df["tokens"] / df["loc_nonempty"]
k50  = df["tpl"].median()
k25  = df["tpl"].quantile(0.25)
k75  = df["tpl"].quantile(0.75)

def loc_at_context(k):
    """Return the non-empty LoC at which ``k`` tokens per line reaches CONTEXT_TOKENS."""
    return CONTEXT_TOKENS / k

print(f"[tokens/line] median={k50:.2f}, IQR=({k25:.2f}, {k75:.2f})")
# A denser file (k75) hits the budget at fewer lines, hence k75 gives the lower bound.
# The heading is derived from CONTEXT_TOKENS so it stays correct if the budget changes.
print(f"[implied LoC at {CONTEXT_TOKENS}] median ~ {loc_at_context(k50):.0f}, "
      f"IQR ~ ({loc_at_context(k75):.0f}, {loc_at_context(k25):.0f})")

# --- Style for IEEE single column ---
FIGSIZE = (3.4, 2.4)  # inches
mpl.rcParams.update({
    "font.family": "sans-serif",
    "font.size": 8,
    "axes.titlesize": 8,
    "axes.labelsize": 8,
    "xtick.labelsize": 7,
    "ytick.labelsize": 7,
    "legend.fontsize": 7,
    "axes.linewidth": 0.8,
    "pdf.fonttype": 42,  # embed TrueType fonts in the PDF
    "ps.fonttype": 42,
})

# --- Prepare data for hexbin (crop at p99 to keep plot readable) ---
plot_df = df
if SAMPLE_HEXBIN and len(plot_df) > SAMPLE_HEXBIN:
    plot_df = plot_df.sample(SAMPLE_HEXBIN, random_state=42)

x = plot_df["tokens"].to_numpy()
y = plot_df["loc_nonempty"].to_numpy()
x_cap = np.percentile(df["tokens"], 99.0)      # crop using full df stats
y_cap = np.percentile(df["loc_nonempty"], 99.0)

# --- Plot ---
fig = plt.figure(figsize=FIGSIZE)
ax = fig.add_subplot(111)

hb = ax.hexbin(
    x, y, gridsize=55,
    extent=(0, x_cap, 0, y_cap),
    mincnt=5, bins='log'
)

# vertical context line
ax.axvline(CONTEXT_TOKENS, linestyle="--", linewidth=1.0)

# overlay tokens/line guide-lines: y = tokens / k  =>  tokens = k * y
yy = np.linspace(0, y_cap, 100)
ax.plot(k25 * yy, yy, linewidth=0.9, alpha=0.9, label=f"{k25:.1f} tokens/LOC")
ax.plot(k50 * yy, yy, linewidth=1.2, alpha=1.0, label=f"{k50:.1f} tokens/LOC (median)")
ax.plot(k75 * yy, yy, linewidth=0.9, alpha=0.9, label=f"{k75:.1f} tokens/LOC")

# annotate share exceeding the context budget (simple and direct)
share_exceed = (df["tokens"] > CONTEXT_TOKENS).mean() * 100
# The share of rows at or below the budget is reused only as a vertical
# position (fraction of y_cap) for the label; it has no meaning on the y-axis.
y_at_context = (df["tokens"] <= CONTEXT_TOKENS).mean()
ax.text(CONTEXT_TOKENS * 1.02, min(y_at_context + 0.06, 0.97) * y_cap,  # rough placement
        f">{CONTEXT_TOKENS}: {share_exceed:.1f}%",
        va="bottom", ha="left")

# labels/limits
ax.set_xlim(0, x_cap)
ax.set_ylim(0, y_cap)
ax.set_xlabel("Tokens")
ax.set_ylabel("Non-empty LOC")
ax.set_title("Tokens vs. LOC with tokens/line guides (p99 crop)")

# compact colorbar
cbar = fig.colorbar(hb, ax=ax, fraction=0.08, pad=0.02)
cbar.ax.tick_params(labelsize=7)

# lean legend
ax.legend(loc="lower right", frameon=False, handlelength=1.8)

fig.tight_layout()
os.makedirs(OUTPUT_DIR, exist_ok=True)
fig.savefig(OUT_PDF, format="pdf", bbox_inches="tight")
plt.close(fig)
print(f"[saved] {OUT_PDF}")


[merge] 514,506 rows
[tokens/line] median=13.06, IQR=(11.79, 14.60)
[implied LOC at 4096] median ~ 314, IQR ~ (281, 348)
[saved] disl_token_stats/hexbin_tokens_vs_loc_guides_ieee.pdf


In [14]:
# %%
# Tokens vs. LOC hexbin with tokens/line guide-lines
# Legend placed BELOW the axes (outside), IEEE-friendly PDF

import os
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

# ---- Config (edit paths if needed) ----
OUTPUT_DIR          = "disl_token_stats"
# Input tables; they are joined on their shared ``sha256`` column below.
TOKENS_PARQUET_PATH = os.path.join(OUTPUT_DIR, "lengths_decomposed.parquet")
LINES_PARQUET_PATH  = os.path.join(OUTPUT_DIR,  "loc_decomposed.parquet")
# NOTE(review): the preceding hexbin cell saves to this same OUT_PDF path;
# this cell's figure replaces it when both are run.
OUT_PDF             = os.path.join(OUTPUT_DIR,  "hexbin_tokens_vs_loc_guides_ieee.pdf")

# Token threshold: drawn as the vertical marker and used for the overflow share.
CONTEXT_TOKENS      = 4096
SAMPLE_HEXBIN       = 200_000  # set None to use all rows
GRIDSIZE            = 55       # hexbin grid
MINCNT              = 5        # hide very sparse bins

# ---- Load & join ----
def load_table(path: str) -> pd.DataFrame:
    """Return the table stored at ``path`` as a DataFrame.

    Supports Parquet (``.parquet``) and JSON Lines (``.jsonl``) files; any
    other suffix raises ``ValueError``.
    """
    if path.endswith(".parquet"):
        frame = pd.read_parquet(path)
    elif path.endswith(".jsonl"):
        frame = pd.read_json(path, lines=True)
    else:
        raise ValueError(f"Unsupported file: {path}")
    return frame

# Keep one row per distinct sha256 on each side before joining: a repeated hash
# would otherwise be matched against every repeat on the other side (k copies
# -> k*k merged rows) and skew the quantiles and shares computed below.
df_tokens = load_table(TOKENS_PARQUET_PATH)[["sha256", "tokens"]].drop_duplicates(subset="sha256")
df_lines  = load_table(LINES_PARQUET_PATH)[["sha256", "loc_nonempty"]].drop_duplicates(subset="sha256")
df = df_tokens.merge(df_lines, on="sha256", how="inner", validate="one_to_one").dropna()
df = df[df["loc_nonempty"] > 0].copy()  # rows with zero lines would divide by zero below
print(f"[merge] {len(df):,} rows")

# ---- tokens/line stats (for guide-lines) ----
df["tpl"] = df["tokens"] / df["loc_nonempty"]
k25 = df["tpl"].quantile(0.25)
k50 = df["tpl"].median()
k75 = df["tpl"].quantile(0.75)
print(f"[tokens/LOC] 25%={k25:.2f}, median={k50:.2f}, 75%={k75:.2f}")

# ---- IEEE single-column style ----
FIGSIZE = (3.4, 2.8)  # a bit taller to make room for legend below
mpl.rcParams.update({
    "font.family": "sans-serif",
    "font.size": 8,
    "axes.titlesize": 8,
    "axes.labelsize": 8,
    "xtick.labelsize": 7,
    "ytick.labelsize": 7,
    "legend.fontsize": 7,
    "axes.linewidth": 0.8,
    "pdf.fonttype": 42,  # embed TrueType fonts in the PDF
    "ps.fonttype": 42,
})

# ---- Prepare data (crop at p99 for readability) ----
plot_df = df
if SAMPLE_HEXBIN and len(plot_df) > SAMPLE_HEXBIN:
    plot_df = plot_df.sample(SAMPLE_HEXBIN, random_state=42)

x = plot_df["tokens"].to_numpy()
y = plot_df["loc_nonempty"].to_numpy()
x_cap = np.percentile(df["tokens"], 99.0)       # crop using full df stats
y_cap = np.percentile(df["loc_nonempty"], 99.0)

# ---- Plot ----
fig = plt.figure(figsize=FIGSIZE)
ax  = fig.add_subplot(111)

hb = ax.hexbin(
    x, y, gridsize=GRIDSIZE, mincnt=MINCNT, bins="log",
    extent=(0, x_cap, 0, y_cap)
)

# Context window
ax.axvline(CONTEXT_TOKENS, linestyle="--", linewidth=1.0)

# Tokens/line guide-lines: tokens = k * LOC  => x = k * y
yy = np.linspace(0, y_cap, 100)
ln25, = ax.plot(k25 * yy, yy, linewidth=0.9, label=f"{k25:.1f} tokens/LoC")
ln50, = ax.plot(k50 * yy, yy, linewidth=1.2, label=f"{k50:.1f} tokens/LoC (median)")
ln75, = ax.plot(k75 * yy, yy, linewidth=0.9, label=f"{k75:.1f} tokens/LoC")

# Simple overflow annotation, placed just right of the context line
share_exceed = (df["tokens"] > CONTEXT_TOKENS).mean() * 100
ax.text(CONTEXT_TOKENS * 1.02, 0.85 * y_cap, f">{CONTEXT_TOKENS}: {share_exceed:.1f}%",
        va="bottom", ha="left")

# Axes labels/limits (no title is set on this figure)
ax.set_xlim(0, x_cap)
ax.set_ylim(0, y_cap)
ax.set_xlabel("Tokens")
ax.set_ylabel("Non-empty LoC")

# Compact colorbar
cbar = fig.colorbar(hb, ax=ax, fraction=0.08, pad=0.02)
cbar.ax.tick_params(labelsize=7)

# ---- Legend BELOW the axes (outside), no overlap ----
# Only the three guide-lines are listed; the context line has no legend entry.
handles = [ln25, ln50, ln75]
leg = ax.legend(
    handles=handles,
    loc="upper center",
    bbox_to_anchor=(0.5, -0.22),   # below axes
    frameon=False,
    ncol=3,
    columnspacing=0.9,
    handlelength=1.6,
)
fig.subplots_adjust(bottom=0.30)   # give the legend breathing room

# Save IEEE-ready vector PDF
os.makedirs(OUTPUT_DIR, exist_ok=True)
fig.savefig(OUT_PDF, format="pdf", bbox_inches="tight")
plt.close(fig)
print(f"[saved] {OUT_PDF}")


[merge] 514,506 rows
[tokens/LOC] 25%=11.79, median=13.06, 75%=14.60
[saved] disl_token_stats/hexbin_tokens_vs_loc_guides_ieee.pdf
