# HQ-50K EDA Notebook (Image Restoration)

This notebook performs an exploratory data analysis (EDA) of the **HQ-50K** dataset for image restoration tasks.


> **References**
> - GitHub repo: https://github.com/littleYaang/HQ-50K  
> - Dataset: https://huggingface.co/datasets/YangQiee/HQ-50K  



## 0. Environment Setup

In [3]:

# %pip install pillow numpy pandas matplotlib tqdm opencv-python imagehash scipy --quiet

import os, sys, math, json, random, glob, hashlib, itertools, shutil, io
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

import matplotlib.pyplot as plt
plt.rcParams["figure.dpi"] = 120

try:
    import cv2
except Exception as e:
    cv2 = None

try:
    import imagehash
except Exception as e:
    imagehash = None

try:
    from scipy.fft import fft2, fftshift
except Exception as e:
    fft2 = None
    fftshift = None

# Report the interpreter / library versions and whether each optional
# dependency (OpenCV, imagehash, SciPy FFT) imported successfully; the
# optional modules are bound to None above when their import fails.
print("Python:", sys.version)
print("NumPy:", np.__version__)
print("Pandas:", pd.__version__)
print("PIL:", Image.__version__)
print("OpenCV present:", cv2 is not None)
print("imagehash present:", imagehash is not None)
print("SciPy FFT present:", fft2 is not None)


Python: 3.13.5 (main, Jun 25 2025, 18:55:22) [GCC 14.2.0]
NumPy: 2.2.6
Pandas: 2.3.3
PIL: 12.0.0
OpenCV present: True
imagehash present: True
SciPy FFT present: True
ERROR! Session/line number was not unique in database. History logging moved to new session 5


%pip install -q datasets img2dataset pillow numpy pandas matplotlib tqdm opencv-python imagehash scipy


In [4]:

import os

# Notebook-wide configuration. Both locations can be overridden through
# environment variables; the defaults are relative paths using forward slashes.
# DATASET_ROOT: where the dataset is downloaded to / scanned from.
DATASET_ROOT = os.environ.get("HQ50K_ROOT", r"data/HQ-50K")  
# OUT_DIR: where every CSV / JSON / PNG produced by this notebook is written.
OUT_DIR = os.environ.get("HQ50K_EDA_OUT", "./eda_output")

# Create both directories up front so later cells can write without checking.
os.makedirs(DATASET_ROOT, exist_ok=True)
os.makedirs(OUT_DIR, exist_ok=True)

print("DATASET_ROOT:", DATASET_ROOT)
print("OUT_DIR:", OUT_DIR)

# Sub-directories of DATASET_ROOT that the scan cell treats as dataset splits.
SUBFOLDERS = ["train", "val"]



DATASET_ROOT: data/HQ-50K
OUT_DIR: ./eda_output


In [6]:

# Install the download / analysis dependencies into the kernel's environment.
%pip install -q datasets img2dataset pillow numpy pandas matplotlib tqdm opencv-python imagehash scipy

from datasets import load_dataset
import pandas as pd
from img2dataset import download
import os


os.makedirs(DATASET_ROOT, exist_ok=True)

# Step 1: the Hugging Face dataset is read as a list of strings taken from its
# "text" column; non-string entries are dropped.
print("[1/3] Loading image URLs from Hugging Face …")
ds = load_dataset("YangQiee/HQ-50K", split="train")
urls = [u for u in ds["text"] if isinstance(u, str)]
print(f"[1/3] Got {len(urls)} URLs")

# Step 2: persist the URL list as a one-column parquet file inside DATASET_ROOT;
# this file is the input handed to img2dataset below.
print("[2/3] Writing URL list to parquet …")
parquet_path = os.path.join(DATASET_ROOT, "hq50k_urls.parquet")
pd.DataFrame({"text": urls}).to_parquet(parquet_path, index=False)

# Step 3: download every URL into DATASET_ROOT as plain files.
# NOTE(review): the captured run of this cell logged "File sharded in 0 shards"
# and the following scan cell found 0 images. Presumably img2dataset skipped
# shards it considered already done and/or wrote into numbered shard folders
# rather than train/ and val/ - confirm against the img2dataset documentation.
# NOTE(review): the final message is printed unconditionally; it does not prove
# that any image was actually downloaded.
print("[3/3] Downloading images with img2dataset …")
download(
    processes_count=8,
    thread_count=32,
    url_list=parquet_path,
    input_format="parquet",
    url_col="text",
    output_folder=DATASET_ROOT,
    output_format="files",
    number_sample_per_shard=10000,
    save_additional_columns=None,
    distributor="multiprocessing",
    retries=2,
    timeout=20,
    min_image_size=128,
    skip_reencode=True,
)
print("Download complete — images saved under", DATASET_ROOT)


Note: you may need to restart the kernel to use updated packages.
[1/3] Loading image URLs from Hugging Face …
[1/3] Got 51250 URLs
[2/3] Writing URL list to parquet …
[3/3] Downloading images with img2dataset …
Starting the downloading of this file
Sharding file number 1 of 1 called /home/anthony/Documents/498/COEN691-project/notebooks/data/HQ-50K/hq50k_urls.parquet


0it [00:00, ?it/s]

File sharded in 0 shards
Downloading starting now, check your bandwidth speed (with bwm-ng)your cpu (with htop), and your disk usage (with iotop)!



  check_for_updates()


Download complete — images saved under data/HQ-50K


## 2. Scan Images

In [9]:

def list_images(root, subfolders=("train", "val")):
    """Recursively collect image files under ``root`` into a DataFrame.

    Each existing ``root/<subfolder>`` directory is scanned and its files are
    labelled with that sub-folder name in the ``split`` column. When none of
    the requested sub-folders exists, ``root`` itself is scanned and every file
    is labelled ``"all"``; the download cell of this notebook writes straight
    into ``root`` without creating ``train``/``val``, so without this fallback
    the scan finds nothing.

    Args:
        root: Directory that holds the dataset.
        subfolders: Names of split directories expected directly under ``root``.

    Returns:
        DataFrame with columns ``split``, ``path`` and ``size_bytes`` (``None``
        when the size cannot be read). The columns are present even when no
        image is found, and rows are ordered by path within each split.
    """
    exts = (".jpg", ".jpeg", ".png", ".bmp", ".webp", ".tif", ".tiff")
    columns = ["split", "path", "size_bytes"]

    # (split label, directory) pairs that actually exist on disk.
    targets = []
    for sub in subfolders:
        subpath = os.path.join(root, sub)
        if os.path.isdir(subpath):
            targets.append((sub, subpath))
    if not targets and os.path.isdir(root):
        # No split layout present: fall back to scanning the whole root.
        targets = [("all", root)]

    records = []
    for label, directory in targets:
        # Escape the directory so glob metacharacters in the path are literal.
        pattern = os.path.join(glob.escape(directory), "**", "*")
        # Sorting makes the row order (and any seeded sampling downstream)
        # independent of the filesystem's enumeration order.
        for path in sorted(glob.iglob(pattern, recursive=True)):
            if not path.lower().endswith(exts):
                continue
            try:
                size = os.path.getsize(path)
            except OSError:
                size = None
            records.append({"split": label, "path": path, "size_bytes": size})
    return pd.DataFrame(records, columns=columns)

# Build the file inventory for the configured split folders and report its size.
df_files = list_images(DATASET_ROOT, SUBFOLDERS)
print("Found", len(df_files), "image files")
# Show up to three random rows as the cell output (zero rows when nothing was found).
df_files.sample(min(3, len(df_files)))


Found 0 image files


## 3. Basic File Stats

In [None]:

def inspect_image(path):
    """Open one image and return its basic properties as a flat dict.

    Every key is always present, so a DataFrame built from these records has a
    stable set of columns whether all images open cleanly, some fail, or all
    fail. Code that reads ``df["error"]`` or ``df["megapixels"]`` therefore
    never hits a missing column.

    Args:
        path: Filesystem path of the image.

    Returns:
        Dict with keys ``path``, ``format``, ``mode``, ``width``, ``height``,
        ``aspect`` (width / height), ``megapixels``, ``bpp_proxy`` (file bits
        per pixel, JPEG only, otherwise NaN) and ``error`` (``None`` on
        success, otherwise the exception message). Fields that could not be
        determined keep their ``None`` / NaN default.
    """
    info = {
        "path": path,
        "format": None,
        "mode": None,
        "width": np.nan,
        "height": np.nan,
        "aspect": np.nan,
        "megapixels": np.nan,
        "bpp_proxy": np.nan,
        "error": None,
    }
    try:
        with Image.open(path) as im:
            info["format"] = im.format
            info["mode"] = im.mode
            w, h = im.size
            info["width"] = int(w)
            info["height"] = int(h)
            info["aspect"] = w / h if h else np.nan
            info["megapixels"] = (w * h) / 1e6
            if im.format == "JPEG":
                # Compressed file size in bits divided by pixel count.
                size_b = os.path.getsize(path)
                info["bpp_proxy"] = (size_b * 8) / max(1, (w * h))
    except Exception as e:
        # Best effort: record the failure and keep whatever was filled in.
        info["error"] = str(e)
    return info

# Inspect every listed file and attach the split / size columns from the
# file inventory. An empty inventory yields an empty stats frame.
if len(df_files) > 0:
    stats = []
    for row in tqdm(df_files.itertuples(), total=len(df_files)):
        stats.append(inspect_image(row.path))
    df_stats = pd.DataFrame(stats).merge(df_files[["split", "path", "size_bytes"]], on="path", how="left")
else:
    df_stats = pd.DataFrame()

# Guard on the column itself: a stats frame that has no "error" column (empty
# frame, or records that carry no error field) must not raise KeyError here.
n_errors = int(df_stats["error"].notna().sum()) if "error" in df_stats.columns else 0
print("Images with errors:", n_errors)
df_stats.head()


In [None]:

# Save basic stats: the per-image table as CSV and a small JSON summary.
if not df_stats.empty:
    df_stats.to_csv(os.path.join(OUT_DIR, "basic_stats.csv"), index=False)
    df_summary = {
        "n_images": int(len(df_stats)),
        "by_split": df_stats.groupby("split")["path"].count().to_dict(),
        "avg_megapixels": float(df_stats["megapixels"].mean(skipna=True)) if "megapixels" in df_stats else None,
        "median_megapixels": float(df_stats["megapixels"].median(skipna=True)) if "megapixels" in df_stats else None,
        "format_counts": df_stats["format"].value_counts(dropna=False).to_dict() if "format" in df_stats else {},
        "error_count": int(df_stats["error"].notna().sum()) if "error" in df_stats else 0,
    }
    with open(os.path.join(OUT_DIR, "summary_basic.json"), "w") as f:
        json.dump(df_summary, f, indent=2)
    # A bare expression nested inside an `if` block is not echoed as the cell
    # output (only a top-level last expression is), so show it explicitly.
    display(df_summary)


## 4. Distributions & Visualizations

In [None]:

# Histograms / bar chart of the basic per-image statistics. Each figure is
# saved into OUT_DIR and then shown inline. Nothing is drawn for an empty frame.
if not df_stats.empty:
    # Resolution: values above 20 MP are folded into the last bin.
    plt.figure()
    df_stats["megapixels"].dropna().clip(upper=20).hist(bins=50)
    plt.title("Megapixel Distribution (clipped at 20MP)")
    plt.xlabel("MP"); plt.ylabel("Count")
    plt.savefig(os.path.join(OUT_DIR, "hist_megapixels.png"), bbox_inches="tight")
    plt.show()

    # Aspect ratio (width / height), clipped to [0.2, 5] so extreme panoramas
    # do not stretch the axis.
    plt.figure()
    df_stats["aspect"].dropna().clip(lower=0.2, upper=5).hist(bins=60)
    plt.title("Aspect Ratio Distribution (clipped)")
    plt.xlabel("W/H"); plt.ylabel("Count")
    plt.savefig(os.path.join(OUT_DIR, "hist_aspect.png"), bbox_inches="tight")
    plt.show()

    # File bits per pixel; only rows with a non-NaN bpp_proxy are plotted,
    # values above 4 are folded into the last bin.
    if "bpp_proxy" in df_stats.columns:
        plt.figure()
        df_stats.loc[df_stats["bpp_proxy"].notna(), "bpp_proxy"].clip(upper=4).hist(bins=60)
        plt.title("JPEG Bits-per-Pixel (proxy)")
        plt.xlabel("bpp (approx)"); plt.ylabel("Count")
        plt.savefig(os.path.join(OUT_DIR, "hist_bpp_proxy.png"), bbox_inches="tight")
        plt.show()

    # Container format counts; rows without a format are grouped as "Unknown".
    plt.figure()
    df_stats["format"].fillna("Unknown").value_counts().plot(kind="bar")
    plt.title("Format Counts"); plt.ylabel("Images")
    plt.savefig(os.path.join(OUT_DIR, "bar_formats.png"), bbox_inches="tight")
    plt.show()


## 5. Quality Proxies: Sharpness, Brightness, Colorfulness

In [None]:

def var_of_laplacian(img_gray):
    """Sharpness proxy: variance of the Laplacian response of ``img_gray``.

    Returns NaN when OpenCV is unavailable (``cv2`` is ``None``).
    """
    if cv2 is not None:
        response = cv2.Laplacian(img_gray, cv2.CV_64F)
        return response.var()
    return np.nan

def brightness_and_contrast(img_gray):
    """Return ``(mean, std)`` of the grayscale values after scaling by 1/255.

    The input is converted with ``np.asarray`` to float32 first, so an 8-bit
    image yields both numbers on a 0-1 scale.
    """
    scaled = np.asarray(img_gray, dtype=np.float32) / 255.0
    mean_level = float(scaled.mean())
    spread = float(scaled.std())
    return mean_level, spread

def colorfulness_metric(im):
    """Colorfulness score computed from two opponent-colour channels.

    ``im`` is converted to RGB; the channels ``|R - G|`` and
    ``|0.5 * (R + G) - B|`` are formed, and the score is
    ``sqrt(std_rg^2 + std_yb^2) + 0.3 * sqrt(mean_rg^2 + mean_yb^2)``.
    """
    pixels = np.asarray(im.convert("RGB"), dtype=np.float32)
    red, green, blue = pixels[..., 0], pixels[..., 1], pixels[..., 2]
    rg = np.abs(red - green)
    yb = np.abs(0.5 * (red + green) - blue)
    spread = np.sqrt(np.std(rg) ** 2 + np.std(yb) ** 2)
    level = np.sqrt(np.mean(rg) ** 2 + np.mean(yb) ** 2)
    return float(spread + 0.3 * level)

def compute_quality_metrics(row):
    """Compute sharpness, brightness, contrast and colorfulness for one row.

    ``row`` only needs a ``path`` attribute. The result always carries
    ``path``; on success it also carries ``sharpness_varlap``, ``brightness``,
    ``contrast`` and ``colorfulness``. Any exception is caught and its message
    stored under ``qm_error`` instead of being raised.
    """
    result = {"path": row.path}
    try:
        with Image.open(row.path) as handle:
            rgb_img = handle.convert("RGB")
            gray_img = rgb_img.convert("L")
            result["sharpness_varlap"] = var_of_laplacian(np.asarray(gray_img))
            level, spread = brightness_and_contrast(gray_img)
            result["brightness"] = level
            result["contrast"] = spread
            result["colorfulness"] = colorfulness_metric(rgb_img)
    except Exception as exc:
        result["qm_error"] = str(exc)
    return result

# Compute the quality proxies for every image, merge them into df_stats,
# persist the table and plot one histogram per metric.
if not df_stats.empty:
    qms = []
    for row in tqdm(df_stats.itertuples(), total=len(df_stats)):
        qms.append(compute_quality_metrics(row))
    df_qm = pd.DataFrame(qms)
    # Drop metric columns left by an earlier run of this cell. Merging onto
    # them would create "_x"/"_y" duplicates and break the lookups below, so
    # this keeps the cell safe to re-run.
    df_stats = df_stats.drop(
        columns=["sharpness_varlap", "brightness", "contrast", "colorfulness", "qm_error"],
        errors="ignore",
    ).merge(df_qm, on="path", how="left")
    df_stats.to_csv(os.path.join(OUT_DIR, "stats_with_quality.csv"), index=False)

    for col, title in [
        ("sharpness_varlap", "Variance of Laplacian (Sharpness)"),
        ("brightness", "Brightness (0-1)"),
        # The value is the std of the whole grayscale image, i.e. a global
        # (RMS) contrast, not a local one.
        ("contrast", "RMS Contrast (std of grayscale)"),
        ("colorfulness", "Colorfulness (Hasler-Süsstrunk)"),
    ]:
        if col not in df_stats.columns:
            # Every image failed, so the metric was never produced.
            continue
        values = df_stats[col].dropna()
        if values.empty:
            # e.g. sharpness is all-NaN when OpenCV is unavailable.
            continue
        plt.figure()
        # Clip at the 99th percentile so a few outliers do not flatten the plot.
        values.clip(upper=np.percentile(values, 99)).hist(bins=60)
        plt.title(title); plt.xlabel(col); plt.ylabel("Count")
        plt.savefig(os.path.join(OUT_DIR, f"hist_{col}.png"), bbox_inches="tight")
        plt.show()


## 6. Frequency Richness (High-Frequency Ratio)

In [None]:

def high_frequency_ratio(im, cutoff=0.25):
    """Share of 2D-FFT magnitude that lies outside a central square.

    The image is converted to grayscale and scaled to 0-1. Its FFT magnitude
    is shifted so the zero frequency sits at the centre, and a square of side
    ``2 * int(min(h, w) * cutoff / 2)`` around the centre is treated as low
    frequency and excluded from the numerator.

    Args:
        im: Object exposing ``convert("L")`` (e.g. a PIL image).
        cutoff: Fraction of the shorter side spanned by the excluded square.

    Returns:
        ``high / (total + 1e-8)`` as a float, or NaN when the SciPy FFT
        functions are unavailable.
    """
    if fft2 is None or fftshift is None:
        return np.nan
    gray = np.asarray(im.convert("L"), dtype=np.float32) / 255.0
    spectrum = fftshift(np.abs(fft2(gray)))
    n_rows, n_cols = spectrum.shape
    mid_r, mid_c = n_rows // 2, n_cols // 2
    half = int(min(n_rows, n_cols) * cutoff / 2.0)
    # True marks the bins that count as high frequency.
    keep = np.ones_like(spectrum, dtype=bool)
    keep[mid_r - half:mid_r + half, mid_c - half:mid_c + half] = False
    high_energy = spectrum[keep].sum()
    all_energy = spectrum.sum() + 1e-8
    return float(high_energy / all_energy)

def compute_hf_ratio(row):
    """Compute the high-frequency ratio for ``row.path`` (cutoff 0.25).

    Returns a dict with ``path`` plus either ``hf_ratio`` or, when anything
    raises, ``hf_error`` holding the exception message.
    """
    result = {"path": row.path}
    try:
        with Image.open(row.path) as handle:
            ratio = high_frequency_ratio(handle, cutoff=0.25)
            result["hf_ratio"] = ratio
    except Exception as exc:
        result["hf_error"] = str(exc)
    return result

# Compute the high-frequency ratio for every image, merge it into df_stats,
# persist the table and plot its distribution.
if not df_stats.empty:
    hfs = []
    for row in tqdm(df_stats.itertuples(), total=len(df_stats)):
        hfs.append(compute_hf_ratio(row))
    df_hf = pd.DataFrame(hfs)
    # Drop columns left by an earlier run of this cell. Merging onto them
    # would create "hf_ratio_x"/"hf_ratio_y" and break the plot below.
    df_stats = df_stats.drop(columns=["hf_ratio", "hf_error"], errors="ignore").merge(
        df_hf, on="path", how="left"
    )
    df_stats.to_csv(os.path.join(OUT_DIR, "stats_with_quality_and_hf.csv"), index=False)

    # The column is absent when every image failed; skip the plot in that case.
    if "hf_ratio" in df_stats.columns:
        plt.figure()
        df_stats["hf_ratio"].dropna().hist(bins=60)
        plt.title("High-Frequency Energy Ratio")
        plt.xlabel("hf_ratio"); plt.ylabel("Count")
        plt.savefig(os.path.join(OUT_DIR, "hist_hf_ratio.png"), bbox_inches="tight")
        plt.show()


## 7. Near-Duplicate Detection (Perceptual Hash)

In [None]:

def phash_of_image(path):
    """Return the perceptual hash of the image at ``path`` as a string.

    Returns ``None`` when the ``imagehash`` package is unavailable or when the
    image cannot be opened / hashed (the exception is swallowed).
    """
    if imagehash is not None:
        try:
            with Image.open(path) as handle:
                digest = imagehash.phash(handle.convert("RGB"))
                return str(digest)
        except Exception:
            return None
    return None

# Hash every image, merge the hashes into df_stats and group paths that share
# exactly the same perceptual hash.
if not df_stats.empty and imagehash is not None:
    phashes = []
    for row in tqdm(df_stats.itertuples(), total=len(df_stats)):
        phashes.append({"path": row.path, "phash": phash_of_image(row.path)})
    df_ph = pd.DataFrame(phashes)
    # Drop a "phash" column left by an earlier run of this cell. Merging onto
    # it would create "phash_x"/"phash_y" and break the groupby below.
    df_stats = df_stats.drop(columns=["phash"], errors="ignore").merge(df_ph, on="path", how="left")

    # Only hashes shared by more than one path are kept as a cluster.
    dup_groups = df_stats[df_stats["phash"].notna()].groupby("phash")["path"].apply(list)
    dup_groups = {h: ps for h, ps in dup_groups.items() if len(ps) > 1}
    with open(os.path.join(OUT_DIR, "near_duplicates.json"), "w") as f:
        json.dump(dup_groups, f, indent=2)
    print("Near-duplicate clusters (exact phash match):", len(dup_groups))


## 8. Visual Sampling Grids

In [None]:

def save_grid(paths, out_path, cols=5, size=256):
    """Save a contact sheet of up to ``cols * cols`` thumbnails.

    Each readable image is converted to RGB and shrunk in place to fit a
    ``size`` x ``size`` cell, then pasted at the top-left corner of its cell
    on a white canvas, filling rows left to right. Unreadable images are
    skipped. Nothing is written when no image could be loaded.
    """
    thumbs = []
    for src in paths[:cols * cols]:
        try:
            with Image.open(src) as opened:
                rgb = opened.convert("RGB")
                rgb.thumbnail((size, size))
                thumbs.append(rgb)
        except Exception:
            continue
    if len(thumbs) == 0:
        return
    n_rows = math.ceil(len(thumbs) / cols)
    canvas = Image.new("RGB", (cols * size, n_rows * size), (255, 255, 255))
    for idx, thumb in enumerate(thumbs):
        canvas.paste(thumb, ((idx % cols) * size, (idx // cols) * size))
    canvas.save(out_path)

# Contact sheets of the 25 images with the lowest and the 25 with the highest
# high-frequency ratio.
if not df_stats.empty and "hf_ratio" in df_stats.columns:
    # sort_values places NaN last by default, so images without a score would
    # otherwise fill the "high HF" grid. Rank scored images only.
    df_sorted = df_stats.dropna(subset=["hf_ratio"]).sort_values("hf_ratio")
    low_hf_paths = df_sorted["path"].head(25).tolist()
    high_hf_paths = df_sorted["path"].tail(25).tolist()
    save_grid(low_hf_paths, os.path.join(OUT_DIR, "grid_low_hf.png"))
    save_grid(high_hf_paths, os.path.join(OUT_DIR, "grid_high_hf.png"))


## 9. Suggested Train/Val/Test Splits

In [None]:

import numpy as np, os
def create_split_files(df, out_dir, train_ratio=0.98, val_ratio=0.02, seed=1337):
    """Shuffle image paths and write ``train.txt`` / ``val.txt`` / ``test.txt``.

    Rows whose ``error`` column is non-null are excluded when that column
    exists. Paths are shuffled with a generator seeded by ``seed``; the first
    ``int(n * train_ratio)`` go to train, the next ``int(n * val_ratio)`` to
    val, and whatever remains to test.

    Args:
        df: DataFrame with a ``path`` column (and optionally ``error``).
        out_dir: Directory for the list files; created if missing.
        train_ratio: Fraction of paths assigned to train.
        val_ratio: Fraction of paths assigned to val.
        seed: Seed for the shuffle.

    Returns:
        Dict mapping ``"train"``, ``"val"`` and ``"test"`` to lists of paths.
    """
    rng = np.random.default_rng(seed)
    if "error" in df.columns:
        usable = df[df["error"].isna()]
    else:
        usable = df.copy()
    paths = usable["path"].tolist()
    rng.shuffle(paths)

    total = len(paths)
    train_end = int(total * train_ratio)
    val_end = train_end + int(total * val_ratio)
    split = {
        "train": paths[:train_end],
        "val": paths[train_end:val_end],
        "test": paths[val_end:],
    }

    os.makedirs(out_dir, exist_ok=True)
    for name, members in split.items():
        # One path per line, no trailing newline.
        with open(os.path.join(out_dir, f"{name}.txt"), "w") as fh:
            fh.write("\n".join(members))
    return split

# Write the split lists under OUT_DIR/splits and show how many paths each got.
if not df_stats.empty:
    split = create_split_files(df_stats, os.path.join(OUT_DIR, "splits"))
    # A bare expression nested inside an `if` block is not echoed as the cell
    # output, so show the per-split counts explicitly.
    display({k: len(v) for k, v in split.items()})


## 10. Pair Generation for Image Restoration (Synthetic Degradations)

We provide simple, configurable degradations to generate paired data for:
- **Super-Resolution (SR)**: down-scale by s∈{2,3,4}, optional blur, noise, JPEG.
- **Denoising**: add Gaussian noise (σ sampled uniformly from [5, 50) by `generate_pairs`) or Poisson noise.
- **deJPEG**: re-encode as JPEG at low quality (integer q sampled from 10–34 by `generate_pairs`).

> **Note:** Deraining/dehazing typically require specialized synthesis; you can extend the pipeline accordingly.

In [None]:

import io, hashlib
def ensure_dir(p):
    """Create directory ``p`` (including missing parents); no error if it already exists."""
    os.makedirs(p, exist_ok=True)

def degrade_sr(img, scale=4, blur_sigma=0.8, noise_sigma=2.0, jpeg_q=95, rng=None):
    """Create a low-resolution, degraded copy of ``img`` for super-resolution.

    Steps, in order: bicubic down-scaling by ``scale``, optional Gaussian blur
    (needs OpenCV), optional additive Gaussian noise, optional JPEG round trip.

    Args:
        img: Source image (PIL-style: ``size``, ``resize``).
        scale: Integer down-scaling factor; each side becomes
            ``max(1, side // scale)``.
        blur_sigma: Gaussian blur sigma; falsy disables the blur. The blur is
            also skipped when OpenCV is unavailable.
        noise_sigma: Standard deviation of the additive noise on the 0-255
            scale; falsy or non-positive disables it.
        jpeg_q: JPEG quality in 1..100; any other value disables the JPEG step.
        rng: Optional ``numpy.random.Generator`` used for the noise. When
            ``None`` the legacy global ``np.random`` state is used.

    Returns:
        The degraded image.
    """
    w, h = img.size
    lr = img.resize((max(1, w // scale), max(1, h // scale)), Image.BICUBIC)
    if blur_sigma and (cv2 is not None):
        # Odd kernel size that grows with sigma.
        k = int(blur_sigma * 4 + 1) // 2 * 2 + 1
        lr_np = cv2.GaussianBlur(np.array(lr), (k, k), blur_sigma)
        lr = Image.fromarray(lr_np)
    if noise_sigma and noise_sigma > 0:
        sampler = np.random if rng is None else rng
        lr_np = np.array(lr).astype(np.float32)
        lr_np += sampler.normal(0, noise_sigma, lr_np.shape)
        # Round before the uint8 cast: a plain cast truncates and shifts the
        # mean of the noisy image down by about half a grey level.
        lr_np = np.clip(np.rint(lr_np), 0, 255).astype(np.uint8)
        lr = Image.fromarray(lr_np)
    if jpeg_q and 1 <= jpeg_q <= 100:
        # Encode to an in-memory JPEG and decode again to bake in the artifacts.
        buf = io.BytesIO()
        lr.save(buf, format="JPEG", quality=int(jpeg_q))
        buf.seek(0)
        lr = Image.open(buf).convert("RGB")
    return lr

def degrade_denoise(img, sigma=25.0, poisson=False, rng=None):
    """Return a noisy copy of ``img`` for denoising pairs.

    Args:
        img: Source image (anything ``np.array`` accepts, e.g. a PIL image)
            with values on the 0-255 scale.
        sigma: Standard deviation of the additive Gaussian noise; ignored when
            ``poisson`` is true.
        poisson: When true, each pixel is replaced by a Poisson sample whose
            mean is the pixel value.
        rng: Optional ``numpy.random.Generator`` used for the noise. When
            ``None`` the legacy global ``np.random`` state is used.

    Returns:
        The noisy 8-bit image.
    """
    sampler = np.random if rng is None else rng
    arr = np.array(img).astype(np.float32)
    if poisson:
        # Sample on the 0-255 count scale directly. The previous /255 then
        # *255 round trip added floating-point error before a truncating cast.
        noisy = sampler.poisson(arr)
    else:
        noisy = arr + sampler.normal(0, sigma, arr.shape)
    # Round before the uint8 cast: a plain cast truncates and shifts the mean
    # of the noisy image down by about half a grey level.
    out = np.clip(np.rint(noisy), 0, 255).astype(np.uint8)
    return Image.fromarray(out)

def degrade_dejpeg(img, quality=20):
    """Round-trip ``img`` through an in-memory JPEG at the given quality.

    ``quality`` is cast to ``int`` before encoding; the decoded result is
    returned converted to RGB.
    """
    encoded = io.BytesIO()
    img.save(encoded, format="JPEG", quality=int(quality))
    encoded.seek(0)
    decoded = Image.open(encoded)
    return decoded.convert("RGB")

def write_pair(hr_img, lr_img, base_name, out_root, task):
    """Save one (HR, LR) pair as PNG files under ``out_root/task``.

    The files are ``<out_root>/<task>/HR/<base_name>.png`` and
    ``<out_root>/<task>/LR/<base_name>.png``; both directories are created
    first if missing.

    Returns:
        Tuple ``(hr_path, lr_path)`` of the written files.
    """
    task_root = os.path.join(out_root, task)
    file_name = base_name + ".png"
    targets = {}
    # Create both directories before anything is written.
    for kind in ("HR", "LR"):
        folder = os.path.join(task_root, kind)
        ensure_dir(folder)
        targets[kind] = os.path.join(folder, file_name)
    hr_img.save(targets["HR"])
    lr_img.save(targets["LR"])
    return targets["HR"], targets["LR"]

def generate_pairs(paths, out_root="./pairs", limit=None, seed=123):
    """Generate SR, denoising and deJPEG training pairs for each image.

    For every readable image three pairs are written with ``write_pair``:
    a super-resolution pair (random scale in {2, 3, 4}), a denoising pair
    (Gaussian or Poisson, chosen with equal probability) and a deJPEG pair.
    Files are named from the first 12 hex digits of the MD5 of the path.

    Args:
        paths: Iterable of image paths.
        out_root: Root directory for the generated pairs.
        limit: Stop after this many images were processed; a falsy value
            means no limit.
        seed: Seed for the degradation parameters and for the pixel noise.

    Returns:
        None. The number of processed images, and a summary of any skipped
        images, is printed.
    """
    rng = np.random.default_rng(seed)
    count = 0
    failures = []
    # degrade_sr / degrade_denoise draw their pixel noise from NumPy's legacy
    # global RNG. Seed it from `seed` for the duration of the run so the output
    # is reproducible, then restore the caller's global state.
    legacy_state = np.random.get_state()
    np.random.seed(int(np.random.default_rng(seed).integers(0, 2**32)))
    try:
        for p in tqdm(paths):
            if limit and count >= limit:
                break
            try:
                with Image.open(p) as im:
                    im = im.convert("RGB")
                    base = hashlib.md5(p.encode()).hexdigest()[:12]

                    s = int(rng.choice([2, 3, 4]))
                    # Crop the HR image to a multiple of the scale so that its
                    # size is exactly `s` times the LR size.
                    w, h = im.size
                    hr_sr = im
                    if w >= s and h >= s and (w % s or h % s):
                        hr_sr = im.crop((0, 0, w - w % s, h - h % s))
                    lr = degrade_sr(hr_sr, scale=s, blur_sigma=float(rng.uniform(0.0, 1.2)),
                                    noise_sigma=float(rng.uniform(0.0, 3.0)), jpeg_q=int(rng.integers(70, 100)))
                    write_pair(hr_sr, lr, f"sr_s{s}_{base}", out_root, "SR")

                    if rng.random() < 0.5:
                        lr = degrade_denoise(im, sigma=float(rng.uniform(5, 50)), poisson=False)
                    else:
                        lr = degrade_denoise(im, sigma=0.0, poisson=True)
                    write_pair(im, lr, f"denoise_{base}", out_root, "Denoise")

                    q = int(rng.integers(10, 35))
                    lr = degrade_dejpeg(im, quality=q)
                    write_pair(im, lr, f"dejpeg_q{q}_{base}", out_root, "DeJPEG")
                    count += 1
            except Exception as exc:
                # Best effort: keep going, but remember what was skipped.
                failures.append((p, str(exc)))
    finally:
        np.random.set_state(legacy_state)
    print("Generated pairs:", count)
    if failures:
        first_path, first_msg = failures[0]
        print(f"Skipped {len(failures)} image(s); first failure: {first_path}: {first_msg}")

# Generate pairs for a fixed random sample of at most 200 images
# (random_state=0 keeps the sample stable across runs).
if not df_stats.empty:
    sample_paths = df_stats["path"].dropna().sample(min(200, len(df_stats)), random_state=0).tolist()
    generate_pairs(sample_paths, out_root=os.path.join(OUT_DIR, "pairs"), limit=None)


## 11. Summary Tables

In [None]:

# Per-split summary table (CSV) and a flat whole-dataset report (JSON).
if not df_stats.empty:
    # Named aggregations: output column -> (source column, reducer).
    agg_spec = {
        "n": ("path", "count"),
        "mp_mean": ("megapixels", "mean"),
        "mp_median": ("megapixels", "median"),
        "bpp_mean": ("bpp_proxy", "mean"),
        "hf_mean": ("hf_ratio", "mean"),
        "sharp_mean": ("sharpness_varlap", "mean"),
        "bright_mean": ("brightness", "mean"),
        "contrast_mean": ("contrast", "mean"),
        "color_mean": ("colorfulness", "mean"),
        "n_errors": ("error", lambda s: int(s.notna().sum())),
    }
    # Keep only aggregations whose source column exists. A column can be
    # missing when nothing failed ("error") or when the cell producing a
    # metric was skipped; aggregating it would raise KeyError.
    agg_spec = {out: spec for out, spec in agg_spec.items() if spec[0] in df_stats.columns}
    agg = df_stats.groupby("split").agg(**agg_spec)
    agg.to_csv(os.path.join(OUT_DIR, "summary_by_split.csv"))
    display(agg.head())

    def _summary_stat(col, reducer):
        """Reduce df_stats[col] with 'mean' or 'median'; None when the column is absent."""
        if col not in df_stats.columns:
            return None
        return float(getattr(df_stats[col], reducer)(skipna=True))

    report = {
        "N": int(len(df_stats)),
        "train_N": int((df_stats["split"]=="train").sum()),
        "val_N": int((df_stats["split"]=="val").sum()),
        "avg_MP": _summary_stat("megapixels", "mean"),
        "median_MP": _summary_stat("megapixels", "median"),
        "avg_bpp": _summary_stat("bpp_proxy", "mean"),
        "avg_hf_ratio": _summary_stat("hf_ratio", "mean"),
        "avg_sharp_varlap": _summary_stat("sharpness_varlap", "mean"),
        "avg_brightness": _summary_stat("brightness", "mean"),
        "avg_contrast": _summary_stat("contrast", "mean"),
        "avg_colorfulness": _summary_stat("colorfulness", "mean"),
        "errors": int(df_stats["error"].notna().sum()) if "error" in df_stats else 0,
    }
    with open(os.path.join(OUT_DIR, "eda_report.json"), "w") as f:
        json.dump(report, f, indent=2)
    # A bare expression nested inside an `if` block is not echoed as the cell
    # output, so show the report explicitly.
    display(report)
