Generate Codebooks for All CSVs in the Repository

In [1]:
import os
import glob
import pandas as pd
from pathlib import Path
from ydata_profiling import ProfileReport

Setup script dir and paths

In [2]:
script_dir = Path.cwd()
project_root = script_dir.parent.parent
output_dir = project_root / "codebooks"
output_dir.mkdir(exist_ok=True)

Search recursively across repo folders for csvs

In [3]:
def find_all_csv_files(root_dir):
    csv_files = []
    for csv_file in glob.glob(root_dir + "/**/*.csv", recursive=True):
        path = Path(csv_file)
        if any(part in str(path) for part in ['venv', '__pycache__', '.git']):
            continue
        csv_files.append(str(path))
    return sorted(csv_files)

csv_files = find_all_csv_files(str(project_root))

Create and save ProfileReport for each csv

In [4]:
for csv_file in csv_files:
        dataset_name = Path(csv_file).stem
        codebook_path = os.path.join(output_dir, dataset_name + "_codebook.html")
        df = pd.read_csv(csv_file)
        profile = ProfileReport(df, title="Data Profile: " + dataset_name, explorative=True)
        profile.to_file(codebook_path)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]




  0%|          | 0/58 [00:00<?, ?it/s]

[A




  5%|▌         | 3/58 [00:00<00:01, 28.61it/s]

[A




 72%|███████▏  | 42/58 [00:00<00:00, 229.49it/s]

[A

100%|██████████| 58/58 [00:00<00:00, 202.12it/s]




Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]




  0%|          | 0/67 [00:00<?, ?it/s]

[A




 15%|█▍        | 10/67 [00:00<00:00, 60.63it/s]

[A




 34%|███▍      | 23/67 [00:00<00:00, 89.98it/s]

[A




 54%|█████▎    | 36/67 [00:00<00:00, 104.98it/s]

[A




 76%|███████▌  | 51/67 [00:00<00:00, 100.13it/s]

[A




 93%|█████████▎| 62/67 [00:00<00:00, 97.75it/s] 

[A

100%|██████████| 67/67 [00:00<00:00, 97.58it/s]




Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]




  0%|          | 0/73 [00:00<?, ?it/s]

[A




  4%|▍         | 3/73 [00:00<00:02, 26.39it/s]

[A




 42%|████▏     | 31/73 [00:00<00:00, 159.86it/s]

[A




 70%|██████▉   | 51/73 [00:00<00:00, 172.71it/s]

[A




 95%|█████████▍| 69/73 [00:00<00:00, 135.76it/s]

[A

100%|██████████| 73/73 [00:00<00:00, 143.58it/s]




Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]




  0%|          | 0/14 [00:00<?, ?it/s]

[A

100%|██████████| 14/14 [00:00<00:00, 797.86it/s]




Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]




  0%|          | 0/3 [00:00<?, ?it/s]

[A

100%|██████████| 3/3 [00:00<00:00, 124.78it/s]




Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]




  0%|          | 0/67 [00:00<?, ?it/s]

[A




 54%|█████▎    | 36/67 [00:00<00:00, 348.38it/s]

[A

100%|██████████| 67/67 [00:00<00:00, 337.03it/s]




Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]




  0%|          | 0/67 [00:00<?, ?it/s]

[A




 88%|████████▊ | 59/67 [00:00<00:00, 553.86it/s]

[A

100%|██████████| 67/67 [00:00<00:00, 623.04it/s]




Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]




  0%|          | 0/5 [00:00<?, ?it/s]

[A

100%|██████████| 5/5 [00:00<00:00, 305.20it/s]




Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]




  0%|          | 0/12 [00:00<?, ?it/s]

[A

100%|██████████| 12/12 [00:00<00:00, 249166.57it/s]




Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]




  0%|          | 0/8 [00:00<?, ?it/s]

[A

100%|██████████| 8/8 [00:00<00:00, 801.19it/s]




Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]




  0%|          | 0/8 [00:00<?, ?it/s]

[A

100%|██████████| 8/8 [00:00<00:00, 502.04it/s]




Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]




  0%|          | 0/26 [00:00<?, ?it/s]

[A




  4%|▍         | 1/26 [00:00<00:02,  9.97it/s]

[A




  8%|▊         | 2/26 [00:00<00:02,  8.75it/s]

[A




 54%|█████▍    | 14/26 [00:00<00:00, 52.59it/s]

[A




 96%|█████████▌| 25/26 [00:00<00:00, 69.54it/s]

[A

100%|██████████| 26/26 [00:00<00:00, 57.07it/s]




Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]