In [1]:
import os
import lmdb
import pandas as pd

# Base directory whose immediate subdirectories are treated as datasets.
base_path = "./"

# Names of all subdirectories of base_path. os.listdir returns entries in
# arbitrary order, so sort them to keep the resulting table's row order
# reproducible across machines and re-runs.
folders = sorted(
    entry
    for entry in os.listdir(base_path)
    if os.path.isdir(os.path.join(base_path, entry))
)

# Function to count LMDB entries
def get_lmdb_length(lmdb_path):
    """Return the number of entries in the LMDB database at ``lmdb_path``.

    Parameters
    ----------
    lmdb_path : str
        Path to an LMDB environment. It may be a directory or a single file;
        the ``subdir`` flag passed to ``lmdb.open`` is derived from which one
        it is.

    Returns
    -------
    int
        The ``entries`` statistic of the environment's default database.
        Returns 0 when the path does not exist, and also when opening or
        reading the environment fails (the error is printed, not raised).
        A result of 0 therefore does not distinguish "missing", "unreadable"
        and "empty".
    """
    if not os.path.exists(lmdb_path):
        return 0  # Return 0 if LMDB database does not exist

    try:
        env = lmdb.open(
            lmdb_path,
            subdir=os.path.isdir(lmdb_path),
            readonly=True,
            lock=False,
            readahead=False,
            meminit=False,
        )
        try:
            # Environment.stat() reports statistics for the default database,
            # so no read transaction is needed just to get the entry count.
            return env.stat()['entries']
        finally:
            # Always release the environment handle, even if stat() fails.
            env.close()
    except Exception as e:
        # Best-effort: report the problem and let the caller carry on.
        print(f"Error reading {lmdb_path}: {e}")
        return 0  # Return 0 if an error occurs

# Build one row per dataset folder: [name, train count, valid count, total].
data = []

for dataset_name in folders:
    # Count train.lmdb first, then valid.lmdb, both inside the dataset folder.
    split_counts = [
        get_lmdb_length(os.path.join(base_path, dataset_name, lmdb_name))
        for lmdb_name in ("train.lmdb", "valid.lmdb")
    ]
    data.append([dataset_name, *split_counts, sum(split_counts)])

# Assemble the summary table from the collected rows.
df = pd.DataFrame(
    data,
    columns=["Dataset", "Train Entries", "Valid Entries", "Total Entries"],
)

df

Unnamed: 0,Dataset,Train Entries,Valid Entries,Total Entries
0,crystal_ll,350968,87741,438709
1,crystal_hh,350968,87741,438709
2,film_hh,1503930,375982,1879912
3,pretrain,0,0,0
4,film_ll,1502845,375711,1878556
