In [8]:
import os
import pathlib
import shutil

import pandas as pd
from sklearn.model_selection import train_test_split


def _chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i : i + n]


def generate_csv_test_dataset(
    name: str, df: pd.DataFrame, test_datasets_path: pathlib.Path, random_state: int
) -> None:
    # generate dirs
    base_path = test_datasets_path / "csv" / name
    os.makedirs(base_path, exist_ok=True)
    os.makedirs(base_path / "split" / "train", exist_ok=True)
    os.makedirs(base_path / "split" / "test", exist_ok=True)
    os.makedirs(base_path / "split" / "val", exist_ok=True)
    os.makedirs(base_path / "bad_split" / "train", exist_ok=True)
    os.makedirs(base_path / "bad_split" / "test", exist_ok=True)
    os.makedirs(base_path / "bad_split" / "val", exist_ok=True)
    os.makedirs(base_path / "splits" / "train", exist_ok=True)
    os.makedirs(base_path / "splits" / "test", exist_ok=True)
    os.makedirs(base_path / "splits" / "val", exist_ok=True)

    # ---------------------------------------------------------------------------------
    # common cases
    df.to_csv(base_path / "comma.csv", sep=",", index=False)
    df.to_csv(base_path / "semicolon.csv", sep=";", index=False)
    df.to_csv(base_path / "tab.csv", sep="\t", index=False)
    df.to_csv(base_path / "vertical_bar.csv", sep="|", index=False)

    # ---------------------------------------------------------------------------------
    # no header
    df.to_csv(
        base_path / "no_header.csv",
        sep=";",
        header=False,
        index=False,
    )

    # ---------------------------------------------------------------------------------
    # bad format
    df.to_csv(
        path_or_buf=base_path / "bad_format.csv",
        sep=";",
        header=False,
        index=False,
    )
    with open(base_path / "bad_format.csv", "w") as file:
        file.write("------")

    # ---------------------------------------------------------------------------------
    # empty file
    with open(base_path / "empty_file.csv", "w") as file:
        file.write("")

    # ---------------------------------------------------------------------------------
    # empty dataframe
    df.head(0).to_csv(
        path_or_buf=base_path / "empty_dataset.csv",
        sep=";",
        index=False,
    )

    # ---------------------------------------------------------------------------------
    # with splits
    train, rest = train_test_split(
        df,
        train_size=0.334,
        stratify=df.target,
        random_state=random_state,
    )
    test, val = train_test_split(
        rest,
        train_size=0.5,
        stratify=rest.target,
        random_state=random_state,
    )

    train.to_csv(
        path_or_buf=base_path / "split" / "train" / "train.csv",
        sep=";",
        index=False,
    )
    test.to_csv(
        path_or_buf=base_path / "split" / "test" / "test.csv",
        sep=";",
        index=False,
    )
    val.to_csv(
        path_or_buf=base_path / "split" / "val" / "val.csv",
        sep=";",
        index=False,
    )
    shutil.make_archive(str(base_path / "split.zip"), "zip", base_path / "split")

    # ---------------------------------------------------------------------------------
    # with splits but bad folders
    train.to_csv(
        path_or_buf=base_path / "bad_split" / "train.csv",
        sep=";",
        index=False,
    )
    test.to_csv(
        path_or_buf=base_path / "bad_split" / "test.csv",
        sep=";",
        index=False,
    )
    val.to_csv(
        path_or_buf=base_path / "bad_split" / "val.csv",
        sep=";",
        index=False,
    )
    shutil.make_archive(
        str(base_path / "bad_split.zip"),
        "zip",
        base_path / "bad_split",
    )

    # ---------------------------------------------------------------------------------
    # with several splits
    def get_start_end_idx(df, n, i):
        start_idx = int(i * len(df) / n)
        end_idx = min(int((i + 1) * len(df) / n), len(df))
        return start_idx, end_idx

    n = 5
    for i in range(n - 1):
        train_start, train_end = get_start_end_idx(train, n, i)
        train.iloc[train_start:train_end].to_csv(
            path_or_buf=base_path / "splits" / "train" / f"train_{i}.csv",
            sep=";",
            index=False,
        )

        test_start, test_end = get_start_end_idx(test, n, i)
        test.iloc[test_start:test_end].to_csv(
            path_or_buf=base_path / "splits" / "test" / f"test_{i}.csv",
            sep=";",
            index=False,
        )

        val_start, val_end = get_start_end_idx(test, n, i)
        val.iloc[val_start:val_end].to_csv(
            path_or_buf=base_path / "splits" / "val" / f"val_{i}.csv",
            sep=";",
            index=False,
        )
    shutil.make_archive(
        str(base_path / "splits.zip"),
        "zip",
        base_path / "splits",
    )

    # TODO: splits with one error.

In [10]:
from sklearn.datasets import load_iris, load_wine

TEST_DATASETS_PATH = pathlib.Path("./test_datasets")
RANDOM_STATE = 50

os.makedirs(TEST_DATASETS_PATH, exist_ok=True)
with open(TEST_DATASETS_PATH / ".gitignore", "w") as f:
    f.write("*")

df_iris = load_iris(return_X_y=False, as_frame=True)["frame"]
generate_csv_test_dataset(
    "iris",
    df=df_iris,
    test_datasets_path=TEST_DATASETS_PATH,
    random_state=RANDOM_STATE,
)

In [None]:
df_wine = load_wine(return_X_y=False, as_frame=True)["frame"]
generate_csv_test_dataset("wine", df=df_wine)

In [None]:
[lst[i : i + n] for i in range(0, len(lst), n)]

AttributeError: module 'itertools' has no attribute 'batched'

In [None]:
for i in chunks(range(150), 3):
    print(list(i))

[0, 1, 2]
[3, 4, 5]
[6, 7, 8]
[9, 10, 11]
[12, 13, 14]
[15, 16, 17]
[18, 19, 20]
[21, 22, 23]
[24, 25, 26]
[27, 28, 29]
[30, 31, 32]
[33, 34, 35]
[36, 37, 38]
[39, 40, 41]
[42, 43, 44]
[45, 46, 47]
[48, 49, 50]
[51, 52, 53]
[54, 55, 56]
[57, 58, 59]
[60, 61, 62]
[63, 64, 65]
[66, 67, 68]
[69, 70, 71]
[72, 73, 74]
[75, 76, 77]
[78, 79, 80]
[81, 82, 83]
[84, 85, 86]
[87, 88, 89]
[90, 91, 92]
[93, 94, 95]
[96, 97, 98]
[99, 100, 101]
[102, 103, 104]
[105, 106, 107]
[108, 109, 110]
[111, 112, 113]
[114, 115, 116]
[117, 118, 119]
[120, 121, 122]
[123, 124, 125]
[126, 127, 128]
[129, 130, 131]
[132, 133, 134]
[135, 136, 137]
[138, 139, 140]
[141, 142, 143]
[144, 145, 146]
[147, 148, 149]
