# Save Training Data
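* Preprocesses the training data one partition at a time and, when cloud storage is enabled in the settings, uploads each partition to the cloud and then deletes the local copy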

In [None]:
import NBInclude: @nbinclude
@nbinclude("../Alpha.ipynb");

In [None]:
"""
    copy(src, dst)

Copy `src` to `dst` by shelling out to `rclone copy` (with `--retries=10`) and return the
result of `run`. The transfer is announced with an `@info` log line first.
"""
# NOTE(review): this name coincides with `Base.copy`; being an unqualified definition it
# does not add a method to it — confirm nothing in this notebook needs `Base.copy`.
function copy(src, dst)
    @info "copying $src to $dst"
    command = `rclone --retries=10 copy $src $dst`
    return run(command)
end

"""
    get_path(repo, version, part)

Return the local directory of partition `part` of dataset `version`, i.e.
`data/alphas/all/Transformer/<version>/<part>` joined onto `repo`.
"""
function get_path(repo, version, part)
    relative = "data/alphas/all/Transformer/$version/$part"
    return joinpath(repo, relative)
end

"""
    get_cloud_path(version, part)

Return the remote location of partition `part` of dataset `version`: the `"bucket"` entry
of `get_settings()` joined with `version` and the stringified `part`.
"""
function get_cloud_path(version, part)
    bucket = get_settings()["bucket"]
    return joinpath(bucket, version, "$part")
end

"""
    is_epoch_complete(repo, version, part, split, epoch)

Return `true` when the file `<split>/<epoch>/complete` exists inside the local directory
of the given partition, and `false` otherwise.
"""
function is_epoch_complete(repo, version, part, split, epoch)
    # presumably the marker is written by the dataset notebook once the epoch is fully
    # on disk — confirm against PretrainDataset.ipynb
    marker = joinpath(get_path(repo, version, part), split, "$epoch", "complete")
    return isfile(marker)
end

"""
    save_epoch(repo, version, part, split, epoch, bucket)

Upload the local `<split>/<epoch>` directory of a partition to the matching location
under the partition's cloud path.

`bucket` is accepted but not read here; the destination is derived from `get_cloud_path`.
"""
function save_epoch(repo, version, part, split, epoch, bucket)
    local_root = get_path(repo, version, part)
    cloud_root = get_cloud_path(version, part)
    src = joinpath(local_root, split, "$epoch")
    dst = joinpath(cloud_root, split, "$epoch")
    return copy(src, dst)
end

"""
    save_config(repo, version, part, bucket)

Upload the partition's local `config.json` to the partition's cloud path.

`bucket` is accepted but not read here; the destination is derived from `get_cloud_path`.
"""
function save_config(repo, version, part, bucket)
    local_root = get_path(repo, version, part)
    src = joinpath(local_root, "config.json")
    dst = get_cloud_path(version, part)
    return copy(src, dst)
end

"""
    gen_part(repo, version, part, num_parts, num_epochs)

Execute `PretrainDataset.ipynb` with papermill for a single partition and return the
result of `run`.

The notebook is run from `notebooks/TrainingAlphas/Transformer` under `repo`, and the
executed copy is written to
`data/papermill/training_alphas/transformer/all/PretrainDataset.<part>.ipynb` under `repo`.
The papermill parameters `partition`, `num_partitions`, `num_epochs` and `version` are
passed with `-p`.

# Throws
Whatever `run` throws when papermill cannot be started or exits with a non-zero status.
"""
function gen_part(repo, version, part, num_parts, num_epochs)
    workdir = joinpath(repo, "notebooks/TrainingAlphas/Transformer")
    input = "PretrainDataset.ipynb"
    output = joinpath(
        repo,
        "data/papermill/training_alphas/transformer/all/PretrainDataset.$part.ipynb",
    )
    parameters = (
        "partition" => part,
        "num_partitions" => num_parts,
        "num_epochs" => num_epochs,
        "version" => version,
    )
    opts = String[]
    for (name, value) in parameters
        push!(opts, "-p", name, string(value))
    end
    # This function is launched from spawned tasks. `cd` would change the working
    # directory of the whole process (and never restore it), racing with every other
    # task, so the directory is set on the child process only.
    cmd = Cmd(`papermill $input $output $opts`; dir = workdir)
    return run(cmd)
end

"""
    save_part(repo, version, part, num_epochs, bucket)

Upload every epoch of a partition, then its config.

Epochs `0` to `num_epochs - 1` are handled in order, and within each epoch the
`"validation"` split is handled before `"training"`. Each split/epoch is uploaded only once
`is_epoch_complete` reports it as finished; until then this function polls every 10
seconds, with no upper bound on the wait. Returns the result of `save_config`.
"""
function save_part(repo, version, part, num_epochs, bucket)
    for epoch in 0:(num_epochs - 1), split in ("validation", "training")
        # Block until this split/epoch has been marked complete on disk.
        while true
            is_epoch_complete(repo, version, part, split, epoch) && break
            sleep(10)
        end
        save_epoch(repo, version, part, split, epoch, bucket)
    end
    return save_config(repo, version, part, bucket)
end

"""
    remove_part(repo, version, part)

Recursively delete the local directory of the given partition, logging the path first.
"""
function remove_part(repo, version, part)
    path = get_path(repo, version, part)
    @info "deleting $path"
    return rm(path; recursive = true)
end

"""
    save_training_data()

Generate all partitions of the `maskv4` Transformer training data (4 partitions, 128
epochs each) and, when the `"cloud_storage"` setting is true, upload each partition to the
`"bucket"` setting and delete its local copy.

Each partition is generated by `gen_part` on a spawned task. With cloud storage enabled
the partitions are processed one after another, because the upload of a partition blocks
until all of its epochs are complete. Without it, all generators run concurrently.

The function returns only after every generator task has finished.

# Throws
`TaskFailedException` if a generator task failed.
"""
function save_training_data()
    repo = get_data_path("../")
    version = "maskv4"
    set_logging_outdir("all/Transformer/$version")
    num_parts = 4
    num_epochs = 128
    generators = Task[]
    for part = 0:num_parts-1
        generator = Threads.@spawn gen_part(repo, version, part, num_parts, num_epochs)
        push!(generators, generator)
        settings = get_settings()
        if settings["cloud_storage"]
            # NOTE(review): if the generator dies before writing its completion markers,
            # `save_part` keeps polling; the failure only surfaces at the `wait` below.
            save_part(repo, version, part, num_epochs, settings["bucket"])
            # Make sure papermill has exited (and did so successfully) before its output
            # directory is deleted underneath it.
            wait(generator)
            remove_part(repo, version, part)
        end
    end
    # Do not return while generators are still running, and surface their failures
    # instead of silently dropping them.
    foreach(wait, generators)
    return nothing
end;

In [None]:
# Run the pipeline: generate every partition and, if enabled in the settings, upload it.
save_training_data()