Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: A Prototype for Enhancing the Performance of the Data Loader #708

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ if (FL_BACKEND STREQUAL "CPU")
set(FL_USE_CPU ON)
elseif (FL_BACKEND STREQUAL "CUDA")
set(FL_USE_CUDA ON)
add_compile_options(-D FL_USE_CUDA)
elseif (FL_BACKEND STREQUAL "OPENCL")
set(FL_USE_OPENCL ON)
else ()
Expand Down
14 changes: 9 additions & 5 deletions flashlight/app/imgclass/dataset/Imagenet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,10 @@ std::shared_ptr<Dataset> imagenetDataset(

// Create image dataset
std::shared_ptr<Dataset> imageDataset = fl::ext::image::jpegLoader(filepaths);
imageDataset = std::make_shared<TransformDataset>(imageDataset, transformfns);
if (!transformfns.empty()) {
imageDataset =
std::make_shared<TransformDataset>(imageDataset, transformfns);
}

// Create labels from filepaths
auto getLabelIdxs = [&labelMap](const std::string& s) -> uint64_t {
Expand All @@ -73,10 +76,11 @@ std::shared_ptr<Dataset> imagenetDataset(
std::transform(
filepaths.begin(), filepaths.end(), labels.begin(), getLabelIdxs);

auto labelDataset = std::make_shared<LabelLoader>(labels, [](uint64_t x) {
std::vector<af::array> result{af::constant(x, 1, 1, 1, 1, af::dtype::u64)};
return result;
});
auto labelDataset = std::make_shared<LabelLoader>(
labels, [](uint64_t x, SamplePtr samplePtr) {
auto label = af::constant(x, 1, 1, 1, 1, af::dtype::u64);
samplePtr->setArray(label);
});
return std::make_shared<MergeDataset>(
MergeDataset({imageDataset, labelDataset}));
}
Expand Down
2 changes: 1 addition & 1 deletion flashlight/app/imgclass/dataset/Imagenet.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ std::unordered_map<std::string, uint64_t> getImagenetLabels(
std::shared_ptr<Dataset> imagenetDataset(
const std::string& fp,
const std::unordered_map<std::string, uint64_t>& labelMap,
std::vector<Dataset::TransformFunction> transformfns);
std::vector<Dataset::TransformFunction> transformfns = {});

constexpr uint64_t kImagenetInputIdx = 0;
constexpr uint64_t kImagenetTargetIdx = 1;
Expand Down
9 changes: 6 additions & 3 deletions flashlight/app/imgclass/examples/ImageNetEval.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,21 +85,24 @@ int main(int argc, char** argv) {
// Conventional image resize parameter used for evaluation
const int randomResizeMin = imageSize / .875;
ImageTransform testTransforms = compose(
{fl::ext::image::resizeTransform(randomResizeMin),
{fl::ext::image::reorder(1, 2, 0), // stb has channel along the first dim.
fl::ext::image::resizeTransform(randomResizeMin),
fl::ext::image::centerCropTransform(imageSize),
fl::ext::image::normalizeImage(
fl::app::image::kImageNetMean, fl::app::image::kImageNetStd)});

auto labelMap = getImagenetLabels(labelPath);
auto testDataset = fl::ext::image::DistributedDataset(
imagenetDataset(testList, labelMap, {testTransforms}),
imagenetDataset(testList, labelMap),
worldRank,
worldSize,
FLAGS_data_batch_size,
1, // train_n_repeatedaug
10, // prefetch threads
FLAGS_data_batch_size,
fl::BatchDatasetPolicy::INCLUDE_LAST);
{testTransforms},
fl::BatchDatasetPolicy::INCLUDE_LAST,
true /* usePreallocatedSamples */);
FL_LOG_MASTER(INFO) << "[testDataset size] " << testDataset.size();

// The main evaluation loop
Expand Down
15 changes: 11 additions & 4 deletions flashlight/app/imgclass/examples/ImageNetResnet34.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include "flashlight/ext/image/fl/dataset/DistributedDataset.h"
#include "flashlight/ext/image/fl/models/Resnet.h"
#include "flashlight/fl/dataset/datasets.h"
#include "flashlight/fl/dataset/Sample.h"
#include "flashlight/fl/meter/meters.h"
#include "flashlight/fl/optim/optim.h"
#include "flashlight/lib/common/String.h"
Expand Down Expand Up @@ -182,6 +183,7 @@ int main(int argc, char** argv) {
ImageTransform trainTransforms = compose(
{// randomly resize shortest side of image between 256 to 480 for
// scale invariance
fl::ext::image::reorder(1, 2, 0), // stb has channel along the first dim.
fl::ext::image::randomResizeTransform(randomResizeMin, randomResizeMax),
fl::ext::image::randomCropTransform(randomCropSize, randomCropSize),
fl::ext::image::normalizeImage(
Expand All @@ -190,6 +192,7 @@ int main(int argc, char** argv) {
fl::ext::image::randomHorizontalFlipTransform(horizontalFlipProb)});
ImageTransform valTransforms = compose(
{// Resize shortest side to 256, then take a center crop
fl::ext::image::reorder(1, 2, 0), // stb has channel along the first dim.
fl::ext::image::resizeTransform(randomResizeMin),
fl::ext::image::centerCropTransform(randomCropSize),
fl::ext::image::normalizeImage(
Expand All @@ -200,24 +203,28 @@ int main(int argc, char** argv) {
const int64_t prefetchSize = FLAGS_data_batch_size;
auto labelMap = getImagenetLabels(labelPath);
auto trainDataset = fl::ext::image::DistributedDataset(
imagenetDataset(trainList, labelMap, {trainTransforms}),
imagenetDataset(trainList, labelMap),
worldRank,
worldSize,
batchSizePerGpu,
1, // train_n_repeatedaug
prefetchThreads,
prefetchSize,
fl::BatchDatasetPolicy::SKIP_LAST);
{trainTransforms},
fl::BatchDatasetPolicy::SKIP_LAST,
true /* usePreallocatedSamples */);

auto valDataset = fl::ext::image::DistributedDataset(
imagenetDataset(valList, labelMap, {valTransforms}),
imagenetDataset(valList, labelMap),
worldRank,
worldSize,
batchSizePerGpu,
1, // train_n_repeatedaug
prefetchThreads,
prefetchSize,
fl::BatchDatasetPolicy::INCLUDE_LAST);
{valTransforms},
fl::BatchDatasetPolicy::INCLUDE_LAST,
true /* usePreallocatedSamples */);

//////////////////////////
// Load model and optimizer
Expand Down
13 changes: 9 additions & 4 deletions flashlight/app/imgclass/examples/ImageNetTransformer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,8 @@ int main(int argc, char** argv) {
imageSize);

ImageTransform trainTransforms = compose(
{fl::ext::image::randomResizeCropTransform(
{fl::ext::image::reorder(1, 2, 0), // stb has channel along the first dim.
fl::ext::image::randomResizeCropTransform(
imageSize,
0.08, // scaleLow
1.0, // scaleHigh
Expand All @@ -226,7 +227,8 @@ int main(int argc, char** argv) {
fl::ext::image::randomEraseTransform(FLAGS_train_aug_p_randomerase)});

ImageTransform valTransforms = compose(
{fl::ext::image::resizeTransform(randomResizeMin),
{fl::ext::image::reorder(1, 2, 0), // stb has channel along the first dim.
fl::ext::image::resizeTransform(randomResizeMin),
fl::ext::image::centerCropTransform(imageSize),
fl::ext::image::normalizeImage(
fl::app::image::kImageNetMean, fl::app::image::kImageNetStd)});
Expand All @@ -238,25 +240,28 @@ int main(int argc, char** argv) {
<< "[Warning] You are not using all ImageNet classes (1000)";
}

auto trainTransformVec = {trainTransforms};
auto trainDataset = std::make_shared<fl::ext::image::DistributedDataset>(
imagenetDataset(trainList, labelMap, {trainTransforms}),
imagenetDataset(trainList, labelMap),
worldRank,
worldSize,
FLAGS_data_batch_size,
FLAGS_train_aug_n_repeatedaug,
FLAGS_data_prefetch_thread,
prefetchSize,
trainTransformVec,
fl::BatchDatasetPolicy::SKIP_LAST);
FL_LOG_MASTER(INFO) << "[trainDataset size] " << trainDataset->size();

auto valDataset = fl::ext::image::DistributedDataset(
imagenetDataset(valList, labelMap, {valTransforms}),
imagenetDataset(valList, labelMap),
worldRank,
worldSize,
FLAGS_data_batch_size,
1, // train_n_repeatedaug
FLAGS_data_prefetch_thread,
prefetchSize,
{valTransforms},
fl::BatchDatasetPolicy::INCLUDE_LAST);
FL_LOG_MASTER(INFO) << "[valDataset size] " << valDataset.size();

Expand Down
4 changes: 3 additions & 1 deletion flashlight/app/objdet/dataset/Coco.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,9 @@ CocoDataset::CocoDataset(
// Now define how to load the data from CocoDataSamples in arrayfire
std::shared_ptr<Dataset> ds = std::make_shared<LoaderDataset<CocoDataSample>>(
data, [](const CocoDataSample& sample) {
af::array image = loadJpeg(sample.filepath);
auto samplePtr = std::make_shared<Sample>();
loadJpeg(sample.filepath, samplePtr);
af::array image = samplePtr->array();

long long int imageSizeArray[] = {image.dims(1), image.dims(0)};

Expand Down
26 changes: 16 additions & 10 deletions flashlight/app/objdet/examples/ImageNetResnet50Backbone.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,32 +132,36 @@ int main(int argc, char** argv) {
// af::array. Thus, we need to `compose` all of the transforms so are each
// applied only to the image
ImageTransform trainTransforms = compose(
{// randomly resize shortest side of image between 256 to 480 for
// scale invariance
{fl::ext::image::reorder(1, 2, 0), // stb has channel along the first dim.
// randomly resize shortest side of image between 256 to
// 480 for scale invariance
fl::ext::image::randomResizeTransform(randomResizeMin, randomResizeMax),
fl::ext::image::randomCropTransform(randomCropSize, randomCropSize),
fl::ext::image::normalizeImage(mean, std),
// Randomly flip image with probability of 0.5
fl::ext::image::randomHorizontalFlipTransform(horizontalFlipProb)});
ImageTransform valTransforms =
compose({// Resize shortest side to 256, then take a center crop
fl::ext::image::resizeTransform(randomResizeMin),
fl::ext::image::centerCropTransform(randomCropSize),
fl::ext::image::normalizeImage(mean, std)});
ImageTransform valTransforms = compose(
{// Resize shortest side to 256, then take a center crop
fl::ext::image::reorder(1, 2, 0), // stb has channel along the first dim.
fl::ext::image::resizeTransform(randomResizeMin),
fl::ext::image::centerCropTransform(randomCropSize),
fl::ext::image::normalizeImage(mean, std)});

const int64_t batchSizePerGpu = FLAGS_data_batch_size;
const int64_t prefetchThreads = 10;
const int64_t prefetchSize = FLAGS_data_batch_size;
auto labelMap = getImagenetLabels(labelPath);
auto trainDataset = fl::ext::image::DistributedDataset(
imagenetDataset(trainList, labelMap, {trainTransforms}),
imagenetDataset(trainList, labelMap),
worldRank,
worldSize,
batchSizePerGpu,
1, // train_n_repeatedaug
prefetchThreads,
prefetchSize,
fl::BatchDatasetPolicy::INCLUDE_LAST);
{trainTransforms},
fl::BatchDatasetPolicy::INCLUDE_LAST,
true /* usePreallocatedSamples */);

auto valDataset = fl::ext::image::DistributedDataset(
imagenetDataset(valList, labelMap, {valTransforms}),
Expand All @@ -167,7 +171,9 @@ int main(int argc, char** argv) {
1, // train_n_repeatedaug
prefetchThreads,
prefetchSize,
fl::BatchDatasetPolicy::INCLUDE_LAST);
{trainTransforms},
fl::BatchDatasetPolicy::INCLUDE_LAST,
true /* usePreallocatedSamples */);

//////////////////////////
// Load model and optimizer
Expand Down
10 changes: 10 additions & 0 deletions flashlight/ext/amp/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,17 @@
cmake_minimum_required(VERSION 3.10)

if (FL_USE_CUDA)
target_sources(
flashlight
PRIVATE
${CMAKE_CURRENT_LIST_DIR}/DynamicScaler.cpp
${CMAKE_CURRENT_LIST_DIR}/backend/cuda/Utils.cu
)
else()
target_sources(
flashlight
PRIVATE
${CMAKE_CURRENT_LIST_DIR}/DynamicScaler.cpp
${CMAKE_CURRENT_LIST_DIR}/backend/cpu/Utils.cpp
)
endif()
51 changes: 17 additions & 34 deletions flashlight/ext/amp/DynamicScaler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
*/

#include "flashlight/ext/amp/DynamicScaler.h"

#include "flashlight/ext/amp/Utils.h"
#include "flashlight/fl/flashlight.h"

namespace fl {
Expand All @@ -16,59 +16,42 @@ DynamicScaler::DynamicScaler(
double initFactor,
double maxFactor,
unsigned int updateInterval)
: scaleFactor_(initFactor),
maxScaleFactor_(maxFactor),
updateInterval_(updateInterval) {}
: maxScaleFactor_(maxFactor), updateInterval_(updateInterval) {
scaleFactor_ = fl::Variable(af::constant(initFactor, 1, 1, 1, 1, f32), false);
flag_ = af::constant(0, 1, 1, 1, 1, s32);
}

fl::Variable DynamicScaler::scale(const fl::Variable& loss) {
// Force casting to fp32 to avoid overflow in scaling.
auto scaledLoss = loss.as(af::dtype::f32);
scaledLoss = scaledLoss * scaleFactor_;
scaledLoss = scaleLoss(scaledLoss, scaleFactor_);
return scaledLoss;
}

bool DynamicScaler::unscale(std::vector<fl::Variable>& params) {
for (auto& p : params) {
p.grad() = p.grad() / scaleFactor_;
if (fl::isInvalidArray(p.grad().array())) {
if (scaleFactor_ >= fl::kAmpMinimumScaleFactorValue) {
scaleFactor_ = scaleFactor_ / 2.0f;
FL_LOG(INFO) << "AMP: Scale factor decreased. New value:\t"
<< scaleFactor_;
} else {
FL_LOG(FATAL) << "Minimum loss scale reached: "
<< fl::kAmpMinimumScaleFactorValue
<< " with over/underflowing gradients. Lowering the "
<< "learning rate, using gradient clipping, or "
<< "increasing the batch size can help resolve "
<< "loss explosion.";
}
successCounter_ = 0;
return false;
}
validityCheck(p.grad().array(), flag_);
}
for (auto& p : params) {
scaleGrads(p.grad().array(), scaleFactor_.array(), flag_);
}

++successCounter_;
return true;
return adjustScaleFactor(scaleFactor_.array(), flag_);
}

void DynamicScaler::update() {
if (scaleFactor_ >= maxScaleFactor_) {
return;
}

if (scaleFactor_ == updateInterval_) {
scaleFactor_ *= 2;
FL_VLOG(2) << "AMP: Scale factor doubled. New value:\t" << scaleFactor_;
if (successCounter_ == updateInterval_) {
scaleFactor_.array() = scaleFactor_.array() * 2;
FL_VLOG(2) << "AMP: Scale factor doubled";
successCounter_ = 0;
} else {
scaleFactor_ += 2;
FL_VLOG(3) << "AMP: Scale factor incremented. New value\t" << scaleFactor_;
scaleFactor_.array() = scaleFactor_.array() + 2;
FL_VLOG(3) << "AMP: Scale factor incremented";
}
}

double DynamicScaler::getScaleFactor() const {
return scaleFactor_;
return scaleFactor_.scalar<float>();
}

} // namespace ext
Expand Down
12 changes: 10 additions & 2 deletions flashlight/ext/amp/DynamicScaler.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,23 @@ class DynamicScaler {
double getScaleFactor() const;

private:
double scaleFactor_;
fl::Variable scaleFactor_;
// The maximum value of scaleFactor_.
double maxScaleFactor_;
// Number of iterations without changing scaleFactor_.
unsigned int successCounter_{0};
// Double up the scaleFactor_ when successCounter_ equals updateInterval_.
unsigned int updateInterval_;

FL_SAVE_LOAD(scaleFactor_, maxScaleFactor_, updateInterval_, successCounter_)

af::array flag_;

FL_SAVE_LOAD(
maxScaleFactor_,
updateInterval_,
successCounter_,
scaleFactor_,
flag_)
DynamicScaler() = default;
};

Expand Down