Skip to content

Commit

Permalink
Migrate training into celery and upload results to s3 (#1157)
Browse files Browse the repository at this point in the history
* upload train results to s3, add endpoint to get train results, frontend request from endpoint

:art: Auto-generated directory tree for repository in Architecture.md

:art: Format Python code with psf/black

slight reformatting

add refetch

reduce duplicated code

* resolve comments

:art: Auto-generated directory tree for repository in Architecture.md

* :art: Format Python code with psf/black

* update dockerfiles so that it automatically spins up celery and redis in dev env

* call createTrainspace, pass id to train()

* modify get_trainspace to return detailedtrainresultsdata

* resolve sonarcloud

* :art: Format Python code with psf/black

* resolve sonarcloud

* :art: Auto-generated directory tree for repository in Architecture.md

* :art: Format Python code with psf/black

* pass tests

* fix nits

---------

Co-authored-by: andrewpeng02 <andrewpeng02@users.noreply.github.com>
  • Loading branch information
andrewpeng02 and andrewpeng02 committed May 14, 2024
1 parent 9cb69e4 commit 54a800c
Show file tree
Hide file tree
Showing 47 changed files with 2,146 additions and 675 deletions.
20 changes: 14 additions & 6 deletions .github/Architecture.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,18 +30,26 @@
| | | |- 📜 __init__.py
| | | |- 📜 health_check_middleware.py
| | |- 📂 core:
| | | |- 📜 trainer.py
| | | |- 📜 criterion.py
| | | |- 📜 dl_model.py : torch model based on user specifications from drag and drop
| | | |- 📜 dataset.py : read in the dataset through URL or file upload
| | | |- 📂 celery:
| | | | |- 📜 trainer.py
| | | | |- 📜 criterion.py
| | | | |- 📜 dl_model.py : torch model based on user specifications from drag and drop
| | | | |- 📜 train_types.py
| | | | |- 📜 dataset.py : read in the dataset through URL or file upload
| | | | |- 📜 __init__.py
| | | | |- 📜 Dockerfile
| | | | |- 📜 worker.py
| | | | |- 📜 optimizer.py : what optimizer to use (ie: SGD or Adam for now)
| | | |- 📜 __init__.py
| | | |- 📜 authenticator.py
| | | |- 📜 optimizer.py : what optimizer to use (ie: SGD or Adam for now)
| | |- 📜 asgi.py
| | |- 📜 constants.py : list of helpful constants
| | |- 📜 celery_app.py
| | |- 📜 settings.py
| | |- 📜 __init__.py
| | |- 📜 wsgi.py
| | |- 📜 urls.py
| | |- 📜 celeryconfig.py
| |- 📜 README.md
| |- 📜 docker-compose.yml
| |- 📜 cli.py
Expand All @@ -52,7 +60,6 @@
| |- 📜 manage.py
| |- 📜 environment.yml
| |- 📜 docker-compose.prod.yml
| |- 📜 Dockerfile.prod
```

## Frontend Architecture
Expand Down Expand Up @@ -210,6 +217,7 @@
| | |- 📂 pages:
| | | |- 📂 train:
| | | | |- 📜 [train_space_id].tsx
| | | | |- 📜 metrics_to_charts.tsx
| | | | |- 📜 index.tsx
| | | |- 📜 _app.tsx
| | | |- 📜 forgot.tsx
Expand Down
15 changes: 15 additions & 0 deletions dlp-terraform/ecs/s3.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
resource "aws_s3_bucket" "s3bucket_executions" {
bucket = "dlp-executions"

tags = {
Name = "Execution data"
}
}
resource "aws_s3_bucket_public_access_block" "access_block_uploads" {
bucket = aws_s3_bucket.s3bucket_executions.id

block_public_acls = true
block_public_policy = true
ignore_public_acls = true
restrict_public_buckets = true
}
27 changes: 27 additions & 0 deletions dlp-terraform/ecs/sqs.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
resource "aws_sqs_queue" "training_queue" {
name = "training-queue.fifo"
fifo_queue = true
message_retention_seconds = 60*24

redrive_policy = jsonencode({
deadLetterTargetArn = aws_sqs_queue.training_queue_deadletter.arn
maxReceiveCount = 4
})
}

resource "aws_sqs_queue" "training_queue_deadletter" {
name = "training-deadletter-queue"
}

resource "aws_sqs_queue_redrive_allow_policy" "training_queue_redrive_allow_policy" {
queue_url = aws_sqs_queue.training_queue_deadletter.id

redrive_allow_policy = jsonencode({
redrivePermission = "byQueue",
sourceQueueArns = [aws_sqs_queue.training_queue.arn]
})
}

output "sqs_queue_url" {
value = aws_sqs_queue.training_queue.url
}
2 changes: 1 addition & 1 deletion frontend/next.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ const nextConfig = {
{
source: "/api/lambda/:path*",
destination:
"https://em9iri9g4j.execute-api.us-west-2.amazonaws.com/:path*",
"https://qt6nzp3sjd.execute-api.us-east-1.amazonaws.com/:path*",
},
{
source: "/api/training/:path*",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import {
import { useTrainImageMutation } from "../redux/imageApi";
import { useRouter } from "next/router";
import { removeTrainspaceData } from "@/features/Train/redux/trainspaceSlice";
import { useCreateTrainspaceMutation } from "@/features/Train/redux/trainspaceApi";

const ImageTrainspace = () => {
const trainspace = useAppSelector(
Expand Down Expand Up @@ -93,20 +94,25 @@ const TrainspaceStepInner = ({
const Component = STEP_SETTINGS[TRAINSPACE_SETTINGS.steps[step]].component;
const [isStepModified, setIsStepModified] = useState<boolean>(false);
const [train] = useTrainImageMutation();
const [createTrainspace] = useCreateTrainspaceMutation();
const[isButtonClicked, setIsButtonClicked] = useState<boolean>(false);
const dispatch = useAppDispatch();
const router = useRouter();
useEffect(() => {
if (trainspace.step < TRAINSPACE_SETTINGS.steps.length)
setStep(trainspace.step);
else {
train(trainspace)
.unwrap()
.then(({ trainspaceId }) => {
router.push({ pathname: `/train/${trainspaceId}` }).then(() => {
dispatch(removeTrainspaceData());
});
const inner = async () => {
const { trainspaceId } = await createTrainspace(trainspace).unwrap();
await train({
trainspaceData: trainspace,
trainspaceId: trainspaceId,
}).unwrap();
router.push({ pathname: `/train/${trainspaceId}` }).then(() => {
dispatch(removeTrainspaceData());
});
};
inner();
}
}, [trainspace]);
if (!Component) return <></>;
Expand Down
5 changes: 3 additions & 2 deletions frontend/src/features/Train/features/Image/redux/imageApi.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@ const imageApi = backendApi.injectEndpoints({
endpoints: (builder) => ({
trainImage: builder.mutation<
{ trainspaceId: string },
TrainspaceData<"TRAIN">
{ trainspaceData: TrainspaceData<"TRAIN">; trainspaceId: string }
>({
query: (trainspaceData) => ({
query: ({ trainspaceData, trainspaceId }) => ({
url: "/api/train/img-run",
method: "POST",
body: {
trainspace_id: trainspaceId,
name: trainspaceData.name,
data_source: trainspaceData.dataSource,
dataset_data: {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import {
import { useTrainTabularMutation } from "../redux/tabularApi";
import { useRouter } from "next/router";
import { removeTrainspaceData } from "@/features/Train/redux/trainspaceSlice";
import { useCreateTrainspaceMutation } from "@/features/Train/redux/trainspaceApi";

const TabularTrainspace = () => {
const trainspace = useAppSelector(
Expand Down Expand Up @@ -93,6 +94,7 @@ const TrainspaceStepInner = ({
const Component = STEP_SETTINGS[TRAINSPACE_SETTINGS.steps[step]].component;
const [isStepModified, setIsStepModified] = useState<boolean>(false);
const [isButtonClicked, setIsButtonClicked] = useState<boolean>(false);
const [createTrainspace] = useCreateTrainspaceMutation();
const [train] = useTrainTabularMutation();
const dispatch = useAppDispatch();
const router = useRouter();
Expand All @@ -112,16 +114,20 @@ const TrainspaceStepInner = ({
if (trainspace.step < TRAINSPACE_SETTINGS.steps.length)
setStep(trainspace.step);
else {
train(trainspace)
.unwrap()
.then(({ trainspaceId }) => {
router.push({ pathname: `/train/${trainspaceId}` }).then(() => {
dispatch(removeTrainspaceData());
});
const inner = async () => {
const { trainspaceId } = await createTrainspace(trainspace).unwrap();
await train({
trainspaceData: trainspace,
trainspaceId: trainspaceId,
}).unwrap();
router.push({ pathname: `/train/${trainspaceId}` }).then(() => {
dispatch(removeTrainspaceData());
});
};
inner();
}
}, [trainspace]);

if (!Component) return null;
return (
<Component
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@ const tabularApi = backendApi.injectEndpoints({
endpoints: (builder) => ({
trainTabular: builder.mutation<
{ trainspaceId: string },
TrainspaceData<"TRAIN">
{ trainspaceData: TrainspaceData<"TRAIN">; trainspaceId: string }
>({
query: (trainspaceData) => ({
query: ({ trainspaceData, trainspaceId }) => ({
url: "/api/training/tabular",
method: "POST",
body: {
trainspace_id: trainspaceId,
name: trainspaceData.name,
data_source: trainspaceData.dataSource,
target: trainspaceData.parameterData.targetCol,
Expand Down
36 changes: 36 additions & 0 deletions frontend/src/features/Train/redux/trainspaceApi.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,12 @@ import { backendApi } from "@/common/redux/backendApi";
import {
DATA_SOURCE,
DatasetData,
DetailedTrainResultsData,
FileUploadData,
} from "@/features/Train/types/trainTypes";
import { fetchBaseQuery } from "@reduxjs/toolkit/dist/query";
import { TrainspaceData as TabularTrainspaceData } from "../features/Tabular/types/tabularTypes";
import { TrainspaceData as ImageTrainspaceData } from "../features/Image/types/imageTypes";

const trainspaceApi = backendApi
.enhanceEndpoints({ addTagTypes: ["UserDatasetFilesData"] })
Expand Down Expand Up @@ -90,6 +93,37 @@ const trainspaceApi = backendApi
return response.data;
},
}),
createTrainspace: builder.mutation<
{ trainspaceId: string },
TabularTrainspaceData<"TRAIN"> | ImageTrainspaceData<"TRAIN">
>({
query: (trainspaceData) => ({
url: "/api/lambda/trainspace",
method: "POST",
body: {
name: trainspaceData.name,
data_source: trainspaceData.dataSource,
dataset_data: trainspaceData.datasetData,
review_data: trainspaceData.reviewData,
// TODO: add model_id
},
}),
}),
getTrainspace: builder.query<
{
config: unknown;
detailedTrainResultsData: DetailedTrainResultsData | undefined;
},
{ trainspaceId: string; withResults: boolean }
>({
query: ({ trainspaceId, withResults }) => ({
url: `/api/lambda/trainspace/${trainspaceId}`,
method: "GET",
params: {
with_results: withResults,
},
}),
}),
}),
overrideExisting: true,
});
Expand All @@ -98,4 +132,6 @@ export const {
useGetDatasetFilesDataQuery,
useUploadDatasetFileMutation,
useLazyGetColumnsFromDatasetQuery,
useCreateTrainspaceMutation,
useGetTrainspaceQuery,
} = trainspaceApi;
50 changes: 48 additions & 2 deletions frontend/src/features/Train/types/trainTypes.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { DATA_SOURCE_ARR } from "../constants/trainConstants";

// keep in sync with schemas.py
export type DATA_SOURCE = typeof DATA_SOURCE_ARR[number];

export type TRAIN_STATUS =
Expand All @@ -16,16 +17,61 @@ export interface BaseTrainspaceData {
step: number;
}

// basic information, used on dashboard
export interface TrainResultsData {
name: string;
trainspaceId: number;
trainspaceId: string;
dataSource: DATA_SOURCE;
status: TRAIN_STATUS;
created: Date;
step: string;
uid: string;
}

export type CHART_TYPE = "LINE" | "AUC/ROC" | "CONFUSION_MATRIX"

export type Chart = TimeSeriesChart | AucRocChart | ConfusionMatrixChart

export interface TimeSeriesMetric {
x_name: string;
y_name: string;

x_values: number[];
y_values: number[];
}

export interface TimeSeriesChart {
name: string;

time_series: TimeSeriesMetric[]
chart_type: "LINE"
graph_index: number;
}

export interface AucRocChart {
name: string;

values: [number[], number[], number][];

chart_type: "AUC/ROC"
graph_index: number;
}

export interface ConfusionMatrixChart {
name: string;

values: number[][];

chart_type: "CONFUSION_MATRIX"
graph_index: number;
}

// more detailed information, used when viewing a run
export interface DetailedTrainResultsData {
basic_info: TrainResultsData

all_metrics: Chart[]
}

export interface FileUploadData {
name: string;
lastModified: string;
Expand Down

0 comments on commit 54a800c

Please sign in to comment.