diff --git a/medcat-service/README.md b/medcat-service/README.md index 95b89a9de..56bfdf74d 100644 --- a/medcat-service/README.md +++ b/medcat-service/README.md @@ -316,6 +316,9 @@ The following environment variables are available for tailoring the MedCAT Servi - `SERVER_PORT` - the port number used (default: `5000`), - `SERVER_WORKERS` - the number of workers serving the Flask app working in parallel (default: `1` ; only used in production server). - `SERVER_WORKER_TIMEOUT` - the max timeout (in sec) for receiving response from worker (default: `300` ; only used with production server). +- `SERVER_GUNICORN_MAX_REQUESTS` - maximum number of requests a worker will process before restarting (default: `1000`), +- `SERVER_GUNICORN_MAX_REQUESTS_JITTER` - adds randomness to `SERVER_GUNICORN_MAX_REQUESTS` to avoid all workers restarting simultaneously (default: `50`), +- `SERVER_GUNICORN_EXTRA_ARGS` - any additional Gunicorn CLI arguments to pass through to the server, verbatim (default: none; example: `SERVER_GUNICORN_EXTRA_ARGS="--backlog 20"`). The following environment variables are available for tailoring the MedCAT Service wrapper: diff --git a/medcat-service/env/app.env b/medcat-service/env/app.env index cb68c397c..e59c7ad2f 100755 --- a/medcat-service/env/app.env +++ b/medcat-service/env/app.env @@ -36,6 +36,8 @@ SERVER_PORT=5000 SERVER_WORKERS=1 SERVER_WORKER_TIMEOUT=300 SERVER_THREADS=1 +SERVER_GUNICORN_MAX_REQUESTS=1000 +SERVER_GUNICORN_MAX_REQUESTS_JITTER=50 # set the number of torch threads, this should be used ONLY if you are using CPUs and the default image # set to -1 or 0 if you are using GPU @@ -43,4 +45,4 @@ APP_TORCH_THREADS=8 # GPU SETTING # CAUTION, use only if you are using the GPU docker image. 
-APP_CUDA_DEVICE_COUNT=1 +APP_CUDA_DEVICE_COUNT=-1 diff --git a/medcat-service/env/app_deid.env b/medcat-service/env/app_deid.env index 56607c72b..e59c7ad2f 100755 --- a/medcat-service/env/app_deid.env +++ b/medcat-service/env/app_deid.env @@ -36,6 +36,8 @@ SERVER_PORT=5000 SERVER_WORKERS=1 SERVER_WORKER_TIMEOUT=300 SERVER_THREADS=1 +SERVER_GUNICORN_MAX_REQUESTS=1000 +SERVER_GUNICORN_MAX_REQUESTS_JITTER=50 # set the number of torch threads, this should be used ONLY if you are using CPUs and the default image # set to -1 or 0 if you are using GPU diff --git a/medcat-service/start_service_production.sh b/medcat-service/start_service_production.sh index f82cc67c9..185009aeb 100644 --- a/medcat-service/start_service_production.sh +++ b/medcat-service/start_service_production.sh @@ -33,6 +33,16 @@ if [ -z ${SERVER_WORKER_TIMEOUT+x} ]; then echo "SERVER_WORKER_TIMEOUT is unset -- setting to default (sec): $SERVER_WORKER_TIMEOUT"; fi +if [ -z ${SERVER_GUNICORN_MAX_REQUESTS+x} ]; then + SERVER_GUNICORN_MAX_REQUESTS=1000; + echo "SERVER_GUNICORN_MAX_REQUESTS is unset -- setting to default: $SERVER_GUNICORN_MAX_REQUESTS"; +fi + +if [ -z ${SERVER_GUNICORN_MAX_REQUESTS_JITTER+x} ]; then + SERVER_GUNICORN_MAX_REQUESTS_JITTER=50; + echo "SERVER_GUNICORN_MAX_REQUESTS_JITTER is unset -- setting to default: $SERVER_GUNICORN_MAX_REQUESTS_JITTER"; +fi + # Note - SERVER_ACCESS_LOG_FORMAT is unused when worker-class is set to UvicornWorker SERVER_ACCESS_LOG_FORMAT="%(t)s [ACCESS] %(h)s \"%(r)s\" %(s)s \"%(f)s\" \"%(a)s\"" @@ -40,6 +50,8 @@ SERVER_ACCESS_LOG_FORMAT="%(t)s [ACCESS] %(h)s \"%(r)s\" %(s)s \"%(f)s\" \"%(a)s # # Using Gunicorn, even though FastAPI recommends Uvicorn, to keep support for the post_fork config echo "Starting up the service using gunicorn server ..." 
+set -x + exec gunicorn \ --bind "$SERVER_HOST:$SERVER_PORT" \ --workers="$SERVER_WORKERS" \ @@ -50,5 +62,8 @@ exec gunicorn \ --error-logfile=- \ --log-level info \ --config /cat/config.py \ + --max-requests="$SERVER_GUNICORN_MAX_REQUESTS" \ + --max-requests-jitter="$SERVER_GUNICORN_MAX_REQUESTS_JITTER" \ + ${SERVER_GUNICORN_EXTRA_ARGS:-} \ --worker-class uvicorn.workers.UvicornWorker \ medcat_service.main:app