From 286a4ea63c5d75be1d40f0f9d7bac2fa29a67ae5 Mon Sep 17 00:00:00 2001 From: Tim Ballard <1425377+timoballard@users.noreply.github.com> Date: Mon, 6 May 2024 13:51:08 -0500 Subject: [PATCH 1/2] track scan results, timestamps in a sqlite database --- .gitignore | 4 +- app.py | 160 +++++++++++++++++++++++++++++++++++------------ config.py | 1 + requirements.txt | 8 +++ 4 files changed, 132 insertions(+), 41 deletions(-) diff --git a/.gitignore b/.gitignore index 06d9cfc..fce8c3b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ venv/ -.env \ No newline at end of file +.env + +scanner.db \ No newline at end of file diff --git a/app.py b/app.py index edafd1b..df2fdb1 100644 --- a/app.py +++ b/app.py @@ -3,7 +3,7 @@ from boto3 import client as boto3_client from botocore.client import ClientError, Config - +from datetime import datetime from enum import Enum import environs from io import BytesIO @@ -19,29 +19,50 @@ from config import S3Config, EnvS3Config, ClamAVConfig, EnvClamAVConfig +from peewee import * + env = environs.Env() -dictConfig({ - 'version': 1, - 'formatters': {'default': { - 'format': '[%(asctime)s] %(levelname)s in %(module)s: %(message)s', - }}, - 'handlers': {'default': { - 'class': 'logging.StreamHandler', - 'stream': sys.stdout, - 'formatter': 'default' - }}, - 'root': { - 'level': 'INFO', - 'handlers': ['default'] +dictConfig( + { + "version": 1, + "formatters": { + "default": { + "format": "[%(asctime)s] %(levelname)s in %(module)s: %(message)s", + } + }, + "handlers": { + "default": { + "class": "logging.StreamHandler", + "stream": sys.stdout, + "formatter": "default", + } + }, + "root": {"level": "INFO", "handlers": ["default"]}, } -}) +) + +FILE_REFRESH_INTERVAL_SECS = 600 +FILE_SCAN_INTERVAL_SECS = 1 + +db = SqliteDatabase("scanner.db") + + +class ScannedFile(Model): + filename = CharField(unique=True) + last_scan_timestamp = DateTimeField(null=True) + last_scan_result = CharField() + + class Meta: + database = db + logger = logging.getLogger(__name__) + class ScanResult(Enum): - CLEAN = 1, - INFECTED = 2, + CLEAN = (1,) + INFECTED = (2,) UNKNOWN = 3 @classmethod @@ -66,6 +87,7 @@ def construct_s3_client(config: S3Config) -> boto3_client: return s3_client + def scan_file(clamav_config: ClamAVConfig, file) -> ScanResult: response = requests.post( clamav_config.endpoint_url, @@ -78,8 +100,8 @@ def scan_file(clamav_config: ClamAVConfig, file) -> ScanResult: def prepare_env(): # read .env file if there is one - env.read_env(recurse=False) - + res = env.read_env(recurse=False, override=True) + # load VCAP_SERVICES into env if defined try: vcap_services = json.loads(env.str("VCAP_SERVICES")) @@ -102,16 +124,36 @@ def prepare_env(): logger.info("no VCAP_SERVICES defined in env") -def scan_loop(): - prepare_env() +def create_scanned_file(filename, scan_timestamp, scan_result): + ScannedFile.insert( + filename=filename, + last_scan_timestamp=scan_timestamp, + last_scan_result=scan_result, + ).on_conflict_ignore().execute() + + +def upsert_scanned_file(filename, scan_timestamp, scan_result): + ScannedFile.insert( + filename=filename, + last_scan_timestamp=scan_timestamp, + last_scan_result=scan_result, + ).on_conflict( + conflict_target=[ScannedFile.filename], + preserve=[ScannedFile.last_scan_timestamp, ScannedFile.last_scan_result], + ).execute() + + +def refresh_files(): + """ + Periodically scan the target S3 bucket and created a ScannedFile entry in the database for file + """ + s3_config = EnvS3Config(env) + s3_client = construct_s3_client(s3_config) while True: - s3_config = EnvS3Config(env) - s3_client = construct_s3_client(s3_config) - - clamav_config = EnvClamAVConfig(env) + logger.info("refreshing files...") - paginator = s3_client.get_paginator('list_objects_v2') + paginator = s3_client.get_paginator("list_objects_v2") pages = paginator.paginate(Bucket=s3_config.bucket) if pages: @@ -119,32 +161,70 @@ def scan_loop(): if "Contents" in page: for object_summary in page["Contents"]: object_name = object_summary["Key"] + scan_timestamp = None + scan_result = ScanResult.UNKNOWN - file = BytesIO() - s3_client.download_fileobj(s3_config.bucket, object_name, file) - file.seek(0) + create_scanned_file(object_name, scan_timestamp, scan_result) - scan_result = scan_file(clamav_config, file) + logger.info("done refreshing files...") - logger.info(f"{object_name}: scan result: {scan_result}") + sleep(FILE_REFRESH_INTERVAL_SECS) + + +def scan_files(): + """ + Fetch the least recently scanned file from the database, scan it, and update its database record with the new scan result & timestamp + """ + while True: + s3_config = EnvS3Config(env) + + s3_client = construct_s3_client(s3_config) + + clamav_config = EnvClamAVConfig(env) + + # scan the least recently scanned file in the database + for scanned_file in ( + ScannedFile.select().order_by(ScannedFile.last_scan_timestamp).limit(1) + ): + file = BytesIO() + s3_client.download_fileobj(s3_config.bucket, scanned_file.filename, file) + file.seek(0) + + scan_result = scan_file(clamav_config, file) + + upsert_scanned_file(scanned_file.filename, datetime.utcnow(), scan_result) + + logger.info(f"{scanned_file.filename}: scan result: {scan_result}") + + sleep(FILE_SCAN_INTERVAL_SECS) - sleep(1) app = Flask(__name__) -@app.route('/') + +@app.route("/") def health_check(): - logger.info('handling health check') - return 'healthy' + logger.info("handling health check") + return "healthy" + def create_app(): - worker = Thread(target=scan_loop, daemon=True) - worker.start() + prepare_env() + + db.connect() + db.create_tables([ScannedFile]) + + file_refresh_worker = Thread(target=refresh_files, daemon=True) + file_refresh_worker.start() + + scan_worker = Thread(target=scan_files, daemon=True) + scan_worker.start() return app -if __name__ == '__main__': + +if __name__ == "__main__": logger.info("starting up...") - port = int(os.getenv('PORT', '8080')) - create_app().run(host='0.0.0.0', port=port) + port = int(os.getenv("PORT", "8080")) + create_app().run(host="0.0.0.0", port=port) diff --git a/config.py b/config.py index 8cb18bb..211cb8b 100644 --- a/config.py +++ b/config.py @@ -1,5 +1,6 @@ import environs + class S3Config: def __init__(self): self.region_name = "" diff --git a/requirements.txt b/requirements.txt index 966ac9b..9cbf370 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +black==24.4.2 blinker==1.7.0 boto3==1.34.92 botocore==1.34.92 @@ -14,12 +15,19 @@ Jinja2==3.1.3 jmespath==1.0.1 MarkupSafe==2.1.5 marshmallow==3.21.1 +mypy-extensions==1.0.0 +newrelic==9.9.0 packaging==24.0 +pathspec==0.12.1 +peewee==3.17.3 +platformdirs==4.2.1 python-dateutil==2.9.0.post0 python-dotenv==1.0.1 requests==2.31.0 s3transfer==0.10.1 six==1.16.0 +tomli==2.0.1 +typing_extensions==4.11.0 urllib3==1.26.18 Werkzeug==3.0.2 zipp==3.18.1 From cd86de6b868328e264384de6dc0d4e7a0f3b31eb Mon Sep 17 00:00:00 2001 From: Tim Ballard <1425377+timoballard@users.noreply.github.com> Date: Mon, 6 May 2024 13:52:23 -0500 Subject: [PATCH 2/2] rm unused result --- app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app.py b/app.py index df2fdb1..dc72655 100644 --- a/app.py +++ b/app.py @@ -100,7 +100,7 @@ def scan_file(clamav_config: ClamAVConfig, file) -> ScanResult: def prepare_env(): # read .env file if there is one - res = env.read_env(recurse=False, override=True) + env.read_env(recurse=False, override=True) # load VCAP_SERVICES into env if defined try: