Skip to content

Commit

Permalink
Merge pull request #101 from EGA-archive/pipeline
Browse files Browse the repository at this point in the history
Pipeline
  • Loading branch information
silverdaz committed May 27, 2020
2 parents e987c07 + 27b5393 commit 20195d1
Show file tree
Hide file tree
Showing 50 changed files with 1,983 additions and 1,907 deletions.
36 changes: 0 additions & 36 deletions .coveragerc

This file was deleted.

3 changes: 2 additions & 1 deletion .github/workflows/testsuite.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ jobs:
matrix:
# os: [ubuntu-latest, macOS-latest]
os: [ubuntu-latest]
bootstrap: ['', 'S3=true', 'DOCKER_SECRETS=true', 'S3=true DOCKER_SECRETS=true']
# bootstrap: ['', 'S3=true', 'DOCKER_SECRETS=true', 'S3=true DOCKER_SECRETS=true']
bootstrap: ['']

runs-on: ${{ matrix.os }}

Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ tests/**/*.d
tests/_common/users/
tests/_common/backup
tests/_common/mq/*.pem
*.old

# =====================================
# Byte-compiled / optimized / DLL files
Expand Down
14 changes: 14 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Version 1.2

* Fewer MQ connection sockets: one federated queue, one shovel (and added message ``type`` to distinguish the messages)
* MQ heartbeats are reintroduced
* ingest and verify are one service: Since we were loading the data in memory, we also decrypt, checksum and move it to a staging area
* 2 instances of a backup microservice are added. Obviously, this is for illustration purposes only: each LocalEGA site might already have its own backup system. Nevertheless, a trust/confirmation is sent to CentralEGA.
* Database pipeline segregated from the main final database
* A save2db service is introduced at the end of the pipeline to save information in the long-term storage database (not the pipeline DB). It can also handle the dataset mappings (as a job of type `mapping`).
* Correlation IDs are used for each inbox upload/rename/deletion. However, when several message types are emitted by CentralEGA, the same correlation ID might be reused. Therefore, we introduce a `job_id`, handled by the database. The latter generates a new job id if necessary (by detecting repeated messages).
* No leaked information from the LocalEGAs to CentralEGA. We only use checksums and public information
* Support for S3 has been factored out of the code. The code is smaller and simpler. In order to support an S3-backed storage, the system administrator can for example use [S3-fuse filesystem](https://github.com/s3fs-fuse/s3fs-fuse)
* The pipeline database and mq docker images are migrated back into this repo

# Version 1.1
18 changes: 15 additions & 3 deletions deploy/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,7 @@ private/lega.yml .env:
make -C bootstrap -s

clean-volumes:
docker volume rm lega_db lega_inbox lega_s3
-docker volume rm lega_inbox-s3
docker volume rm lega_db lega_inbox lega_archive_db lega_vault lega_vault_bkp

ps:
@docker-compose ps
Expand All @@ -37,9 +36,15 @@ preflight-check:
# the rabbitmq shovel to CentralEGA (the federated queue can be late, it doesn't hurt)

logs:
@docker-compose logs -f
@docker-compose logs -f $(SELECTED)
# @docker-compose logs -f logs

# Restart every LocalEGA service; the CentralEGA stubs are left running.
# SELECTED is set as a target-specific variable, so this list only applies
# when the `restart` goal is being built.
restart: SELECTED=ingest backup1 backup2 cleanup dispatcher save2db \
                  db mq archive-db inbox
restart:
	@docker-compose restart $(SELECTED)


####################################################
## Docker Images
####################################################
Expand Down Expand Up @@ -88,3 +93,10 @@ erase:
purge:
@$(call remove_dangling,)


####################################################
# Checking the archive DB

# Open a psql session on the archive database (localhost:15432), using the
# save2db client certificate and key. The key file permissions are tightened
# to 600 before connecting.
dbshell: certs_dir=$(CURDIR)/private/certs
dbshell:
	chmod 600 $(certs_dir)/save2db.sec.pem
	psql "postgres://lega:----------------@localhost:15432/lega?sslmode=verify-ca&sslcert=$(certs_dir)/save2db.cert.pem&sslkey=$(certs_dir)/save2db.sec.pem&sslrootcert=$(certs_dir)/CA.save2db.cert.pem"
70 changes: 41 additions & 29 deletions deploy/bootstrap/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
# * Docker-compose .env
# * Users credentials ( ssh-keys, passphrases, .json)
# * Stores configurations in {cega,lega}.conf
# * {cega,lega}.yml
# * {ingest,verify,finalize}.ini
# * {cega,lega, pipeline}.yml
# * *.ini
# * cega_mq_defs.json
# * cega_mq_rabbitmq.config
# * The certificates
Expand All @@ -23,29 +23,20 @@ OPENSSL=openssl
HOSTNAME_DOMAIN=
#HOSTNAME_DOMAIN=.localega

ARGS=
ifdef DOCKER_SECRETS
ARGS+=--secrets ../private/secrets
endif

ifdef S3
ARGS+=--archive_s3
endif


.PHONY: all users certs clean clean-all
.DELETE_ON_ERROR: ../private/*

SECRETS:= master.key.passphrase db.lega mq.admin
SECRETS:= master.key.passphrase db.lega mq.admin archive-db.lega

ifdef S3
SECRETS+= s3.access s3.secret
endif

ALL_FILES=$(addprefix ../private/, lega-entrypoint.sh cega-entrypoint.sh \
cega-mq-defs.json cega-mq-rabbitmq.config \
ingest.ini verify.ini finalize.ini master.key.sec \
lega.yml cega.yml)
cega-mq-defs.json cega-mq-rabbitmq.config cega-accession.ini \
master.key.sec \
dispatcher.ini ingest.ini backup1.ini backup2.ini cleanup.ini save2db.ini \
pipeline.yml lega.yml cega.yml)

all: ../.env $(ALL_FILES) certs users

Expand All @@ -67,15 +58,11 @@ dev: all

../private/lega.conf: run/lega/conf.py $(addprefix ../private/secrets/,$(SECRETS))
@echo "Creating LocalEGA configuration trace: $(@F)"
ifdef S3
@python -m run.lega.conf --secrets $(CURDIR)/../private/secrets --archive_s3 > $@
else
@python -m run.lega.conf --secrets $(CURDIR)/../private/secrets > $@
endif

../.env: | ../private
@echo "COMPOSE_PROJECT_NAME=lega" > $@
@echo "COMPOSE_FILE=private/lega.yml:private/cega.yml" >> $@
@echo "COMPOSE_FILE=private/lega.yml:private/cega.yml:private/pipeline.yml" >> $@
@echo "COMPOSE_PATH_SEPARATOR=:" >> $@

############### Generate CentralEGA services
Expand All @@ -92,16 +79,39 @@ endif
@echo "Creating CentralEGA docker-compose file: $(@F)"
@python -m run.cega.services > $@

# Settings file for the CentralEGA accession service, generated from
# cega.conf (the first prerequisite, passed as $<) by run/cega/accession.py.
# The progress message now says "service configuration": this rule writes an
# .ini file, not a docker-compose file.
../private/cega-accession.ini: ../private/cega.conf run/cega/accession.py | ../private
	@echo "Creating CentralEGA service configuration: $(@F)"
	@python -m run.cega.accession $< > $@

############### Generate LocalEGA settings

../private/lega.yml: run/lega/services.py | ../private
../private/lega.yml: ../private/cega.conf ../private/lega.conf
@echo "Creating LocalEGA docker-compose file: $(@F)"
@python -m run.lega.services $(ARGS) ../private/cega.conf ../private/lega.conf 2>../private/.err > $@
@python -m run.lega.services ../private/cega.conf ../private/lega.conf 2>../private/.err > $@

../private/pipeline.yml: run/lega/pipeline.py | ../private
../private/pipeline.yml: ../private/cega.conf ../private/lega.conf
@echo "Creating LocalEGA docker-compose file: $(@F)"
@python -m run.lega.pipeline ../private/cega.conf ../private/lega.conf 2>../private/.err > $@

# Both backup services are configured by the same generator
# (run/lega/backup.py); they differ only in the queue they read from, the
# destination they write to and the routing key they publish with.
../private/backup1.ini: BACKUP_OPTS=--queue accession --destination /ega/vault --routing_key backup1
../private/backup2.ini: BACKUP_OPTS=--queue backup1 --destination /ega/vault.bkp --routing_key backup2

# One recipe for both targets: each target is still built independently,
# with $@ and BACKUP_OPTS bound to the file being generated.
../private/backup1.ini ../private/backup2.ini: ../private/lega.conf run/lega/backup.py | ../private
	@echo "Creating LocalEGA service configuration: $(@F)"
	@python -m run.lega.backup $(BACKUP_OPTS) $< 2>../private/.err > $@

../private/%.ini: ../private/lega.conf run/lega/%.py | ../private
@echo "Creating LocalEGA service configuration: $(@F)"
@python -m run.lega.$* $(ARGS) $< 2>../private/.err > $@
@python -m run.lega.$* $< 2>../private/.err > $@

../private/master.key.sec: ../private/secrets/master.key.passphrase
@echo "Creating master key: $(@F)"
Expand Down Expand Up @@ -136,21 +146,23 @@ users: $(USERS:%=../private/users/%.passphrase) \

############### Generate Certificates

COMPONENTS=ingest \
verify \
finalize \
COMPONENTS=dispatcher \
ingest \
backup1 \
backup2 \
cleanup \
save2db \
db \
mq \
keys \
inbox \
outgest \
streamer
archive-db

ifdef S3
COMPONENTS+=archive inbox-s3-backend
endif

COMPONENTS+=cega-mq cega-users testsuite
COMPONENTS+=cega-mq cega-users cega-accession testsuite

CERTS=$(COMPONENTS:%=../private/certs/%.sec.pem) \
$(COMPONENTS:%=../private/certs/%.cert.pem) \
Expand Down
64 changes: 64 additions & 0 deletions deploy/bootstrap/cega-accession.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import sqlite3

from lega.conf import CONF
from lega.utils.amqp import consume, publish


# Accessions are only kept in RAM: they are lost whenever the service restarts.
CONN = sqlite3.connect(':memory:')


def init_db():
    """Create the ``accessions`` table in the in-memory database."""
    c = CONN.cursor()
    c.execute('''CREATE TABLE accessions (id INTEGER PRIMARY KEY AUTOINCREMENT,
                                          md5 text UNIQUE,
                                          username text,
                                          filepath text)''')
    CONN.commit()


def get_accession_id(md5, username, filepath):
    """Return the numeric accession id associated with ``md5``.

    A new row is inserted the first time a checksum is seen. When the same
    checksum is submitted again, the insert is a no-op and the id of the
    already-recorded row is returned, so a given md5 always maps to the same
    accession.

    :param md5: md5 checksum of the decrypted file (unique key of the table)
    :param username: user the file belongs to, stored with a new row
    :param filepath: path of the file, stored with a new row
    :returns: the integer row id used to build the accession
    """
    c = CONN.cursor()
    c.execute('''INSERT INTO accessions (md5,username,filepath)
                 VALUES(?,?,?)
                 ON CONFLICT(md5) DO NOTHING;''', [md5, username, filepath])
    inserted_id = c.lastrowid
    CONN.commit()

    # lastrowid is only meaningful when a row was actually inserted: after a
    # conflict it does not identify the existing row. Look the id up by md5.
    c.execute('SELECT id FROM accessions WHERE md5 = ?', [md5])
    row = c.fetchone()
    # No match only happens for a NULL md5 (never equal in SQL): keep the
    # id of the row that was just inserted in that case.
    return row[0] if row is not None else inserted_id

def work(data):
    """Assign an accession id to the file described by ``data`` and publish it.

    The md5 entry of ``data['decrypted_checksums']`` identifies the file.
    When no md5 entry is present, the message is republished with a
    ``reason`` on the ``files.error`` routing key and no accession is
    generated. Otherwise ``data`` is updated in place with
    ``type='accession'`` and the ``accession_id``, then published with the
    ``accession`` routing key.

    :param data: message payload (dict); reads ``decrypted_checksums``,
                 ``filepath`` and ``user``
    """
    decrypted_checksums = data['decrypted_checksums']

    md5_checksum = None
    for c in decrypted_checksums:
        if c.get('type') == 'md5':
            md5_checksum = c.get('value')
            break

    if md5_checksum is None:
        data['reason'] = 'Missing md5 checksum'
        publish(data, exchange='localega.v1', routing_key='files.error')
        # Stop here: without a checksum there is nothing to key the accession
        # on, and the error must not be followed by an accession message.
        return

    filepath = data['filepath']
    username = data['user']
    accession_id = get_accession_id(md5_checksum, username, filepath)
    accession = f"EGAF{accession_id:0>11}"  # I think EBI decided to use 11 digits
    print('Using accession id:', accession)  # no LOG.debug for __main__ and don't care

    data['type'] = 'accession'
    data['accession_id'] = accession

    # Publish the answer
    publish(data, exchange='localega.v1', routing_key='accession')
    # All good. NOTE(review): the message is presumably acked by consume()
    # once this function returns; confirm against lega.utils.amqp.

def main():
    """Set up the accessions table, then hand ``work`` over to ``consume``."""
    init_db()
    consume(work)

if __name__ == '__main__':
main()
14 changes: 7 additions & 7 deletions deploy/bootstrap/certs.mk
Original file line number Diff line number Diff line change
Expand Up @@ -34,23 +34,23 @@ DOMAIN_EMAIL := -dev.ega@crg.eu
## Certificates
###############################################

../private/certs/dispatcher.cert.pem: EXT=client_cert
../private/certs/ingest.cert.pem: EXT=client_cert
../private/certs/verify.cert.pem: EXT=client_cert
../private/certs/finalize.cert.pem: EXT=client_cert
../private/certs/backup1.cert.pem: EXT=client_cert
../private/certs/backup2.cert.pem: EXT=client_cert
../private/certs/cleanup.cert.pem: EXT=client_cert
../private/certs/save2db.cert.pem: EXT=client_cert
../private/certs/mq.cert.pem: EXT=server_client_cert
../private/certs/inbox.cert.pem: EXT=server_client_cert
../private/certs/db.cert.pem: EXT=server_cert
../private/certs/keys.cert.pem: EXT=server_cert
../private/certs/cega-mq.cert.pem: EXT=server_cert
../private/certs/cega-users.cert.pem: EXT=server_cert
../private/certs/cega-accession.cert.pem: EXT=client_cert
../private/certs/outgest.cert.pem: EXT=server_client_cert
../private/certs/streamer.cert.pem: EXT=server_client_cert
../private/certs/testsuite.cert.pem: EXT=client_cert

ifdef S3
../private/certs/archive.cert.pem: EXT=server_cert
../private/certs/inbox-s3-backend.cert.pem: EXT=server_cert
endif
../private/certs/archive-db.cert.pem: EXT=server_cert

%.cert.pem: %.csr.pem | ../private/certs/serial ../private/certs/index.txt
@echo "Creating $(@F)"
Expand Down
54 changes: 54 additions & 0 deletions deploy/bootstrap/run/cega/accession.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import configparser

from docopt import docopt

__doc__ = f'''
Utility to help bootstrap a LocalEGA instance.
Usage:
{sys.argv[0]} [options] <conf>
Options:
-h, --help Prints this help and exit
-v, --version Prints the version and exits
-V, --verbose Prints more output
'''

def main(conf, args):
    """Print the accession service settings, in INI format, on stdout.

    :param conf: parsed ``<conf>`` file; its ``mq`` section must provide the
                 ``exchange`` and ``connection`` options
    :param args: parsed command-line arguments (not used here)
    """
    # Values taken from the <conf> file
    exchange = conf.get('mq', 'exchange')
    broker_url = conf.get('mq', 'connection')

    settings = configparser.RawConfigParser()

    # The same exchange is used for both 'exchange' and 'cega_exchange'
    settings['DEFAULT'] = dict(
        queue='v1.files.verified',
        exchange=exchange,
        cega_exchange=exchange,
        cega_error_key='files.error',
    )

    # Broker connection over TLS, with certificate paths under /cega
    settings['broker'] = dict(
        connection=broker_url,
        enable_ssl='yes',
        verify_peer='yes',
        verify_hostname='no',
        cacertfile='/cega/CA.crt',
        certfile='/cega/ssl.crt',
        keyfile='/cega/ssl.key',
    )

    # output
    settings.write(sys.stdout)


if __name__ == '__main__':
args = docopt(__doc__,
sys.argv[1:],
help=True,
version='CentralEGA accession service boostrap (version 0.2)')
conf = configparser.RawConfigParser()
conf.read(args['<conf>'])
main(conf, args)

0 comments on commit 20195d1

Please sign in to comment.