diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 3cad036..adc3ad3 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -15,11 +15,12 @@ env: HOST_GID: 1000 _AIRFLOW_WWW_USER_USERNAME: ci _AIRFLOW_WWW_USER_PASSWORD: ci - # PSQL_USER_MAIN: ${{ secrets.CI_PSQL_USER }} - # PSQL_PASSWORD_MAIN: ${{ secrets.CI_PSQL_PASSWORD }} - # PSQL_HOST_MAIN: ${{ secrets.CI_PSQL_HOST }} - # PSQL_PORT_MAIN: ${{ secrets.CI_PSQL_PORT }} - # PSQL_DB_MAIN: ${{ secrets.CI_PSQL_DB }} + AIRFLOW_PSQL_USER_MAIN: ${{ secrets.CI_PSQL_USER }} + AIRFLOW_PSQL_PASSWORD_MAIN: ${{ secrets.CI_PSQL_PASSWORD }} + AIRFLOW_PSQL_HOST_MAIN: ${{ secrets.CI_PSQL_HOST }} + AIRFLOW_PSQL_PORT_MAIN: ${{ secrets.CI_PSQL_PORT }} + AIRFLOW_PSQL_DB_MAIN: ${{ secrets.CI_PSQL_DB }} + EPISCANNER_HOST_DATA: /home/runner/work/AlertFlow/AlertFlow/alertflow/episcanner-downloader/epi_scanner/data/ jobs: main: @@ -40,17 +41,17 @@ jobs: - name: Check if repository is a fork run: | if [[ "${{ github.event.repository.fork }}" == "true" ]]; then - export PSQL_USER_MAIN="${{ secrets.CI_PSQL_USER_FORK }}" - export PSQL_PASSWORD_MAIN="${{ secrets.CI_PSQL_PASSWORD_FORK }}" - export PSQL_HOST_MAIN="${{ secrets.CI_PSQL_HOST_FORK }}" - export PSQL_PORT_MAIN="${{ secrets.CI_PSQL_PORT_FORK }}" - export PSQL_DB_MAIN="${{ secrets.CI_PSQL_DB_FORK }}" + echo "AIRFLOW_PSQL_USER_MAIN=${{ secrets.CI_PSQL_USER_FORK }}" >> "$GITHUB_ENV" + echo "AIRFLOW_PSQL_PASSWORD_MAIN=${{ secrets.CI_PSQL_PASSWORD_FORK }}" >> "$GITHUB_ENV" + echo "AIRFLOW_PSQL_HOST_MAIN=${{ secrets.CI_PSQL_HOST_FORK }}" >> "$GITHUB_ENV" + echo "AIRFLOW_PSQL_PORT_MAIN=${{ secrets.CI_PSQL_PORT_FORK }}" >> "$GITHUB_ENV" + echo "AIRFLOW_PSQL_DB_MAIN=${{ secrets.CI_PSQL_DB_FORK }}" >> "$GITHUB_ENV" else - export PSQL_USER_MAIN="${{ secrets.CI_PSQL_USER }}" - export PSQL_PASSWORD_MAIN="${{ secrets.CI_PSQL_PASSWORD }}" - export PSQL_HOST_MAIN="${{ secrets.CI_PSQL_HOST }}" - export PSQL_PORT_MAIN="${{ secrets.CI_PSQL_PORT }}" - export PSQL_DB_MAIN="${{ secrets.CI_PSQL_DB }}" + echo "AIRFLOW_PSQL_USER_MAIN=${{ secrets.CI_PSQL_USER }}" >> "$GITHUB_ENV" + echo "AIRFLOW_PSQL_PASSWORD_MAIN=${{ secrets.CI_PSQL_PASSWORD }}" >> "$GITHUB_ENV" + echo "AIRFLOW_PSQL_HOST_MAIN=${{ secrets.CI_PSQL_HOST }}" >> "$GITHUB_ENV" + echo "AIRFLOW_PSQL_PORT_MAIN=${{ secrets.CI_PSQL_PORT }}" >> "$GITHUB_ENV" + echo "AIRFLOW_PSQL_DB_MAIN=${{ secrets.CI_PSQL_DB }}" >> "$GITHUB_ENV" fi - name: Semantic Release Title Check
diff --git a/alertflow/dags/episcanner/episcanner_export_data.py b/alertflow/dags/episcanner/episcanner_export_data.py new file mode 100644 index 0000000..9e81f5a --- /dev/null +++ b/alertflow/dags/episcanner/episcanner_export_data.py @@ -0,0 +1,103 @@ +import os +from datetime import datetime, timedelta + +from airflow import DAG +from airflow.models import Variable +from airflow.operators.bash import BashOperator +from airflow.operators.python import PythonOperator +from dotenv import dotenv_values, set_key + + +def set_airflow_variables(): + """ + Set Airflow variables from environment and write them to the .env file. + """ + # Set Airflow variables from environment variables + PSQL_USER = os.environ.get('AIRFLOW_PSQL_USER_MAIN') + PSQL_PASSWORD = os.environ.get('AIRFLOW_PSQL_PASSWORD_MAIN') + PSQL_HOST = os.environ.get('AIRFLOW_PSQL_HOST_MAIN') + PSQL_PORT = os.environ.get('AIRFLOW_PSQL_PORT_MAIN') + PSQL_DB = os.environ.get('AIRFLOW_PSQL_DB_MAIN') + + Variable.set('PSQL_USER', PSQL_USER) + Variable.set('PSQL_PASSWORD', PSQL_PASSWORD) + Variable.set('PSQL_HOST', PSQL_HOST) + Variable.set('PSQL_PORT', PSQL_PORT) + Variable.set('PSQL_DB', PSQL_DB) + + # Write variables to .env file + dotenv_path = '/opt/airflow/episcanner-downloader/.env' + env_vars = dotenv_values(dotenv_path) + env_vars['PSQL_USER'] = PSQL_USER + env_vars['PSQL_PASSWORD'] = PSQL_PASSWORD + env_vars['PSQL_HOST'] = PSQL_HOST + env_vars['PSQL_PORT'] = PSQL_PORT + env_vars['PSQL_DB'] = PSQL_DB + for key, value in env_vars.items(): + set_key(dotenv_path, key, value) + + +default_args = { + 'owner': 'airflow', + 'depends_on_past': False, + 'start_date': 
datetime(2023, 5, 21), + 'retries': 1, + 'retry_delay': timedelta(minutes=5), +} + +with DAG( + 'EPISCANNER_DOWNLOADER', + default_args=default_args, + schedule_interval='0 3 * * 0', # Every Sunday at 3 AM + catchup=False, +) as dag: + + # clone the repository from GitHub + clone_repository = BashOperator( + task_id='clone_repository', + bash_command='git clone --branch main --single-branch --depth 1 ' + 'https://github.com/AlertaDengue/episcanner-downloader.git ' + '/opt/airflow/episcanner-downloader', + dag=dag, + ) + + # Set variables for Episcanner-PostgreSQL connection + set_connection_variables = PythonOperator( + task_id='set_connection_variables', + python_callable=set_airflow_variables, + dag=dag, + ) + + # Install the Episcanner package using Poetry + install_episcanner = BashOperator( + task_id='install_episcanner', + bash_command='source /home/airflow/mambaforge/bin/activate episcanner-downloader && ' # NOQA E501 + 'cd /opt/airflow/episcanner-downloader && ' + 'poetry install', + dag=dag, + ) + + # Download all data to the specified directory + episcanner_downloader = BashOperator( + task_id='episcanner_downloader', + bash_command='source /home/airflow/mambaforge/bin/activate episcanner-downloader && ' # NOQA E501 + 'cd /opt/airflow/episcanner-downloader && ' + 'python epi_scanner/downloader/export_data.py ' + '-s all -d dengue chikungunya -o /opt/airflow/episcanner_data', + dag=dag, + ) + + # Remove the episcanner-downloader repository + remove_repository = BashOperator( + task_id='remove_repository', + bash_command='rm -rf /opt/airflow/episcanner-downloader', + dag=dag, + ) + + ( + clone_repository + >> set_connection_variables + >> install_episcanner + >> episcanner_downloader + >> remove_repository + ) diff --git a/docker/Dockerfile b/docker/Dockerfile index 19d7e64..f01c7a9 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -19,8 +19,6 @@ RUN apt-get update \ vim \ sed \ tar \ - gcc \ - make \ lzma \ libssl-dev \ libtk8.6 \ @@ -34,6 +32,8 @@ 
RUN apt-get update \ libsqlite3-dev \ postgresql-client \ wget \ + gettext \ + build-essential \ && rm -rf /var/lib/apt/lists/* @@ -61,6 +61,7 @@ COPY --chown=airflow alertflow/airflow.cfg ${AIRFLOW_HOME}/airflow.cfg COPY --chown=airflow docker/scripts/entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh + USER airflow WORKDIR ${AIRFLOW_HOME} @@ -73,4 +74,13 @@ RUN /usr/local/bin/python -m virtualenv /opt/envs/py310 --python="/opt/py310/bin "satellite-weather-downloader >= 1.8.2" \ psycopg2 +# Install conda and create environment +RUN curl -LO https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh \ + && bash Mambaforge-Linux-x86_64.sh -b -p /home/airflow/mambaforge \ + && rm Mambaforge-Linux-x86_64.sh \ + && /home/airflow/mambaforge/bin/mamba create -y -n episcanner-downloader python=3.11 poetry psycopg2 python-dotenv \ + && chown -R ${HOST_UID}:${HOST_GID} ${AIRFLOW_HOME}/ /home/airflow/mambaforge/ + +RUN echo "alias activate_episcanner='source /home/airflow/mambaforge/bin/activate episcanner-downloader'" >> /home/airflow/.bashrc + ENTRYPOINT [ "/entrypoint.sh" ] diff --git a/docker/compose.yaml b/docker/compose.yaml index 294151c..42c60ec 100644 --- a/docker/compose.yaml +++ b/docker/compose.yaml @@ -32,12 +32,22 @@ x-airflow-common: AIRFLOW_HOME: ${AIRFLOW_HOME:-/opt/airflow} AIRFLOW_VAR_PSQL_MAIN_URI: '{"PSQL_MAIN_URI":"${PSQL_URI_MAIN}"}' AIRFLOW_VAR_CDSAPI_KEY: '{"CDSAPI_KEY":"${CDSAPI_KEY}"}' + # HOST_UID: ${HOST_UID} HOST_GID: ${HOST_GID} + # Episcanner variables + AIRFLOW_PSQL_USER_MAIN: ${AIRFLOW_PSQL_USER_MAIN} + AIRFLOW_PSQL_PASSWORD_MAIN: ${AIRFLOW_PSQL_PASSWORD_MAIN} + AIRFLOW_PSQL_HOST_MAIN: ${AIRFLOW_PSQL_HOST_MAIN} + AIRFLOW_PSQL_PORT_MAIN: ${AIRFLOW_PSQL_PORT_MAIN} + AIRFLOW_PSQL_DB_MAIN: ${AIRFLOW_PSQL_DB_MAIN} volumes: - ${AIRFLOW_PROJ_DIR:-.}/alertflow/dags:${AIRFLOW_HOME}/dags - ${AIRFLOW_PROJ_DIR:-.}/alertflow/logs:${AIRFLOW_HOME}/logs - 
${AIRFLOW_PROJ_DIR:-.}/alertflow/plugins:${AIRFLOW_HOME}/plugins + # Episcanner + - ${EPISCANNER_HOST_DATA}:${AIRFLOW_HOME}/episcanner_data + user: "${AIRFLOW_UID:-50000}:0" depends_on: &airflow-common-depends-on