[Reference](https://medium.com/@patricklowe33/etl-using-docker-python-postgres-airflow-ed3e9508bd2e)

# Step 1 — Prerequisites
- Docker
- Python
- PostgreSQL
- Apache Airflow
- Airlabs

# Step 2 — Setup
## Dockerfile
```
FROM python:3.6.1-alpine

RUN apk update \
  && apk add \
    build-base \
    postgresql \
    postgresql-dev \
    libpq

RUN mkdir /app
WORKDIR /app
COPY ./requirements.txt .
RUN pip install -r requirements.txt

ENV PYTHONUNBUFFERED 1

COPY . .
```

## requirements.txt
```
psycopg2==2.7.2
```

## compose.yml
```
version: '3.4'

x-common:
  &common
  image: apache/airflow:2.3.0
  user: "${AIRFLOW_UID}:0"
  env_file:
    - .env
  volumes:
    - ./dags:/opt/airflow/dags
    - ./logs:/opt/airflow/logs
    - ./plugins:/opt/airflow/plugins
    - /var/run/docker.sock:/var/run/docker.sock

x-depends-on:
  &depends-on
  depends_on:
    postgres:
      condition: service_healthy
    airflow-init:
      condition: service_completed_successfully

services:
  postgres:
    image: postgres:13
    container_name: postgres
    healthcheck:
      test: ["CMD", "pg_isready", "-U", "airflow"]
      interval: 5s
      retries: 5
    env_file:
      - .env
    environment:
      POSTGRES_PORT: ${POSTGRES_PORT:-5432}
      # Update the host here. When connecting from the local machine (outside Docker), "localhost" worked for me.
      POSTGRES_HOST: ${POSTGRES_HOST:-postgres}
    volumes:
      - local_postgres_data:/var/lib/postgresql/data
    ports:
      - 5432:5432

  scheduler:
    <<: [*common, *depends-on]
    container_name: airflow-scheduler
    command: scheduler
    restart: on-failure
    ports:
      - "8793:8793"

  webserver:
    <<: [*common, *depends-on]
    container_name: airflow-webserver
    restart: always
    command: webserver
    ports:
      - "8080:8080"
    healthcheck:
      test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
      interval: 30s
      timeout: 30s
      retries: 5
  
  airflow-init:
    <<: *common
    container_name: airflow-init
    entrypoint: /bin/bash
    command:
      - -c
      - |
        mkdir -p /sources/logs /sources/dags /sources/plugins
        chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins}
        exec /entrypoint airflow version
volumes:
    local_postgres_data:
```


## .env
```
# Database
POSTGRES_USER=postgres
POSTGRES_PASSWORD=YOUR_PASSWORD
POSTGRES_DB=ryanair_API

# Backend DB - update your username and password
AIRFLOW__DATABASE__SQL_ALCHEMY_CONN=postgresql+psycopg2://YOUR_USERNAME:YOUR_PASSWORD@postgres/YOUR_DATABASE
AIRFLOW__DATABASE__LOAD_DEFAULT_CONNECTIONS=False

# Airflow Init
_AIRFLOW_DB_UPGRADE=True
_AIRFLOW_WWW_USER_CREATE=True
_AIRFLOW_WWW_USER_USERNAME=postgres
_AIRFLOW_WWW_USER_PASSWORD=YOUR_PASSWORD
```

# Step 3 — DAG Script for Airflow

In [1]:
from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
from datetime import datetime,timedelta

# Defaults handed to the DAG and applied to its tasks.
default_args = dict(
    owner='airflow',
    depends_on_past=False,
    retries=0,
)

# The cron expression '*/3 * * * *' triggers a run every 3 minutes.
dag = DAG(
    dag_id='ryanair_DAG',
    default_args=default_args,
    start_date=datetime(year=2024, month=1, day=9),
    schedule_interval='*/3 * * * *',
    catchup=False,
)

# The only task: launch the ETL script through a shell command.
t1 = BashOperator(
    task_id='Bash_task',
    bash_command='python $AIRFLOW_HOME/dags/scripts/RyanAir_ETL.py',
    dag=dag,
)

t1

# Step 4 — Python Script

In [2]:
#!/usr/bin/env python
import pandas as pd
import requests
import json
import psycopg2
import psycopg2.extras as extras


def extract(api_key):
    """Download the Ryanair flight schedule from the AirLabs API.

    Args:
        api_key: AirLabs API key (see https://airlabs.co/account); sent as
            the ``api_key`` query parameter.

    Returns:
        A pandas DataFrame built by flattening the ``response`` list of the
        JSON payload.

    Raises:
        requests.Timeout: if the API does not answer within the timeout.
        requests.HTTPError: if the API answers with a 4xx/5xx status.
    """
    fields = "&_fields=flight_iata,dep_iata,dep_time_utc,dep_estimated_utc,dep_actual_utc,arr_iata,arr_time_utc,arr_estimated_utc,status,duration,delayed,dep_delayed,arr_delayed"
    # NOTE(review): 'ping' is concatenated straight onto the field list, so
    # the last requested field is literally "arr_delayedping". It is left
    # untouched on purpose: the schedule table created by this script has no
    # arr_delayed column, so requesting that field for real would presumably
    # break the load step. Confirm the intent before cleaning this up.
    method = 'ping'
    params = {'api_key': api_key}
    # Flights based on Airline, FR is IATA code for RyanAir
    schedules_api = 'https://airlabs.co/api/v9/schedules?airline_iata=FR'
    print("Extracting...")
    # Without a timeout a stalled connection would block the task forever.
    response = requests.get(schedules_api + fields + method, params, timeout=30)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    schedule_data = pd.json_normalize(response.json(), record_path=['response'])
    return schedule_data

# ## Transform
# Clean data for uploading
import os
#print("Current Directory:" + str(CUR_DIR) + "/dags/scripts/airport_codes.csv")
def convert_iata(df):
    """Replace the IATA code columns with airport names.

    Reads ``airport_codes.csv`` from the directory containing this script and
    left-joins it twice: ``dep_iata`` becomes ``departure_airport`` and
    ``arr_iata`` becomes ``arrival_airport``. Codes without a match in the
    CSV yield a missing airport name.

    Args:
        df: DataFrame holding ``dep_iata`` and ``arr_iata`` columns.

    Returns:
        A new DataFrame with the two code columns replaced by name columns.
    """
    script_dir = os.path.abspath(os.path.dirname(__file__))
    airport_codes = pd.read_csv(f"{script_dir}/airport_codes.csv")

    # (code column to join on, name of the resulting airport column)
    replacements = (
        ('dep_iata', 'departure_airport'),
        ('arr_iata', 'arrival_airport'),
    )
    for code_col, name_col in replacements:
        lookup = airport_codes[[code_col, 'airport_name']]
        joined = df.merge(lookup, on=code_col, how="left")
        renamed = joined.rename(columns={"airport_name": name_col})
        df = renamed.drop(columns=[code_col])
    return df

def convert_timestamp(df):
    """Split the UTC timestamp columns into separate date and time columns.

    Each source column is parsed with ``pd.to_datetime`` and split into a
    midnight-normalized date column and a time-of-day column. Missing
    estimated/actual values fall back to the scheduled date and time.
    The five source columns are dropped afterwards.

    The fallback is written with explicit assignment rather than
    ``df[col].fillna(..., inplace=True)``: that chained in-place form is
    deprecated in pandas and does not update the frame under Copy-on-Write.

    Args:
        df: DataFrame containing the five ``*_utc`` columns listed in
            ``cols``. It is modified in place.

    Returns:
        The same DataFrame object, with the derived columns added and the
        source timestamp columns removed.
    """
    cols = ['dep_time_utc','dep_estimated_utc','dep_actual_utc','arr_time_utc','arr_estimated_utc']
    df[cols] = df[cols].apply(pd.to_datetime)

    # (source column, date column, time column, fallback date, fallback time)
    # Order matters: scheduled columns must exist before they serve as fallback.
    derived = (
        ('dep_time_utc', 'dep_date', 'dep_time', None, None),
        ('dep_estimated_utc', 'dep_date_upd', 'dep_time_upd', 'dep_date', 'dep_time'),
        ('dep_actual_utc', 'dep_date_act', 'dep_time_act', 'dep_date', 'dep_time'),
        ('arr_time_utc', 'arr_date', 'arr_time', None, None),
        ('arr_estimated_utc', 'arr_date_upd', 'arr_time_upd', 'arr_date', 'arr_time'),
    )
    for source, date_col, time_col, date_fallback, time_fallback in derived:
        dates = df[source].dt.normalize()
        times = df[source].dt.time
        if date_fallback is not None:
            dates = dates.fillna(df[date_fallback])
            times = times.fillna(df[time_fallback])
        df[date_col] = dates
        df[time_col] = times

    df.drop(columns=cols, inplace=True)
    return df

def prep_load(df):
    """Fill missing delay values with 0 and keep only landed flights.

    Args:
        df: DataFrame with ``delayed``, ``dep_delayed`` and ``status``
            columns. The two delay columns are filled in place.

    Returns:
        The rows of ``df`` whose ``status`` equals ``'landed'``.
    """
    delay_cols = ['delayed', 'dep_delayed']
    df[delay_cols] = df[delay_cols].fillna(0)
    has_landed = df['status'] == 'landed'
    return df[has_landed]

# ## Load
# Load data into Postgres database

def create_table(conn):
    """Create the ``public.schedule`` table if it does not exist yet.

    On success the transaction is committed. On any error the message is
    printed and the transaction is rolled back; the error is not re-raised.
    The cursor is closed in both cases.

    Args:
        conn: Open database connection exposing ``cursor()``, ``commit()``
            and ``rollback()``.
    """
    cur = conn.cursor()
    try:
        cur.execute("""CREATE TABLE IF NOT EXISTS public.schedule(
        flight_id BIGSERIAL PRIMARY KEY,
        flight_iata VARCHAR(8),
        status VARCHAR(10),
        departure_airport VARCHAR(255),
        arrival_airport VARCHAR(255),
        dep_date date,
        dep_time time,
        dep_time_upd time,
        dep_time_act time,
        arr_date date,
        arr_time time,
        arr_time_upd time,
        duration real,
        delayed real,
        dep_delayed real,
        arr_date_upd date,
        dep_date_upd date,
        dep_date_act date);
        """)
    except Exception as error:  # database errors are Exception subclasses too
        print(error)
        conn.rollback()
    else:
        conn.commit()
    finally:
        # The cursor was previously left open on every path.
        cur.close()

def insert_values(conn, df, table):
    """Bulk-insert every row of ``df`` into ``table``.

    The DataFrame's column names are used as the target column list, and the
    rows are sent with ``psycopg2.extras.execute_values``.

    Args:
        conn: Open psycopg2 connection.
        df: DataFrame whose columns match columns of the target table.
        table: Name of the table to insert into.

    Returns:
        ``None`` after a successful commit, or ``1`` if the insert failed
        (the error is printed and the transaction rolled back).
    """
    rows = list(map(tuple, df.to_numpy()))
    column_list = ','.join(df.columns)
    # The single %s left in the statement is the VALUES placeholder that
    # execute_values expands.
    query = f"INSERT INTO {table}({column_list}) VALUES %s;"

    status = None
    cursor = conn.cursor()
    try:
        extras.execute_values(cursor, query, rows)
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error: %s" % error)
        conn.rollback()
        status = 1
    cursor.close()
    return status

def main():
    """Run one extract -> transform -> load cycle.

    Settings are read from environment variables, falling back to the
    previous hard-coded placeholders when a variable is unset:

    * ``AIRLABS_API_KEY`` - AirLabs API key
    * ``POSTGRES_HOST`` - database host (default ``postgres``)
    * ``POSTGRES_DB`` - database name (default ``ryanair_API``)
    * ``POSTGRES_USER`` / ``POSTGRES_PASSWORD`` - database credentials

    Raises:
        SystemExit: with exit status 1 when the load step reports a failure,
            so the calling process (e.g. the Airflow BashOperator) sees the
            run as failed instead of silently succeeding.
    """
    import os  # local import keeps this function self-contained

    api_key = os.environ.get("AIRLABS_API_KEY", "YOUR_AIRLABS_API_KEY")

    conn = psycopg2.connect(
        host=os.environ.get("POSTGRES_HOST", "postgres"), # 'postgres' (not 'localhost') so it works with docker
        database=os.environ.get("POSTGRES_DB", "ryanair_API"),
        user=os.environ.get("POSTGRES_USER", "postgres"), #your postgres username
        password=os.environ.get("POSTGRES_PASSWORD", "POSTGRES_PASSWORD"))

    try:
        data = extract(api_key)
        print("Transforming...")
        create_table(conn)
        data = prep_load(convert_timestamp(convert_iata(data)))
        print("Loading...")
        load_failed = insert_values(conn, data, 'schedule')
    finally:
        # The connection used to stay open until the interpreter exited.
        conn.close()

    # insert_values returns 1 on failure and None on success.
    if load_failed:
        raise SystemExit("Loading failed; see the error printed above.")
    print("Finished.")

if __name__ == "__main__":
    main()

# Step 5 — Running Docker and Airflow
```
docker compose up -d

docker exec -tiu postgres postgres psql

\l

\c ryanair_API
SELECT COUNT(*) FROM schedule;

docker exec -it -u postgres postgres psql -d ryanair_API -c "COPY (SELECT * FROM schedule) TO STDOUT WITH CSV HEADER" > data.csv

docker compose down
```