In [1]:
# Airflow Architecture Diagram
#
# Renders a left-to-right diagram of the PDF extraction flow
# (GCS bucket -> PDF -> IBM Watson API / PyPDF2 -> GCS bucket) to a PNG file.

import os
from pathlib import Path

from diagrams import Diagram
from diagrams.custom import Custom
from diagrams.gcp.storage import GCS
from diagrams.programming.language import Python

# Directory holding the custom icon images. Defaults to ./images relative to
# the current working directory and can be overridden with the
# DIAGRAM_ICON_DIR environment variable, so the cell no longer depends on one
# user's home-directory layout.
# NOTE(review): the default assumes the notebook is run from the repository
# root (where images/ lives) - confirm, or set DIAGRAM_ICON_DIR.
# Resolved to an absolute path so the icon location does not depend on the
# working directory of the rendering step.
ICON_DIR = Path(os.environ.get("DIAGRAM_ICON_DIR", "images")).resolve()

# Custom icons used by the Custom(...) nodes below.
pdf_icon = str(ICON_DIR / "pdf.png")
watson_icon = str(ICON_DIR / "ibm.png")
pypdf2_icon = str(ICON_DIR / "python.png")

# Fail fast with an actionable message if any icon file is missing.
missing_icons = [icon for icon in (pdf_icon, watson_icon, pypdf2_icon) if not os.path.isfile(icon)]
if missing_icons:
    raise FileNotFoundError(
        "Missing icon file(s): " + ", ".join(missing_icons)
        + ". Set DIAGRAM_ICON_DIR to the directory containing the icons."
    )

# Graph-level attributes passed to the Diagram.
graph_attr = {
    "pad": "1.0",   # Adds padding around the diagram (controls space)
    "fontsize": "22"  # Increases font size for better readability
}

# Base name of the output file; show=False below means the result is only
# written to disk and not opened in a viewer.
output_filename = "airflow_etl_architecture"

# Create the diagram and save it as a file
with Diagram("Airflow ETL Architecture", filename=output_filename, show=False, direction="LR", graph_attr=graph_attr):
    # Define nodes with custom icons
    pdf = Custom("PDF", pdf_icon)
    watson_api = Custom("IBM Watson API", watson_icon)
    gcp_bucket1 = GCS("GCP Bucket")
    gcp_bucket2 = GCS("GCP Bucket")

    # Label typo fixed ("LIbrary" -> "Library").
    pypdf2 = Custom("PyPDF2 Library", pypdf2_icon)

    # Define the workflow connections: the PDF from the first bucket goes
    # through both the Watson API and PyPDF2, and both feed the second bucket.
    gcp_bucket1 >> pdf >> watson_api >> gcp_bucket2
    pdf >> pypdf2 >> gcp_bucket2

print(f"Diagram saved as {output_filename}.png")


Diagram saved as airflow_etl_architecture.png


In [2]:
# Overall architecture diagram: Hugging Face dataset -> S3 -> Airflow ETL
# (IBM Watson API / PyPDF2) -> S3 -> RDS -> FastAPI (JWT, OpenAI) -> Streamlit,
# plus a Docker Compose deployment layer.

import os
from pathlib import Path

from diagrams import Cluster, Diagram, Edge
from diagrams.aws.storage import S3
from diagrams.aws.database import RDS
from diagrams.onprem.client import Client
from diagrams.onprem.compute import Server
from diagrams.programming.language import Python
from diagrams.onprem.workflow import Airflow
from diagrams.custom import Custom
from diagrams.generic.compute import Rack  # Representing Docker Compose containers

# Directory holding the custom icon images. Defaults to ./images relative to
# the current working directory and can be overridden with the
# DIAGRAM_ICON_DIR environment variable, so the cell no longer depends on one
# user's home-directory layout.
# NOTE(review): the default assumes the notebook is run from the repository
# root (where images/ lives) - confirm, or set DIAGRAM_ICON_DIR.
# Resolved to an absolute path so the icon location does not depend on the
# working directory of the rendering step.
ICON_DIR = Path(os.environ.get("DIAGRAM_ICON_DIR", "images")).resolve()

# Paths to the custom icons
openai_icon_path = str(ICON_DIR / "openai.png")
hugging_face_icon_path = str(ICON_DIR / "hugging_face.png")
watson_icon_path = str(ICON_DIR / "ibm.png")
streamlit_icon_path = str(ICON_DIR / "streamlit.png")
pdf_icon_path = str(ICON_DIR / "pdf.png")
jwt_icon_path = str(ICON_DIR / "jwt.png")  # Custom JWT icon

# Fail fast with an actionable message if any icon file is missing.
missing_icons = [
    icon
    for icon in (
        openai_icon_path,
        hugging_face_icon_path,
        watson_icon_path,
        streamlit_icon_path,
        pdf_icon_path,
        jwt_icon_path,
    )
    if not os.path.isfile(icon)
]
if missing_icons:
    raise FileNotFoundError(
        "Missing icon file(s): " + ", ".join(missing_icons)
        + ". Set DIAGRAM_ICON_DIR to the directory containing the icons."
    )

# Create a diagram to represent the overall architecture
with Diagram("Airflow ETL and Data Flow Architecture", show=False):

    # Source: Hugging Face dataset going to S3 bucket initially
    huggingface_source = Custom("GAIA Dataset (Hugging Face)", hugging_face_icon_path)

    # S3 Bucket for initial storage
    s3_bucket = S3("Amazon S3 Bucket")

    # ETL Orchestration and Data Extraction
    with Cluster("ETL Process"):
        # Airflow for Orchestration
        airflow = Airflow("Airflow ETL")

        # PDF processing step using custom PDF icon
        pdf_file = Custom("PDF File", pdf_icon_path)

        # Custom icon for IBM Watson API
        ibm_watson = Custom("IBM Watson API", watson_icon_path)

        # PyPDF2 for text extraction
        pypdf2_lib = Python("PyPDF2 Library")

        # Target S3 bucket for storing extracted content
        s3_target_bucket = S3("Amazon S3 Bucket")

        # Define ETL data flow paths with different arrow styles
        huggingface_source >> Edge(label="Upload Dataset", style="solid") >> s3_bucket
        s3_bucket >> Edge(label="PDF Files", style="solid") >> airflow >> Edge(label="Trigger ETL", style="dotted") >> pdf_file
        pdf_file >> Edge(label="Text Extraction", style="solid") >> ibm_watson >> Edge(label="Data Transfer", style="solid") >> s3_target_bucket
        pdf_file >> Edge(label="Text Extraction", style="solid") >> pypdf2_lib >> Edge(label="Data Transfer", style="solid") >> s3_target_bucket

    # Database in Amazon RDS for data storage
    rds_instance = RDS("Amazon RDS")

    # Backend Service using FastAPI
    with Cluster("Backend Service"):
        fastapi_service = Server("FastAPI")

        # JWT Authentication Component between FastAPI and Streamlit
        jwt_auth = Custom("JWT Authentication", jwt_icon_path)

        # Use Custom node for OpenAI
        openai_integration = Custom("OpenAI", openai_icon_path)

    # Frontend Client with Streamlit
    with Cluster("Frontend Service"):
        streamlit_app = Custom("Streamlit", streamlit_icon_path)

    # Data flow from S3 (ETL output) to RDS
    s3_target_bucket >> Edge(label="Extracted Text", style="solid") >> rds_instance
    s3_bucket >> Edge(label="Raw Data", style="solid") >> rds_instance

    # FastAPI interacts with RDS and OpenAI using API calls (dashed lines)
    rds_instance >> Edge(label="Data Query", style="dashed") >> fastapi_service
    fastapi_service >> Edge(label="API Call", style="dashed") >> openai_integration

    # JWT Authentication flow
    fastapi_service >> Edge(label="JWT Verification", style="dashed") >> jwt_auth
    jwt_auth >> Edge(label="Access Control", style="dotted") >> streamlit_app

    # Streamlit interacts with FastAPI for data display using API calls
    fastapi_service >> Edge(label="API Call", style="dashed") >> streamlit_app

    # Add a separate section for Docker Compose
    with Cluster("Deployment Layer"):
        docker_compose = Rack("Docker Compose\n(Containerized Services)")

        # Link each containerized service to the Docker Compose node. The
        # expression is evaluated only for its edge-creating side effect, so
        # its result is not bound to a name; a fresh Edge is built per link.
        for service in (airflow, fastapi_service, streamlit_app):
            service - Edge(label="Containerized Deployment", style="dashed") - docker_compose
