In [1]:
import json
import logging
import os
import pickle
from collections import deque
from io import BytesIO
from time import sleep
from typing import Self

import requests
from minio import Minio
from requests import get

# Route log records at INFO level and above to NeoPipeline.log
logging.basicConfig(level=logging.INFO, filename='NeoPipeline.log')

# Logger used by the functions and classes defined in this notebook
logger = logging.getLogger(name=__name__)

In [2]:
# Connect to Minio blob storage
def create_minio_client(endpoint='localhost:9000',
                        access_key='minioadmin',
                        secret_key='minioadmin',
                        secure=False):
    """Create a Minio client and verify the server answers a bucket listing.

    Args:
        endpoint: Host and port handed to ``Minio`` as ``endpoint``.
        access_key: Access key handed to ``Minio``.
        secret_key: Secret key handed to ``Minio``.
        secure: Forwarded unchanged to ``Minio`` as its ``secure`` flag.

    Returns:
        A ``(client, connection_status)`` tuple. ``connection_status`` is True
        only when ``client.list_buckets()`` succeeded. ``client`` is the
        constructed client, or ``None`` when the ``Minio`` constructor itself
        raised. Failures are logged, never propagated.
    """
    # Bound before the try block so the failure path can always return it,
    # even when the Minio constructor is the call that raises.
    client = None

    try:
        # Initialize client
        client = Minio(endpoint=endpoint,
                       access_key=access_key,
                       secret_key=secret_key,
                       secure=secure)

        # verify connection
        client.list_buckets()

    except Exception as e:
        logger.error(f"{e}")
        return client, False

    else:
        logger.info(f"Successfully connected to Minio at {endpoint}")
        return client, True

    
# Build the shared storage client with the default arguments; the second value
# is the connection status reported by create_minio_client.
minio_client, cur_connection_status = create_minio_client()

In [3]:
# Set parameters for Airflow
# The API key is a credential, so it is read from the NASA_API_KEY environment
# variable rather than stored in the notebook. NASA's public, rate-limited
# DEMO_KEY is used when the variable is not set.
api_key_param = os.environ.get('NASA_API_KEY', 'DEMO_KEY')
api_uri_param = 'https://api.nasa.gov/neo/rest/v1/feed?'
start_date_param = '2025-05-02'
end_date_param = '2025-05-09'
bucket_name_param = 'neo'
mode = 'silver'

In [4]:
class NeoApiClient:
    def __init__(self, 
                 api_key,
                 api_uri,
                 start_date, 
                 end_date, 
                 storage, 
                 connection_status,
                 bucket_name,
                 mode):
        
        self.key = api_key
        self.api_uri = api_uri
        self.start_date = start_date
        self.end_date = end_date
        self.storage = storage
        self.connection_status = connection_status
        self.bucket_name = bucket_name
        self.mode = mode
        self.data = None

    
    def _update_queue(self, mode: str, object_name: str=None) -> None:
        
        # List objects in bucket
        bucket_objects = [obj.object_name for obj in self.storage.list_objects(self.bucket_name)]
        
        # Verify neo/queue exists, create it, if it does not exist
        if 'queue' not in bucket_objects:
            queue = pickle.dumps(deque([]))
            queue_file = BytesIO(queue)
            
            self.storage.put_object(self.bucket_name,
                                    'queue',
                                    queue_file,
                                    length=len(queue_file.getvalue()))
            
        # Select update mode
        match mode:
            case 'in':
                HttpResponse = self.storage.get_object(self.bucket_name, 'queue')
                queue = pickle.loads(HttpResponse.data)
                
                queue.append(object_name)
                
                queue_bytes = pickle.dumps(queue)
                queue_file = BytesIO(queue_bytes)

                self.storage.put_object(self.bucket_name,
                                        'queue',
                                        queue_file,
                                        length=len(queue_file.getvalue()))               
                
            case 'out':
                HttpResponse = self.storage.get_object(self.bucket_name, 'queue')
                queue = pickle.loads(HttpResponse.data)
                
                if queue:
                    item = queue.popleft()
                else:
                    return None
                
                queue_bytes = pickle.dumps(queue)
                queue_file = BytesIO(queue_bytes)

                self.storage.put_object(self.bucket_name,
                                        'queue',
                                        queue_file,
                                        length=len(queue_file.getvalue()))
            
                return item

                
    def print_queue(self):
        HttpResponse = self.storage.get_object(self.bucket_name, 'queue')
        queue = pickle.loads(HttpResponse.data)
        print(queue)    
    
        
    def extract(self) -> Self:
        logger.info(f'Mode: {self.mode}......extracting')
        # Generate API request uri
        full_uri = f'{self.api_uri}start_date={self.start_date}&end_date={self.end_date}&api_key={self.key}'
       
        # Select API Client mode
        match self.mode:
            
            case 'bronze':    # Ingest from source: NASA NEO API request 
                
                try:
                    # requests.get()
                    HttpResponse = get(full_uri, timeout=5)
                    HttpResponse.raise_for_status()  # Raises HTTPError for bad responses
    
                    # Convert JSON to bytes
                    json_bytes = json.dumps(HttpResponse.json()).encode('utf-8')

                    # Store data as BytesIO object
                    self.data = BytesIO(json_bytes)
                    logger.info(f"JSON data file extracted from {self.api_uri}")

                except requests.exceptions.HTTPError as e:
                    logger.error(f"HTTP Error: {e}")
                    
                except requests.exceptions.ConnectionError as e:
                    logger.error(f"Error Connecting: {e}")
                    
                except requests.exceptions.Timeout as e:
                    logger.error(f"Timeout Error: {e}")
                    
                except requests.exceptions.RequestException as e:
                    logger.error(f"Unknown Error: {e}")

                return self
            
            case 'silver':    # Ingest from source: neo/bronze/ 
                
                if self.connection_status:

                    # Get file name from neo/queue object
                    obj_name = self._update_queue('out')

                    if obj_name:

                        try:
                            # Extract binary JSON file from neo/bronze/
                            HttpResponse = self.storage.get_object(self.bucket_name, obj_name)

                            # Convert JSON to python dictionary
                            data: str = HttpResponse.data.decode('utf-8')
                            self.data: dict = json.loads(data)
                            logger.info(f"{obj_name} retrieved from '{self.bucket_name}' bucket")

                        except Exception as e:
                            logger.error(f"Data extraction from {obj_name} failed: {e}")

                        finally:
                            HttpResponse.close()
                            HttpResponse.release_conn()

                    else:
                        logger.info('No objects left in queue')
                
                else:
                    logger.error("Connection Error: No files extracted")
                
                return self
                
            case 'gold':    # Ingest from source: neo/silver/
                
                # Generate neo bucket silver data path
                obj_name = f'silver/{self.bucket_name}-{self.start_date}_{self.end_date}.json'
                
                if self.connection_status:

                    try:
                        # Extract binary JSON file from neo/bronze/
                        HttpResponse = self.storage.get_object(self.bucket_name, obj_name)
                        
                        # convert JSON to python dictionary
                        data: str = HttpResponse.data.decode('utf-8')
                        self.data: dict = json.loads(data)
                        logger.info(f"{obj_name} retrieved from '{self.bucket_name}' bucket")
                        
                    except Exception as e:
                        logger.error(f"Data extraction from {obj_name} failed: {e}")

                    finally:
                        HttpResponse.close()
                        HttpResponse.release_conn()
                    
                    return self
                
                else:
                    logger.error("Connection Error: No files extracted")
                    return self

                
    def transform(self) -> Self:
        logger.info(f'Mode: {self.mode}......transforming')
        match self.mode:
            case 'bronze':
                logger.info("No bronze transformation implemented")
                return self
            
            case 'silver':
                logger.info("No silver transformation implemented")
                return self
            
            case 'gold':
                logger.info("Final data transformation complete")
                return self
                
                        
    def load(self) -> None:
        logger.info(f'Mode: {self.mode}......loading')
        # Select API Client mode
        match self.mode:
                                    
            case 'bronze':
            
                # Generate neo bucket bronze data path                   
                obj_name = f'{self.mode}/{self.bucket_name}-{self.start_date}_{self.end_date}.json'

                if self.connection_status:
                   
                    try:
                        # Put client data in neo/bronze bucket
                        self.storage.put_object(
                            self.bucket_name, 
                            obj_name, 
                            self.data,
                            length=len(self.data.getvalue()),
                            content_type='application/json'
                        )
                                    
                        # Add file name to neo/queue object
                        self._update_queue('in', obj_name)

                        logger.info(
                            f'JSON file successfully uploaded as {obj_name} in bucket {self.bucket_name}'
                        )
                                    
                    except Exception as e:
                        logger.error(f"Load Error: No files loaded: {e}")
                                    
                else:
                    logger.error("Connection Error: No files loaded")
            
            case 'silver':
                obj_name = f'{self.mode}/{self.bucket_name}-{self.start_date}_{self.end_date}'
                logger.info(
                    f'Parquet file and Iceberg catalog successfully uploaded, as {obj_name}/ in bucket {self.bucket_name}'
                )

            case 'gold':
                obj_name = f'{self.mode}/{self.bucket_name}-{self.start_date}_{self.end_date}.parquet'
                logger.info(
                    f'Parquet file and Iceberg catalog successfully uploaded, as {obj_name}/ in bucket {self.bucket_name}'
                )
                


In [5]:
# Run the bronze stage and then the silver stage, each with its own client
for m in ('bronze', 'silver'):
    # Initialize NeoApiClient for this stage
    neo_client = NeoApiClient(
        mode=m,
        bucket_name=bucket_name_param,
        connection_status=cur_connection_status,
        storage=minio_client,
        end_date=end_date_param,
        start_date=start_date_param,
        api_uri=api_uri_param,
        api_key=api_key_param,
    )

    # extract() and transform() return the client, which lets the steps chain
    (
        neo_client
        .extract()
        .transform()
        .load()
    )

    # Show what is left in the work queue after this stage
    neo_client.print_queue()

deque(['bronze/neo-2025-05-02_2025-05-09.json'])
deque([])
