## PROMPT
Generate a pipeline for Databricks:
* Get Data information from two web addresses:
   * Space launches: https://api.spacexdata.com/v3/launches
* Aggregate the launches based on launch year
* Send the chosen information to the web address: https://httpbin.org/post
* The script must provide status updates on its progress, report any errors encountered, confirm the outcome of the final data sending step, and measure/report execution times.

In [0]:
# SpaceX Data Pipeline for Databricks
# This script fetches SpaceX launch data, aggregates by year, and sends to an endpoint

import requests
import json
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, count
from datetime import datetime

# Initialize Spark Session
spark = SparkSession.builder.appName("SpaceX Data Pipeline").getOrCreate()

def log_message(message):
    """Log a message with timestamp."""
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{timestamp}] {message}")

def fetch_data(url):
    """Fetch data from the given URL."""
    log_message(f"Fetching data from {url}")
    start_time = time.time()
    
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise exception for HTTP errors
        
        execution_time = time.time() - start_time
        log_message(f"Successfully fetched data in {execution_time:.2f} seconds")
        
        return response.json()
    except requests.exceptions.RequestException as e:
        execution_time = time.time() - start_time
        log_message(f"ERROR: Failed to fetch data from {url} after {execution_time:.2f} seconds")
        log_message(f"Exception details: {str(e)}")
        raise

def process_spacex_data(data):
    """Process SpaceX launch data."""
    log_message("Processing SpaceX launch data")
    start_time = time.time()
    
    try:
        # Convert JSON to DataFrame
        df = spark.read.json(spark.sparkContext.parallelize([json.dumps(item) for item in data]))
        
        # Extract launch year from launch_date_utc
        df = df.withColumn("launch_year", year(col("launch_date_utc")))
        
        # Aggregate launches by year
        year_counts = df.groupBy("launch_year").agg(count("*").alias("launch_count"))
        
        # Convert to dictionary format for easier JSON serialization
        result = [{
            "year": row["launch_year"],
            "launch_count": row["launch_count"]
        } for row in year_counts.collect()]
        
        execution_time = time.time() - start_time
        log_message(f"Successfully processed data in {execution_time:.2f} seconds")
        log_message(f"Found {len(result)} launch years")
        
        return result
    except Exception as e:
        execution_time = time.time() - start_time
        log_message(f"ERROR: Failed to process data after {execution_time:.2f} seconds")
        log_message(f"Exception details: {str(e)}")
        raise

def send_data(url, data):
    """Send data to the specified endpoint."""
    log_message(f"Sending data to {url}")
    start_time = time.time()
    
    try:
        headers = {"Content-Type": "application/json"}
        response = requests.post(url, json=data, headers=headers, timeout=30)
        response.raise_for_status()
        
        execution_time = time.time() - start_time
        log_message(f"Successfully sent data in {execution_time:.2f} seconds")
        log_message(f"Response status code: {response.status_code}")
        
        return response.json()
    except requests.exceptions.RequestException as e:
        execution_time = time.time() - start_time
        log_message(f"ERROR: Failed to send data to {url} after {execution_time:.2f} seconds")
        log_message(f"Exception details: {str(e)}")
        raise

def main():
    """Main function to orchestrate the pipeline."""
    pipeline_start_time = time.time()
    log_message("Starting SpaceX data pipeline")
    
    try:
        # Fetch SpaceX launch data
        spacex_url = "https://api.spacexdata.com/v3/launches"
        spacex_data = fetch_data(spacex_url)
        
        # Process data
        processed_data = process_spacex_data(spacex_data)
        
        # Send aggregated data to destination
        destination_url = "https://httpbin.org/post"
        response = send_data(destination_url, processed_data)
        
        # Log final result
        pipeline_execution_time = time.time() - pipeline_start_time
        log_message(f"Pipeline completed successfully in {pipeline_execution_time:.2f} seconds")
        log_message(f"Final response from destination: {json.dumps(response)[:100]}...")
        
    except Exception as e:
        pipeline_execution_time = time.time() - pipeline_start_time
        log_message(f"ERROR: Pipeline failed after {pipeline_execution_time:.2f} seconds")
        log_message(f"Exception details: {str(e)}")
        raise

if __name__ == "__main__":
    main()

ERROR 1

In [0]:
# SpaceX Data Pipeline for Databricks
# This script fetches SpaceX launch data, aggregates by year, and sends to an endpoint
# Compatible with Databricks Connect and serverless compute

import requests
import json
import time
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, count
from datetime import datetime

# Initialize Spark Session (without needing sparkContext)
spark = SparkSession.builder.appName("SpaceX Data Pipeline").getOrCreate()

def log_message(message):
    """Log a message with timestamp."""
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{timestamp}] {message}")

def fetch_data(url):
    """Fetch data from the given URL."""
    log_message(f"Fetching data from {url}")
    start_time = time.time()
    
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise exception for HTTP errors
        
        execution_time = time.time() - start_time
        log_message(f"Successfully fetched data in {execution_time:.2f} seconds")
        
        return response.json()
    except requests.exceptions.RequestException as e:
        execution_time = time.time() - start_time
        log_message(f"ERROR: Failed to fetch data from {url} after {execution_time:.2f} seconds")
        log_message(f"Exception details: {str(e)}")
        raise

def process_spacex_data(data):
    """Process SpaceX launch data."""
    log_message("Processing SpaceX launch data")
    start_time = time.time()
    
    try:
        # First convert JSON to pandas DataFrame
        pdf = pd.DataFrame(data)
        
        # Convert pandas DataFrame to Spark DataFrame
        df = spark.createDataFrame(pdf)
        
        # Extract launch year from launch_date_utc
        df = df.withColumn("launch_year", year(col("launch_date_utc")))
        
        # Aggregate launches by year
        year_counts = df.groupBy("launch_year").agg(count("*").alias("launch_count"))
        
        # Convert to pandas DataFrame for easier processing
        result_df = year_counts.toPandas()
        
        # Convert to dictionary format for easier JSON serialization
        result = result_df.to_dict(orient="records")
        
        execution_time = time.time() - start_time
        log_message(f"Successfully processed data in {execution_time:.2f} seconds")
        log_message(f"Found {len(result)} launch years")
        
        return result
    except Exception as e:
        execution_time = time.time() - start_time
        log_message(f"ERROR: Failed to process data after {execution_time:.2f} seconds")
        log_message(f"Exception details: {str(e)}")
        raise

def send_data(url, data):
    """Send data to the specified endpoint."""
    log_message(f"Sending data to {url}")
    start_time = time.time()
    
    try:
        headers = {"Content-Type": "application/json"}
        response = requests.post(url, json=data, headers=headers, timeout=30)
        response.raise_for_status()
        
        execution_time = time.time() - start_time
        log_message(f"Successfully sent data in {execution_time:.2f} seconds")
        log_message(f"Response status code: {response.status_code}")
        
        return response.json()
    except requests.exceptions.RequestException as e:
        execution_time = time.time() - start_time
        log_message(f"ERROR: Failed to send data to {url} after {execution_time:.2f} seconds")
        log_message(f"Exception details: {str(e)}")
        raise

def main():
    """Main function to orchestrate the pipeline."""
    pipeline_start_time = time.time()
    log_message("Starting SpaceX data pipeline")
    
    try:
        # Fetch SpaceX launch data
        spacex_url = "https://api.spacexdata.com/v3/launches"
        spacex_data = fetch_data(spacex_url)
        
        # Process data
        processed_data = process_spacex_data(spacex_data)
        
        # Send aggregated data to destination
        destination_url = "https://httpbin.org/post"
        response = send_data(destination_url, processed_data)
        
        # Log final result
        pipeline_execution_time = time.time() - pipeline_start_time
        log_message(f"Pipeline completed successfully in {pipeline_execution_time:.2f} seconds")
        log_message(f"Final response from destination: {json.dumps(response)[:100]}...")
        
    except Exception as e:
        pipeline_execution_time = time.time() - pipeline_start_time
        log_message(f"ERROR: Pipeline failed after {pipeline_execution_time:.2f} seconds")
        log_message(f"Exception details: {str(e)}")
        raise

if __name__ == "__main__":
    main()