## PROMPT
Generate a pipeline for Databricks:

Get Data information from two web addresses:

Space launches: https://api.spacexdata.com/v3/launches

Space rockets: https://api.spacexdata.com/v3/rockets

Combine each launch record with the name of the corresponding rocket.

Send the final result to the web address: https://httpbin.org/post

The script must provide status updates on its progress, report any errors encountered, confirm the outcome of the final data sending step, and measure/report execution times.

In [0]:
import requests
import time
import json

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Initialize Spark session (if running locally or in Databricks)
spark = SparkSession.builder.appName("SpaceX Launches Pipeline").getOrCreate()

# Start total execution timer
start_time = time.time()

def log_status(message):
    print(f"[STATUS] {message}")

def log_error(message):
    print(f"[ERROR] {message}")

def fetch_data(url, name):
    try:
        log_status(f"Fetching {name} data from {url}...")
        response = requests.get(url)
        response.raise_for_status()
        log_status(f"Successfully fetched {name} data.")
        return response.json()
    except Exception as e:
        log_error(f"Failed to fetch {name} data: {e}")
        raise

try:
    # Step 1: Fetch data
    t0 = time.time()
    launches_data = fetch_data("https://api.spacexdata.com/v3/launches", "launches")
    rockets_data = fetch_data("https://api.spacexdata.com/v3/rockets", "rockets")
    log_status(f"Data fetched in {time.time() - t0:.2f} seconds.")

    # Step 2: Create DataFrames
    t1 = time.time()
    launches_df = spark.createDataFrame(launches_data)
    rockets_df = spark.createDataFrame(rockets_data)
    log_status("DataFrames created.")

    # Step 3: Join data
    log_status("Joining launch data with rocket names...")
    rockets_df = rockets_df.select(col("rocket_id").alias("rocket_id_ref"), col("rocket_name"))
    combined_df = launches_df.join(rockets_df, launches_df.rocket.rocket_id == rockets_df.rocket_id_ref, "left")
    final_df = combined_df.select(
        col("flight_number"),
        col("mission_name"),
        col("launch_year"),
        col("rocket.rocket_id"),
        col("rocket_name")
    )
    log_status(f"Join completed in {time.time() - t1:.2f} seconds.")

    # Step 4: Convert to JSON and send to endpoint
    t2 = time.time()
    final_json = final_df.toJSON().collect()
    response = requests.post("https://httpbin.org/post", json=final_json)
    if response.status_code == 200:
        log_status(f"Data successfully sent to endpoint in {time.time() - t2:.2f} seconds.")
    else:
        log_error(f"Failed to send data to endpoint. Status code: {response.status_code}")

except Exception as e:
    log_error(f"Pipeline failed: {e}")

finally:
    total_time = time.time() - start_time
    log_status(f"Pipeline execution completed in {total_time:.2f} seconds.")


first error for the links field


In [0]:
import requests
import time
import json

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Initialize Spark session
spark = SparkSession.builder.appName("SpaceX Launches Pipeline").getOrCreate()

start_time = time.time()

def log_status(message):
    print(f"[STATUS] {message}")

def log_error(message):
    print(f"[ERROR] {message}")

def fetch_data(url, name):
    try:
        log_status(f"Fetching {name} data from {url}...")
        response = requests.get(url)
        response.raise_for_status()
        log_status(f"Successfully fetched {name} data.")
        return response.json()
    except Exception as e:
        log_error(f"Failed to fetch {name} data: {e}")
        raise

try:
    # Step 1: Fetch
    t0 = time.time()
    launches_raw = fetch_data("https://api.spacexdata.com/v3/launches", "launches")
    rockets_raw = fetch_data("https://api.spacexdata.com/v3/rockets", "rockets")
    log_status(f"Data fetched in {time.time() - t0:.2f} seconds.")

    # Step 2: Simplify launches data (keep only necessary fields)
    launches_cleaned = []
    for launch in launches_raw:
        launches_cleaned.append({
            "flight_number": launch.get("flight_number"),
            "mission_name": launch.get("mission_name"),
            "launch_year": launch.get("launch_year"),
            "rocket_id": launch.get("rocket", {}).get("rocket_id")
        })

    # Simplify rockets data
    rockets_cleaned = []
    for rocket in rockets_raw:
        rockets_cleaned.append({
            "rocket_id": rocket.get("rocket_id"),
            "rocket_name": rocket.get("rocket_name")
        })

    # Step 3: Create DataFrames
    t1 = time.time()
    launches_df = spark.createDataFrame(launches_cleaned)
    rockets_df = spark.createDataFrame(rockets_cleaned)
    log_status("Cleaned DataFrames created.")

    # Step 4: Join
    combined_df = launches_df.join(rockets_df, on="rocket_id", how="left")
    log_status(f"Join completed in {time.time() - t1:.2f} seconds.")

    # Step 5: Send result
    t2 = time.time()
    final_json = combined_df.toJSON().collect()
    response = requests.post("https://httpbin.org/post", json=final_json)
    if response.status_code == 200:
        log_status(f"Data successfully sent to endpoint in {time.time() - t2:.2f} seconds.")
    else:
        log_error(f"Failed to send data to endpoint. Status code: {response.status_code}")

except Exception as e:
    log_error(f"Pipeline failed: {e}")

finally:
    total_time = time.time() - start_time
    log_status(f"Pipeline execution completed in {total_time:.2f} seconds.")


In [0]:
import requests
import time
import json

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Initialize Spark session
spark = SparkSession.builder.appName("SpaceX Launches Pipeline").getOrCreate()

start_time = time.time()

def log_status(message):
    print(f"[STATUS] {message}")

def log_error(message):
    print(f"[ERROR] {message}")

def fetch_data(url, name):
    try:
        log_status(f"Fetching {name} data from {url}...")
        response = requests.get(url)
        response.raise_for_status()
        log_status(f"Successfully fetched {name} data.")
        return response.json()
    except Exception as e:
        log_error(f"Failed to fetch {name} data: {e}")
        raise

try:
    # Step 1: Fetch
    t0 = time.time()
    launches_raw = fetch_data("https://api.spacexdata.com/v3/launches", "launches")
    rockets_raw = fetch_data("https://api.spacexdata.com/v3/rockets", "rockets")
    log_status(f"Data fetched in {time.time() - t0:.2f} seconds.")

    # Step 2: Clean/Extract
    launches_cleaned = [{
        "flight_number": l.get("flight_number"),
        "mission_name": l.get("mission_name"),
        "launch_year": l.get("launch_year"),
        "rocket_id": l.get("rocket", {}).get("rocket_id")
    } for l in launches_raw]

    rockets_cleaned = [{
        "rocket_id": r.get("rocket_id"),
        "rocket_name": r.get("rocket_name")
    } for r in rockets_raw]

    # Step 3: Create DataFrames
    t1 = time.time()
    launches_df = spark.createDataFrame(launches_cleaned)
    rockets_df = spark.createDataFrame(rockets_cleaned)
    log_status("Cleaned DataFrames created.")

    # Step 4: Join
    combined_df = launches_df.join(rockets_df, on="rocket_id", how="left")
    log_status(f"Join completed in {time.time() - t1:.2f} seconds.")

    # Step 5: Convert to JSON and Send
    t2 = time.time()
    result_json = combined_df.toPandas().to_dict(orient="records")
    response = requests.post("https://httpbin.org/post", json=result_json)
    if response.status_code == 200:
        log_status(f"Data successfully sent to endpoint in {time.time() - t2:.2f} seconds.")
    else:
        log_error(f"Failed to send data to endpoint. Status code: {response.status_code}")

except Exception as e:
    log_error(f"Pipeline failed: {e}")

finally:
    total_time = time.time() - start_time
    log_status(f"Pipeline execution completed in {total_time:.2f} seconds.")
