You are an expert Python developer specializing in the Databricks environment. Your task is to create a complete Python script to be executed within a Databricks notebook. The script must perform the following operations:
1.	Data Retrieval from SpaceX API:
o	Interact with the SpaceX v3 REST API (https://api.spacexdata.com/v3).
o	Retrieve data from one specific endpoint likely containing categorical data where missing values might occur: 
	All Cores: https://api.spacexdata.com/v3/cores (Fields like status, block could be candidates)
	Alternative: All Launches: https://api.spacexdata.com/v3/launches (Fields like launch_site.site_name, rocket.rocket_name)
o	Handle potential errors during the API calls (e.g., timeouts, non-200 status codes).
2.	Missing Value Imputation (Mode):
o	Perform mode imputation on the retrieved data (list of dictionaries).
o	Imputation Logic: 
	Identify Categorical Fields: First, automatically identify the keys/fields within the dictionaries that predominantly contain categorical data (e.g., string values, i.e. Python str). You might need to inspect the first few records or a sample, or iterate over all records and check the type of each value.
	Calculate Mode per Field: For each identified categorical field, determine the mode (the most frequent value) using only the existing, non-missing (not None) values across all records in the dataset. The collections.Counter class is suitable for this.
	Handle Ties: If multiple values share the highest frequency (a tie for the mode), select any one of them as the mode (e.g., the one that appears first alphabetically or the first one encountered during counting).
	Impute Missing Values: Iterate through the dataset again. For each categorical field, replace any missing values (represented as None) with the pre-calculated mode for that specific field.
	Handle Edge Cases: If a categorical field contains only missing values (i.e., there are no non-missing values from which to calculate a mode), log a warning and leave the missing values as None.
o	The final result should be the original list of dictionaries, but with missing categorical values replaced by the calculated mode for their respective fields.
3.	Control Parameters and Debugging:
o	Include a variable at the beginning of the script to define the API endpoint URL, making it easily modifiable: 
	API_ENDPOINT_URL = "https://api.spacexdata.com/v3/cores" #(or /launches)
o	Use Python's standard logging module to provide informative output during execution. Configure logging to display messages at the INFO level.
o	Log key messages such as: starting data retrieval, number of records retrieved, starting mode imputation process, identified categorical fields potentially needing imputation (e.g., ['status', 'block', ...]), calculated mode for field X, number of missing values imputed for field X, any warnings for fields with no calculable mode, mode imputation complete, starting upload to httpbin, upload outcome.
4.	Execution Time Measurement:
o	Code Execution Time: Measure the time taken to perform the main operations (data retrieval + mode imputation). Print this time after the imputation operation is complete.
o	Pipeline Execution Time: Measure the total execution time of the entire script (from the beginning until after the upload to httpbin). Print this total time at the end of the script. Use Python's time module.
5.	Upload Result:
o	Take the resulting imputed list of dictionaries from the mode imputation operation.
o	Serialize it into JSON format.
o	Make an HTTP POST request to the https://httpbin.org/post endpoint, sending the resulting imputed JSON data in the request body.
o	Verify the response from httpbin.org (e.g., check the status code) and log the outcome of the upload operation.


In [0]:
import requests
import logging
import time
from collections import Counter
import json

# Configure logging
# The root logger is set to INFO so the progress messages emitted by the
# functions below are shown; every line is prefixed with a timestamp and the
# level name.
# NOTE(review): logging.basicConfig() does nothing if the root logger already
# has handlers; presumably a notebook host may have installed some - confirm
# that INFO messages actually appear in the target environment.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger used by every function in this script.
logger = logging.getLogger(__name__)

# Control parameters
# Source endpoint that main() retrieves records from; edit to switch datasets.
API_ENDPOINT_URL = "https://api.spacexdata.com/v3/cores"  # Can be changed to /launches
# Destination that main() POSTs the imputed JSON to.
HTTPBIN_UPLOAD_URL = "https://httpbin.org/post"

def retrieve_spacex_data(url):
    """
    Retrieve a list of records from the SpaceX API with error handling.

    Args:
        url: Endpoint to GET. It is expected to answer with a JSON array of
            objects (one object per record).

    Returns:
        The decoded JSON array as a list.

    Raises:
        requests.exceptions.RequestException: On timeouts, connection errors
            or non-2xx status codes (logged, then re-raised).
        ValueError: If the body is not valid JSON or the decoded payload is
            not a list (logged, then raised).
    """
    logger.info(f"Starting data retrieval from {url}")
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raises exception for 4XX/5XX status codes
        data = response.json()
    except requests.exceptions.RequestException as e:
        logger.error(f"Failed to retrieve data from API: {str(e)}")
        raise
    except ValueError as e:
        # Older requests releases raise a plain ValueError for an invalid JSON
        # body; log it like any other retrieval failure instead of letting it
        # escape unannounced.
        logger.error(f"Failed to decode API response as JSON: {str(e)}")
        raise

    # The rest of the pipeline slices and iterates the payload as a list of
    # records. A JSON object (e.g. an error document or a single-item
    # endpoint) would otherwise be reported as "N records" (its key count)
    # and fail later with an unrelated error.
    if not isinstance(data, list):
        message = (
            f"Expected a JSON array of records from {url}, "
            f"got {type(data).__name__}"
        )
        logger.error(message)
        raise ValueError(message)

    logger.info(f"Successfully retrieved {len(data)} records")
    return data

def identify_categorical_fields(data, sample_size=50):
    """
    Identify fields that predominantly contain categorical data (strings).

    Only the first ``sample_size`` records are inspected. A field is reported
    as categorical when more than half of its non-missing (non-None) sampled
    values are strings. Missing values are left out of the ratio on purpose:
    a string field that is mostly None in the sample is exactly the kind of
    field that needs imputation, and counting None against it would hide it.

    Args:
        data: List of record dictionaries.
        sample_size: Maximum number of leading records to inspect.

    Returns:
        Field names in order of first appearance in the sample. Empty when
        ``data`` is empty. Fields whose sampled values are all None are not
        reported because their type cannot be determined.
    """
    if not data:
        return []

    # Use a sample of the data to determine field types
    sample = data[:min(sample_size, len(data))]

    # field -> [string value count, non-missing value count]. The entry is
    # created on first sight (even for None) so the result keeps the order
    # in which fields first appear.
    tallies = {}
    for record in sample:
        for field, value in record.items():
            tally = tallies.setdefault(field, [0, 0])
            if value is None:
                continue
            tally[1] += 1
            if isinstance(value, str):
                tally[0] += 1

    # Consider a field categorical if >50% of its non-missing values are strings
    categorical_fields = [
        field
        for field, (str_count, present_count) in tallies.items()
        if present_count and str_count / present_count > 0.5
    ]

    logger.info(f"Identified potential categorical fields: {categorical_fields}")
    return categorical_fields

def calculate_modes(data, categorical_fields):
    """
    Work out the most frequent non-missing value of each categorical field.

    Returns a dict mapping every requested field to its mode. A field with no
    non-None values across ``data`` maps to None and a warning is logged.
    When several values tie for the highest count, the one encountered first
    wins.
    """
    field_modes = {}

    for name in categorical_fields:
        # Tally every value that is actually present for this field
        tally = Counter()
        for row in data:
            observed = row.get(name)
            if observed is not None:
                tally[observed] += 1

        if not tally:
            logger.warning(f"Field '{name}' contains no non-missing values to calculate mode")
            field_modes[name] = None
            continue

        # max() yields the first maximal key in insertion order, so ties are
        # resolved in favour of the value seen first.
        top_value = max(tally, key=tally.get)
        top_count = tally[top_value]
        field_modes[name] = top_value
        logger.info(f"Calculated mode for field '{name}': '{top_value}' (appears {top_count} times)")

    return field_modes

def perform_mode_imputation(data, categorical_fields, modes):
    """
    Fill missing categorical values with the pre-computed mode of their field.

    Each record is shallow-copied, so the input list and its dictionaries are
    left untouched. A value is replaced only when the field is present in the
    record, is None, and has a non-None mode. The number of replacements per
    field is logged for fields where at least one took place.

    Returns the new list of (possibly patched) record copies.
    """
    fill_counts = dict.fromkeys(categorical_fields, 0)
    result = []

    for original in data:
        patched = original.copy()
        for name in categorical_fields:
            # Nothing to do when the key is absent or already has a value
            if name not in patched or patched[name] is not None:
                continue
            replacement = modes[name]
            # No mode could be calculated for this field: keep it as None
            if replacement is None:
                continue
            patched[name] = replacement
            fill_counts[name] += 1
        result.append(patched)

    for name, filled in fill_counts.items():
        if filled > 0:
            logger.info(f"Imputed {filled} missing values for field '{name}'")

    return result

def upload_to_httpbin(data, url):
    """
    POST the processed records to httpbin as a JSON document.

    The data is serialized with json.dumps and sent with a JSON content type
    and a 10 second timeout. Returns True when the request completes with a
    successful status, False when requests reports a failure (the failure is
    logged rather than raised).
    """
    logger.info(f"Starting upload to {url}")
    request_headers = {'Content-Type': 'application/json'}
    try:
        payload = json.dumps(data)
        reply = requests.post(url, data=payload, headers=request_headers, timeout=10)
        reply.raise_for_status()
    except requests.exceptions.RequestException as err:
        logger.error(f"Upload failed: {str(err)}")
        return False
    else:
        logger.info(f"Upload successful. Response status: {reply.status_code}")
        return True

def main():
    """
    Run the whole pipeline: retrieve, impute, upload, and report timings.

    Two durations are logged: the processing time (retrieval plus mode
    imputation) and the total pipeline time. The total is logged on every
    exit path, including the early return for an empty dataset and failures.

    Returns:
        The imputed list of record dictionaries, or None when the API
        returned no records. A failed upload is logged as a warning and does
        not prevent the imputed data from being returned.

    Raises:
        Exception: Any error raised by a pipeline stage is logged and
            re-raised unchanged.
    """
    # Start total pipeline timer. perf_counter() is monotonic, so measured
    # durations cannot be skewed by system clock adjustments.
    pipeline_start_time = time.perf_counter()

    try:
        # Start data retrieval and imputation timer
        processing_start_time = time.perf_counter()

        # 1. Retrieve data from SpaceX API
        data = retrieve_spacex_data(API_ENDPOINT_URL)

        if not data:
            logger.error("No data retrieved from API. Exiting.")
            return None

        # 2. Perform mode imputation
        logger.info("Starting mode imputation process")
        categorical_fields = identify_categorical_fields(data)

        if not categorical_fields:
            logger.info("No categorical fields identified. No imputation needed.")
            imputed_data = data
        else:
            modes = calculate_modes(data, categorical_fields)
            imputed_data = perform_mode_imputation(data, categorical_fields, modes)
        logger.info("Mode imputation complete")

        # Log processing time
        processing_time = time.perf_counter() - processing_start_time
        logger.info(f"Data processing completed in {processing_time:.2f} seconds")

        # 3. Upload result to httpbin. The helper reports request failures by
        # returning False instead of raising, so the outcome has to be
        # checked here for the pipeline to notice a failed upload.
        if not upload_to_httpbin(imputed_data, HTTPBIN_UPLOAD_URL):
            logger.warning("Upload to httpbin failed; imputed data is still returned.")

        return imputed_data

    except Exception as e:
        logger.error(f"An error occurred during processing: {str(e)}")
        raise
    finally:
        # Log total pipeline time on every exit path
        total_time = time.perf_counter() - pipeline_start_time
        logger.info(f"Total pipeline execution time: {total_time:.2f} seconds")

# Run the pipeline only when this file is executed as the main program,
# not when it is imported as a module.
if __name__ == "__main__":
    main()