In [2]:
# Step 1: Import libraries and configure display options
import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import requests

# Lift pandas' display limits: show all columns and do not truncate cell contents
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

# Step 2: Fetch SpaceX past launches data and create initial DataFrame

We call the SpaceX API to retrieve all past launches and load the data into a normalized pandas DataFrame.  
We keep only the relevant columns, drop launches with multiple cores or payloads to simplify the analysis, and keep only launches dated on or before 2020-11-13.

In [3]:
# Fetch data from SpaceX API.
# A timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() surfaces HTTP errors instead of parsing an error body as data.
response = requests.get("https://api.spacexdata.com/v4/launches/past", timeout=30)
response.raise_for_status()
response_data = response.json()

# Flatten the JSON data into a DataFrame
data = pd.json_normalize(response_data)

# Select relevant columns
data = data[['rocket', 'payloads', 'launchpad', 'cores', 'flight_number', 'date_utc']]

# Filter: keep only launches with a single core and a single payload.
# .copy() gives an independent frame so the column assignments below do not
# write into a slice of the original (avoids SettingWithCopyWarning).
has_single_core = data['cores'].map(len) == 1
has_single_payload = data['payloads'].map(len) == 1
data = data[has_single_core & has_single_payload].copy()

# Extract the single core and payload from the lists
data['cores'] = data['cores'].map(lambda x: x[0])
data['payloads'] = data['payloads'].map(lambda x: x[0])

# Convert 'date_utc' to datetime.date and keep launches on or before 2020-11-13
data['date'] = pd.to_datetime(data['date_utc']).dt.date
data = data[data['date'] <= datetime.date(2020, 11, 13)]

data.head()


Unnamed: 0,rocket,payloads,launchpad,cores,flight_number,date_utc,date
0,5e9d0d95eda69955f709d1eb,5eb0e4b5b6c3bb0006eeb1e1,5e9e4502f5090995de566f86,"{'core': '5e9e289df35918033d3b2623', 'flight': 1, 'gridfins': False, 'legs': False, 'reused': False, 'landing_attempt': False, 'landing_success': None, 'landing_type': None, 'landpad': None}",1,2006-03-24T22:30:00.000Z,2006-03-24
1,5e9d0d95eda69955f709d1eb,5eb0e4b6b6c3bb0006eeb1e2,5e9e4502f5090995de566f86,"{'core': '5e9e289ef35918416a3b2624', 'flight': 1, 'gridfins': False, 'legs': False, 'reused': False, 'landing_attempt': False, 'landing_success': None, 'landing_type': None, 'landpad': None}",2,2007-03-21T01:10:00.000Z,2007-03-21
3,5e9d0d95eda69955f709d1eb,5eb0e4b7b6c3bb0006eeb1e5,5e9e4502f5090995de566f86,"{'core': '5e9e289ef3591855dc3b2626', 'flight': 1, 'gridfins': False, 'legs': False, 'reused': False, 'landing_attempt': False, 'landing_success': None, 'landing_type': None, 'landpad': None}",4,2008-09-28T23:15:00.000Z,2008-09-28
4,5e9d0d95eda69955f709d1eb,5eb0e4b7b6c3bb0006eeb1e6,5e9e4502f5090995de566f86,"{'core': '5e9e289ef359184f103b2627', 'flight': 1, 'gridfins': False, 'legs': False, 'reused': False, 'landing_attempt': False, 'landing_success': None, 'landing_type': None, 'landpad': None}",5,2009-07-13T03:35:00.000Z,2009-07-13
5,5e9d0d95eda69973a809d1ec,5eb0e4b7b6c3bb0006eeb1e7,5e9e4501f509094ba4566f84,"{'core': '5e9e289ef359185f2b3b2628', 'flight': 1, 'gridfins': False, 'legs': False, 'reused': False, 'landing_attempt': False, 'landing_success': None, 'landing_type': None, 'landpad': None}",6,2010-06-04T18:45:00.000Z,2010-06-04


# Step 3: Define functions to fetch and append detailed launch data

We create helper functions to retrieve booster version, launch site details, payload information, and core details from their respective API endpoints.  
These functions append the extracted data to global lists for later use.


In [4]:
# Module-level accumulators: one independent, initially empty list per output column.

# Rocket and payload details
BoosterVersion = []
PayloadMass = []
Orbit = []

# Launch site details
LaunchSite = []
Longitude = []
Latitude = []

# Per-launch core usage and landing details
Outcome = []
Flights = []
GridFins = []
Reused = []
Legs = []
LandingPad = []

# Core hardware details
Block = []
ReusedCount = []
Serial = []

def getBoosterVersion(data, timeout=30):
    """Append the rocket name for each entry of ``data['rocket']`` to ``BoosterVersion``.

    Parameters
    ----------
    data : mapping or DataFrame
        Must provide a 'rocket' column/key holding rocket ids.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up.

    Exactly one value is appended per entry so the list stays aligned with the
    rows of ``data``; a missing (falsy) id is recorded as ``None``.
    """
    names_by_id = {}  # rocket id -> name, so a repeated id is fetched only once
    for rocket_id in data['rocket']:
        if not rocket_id:
            # Keep row alignment instead of silently skipping the launch.
            BoosterVersion.append(None)
            continue
        if rocket_id not in names_by_id:
            response = requests.get(
                f"https://api.spacexdata.com/v4/rockets/{rocket_id}", timeout=timeout
            ).json()
            names_by_id[rocket_id] = response.get('name')
        BoosterVersion.append(names_by_id[rocket_id])

def getLaunchSite(data, timeout=30):
    """Append launchpad name and coordinates for each entry of ``data['launchpad']``.

    Parameters
    ----------
    data : mapping or DataFrame
        Must provide a 'launchpad' column/key holding launchpad ids.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up.

    ``LaunchSite``, ``Longitude`` and ``Latitude`` each receive exactly one value
    per entry so they stay aligned with the rows of ``data``; a missing (falsy)
    id is recorded as ``None`` in all three lists.
    """
    site_by_id = {}  # launchpad id -> (name, longitude, latitude); fetched once per id
    for launchpad_id in data['launchpad']:
        if not launchpad_id:
            # Keep row alignment instead of silently skipping the launch.
            site = (None, None, None)
        else:
            if launchpad_id not in site_by_id:
                response = requests.get(
                    f"https://api.spacexdata.com/v4/launchpads/{launchpad_id}",
                    timeout=timeout,
                ).json()
                site_by_id[launchpad_id] = (
                    response.get('name'),
                    response.get('longitude'),
                    response.get('latitude'),
                )
            site = site_by_id[launchpad_id]
        name, longitude, latitude = site
        LaunchSite.append(name)
        Longitude.append(longitude)
        Latitude.append(latitude)

def getPayloadData(data, timeout=30):
    """Append payload mass and orbit for each entry of ``data['payloads']``.

    Parameters
    ----------
    data : mapping or DataFrame
        Must provide a 'payloads' column/key holding one payload id per row.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up.

    ``PayloadMass`` and ``Orbit`` each receive exactly one value per entry so they
    stay aligned with the rows of ``data``; a missing (falsy) id is recorded as
    ``None`` in both lists.
    """
    for payload_id in data['payloads']:
        if not payload_id:
            # Keep row alignment instead of silently skipping the launch.
            PayloadMass.append(None)
            Orbit.append(None)
            continue
        response = requests.get(
            f"https://api.spacexdata.com/v4/payloads/{payload_id}", timeout=timeout
        ).json()
        PayloadMass.append(response.get('mass_kg'))
        Orbit.append(response.get('orbit'))

def getCoreData(data, timeout=30):
    """Append core hardware and landing details for each entry of ``data['cores']``.

    Parameters
    ----------
    data : mapping or DataFrame
        Must provide a 'cores' column/key holding one core dict per row.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up, so a stalled
        connection cannot hang the run indefinitely.

    For every core dict one value is appended to each of ``Block``,
    ``ReusedCount``, ``Serial``, ``Outcome``, ``Flights``, ``GridFins``,
    ``Reused``, ``Legs`` and ``LandingPad``. When the dict has no core id, the
    three hardware fields are recorded as ``None``.
    """
    for core in data['cores']:
        # Hardware details come from the cores endpoint and need a core id.
        block = reuse_count = serial = None
        core_id = core.get('core')
        if core_id:
            response = requests.get(
                f"https://api.spacexdata.com/v4/cores/{core_id}", timeout=timeout
            ).json()
            block = response.get('block')
            reuse_count = response.get('reuse_count')
            serial = response.get('serial')
        Block.append(block)
        ReusedCount.append(reuse_count)
        Serial.append(serial)

        # Per-launch details are already present in the launch's core dict.
        # Outcome joins landing success and landing type, e.g. "None None".
        Outcome.append(f"{core.get('landing_success')} {core.get('landing_type')}")
        Flights.append(core.get('flight'))
        GridFins.append(core.get('gridfins'))
        Reused.append(core.get('reused'))
        Legs.append(core.get('legs'))
        LandingPad.append(core.get('landpad'))


# Step 4: Populate detailed launch data by calling the functions

We use the functions defined above to enrich the dataset with additional features such as booster version, payload mass, orbit, launch site coordinates, and core statistics.


In [5]:
# Populate global lists with detailed launch info.
# Empty the accumulators first so that re-running this cell starts from a clean
# slate instead of appending a second copy of every value.
for _accumulator in (
    BoosterVersion, PayloadMass, Orbit,
    LaunchSite, Longitude, Latitude,
    Outcome, Flights, GridFins, Reused, Legs, LandingPad,
    Block, ReusedCount, Serial,
):
    _accumulator.clear()

getBoosterVersion(data)
getLaunchSite(data)
getPayloadData(data)
getCoreData(data)


# Step 5: Assemble the enriched dataset into a clean DataFrame

We combine all collected data into a single DataFrame, then keep only Falcon 9 launches and renumber the flights sequentially starting from 1.  
Missing PayloadMass values are replaced with the column mean.


In [6]:
# Create dictionary with all collected data (one list per output column)
launch_dict = {
    'FlightNumber': list(data['flight_number']),
    'Date': list(data['date']),
    'BoosterVersion': BoosterVersion,
    'PayloadMass': PayloadMass,
    'Orbit': Orbit,
    'LaunchSite': LaunchSite,
    'Outcome': Outcome,
    'Flights': Flights,
    'GridFins': GridFins,
    'Reused': Reused,
    'Legs': Legs,
    'LandingPad': LandingPad,
    'Block': Block,
    'ReusedCount': ReusedCount,
    'Serial': Serial,
    'Longitude': Longitude,
    'Latitude': Latitude
}

# Convert to DataFrame
df = pd.DataFrame(launch_dict)

# Filter Falcon 9 launches only; .copy() so the edits below do not touch df
data_falcon9 = df[df['BoosterVersion'] == 'Falcon 9'].copy()

# Reset FlightNumber to be sequential starting from 1
data_falcon9.loc[:, 'FlightNumber'] = range(1, len(data_falcon9) + 1)

# Replace missing PayloadMass values with the column mean.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is chained assignment, which warns in recent pandas and does
# not update the frame when Copy-on-Write is enabled.
payload_mass_mean = data_falcon9['PayloadMass'].mean()
data_falcon9['PayloadMass'] = data_falcon9['PayloadMass'].fillna(payload_mass_mean)

data_falcon9.info()


<class 'pandas.core.frame.DataFrame'>
Index: 90 entries, 4 to 93
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   FlightNumber    90 non-null     int64  
 1   Date            90 non-null     object 
 2   BoosterVersion  90 non-null     object 
 3   PayloadMass     90 non-null     float64
 4   Orbit           90 non-null     object 
 5   LaunchSite      90 non-null     object 
 6   Outcome         90 non-null     object 
 7   Flights         90 non-null     int64  
 8   GridFins        90 non-null     bool   
 9   Reused          90 non-null     bool   
 10  Legs            90 non-null     bool   
 11  LandingPad      64 non-null     object 
 12  Block           90 non-null     float64
 13  ReusedCount     90 non-null     int64  
 14  Serial          90 non-null     object 
 15  Longitude       90 non-null     float64
 16  Latitude        90 non-null     float64
dtypes: bool(3), float64(4), int64(3), object(7)

# Step 6: Save the cleaned Falcon 9 launch data to CSV

We export the final DataFrame for use in subsequent analysis or ETL steps.


In [None]:
# Save CSV to ../raw_data folder
output_path = '../raw_data/dataset_part_1.csv'

# to_csv does not create missing directories, so make sure the target folder
# exists before writing (no-op when it is already there).
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

data_falcon9.to_csv(output_path, index=False)
print(f"API data saved to {output_path}")

