In [1]:
# Step 1: Import libraries and install packages if needed

import requests
from bs4 import BeautifulSoup
import unicodedata
import pandas as pd


In [2]:
# Step 2: Define helper functions for parsing HTML table cells

def date_time(table_cell):
    """
    Extract date and time strings from the given table cell.
    Returns a list [date, time].
    """
    return [dt.strip() for dt in list(table_cell.strings)][0:2]

def booster_version(table_cell):
    """
    Extract booster version string from the given table cell.
    """
    # Join every second string except the last one
    return ''.join([v for i, v in enumerate(table_cell.strings) if i % 2 == 0][:-1])

def landing_status(table_cell):
    """
    Extract landing status string from the given table cell.
    """
    return list(table_cell.strings)[0]

def get_mass(table_cell):
    """
    Extract payload mass (e.g. '5000 kg') from the given table cell.
    Returns string like '5000 kg' or '0' if not found.
    """
    mass_text = unicodedata.normalize("NFKD", table_cell.text).strip()
    if mass_text and "kg" in mass_text:
        return mass_text[:mass_text.find("kg")+2]
    else:
        return '0'

def extract_column_from_header(th_element):
    """
    Clean and extract column names from table header cells,
    removing <br>, <a>, <sup> tags.
    """
    if th_element.br:
        th_element.br.extract()
    if th_element.a:
        th_element.a.extract()
    if th_element.sup:
        th_element.sup.extract()
        
    col_name = ' '.join(th_element.contents).strip()
    # Ignore column names that are empty or just digits
    if not col_name.isdigit():
        return col_name


In [3]:
# Step 3: Request the Falcon 9 Launch Wikipedia page and parse HTML

url = "https://en.wikipedia.org/w/index.php?title=List_of_Falcon_9_and_Falcon_Heavy_launches&oldid=1027686922"

response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

print(f"Page Title: {soup.title.string}")


Page Title: List of Falcon 9 and Falcon Heavy launches - Wikipedia


In [4]:
# Step 4: Extract column names from the Falcon 9 launch table

# The relevant table is the 3rd one on the page (index 2)
launch_table = soup.find_all('table')[2]

# Extract column headers
column_names = []
for th in launch_table.find_all('th'):
    col = extract_column_from_header(th)
    if col:
        column_names.append(col)

print("Extracted columns:", column_names)


Extracted columns: ['Flight No.', 'Date and time ( )', 'Launch site', 'Payload', 'Payload mass', 'Orbit', 'Customer', 'Launch outcome']


In [5]:
# Step 5: Initialize dictionary to store launch data, 
# modifying keys to match clean column names and adding useful new columns.

launch_dict = {
    'Flight No.': [],
    'Date': [],
    'Time': [],
    'Version Booster': [],
    'Launch site': [],
    'Payload': [],
    'Payload mass': [],
    'Orbit': [],
    'Customer': [],
    'Launch outcome': [],
    'Booster landing': []
}


In [6]:
# Step 6: Parse all Falcon 9 / Falcon Heavy launch tables on the page

# Tables with class "wikitable plainrowheaders collapsible" hold the data
tables = soup.find_all('table', 'wikitable plainrowheaders collapsible')

for table in tables:
    for row in table.find_all('tr'):
        # Check if row contains flight number (a digit in <th>)
        flight_th = row.find('th')
        if flight_th and flight_th.string and flight_th.string.strip().isdigit():
            flight_number = flight_th.string.strip()
            
            # Extract all <td> cells in the row
            cells = row.find_all('td')
            if len(cells) < 9:
                # Skip rows without full data
                continue
            
            # Append data to launch_dict
            launch_dict['Flight No.'].append(flight_number)
            
            # Date and Time
            date, time = date_time(cells[0])
            launch_dict['Date'].append(date.strip(','))
            launch_dict['Time'].append(time)
            
            # Booster Version
            bv = booster_version(cells[1])
            if not bv and cells[1].a:
                bv = cells[1].a.string
            launch_dict['Version Booster'].append(bv)
            
            # Launch Site
            launch_site = cells[2].a.string if cells[2].a else cells[2].text.strip()
            launch_dict['Launch site'].append(launch_site)
            
            # Payload
            payload = cells[3].a.string if cells[3].a else cells[3].text.strip()
            launch_dict['Payload'].append(payload)
            
            # Payload Mass
            payload_mass = get_mass(cells[4])
            launch_dict['Payload mass'].append(payload_mass)
            
            # Orbit
            orbit = cells[5].a.string if cells[5].a else cells[5].text.strip()
            launch_dict['Orbit'].append(orbit)
            
            # Customer
            customer = cells[6].a.string if cells[6].a else cells[6].text.strip()
            launch_dict['Customer'].append(customer)
            
            # Launch Outcome
            launch_outcome = list(cells[7].strings)[0]
            launch_dict['Launch outcome'].append(launch_outcome)
            
            # Booster Landing
            booster_landing = landing_status(cells[8])
            launch_dict['Booster landing'].append(booster_landing)


In [None]:
# Step 7: Create pandas DataFrame from the dictionary and save to CSV
df = pd.DataFrame(launch_dict)

# Save to CSV
# Save CSV to ../raw_data folder
output_path = '../raw_data/spacex_falcon9_launches_webscraped.csv'
df.to_csv(output_path, index=False)
print(f"Web scraped data saved to {output_path}")



In [8]:
df.head()

Unnamed: 0,Flight No.,Date,Time,Version Booster,Launch site,Payload,Payload mass,Orbit,Customer,Launch outcome,Booster landing
0,1,4 June 2010,18:45,F9 v1.07B0003.18,CCAFS,Dragon Spacecraft Qualification Unit,0,LEO,SpaceX,Success\n,Failure
1,2,8 December 2010,15:43,F9 v1.07B0004.18,CCAFS,Dragon,0,LEO,NASA,Success,Failure
2,3,22 May 2012,07:44,F9 v1.07B0005.18,CCAFS,Dragon,525 kg,LEO,NASA,Success,No attempt\n
3,4,8 October 2012,00:35,F9 v1.07B0006.18,CCAFS,SpaceX CRS-1,"4,700 kg",LEO,NASA,Success\n,No attempt
4,5,1 March 2013,15:10,F9 v1.07B0007.18,CCAFS,SpaceX CRS-2,"4,877 kg",LEO,NASA,Success\n,No attempt\n
