In [3]:
# Install necessary libraries
!pip install beautifulsoup4
!pip install lxml

# Import Libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# Define the (live) static URL
static_url = "https://en.wikipedia.org/wiki/List_of_Falcon_9_and_Falcon_Heavy_launches"

# Set headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}

# Request the webpage with headers
response = requests.get(static_url, headers=headers)

# Create BeautifulSoup object
soup = BeautifulSoup(response.text, 'lxml')

# Print the page title to verify
print(soup.title)


<title>List of Falcon 9 and Falcon Heavy launches - Wikipedia</title>


In [5]:
# Find all tables
html_tables = soup.find_all('table')

# Select the 3rd table
first_launch_table = html_tables[2]

# Helper function to clean header names
def extract_column_from_header(row):
    if row is not None:
        if row.a:
            return row.a.text
        else:
            return row.get_text().strip()
    return None

# Extract column names
column_names = []
for th in first_launch_table.find_all('th'):
    col_name = extract_column_from_header(th)
    if col_name is not None and len(col_name) > 0:
        column_names.append(col_name)

# Check the extracted column names
print(column_names)


['Flight No.', 'UTC', 'Version,booster', 'Launchsite', '[g]', 'Payload mass', 'Orbit', 'Customer', 'Launchoutcome', 'Boosterlanding', '418', '419', '420', '421', '422', '423', '424', '425', '426', '427', '428', '429', '430', '431', '432', '433', '434', '435', '436', '437', '438', '439', '440', '441', '442', '443', '444', '445', '446', '447', '448', '449', '450', '451', '452', '453', '454', '455', '456', '457', '458', '459', '460', '461', '462', '463', '464', '465']


In [11]:
# Step 1: Create the empty dictionary
launch_dict = {
    'Flight No.': [],
    'Date': [],
    'Time': [],
    'Version Booster': [],
    'Launch Site': [],
    'Payload': [],
    'Payload Mass (kg)': [],
    'Orbit': [],
    'Customer': [],
    'Launch Outcome': [],
    'Booster Landing': []
}

# Step 2: Parse the third table
first_launch_table = html_tables[2]

for row in first_launch_table.find_all('tr'):
    table_cells = row.find_all('td')
    
    # Fix: process only if the row has AT LEAST 10 cells
    if len(table_cells) >= 10:
        flight_number = table_cells[0].text.strip()
        launch_dict['Flight No.'].append(flight_number)
        
        date_time = table_cells[1].text.strip().split()
        if len(date_time) >= 2:
            date = date_time[0]
            time = date_time[1]
        else:
            date = None
            time = None
        launch_dict['Date'].append(date)
        launch_dict['Time'].append(time)
        
        booster_version = table_cells[2].text.strip()
        launch_dict['Version Booster'].append(booster_version)
        
        launch_site = table_cells[3].text.strip()
        launch_dict['Launch Site'].append(launch_site)
        
        payload = table_cells[4].text.strip()
        launch_dict['Payload'].append(payload)
        
        payload_mass = table_cells[5].text.strip()
        launch_dict['Payload Mass (kg)'].append(payload_mass)
        
        orbit = table_cells[6].text.strip()
        launch_dict['Orbit'].append(orbit)
        
        customer = table_cells[7].text.strip()
        launch_dict['Customer'].append(customer)
        
        launch_outcome = table_cells[8].text.strip()
        launch_dict['Launch Outcome'].append(launch_outcome)
        
        booster_landing = table_cells[9].text.strip()
        launch_dict['Booster Landing'].append(booster_landing)

# Step 3: Create the DataFrame
df = pd.DataFrame(launch_dict)

# Step 4: Display
df.head()


Unnamed: 0,Flight No.,Date,Time,Version Booster,Launch Site,Payload,Payload Mass (kg),Orbit,Customer,Launch Outcome,Booster Landing
